Web Crawler

  Over the past couple of days I had to write a crawler program to grab the data I wanted and write it into a database. It took digging through a lot of references to get it working, so I'm posting it here to share with everyone.

 

1. Download the spider code

2. Create a C/S (client-side, i.e. Windows Forms) application

3. The main form code

    /// <summary>
    /// Start the spider.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
    {
        if (txturl.Text == "")
        {
            MessageBox.Show("Please enter a URL.");
        }
        else
        {
            ThreadStart starter = new ThreadStart(this.SpiderThread);
            Thread spider = new Thread(starter);
            spider.Start();
        }
    }

    /// <summary>
    /// Display the URL currently being processed.
    /// </summary>
    public void SetLastURL(string str)
    {
        txtcontent.Text = str;
    }

    /// <summary>
    /// Display the elapsed time.
    /// </summary>
    public void SetElapsedTime(string str)
    {
        elapsed.Text = str;
    }

    /// <summary>
    /// Display the number of processed URLs.
    /// </summary>
    public void SetProcessedCount(string str)
    {
        processedURLs.Text = str;
    }

    public void SpiderThread()
    {
        // Output directory (fill in your own path); create it if it
        // does not exist yet. (The original code had this check inverted.)
        string outdir = "output path";
        if (!Directory.Exists(outdir))
        {
            Directory.CreateDirectory(outdir);
        }

        // Number of worker threads.
        string threadCount = "10";

        if (button1.Text.Equals("Cancel"))
        {
            // A crawl is already running: ask it to stop.
            m_spider.Quit = true;
            button1.Enabled = false;
        }
        else
        {
            button1.Text = "Cancel";
            txturl.Enabled = false;

            m_spider = new Spider.Spider();
            m_spider.ReportTo = this;
            m_spider.OutputPath = outdir;

            int threads = int.Parse(threadCount);
            if (threads < 1)
                threads = 1;
            threadCount = "" + threads;

            try
            {
                m_spider.Start(new Uri(this.txturl.Text), threads);
            }
            catch (UriFormatException ex)
            {
                System.Windows.Forms.MessageBox.Show(ex.Message);
                return;
            }

            button1.Text = "Begin";
            txturl.Enabled = true;
            //threadCount.Enabled = true;
            button1.Enabled = true;
        }
    }

    private void Form1_Close(object sender, CancelEventArgs e)
    {
        Application.Exit();
    }

    private void Form1_Load(object sender, EventArgs e)
    {
    }
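The form above relies on a Spider class (in the Spider namespace) whose source is not shown. Judging purely from the calls the form makes (Quit, ReportTo, OutputPath, Start), its public surface looks roughly like the sketch below. Treat this as an inferred outline, not the actual implementation, which also has to download pages, follow links, and save files under OutputPath.

    using System;

    namespace Spider
    {
        // Minimal sketch, reconstructed from how the form uses it.
        public class Spider
        {
            // Set to true to ask the worker threads to stop.
            public bool Quit { get; set; }

            // The form that receives progress callbacks
            // (SetLastURL, SetElapsedTime, SetProcessedCount).
            public Form1 ReportTo { get; set; }

            // Directory where downloaded pages are saved as .html files.
            public string OutputPath { get; set; }

            // Crawl from the given start URI with the given thread count.
            public void Start(Uri baseUri, int threadCount)
            {
                // ... download pages, save them under OutputPath,
                // and report progress back to ReportTo ...
            }
        }
    }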

4. Read the downloaded pages and extract the data you want

    public void Getfile(string dirPath)
    {
        #region Direct regex matching
        // Placeholder: put your own pattern here. The loop below expects
        // a pattern with (at least) four capturing groups.
        string regex = "your regex pattern";
        Regex reg = new Regex(regex);
        DirectoryInfo Dir = new DirectoryInfo(dirPath);
        try
        {
            // Recurse into subdirectories, using each one's full path.
            foreach (DirectoryInfo d in Dir.GetDirectories())
            {
                Getfile(d.FullName);
                this.txtcontent.AppendText(d.FullName);
                this.txtcontent.AppendText("\r\n");
            }
            // Scan the downloaded .html files in this directory.
            foreach (FileInfo f in Dir.GetFiles("*.html"))
            {
                FileStream fs = new FileStream(f.FullName, FileMode.Open, FileAccess.Read);
                StreamReader sr = new StreamReader(fs, Encoding.UTF8);
                MatchCollection mc = reg.Matches(sr.ReadToEnd());
                foreach (Match m in mc)
                {
                    this.txtcontent.AppendText(m.Groups[1].Value.Trim());
                    txtcontent.AppendText("-----");
                    this.txtcontent.AppendText(m.Groups[2].Value.Trim());
                    txtcontent.AppendText("-----");
                    this.txtcontent.AppendText(m.Groups[3].Value.Trim());
                    txtcontent.AppendText("-----");
                    this.txtcontent.AppendText(m.Groups[4].Value.Trim());
                    txtcontent.AppendText("-----");
                    this.txtcontent.AppendText("\r\n");
                }
                sr.Close();
                fs.Close();
                this.txtcontent.AppendText("Read complete!");
            }
        }
        catch (Exception e)
        {
            this.txtcontent.AppendText("An error occurred: " + e.Message);
        }
        #endregion
    }
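For a concrete picture of how the regex placeholder is meant to be filled in: suppose each crawled page lists its records as four <td> cells per row. A pattern with four capturing groups, which is what the Groups[1] through Groups[4] reads above expect, could then look like this. The HTML shape, the pattern, and the output path are purely illustrative assumptions.

    // Hypothetical pattern: four <td> cells per record, one group each.
    string regex = "<td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td>";

    // Then point Getfile at the directory the spider wrote to:
    Getfile(@"C:\spider\output");   // assumed output path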

Done.

 

Of course, this version only crawls the pages to local disk and then reads them back, which can be fairly slow. Once you are familiar with it, you can modify it to take the crawled data directly as it arrives, analyze it on the fly, and write it straight to the database (sketched below), or use HtmlParser to parse the buffer instead of regular expressions. I won't walk through each of those variations here.
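For example, instead of appending the matched groups to a TextBox, each match could be inserted straight into a database. The sketch below uses ADO.NET with SQL Server; the connection string, the crawl_result table, and its column names are assumptions for illustration, not part of the original project.

    using System.Data.SqlClient;
    using System.Text.RegularExpressions;

    // Minimal sketch: insert each regex match into a database table.
    // Connection string, table, and column names are hypothetical.
    void SaveMatches(string html, Regex reg)
    {
        string connStr = "Server=localhost;Database=crawler;Integrated Security=true";
        using (SqlConnection conn = new SqlConnection(connStr))
        {
            conn.Open();
            foreach (Match m in reg.Matches(html))
            {
                using (SqlCommand cmd = new SqlCommand(
                    "INSERT INTO crawl_result (col1, col2, col3, col4) " +
                    "VALUES (@c1, @c2, @c3, @c4)", conn))
                {
                    cmd.Parameters.AddWithValue("@c1", m.Groups[1].Value.Trim());
                    cmd.Parameters.AddWithValue("@c2", m.Groups[2].Value.Trim());
                    cmd.Parameters.AddWithValue("@c3", m.Groups[3].Value.Trim());
                    cmd.Parameters.AddWithValue("@c4", m.Groups[4].Value.Trim());
                    cmd.ExecuteNonQuery();
                }
            }
        }
    }

Parameterized commands are used here rather than string concatenation so that crawled text containing quotes or SQL fragments cannot break the insert.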
