下载生物信息

http://www.catalogueoflife.org/col/browse/classification

这是一个国外的生物信息网站

今天的代码可以抓取指定分类的信息(id,学名)

没有把多线程写进去,略失败...

运用:WebClient、Regex、System.IO

项目在>>>开源中国

 

 1 using System;

 2 using System.Collections.Generic;

 3 using System.ComponentModel;

 4 using System.Data;

 5 using System.Drawing;

 6 using System.Linq;

 7 using System.Text;

 8 using System.Threading.Tasks;

 9 using System.Windows.Forms;

10 using System.Net;

11 using System.Text.RegularExpressions;

12 using System.Threading;

13 using System.IO;

namespace cateoflife
{
    /// <summary>
    /// Main window of the crawler. Downloads a run of numbered pages
    /// (base URL + page number), extracts every fragment matching a
    /// user-supplied regular expression and appends one "id TAB name" line
    /// per fragment to a text file named after the first page number.
    /// </summary>
    public partial class Form1 : Form
    {
        // Last page number used when the "end" text box contains 0.
        const int DefaultEnd = 99999;

        // Digits that directly follow a '/' in the matched fragment.
        static readonly Regex idPattern = new Regex("(?<=/)\\d+");

        // One or two word runs that directly follow a '>' in the matched fragment.
        static readonly Regex namePattern = new Regex("(?<=>)\\w+\\s*\\w+");

        WebClient wc = new WebClient();
        int start;
        int end;
        string url;
        string reg;
        string msg;
        int now = 1;

        /// <summary>Creates the form and its designer-generated controls.</summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Runs the crawl: reads the base URL (textBox1), first page (textBox2),
        /// last page (textBox3, 0 meaning <see cref="DefaultEnd"/>) and the
        /// fragment pattern (textBox4), then appends the extracted records to
        /// "&lt;first page&gt;.txt".
        /// </summary>
        /// <param name="sender">Control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            // Validate every input before touching the file system so that a
            // typo cannot leave an open file handle behind.
            int first;
            int last;
            if (!int.TryParse(textBox2.Text, out first) || !int.TryParse(textBox3.Text, out last))
            {
                MessageBox.Show("Start and end must be whole numbers.");
                return;
            }

            Regex pattern;
            try
            {
                // Compile the user's pattern once instead of once per page.
                pattern = new Regex(textBox4.Text);
            }
            catch (ArgumentException ex)
            {
                MessageBox.Show("Invalid regular expression: " + ex.Message);
                return;
            }

            start = first;
            end = (last == 0) ? DefaultEnd : last;
            url = textBox1.Text;
            reg = textBox4.Text;
            wc.Encoding = Encoding.UTF8;

            // AppendText opens (or creates) the file positioned at its end;
            // the using block closes it even if a download fails.
            using (StreamWriter w = File.AppendText(start + ".txt"))
            {
                for (int i = start; i <= end; i++)
                {
                    string htm = DownloadWithRetry(url + i);
                    foreach (Match m in pattern.Matches(htm))
                    {
                        gettxt(m.ToString());
                        w.Write(msg);
                    }

                    // Persist progress page by page.
                    w.Flush();
                }
            }
        }

        /// <summary>
        /// Downloads <paramref name="address"/>, retrying once on a network
        /// error. Only the download is retried, so records already written
        /// are never duplicated.
        /// </summary>
        /// <param name="address">Full URL of the page to fetch.</param>
        /// <returns>The page body as text.</returns>
        /// <exception cref="WebException">Both attempts failed.</exception>
        string DownloadWithRetry(string address)
        {
            try
            {
                return wc.DownloadString(address);
            }
            catch (WebException)
            {
                return wc.DownloadString(address);
            }
        }

        /// <summary>
        /// Builds one output record from a matched fragment and stores it in
        /// the <c>msg</c> field: the id, a tab, the name, then CR LF.
        /// A part that is not found is left empty.
        /// </summary>
        /// <param name="html">The fragment matched by the user's pattern.</param>
        void gettxt(string html)
        {
            string id = idPattern.Match(html).ToString();
            string name = namePattern.Match(html).ToString();
            msg = id + "\t" + name + "\r\n";
        }
    }
}

 

你可能感兴趣的:(生物)