第一版爬虫,爬补天漏洞链接及标题

 1 from HTMLParser import HTMLParser
 2 import urllib2
 3 import re
 4 from time import sleep
 5 
 6 
 7 class MyHTMLParser(HTMLParser):
 8     def __init__(self):
 9         HTMLParser.__init__(self)
10         self.links = []
11  
12     def handle_starttag(self, tag, attrs):
13         #print "Encountered the beginning of a %s tag" % tag
14         if tag == "a":
15             if len(attrs) == 0: pass
16             else:
17                 for (variable,value)  in attrs:
18                         if variable == 'href':
19                             if value.find('/vul/list/page/')+1:
20                                 self.links.append(value)
21 
class PageContentParser(HTMLParser):
    """Extract vulnerability detail links and their titles from a list page.

    For every ``<a>`` tag with an attribute value containing
    ``/vul/info/qid/QTVA`` the value is stored in ``urllist`` and the next
    text chunk seen by the parser is stored (GB18030-encoded) in ``title``.
    Each link/title pair is also appended to ``output_path`` as
    ``http://loudong.360.cn<link>---------<title><br>``.

    Parameters:
        output_path: file the results are appended to.
        delay: seconds to sleep before each file write; 0 disables the pause.
    """

    def __init__(self, output_path="./butian.html", delay=0.5):
        HTMLParser.__init__(self)
        # Per-instance state; it must be created here, because ``self`` does
        # not exist at class-body level.
        self.flag = 0        # 1 while the title text of a matched link is pending
        self.urllist = []    # matched link values, in document order
        self.title = []      # GB18030-encoded titles, parallel to urllist
        self.output_path = output_path
        self.delay = delay

    def _append_to_file(self, chunk):
        """Append ``chunk`` to the output file after the configured pause."""
        if self.delay:
            sleep(self.delay)
        if not isinstance(chunk, bytes):
            chunk = chunk.encode("gb18030")
        # Binary append mode writes the bytes unmodified; the ``with`` block
        # guarantees the file is closed after every write.
        with open(self.output_path, "ab") as out:
            out.write(chunk)

    def handle_starttag(self, tag, attrs):
        """Record a vulnerability link and arm title capture for its text."""
        if tag != "a":
            return
        for _name, value in attrs:
            # Valueless attributes arrive as None and can never match.
            if value and "/vul/info/qid/QTVA" in value:
                self.flag = 1
                self.urllist.append(value)
                # Persist the absolute URL followed by the separator.
                self._append_to_file("http://loudong.360.cn" + value + "---------")

    def handle_data(self, data):
        """Store the first text chunk following a matched link as its title."""
        if self.flag != 1:
            return
        # Byte input is decoded as UTF-8; already-decoded text is used as is.
        text = data.decode("utf-8") if isinstance(data, bytes) else data
        encoded = text.encode("gb18030")
        self.title.append(encoded)
        self.flag = 0
        # Persist the title, terminated by an HTML line break.
        self._append_to_file(encoded + b"<br>")
53 
54 
55 
# Script entry point: read the page count from the list index, then feed every
# list page through PageContentParser, which appends the results to disk.
if __name__ == "__main__":
    print("start........")

    response = urllib2.urlopen("http://loudong.360.cn/vul/list/")
    try:
        content = response.read()
    finally:
        response.close()

    hp = MyHTMLParser()
    hp.feed(content)
    if not hp.links:
        raise SystemExit("no '/vul/list/page/' links found; cannot determine the page count")
    # The digits of the last pagination link are taken as the total page
    # count -- presumably that link points at the final page; verify against
    # the site's markup.
    digits = "".join(ch for ch in hp.links[-1] if ch.isdigit())
    if not digits:
        raise SystemExit("last pagination link contains no page number: %s" % hp.links[-1])
    num = int(digits)
    hp.close()
    print(num)

    for page_number in range(1, num + 1):
        response = urllib2.urlopen("http://loudong.360.cn/vul/list/page/" + str(page_number))
        try:
            pagecontent = response.read()
        finally:
            response.close()
        # A fresh parser per page; it writes its findings as it parses.
        page_parser = PageContentParser()
        page_parser.feed(pagecontent)
        page_parser.close()

 

 

 

效果如下:

(图片:第一版爬虫,爬补天漏洞链接及标题 — 第1张图片)

你可能感兴趣的:(第一版爬虫,爬补天漏洞链接及标题)