一个简单的Python网络爬虫(抓图),针对某论坛.

 1 #coding:utf-8

 2 import urllib2

 3 import re

 4 import threading

 5 

 6 #图片下载

 7 def loadImg(addr,x,y,artName):

 8     data = urllib2.urlopen(addr).read()

 9     f = open(artName.decode("utf-8")+str(y)+'.jpg', 'wb')

10     f.write(data)

11     f.close()

12     

# Parse a single thread page, extract the image links and download each one
# with loadImg. artName is the thread title.
def getImgLink(html, x, artName):
    """Start one download thread per image found in a thread page.

    html    -- HTML source of the thread page.
    x       -- index of the thread on the board page (used in the log line and
               forwarded to loadImg).
    artName -- thread title, forwarded to loadImg as the file-name stem.

    Each image gets its own threading.Thread running loadImg; the threads are
    started and not joined here. An exception raised inside loadImg therefore
    surfaces in the worker thread, not in this function.
    """
    # [^"]* keeps every attribute match inside its own quotes, so one match
    # cannot stretch over several <img> tags that share a line.
    relink = (r'<img src="[^"]*" file="([^"]*)" width="[^"]*" '
              r'id="[^"]*" alt="[^"]*\.jpg" />')
    for y, lin in enumerate(re.findall(relink, html)):
        imgAddr = 'http://www.xxx.com/' + lin
        print("LoadImg:" + str(x) + " " + imgAddr + '\n')
        # Pass the callable and its arguments separately: writing
        # target=loadImg(...) would run the download right here and hand
        # Thread a target of None.
        t = threading.Thread(target=loadImg, args=(imgAddr, x, y, artName))
        t.start()

24         

# Parse a board (forum section) page and extract the links to its threads.
def getArticleLink(html, page):
    """Fetch every thread linked from a board page and hand it to getImgLink.

    html -- HTML source of the board page.
    page -- page number of the board page; not used here, kept so existing
            callers keep working.

    Threads are numbered from 1 in the order they appear in html. Errors
    raised by urllib2.urlopen propagate to the caller.
    """
    # [^"]* keeps the href inside its quotes and .*? stops the title at the
    # first </a>, so several links on one line are matched individually.
    relink = r'<a href="(viewthread\.php\?tid=[^"]*3D[^"]*)">(.*?)</a>'
    headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1"}
    for x, (link, title) in enumerate(re.findall(relink, html), 1):
        req = urllib2.Request("http://www.xxx.com/" + link, headers=headers)
        response = urllib2.urlopen(req)
        try:
            threadHtml = response.read()
        finally:
            # Release the connection even when the read fails.
            response.close()
        getImgLink(threadHtml, x, title)

39         

start = 1  # first board page to crawl
end = 100  # last board page to crawl (inclusive)

# Iterate over the real page numbers so that `end` is the last page fetched
# whatever `start` is set to.
for page in range(start, end + 1):
    url = "http://www.xxx.com/forumdisplay.php?fid=19&page=" + str(page)
    headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1"}
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()
    print('Start')
    getArticleLink(html, page)

 

你可能感兴趣的:(python)