转载自我的博客:http://www.mylonly.com/archives/1418.html
经过两个晚上的奋斗,将上一篇文章介绍的爬虫稍微改进了下(Python爬虫之路——简单网页抓图),主要是将获取图片链接任务和下载图片任务用线程分开来处理了,而且这次的爬虫不仅仅可以爬取第一页的图片链接,整个http://desk.zol.com.cn/meinv/下面的图片都会被爬到,而且提供了多种分辨率图片的文件下载,具体设置方法代码注释里面有介绍。
这次的代码仍然有点不足,Ctrl-C无法终止程序,应该是线程无法响应主程序的终止消息导致的,(最好放在后台跑程序)还有线程的分配还可以优化的更好一点,后续会陆续改进.
#coding: utf-8 ############################################################# # File Name: main.py # Author: mylonly # mail: [email protected] # Created Time: Wed 11 Jun 2014 08:22:12 PM CST ######################################################################### #!/usr/bin/python import re,urllib2,HTMLParser,threading,Queue,time #各图集入口链接 htmlDoorList = [] #包含图片的Hmtl链接 htmlUrlList = [] #图片Url链接Queue imageUrlList = Queue.Queue(0) #捕获图片数量 imageGetCount = 0 #已下载图片数量 imageDownloadCount = 0 #每个图集的起始地址,用于判断终止 nextHtmlUrl = '' #本地保存路径 localSavePath = '/data/1920x1080/' #如果你想下你需要的分辨率的,请修改replace_str,有如下分辨率可供选择1920x1200,1980x1920,1680x1050,1600x900,1440x900,1366x768,1280x1024,1024x768,1280x800 replace_str = '1920x1080' replaced_str = '960x600' #内页分析处理类 class ImageHtmlParser(HTMLParser.HTMLParser): def __init__(self): self.nextUrl = '' HTMLParser.HTMLParser.__init__(self) def handle_starttag(self,tag,attrs): global imageUrlList if(tag == 'img' and len(attrs) > 2 ): if(attrs[0] == ('id','bigImg')): url = attrs[1][1] url = url.replace(replaced_str,replace_str) imageUrlList.put(url) global imageGetCount imageGetCount = imageGetCount + 1 print url elif(tag == 'a' and len(attrs) == 4): if(attrs[0] == ('id','pageNext') and attrs[1] == ('class','next')): global nextHtmlUrl nextHtmlUrl = attrs[2][1]; #首页分析类 class IndexHtmlParser(HTMLParser.HTMLParser): def __init__(self): self.urlList = [] self.index = 0 self.nextUrl = '' self.tagList = ['li','a'] self.classList = ['photo-list-padding','pic'] HTMLParser.HTMLParser.__init__(self) def handle_starttag(self,tag,attrs): if(tag == self.tagList[self.index]): for attr in attrs: if (attr[1] == self.classList[self.index]): if(self.index == 0): #第一层找到了 self.index = 1 else: #第二层找到了 self.index = 0 print attrs[1][1] self.urlList.append(attrs[1][1]) break elif(tag == 'a'): for attr in attrs: if (attr[0] == 'id' and attr[1] == 'pageNext'): self.nextUrl = attrs[1][1] print 'nextUrl:',self.nextUrl break #首页Hmtl解析器 indexParser = IndexHtmlParser() 
#内页Html解析器 imageParser = ImageHtmlParser() #根据首页得到所有入口链接 print '开始扫描首页...' host = 'http://desk.zol.com.cn' indexUrl = '/meinv/' while (indexUrl != ''): print '正在抓取网页:',host+indexUrl request = urllib2.Request(host+indexUrl) try: m = urllib2.urlopen(request) con = m.read() indexParser.feed(con) if (indexUrl == indexParser.nextUrl): break else: indexUrl = indexParser.nextUrl except urllib2.URLError,e: print e.reason print '首页扫描完成,所有图集链接已获得:' htmlDoorList = indexParser.urlList #根据入口链接得到所有图片的url class getImageUrl(threading.Thread): def __init__(self): threading.Thread.__init__(self) def run(self): for door in htmlDoorList: print '开始获取图片地址,入口地址为:',door global nextHtmlUrl nextHtmlUrl = '' while(door != ''): print '开始从网页%s获取图片...'% (host+door) if(nextHtmlUrl != ''): request = urllib2.Request(host+nextHtmlUrl) else: request = urllib2.Request(host+door) try: m = urllib2.urlopen(request) con = m.read() imageParser.feed(con) print '下一个页面地址为:',nextHtmlUrl if(door == nextHtmlUrl): break except urllib2.URLError,e: print e.reason print '所有图片地址均已获得:',imageUrlList class getImage(threading.Thread): def __init__(self): threading.Thread.__init__(self) def run(self): global imageUrlList print '开始下载图片...' while(True): print '目前捕获图片数量:',imageGetCount print '已下载图片数量:',imageDownloadCount image = imageUrlList.get() print '下载文件路径:',image try: cont = urllib2.urlopen(image).read() patter = '[0-9]*\.jpg'; match = re.search(patter,image); if match: print '正在下载文件:',match.group() filename = localSavePath+match.group() f = open(filename,'wb') f.write(cont) f.close() global imageDownloadCount imageDownloadCount = imageDownloadCount + 1 else: print 'no match' if(imageUrlList.empty()): break except urllib2.URLError,e: print e.reason print '文件全部下载完成...' get = getImageUrl() get.start() print '获取图片链接线程启动:' time.sleep(2) download = getImage() download.start() print '下载图片链接线程启动:'