My internship has been pretty boring, so two weeks ago I finally decided to take a look at Python, which everyone praises for its development efficiency. I wanted to write something with it, and I happened to see someone post a similar demo in the Python section, so that's what I went with. I only started writing on Saturday morning after losing a few Dota games in a row, and it didn't run until Sunday night. I downloaded a few hundred MB from a certain site, so the test went reasonably well. Over the past couple of days I've patched it up a bit more and it's now fairly stable. I'm posting it here for criticism and feedback, since I've never written Python before. Enough chatter, here it is!
Note: written with Python 3.3.
I used HTMLParser to parse the page data and extract links, and then created a ScratchFactory class to filter the links and save the images. ScratchFactory inherits from threading.Thread, so each instance runs as its own thread, and within it each image is downloaded concurrently as well.
In addition, two global variables, UrlSrc and UrlDiged, store the links that have been collected and the links that have already been visited.
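A side note rather than part of the original design: because both tables are plain lists, the duplicate checks in run() scan the whole list each time. Keeping a set of every URL ever queued makes those checks constant-time. A minimal sketch under that assumption (UrlSeen and enqueue are hypothetical names, not from the script):

import threading

mLock = threading.Lock()
UrlSrc = []         # pending links; kept as a list so pop(0) preserves FIFO order
UrlSeen = set()     # hypothetical: every URL ever queued, for O(1) duplicate checks

def enqueue(urls):
    # Add only URLs that have never been seen; the lock keeps worker threads from racing.
    with mLock:
        for u in urls:
            if u not in UrlSeen:
                UrlSeen.add(u)
                UrlSrc.append(u)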
Concurrency is not always a good thing, though: if you send requests too fast, the web server may treat it as a DDoS attack and refuse connections. So in the main thread I cap the number of threads and put a time interval between requests, to avoid producing a DDoS-like effect.
Speed control in the main thread:
while True:
    if len(threading.enumerate()) > THREAD_NUM:
        continue
    mLock.acquire()
    if UrlSrc.__len__():
        temp = UrlSrc.pop(0)
        t = ScratchFactory(temp)
        UrlDiged.append(temp)
        t.start()
    mLock.release()
    # print pending link count, thread count, and combined UrlSrc+UrlDiged length (in thousands)
    print("Connections:", UrlSrc.__len__(), "*****threads:",
          len(threading.enumerate()), "****TableLength:",
          (len(UrlSrc) + len(UrlDiged)) / 1000)
    if time.localtime().tm_min % 2 == 0 \
            and time.time() - savetime > 60:
        save()      # save the crawl state
        savetime = time.time()
    time.sleep(SLEEP_TIME)
Here are the two main classes:
MyHtmlParser
Inherits from HTMLParser; mainly parses the HTML text and extracts the title, the character encoding, the links, and the image links.
ScratchFactory
It has these main methods:
addHeader: prepends the site prefix to links extracted as relative paths, turning them into absolute URLs (a simpler alternative using urllib.parse.urljoin is sketched further down)
clearData: filters the extracted links and cleans up the title
saveImages: starts a thread per image that calls save() to download it
run:
The save path and file name for the images are derived from the title of the crawled page, and Chinese titles bring encoding issues into play. The encoding is usually given in the HTTP response headers, but I didn't know how to get the response headers in Python (please tell me if you do), so I had to look for it in the <meta> tag of the HTML text instead. (This has since been solved: following @pjx2013's suggestion I switched to chardet; chardet doesn't run under Python 3 as-is, but someone has ported it.) Thanks, @pjx2013.
http://www.cnblogs.com/dajianshi/archive/2012/12/18/2827083.html
feed() here does not accept bytes, so the first 500 bytes are force-decoded as UTF-8 just to pull the charset out of the <meta> tag, and then the whole page is decoded again with that charset:
conect = urllib.request.urlopen(self.url)   # download the page data
data = conect.read()
conect.close()
htmlx = MyHtmlParser()
htmlx.feed(data[:500].decode('utf-8', 'ignore'))
t = htmlx.charset                           # get the HTML encoding (from the <meta> tag)
if t == '':
    t = 'gb2312'
htmlx.reset()
htmlx.feed(data.decode(t, 'ignore'))
self.title = htmlx.title
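As for the response-header question above: in Python 3 the object returned by urlopen() exposes the headers directly, so the charset can often be read from there before falling back to chardet. A minimal sketch, not part of the crawler itself; detect_and_decode is just an illustrative helper:

import urllib.request
import chardet   # the Python 3 port mentioned above

def detect_and_decode(url):
    # Try the charset declared in the HTTP response headers first.
    resp = urllib.request.urlopen(url)
    data = resp.read()
    charset = resp.headers.get_content_charset()   # None if the header declares no charset
    resp.close()
    if charset is None:
        # Fall back to statistical detection over the raw bytes.
        charset = chardet.detect(data)['encoding'] or 'utf-8'
    return data.decode(charset, 'ignore')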
On the whole MyHtmlParser is pretty badly written and could still be improved. I had lazily assumed HTMLParser would make things easy, but I still ended up relying on a lot of regular expressions.
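One concrete improvement (my suggestion, not something the script does): the hand-rolled prefixing in addHeader can be replaced by urllib.parse.urljoin from the standard library, which handles '/', './' and bare relative paths in one call. The base URL below is made up for illustration:

from urllib.parse import urljoin

base = 'http://www.xxxx.com/gallery/page1.html'   # the page the link was found on
print(urljoin(base, '/top.html'))       # http://www.xxxx.com/top.html
print(urljoin(base, './img/1.jpg'))     # http://www.xxxx.com/gallery/img/1.jpg
print(urljoin(base, 'next.html'))       # http://www.xxxx.com/gallery/next.html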
The full source is below:
The lines marked with '<-' can be adjusted as needed. If error 10054 (connection reset by peer) keeps showing up, the request rate is probably too fast.
# coding: utf-8
'''
Created on 2013-2-1

@author: 李鹏飞
@mailto: [email protected]

Environment: Python 3
'''
import re
import urllib.request
from html.parser import HTMLParser
from html.parser import HTMLParseError
import os
import threading
import time
import chardet


class MyHtmlParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.url = []
        self.img = []
        self.title = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for i in attrs:
                if i[0] == "href":
                    self.url.append(i[1])
        elif tag == "title":
            self.title = 1
        for i in attrs:
            if re.match(r'http://.+\.(jpg|jpeg|png)', str(i[1])):
                self.img.append(i[1])

    def handle_data(self, data):
        if self.title == 1:
            self.title = data
        findimg = re.findall(r'http://.+?\.jpg', data)
        for i in range(0, len(findimg)):
            findimg[i] = findimg[i]
        self.img += findimg

    def handle_startendtag(self, tag, attrs):
        if tag == "a":
            for i in attrs:
                if i[0] == "href":
                    self.url.append(i[1])
        for i in attrs:
            if re.match(r'http://.+\.(jpg|jpeg|png)', str(i[1])):
                self.img.append(i[1])


class ScratchFactory(threading.Thread):
    def __init__(self, url):
        threading.Thread.__init__(self)
        self.url = url
        self.tempImgs = []
        self.tempUrls = []
        self.title = []
        global seed
        match = re.search(seed + '.*/', url)
        if match:
            self.pwd = match.group()

    def addHeader(self, data):
        # prepend the site root / current directory to relative links
        global seed
        for i in range(0, len(data)):
            if re.match("http.+", data[i]) == None:
                if re.match("/.*", data[i]):
                    data[i] = seed + data[i]
                elif re.match(r'\./.*', data[i]):
                    data[i] = self.pwd + data[i][2:]
                else:
                    data[i] = self.pwd + data[i]
        return data

    def run(self):
        try:
            conect = urllib.request.urlopen(self.url)   # download the page data
            data = conect.read()
            conect.close()
            htmlx = MyHtmlParser()
            t = chardet.detect(data)                    # detect the HTML encoding
            if t['encoding']:
                charset = t['encoding']
            else:
                charset = 'utf-8'
            htmlx.feed(data.decode(charset, 'ignore'))
            self.title = htmlx.title
            self.tempUrls = self.addHeader(htmlx.url)   # turn relative links into absolute ones
            self.tempImgs = self.addHeader(htmlx.img)
            htmlx.close()
            self.clearData()                            # filter out useless links
            threading.Thread(target=self.saveImages, args=()).start()   # download the images
        except HTMLParseError as e:
            print("####Error : 1 ######:", e, '--->', self.url)
        except Exception as e:
            print("####Error : 2 ######:", e, '--->', self.url)

        global UrlSrc, UrlDiged, mLock
        mLock.acquire()
        t = []
        for temp in self.tempUrls:
            if not UrlDiged.__contains__(temp):
                t.append(temp)
        l = []
        for temp in t:
            if not UrlSrc.__contains__(temp):
                l.append(temp)
        UrlSrc += l
        mLock.release()

    def clearData(self):
        # remove duplicate links
        self.tempUrls = set(self.tempUrls)
        self.tempImgs = set(self.tempImgs)
        global seed
        t = []
        for temp in self.tempUrls:                  # <- link filter, regular expression
            if re.match(seed + "/.*", temp):
                t.append(temp)
        self.tempUrls = t
        t = []
        for temp in self.tempImgs:                  # <- image filter, regular expression
            if re.match(r".+\.(gif|jpg|png)", temp):
                t.append(temp)
        self.tempImgs = t
        self.title = re.split('(-|_)', self.title)[0]   # <- split the page title, keep the leading keyword
        # strip characters that are illegal in file names from the title
        self.title = self.title.replace(' ', '')
        self.title = self.title.replace('/', '')
        self.title = self.title.replace('\\', '')
        self.title = self.title.replace(':', '')
        self.title = self.title.replace('|', '')
        self.title = self.title.replace('?', '')
        self.title = self.title.replace('*', '')
        self.title = self.title.replace('<', '')
        self.title = self.title.replace('>', '')
        self.title = self.title.replace('\r', '')
        self.title = self.title.replace('\n', '')
        self.title = self.title.replace('\t', '')

    def save(self, path, url):
        global MinSize
        try:
            req = urllib.request.Request(url)
            req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1) "
                           "AppleWebKit/537.11 (KHTML, like Gecko) "
                           "Chrome/23.0.1271.64 Safari/537.11")
            req.add_header("Referer", self.url)     # some sites block hot-linking, so set the Referer ourselves
            conect = urllib.request.urlopen(req)
            t = conect.read()
            conect.close()
            if t.__len__() < MinSize:
                return
            if not os.path.exists(path):
                os.mkdir(path)
            f = open(path + "\\" + self.title +
                     time.strftime("%H%M%S", time.localtime()) + ".jpg", "wb")
            f.write(t)
            f.close()
        except HTMLParseError as e:
            print("####Error : 3 ######:", e, '--->', url)
        except Exception as e:
            print("####Error : 4 ######:", e, '--->', url)

    def saveImages(self):
        global IMG_TIME
        global SAVE_PATH
        if len(self.tempImgs) == 0:
            return
        path = SAVE_PATH + '\\' + self.title
        print("Download------->", self.title)
        while len(self.tempImgs) != 0:
            t = threading.Thread(target=self.save,
                                 args=(path, self.tempImgs.pop(0)))
            if len(self.tempImgs) != 0:
                t.start()
                time.sleep(IMG_TIME)
            else:
                t.start()
                t.join()
        if os.path.exists(path) and len(os.listdir(path)) == 0:
            os.rmdir(path)


def save():
    global mLock
    global UrlSrc
    #global ImgDiged
    #global iLock
    global SAVE_PATH
    mLock.acquire()
    #iLock.acquire()
    try:
        f = open(SAVE_PATH + r"\UrlDiged.txt", 'w')
        for i in UrlDiged:
            f.write(i + '\n')
        f.close()
        f = open(SAVE_PATH + r"\UrlSrc.txt", 'w')
        for i in UrlSrc:
            f.write(i + '\n')
        f.close()
        print("********************* Saved **********************")
    except Exception as e:
        print(e)
    finally:
        mLock.release()


def readBackup():
    global UrlDiged
    global UrlSrc
    try:
        f = open(SAVE_PATH + r"\UrlDiged.txt", 'r')
        while True:
            t = f.readline()
            if t == '':
                break
            t = t.replace('\n', '')
            UrlDiged.append(t)
        f.close()
        f = open(SAVE_PATH + r"\UrlSrc.txt", 'r')
        while True:
            t = f.readline()
            if t == '':
                break
            t = t.replace('\n', '')
            UrlSrc.append(t)
        f.close()
    except Exception as e:
        print(e)


#*****************************start********************************
if __name__ == '__main__':
    #timeout = 20
    #socket.setdefaulttimeout(timeout)
    seed = "http://www.xxxx.com/"   # <- root page of the target site
    SAVE_PATH = r"e:\scratch"       # <- save directory
    THREAD_NUM = 35                 # <- thread limit, to control download speed and avoid a DDoS-like effect
    SLEEP_TIME = 2.5                # <- interval between link requests (seconds); too fast is not necessarily good!
    MinSize = 72000                 # <- filter out small images, initially 32k
    IMG_TIME = 1.5                  # <- image download pacing, initially one every 1.5 seconds

    UrlSrc = [seed]                 # links collected but not yet visited
    UrlDiged = []                   # links already visited
    mLock = threading.Lock()        # lock protecting UrlSrc and UrlDiged
    savetime = time.time()

    if not os.path.exists(SAVE_PATH):
        os.mkdir(SAVE_PATH)
    if seed[-1:] == '/':
        seed = seed[:-1]

    # restore the state saved by the previous run
    if not os.path.exists(SAVE_PATH + r'\UrlDiged.txt'):
        try:
            f = open(SAVE_PATH + r'\UrlDiged.txt', 'w')
            f.close()
            f = open(SAVE_PATH + r'\UrlSrc.txt', 'w')
            f.close()
        except Exception as e:
            print(e)
    else:
        readBackup()

    while True:
        if len(threading.enumerate()) > THREAD_NUM:
            continue
        mLock.acquire()
        if UrlSrc.__len__():
            temp = UrlSrc.pop(0)
            t = ScratchFactory(temp)
            UrlDiged.append(temp)
            t.start()
        mLock.release()
        # print pending link count, thread count, and combined UrlSrc+UrlDiged length (in thousands)
        print("Connections:", UrlSrc.__len__(), "*****threads:",
              len(threading.enumerate()), "****TableLength:",
              (len(UrlSrc) + len(UrlDiged)) / 1000)
        if time.localtime().tm_min % 2 == 0 \
                and time.time() - savetime > 60:
            save()      # save the crawl state
            savetime = time.time()
        time.sleep(SLEEP_TIME)