批量下载网页上的图片需要三个步骤:
# Batch image downloader, organised as three steps:
#   1. collect the links found on a start page          (getPageLinks)
#   2. collect the <img> sources of each linked page     (getImgLinks)
#   3. download every image into the working directory   (loadImg)
from html.parser import HTMLParser
import http.client
import os
import uuid
import sys
import urllib.parse
import urllib.request

# Seconds to wait on a single network operation before giving up, so one
# unresponsive host cannot stall the whole run.
_FETCH_TIMEOUT = 30

# Failures treated as "this URL could not be processed": network and file
# errors (OSError, which includes urllib.error.URLError and timeouts),
# malformed or unsupported URLs (ValueError), and HTTP protocol errors.
_FETCH_ERRORS = (OSError, ValueError, http.client.HTTPException)


class _TagAttrCollector(HTMLParser):
    """Collect the distinct values of one attribute on one kind of tag.

    Subclasses set ``_wanted_tag`` and ``_wanted_attr``.  The values are
    stored, in document order and exactly as written in the markup, in the
    public list ``self.all``.
    """

    _wanted_tag = None
    _wanted_attr = None

    def __init__(self, strict=False):
        # ``strict`` is accepted only so existing callers keep working; it is
        # not forwarded because HTMLParser removed that argument in
        # Python 3.5 and passing it raises TypeError.
        HTMLParser.__init__(self)
        self.all = []

    def handle_starttag(self, tag, attrs):
        """Record the wanted attribute of a matching start tag."""
        if tag != self._wanted_tag:
            return
        for name, value in attrs:
            # Valueless attributes (``<a href>``) arrive as None and empty
            # ones as ''; neither is a usable URL, so both are skipped.
            if name == self._wanted_attr and value and value not in self.all:
                self.all.append(value)


# Step 1:
class PageLinkParser(_TagAttrCollector):
    """Collect the ``href`` values of ``<a>`` tags into ``self.all``."""

    _wanted_tag = 'a'
    _wanted_attr = 'href'


def _fetch_html(url):
    """Return ``(final_url, text)`` for the document at *url*.

    The body is decoded with the charset declared in the response headers,
    falling back to UTF-8; undecodable bytes are replaced rather than
    aborting the page.
    """
    with urllib.request.urlopen(url, timeout=_FETCH_TIMEOUT) as resp:
        final_url = resp.geturl()
        body = resp.read()
        charset = resp.headers.get_content_charset()
    try:
        text = body.decode(charset or 'utf-8', errors='replace')
    except LookupError:
        # The server declared a charset Python does not know.
        text = body.decode('utf-8', errors='replace')
    return final_url, text


def _collect(url, parser):
    """Feed the page at *url* to *parser*; return its values as absolute URLs."""
    base, text = _fetch_html(url)
    parser.feed(text)
    parser.close()
    # Relative references are resolved against the URL the page was actually
    # served from (after redirects) so that they can be fetched later.
    return [urllib.parse.urljoin(base, value) for value in parser.all]


def _unique(items):
    """Return *items* as a list without duplicates, keeping first-seen order."""
    seen = set()
    result = []
    for item in items:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result


def getPageLinks(url):
    """Return *url* followed by the distinct links found on that page.

    Only the start page is fetched; the discovered links are not crawled
    any further.  If the page cannot be fetched, the error is reported on
    stderr and the result is just ``[url]``.
    """
    print(url)
    try:
        links = _collect(url, PageLinkParser())
    except _FETCH_ERRORS as exc:
        print('error:', url, exc, file=sys.stderr)
        links = []
    return _unique([url] + links)


# Step 2:
class ImgLinkParser(_TagAttrCollector):
    """Collect the ``src`` values of ``<img>`` tags into ``self.all``."""

    _wanted_tag = 'img'
    _wanted_attr = 'src'


def getImgLinks(url):
    """Return the distinct image URLs referenced by the page at *url*.

    Best effort: if the page cannot be fetched, the error is reported on
    stderr and an empty list is returned.
    """
    try:
        return _unique(_collect(url, ImgLinkParser()))
    except _FETCH_ERRORS as exc:
        print('error:', url, exc, file=sys.stderr)
        return []


# Step 3:
def loadImg(l):
    """Download every URL in *l* into the current working directory.

    Each image is saved under a random UUID-based name so that names never
    collide; the extension is always ``.jpg`` regardless of the actual image
    type.  A URL that fails is reported with ``error:`` and skipped.
    """
    for raw in l:
        src = raw.strip()
        print(src)
        try:
            # Download first, so a failed request leaves no empty file behind.
            with urllib.request.urlopen(src, timeout=_FETCH_TIMEOUT) as resp:
                data = resp.read()
            # Use a UUID to avoid duplicate file names.
            target = os.path.join(os.getcwd(), uuid.uuid4().hex + '.jpg')
            with open(target, 'wb') as out:
                out.write(data)
        except _FETCH_ERRORS:
            print('error:', src)
            continue


# Usage:
if __name__ == '__main__':
    for i in getPageLinks('http://www.cnblogs.com/'):
        loadImg(getImgLinks(i))