"""Fetch a web page and print its HTML source re-encoded as UTF-8."""
import contextlib
import sys
import urllib

try:  # Python 2
    from urllib import urlopen
except ImportError:  # Python 3
    from urllib.request import urlopen


def fetch_as_utf8(url, source_encoding="gb2312"):
    """Return the body of *url* transcoded to UTF-8 bytes.

    Args:
        url: Address to read; anything ``urlopen`` accepts.
        source_encoding: Charset the page is served in.  The default matches
            what the original script hard-coded for Baidu; check the page's
            own declaration when pointing this at another site.

    Returns:
        The page body as UTF-8 encoded bytes.  Bytes that are not valid in
        *source_encoding* become U+FFFD instead of raising
        ``UnicodeDecodeError``.
    """
    # closing() releases the connection even when read() raises.
    with contextlib.closing(urlopen(url)) as response:
        raw = response.read()
    return raw.decode(source_encoding, "replace").encode("utf-8")


if __name__ == "__main__":
    url = "http://www.baidu.com"
    content = fetch_as_utf8(url)
    # Emit the UTF-8 bytes untouched: Python 3 exposes the byte stream as
    # sys.stdout.buffer, Python 2's sys.stdout already accepts bytes.
    out = getattr(sys.stdout, "buffer", sys.stdout)
    out.write(content + b"\n")
#!/usr/bin/python
#encoding=utf-8
"""Collect the anchors of an HTML document as a {link text: href} mapping.

Python 2 only: the ``htmllib`` module this script is built on does not exist
in Python 3.
"""
import htmllib
import urllib
import formatter
import string

# Disabled experiment kept from the original script:
#   import chardet, sys
#   type = sys.getdefaultencoding()


class GetLinks(htmllib.HTMLParser):
    """HTML parser that records every ``<a>`` element it sees.

    After ``feed()``/``close()``, ``links`` maps the visible text of each
    anchor to its ``href``.  Two anchors with the same text overwrite each
    other; the last one wins.
    """

    def __init__(self):
        self.links = {}    # anchor text -> href
        self.link = None   # href of the anchor currently being read
        # NullFormatter discards all rendering output; only the anchor
        # callbacks below are of interest.
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())

    def anchor_bgn(self, href, name, type):
        """Start capturing the text of an anchor and remember its target.

        The signature (including the ``type`` parameter name) is dictated by
        htmllib, which calls this hook for every ``<a>`` start tag.
        """
        self.save_bgn()
        self.link = href

    def anchor_end(self):
        """Store the captured text -> href pair when an anchor closes."""
        if self.savedata is None:
            # No capture is active (e.g. the closing tag of a nested or
            # unclosed <a>); save_end() would raise on the None buffer.
            return
        text = self.save_end().strip()
        href, self.link = self.link, None
        if href and text:
            self.links[text] = href


# To parse a live page instead of the sample below:
#   fp = urllib.urlopen("http://www.baidu.com")
#   data = fp.read()
#   fp.close()
data = '<html><head><title>test</title><body><a href="http: //www.163.com">链接到163</a><a href="http://www.focus.cn">焦点</a></body></html>'

linkdemo = GetLinks()
linkdemo.feed(data)
linkdemo.close()
for text, href in linkdemo.links.items():
    print("%s => %s" % (text, href))
输出结果:
焦点 => http://www.focus.cn
链接到163 => http: //www.163.com
再如:
# -*- coding: UTF-8 -*-
"""List the absolute (http...) links found on a Taobao category page.

Python 2 only: the ``htmllib`` module this script is built on does not exist
in Python 3.
"""
import contextlib
import htmllib
import urllib
import formatter
import string


class GetLinks(htmllib.HTMLParser):
    """HTML parser that records anchors whose ``href`` starts with ``http``.

    After ``feed()``/``close()``, ``links`` maps the visible text of each
    such anchor to its ``href``.  Anchors sharing the same text overwrite
    each other; the last one wins.
    """

    def __init__(self):
        self.links = {}    # anchor text -> href
        self.link = None   # href of the anchor currently being read
        # NullFormatter discards all rendering output.
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())

    def anchor_bgn(self, href, name, type):
        """Start capturing anchor text; keep the target only if absolute.

        The signature is dictated by htmllib, which calls this hook for
        every ``<a>`` start tag.
        """
        self.save_bgn()
        self.link = href if href.startswith('http') else None

    def anchor_end(self):
        """Store the captured text -> href pair when an anchor closes."""
        if self.savedata is None:
            # No capture is active (e.g. the closing tag of a nested or
            # unclosed <a>); save_end() would raise on the None buffer.
            return
        text = self.save_end().strip()
        href, self.link = self.link, None
        if href and text:
            self.links[text] = href


# closing() releases the connection even when read() raises.
with contextlib.closing(
        urllib.urlopen("http://list.taobao.com/browse/cat-0.htm")) as fp:
    data = fp.read()

linkdemo = GetLinks()
linkdemo.feed(data)
linkdemo.close()
for text, href in linkdemo.links.items():
    # The link text is transcoded from gb2312 for display; "replace" keeps a
    # character outside gb2312 from aborting the whole listing.
    text = text.decode('gb2312', 'replace').encode('utf-8')
    print("%s - %s" % (text, href))

# Result: a list of the links downloaded from Taobao, with entries such as
# "裤架 - http://ju.atpanel.com/?url=http://list.taobao.com/market/baihuo.htm?spm=1.47613.90750."
# -*- coding: UTF-8 -*-
"""Download every ``.jpg`` referenced by a set of Douban movie tag pages.

Each page is fetched and scanned on its own thread.  Runs on Python 2 and 3.

Created on 2012-10-09

@author: xing.gexing
"""
import contextlib
import errno
import os
import string
import sys
import threading
import time
import urllib

try:  # Python 2
    from HTMLParser import HTMLParser
    from urllib import urlopen
except ImportError:  # Python 3
    from html.parser import HTMLParser
    from urllib.request import urlopen

try:
    # Imported by the original script but not used below; htmllib does not
    # exist on Python 3, so the import is allowed to fail.
    import htmllib
    import formatter
except ImportError:
    pass

# Directory every downloaded image is written to.
IMAGE_DIR = "./image_douban"


def parallel(urls):
    """Run ``downloadFromURL`` for every URL at once and report the time.

    One thread is started per URL and all of them are joined before the
    elapsed wall-clock time is printed.

    Args:
        urls: Sequence of page URLs to scan for images.
    """
    started = time.time()
    workers = [
        MyThread(downloadFromURL, (url,), downloadFromURL.__name__)
        for url in urls
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    print('use time cost:%s' % (time.time() - started))


class MyThread(threading.Thread):
    """Thread that calls ``func(*args)`` when started."""

    def __init__(self, func, args, name=''):
        threading.Thread.__init__(self)
        self.name = name
        self.func = func
        self.args = args

    def run(self):
        self.func(*self.args)


def downloadFromURL(url):
    """Fetch *url*, find the image links in it and download each of them."""
    # closing() releases the connection even when read() raises.
    with contextlib.closing(urlopen(url)) as fp:
        data = fp.read()
    if not isinstance(data, str):
        # Python 3 returns bytes but HTMLParser.feed() needs text.
        # NOTE(review): assumes the pages are UTF-8 -- confirm; undecodable
        # bytes are replaced rather than aborting the page.
        data = data.decode("utf-8", "replace")
    hp = MyHTMLParser()
    hp.feed(data)
    hp.close()
    for image_url in hp.links:
        print(image_url)
        downloadImage(image_url)


def _ensure_image_dir():
    """Create IMAGE_DIR unless it already exists."""
    try:
        os.mkdir(IMAGE_DIR)
    except OSError as err:
        # Several download threads get here at the same time; losing the
        # race to create the directory is not an error.
        if err.errno != errno.EEXIST:
            raise


def downloadImage(imageUrl):
    """Save the image at *imageUrl* into IMAGE_DIR.

    The file is named after the last path segment of the URL.  When the
    directory cannot be created a message is printed and ``SystemExit`` is
    raised, which ends the calling thread.
    """
    try:
        _ensure_image_dir()
    except OSError:
        print("Failed to create directory in %s" % IMAGE_DIR)
        sys.exit()
    filename = imageUrl.split('/')[-1]
    path = IMAGE_DIR + "/" + filename
    with contextlib.closing(urlopen(imageUrl)) as response:
        data = response.read()
    with open(path, "wb") as f:
        f.write(data)


class MyHTMLParser(HTMLParser):
    """HTML parser collecting absolute ``.jpg`` URLs from ``src`` attributes.

    After ``feed()``, ``links`` holds, in document order, every ``src`` value
    that starts with ``http`` and ends with ``.jpg``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        for name, value in attrs:
            # value is None for an attribute written without "=...".
            if (name == "src" and value
                    and value.startswith("http") and value.endswith(".jpg")):
                self.links.append(value)


if __name__ == "__main__":
    # Other entry pages that were tried:
    #   http://image.baidu.com/i?ct=201326592&cl=2&lm=-1&tn=baiduimage&pv=&word=car&z=5
    #   http://image.baidu.com
    #   http://movie.douban.com/
    # Download the Douban movie pictures of one tag, 20 entries per page.
    # NOTE(review): counting starts at 1, so the page with start=0 is never
    # requested -- confirm that skipping it is intended.
    base = 20
    urls = []
    for count in range(1, 101):
        urls.append("http://movie.douban.com/tag/%E6%83%8A%E6%82%9A?start="
                    + str(base * count) + "&type=T")
    parallel(urls)
需要特别注意的是对于百度图片的处理:搜索的关键词是其中的word,注意替换。
百度图片搜索的第1页(包含20张图片):http://image.baidu.com/i?z=3&fr=&cl=2&ct=201326592&lm=-1&rn=20&tn=baiduimagenojs&s=0&word=%C6%FB%B3%B5&pn=0
百度图片搜索的第2页(包含20张图片):http://image.baidu.com/i?z=3&fr=&cl=2&ct=201326592&lm=-1&rn=20&tn=baiduimagenojs&s=0&word=%C6%FB%B3%B5&pn=20
...
对于其中每一页,每张图片都有个这样的后缀:/i?ct=503316480&z=3&tn=baiduimagedetailnojs&word=%C6%FB%B3%B5&cl=2&lm=-1&pn=20&rn=1&di=36978446751&ln=1987,所以一共20个,查找i?ct进行匹配即可。
将这个后缀与百度图片地址http://image.baidu.com拼接即可得到该图片源的网页:http://image.baidu.com/i?ct=503316480&z=3&tn=baiduimagedetailnojs&word=%C6%FB%B3%B5&cl=2&lm=-1&pn=20&rn=1&di=36978446751&ln=1987
在该网页中匹配img src即可找到图片绝对路径。
# -*- coding: UTF-8 -*-
"""Download the first N pictures Baidu image search returns for a keyword.

Runs on Python 2 and 3.
"""
import contextlib
import os
import sys
import urllib

try:  # Python 2
    from urllib import quote, urlopen
except ImportError:  # Python 3
    from urllib.parse import quote
    from urllib.request import urlopen

docString = '''
Created on 2012-10-10

@author: xing.gexing
'''

BAIDU_HOST = "http://image.baidu.com/"
DEFAULT_SAVE_DIR = "./baiduPic"

# The result page is handled as gb2312 bytes.  The total count sits between
# one of these markers (the longer one is tried first) and the suffix.
_TOTAL_MARKERS = (
    u"找到相关图片约".encode("gb2312"),
    u"找到相关图片".encode("gb2312"),
)
_TOTAL_SUFFIX = u"张".encode("gb2312")


def _fetch(url):
    """Return the raw body of *url*, always closing the connection."""
    with contextlib.closing(urlopen(url)) as response:
        return response.read()


def _to_text(raw):
    """Return *raw* as a native ``str`` (bytes are decoded on Python 3)."""
    if isinstance(raw, str):
        return raw
    return raw.decode("ascii", "ignore")


def _ensure_dir(path):
    """Create directory *path* if it does not exist yet."""
    if not os.path.isdir(path):
        os.makedirs(path)


def _parse_total(page):
    """Return the number of pictures announced on the first result page.

    Raises:
        ValueError: if no marker/suffix pair is found or the text between
            them is not a number.
    """
    for marker in _TOTAL_MARKERS:
        pos = page.find(marker)
        if pos == -1:
            continue
        start = pos + len(marker)
        end = page.find(_TOTAL_SUFFIX, start)
        if end == -1:
            break
        return int(page[start:end].replace(b",", b""))
    raise ValueError("total picture count not found in the result page")


def _values_after(page, needle, skip):
    """Yield the bytes from *skip* past each *needle* up to the next '"'."""
    end = 0
    while True:
        start = page.find(needle, end)
        if start < 0:
            return
        start += skip
        end = page.find(b'"', start)
        if end < 0:
            return
        yield page[start:end]


def baidu(imgsum, findstr, save_dir=None):
    """Download up to *imgsum* ``.jpg`` pictures matching *findstr*.

    Every result page (20 hits each) is scanned for links starting with
    ``/i?ct``; each linked detail page is then scanned for ``<img src="...">``
    values ending in ``.jpg``, which are saved as ``1.jpg``, ``2.jpg``, ...

    Args:
        imgsum: Number of pictures wanted; an int or a string of digits.
        findstr: Search keyword, as text or UTF-8 encoded bytes.
        save_dir: Directory to write into; defaults to ``DEFAULT_SAVE_DIR``.
            It is created when missing.

    Returns:
        1 once *imgsum* pictures have been saved; 0 when *findstr* is empty
        or the results run out first.

    Raises:
        ValueError: if *imgsum* is not a number or the result page does not
            contain the total picture count.
    """
    if findstr == "":
        return 0
    wanted = int(imgsum)
    if save_dir is None:
        save_dir = DEFAULT_SAVE_DIR
    _ensure_dir(save_dir)

    # Baidu expects the keyword percent-encoded from its gb2312 bytes.
    if isinstance(findstr, bytes):
        findstr = findstr.decode("utf8")
    query = quote(findstr.encode("gb2312"))
    url = "http://image.baidu.com/i?z=3&fr=&cl=2&ct=201326592&lm=-1&rn=20&tn=baiduimagenojs&s=0&word=%s&pn=" % query

    total = _parse_total(_fetch(url + "0"))
    print("you have found %d pics in baiduImage" % total)

    saved = 0  # pictures written so far; also the last file number used
    for page in range(total // 20 + 1):
        listing = _fetch(url + "%s" % (page * 20))
        # skip=10 is len('<a href="/'): the leading slash is dropped because
        # BAIDU_HOST already ends with one.
        for detail_path in _values_after(listing, b'<a href="/i?ct', 10):
            detail = _fetch(BAIDU_HOST + _to_text(detail_path))
            for img in _values_after(detail, b'<img src="', 10):
                if not img.endswith(b".jpg"):
                    continue
                if b"img-jg.gif" in img:
                    continue
                imgurl = _to_text(img)
                print("downloading pic %s from %s" % (saved + 1, imgurl))
                try:
                    data = _fetch(imgurl)
                except Exception:
                    # Best effort: give up on this detail page and move on.
                    print("lost 1 pic")
                    break
                saved += 1
                # Binary mode: text mode would corrupt JPEG data on
                # platforms that translate line endings.
                with open("%s/%d.jpg" % (save_dir, saved), "wb") as f:
                    f.write(data)
                if saved == wanted:
                    print("finish download %s pics" % saved)
                    return 1
    return 0


if __name__ == "__main__":
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input

    print(docString)
    print("config your downloading arguments:")
    findstr = read_line("search:")
    if findstr == "":
        findstr = "汽车"
    imgsum = read_line("num:")
    if imgsum == "":
        imgsum = 10
    try:
        _ensure_dir(DEFAULT_SAVE_DIR)
    except OSError:
        print("Failed to create directory in linux:")
        sys.exit()
    print("config OK!")
    baidu(imgsum, findstr, DEFAULT_SAVE_DIR)