import urllib
if __name__=="__main__":
url = "http://www.baidu.com"
#根据url读取html源码
content = urllib.urlopen(url).read()
#转为中文可读,可以直接查看当前html源文件是什么编码格式,百度的是gb2312
content = content.decode("gb2312").encode("utf-8")
print content
#!/usr/bin/python
#encoding=utf-8
import htmllib,urllib,formatter,string
'''
import chardet,sys
type = sys.getdefaultencoding()
'''
class GetLinks(htmllib.HTMLParser): # inherit from htmllib's HTMLParser
    """Collect [anchor text -> href] pairs from an HTML document into self.links."""
    def __init__(self):
        self.links = {}  # maps anchor text -> href
        self.link = None  # fix: defined even if anchor_end fires before any anchor_bgn
        f = formatter.NullFormatter()  # discard formatting; we only want parse events
        htmllib.HTMLParser.__init__(self, f)
    def anchor_bgn(self, href, name, type):
        # Called when an <a ...> tag opens: start buffering its inner text.
        self.save_bgn()
        self.link = href
    def anchor_end(self):
        # Called at </a>: keep the buffered text, strip surrounding whitespace.
        text = string.strip(self.save_end())
        if self.link and text:
            self.links[text] = self.link  # last href wins for duplicate text
#fp = urllib.urlopen("http://www.baidu.com") #打开指定的URL
#data = fp.read()
#fp.close()
data = 'test 链接到163焦点'
linkdemo = GetLinks() #实例化一个LinkDemo对象
linkdemo.feed(data) #给HTMLParser喂食
linkdemo.close()
for href, link in linkdemo.links.items(): #打印相关的信息
print href, "=>", link
焦点 => http://www.focus.cn
链接到163 => http://www.163.com
再如:
# -* - coding: UTF-8 -* -
import htmllib, urllib, formatter, string
class GetLinks(htmllib.HTMLParser):
    """Collect [anchor text -> href] pairs, keeping absolute http links only."""
    def __init__(self):
        self.links = {}  # anchor text -> href
        self.link = None  # fix: avoid AttributeError if a stray </a> arrives first
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f)
    def anchor_bgn(self, href, name, type):
        # <a ...> opened: buffer its text; remember only absolute http targets.
        self.save_bgn()
        if href.startswith('http'):
            self.link = href
        else:
            self.link = None  # relative / non-http links are ignored
    def anchor_end(self):
        # </a> reached: record text -> href when both are non-empty.
        text = string.strip(self.save_end())
        if self.link and text:
            self.links[text] = self.link
fp = urllib.urlopen("http://list.taobao.com/browse/cat-0.htm")
data = fp.read()
fp.close()
linkdemo = GetLinks()
linkdemo.feed(data)
linkdemo.close()
for href, link in linkdemo.links.items():
href = href.decode('gb2312').encode('utf-8')
print href, '-', link
pass
结果是下载到的淘宝“裤架 - http://ju.atpanel.com/?url=http://list.taobao.com/market/baihuo.htm?spm=1.47613.90750.”这样的列表
# -* - coding: UTF-8 -* -
from HTMLParser import HTMLParser
import htmllib,urllib,formatter,string
import os,sys,time
import threading
'''
Created on 2012-10-09
@author: xing.gexing
'''
#建立线程池,并启动线程直到结束
def parallel(urls):
startTime = time.time()
threads=[]
counts = range(len(urls))
for i in counts:
t=MyThread(downloadFromURL,(urls[i],),downloadFromURL.__name__)
threads.append(t)
for i in counts:
threads[i].start()
for i in counts:
threads[i].join()
print 'use time cost:%s'%(time.time()-startTime)
#custom thread class: runs func(*args) on start()
class MyThread(threading.Thread):
    """Thread wrapper that calls func(*args) when run.

    func -- callable to execute in the thread
    args -- tuple of positional arguments for func
    name -- optional thread name
    """
    def __init__(self,func,args,name=''):
        threading.Thread.__init__(self)
        self.name=name
        self.func=func
        self.args=args
    def run(self):
        # fix: apply() is deprecated in Py2 and removed in Py3;
        # a direct star-call is equivalent in both.
        self.func(*self.args)
#fetch the page at url, extract image links, and download each one
def downloadFromURL(url):
    """Read the document at url, parse it for .jpg links, and download every image found."""
    fp = urllib.urlopen(url)
    data = fp.read()
    fp.close()
    parser = MyHTMLParser()
    parser.feed(data)
    parser.close()
    for imageUrl in parser.links:
        print(imageUrl)
        downloadImage(imageUrl)
#根绝imageUrl下载图片到本地
def downloadImage(imageUrl):
dir = "./image_douban"
try:
if not os.path.exists(dir):
os.mkdir(dir)
except:
print "Failed to create directory in %s"%dir
exit()
image = imageUrl.split('/')[-1]
path = dir+"/"+image
data = urllib.urlopen(imageUrl).read()
f = file(path,"wb")
f.write(data)
f.close()
#HTML parser whose whole job is done in handle_starttag
class MyHTMLParser(HTMLParser):
    """Collect absolute .jpg URLs found in src attributes into self.links."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []  # every matching image URL, in document order
    def handle_starttag(self, tag, attrs):
        # Keep src values that are absolute http links ending in .jpg.
        for attr_name, attr_value in attrs:
            if attr_name == "src" and attr_value[:4] == "http" and attr_value[-4:] == ".jpg":
                self.links.append(attr_value)
if __name__ == "__main__":
    # Example image-bearing URLs kept for reference:
    #url2 = "http://image.baidu.com/i?ct=201326592&cl=2&lm=-1&tn=baiduimage&pv=&word=car&z=5"
    #url = "http://image.baidu.com"
    #url = "http://movie.douban.com/"
    # Download Douban movie posters: 100 listing pages, 20 items per page.
    # (fix: removed the unused leftover 'html' sample variable; replaced the
    # manual while/append counter with a comprehension building the same list)
    base = 20
    urls = ["http://movie.douban.com/tag/%E6%83%8A%E6%82%9A?start="+str(base*count)+"&type=T"
            for count in range(1, 101)]
    parallel(urls)
需要特别注意的是对于百度图片的处理:搜索的关键词是其中的word,注意替换。
百度图片搜索的第1页(包含20张图片):http://image.baidu.com/i?z=3&fr=&cl=2&ct=201326592&lm=-1&rn=20&tn=baiduimagenojs&s=0&word=%C6%FB%B3%B5&pn=0
百度图片搜索的第2页(包含20张图片):http://image.baidu.com/i?z=3&fr=&cl=2&ct=201326592&lm=-1&rn=20&tn=baiduimagenojs&s=0&word=%C6%FB%B3%B5&pn=20
...
对于其中每一页,每张图片都有个这样的后缀:/i?ct=503316480&z=3&tn=baiduimagedetailnojs&word=%C6%FB%B3%B5&cl=2&lm=-1&pn=20&rn=1&di=36978446751&ln=1987,所以一共20个,查找i?ct进行匹配即可。
将这个后缀与百度图片地址http://image.baidu.com拼接即可得到该图片源的网页:http://image.baidu.com/i?ct=503316480&z=3&tn=baiduimagedetailnojs&word=%C6%FB%B3%B5&cl=2&lm=-1&pn=20&rn=1&di=36978446751&ln=1987
在该网页中匹配img src即可找到图片绝对路径。
# -* - coding: UTF-8 -* -
import os,sys,urllib
docString='''
Created on 2012-10-10
@author: xing.gexing
'''
def baidu(imgsum,findstr):
gbstr=("找到相关图片约".decode("utf8")).encode("gb2312")
gbstr2=("找到相关图片".decode("utf8")).encode("gb2312")
gbstr3=("张".decode("utf8").encode("gb2312"))
if findstr=="":
return 0
findstr=(findstr.decode("utf8")).encode("gb2312")
findstr=urllib.quote(findstr)
url="http://image.baidu.com/i?z=3&fr=&cl=2&ct=201326592&lm=-1&rn=20&tn=baiduimagenojs&s=0&word=%s&pn="%findstr
webfile=urllib.urlopen(url+"0").read()
start=webfile.find(gbstr)
if start==-1:
start=webfile.find(gbstr2)
start=start+12
else:
start=start+14
end=webfile.find(gbstr3,start)
sum=webfile[start:end]
sum=sum.replace(",","")
sum=int(sum) #总图片数
sumpage=sum/20+1 #总页数
print "you have found %d pics in baiduImage"%sum
i=0 #下载的图片数
for page in range(sumpage):
p_url=url+"%s"%(page*20) #当前页url
webfile=urllib.urlopen(p_url).read()
i_start = 0
i_end = 0
while True:
i_start=webfile.find('''