用 Python 抓取百度图片

先用 Firebug 分析请求图片的链接，拿到 URL。抓到的请求如下：

GET /channel/listjson?fr=channel&tag1=%E7%BE%8E%E5%A5%B3&tag2=%E5%B0%8F%E6%B8%85%E6%96%B0&sorttype=0&pn=30&rn=60&ie=utf8&oe=utf-8&1380172568359 HTTP/1.1
Host: image.baidu.com
User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0
Accept: */*
Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3
Accept-Encoding: gzip, deflate
X-Requested-With: XMLHttpRequest
Referer: http://image.baidu.com/channel
Cookie: BAIDUID=67B863A46926A8538FAA24A48EBD753D:FG=1; SSUDBTSP=1373611768; SSUDB=3V4YW5LZ3dGU202UTBTcFB5VmtCTkdRTDdqaHZVSFhEYzkwS2kwTXY2fjRNd2RTQVFBQUFBJCQAAAAAAAAAAAEAAADCdwQJaHVhODkxMTE4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPim31H4pt9RZ; MCITY=-%3A; BDUSS=3V4YW5LZ3dGU202UTBTcFB5VmtCTkdRTDdqaHZVSFhEYzkwS2kwTXY2fjRNd2RTQVFBQUFBJCQAAAAAAAAAAAEAAADCdwQJaHVhODkxMTE4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPim31H4pt9RZ; BAIDU_WISE_UID=68CEB3BE25F99C3F7250C9D764F65A02; userid=k21u44gs5; Hm_lvt_737dbb498415dd39d8abf5bc2404b290=1380162231,1380170431; H_PS_PSSID=; Hm_lpvt_737dbb498415dd39d8abf5bc2404b290=1380172544; the_nav_width=1479
Connection: keep-alive

#!/usr/bin/python
#coding=utf-8
import urllib,json,socket
import random,os
import sys,datetime

# Download pictures listed by Baidu's image "channel" JSON endpoint into
# PIC_DIR, moving what has been downloaded into zip archives under BASE_DIR
# every ZIP_EVERY pages and once more at the end of the run.
#
# An earlier variant of this script queried
# http://image.baidu.com/i?tn=listjson&word=liulan&... instead of the
# /channel/listjson endpoint used below.

BASE_DIR = '/root/test/'
PIC_DIR = BASE_DIR + 'pic/'

MAX_PAGES = 10   # number of list requests issued per run
ZIP_EVERY = 2    # archive the download directory every N pages
# NOTE(review): the offset advances by 30 while each request asks for rn=60
# entries, so consecutive pages presumably overlap by half -- confirm against
# the API before changing the step.
PAGE_STEP = 30

LIST_URL_HEAD = ('http://image.baidu.com/channel/listjson?fr=channel'
                 '&tag1=%E7%BE%8E%E5%A5%B3&tag2=%E5%B0%8F%E6%B8%85%E6%96%B0'
                 '&sorttype=0&pn=')
LIST_URL_TAIL = '&rn=60&ie=utf8&oe=utf-8&'

# Extensions kept as-is (compared case-insensitively); any other name gets
# '.jpg' appended.
KEEP_EXTENSIONS = ('.png', '.jpg')

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    urlopen = urllib.urlopen  # Python 2


def fetch(url):
    """Return the raw response body of *url*.

    Network failures (including socket timeouts) propagate as IOError.
    The response object is always closed, even when reading fails.
    """
    response = urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def image_filename(url, dest_dir=PIC_DIR):
    """Return the local path under *dest_dir* used to store the image at *url*.

    The query string and fragment are dropped so they do not end up in the
    file name. Names that do not end in one of KEEP_EXTENSIONS get '.jpg'
    appended.
    """
    path = url.split('?', 1)[0].split('#', 1)[0]
    name = os.path.basename(path)
    if os.path.splitext(name)[1].lower() not in KEEP_EXTENSIONS:
        name += '.jpg'
    return dest_dir + name


def has_pictures():
    """Return True when PIC_DIR contains at least one entry."""
    return bool(os.listdir(PIC_DIR))


def archive_pictures(zipname):
    """Move everything in PIC_DIR into BASE_DIR/<zipname>.

    Returns True when the external ``zip`` command exits with status 0.
    ``-m`` makes zip delete the source files after archiving them.
    """
    command = 'zip -6qrm ' + BASE_DIR + zipname + ' ' + PIC_DIR + '*'
    return os.system(command) == 0


def save_images(entries, page, saved):
    """Download every entry's ``obj_url`` and store it in PIC_DIR.

    *entries* is the ``data`` list of one listing response, *page* is the
    page index (used only in progress messages) and *saved* is the running
    1-based counter of successfully stored images. Returns the updated
    counter.

    A failure on one image is reported and skipped; it no longer discards
    the remaining entries of the page.
    """
    for entry in entries:
        # Entries may be empty or lack an 'obj_url'; skip those.
        image_url = entry.get('obj_url') if entry else None
        if not image_url:
            continue
        try:
            payload = fetch(image_url)
        except (IOError, ValueError):
            # ValueError covers malformed URLs rejected by urlopen.
            print('img %s_%s is false2' % (page, saved))
            continue
        try:
            # Image data is binary: 'wb' prevents newline translation and
            # the with-block closes the file even if the write fails.
            with open(image_filename(image_url), 'wb') as out:
                out.write(payload)
        except IOError:
            print('img %s_%s is false3' % (page, saved))
            continue
        print('img %s_%s is ok' % (page, saved))
        saved += 1
    return saved


def main():
    """Fetch up to MAX_PAGES listing pages and download their images."""
    started = datetime.datetime.now()
    socket.setdefaulttimeout(10)
    if not os.path.isdir(PIC_DIR):
        # makedirs also creates BASE_DIR when it is missing.
        os.makedirs(PIC_DIR)

    page = 0
    saved = 1
    while page < MAX_PAGES:
        # Periodically move downloaded files into an archive; skipped when
        # there is nothing to archive (e.g. on the very first page).
        if page % ZIP_EVERY == 0 and has_pictures():
            zipname = 'baiduzip_' + str(page) + '.zip'
            print('make a zip file')
            if archive_pictures(zipname):
                print(zipname + ' file is ok!')
            else:
                print(zipname + ' file is failed')

        # The random suffix acts as a cache-buster for the list request.
        url = (LIST_URL_HEAD + str(PAGE_STEP * page) + LIST_URL_TAIL
               + str(random.random()))
        print(url)
        try:
            listing = json.loads(fetch(url))
        except (IOError, ValueError):
            # Network failure or a body that is not valid JSON: stop paging
            # but still archive what was downloaded so far.
            print('img %s_%s is false1' % (page, saved))
            break

        entries = listing.get('data') if isinstance(listing, dict) else None
        if not entries:
            break
        saved = save_images(entries, page, saved)
        page += 1

    finished = datetime.datetime.now()
    print((finished - started).seconds)
    if has_pictures():
        archive_pictures('pic_' + str(page) + '.zip')


if __name__ == '__main__':
    main()
    sys.exit()






你可能感兴趣的:(抓取,python)