First, note that some of the code found online uses this URL:
r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word={}'.format(word)
# this is the old flip (page-by-page) version
(The address bar no longer seems to show this form; it is apparently what Baidu Images used before its page structure changed.)
Other code uses the URL below:
url="https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=\
&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word={}&pn={}".format(word, page)
(This is the URL shown in the address bar.)
The first URL is the old one and the second is the current one. Both can be used for crawling, but the way the result pages are traversed differs.
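To make the difference concrete, here is a minimal sketch (my own illustration, not part of the scripts below, and with the parameter lists shortened): with the old flip URL only the keyword is filled in and the next page is found by following the "下一页" link in the returned HTML, while with the current index URL both the keyword and the pn parameter are filled in, so paging is done by changing pn.

from urllib.parse import quote

word = quote('成都', encoding='utf-8')

# Old flip URL: only the keyword is substituted; pagination relies on
# parsing the "下一页" (next page) link out of each result page.
flip_url = ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8'
            '&word={}').format(word)

# Current index URL: keyword plus pn; pagination is done by changing pn
# in a loop, no link parsing needed.
index_url = ('https://image.baidu.com/search/index?tn=baiduimage&ie=utf-8'
             '&word={}&pn={}').format(word, 1)

print(flip_url)
print(index_url)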
For the second URL (the code below): first type the kind of image you want to batch-download into Baidu Images; when the search keyword changes, only the value after word in the address bar changes. The script then walks through all result pages in turn and downloads the images. (With this approach, some images apparently fail to download.)
# coding=utf-8
import os
import re
import time
import urllib.request
from urllib.parse import quote

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
    "referer": "https://image.baidu.com"
}


def generate_path(path, word):
    # Create the folder for this keyword if it does not exist yet.
    if not os.path.exists(path + '/' + word):
        os.mkdir(path + '/' + word)


def download_pic(path, pic_url, page):
    num = 0
    for string in pic_url:
        ID = (page - 1) * 30 + num  # running index across all pages
        num += 1
        time.sleep(1)  # slow down to reduce the chance of being blocked
        try:
            print("downloading " + string)
            f_req = urllib.request.Request(string, headers=headers)
            f_url = urllib.request.urlopen(f_req).read()
            generate_path(path, wd)
            # use the running index ID in the file name so that later pages
            # do not overwrite the images downloaded from earlier pages
            fs = open(path + "/" + wd + '/' + str(ID) + ".jpg", "wb+")
            fs.write(f_url)
            fs.close()
            print(ID, "downloaded successfully")
        except Exception as e:
            print(ID, "download failed")
            continue


path = r'F:\国科大\pachong\pic'
wd = '成都'
word = quote(wd, encoding="utf-8")
page = 1
pic_urls = []
while True:
    print(page)
    url = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=\
&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word={}&pn={}".format(word, page)
    # req = urllib.request.Request(url, headers=headers)
    # f = urllib.request.urlopen(req).read().decode("utf-8")
    data = requests.get(url, timeout=500)
    data.encoding = 'utf-8'
    da = data.text
    pic_url = re.findall('"objURL":"(.*?)",', da, re.S)
    if pic_url == []:
        break
    else:
        download_pic(path, pic_url, page)
    page += 1
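The key step in this script is the regular expression that pulls the objURL fields out of the returned page source. As a quick self-contained check of what it matches, here is a made-up fragment shaped like the real response (the two image URLs are placeholders, not real Baidu data):

import re

# A made-up fragment shaped like the JSON embedded in the Baidu results page.
sample = '"objURL":"http://example.com/a.jpg","fromURL":"x","objURL":"http://example.com/b.jpg",'

pic_url = re.findall('"objURL":"(.*?)",', sample, re.S)
print(pic_url)  # ['http://example.com/a.jpg', 'http://example.com/b.jpg']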
Code for the first URL (the old flip version):
import os
import re
import sys
import urllib.parse

import requests


def fanye(onepageurl):
    """Collect all image urls on one flip page, plus the url of the next flip page."""
    if not onepageurl:
        print('Reached the last page, stopping.')
        return [], ''
    try:
        html = requests.get(onepageurl)
        html.encoding = 'utf-8'
        html = html.text
    except Exception as e:
        print(e)
        pic_urls = []
        fanye_url = ''
        return pic_urls, fanye_url
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    # locate the "下一页" (next page) link in the flip version of the results page
    fanye_urls = re.findall(re.compile(r'<a href="(.*)" class="n">下一页</a>'), html, flags=0)
    fanye_url = 'http://image.baidu.com' + fanye_urls[0] if fanye_urls else ''
    return pic_urls, fanye_url


def down_pic(pic_urls):
    """Download every image in the given list of urls."""
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            string = str(i + 1) + '.jpg'
            os.chdir(r'F:\国科大\pachong\pic\fanye')
            with open(string, 'wb') as f:
                f.write(pic.content)
            print('Downloaded image %s successfully' % (str(i + 1)))
        except Exception as e:
            print('Failed to download image %s:' % (str(i + 1)))
            print(e)
            continue


if __name__ == '__main__':
    word = '成都'
    url = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
    url_init = url + urllib.parse.quote(word)
    pic_urls = []
    page_url, fanye_url = fanye(url_init)
    pic_urls.extend(page_url)
    fanye_count = 0  # number of flip pages visited so far
    while 1:
        page_url, fanye_url = fanye(fanye_url)
        fanye_count += 1
        # print('page %s' % str(fanye_count))
        if fanye_url == '' and page_url == []:
            break
        pic_urls.extend(page_url)
    down_pic(pic_urls)
Because of Baidu's anti-crawling measures, errors may still occur while the code is running.
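One way to make those failures less disruptive (a sketch of my own, not part of the scripts above; get_with_retry is a hypothetical helper) is to combine the measures the scripts already use, a browser-like User-Agent/Referer header and a pause between requests, with a simple retry loop so that a single failed request does not abort the whole crawl:

import time
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
    "Referer": "https://image.baidu.com"
}

def get_with_retry(url, retries=3, delay=2):
    # Try the request a few times, pausing between attempts; return None
    # after the last failure instead of raising, so the caller can skip it.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=15)
            resp.raise_for_status()
            return resp
        except Exception as e:
            print('attempt %d failed: %s' % (attempt + 1, e))
            time.sleep(delay)
    return None

# usage: resp = get_with_retry(pic_url); skip the image if resp is None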