从这个网址学习的:https://www.bilibili.com/read/cv10367703/
------------------------------------------------------------------------
https://movie.douban.com/celebrity/1011562/photos/
打开页面F12进入开发者工具,查看 下载的页数,以及每页最多显示30张
a_list=content.find_all('div',attrs={'class','cover'}) #获取网页中的所有a标签对象 picture_list = [] for d in a_list: plist=d.find('img')['src'] picture_list.append(plist)
获取(共348张)并根据正则表达式获取纯数字:348
clist = content.find('span', attrs={'class', 'count'}) # 获取 ret= re.findall(r'\d+', clist.get_text())
写得太痛苦,直接放完整代码,自行看看理解就是了:
import re
import time
import requests
import os
from bs4 import BeautifulSoup
import lxml
#requests.getj时一定要加headers,否则会获取为空,只需要保留'User-Agent'一项即可
headers={
# 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
# 'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
# 'Cache-Control':'max-age=0',
# 'Connection':'keep-alive',
# 'Cookie':'ll="118254"; bid=bzf7LGz3pZA; _vwo_uuid_v2=DB12523A0B0C7127645E914A1FB363352|3d83981785084d997d7462a2ce24a947; __utmz=223695111.1626234491.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; douban-fav-remind=1; __utmz=30149280.1629095213.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1629168071%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D71zldwjBiMBa-xfexgVZ43eTQq2n8KKtTWTsWh37m72e_lfEOE1x3NuDj6egeYBLyqGE4gjSJnbxueQLcYZWsq%26wd%3D%26eqid%3Ddb6736ec000219350000000660ee5e6f%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1335619985.1616046306.1629095213.1629168072.4; __utmc=30149280; __utmb=30149280.1.10.1629168072; __utma=223695111.444014824.1616046306.1626234491.1629168075.3; __utmb=223695111.0.10.1629168075; __utmc=223695111; _pk_id.100001.4cf6=fa72408676bee41c.1616046306.3.1629168230.1626234491.',
# 'Host':'movie.douban.com',
# 'sec-ch-ua':'" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
# 'sec-ch-ua-mobile':'?0',
# 'Sec-Fetch-Dest':'document',
# 'Sec-Fetch-Mode':'navigate',
# 'Sec-Fetch-Site':'none',
# 'Sec-Fetch-User':'?1',
# 'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}
# 设置post的数据
data = {}
def get_poster_url(res):
def getCount(id,session):
url = 'https://movie.douban.com/celebrity/{0}/photos/'.format(id)
#res = requests.get(url=url, headers=headers)
res = session.get(url=url,data=data, headers=headers)
content = BeautifulSoup(res.text, 'lxml')
#clists = content.find_all('span', attrs={'class', 'count'}) # 获取
#[span.get_text() for span in clists]
clist = content.find('span', attrs={'class', 'count'}) # 获取
ret= re.findall(r'\d+', clist.get_text())
if len(ret)>0:
return [res,int(ret[0])]
else:
return [res,0]
def fire(mc,id,session):
res,pagenums=getCount(id,session)
if pagenums==0:
return
page=0
for i in range(0,pagenums,30):
#print("\n开始爬取山口百惠{}页\n".format(page))
#url='https://movie.douban.com/celebrity/1014823/photos/?type=C&start={0}&sortby=like&size=a&subtype=a'.format(i)
#print("开始爬取刘涛{}页\n".format(page))
#url='https://movie.douban.com/celebrity/1011562/photos/?type=C&start={0}&sortby=like&size=a&subtype=a'.format(i)
print("\n开始爬取{0}{1}页:{2}~{3}张/共{4}张\n".format(mc,page+1,page*30+1,(page+1)*30,pagenums))
url='https://movie.douban.com/celebrity/{0}/photos/?type=C&start={1}&sortby=like&size=a&subtype=a'.format(id,i)
if i>0:
res = session.post(url=url,data=data, headers=headers)
#res=requests.get(url=url,headers=headers)
piclist=get_poster_url(res)
download_picture(piclist,session)
page=page+1
time.sleep(1)
def download_picture(pic_l,session):
if not os.path.exists(r'picture'):
os.mkdir(r'picture')
for i in pic_l:
print("\r开始下载图片{0}".format(i))
#pic=requests.get(i)
pic=session.get(i)
p_name=i.split('/')[7]
with open('picture\\'+p_name,'wb') as f:
f.write(pic.content)
mxarr=[('山口百惠','1014823'),('刘涛','1011562')]
if __name__ == '__main__':
# 创建一个session,Session是requests库中的一个类,创建session对象进行访问的好处是,session对象能够自动维护访问的cookies信息。当然,它是不具备执行javascript代码的能力的,因此通过javascript修改的cookies信息它是记录不到的。
session = requests.Session()
for i,k in mxarr:
fire(i,k,session)