Python Crawler Notes 1: Scraping All Photos of a Specified Celebrity from Douban

Based on this tutorial: https://www.bilibili.com/read/cv10367703/

------------------------------------------------------------------------

Example target, 刘涛's photo page: https://movie.douban.com/celebrity/1011562/photos/

Open the page and press F12 to open the developer tools. There you can see how the photo pages are requested, and that each page shows at most 30 photos.
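In the network panel, each page is fetched with a start offset that grows by 30 while the other query parameters stay fixed. A minimal sketch of generating the page URLs (parameter values taken from the request seen in the developer tools; 348 is the total shown for this celebrity):

celebrity_id = '1011562'
total = 348  # the count shown on the page: (共348张)
for start in range(0, total, 30):
    url = 'https://movie.douban.com/celebrity/{0}/photos/?type=C&start={1}&sortby=like&size=a&subtype=a'.format(celebrity_id, start)
    print(url)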


a_list = content.find_all('div', attrs={'class': 'cover'})  # every div.cover element; each one wraps a photo thumbnail
picture_list = []
for d in a_list:
    plist = d.find('img')['src']  # the thumbnail image URL
    picture_list.append(plist)
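For context, content above is a BeautifulSoup object built from the page source; a minimal self-contained sketch of the same extraction (assuming the requests, bs4, and lxml packages are installed):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36'}
res = requests.get('https://movie.douban.com/celebrity/1011562/photos/', headers=headers)
content = BeautifulSoup(res.text, 'lxml')
picture_list = [d.find('img')['src'] for d in content.find_all('div', attrs={'class': 'cover'})]
print(picture_list[:3])  # first few thumbnail URLs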


The page shows the total as "(共348张)" ("348 photos in total"); extract the bare number 348 with a regular expression:

clist = content.find('span', attrs={'class': 'count'})  # the span holding the photo count
ret = re.findall(r'\d+', clist.get_text())  # e.g. ['348']
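A quick check of what the regular expression returns, assuming the span text is "(共348张)":

import re

print(re.findall(r'\d+', '(共348张)'))       # ['348']
print(int(re.findall(r'\d+', '(共348张)')[0]))  # 348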

Writing this up step by step is tedious, so here is the full code; read it through and it should be self-explanatory:

import re
import time
import requests
import os
from bs4 import BeautifulSoup
import lxml  # not used directly; the package just needs to be installed so BeautifulSoup can use the 'lxml' parser

# requests.get must be called with headers, otherwise the response comes back empty; keeping only 'User-Agent' is enough
headers={
    # 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # 'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
    # 'Cache-Control':'max-age=0',
    # 'Connection':'keep-alive',
    # 'Cookie':'ll="118254"; bid=bzf7LGz3pZA; _vwo_uuid_v2=DB12523A0B0C7127645E914A1FB363352|3d83981785084d997d7462a2ce24a947; __utmz=223695111.1626234491.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; douban-fav-remind=1; __utmz=30149280.1629095213.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1629168071%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D71zldwjBiMBa-xfexgVZ43eTQq2n8KKtTWTsWh37m72e_lfEOE1x3NuDj6egeYBLyqGE4gjSJnbxueQLcYZWsq%26wd%3D%26eqid%3Ddb6736ec000219350000000660ee5e6f%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1335619985.1616046306.1629095213.1629168072.4; __utmc=30149280; __utmb=30149280.1.10.1629168072; __utma=223695111.444014824.1616046306.1626234491.1629168075.3; __utmb=223695111.0.10.1629168075; __utmc=223695111; _pk_id.100001.4cf6=fa72408676bee41c.1616046306.3.1629168230.1626234491.',
    # 'Host':'movie.douban.com',
    # 'sec-ch-ua':'" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
    # 'sec-ch-ua-mobile':'?0',
    # 'Sec-Fetch-Dest':'document',
    # 'Sec-Fetch-Mode':'navigate',
    # 'Sec-Fetch-Site':'none',
    # 'Sec-Fetch-User':'?1',
    # 'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}

def get_poster_url(res):
    content = BeautifulSoup(res.text, 'lxml')  # build a BeautifulSoup object from the page source for easy traversal
    #content = BeautifulSoup(res.text, 'html.parser')  # alternative: the built-in parser
    a_list = content.find_all('div', attrs={'class': 'cover'})  # every div.cover element on the page
    picture_list = []
    for d in a_list:
        plist = d.find('img')['src']  # thumbnail image URL inside the cover div
        picture_list.append(plist)
    return picture_list

def getCount(id, session):
    url = 'https://movie.douban.com/celebrity/{0}/photos/'.format(id)
    res = session.get(url=url, headers=headers)
    content = BeautifulSoup(res.text, 'lxml')
    clist = content.find('span', attrs={'class': 'count'})  # the span that reads e.g. "(共348张)"
    if clist is None:  # layout changed or the request was blocked
        return [res, 0]
    ret = re.findall(r'\d+', clist.get_text())  # extract the bare number
    if len(ret) > 0:
        return [res, int(ret[0])]
    else:
        return [res, 0]

def fire(mc, id, session):
    res, total = getCount(id, session)
    if total == 0:
        return

    page = 0
    for i in range(0, total, 30):
        print("\nCrawling {0}, page {1}: photos {2}~{3} of {4}\n".format(mc, page + 1, page * 30 + 1, min((page + 1) * 30, total), total))
        url = 'https://movie.douban.com/celebrity/{0}/photos/?type=C&start={1}&sortby=like&size=a&subtype=a'.format(id, i)
        if i > 0:  # the first page was already fetched by getCount
            res = session.get(url=url, headers=headers)
        piclist = get_poster_url(res)
        download_picture(piclist, session)
        page = page + 1
        time.sleep(1)  # pause between pages to avoid hammering the server

def download_picture(pic_l, session):
    if not os.path.exists(r'picture'):
        os.mkdir(r'picture')
    for i in pic_l:
        print("\rDownloading image {0}".format(i))
        pic = session.get(i)
        p_name = i.split('/')[-1]  # use the last URL segment as the file name
        with open(os.path.join('picture', p_name), 'wb') as f:
            f.write(pic.content)


mxarr = [('山口百惠', '1014823'), ('刘涛', '1011562')]  # (celebrity name, Douban celebrity id)
if __name__ == '__main__':
    # Create a session. Session is a class in the requests library; a session object
    # automatically maintains cookies across requests. It cannot execute JavaScript,
    # though, so cookies set by JavaScript will not be picked up.
    session = requests.Session()
    for name, cid in mxarr:
        fire(name, cid, session)
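To crawl someone else, take the numeric id from their photos URL (for example 1011562 in https://movie.douban.com/celebrity/1011562/photos/) and add a (name, id) tuple to mxarr; a hypothetical entry:

mxarr = [('山口百惠', '1014823'), ('刘涛', '1011562'), ('SomeName', '1000000')]  # '1000000' is a placeholder id, replace with a real one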
