代码
# Author:ZhouChuang
# coding:utf-8
from bs4 import BeautifulSoup
import requests
import time
# HTTP headers sent with every page request: a desktop Chrome User-Agent
# string and a fixed set of cookies.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/70.0.3514.0 Safari/537.36'
    ),
    # One cookie per entry; joined with '; ' into the single header value.
    'Cookie': '; '.join([
        'viewed="2166211"',
        'bid=wLwzb9b0g_A',
        'douban-fav-remind=1',
        'll="118173"',
        '__utmc=30149280',
        '__utmc=223695111',
        '_vwo_uuid_v2=D96C22273BD00491856812822DDB071A2|e5653604c927a32fa93d6e494419f10c',
        '_pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1533806091%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DuEf5o7h6W1QgcIPLdxBrM9-O5w1pL72KygnR1F15VN2W7NpRddrICJa95QHW8IHb%26wd%3D%26eqid%3Dd58b6e88000163dd000000045b6bfbe0%22%5D',
        '_pk_ses.100001.4cf6=*',
        'ps=y',
        'ck=2wWO',
        '__utma=30149280.1177526221.1531553567.1533803492.1533806181.4',
        '__utmz=30149280.1533806181.4.3.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/phone/bind',
        '__utma=223695111.801250262.1533803492.1533803492.1533806181.2',
        '__utmz=223695111.1533806181.2.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/phone/bind',
        'ap=1',
        'push_noty_num=0',
        'push_doumail_num=0',
        'douban-profile-remind=1',
        '__utmv=30149280.15261',
        '__utmb=30149280.22.10.1533806181',
        '_pk_id.100001.4cf6=3eba8e0d5047ec4c.1533803492.2.1533806798.1533803530.',
        '__utmb=223695111.15.10.1533806181',
    ]),
}
# The Top 250 list is paged 25 movies at a time, so the valid page offsets are
# 0, 25, ..., 225. The end bound is 250 (exclusive): a larger bound would also
# request start=250, a page past the end of the list.
urls = ['https://movie.douban.com/top250?start={}&filter='.format(i) for i in range(0, 250, 25)]
# Address of the first list page on its own.
url = 'https://movie.douban.com/top250?start=0&filter='
def get(url, data=None):
    """Download one movie list page and print a record for every movie on it.

    Args:
        url: Address of the list page to fetch.
        data: Unused. Kept only so that existing calls passing it still work.

    Returns:
        None. Each movie is printed as a dict with the keys '排名' (rank),
        '名称' (title), '评分' (rating), '简评' (short quote, '' when the
        entry has none) and '图片链接' (image URL).

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    wb_data = requests.get(url, headers=headers, timeout=10)
    # Pause after each request so consecutive pages are not fetched back to back.
    time.sleep(2)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    # Read the fields one movie at a time. Selecting each field across the
    # whole page and zipping the lists pairs values by position, so a single
    # movie without a quote would shift every later quote onto the wrong
    # movie and cut the tail of the page off.
    for item in soup.select('#content > div > div.article > ol > li'):
        paiming = item.select_one('div.pic > em')
        # The same <img> carries both the title (alt) and the image URL (src).
        image = item.select_one('div.pic > a > img')
        pingfen = item.select_one('div.info > div.bd > div > span.rating_num')
        jianping = item.select_one('div.info > div.bd > p.quote > span')

        if paiming is None or image is None or pingfen is None:
            # Not a regular movie entry; there is nothing meaningful to print.
            continue

        record = {
            '排名': paiming.get_text(),
            '名称': image.get('alt'),
            '评分': pingfen.get_text(),
            # The quote is optional on the page; report it as empty if absent.
            '简评': jianping.get_text() if jianping is not None else '',
            '图片链接': image.get('src'),
        }
        print(record)
# Crawl the list pages in order; get() prints the movies found on each one.
for page_url in urls:
    get(page_url)
结果截图
截图.png