正则表达式、lxml、BeautifulSoup统统用不上的状况也是存在的,如果响应类型是json的话。
试着顺着下面代码的思路,对照豆瓣电影的 request—response 交互过程。
"""
topic:豆瓣是异步加载的,更神奇的是response的内容是json,我试着抓取
author:小灵子
date:2019-6-4
"""
import requests
import time
def build_url(pages=30, limit=20):
    """Yield the Douban chart API URL for each result page.

    The ``start`` query parameter is treated as a zero-based item offset
    (NOTE(review): confirm against the live endpoint), so page ``k`` begins
    at ``k * limit``. Advancing ``start`` by one per request while asking
    for ``limit`` items would fetch overlapping windows and produce
    duplicate films.

    Args:
        pages: number of pages to generate (thirty by default).
        limit: number of items requested per page.

    Yields:
        One URL string per page, in ascending offset order.
    """
    base = ('http://movie.douban.com/j/chart/top_list'
            '?type=11&interval_id=100%3A90&action=')
    for page in range(pages):
        yield '{}&start={}&limit={}'.format(base, page * limit, limit)
# HTTP headers for the Douban requests: a desktop Chrome User-Agent plus the
# X-Requested-With marker that identifies the call as an XMLHttpRequest.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
def get_wb_data(films):
    """Fetch every chart page and collect the films rated 9 or higher.

    Each URL produced by ``build_url()`` is requested and its JSON body is
    iterated. For every entry whose first ``rating`` element converts to a
    float >= 9, a ``{"片名:": title, "评分:": rating}`` dict is appended to
    *films* and the entry's cover image is saved as a ``.jpg`` under
    ``D:\\douban`` (the directory must already exist).

    Args:
        films: list that is extended in place, one dict per matching film.

    Raises:
        requests.HTTPError: if a chart page responds with an error status.
    """
    # Characters that Windows forbids in file names; a title containing one
    # of them would otherwise make open() fail.
    unsafe = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    for url in build_url():
        # headers must be passed by keyword: the second positional parameter
        # of requests.get is ``params`` (the query string), not ``headers``.
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        wb_data = r.json()
        for item in wb_data:
            rating = item["rating"][0]
            if float(rating) < 9:
                continue
            title = item["title"]
            films.append({"片名:": title, "评分:": rating})
            # Download before opening the file so that a failed request does
            # not leave an empty image file behind.
            img = requests.get(item["cover_url"], headers=headers,
                               timeout=10).content
            with open('D:\\douban\\{}.jpg'.format(title.translate(unsafe)),
                      'wb') as f:
                f.write(img)
        # Pause between pages to avoid hammering the server.
        time.sleep(1)
def main():
    """Collect the highly rated films, then print their count and each entry."""
    collected = []
    get_wb_data(collected)

    total = len(collected)
    print("9分以上剧情电影 %d 部" % total)
    for entry in collected:
        print(entry)


# Run the scraper only when the file is executed as a script.
if __name__ == "__main__":
    main()