思路:
- 打开豆瓣相关页,进行抓包
- 抓取json的url,进行爬取
- 准备headers和url,并根据抓包结果确定请求方式是get还是post
- 返回json后进行处理
处理json
- 首先采用jsonpath得到电影名
- 因为博主初学,还不会一次同时取出电影名与评分,故分别调用了两次jsonpath.jsonpath(一次取电影名,一次取评分)
- 得到之后的list进行交叉合并,此处使用的是chain
- 在合并后的list中插入":"和换行符,使电影名与评分隔开,并让每部电影单独占一行
- 最后保存在本地
import requests
import json
import jsonpath
from itertools import chain
# Douban "hot movies" JSON endpoint (first 100 entries) and the request
# headers sent with it.
url = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=100&page_start=0"
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Mobile Safari/537.36"
}


def build_tokens(names, rates):
    """Interleave titles and ratings into the flat token list that gets joined.

    Each (title, rating) pair contributes the four tokens
    ``"\\n", title, ":", rating`` so that ``" ".join(tokens)`` puts every
    movie on its own line as ``title : rating``.

    The list is built in a single pass instead of inserting separators into
    a list while iterating over it, which only worked by index coincidence.

    Args:
        names: iterable of movie titles.
        rates: iterable of ratings, paired positionally with ``names``.
            Extra items in the longer iterable are ignored (``zip``).

    Returns:
        A list of strings; empty when either input is empty.
    """
    return list(
        chain.from_iterable(
            # str() because " ".join() rejects non-string items.
            ("\n", str(title), ":", str(score))
            for title, score in zip(names, rates)
        )
    )


def main():
    """Fetch the movie list, save the raw JSON, and write ``title : rating`` lines."""
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url=url, headers=headers, timeout=10)
    # Stop on HTTP errors before anything is written to disk.
    r.raise_for_status()

    # Parse once and reuse the result for the dump and both queries.
    data = r.json()

    # Serialize the parsed object, not the response text: dumping the text
    # would write a single escaped JSON string and ignore ``indent``.
    ret = json.dumps(data, ensure_ascii=False, indent=4)
    with open("douban.json", "w", encoding="utf-8") as f:
        f.write(ret)

    # jsonpath.jsonpath() returns False (not an empty list) when nothing
    # matches, so fall back to [] to keep zip() from raising.
    # NOTE(review): titles and ratings are collected independently and paired
    # by position; this assumes every entry carries both fields -- confirm.
    name = jsonpath.jsonpath(data, '$..title') or []
    rate = jsonpath.jsonpath(data, '$..rate') or []

    want = build_tokens(name, rate)
    print(want)

    str1 = " ".join(want)
    print(str1)
    with open("want.txt", "w", encoding="utf-8") as f:
        f.write(str1)


if __name__ == "__main__":
    main()