python爬虫豆瓣高分电影前一百部

思路:

  • 打开豆瓣相关页,进行抓包
  • 抓取json的url,进行爬取
  • 确定请求所需的headers、url,以及请求方式(get或post)
  • 返回json后进行处理

处理json

  1. 首先采用jsonpath得到电影名
  2. 因为博主初学,还不会一次同时得到电影名与评分,故调用了两次jsonpath.jsonpath(一次取电影名,一次取评分)
  3. 得到之后的list进行交叉合并,此处使用的是chain
  4. 对合并后的list进行处理:在电影名与评分之间插入冒号,并在每部电影前插入换行符,使每部电影单独占一行
  5. 最后保存在本地
import requests
import json
import jsonpath
from itertools import chain


# Douban's JSON search endpoint. The ``tag`` parameter is the URL-encoded
# Chinese word for "popular"; ``page_limit=100`` asks for 100 entries.
url = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=100&page_start=0"
# A browser-like User-Agent is sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Mobile Safari/537.36"
}


def fetch_subjects(timeout=10):
    """Download the movie listing and return the decoded JSON payload.

    Args:
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The parsed JSON document returned by the endpoint.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()


def extract_field(data, field):
    """Return every value stored under ``field`` anywhere inside ``data``.

    ``jsonpath.jsonpath`` returns ``False`` (not an empty list) when nothing
    matches; that is normalised to ``[]`` so callers can always iterate.
    """
    matches = jsonpath.jsonpath(data, "$.." + field)
    return matches if matches else []


def build_tokens(names, rates):
    """Interleave titles and ratings into the flat token list that is saved.

    Every movie contributes four tokens: a newline, the title, a colon and
    the rating. The list is built in a single pass rather than by inserting
    separators into a list while iterating over it. Extra items in the
    longer of the two inputs are ignored.

    Args:
        names: Iterable of movie titles.
        rates: Iterable of ratings, in the same order as ``names``.

    Returns:
        A list of strings: newline, title, colon, rating, repeated per movie.
    """
    return list(
        chain.from_iterable(
            # str() keeps the later " ".join() safe if a value is not text.
            ("\n", str(title), ":", str(score))
            for title, score in zip(names, rates)
        )
    )


def main():
    """Fetch the listing, save the raw JSON, then save a title/rating list."""
    data = fetch_subjects()

    # Serialise the parsed object (not the response text) so the file holds
    # indented JSON rather than one escaped JSON string literal.
    with open("douban.json", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    want = build_tokens(extract_field(data, "title"), extract_field(data, "rate"))
    print(want)

    str1 = " ".join(want)
    print(str1)
    with open("want.txt", "w", encoding="utf-8") as f:
        f.write(str1)


if __name__ == "__main__":
    main()

你可能感兴趣的:(PYTHON)