单击分类信息,跳转到分类电影列表。
这个页面的数据是分多页加载的:当用户向下滚动右侧的滚动条时,会继续加载数据。经过分析,这些数据是通过ajax加载的,需要找到ajax请求的网址。
先找到分类,提取分类的名字和类型编号,然后再爬分类下的电影数据。
ajax返回的数据是json,response.json()得到的是由字典组成的列表,按列表和字典操作就可以了,当然也可以用正则。
其实专门操作json的有一个模块叫jsonpath。
import requests
import re
import csv
# Endpoints used by the crawler: the chart page is parsed for categories,
# the top_list endpoint is queried for the movies of one category (JSON).
movie_url = "https://movie.douban.com/j/chart/top_list"
type_url = "https://movie.douban.com/chart"

# Browser-like User-Agent header sent with every request.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 6.1) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/55.0.2883.87 Safari/537.36"
    ),
}
def parse_html(url, params=None):
    """Fetch ``url`` with a GET request and return the body as text.

    Args:
        url: Address to request.
        params: Optional mapping of query-string parameters. Defaults to
            ``None`` (no parameters) rather than a shared mutable ``{}``.

    Returns:
        The response body decoded as UTF-8.
    """
    response = requests.get(url=url, headers=headers, params=params)
    return response.content.decode("utf-8")
def parse_json(url, params=None):
    """Fetch ``url`` with a GET request and return the decoded JSON body.

    Args:
        url: Address to request.
        params: Optional mapping of query-string parameters. Defaults to
            ``None`` (no parameters) rather than a shared mutable ``{}``.

    Returns:
        Whatever ``response.json()`` produces for the response body.
    """
    response = requests.get(url=url, headers=headers, params=params)
    return response.json()
def get_movie_type():
    """Fetch the chart page and extract the movie categories.

    Returns:
        A list of ``(name, type_number)`` string tuples, one per category
        link matched in the page, in page order.
    """
    content = parse_html(type_url)
    # NOTE(review): the previous pattern was r'.*?', which can only match
    # empty strings, so callers indexing [0] / [1] on each result failed.
    # The pattern below assumes category links have the form
    # /typerank?type_name=<name>&type=<number>&... -- confirm against the
    # live page markup. "&amp;" is accepted in case the page escapes "&".
    return re.findall(
        r'href="/typerank\?type_name=([^&"]+)&(?:amp;)?type=(\d+)',
        content,
    )
# Keys copied from each JSON record, in CSV column order.
_MOVIE_FIELDS = (
    "title",         # movie title
    "actors",        # leading actors
    "release_date",  # release date
    "regions",       # release regions
    "types",         # genres
    "score",         # rating
    "vote_count",    # number of votes
    "url",           # detail page url
)

# Number of records requested per page; also the step of the "start" offset.
_PAGE_SIZE = 20


def get_movie(movie_type, low_score, high_score):
    """Download every movie of one category and append each to the CSV.

    Pages through the ``movie_url`` endpoint ``_PAGE_SIZE`` records at a
    time until a request returns an empty result, saving each record as
    soon as it is read.

    Args:
        movie_type: Sequence whose item ``[1]`` is the category type number
            sent as the ``type`` query parameter.
        low_score: Lower bound of the score interval.
        high_score: Upper bound of the score interval.

    Raises:
        KeyError: If a returned record lacks one of ``_MOVIE_FIELDS``.
    """
    type_num = movie_type[1]
    offset = 0
    while True:
        params = {
            "type": type_num,
            # The interval is sent as "<high>:<low>".
            "interval_id": "{}:{}".format(high_score, low_score),
            "action": "",
            "start": offset,
            "limit": _PAGE_SIZE,
        }
        data_list = parse_json(movie_url, params)
        # An empty page means there is nothing left to fetch.
        if not data_list:
            break
        for data in data_list:
            # Build a fresh row per record instead of mutating a shared dict.
            save({field: data[field] for field in _MOVIE_FIELDS})
        offset += _PAGE_SIZE
def save(item):
    """Append one record to ``./豆瓣电影.csv`` as a single CSV row.

    Args:
        item: Mapping whose values, in iteration order, become the row's
            columns.
    """
    # newline="" lets the csv writer control line endings itself; without
    # it, platforms that translate "\n" emit "\r\r\n" and show blank rows.
    with open("./豆瓣电影.csv", "a", encoding="utf-8", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(item.values())
def start():
    """Entry point of the crawler.

    Reads the lowest score from standard input, derives the highest score
    as that value plus 10, then crawls every category returned by
    ``get_movie_type`` within that score interval.
    """
    floor = int(input("输入要爬取的最低分(以5为单位),最高分默认是加10>"))
    ceiling = floor + 10
    for category in get_movie_type():
        # Progress message showing the first element of the category entry.
        print("{}爬取中...".format(category[0]))
        get_movie(category, floor, ceiling)
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    start()