import requests
import re
import csv
"""1、拿到页面源代码"""
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57'
}

# The list is requested page by page; ``start`` in the URL is the offset of
# the first movie on the page (0, 25, 50, ... 225).
PAGE_SIZE = 25
TOTAL_MOVIES = 250

# Compiled once at import time instead of once per page.
# NOTE(review): the HTML tags inside this pattern were missing from the
# original source (only the wildcards, the " / " separator and the "人评价"
# literal survived). They were reconstructed from the group names the script
# uses (name / year / score / people) and from what the Douban list markup is
# believed to look like -- confirm against a live page before relying on it.
MOVIE_PATTERN = re.compile(
    r'<li>.*?<span class="title">(?P<name>.*?)</span>.*?'
    r'<p class="">.*?<br>.*?(?P<year>.*?)&nbsp;/&nbsp;.*?'
    r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
    r'<span>(?P<people>.*?)人评价</span>',
    re.S,  # let "." cross line breaks: one movie entry spans many lines
)


def fetch_page(start):
    """Download one page of the list and return its HTML source.

    Args:
        start: Offset of the first movie on the page.

    Returns:
        The response body as text.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
    """
    url = f'https://movie.douban.com/top250?start={start}&filter='
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error status instead of silently parsing an error
    # page and producing an empty CSV.
    resp.raise_for_status()
    return resp.text


def parse_movies(html):
    """Yield one dict per movie entry found in *html*.

    Each dict has the keys ``name``, ``year``, ``score`` and ``people`` (in
    that order), all as strings. ``year`` is stripped because the capture
    starts right after a ``<br>`` and carries the following line break and
    indentation.
    """
    for match in MOVIE_PATTERN.finditer(html):
        row = match.groupdict()
        row['year'] = row['year'].strip()
        yield row


def main():
    """Scrape every page and write all rows to ``data.csv``."""
    # Open the file once for the whole run so that later pages are appended
    # after earlier ones instead of replacing them. ``newline=""`` is what the
    # csv module requires to avoid blank rows on Windows; the explicit
    # encoding keeps Chinese titles from depending on the platform default.
    with open("data.csv", mode="w", newline="", encoding="utf-8") as f:
        csvwriter = csv.writer(f)
        for start in range(0, TOTAL_MOVIES, PAGE_SIZE):
            for row in parse_movies(fetch_page(start)):
                csvwriter.writerow(row.values())
    print("over!")


if __name__ == "__main__":
    main()
# After the script finishes, open data.csv to see the scraped results.