import re
import json
import requests
import pprint
import pandas
import openpyxl
from bs4 import BeautifulSoup
# Build the list of pagination offsets: 10 pages, 25 entries per page
# (0, 25, ..., 225). Each offset becomes the `start` query parameter.
page_indexs = [page * 25 for page in range(10)]
print(page_indexs)
def download_all_htmls(timeout=10):
    """
    Download the HTML of every list page, for later parsing.

    One request is made per offset in the module-level ``page_indexs``.

    :param timeout: seconds to wait on each HTTP request before giving up.
        Without a timeout a stalled connection would block the crawl
        indefinitely.
    :return: list of page HTML strings, in the same order as ``page_indexs``.
    :raises Exception: if a page responds with a status code other than 200.
        Exceptions raised by ``requests`` itself (connection errors,
        timeouts) propagate unchanged.
    """
    # The headers are the same for every request, so build them once.
    # NOTE(review): 'xxx' looks like a placeholder User-Agent - confirm.
    headers = {
        'User-Agent': 'xxx'.encode('utf-8')
    }
    htmls = []
    for index in page_indexs:
        url = f"https://movie.douban.com/top250?start={index}"
        print('craw html:', url)
        res = requests.get(url, headers=headers, timeout=timeout)
        if res.status_code != 200:
            # Abort the whole crawl on the first failed page.
            print(res.status_code)
            raise Exception('error')
        htmls.append(res.text)
    return htmls
# Run the crawl: download the HTML of every list page up front.
htmls = download_all_htmls()
def parse_single_html(html):
    """
    Parse the HTML of a single list page into movie records.

    :param html: HTML text of one list page.
    :return: list of dicts, one per ``div.item`` entry, with the string
        values 'rank', 'title', 'rating_star', 'rating_num' and 'comments'.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Entries live under div.article > ol.grid_view > div.item.
    article = soup.find('div', class_='article')
    grid = article.find('ol', class_='grid_view')

    datas = []
    for item in grid.find_all('div', class_='item'):
        rank = item.find('div', class_='pic').find('em').get_text()

        info = item.find('div', class_='info')
        header = info.find('div', class_='hd')
        title = header.find('span', class_='title').get_text()

        star_box = info.find('div', class_='bd').find('div', class_='star')
        spans = star_box.find_all('span')
        # Spans are read by position: [0] carries the star rating in its
        # CSS class, [1] the numeric score, [3] the comment-count text.
        star_class = spans[0]['class'][0]
        score = spans[1].get_text()
        comment_text = spans[3].get_text()

        record = {
            'rank': rank,
            'title': title,
            # Strip the 'rating' prefix and '-t' suffix from the class name.
            'rating_star': star_class.replace('rating', '').replace('-t', ''),
            'rating_num': score,
            # Drop the trailing "人评价" label from the comment count.
            'comments': comment_text.replace('人评价', ''),
        }
        datas.append(record)
    return datas
# Debug aid: inspect the records parsed from the first page only.
# pprint.pprint(parse_single_html(htmls[0]))

# Parse every downloaded page and collect all records in one flat list.
all_datas = []
for html in htmls:
    all_datas += parse_single_html(html)

# Print the records one per line, framed by separator lines.
print('--------------------------------')
# print(all_datas)
for data in all_datas:
    print(data)
print('--------------------------------')

# Save the results to an Excel workbook.
df = pandas.DataFrame(all_datas)
print(df)
df.to_excel('豆瓣电影TOP250.xlsx')