# Python爬虫实战 —— 爬取豆瓣TOP250电影榜

import re
import json
import requests
import pprint
import pandas
import openpyxl
from bs4 import BeautifulSoup

# Build the list of pagination start offsets: 0, 25, ..., 225
page_indexs = [page * 25 for page in range(10)]
print(page_indexs)

def download_all_htmls():
    """
    Download the HTML of every list page for later parsing.

    One GET request is sent per start offset in the module-level
    ``page_indexs`` list.

    :return: list of page HTML strings, in the same order as ``page_indexs``.
    :raises Exception: if a page answers with a status code other than 200.
    """
    # The headers are the same for every page, so build them once.
    # NOTE(review): 'xxx' looks like a placeholder User-Agent -- confirm and
    # replace it with a real browser string if the site rejects the request.
    headers = {
        'User-Agent': 'xxx'
    }

    htmls = []
    for index in page_indexs:
        url = f"https://movie.douban.com/top250?start={index}"
        print('craw html:', url)

        # Without a timeout, requests can wait forever on a stalled
        # connection and hang the whole crawl.
        res = requests.get(url, headers=headers, timeout=10)
        if res.status_code != 200:
            print(res.status_code)
            raise Exception(
                f'unexpected status code {res.status_code} for {url}'
            )
        htmls.append(res.text)
    return htmls


# Run the crawl: download every list page before any parsing starts
htmls = download_all_htmls()


def parse_single_html(html):
    """
    Extract the movie entries from the HTML of one list page.

    :param html: HTML text of a single list page.
    :return: list of dicts, one per movie in page order, each with the keys
        'rank', 'title', 'rating_star', 'rating_num' and 'comments'.
    """
    document = BeautifulSoup(html, 'html.parser')
    article = document.find('div', class_='article')
    movie_list = article.find('ol', class_='grid_view')

    def build_record(item):
        # The rank is the <em> inside the 'pic' div; the remaining fields
        # are all located under the 'info' div.
        rank = item.find('div', class_='pic').find('em').get_text()
        info = item.find('div', class_='info')
        heading = info.find('div', class_='hd')
        title = heading.find('span', class_='title').get_text()

        star_box = info.find('div', class_='bd').find('div', class_='star')
        spans = star_box.find_all('span')
        star_class = spans[0]['class'][0]
        score = spans[1].get_text()
        votes = spans[3].get_text()

        return {
            'rank': rank,
            'title': title,
            # Remove the 'rating' and '-t' parts of the CSS class name.
            'rating_star': star_class.replace('rating', '').replace('-t', ''),
            'rating_num': score,
            'comments': votes.replace('人评价', ''),
        }

    items = movie_list.find_all('div', class_='item')
    return [build_record(item) for item in items]


# pprint.pprint(parse_single_html(htmls[0]))


# Parse every downloaded HTML page and collect all records into one list
all_datas = []
for html in htmls:
    all_datas.extend(parse_single_html(html))

print('--------------------------------')
# print(all_datas)
for data in all_datas:
    print(data)

print('--------------------------------')
# Save the results to Excel
df = pandas.DataFrame(all_datas)
print(df)

# index=False is not passed, so the DataFrame index is written as a column too
df.to_excel('豆瓣电影TOP250.xlsx')

# 你可能感兴趣的:(python,爬虫,python,开发语言,爬虫)