# 豆瓣电影前250名爬虫并写入Excel源码

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
import time
import lxml

def get_page(url, timeout=10):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` document built with the ``lxml`` parser.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The connection failed or timed out.
    """
    # Browser-like User-Agent; presumably the site rejects the default
    # python-requests one -- confirm before changing.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    }
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection and the whole scrape hangs.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page into
    # an empty result set.
    response.raise_for_status()
    # Parse the response body into a navigable document tree.
    return BeautifulSoup(response.text, 'lxml')

# Listing URL of the Douban Top 250 chart. Defined at module level so that
# main() also works when this file is imported rather than run as a script.
base_url = "https://movie.douban.com/top250"


def get_page_detail(url):
    """Scrape one listing page and return the films found on it.

    Args:
        url: Address of a single Top 250 listing page.

    Returns:
        A list of ``(rank, name, score)`` tuples of strings, in page order.

    Raises:
        AttributeError: A film entry lacks the rank, title or rating element
            (``find`` returned ``None``), which signals a page-layout change.
    """
    soup = get_page(url)
    films = []
    # Every film on the page lives in its own <div class="item">.
    for item in soup.find_all('div', {'class': 'item'}):
        rank = item.find('em').text
        name = item.find('span', {'class': 'title'}).text
        score = item.find('span', {'class': 'rating_num'}).text
        films.append((rank, name, score))
    return films


def main(start=0, file=None):
    """Scrape the listing page that begins at offset *start*.

    Args:
        start: Zero-based offset of the first film on the page. Each page
            shows only 25 films, so the offset selects the page.
        file: Unused; kept so existing callers keep working.

    Returns:
        The list of ``(rank, name, score)`` tuples from that page.
    """
    url = f'{base_url}?start={start}'
    return get_page_detail(url)


if __name__ == "__main__":
    print('开始执行')
    started = time.perf_counter()

    # Create the workbook and write the header row.
    wb = Workbook()
    ws = wb.active
    ws.append(['排名', '名字', '豆瓣评分'])

    # Walk the ten pages of 25 films each and gather every row.
    films = []
    for offset in range(0, 250, 25):
        films.extend(main(start=offset))

    # Write the film rows below the header.
    for film in films:
        ws.append(film)

    # Nothing reaches disk until the workbook is saved.
    wb.save(filename='豆瓣前250的电影.xlsx')
    finished = time.perf_counter()
    print('执行结束')
    print(f'耗时{finished - started}')

# 你可能感兴趣的:(python,爬虫,python)