学习Python的第五天

豆瓣电影排行榜前250爬虫

import requests
from lxml import etree
import pandas as pd

def _parse_movie_item(li):
    """Extract one movie's fields from a Top 250 ``<li>`` element.

    Args:
        li: lxml element for a single ``<li>`` of the ranking list.

    Returns:
        dict with the keys ``serial_num``, ``movie_name``,
        ``movie_inroduce``, ``star``, ``evaluate`` (int) and
        ``movie_img_url``.

    Raises:
        IndexError: if any of the expected nodes is missing from the item.
        ValueError: if the rating-count text is not a number followed by
            "人评价".
    """
    # All lookups are relative to this <li>, so each one only matches
    # nodes that belong to the current movie.
    pic = './div[@class="item"]/div[@class="pic"]'
    info = './div[@class="item"]/div[@class="info"]'
    star_box = info + '/div[@class="bd"]/div[@class="star"]'

    # Ranking position shown next to the poster.
    serial_num = li.xpath(pic + '/em/text()')[0]
    # First title span of the entry.
    movie_name = li.xpath(info + '/div[@class="hd"]/a/span[1]/text()')[0]
    # First text node of the first paragraph in the "bd" block.
    movie_inroduce = li.xpath(info + '/div[@class="bd"]/p[1]/text()')[0].strip()
    star = li.xpath(star_box + '/span[2]/text()')[0]
    # The count is rendered as "<number>人评价"; drop the suffix and convert.
    evaluate = li.xpath(star_box + '/span[4]/text()')[0]
    evaluate = int(evaluate.replace("人评价", ""))
    movie_img_url = li.xpath(pic + '/a/img/@src')[0]

    # NOTE: the key 'movie_inroduce' is misspelled but is kept as-is because
    # it becomes a column name in the exported CSV.
    return {
        'serial_num': serial_num,
        'movie_name': movie_name,
        'movie_inroduce': movie_inroduce,
        'star': star,
        'evaluate': evaluate,
        'movie_img_url': movie_img_url
    }


def spider_douban_top250():
    """Scrape the Douban Top 250 movie list, export it and fetch posters.

    Requests the ten listing pages (``start`` = 0, 25, ..., 225), parses
    every ``<li>`` entry, prints the collected records, writes them to
    ``top250.csv`` and downloads each poster image into
    ``./Include/pachong2/`` (created if it does not exist).

    Returns:
        None

    Raises:
        requests.exceptions.RequestException: on network failure or when a
            request exceeds the timeout.
        IndexError, ValueError: if a list item does not have the expected
            structure (see ``_parse_movie_item``).
    """
    # Local import keeps this block self-contained; only needed for makedirs.
    import os

    # Seconds to wait on each HTTP request before giving up, so a stalled
    # connection cannot hang the crawler indefinitely.
    timeout = 10

    movie_list_info = []
    # Browser-like User-Agent sent with the listing-page requests
    # (presumably so the site responds as it would to a regular browser).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}

    # Each page lists 25 movies; `start` is the offset of the first one.
    for i in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(i)
        data = requests.get(url, headers=headers, timeout=timeout).content
        html = etree.HTML(data)
        ol_list = html.xpath('//div[@id="content"]//div[@class="article"]/ol/li')
        for li in ol_list:
            movie = _parse_movie_item(li)
            print(movie['movie_inroduce'])
            movie_list_info.append(movie)

    for movie in movie_list_info:
        print(movie)

    # Export the collected records (the original referenced an undefined
    # name `book_list` here, which raised NameError).
    df = pd.DataFrame(movie_list_info)
    df.to_csv('top250.csv')

    # Download the poster images.
    image_dir = './Include/pachong2'
    os.makedirs(image_dir, exist_ok=True)
    for movie in movie_list_info:
        url = movie['movie_img_url']
        resp = requests.get(url, timeout=timeout)
        # Images that do not come back with HTTP 200 are skipped.
        if resp.status_code == 200:
            img_name = '0000000{}.jpg'.format(movie['serial_num'])
            with open('{}/{}'.format(image_dir, img_name), 'wb') as f:
                f.write(resp.content)

# Run the crawler. This is a module-level call, so it executes whenever the
# file is run as a script and also whenever it is imported.
spider_douban_top250()

你可能感兴趣的:(学习Python的第五天)