爬取豆瓣电影top250详情页

# Author    : GQ
# Datetime  : 2020/2/12 20:28
# Product   : PyCharm
# Project   : python
# File      : 豆瓣top250带详情页.py

import requests
from fake_useragent import UserAgent
from lxml import etree
import csv

# A single UserAgent instance; its ``random`` attribute is read once below,
# so the same User-Agent value is reused for every request in this module.
ua = UserAgent()

# Headers passed to each requests.get call. The Referer is the first
# Top 250 list page.
headers = {
    'User-Agent': ua.random,
    'Referer': 'https://movie.douban.com/top250?start=0&filter=',
}

def get_detaile_url(url, timeout=10):
    """Collect the detail-page links found on one list page.

    Args:
        url: Address of the list page to download.
        timeout: Seconds handed to ``requests.get`` as its timeout, so a
            stalled connection raises instead of blocking indefinitely.

    Returns:
        A list with the ``href`` of every ``<a>`` that is a direct child of
        a ``<div class="hd">``. The list is empty when nothing matches or
        when the response body could not be parsed into a tree.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    html = etree.HTML(response.text)
    if html is None:
        # NOTE(review): lxml appears to hand back None for some empty or
        # unparseable bodies -- confirm. Guarding here avoids an
        # AttributeError on ``.xpath`` and reports "no links" instead.
        return []
    return html.xpath('//div[@class="hd"]/a/@href')

def parse_detaile_url(url, timeout=10):
    """Download one movie detail page and extract its fields into a dict.

    Args:
        url: Address of the detail page to download.
        timeout: Seconds handed to ``requests.get`` as its timeout, so a
            stalled connection raises instead of blocking indefinitely.

    Returns:
        A dict with these keys, in this order:

        * ``'标题'``: first matching title text, or ``''`` if absent.
        * ``'评分'``: list of matching rating texts (left as a list).
        * ``'评价人数'``: first matching vote-count text with ``'人'``
          appended, or ``''`` if absent.
        * ``'导演'``, ``'编剧'``, ``'主演'``, ``'类型'``: lists of matching
          texts, possibly empty.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    html = etree.HTML(response.text)

    def select(path):
        # An unparsed page is treated as a page where nothing matches.
        # NOTE(review): assumes etree.HTML can yield None -- confirm.
        return html.xpath(path) if html is not None else []

    def first(nodes):
        # Indexing [0] directly raised IndexError whenever a page lacked
        # the node, which aborted the whole crawl.
        return nodes[0] if nodes else ''

    votes = first(select(
        '//div[@id="interest_sectl"]//div[@class="rating_sum"]/a/span/text()'
    ))

    # TODO(review): region/country is not extracted; an earlier attempt read
    # the bare text nodes of the element with id="info".
    return {
        '标题': first(select('//h1/span[@property="v:itemreviewed"]/text()')),
        # Kept as the raw xpath result list so the returned shape is
        # unchanged for existing callers.
        '评分': select(
            '//div[@id="interest_sectl"]//strong[@class="ll rating_num"]/text()'
        ),
        '评价人数': votes + '人' if votes else '',
        '导演': select('//div[@id="info"]/span[1]/span[2]/a/text()'),
        '编剧': select('//div[@id="info"]/span[2]/span[2]//a/text()'),
        '主演': select('//div[@id="info"]/span[@class="actor"]/span[2]//a/text()'),
        '类型': select('//div[@id="info"]//span[@property="v:genre"]/text()'),
    }

def run():
    """Crawl ten list pages and every detail page they link to.

    The list pages are requested with ``start`` offsets 0, 25, ..., 225.
    Each scraped movie dict is printed as soon as it is parsed.

    Returns:
        A list of the movie dicts, in the order they were scraped.
    """
    url = 'https://movie.douban.com/top250?start={}&filter='
    movielist = []
    for i in range(10):
        douban_url = url.format(i * 25)
        for detaile_url in get_detaile_url(douban_url):
            movie = parse_detaile_url(detaile_url)
            # list.append returns None, so its result must not be bound and
            # printed; print the movie that was just scraped instead.
            movielist.append(movie)
            print(movie)
    return movielist
# Start the crawl only when this file is executed directly as a script.
if __name__ == "__main__":
    run()

你可能感兴趣的:(python爬虫)