使用 requests 和 lxml 爬取豆瓣电影 Top250

代码

import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor


class Crawl_douban(object):
    """Crawl the Douban movie Top 250 list pages and print the parsed entries.

    Typical use is ``Crawl_douban().run()``, which builds the list-page URLs
    and processes them concurrently in a thread pool.
    """

    # Paging used when building the list-page URLs (start=0, 25, ..., 225).
    PAGE_COUNT = 10
    PAGE_SIZE = 25
    # Seconds to wait for the server before a request is abandoned, so a
    # stalled connection cannot block a worker thread forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Target list-page URLs; filled in by page_url().
        self.url = []
        # HTTP headers sent with every request.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3902.4 Safari/537.36',
        }

    @staticmethod
    def _first(values, default=''):
        """Return the first element of *values*, or *default* if it is empty."""
        return values[0] if values else default

    @staticmethod
    def _squash(parts):
        """Concatenate the text fragments in *parts* and drop all whitespace."""
        return ''.join(''.join(parts).split())

    def page_url(self):
        """Store the URL of every list page in ``self.url``.

        The list is rebuilt on each call, so calling this method (or
        ``run``) more than once does not queue the same page twice.
        """
        self.url = [
            'https://movie.douban.com/top250?start={}'.format(page * self.PAGE_SIZE)
            for page in range(self.PAGE_COUNT)
        ]

    def sent_request(self, url):
        """Fetch *url* with the crawler's headers and return the body as text.

        Raises whatever ``requests.get`` raises (for example on a timeout or
        a connection failure).
        """
        response = requests.get(
            url=url,
            headers=self.header,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Decode the body as UTF-8 regardless of what the server declares.
        response.encoding = 'utf-8'
        return response.text

    def page_detail(self, url):
        """Download one list page, parse its movie entries, print and return them.

        Each entry is a dict with the keys ``movie_name``,
        ``actors_information``, ``score``, ``evaluate``, ``describe`` and
        ``from_url``. A field that is absent from an entry is stored as an
        empty string instead of aborting the whole page.
        """
        page_text = self.sent_request(url)
        html = etree.HTML(page_text)
        # NOTE(review): etree.HTML appears to be able to return None for an
        # empty document - confirm; such a page is treated as having no entries.
        entries = html.xpath('//ol/li') if html is not None else []

        movie_list = []
        for item in entries:
            movie_list.append({
                'movie_name': self._squash(item.xpath('.//a/span/text()')),
                'actors_information': self._squash(item.xpath('.//p[1]/text()')),
                'score': self._first(item.xpath('.//span[2][@class="rating_num"]/text()')),
                'evaluate': self._first(item.xpath('.//span[4]/text()')),
                'describe': self._first(item.xpath('.//p[@class="quote"]/span/text()')),
                'from_url': url,
            })

        print(movie_list)
        return movie_list

    def run(self):
        """Build the page URLs and crawl all of them in a thread pool.

        A page whose crawl raised is reported on stdout rather than being
        dropped silently; the remaining pages are still processed.
        """
        self.page_url()
        # Leaving the ``with`` block waits for every submitted task to finish.
        with ThreadPoolExecutor() as executor:
            tasks = [
                (link, executor.submit(self.page_detail, link))
                for link in self.url
            ]
        for link, task in tasks:
            error = task.exception()
            if error is not None:
                print('failed to crawl {}: {!r}'.format(link, error))


# Script entry point: create the crawler and start it when run directly.
if __name__ == '__main__':
    Crawl_douban().run()

输出效果如图:
(原文此处为程序输出的截图,图片未能保留)

你可能感兴趣的:(requests爬取豆瓣电影top250)