python获取豆瓣电影

打开豆瓣,点击选电影
python获取豆瓣电影_第1张图片
到这里你会觉得很简单,直接用xpath不就很容易获取到电影名及评分了吗?其实我们看到的页面是经js渲染过的,真正提供数据的网址在⬇
python获取豆瓣电影_第2张图片
当你点击加载更多时,会发现这个网址的前面部分不变,只有末尾page_start的值由0变成20。
我们先来获取热门这一类。
用requests获取到的是json格式的字符串,用json.loads解析后就是python中的字典。
获取‘subjects’对应的值,是一个包含电影信息的列表。
遍历列表,即可获取对应的影片信息,择需保存。
完成一页后,判断这一页是否获取到信息。若有,则继续下一页获取。
获取方式:将计数器增加20,然后递归调用自身。

# Search endpoint for the "热门" tag. The query string deliberately ends with
# "page_start=" so the current page offset can be appended to it.
base_url = (
    'https://movie.douban.com/j/search_subjects'
    '?type=movie'
    '&tag=%E7%83%AD%E9%97%A8'
    '&sort=recommend'
    '&page_limit=20'
    '&page_start='
)
# Offset of the first entry of the next page to request.
count = 0


def download(target='热门'):
    """Fetch one page of results, append it to a text file, then recurse.

    The page at offset ``count`` is requested from ``base_url``. Every entry
    of its ``'subjects'`` list is appended to ``豆瓣电影/<target>.txt``. As
    long as a page contains entries, the module-level ``count`` is advanced
    by 20 and the function calls itself for the next page; an empty page
    ends the recursion.

    ``requests``, ``json``, ``base_url``, ``count`` and ``header`` are read
    from module scope, and the ``豆瓣电影`` directory must already exist.

    Args:
        target: Category label used as the output file name.
    """
    # ``count`` is rebound below; without this declaration the first read of
    # ``count`` would raise UnboundLocalError.
    global count
    url = base_url + str(count)
    result = requests.get(url=url, headers=header)
    result_json = json.loads(result.text)['subjects']
    print(result_json)
    if not result_json:
        return
    # An explicit UTF-8 encoding keeps titles intact regardless of the
    # platform's default encoding; the file is opened once per page.
    with open('豆瓣电影/' + target + '.txt', 'a', encoding='utf-8', errors='ignore') as f:
        for item in result_json:
            film_name = item['title']
            rate = item['rate']
            film_url = item['url']
            f.write('影片名:' + film_name + '\n豆瓣评分:' + rate + '\n网址:' + film_url + '\n\n')
    count += 20
    download(target)

其它几大类除了base_url不一样,其它一致。
完整代码

import requests, json, os


class DouBanSpider:
    """Interactive downloader for one Douban movie category.

    The category label is read from standard input when the object is
    created. ``confirm_base_url`` resolves the label to a search URL and then
    starts ``download``, which writes every listed film (title, rating, URL)
    to ``豆瓣电影/<category>.txt``.

    ``download`` reads the request headers from a module-level name
    ``header``, which must be defined before it is called.
    """

    # Number of entries requested per page; also the step of the page offset.
    _PAGE_SIZE = 20
    # Seconds to wait for the server before giving up on a request.
    _TIMEOUT = 10
    _OUTPUT_DIR = '豆瓣电影'
    _URL_PREFIX = 'https://movie.douban.com/j/search_subjects?type=movie&tag='
    _URL_SUFFIX = '&page_limit={}&page_start='.format(_PAGE_SIZE)
    # Category label -> URL-encoded tag plus its sort parameter (if any).
    # The tag sent for '冷门佳作' decodes to '冷门佳片'; the label shown to the
    # user and the tag sent to the server are intentionally left as they were.
    _CATEGORY_QUERIES = {
        '热门': '%E7%83%AD%E9%97%A8&sort=recommend',
        '经典': '%E7%BB%8F%E5%85%B8&sort=time',
        '豆瓣高分': '%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=time',
        '冷门佳作': '%E5%86%B7%E9%97%A8%E4%BD%B3%E7%89%87&sort=time',
        '最新': '%E6%9C%80%E6%96%B0',
        '华语': '%E5%8D%8E%E8%AF%AD&sort=time',
        '悬疑': '%E6%82%AC%E7%96%91&sort=time',
        '喜剧': '%E5%96%9C%E5%89%A7&sort=time',
        '动作': '%E5%8A%A8%E4%BD%9C&sort=time',
    }

    def __init__(self):
        """Show the available categories and read the user's choice."""
        print('现有分类:热门、经典、豆瓣高分、冷门佳作、华语、喜剧、悬疑、最新、动作')
        self.target = input('请输入要获取影片的类别:')
        self.count = 0

    def _output_path(self):
        """Return the path of the text file for the current category."""
        return '豆瓣电影/' + self.target + '.txt'

    def confirm_base_url(self):
        """Validate the category, prepare the output file and start downloading.

        The user is asked again until a known category is entered. Only then
        is the output directory created and a stale result file removed, so
        an invalid entry never touches the file system. ``download`` is
        started exactly once, however many times the user had to retry.
        """
        # A loop instead of self-recursion: with recursion every outer frame
        # went on to call download() again after the inner call returned.
        while self.target not in self._CATEGORY_QUERIES:
            print('输入有误')
            self.target = input('请重新输入要获取影片的类别:')
        self.base_url = (self._URL_PREFIX
                         + self._CATEGORY_QUERIES[self.target]
                         + self._URL_SUFFIX)

        os.makedirs(self._OUTPUT_DIR, exist_ok=True)
        output_path = self._output_path()
        if os.path.exists(output_path):
            os.remove(output_path)
        # The result file starts empty, so paging has to start from the top.
        self.count = 0
        self.download()

    def download(self):
        """Fetch page after page and append each film to the output file.

        Starting at offset ``self.count``, a page is requested from
        ``self.base_url`` and the entries of its ``'subjects'`` list are
        appended to the output file. ``self.count`` grows by the page size
        after every non-empty page; the first empty page ends the download.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            KeyError: If the response or an entry lacks an expected key.
        """
        output_path = self._output_path()
        # Iterating keeps the call stack flat no matter how many pages exist.
        while True:
            url = self.base_url + str(self.count)
            result = requests.get(url=url, headers=header, timeout=self._TIMEOUT)
            # Fail with a clear HTTP error instead of a confusing JSON one.
            result.raise_for_status()
            result_json = json.loads(result.text)['subjects']
            print(result_json)
            if not result_json:
                break
            # Opened once per page; explicit UTF-8 so that no character of a
            # title is dropped because of the platform's default encoding.
            with open(output_path, 'a', encoding='utf-8', errors='ignore') as f:
                for item in result_json:
                    film_name = item['title']
                    rate = item['rate']
                    film_url = item['url']
                    f.write('影片名:' + film_name + '\n豆瓣评分:' + rate
                            + '\n网址:' + film_url + '\n\n')
            self.count += self._PAGE_SIZE


if __name__ == '__main__':
    # Request headers used by DouBanSpider.download, which looks the name
    # ``header`` up at module level; the value is a desktop Chrome User-Agent.
    header = {
        'User-Agent': ' '.join((
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
            'AppleWebKit/537.36',
            '(KHTML, like Gecko)',
            'Chrome/78.0.3904.108',
            'Safari/537.36',
        )),
    }
    # Creating the spider prompts for a category; confirm_base_url then
    # resolves it and starts the download.
    t = DouBanSpider()
    t.confirm_base_url()

你可能感兴趣的:(python获取豆瓣电影)