使用requests爬取豆瓣电视

暴力点直接上代码

爬取过程中没有遇到基于 Cookie 或 IP 的反爬虫策略;数据量大的场景(一次抓取全部分类)使用 gevent 协程并发抓取,速度挺快!

import json
import requests
import gevent
from gevent import monkey

# Patch the standard `socket` module with gevent's cooperative version so
# greenlets can yield to each other while waiting on network I/O.
# NOTE(review): only `socket` is patched, and the patch runs after `requests`
# has already been imported; the spider talks HTTPS, which goes through `ssl`
# and is left unpatched here -- confirm whether `monkey.patch_all()` placed
# before the other imports is what is actually wanted.
monkey.patch_socket()


class DoubanTVSpider:
    """Crawl Douban's TV search endpoint tag by tag.

    Every fetched page is appended to ``doubanTV.json`` in the current
    working directory. Pages are requested ``page_limit`` items at a time
    until the endpoint returns a page shorter than ``page_limit``.
    """

    def __init__(self):
        self.base_url = "https://movie.douban.com/j/search_subjects"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        }
        # Page size sent to the endpoint; also the offset step and the
        # threshold used to detect the last page.
        self.page_limit = 20
        # Seconds to wait on the server before giving up, so that a stalled
        # connection cannot hang the crawl forever.
        self.timeout = 10

    def parse_url(self, url, params):
        """Fetch ``url`` with ``params`` and return the decoded response body.

        Raises:
            requests.HTTPError: if the response status is not 200.
        """
        response = requests.get(
            url, params=params, headers=self.headers, timeout=self.timeout
        )
        # An explicit check instead of ``assert``: assertions are stripped
        # when Python runs with -O, which would let error pages through.
        if response.status_code != 200:
            raise requests.HTTPError(
                "unexpected status {} for {}".format(
                    response.status_code, response.url
                ),
                response=response,
            )
        return response.content.decode()

    def get_content_list(self, json_str):
        """Parse one page of JSON text.

        Returns:
            A ``(payload, count)`` tuple: the decoded dict and the number of
            entries under its ``"subjects"`` key.

        Raises:
            KeyError: if the payload has no ``"subjects"`` key.
        """
        dict_data = json.loads(json_str)
        print(dict_data)
        count = len(dict_data["subjects"])
        return dict_data, count

    def save_cotent_list(self, content_list, tag):
        """Append one page, labelled with ``tag``, to ``doubanTV.json``.

        Each record is written as indented JSON followed by a ``;``
        separator. The caller's dict is left untouched; the tag is added to
        a shallow copy.
        """
        record = dict(content_list)
        record["tag"] = tag
        with open("doubanTV.json", "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False, indent=4))
            f.write(";")
        print("OK")

    def _run(self, tag="热门"):
        """Crawl every page of a single tag and save each page as it arrives."""
        params = {
            "type": "tv",
            "tag": tag,
            "sort": "recommend",
            # Keep the requested page size, the offset step and the
            # end-of-data check in sync by deriving all three from
            # self.page_limit.
            "page_limit": self.page_limit,
            "page_start": 0,
        }
        print(tag)
        while True:
            json_str = self.parse_url(self.base_url, params)
            content_list, count = self.get_content_list(json_str)
            self.save_cotent_list(content_list, tag)
            if count < self.page_limit:
                # A short page means the end of the listing was reached.
                break
            params["page_start"] += self.page_limit

    def run(self):
        """Show the menu, read one choice from stdin and crawl accordingly.

        ``0``-``9`` crawl a single tag, ``10`` crawls all tags concurrently
        with gevent, ``11`` exits the interpreter. Any other input prints a
        message and returns without crawling.
        """
        print("""请输入你的选择:
 0: "热门"
1: "美剧"
2: "英剧"
3: "韩剧"
4: "日剧"
5: "国产剧"
6: "港剧"
7: "日本动画"
8: "综艺"
9: "纪录片"
10:以上所有
11.退出系统
        """)
        select = input().strip()
        switch_list = {
            "0": "热门",
            "1": "美剧",
            "2": "英剧",
            "3": "韩剧",
            "4": "日剧",
            "5": "国产剧",
            "6": "港剧",
            "7": "日本动画",
            "8": "综艺",
            "9": "纪录片",
        }
        if select == "11":
            # Same effect as the site-provided exit(), but does not depend
            # on the `site` module having injected that helper.
            raise SystemExit
        if select == "10":
            gevent.joinall([
                gevent.spawn(self._run, tag) for tag in switch_list.values()
            ])
            return
        if select not in switch_list:
            # Unknown input used to surface as a bare KeyError traceback.
            print("无效的选择: {!r}".format(select))
            return
        self._run(switch_list[select])


if __name__ == "__main__":
    # Script entry point: build the spider and hand control to its
    # interactive menu.
    spider = DoubanTVSpider()
    spider.run()

爬取下来的结果(全部太多了,只放部分)


            "title": "我们不能是朋友",
            "url": "https://movie.douban.com/subject/30309331/",
            "playable": false,
            "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2557149941.jpg",
            "id": "30309331",
            "cover_y": 916,
            "is_new": false
        },
        {
            "rate": "6.8",
            "cover_x": 1432,
            "title": "少年派",
            "url": "https://movie.douban.com/subject/27598254/",
            "playable": true,
            "cover": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2558828119.jpg",
            "id": "27598254",
            "cover_y": 2048,
            "is_new": false
        },
        {
            "rate": "7.7",
            "cover_x": 1000,
            "title": "动物管理局",
            "url": "https://movie.douban.com/subject/27107725/",
            "playable": true,
            "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2522426850.jpg",
            "id": "27107725",
            "cover_y": 1419,
            "is_new": false
        },
        {
            "rate": "9.5",
            "cover_x": 1204,
            "title": "我们与恶的距离",
            "url": "https://movie.douban.com/subject/30181230/",
            "playable": true,
            "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2554916825.jpg",
            "id": "30181230",
            "cover_y": 1720,
            "is_new": false
        },
        {
            "rate": "3.7",
            "cover_x": 1071,
            "title": "带着爸爸去留学",
            "url": "https://movie.douban.com/subject/30238247/",
            "playable": true,
            "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2540911433.jpg",
            "id": "30238247",
            "cover_y": 1500,
            "is_new": false
        },
        {
            "rate": "7.8",
            "cover_x": 6732,
            "title": "大宋少年志",
            "url": "https://movie.douban.com/subject/30170894/",
            "playable": true,
            "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2559266281.jpg",
            "id": "30170894",
            "cover_y": 11968,
            "is_new": false
        },
        {
            "rate": "9.2",
            "cover_x": 600,
            "title": "大小谎言 第二季",
            "url": "https://movie.douban.com/subject/27195401/",
            "playable": false,
            "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2558544696.jpg",
            "id": "27195401",
            "cover_y": 889,
            "is_new": false
        },
        {
            "rate": "8.8",
            "cover_x": 750,
            "title": "请输入搜索词:WWW",
            "url": "https://movie.douban.com/subject/30403333/",
            "playable": false,
            "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2556419121.jpg",
            "id": "30403333",
            "cover_y": 1062,
            "is_new": false
        },
        {
            "rate": "8.7",
            "cover_x": 803,
            "title": "春夜",
            "url": "https://movie.douban.com/subject/30428225/",
            "playable": false,
            "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2554598542.jpg",
            "id": "30428225",
            "cover_y": 1200,
            "is_new": false
        },
        {
            "rate": "9.4",
            "cover_x": 1080,
            "title": "这!就是街舞 第二季",
            "url": "https://movie.douban.com/subject/30486671/",
            "playable": true,
            "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2552553650.jpg",
            "id": "30486671",
            "cover_y": 1566,
            "is_new": false
        },
        {
            "rate": "8.8",
            "cover_x": 1457,
            "title": "好兆头",
            "url": "https://movie.douban.com/subject/26846856/",
            "playable": false,
            "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2558290974.jpg",
            "id": "26846856",
            "cover_y": 2159,
            "is_new": false
        },
        {
            "rate": "7.0",
            "cover_x": 770,
            "title": "暗恋橘生淮南",
            "url": "https://movie.douban.com/subject/26811775/",
            "playable": true,
            "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2559173820.jpg",
            "id": "26811775",
            "cover_y": 1080,
            "is_new": false
        },
        {
            "rate": "8.4",
            "cover_x": 800,
            "title": "吹落的树叶",
            "url": "https://movie.douban.com/subject/30438479/",
            "playable": false,
            "cover": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2550315269.jpg",
            "id": "30438479",
            "cover_y": 1131,
            "is_new": false
        },
        {
            "rate": "8.6",
            "cover_x": 945,
            "title": "白色强人",
            "url": "https://movie.douban.com/subject/27195042/",
            "playable": false,
            "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2559049945.jpg",
            "id": "27195042",
            "cover_y": 1350,
            "is_new": false
        },
        {
            "rate": "7.1",
            "cover_x": 1080,
            "title": "破冰行动",
            "url": "https://movie.douban.com/subject/27052168/",
            "playable": true,
            "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2556232270.jpg",
            "id": "27052168",
            "cover_y": 1920,
            "is_new": false
        }
    ],
    "tag": "热门"

你可能感兴趣的:(网络爬虫)