爬取过程中发现该接口没有针对 cookie 和 IP 的反爬虫策略,因此数据量大的部分可以直接用协程(gevent)并发抓取,速度挺快的!
import json

import gevent
from gevent import monkey

# gevent's monkey patching has to happen before any module that captures the
# blocking socket/ssl primitives is imported; otherwise those modules keep
# references to the unpatched versions and requests block the whole process
# instead of yielding to other greenlets. The crawled endpoint is HTTPS, so
# ssl is patched together with socket.
monkey.patch_socket()
monkey.patch_ssl()

import requests  # noqa: E402  (deliberately imported after monkey patching)
class DoubanTVSpider:
    """Interactive crawler for Douban's TV ``search_subjects`` JSON endpoint.

    For a chosen tag the spider requests consecutive pages of ``page_limit``
    entries, appends every page to ``doubanTV.json`` and stops after the
    first page that comes back with fewer than ``page_limit`` entries.
    """

    # Seconds to wait for the server before giving up on a request; without
    # a timeout a stalled connection would block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    # Menu key -> Douban tag. Single source of truth for both the printed
    # menu and the lookup in run().
    TAGS = {
        "0": "热门",
        "1": "美剧",
        "2": "英剧",
        "3": "韩剧",
        "4": "日剧",
        "5": "国产剧",
        "6": "港剧",
        "7": "日本动画",
        "8": "综艺",
        "9": "纪录片",
    }

    def __init__(self):
        """Set the endpoint URL, the request headers and the page size."""
        self.base_url = "https://movie.douban.com/j/search_subjects"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        }
        self.page_limit = 20

    def parse_url(self, url, params):
        """Fetch *url* with query *params* and return the decoded body.

        Raises:
            requests.HTTPError: if the response status is not 200.
        """
        response = requests.get(
            url,
            params=params,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Explicit check instead of ``assert``: assertions are removed when
        # Python runs with -O, which would let error pages through silently.
        if response.status_code != 200:
            raise requests.HTTPError(
                "unexpected HTTP status {} for {}".format(
                    response.status_code, response.url
                ),
                response=response,
            )
        return response.content.decode()

    def get_content_list(self, json_str):
        """Parse one response body.

        Returns:
            A ``(data, count)`` tuple: the decoded JSON object and the number
            of entries under its ``"subjects"`` key.

        Raises:
            KeyError: if the decoded object has no ``"subjects"`` key.
        """
        dict_data = json.loads(json_str)
        # Progress/debug output kept from the original script.
        print(dict_data)
        count = len(dict_data["subjects"])
        return dict_data, count

    def save_cotent_list(self, content_list, tag):
        """Append one page of results, labelled with *tag*, to ``doubanTV.json``.

        Each page is written as an indented JSON object followed by ``;``,
        so the file is a ``;``-separated sequence of objects rather than a
        single JSON document.
        """
        # Build a labelled copy so the caller's dict is not mutated.
        record = dict(content_list)
        record["tag"] = tag
        with open("doubanTV.json", "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False, indent=4))
            f.write(";")
        print("OK")

    def _run(self, tag="热门"):
        """Crawl every page for *tag* and save each page as it arrives."""
        params = {
            "type": "tv",
            "tag": tag,
            "sort": "recommend",
            # Requested page size, step and stop condition all derive from
            # self.page_limit so they cannot drift apart.
            "page_limit": self.page_limit,
            "page_start": 0,
        }
        print(tag)
        while True:
            json_str = self.parse_url(self.base_url, params)
            content_list, count = self.get_content_list(json_str)
            self.save_cotent_list(content_list, tag)
            if count < self.page_limit:
                # A short page means the end of the listing was reached.
                break
            params["page_start"] += self.page_limit

    def run(self):
        """Show the menu, read one choice from stdin and crawl accordingly.

        ``11`` exits the process (raises ``SystemExit``), ``10`` crawls all
        tags concurrently in gevent greenlets, ``0``-``9`` crawl a single
        tag. Any other input prints a message and returns without crawling.
        """
        menu_lines = ["请输入你的选择:"]
        menu_lines += [
            '{}: "{}"'.format(key, tag) for key, tag in self.TAGS.items()
        ]
        menu_lines += ["10:以上所有", "11.退出系统"]
        print("\n".join(menu_lines) + "\n")

        choice = input().strip()
        if choice == "11":
            # Same effect as the site-provided exit(), which is not
            # guaranteed to exist (e.g. under ``python -S``).
            raise SystemExit
        if choice == "10":
            gevent.joinall(
                [gevent.spawn(self._run, tag) for tag in self.TAGS.values()]
            )
            return

        tag = self.TAGS.get(choice)
        if tag is None:
            print("无效的选择: {}".format(choice))
            return
        self._run(tag)
if __name__ == "__main__":
    # Script entry point: build the spider and hand control to its
    # interactive menu.
    spider = DoubanTVSpider()
    spider.run()
"title": "我们不能是朋友",
"url": "https://movie.douban.com/subject/30309331/",
"playable": false,
"cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2557149941.jpg",
"id": "30309331",
"cover_y": 916,
"is_new": false
},
{
"rate": "6.8",
"cover_x": 1432,
"title": "少年派",
"url": "https://movie.douban.com/subject/27598254/",
"playable": true,
"cover": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2558828119.jpg",
"id": "27598254",
"cover_y": 2048,
"is_new": false
},
{
"rate": "7.7",
"cover_x": 1000,
"title": "动物管理局",
"url": "https://movie.douban.com/subject/27107725/",
"playable": true,
"cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2522426850.jpg",
"id": "27107725",
"cover_y": 1419,
"is_new": false
},
{
"rate": "9.5",
"cover_x": 1204,
"title": "我们与恶的距离",
"url": "https://movie.douban.com/subject/30181230/",
"playable": true,
"cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2554916825.jpg",
"id": "30181230",
"cover_y": 1720,
"is_new": false
},
{
"rate": "3.7",
"cover_x": 1071,
"title": "带着爸爸去留学",
"url": "https://movie.douban.com/subject/30238247/",
"playable": true,
"cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2540911433.jpg",
"id": "30238247",
"cover_y": 1500,
"is_new": false
},
{
"rate": "7.8",
"cover_x": 6732,
"title": "大宋少年志",
"url": "https://movie.douban.com/subject/30170894/",
"playable": true,
"cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2559266281.jpg",
"id": "30170894",
"cover_y": 11968,
"is_new": false
},
{
"rate": "9.2",
"cover_x": 600,
"title": "大小谎言 第二季",
"url": "https://movie.douban.com/subject/27195401/",
"playable": false,
"cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2558544696.jpg",
"id": "27195401",
"cover_y": 889,
"is_new": false
},
{
"rate": "8.8",
"cover_x": 750,
"title": "请输入搜索词:WWW",
"url": "https://movie.douban.com/subject/30403333/",
"playable": false,
"cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2556419121.jpg",
"id": "30403333",
"cover_y": 1062,
"is_new": false
},
{
"rate": "8.7",
"cover_x": 803,
"title": "春夜",
"url": "https://movie.douban.com/subject/30428225/",
"playable": false,
"cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2554598542.jpg",
"id": "30428225",
"cover_y": 1200,
"is_new": false
},
{
"rate": "9.4",
"cover_x": 1080,
"title": "这!就是街舞 第二季",
"url": "https://movie.douban.com/subject/30486671/",
"playable": true,
"cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2552553650.jpg",
"id": "30486671",
"cover_y": 1566,
"is_new": false
},
{
"rate": "8.8",
"cover_x": 1457,
"title": "好兆头",
"url": "https://movie.douban.com/subject/26846856/",
"playable": false,
"cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2558290974.jpg",
"id": "26846856",
"cover_y": 2159,
"is_new": false
},
{
"rate": "7.0",
"cover_x": 770,
"title": "暗恋橘生淮南",
"url": "https://movie.douban.com/subject/26811775/",
"playable": true,
"cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2559173820.jpg",
"id": "26811775",
"cover_y": 1080,
"is_new": false
},
{
"rate": "8.4",
"cover_x": 800,
"title": "吹落的树叶",
"url": "https://movie.douban.com/subject/30438479/",
"playable": false,
"cover": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2550315269.jpg",
"id": "30438479",
"cover_y": 1131,
"is_new": false
},
{
"rate": "8.6",
"cover_x": 945,
"title": "白色强人",
"url": "https://movie.douban.com/subject/27195042/",
"playable": false,
"cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2559049945.jpg",
"id": "27195042",
"cover_y": 1350,
"is_new": false
},
{
"rate": "7.1",
"cover_x": 1080,
"title": "破冰行动",
"url": "https://movie.douban.com/subject/27052168/",
"playable": true,
"cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2556232270.jpg",
"id": "27052168",
"cover_y": 1920,
"is_new": false
}
],
"tag": "热门"