# -*- coding: utf-8 -*-
# @author: Tele
# @Time : 2019/04/08 1:01 PM
import json
import os
import re
import shutil
from urllib.parse import urljoin

import requests
from lxml import etree

# Crawl the first 5 pages of every menu.
class BiliSplider:
    """Crawl the "newlist" listings of selected bilibili top-level menus.

    ``run`` reads the navigation menu from the home page, keeps the menus
    whose title is in the target list, resolves the sub-channels (``rid``) of
    each kept menu from its index page, requests the first pages of every
    sub-channel from the ``newlist`` web API and writes each non-empty page
    as pretty-printed JSON to ``<save_dir>/<menu>/<sub-channel>/``.
    """

    # Menus whose rid list must be read from the page of their first
    # sub-menu instead of from their own index page.
    SPECIAL_MENUS = ("番劇", "國創", "影視")

    # Timeout in seconds applied to every HTTP request.
    REQUEST_TIMEOUT = 10

    # NOTE(review): the first pattern was an empty string in the source,
    # which can never capture the state blob (every index page then yields
    # no sub-channels). It is restored here as the inline
    # ``window.__INITIAL_STATE__`` script that the follow-up ``;(function``
    # split presupposes -- confirm against a live index page.
    _STATE_SCRIPT_RE = re.compile(r"<script>window\.__INITIAL_STATE__=(.*?)</script>")
    # Keeps the JSON part that precedes the trailing ``;(function`` code.
    _STATE_JSON_RE = re.compile(r"(.*);\(function")

    def __init__(self, save_dir, menu_list):
        """Store the crawl configuration; performs no I/O.

        Args:
            save_dir: Root directory the crawled pages are written to.
                ``run`` deletes this directory before crawling.
            menu_list: Titles of the top-level menus to crawl.
        """
        self.target = menu_list
        self.url_temp = "https://www.bilibili.com/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        }
        self.proxies = {
            "http": "http://61.190.102.50:15845"
        }
        self.father_dir = save_dir

    @staticmethod
    def _first(values):
        """Return the first element of *values*, or ``None`` when it is empty."""
        return values[0] if values else None

    def _absolute_url(self, href):
        """Resolve a menu href against the site root; ``None`` if there is none.

        Protocol-relative hrefs (``//host/path``) become ``https://host/path``;
        hrefs that are already absolute are returned unchanged.
        """
        if not href:
            return None
        return urljoin(self.url_temp, href)

    def get_menu_list(self):
        """Fetch the home page and return its navigation menu.

        Returns:
            A list of dicts with the keys ``title``, ``href`` and
            ``submenu_list``. ``submenu_list`` is a list of
            ``{"title", "href"}`` dicts, or ``None`` when the menu has no
            sub-menu. Missing titles/hrefs are ``None``.

        Raises:
            requests.RequestException: If the home page cannot be fetched.
        """
        response = requests.get(
            self.url_temp, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        html_element = etree.HTML(response.content)
        nav_links = html_element.xpath(
            "//div[@id='primary_menu']/ul[@class='nav-menu']/li/a"
        )

        menu_list = []
        for link in nav_links:
            # Sub-menu entries live in a sibling <ul class="sub-nav">.
            submenu_list = [
                {
                    "title": self._first(sub.xpath("./a/span/text()")),
                    "href": self._absolute_url(self._first(sub.xpath("./a/@href"))),
                }
                for sub in link.xpath("./../ul[@class='sub-nav']/li")
            ]
            menu_list.append({
                "title": self._first(link.xpath("./*/text()")),
                "href": self._absolute_url(self._first(link.xpath("./@href"))),
                "submenu_list": submenu_list or None,
            })
        return menu_list

    @classmethod
    def _extract_sub_menus(cls, page_text):
        """Parse the sub-channel list out of the text of a menu index page.

        Returns:
            A list of ``{"rid": <tid>, "title": <name>}`` dicts, one per
            entry of ``config.sub`` that has a truthy ``tid``. Entries
            without a ``tid`` need a different API and are skipped. An empty
            list is returned when the state blob is absent or unparsable.
        """
        scripts = cls._STATE_SCRIPT_RE.findall(page_text)
        if not scripts:
            return []
        payload = cls._STATE_JSON_RE.findall(scripts[0])
        if not payload:
            return []
        try:
            state = json.loads(payload[0])
        except ValueError:
            return []
        if not isinstance(state, dict):
            return []

        config = state.get("config") or {}
        return [
            {"rid": sub["tid"], "title": sub.get("name") or None}
            for sub in config.get("sub") or []
            if sub.get("tid")
        ]

    def parse_index_url(self, url):
        """Fetch a menu index page and return its sub-channels (rid == tid).

        Args:
            url: URL of the menu (or sub-menu) index page.

        Returns:
            A list of ``{"rid", "title"}`` dicts; empty when none are found.

        Raises:
            requests.RequestException: If the page cannot be fetched.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return self._extract_sub_menus(response.content.decode())

    def parse_sub_url(self, item, pages=5):
        """Download the first *pages* listing pages of one sub-channel.

        Every page answered with HTTP 200 is handed to ``save_data``. The
        whole sub-channel is abandoned on the first network error; a page
        with a non-200 status or an undecodable body is skipped.

        Args:
            item: Dict with the keys ``rid``, ``title``, ``menu`` and
                ``referer``.
            pages: Number of pages to request, starting at page 1.
        """
        # Per-call copy so the Referer does not leak into other requests.
        headers = dict(self.headers, Referer=item["referer"])
        url_pattern = "https://api.bilibili.com/x/web-interface/newlist?rid={}&type=0&pn={}&ps=20"

        for page in range(1, pages + 1):
            url = url_pattern.format(item["rid"], page)
            print(url)
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    proxies=self.proxies,
                    timeout=self.REQUEST_TIMEOUT,
                )
            except requests.RequestException:
                return

            if response.status_code != 200:
                print("請求超時")  # usually a 403: the IP has been blocked
                continue

            try:
                payload = json.loads(response.content.decode())
            except ValueError:
                continue
            content = payload.get("data") if isinstance(payload, dict) else None

            self.save_data({
                "content": content,
                "title": item["title"],
                "index": page,
                "menu": item["menu"],
            })

    def save_data(self, data):
        """Write one crawled page to disk as indented UTF-8 JSON.

        Nothing is written when ``data["content"]`` is empty or ``None``.

        Args:
            data: Dict with the keys ``content``, ``menu``, ``title`` and
                ``index`` (the 1-based page number used in the file name).
        """
        if not data["content"]:
            return

        parent_path = os.path.join(self.father_dir, data["menu"], data["title"])
        os.makedirs(parent_path, exist_ok=True)
        file_dir = os.path.join(parent_path, "第" + str(data["index"]) + "頁.txt")

        with open(file_dir, "w", encoding="utf-8") as file:
            file.write(json.dumps(data["content"], ensure_ascii=False, indent=2))

    def run(self):
        """Delete previously saved data, then crawl every target menu."""
        if os.path.exists(self.father_dir):
            shutil.rmtree(self.father_dir)

        targets = set(self.target)
        menu_info = [
            menu for menu in self.get_menu_list() if menu["title"] in targets
        ]

        for info in menu_info:
            menu_index_url = info["href"]
            if info["title"] in self.SPECIAL_MENUS:
                # Special menus: the rid list is on the first sub-menu page.
                # Fall back to the menu's own page if it has no sub-menu.
                submenu_list = info.get("submenu_list")
                if submenu_list and submenu_list[0]["href"]:
                    menu_index_url = submenu_list[0]["href"]
            if not menu_index_url:
                continue

            result_list = self.parse_index_url(menu_index_url)
            print(result_list)
            for item in result_list:
                item["menu"] = info["title"]  # top-level menu title
                item["referer"] = menu_index_url
                self.parse_sub_url(item)

def main():
    """Crawl the configured menus into ``f:/bili_splider``."""
    wanted_menus = [
        "動畫",
        "番劇",
        "國創",
        "音樂",
        "舞蹈",
        "游戲",
        "科技",
        "數碼",
        "生活",
        "鬼畜",
        "時尚",
        "廣告",
        "娛樂",
        "影視",
    ]
    BiliSplider("f:/bili_splider", wanted_menus).run()


# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()