本代码分为两个部分:第一个部分是根据一个视频的BV号或者视频的链接URL来爬取视频;第二个部分是根据一个UP主的ID来获取其所有的信息和发布的视频信息。第二个功能主要实现获取一个UP主发布的视频的BV号,再结合第一个部分的代码就可以实现爬取一个UP主所有已经发布的视频。
1.首先定义一个类,并且对其进行属性初始化
class Bilibili(object):
    """Downloader for one Bilibili video page.

    Keeps the page URL, the HTTP headers sent with every request and the
    fields that the later processing steps fill in.
    """

    def __init__(self, url):
        """Remember the page URL and start with every parsed field unset.

        Args:
            url: Address of the video page to download.
        """
        self.url = url
        # The Referer names the visited site; the accompanying article notes
        # that later requests fail without it.
        request_headers = {}
        request_headers["referer"] = "https://www.bilibili.com"
        request_headers["User-Agent"] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
        self.header = request_headers
        # Page text, parsed play info, stream URLs and titles are all filled
        # in later; until then they are None.
        for field in ('url_text', 'json', 'video', 'audio',
                      'video_main_name', 'video_sub_name'):
            setattr(self, field, None)
注意这里在请求头之中需要加入Referer,来指明访问的网站,否则后续请求不成功。
2.解析网站
def get_url_content(self):
    """Download the video page and keep its text in ``self.url_text``.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the program forever.
    resp = requests.get(self.url, headers=self.header, timeout=30)
    # Stop here on an error status instead of parsing an error page later.
    resp.raise_for_status()
    self.url_text = resp.text
3.获取存放视频和音频链接的URL
这里注意一下:爬取B站视频时,视频和音频的链接不能同时缓存,需要分别请求。
def get_mp4url_and_mp3url_and_name(self):
    """Extract the stream URLs and the titles from the downloaded page.

    Reads ``self.url_text`` and ``self.url``; fills ``self.json``,
    ``self.video``, ``self.audio``, ``self.video_main_name`` and
    ``self.video_sub_name``. Both titles are also printed.

    Raises:
        IndexError: If the page holds no play-info script or no title.
        KeyError: If the play-info JSON lacks the expected keys.
    """
    # NOTE(review): both HTML patterns below were blank/truncated in the
    # published text (the tags had been stripped). They are reconstructed
    # here and should be confirmed against a live page.
    url_dict = re.findall(r'<script>window\.__playinfo__=(.*?)</script>', self.url_text)[0]
    self.json = json.loads(url_dict)
    dash = self.json['data']['dash']
    # The first entry of each list is used.
    self.video = dash['video'][0]['base_url']
    self.audio = dash['audio'][0]['base_url']
    self.video_main_name = re.findall(r'<title[^>]*>(.*?)</title>', self.url_text)[0]
    print('主标题为:', self.video_main_name)
    try:
        # Capture only the digits of the ``p`` parameter, so that query
        # parameters following it no longer break the int() conversion.
        page = re.findall(r'[?&]p=(\d+)', self.url)[0]
        # NOTE(review): the index is used exactly as before; if ``p`` is
        # 1-based this may select the following part -- confirm.
        self.video_sub_name = re.findall(r'"part":"(.*?)"', self.url_text)[int(page)]
    except IndexError:
        # No ``p`` parameter, or no part at that position.
        self.video_sub_name = '无'
    print('子标题为:', self.video_sub_name)
4.解析存放视音频URL,并写入文件
def write_video_to_content(self):
    """Download the video stream at ``self.video`` into ``1.mp4``.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
    """
    print('正在爬取视频...')
    # Stream the body in chunks so the whole file is never held in memory.
    with requests.get(self.video, headers=self.header, stream=True, timeout=30) as resp2:
        # Check the status before creating the file, so an error response
        # never ends up on disk as a bogus ``1.mp4``.
        resp2.raise_for_status()
        with open('1.mp4', mode='wb') as f:
            for chunk in resp2.iter_content(chunk_size=1 << 20):
                f.write(chunk)
def write_audio_to_content(self):
    """Download the audio stream at ``self.audio`` into ``2.mp3``.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
    """
    print('正在爬取音频...')
    # Stream the body in chunks so the whole file is never held in memory.
    with requests.get(self.audio, headers=self.header, stream=True, timeout=30) as resp2:
        # Check the status before creating the file, so an error response
        # never ends up on disk as a bogus ``2.mp3``.
        resp2.raise_for_status()
        with open('2.mp3', mode='wb') as f:
            for chunk in resp2.iter_content(chunk_size=1 << 20):
                f.write(chunk)
5.合并音频和视频
这里需要借助FFMPEG工具,具体安装请见评论
def final_video(self):
    """Merge ``1.mp4`` and ``2.mp3`` into ``<video id>.mp4`` with FFmpeg.

    The output name is the path segment that follows ``video/`` in
    ``self.url``. The two temporary files are removed only after FFmpeg
    finished successfully, so a failed merge does not lose the downloads.

    Raises:
        ValueError: If ``self.url`` contains no ``video/<id>`` segment.
        FileNotFoundError: If no FFmpeg executable can be started.
        subprocess.CalledProcessError: If FFmpeg exits with an error.
    """
    import shutil
    import subprocess

    video_path = '1.mp4'
    audio_path = '2.mp3'
    # Stop at "/", "?" or "#" so links without a query string, or with a
    # trailing slash, still yield a usable file name.
    found = re.search(r'video/([^/?#]+)', self.url)
    if found is None:
        raise ValueError(f'cannot find a video id in {self.url!r}')
    path_out = found.group(1) + '.mp4'
    print(path_out)
    ffmpeg_dir = r'E:\FFmpeg\bin\ffmpeg.exe'
    if not os.path.isfile(ffmpeg_dir):
        # Fall back to an FFmpeg found on PATH when the default install
        # location does not exist.
        ffmpeg_dir = shutil.which('ffmpeg') or ffmpeg_dir
    # An argument list (no shell) keeps text taken from the URL from being
    # interpreted as shell syntax.
    cmd = [ffmpeg_dir, '-i', video_path, '-i', audio_path, path_out, '-loglevel', 'quiet']
    subprocess.run(cmd, check=True)
    os.remove(video_path)
    os.remove(audio_path)
6.主方法
def start(self):
    """Run the whole job: fetch the page, download both streams, merge them.

    Progress messages are printed before and after each phase.
    """
    print('创建任务中...')
    print('解析url...')
    # Crawl phase: page text, stream URLs, then the two downloads, in order.
    crawl_steps = (
        self.get_url_content,
        self.get_mp4url_and_mp3url_and_name,
        self.write_video_to_content,
        self.write_audio_to_content,
    )
    for step in crawl_steps:
        step()
    print('爬虫阶段已完成!')
    # Merge phase.
    print('准备合并音频视频...')
    self.final_video()
    print('合并阶段已完成!')
    print('任务完成!')
7.测试
if __name__ == '__main__':
    # Ask for a link or a bare BV id and download that single video.
    print(r'使用本程序之前请参考:https://blog.csdn.net/pythonlaodi/article/details/109222790安装FFMPEG;安装位置也如链接提到如此。')
    time.sleep(1)
    # Surrounding whitespace from a pasted value would end up in the URL.
    video_url = input('请输入链接(或者BV号):').strip()
    # Anything that is not already a link is treated as a BV id. Testing the
    # prefix (instead of "'http' in ...") avoids mistaking an id that merely
    # contains those letters for a link.
    if not video_url.startswith(('http://', 'https://')):
        video_url = 'https://www.bilibili.com/video/' + video_url + '?'
    bilibili = Bilibili(url=video_url)
    bilibili.start()
    # Example link:
    # 'https://www.bilibili.com/video/BV1NU4y137PZ?from=search&seid=1218725641028602102&spm_id_from=333.337.0.0'
8.第一部分完整代码
import requests
import re
import json
import os
import time
class Bilibili(object):
    """Downloader for one Bilibili video page.

    Typical use is ``Bilibili(url).start()``: the page is fetched, the video
    and audio streams are downloaded separately and FFmpeg merges them into
    ``<video id>.mp4``.
    """

    def __init__(self, url):
        """Remember the page URL and start with every parsed field unset.

        Args:
            url: Address of the video page to download.
        """
        self.url = url
        # The Referer names the visited site; the accompanying article notes
        # that later requests fail without it.
        self.header = {
            "referer": "https://www.bilibili.com",
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'}
        self.url_text = None          # text of the video page
        self.json = None              # parsed play-info JSON
        self.video = None             # URL of the video stream
        self.audio = None             # URL of the audio stream
        self.video_main_name = None   # page title
        self.video_sub_name = None    # title of the selected part, or '无'

    def _download(self, url, path):
        """Stream ``url`` into the file ``path`` using the instance headers."""
        # Streaming in chunks keeps a large file out of memory; the timeout
        # keeps a stalled connection from blocking forever.
        with requests.get(url, headers=self.header, stream=True, timeout=30) as resp:
            # Check the status before creating the file, so an error response
            # never ends up on disk.
            resp.raise_for_status()
            with open(path, mode='wb') as f:
                for chunk in resp.iter_content(chunk_size=1 << 20):
                    f.write(chunk)

    def get_url_content(self):
        """Download the video page and keep its text in ``self.url_text``.

        Raises:
            requests.RequestException: If the request fails, times out or
                the server answers with an HTTP error status.
        """
        resp = requests.get(self.url, headers=self.header, timeout=30)
        resp.raise_for_status()
        self.url_text = resp.text

    def get_mp4url_and_mp3url_and_name(self):
        """Extract the stream URLs and the titles from the downloaded page.

        Reads ``self.url_text`` and ``self.url``; fills ``self.json``,
        ``self.video``, ``self.audio``, ``self.video_main_name`` and
        ``self.video_sub_name``. Both titles are also printed.

        Raises:
            IndexError: If the page holds no play-info script or no title.
            KeyError: If the play-info JSON lacks the expected keys.
        """
        # NOTE(review): both HTML patterns below were blank/truncated in the
        # published text (the tags had been stripped). They are reconstructed
        # here and should be confirmed against a live page.
        url_dict = re.findall(r'<script>window\.__playinfo__=(.*?)</script>', self.url_text)[0]
        self.json = json.loads(url_dict)
        dash = self.json['data']['dash']
        # The first entry of each list is used.
        self.video = dash['video'][0]['base_url']
        self.audio = dash['audio'][0]['base_url']
        self.video_main_name = re.findall(r'<title[^>]*>(.*?)</title>', self.url_text)[0]
        print('主标题为:', self.video_main_name)
        try:
            # Capture only the digits of the ``p`` parameter, so that query
            # parameters following it no longer break the int() conversion.
            page = re.findall(r'[?&]p=(\d+)', self.url)[0]
            # NOTE(review): the index is used exactly as before; if ``p`` is
            # 1-based this may select the following part -- confirm.
            self.video_sub_name = re.findall(r'"part":"(.*?)"', self.url_text)[int(page)]
        except IndexError:
            # No ``p`` parameter, or no part at that position.
            self.video_sub_name = '无'
        print('子标题为:', self.video_sub_name)

    def write_video_to_content(self):
        """Download the video stream at ``self.video`` into ``1.mp4``."""
        print('正在爬取视频...')
        self._download(self.video, '1.mp4')

    def write_audio_to_content(self):
        """Download the audio stream at ``self.audio`` into ``2.mp3``."""
        print('正在爬取音频...')
        self._download(self.audio, '2.mp3')

    def final_video(self):
        """Merge ``1.mp4`` and ``2.mp3`` into ``<video id>.mp4`` with FFmpeg.

        The output name is the path segment that follows ``video/`` in
        ``self.url``. The two temporary files are removed only after FFmpeg
        finished successfully, so a failed merge does not lose the downloads.

        Raises:
            ValueError: If ``self.url`` contains no ``video/<id>`` segment.
            FileNotFoundError: If no FFmpeg executable can be started.
            subprocess.CalledProcessError: If FFmpeg exits with an error.
        """
        import shutil
        import subprocess

        video_path = '1.mp4'
        audio_path = '2.mp3'
        # Stop at "/", "?" or "#" so links without a query string, or with a
        # trailing slash, still yield a usable file name.
        found = re.search(r'video/([^/?#]+)', self.url)
        if found is None:
            raise ValueError(f'cannot find a video id in {self.url!r}')
        path_out = found.group(1) + '.mp4'
        print(path_out)
        ffmpeg_dir = r'E:\FFmpeg\bin\ffmpeg.exe'
        if not os.path.isfile(ffmpeg_dir):
            # Fall back to an FFmpeg found on PATH when the default install
            # location does not exist.
            ffmpeg_dir = shutil.which('ffmpeg') or ffmpeg_dir
        # An argument list (no shell) keeps text taken from the URL from
        # being interpreted as shell syntax.
        cmd = [ffmpeg_dir, '-i', video_path, '-i', audio_path, path_out, '-loglevel', 'quiet']
        subprocess.run(cmd, check=True)
        os.remove(video_path)
        os.remove(audio_path)

    def start(self):
        """Run the whole job: fetch the page, download both streams, merge."""
        print('创建任务中...')
        print('解析url...')
        self.get_url_content()
        self.get_mp4url_and_mp3url_and_name()
        self.write_video_to_content()
        self.write_audio_to_content()
        print('爬虫阶段已完成!')
        print('准备合并音频视频...')
        self.final_video()
        print('合并阶段已完成!')
        print('任务完成!')
if __name__ == '__main__':
    # Ask for a link or a bare BV id and download that single video.
    time.sleep(1)
    # Surrounding whitespace from a pasted value would end up in the URL.
    video_url = input('请输入链接(或者BV号):').strip()
    # Anything that is not already a link is treated as a BV id. Testing the
    # prefix (instead of "'http' in ...") avoids mistaking an id that merely
    # contains those letters for a link.
    if not video_url.startswith(('http://', 'https://')):
        video_url = 'https://www.bilibili.com/video/' + video_url + '?'
    bilibili = Bilibili(url=video_url)
    bilibili.start()
1.首先定义一个类,并且对其进行属性初始化
class BilibiliUp(object):
    """Collects the BV ids of the videos published by one uploader."""

    def __init__(self, id):
        """Store the uploader id as text and reset all request state.

        Args:
            id: Uploader id; converted with ``str()``.
        """
        self.id = str(id)
        self.url = None
        request_headers = {}
        request_headers["referer"] = "https://www.bilibili.com"
        request_headers["User-Agent"] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
        self.header = request_headers
        # Response text and the paging figures are unknown until the first
        # request has been made and evaluated.
        for field in ('resp_text', 'vedio_count', 'page_count', 'last_page_count'):
            setattr(self, field, None)
        # BV ids gathered so far.
        self.bv_list = []
2.根据ID建立请求链接
def creat_url_request(self):
    """Request the first result page of the uploader's video list.

    Builds the search URL for ``self.id``, stores it in ``self.url``, prints
    it and keeps the raw response body in ``self.resp_text``.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
    """
    self.url = f'https://api.bilibili.com/x/space/arc/search?mid={self.id}&ps=30&tid=0&pn=1&keyword=&order=pubdate&jsonp=jsonp'
    print(self.url)
    # Without a timeout a stalled connection would block the program forever.
    resp = requests.get(self.url, headers=self.header, timeout=30)
    # Stop here on an error status instead of parsing an error body later.
    resp.raise_for_status()
    self.resp_text = resp.text
3.获取UP主视频页数和最后一页的视频数量
由于BV号存在于不同视频页中,所以需要请求多次网站
def get_page_last(self):
    """Work out the paging figures from the first response.

    Reads the video count from ``self.resp_text`` and stores it in
    ``self.vedio_count``. ``self.page_count`` becomes the number of full
    30-entry pages and ``self.last_page_count`` the number of entries left
    for the final, partial page (0 when there is none).

    Returns:
        ``self.bv_list`` (unchanged by this method).

    Raises:
        KeyError: If the response JSON lacks ``data.page.count``.
    """
    data_json = json.loads(self.resp_text)
    self.vedio_count = int(data_json['data']['page']['count'])
    # One rule for every count. Previously a count below 30 set page_count
    # to 1 and left last_page_count as None, so the two fields meant
    # different things depending on the channel size.
    self.page_count, self.last_page_count = divmod(self.vedio_count, 30)
    return self.bv_list
4.开始请求
因为这里我只获取BV号就可以了,所以并未获取多余信息,如果读者需要,可以自行在这几行代码上加上内容。
def new_get(self):
    """Collect the BV ids from every result page into ``self.bv_list``.

    ``self.page_count`` pages are requested, followed by one more page
    unless ``self.last_page_count`` is exactly 0.

    Returns:
        ``self.bv_list`` with the ids found appended.

    Raises:
        requests.RequestException: If a request fails, times out or the
            server answers with an HTTP error status.
        KeyError: If a response lacks ``data.list.vlist``.
    """
    total_pages = int(self.page_count)
    # A remainder of exactly 0 means the extra page would be empty, so it is
    # skipped. An unset remainder (None) still triggers the extra request.
    if self.last_page_count != 0:
        total_pages += 1
    # One loop replaces the former loop plus a copy-pasted trailing request.
    for pn in range(1, total_pages + 1):
        page_url = f'https://api.bilibili.com/x/space/arc/search?mid={self.id}&ps=30&tid=0&pn={pn}&keyword=&order=pubdate&jsonp=jsonp'
        resp = requests.get(page_url, headers=self.header, timeout=30)
        resp.raise_for_status()
        json_data = json.loads(resp.text)
        self.bv_list.extend(bv['bvid'] for bv in json_data['data']['list']['vlist'])
    return self.bv_list
5.主方法
def get_bv(self):
    """Run the full lookup and return what ``new_get`` returns.

    The first page is requested, the paging figures are derived from it and
    then the page-by-page collection runs.
    """
    for prepare in (self.creat_url_request, self.get_page_last):
        prepare()
    return self.new_get()
6.测试
if __name__ == '__main__':
    # Demo run: print every BV id found for one uploader id.
    result = BilibiliUp(id=6125938).get_bv()
    print(result)
7.第二部分完整代码
import requests
import json
class BilibiliUp(object):
    """Collects the BV ids of the videos published by one uploader.

    Typical use is ``BilibiliUp(id).get_bv()``, which returns the list of
    ids gathered from the paged search API.
    """

    def __init__(self, id):
        """Store the uploader id as text and reset all request state.

        Args:
            id: Uploader id; converted with ``str()``.
        """
        self.id = str(id)
        self.url = None
        self.header = {
            "referer": "https://www.bilibili.com",
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'}
        self.resp_text = None         # raw body of the first response
        self.vedio_count = None       # video count reported by the API
        self.page_count = None        # number of full 30-entry pages
        self.last_page_count = None   # entries on the final, partial page
        self.bv_list = []             # BV ids gathered so far

    def creat_url_request(self):
        """Request the first result page of the uploader's video list.

        Builds the search URL for ``self.id``, stores it in ``self.url``,
        prints it and keeps the raw response body in ``self.resp_text``.

        Raises:
            requests.RequestException: If the request fails, times out or
                the server answers with an HTTP error status.
        """
        self.url = f'https://api.bilibili.com/x/space/arc/search?mid={self.id}&ps=30&tid=0&pn=1&keyword=&order=pubdate&jsonp=jsonp'
        print(self.url)
        # Without a timeout a stalled connection would block forever.
        resp = requests.get(self.url, headers=self.header, timeout=30)
        resp.raise_for_status()
        self.resp_text = resp.text

    def get_page_last(self):
        """Work out the paging figures from the first response.

        Reads the video count from ``self.resp_text`` and stores it in
        ``self.vedio_count``. ``self.page_count`` becomes the number of full
        30-entry pages and ``self.last_page_count`` the number of entries
        left for the final, partial page (0 when there is none).

        Returns:
            ``self.bv_list`` (unchanged by this method).

        Raises:
            KeyError: If the response JSON lacks ``data.page.count``.
        """
        data_json = json.loads(self.resp_text)
        self.vedio_count = int(data_json['data']['page']['count'])
        # One rule for every count. Previously a count below 30 set
        # page_count to 1 and left last_page_count as None.
        self.page_count, self.last_page_count = divmod(self.vedio_count, 30)
        return self.bv_list

    def new_get(self):
        """Collect the BV ids from every result page into ``self.bv_list``.

        ``self.page_count`` pages are requested, followed by one more page
        unless ``self.last_page_count`` is exactly 0.

        Returns:
            ``self.bv_list`` with the ids found appended.

        Raises:
            requests.RequestException: If a request fails, times out or the
                server answers with an HTTP error status.
            KeyError: If a response lacks ``data.list.vlist``.
        """
        total_pages = int(self.page_count)
        # A remainder of exactly 0 means the extra page would be empty, so it
        # is skipped. An unset remainder (None) still triggers the request.
        if self.last_page_count != 0:
            total_pages += 1
        # One loop replaces the former loop plus a copy-pasted trailing
        # request.
        for pn in range(1, total_pages + 1):
            page_url = f'https://api.bilibili.com/x/space/arc/search?mid={self.id}&ps=30&tid=0&pn={pn}&keyword=&order=pubdate&jsonp=jsonp'
            resp = requests.get(page_url, headers=self.header, timeout=30)
            resp.raise_for_status()
            json_data = json.loads(resp.text)
            self.bv_list.extend(bv['bvid'] for bv in json_data['data']['list']['vlist'])
        return self.bv_list

    def get_bv(self):
        """Run the full lookup and return the collected BV ids."""
        self.creat_url_request()
        self.get_page_last()
        return self.new_get()
if __name__ == '__main__':
    # Demo run: print every BV id found for one uploader id.
    result = BilibiliUp(id=612593877).get_bv()
    print(result)
获取一个UP主所有的视频
import BilibiliUP
import BilibiliBV
# Look up every BV id of one uploader, then download the videos one by one.
bvget = BilibiliUP.BilibiliUp(id=672600531)
# Iterate over a copy of the returned list.
bv_list = list(bvget.get_bv())
for bv in bv_list:
    # Page URL in the same form the single-video script builds for a BV id.
    video_url = f'https://www.bilibili.com/video/{bv}?'
    ob = BilibiliBV.Bilibili(url=video_url)
    ob.start()
以上就是本文的主要内容,如有疑问,请评论!