import json
import os
import re
import time

import jsonpath
import requests
from lxml import etree
from moviepy.editor import *
# Browser-like user agent sent with every request.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
)

# Seconds to wait for a server response before giving up on a request.
REQUEST_TIMEOUT = 30

# Characters stripped from titles before they are used as file names:
# the ones Windows forbids in file names, plus '&' and whitespace, which
# the script has always removed because they disturb the merge step.
_UNSAFE_TITLE_CHARS = re.compile(r'[\\/:*?"<>|&\s]')


def sanitize_title(title):
    """Return *title* with file-name-unsafe characters removed.

    Falls back to ``'video'`` when nothing is left, so the result can
    always be used as a file name stem.
    """
    cleaned = _UNSAFE_TITLE_CHARS.sub('', title)
    return cleaned or 'video'


def decode_json_string(raw):
    """Decode the raw contents of a JSON string literal.

    The media URLs are cut out of embedded JSON with a regex, so escapes
    such as ``\\u0026`` or ``\\/`` are still present in the captured text.
    If the text is not a valid JSON string body it is returned unchanged.
    """
    try:
        return json.loads(f'"{raw}"')
    except ValueError:
        return raw


def extract_media_url(playinfo_text, kind):
    """Return the first ``baseUrl`` listed under *kind* in the play info.

    Args:
        playinfo_text: text of the ``window.__playinfo__`` script element.
        kind: ``'video'`` or ``'audio'``.

    Raises:
        ValueError: if no matching stream entry is found.
    """
    pattern = r'"%s":\[{"id":\d+,"baseUrl":"(.*?)"' % re.escape(kind)
    match = re.search(pattern, playinfo_text)
    if match is None:
        raise ValueError(f'no {kind} stream found in play info')
    return decode_json_string(match.group(1))


def download_to_file(url, headers, path, chunk_size=1024 * 1024):
    """Stream *url* into *path* and return the size written, in KB.

    The body is written chunk by chunk so a large file is never held in
    memory as a whole. The size is taken from the file on disk, so it does
    not depend on the server sending a Content-Length header.
    """
    with requests.get(url, headers=headers, stream=True,
                      timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
    return os.path.getsize(path) // 1024


def fetch_and_merge(page_url):
    """Download the video and audio streams of one page and merge them.

    The merged file is written to ``<title>.mp4`` in the current
    directory. The two intermediate files are removed afterwards, even
    when downloading or merging fails.

    Raises:
        ValueError: if the page lacks a title, the play-info script, or a
            matching stream entry.
        requests.RequestException: on network or HTTP errors.
    """
    headers = {'user-agent': USER_AGENT, 'referer': page_url}

    response = requests.get(page_url, headers=headers,
                            timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    html_obj = etree.HTML(response.text)
    if html_obj is None:
        raise ValueError('empty or unparsable page')

    titles = html_obj.xpath('//title/text()')
    if not titles:
        raise ValueError('page has no <title>')
    title_ = sanitize_title(titles[0])

    scripts = html_obj.xpath(
        "//script[contains(text(),'window.__playinfo__')]/text()")
    if not scripts:
        raise ValueError('page has no window.__playinfo__ script')
    video_url = extract_media_url(scripts[0], 'video')
    audio_url = extract_media_url(scripts[0], 'audio')

    # The intermediate files get a '!' suffix so they cannot collide with
    # the merged output, which uses the plain title.
    title_1 = title_ + '!'
    video_path = f'{title_1}.mp4'
    audio_path = f'{title_1}.mp3'
    merged_path = f'{title_}.mp4'

    try:
        video_size = download_to_file(video_url, headers, video_path)
        print(f'{title_1}>>>>纯视频文件下载完毕....,大小为{video_size}KB,{video_size // 1024}MB')
        audio_size = download_to_file(audio_url, headers, audio_path)
        print(f'{title_1}>>>>纯音频文件下载完毕....,大小为{audio_size}KB,{audio_size // 1024}MB')

        ffmpeg_tools.ffmpeg_merge_video_audio(video_path, audio_path,
                                              merged_path)
        print('视频合成成功....')
        res_ = os.stat(merged_path).st_size // 1024
        print(f'{title_1}>>>>视频合成成功....,大小为{res_}KB,{res_ // 1024}MB')
    finally:
        # Always clean up the intermediate files, including after a failure.
        for temp_path in (video_path, audio_path):
            if os.path.exists(temp_path):
                os.remove(temp_path)


def main():
    """Search for a keyword and download every video on the result page."""
    data_ = input('请输入你想要爬取的内容:')

    # Placeholders: fill in the search API url, referer and cookie.
    url_ = ' '
    headers_ = {
        'user-agent': USER_AGENT,
        'referer': ' ',
        "cookie": " ",
    }
    if not url_.strip():
        raise SystemExit('请先在脚本中填写搜索接口的 url_、referer 和 cookie')

    # NOTE(review): 'w_rid' / 'wts' look like a time-bound request
    # signature copied from a browser session; presumably they need to be
    # refreshed for the request to keep working - confirm.
    params_ = {
        '__refresh__': 'true',
        '_extra': '',
        'context': '',
        'page': '1',
        'page_size': '30',
        'from_source': '',
        'from_spmid': '333.337',
        'platform': 'pc',
        'highlight': '1',
        'single_column': '0',
        'keyword': data_,
        'qv_id': 'VXWHEaclSzhGv2962GrDRsUp2lXwliu9',
        'ad_resource': '5654',
        'source_tag': '3',
        'gaia_vtoken': '',
        'category_id': '',
        'search_type': 'video',
        'dynamic_offset': '0',
        'w_rid': '01ca483419929d1bc5556062c4f6b179',
        'wts': '1680096104',
    }
    response_ = requests.get(url_, headers=headers_, params=params_,
                             timeout=REQUEST_TIMEOUT)
    response_.raise_for_status()
    py_data = response_.json()

    # Collect the detail-page url of every video in the search result.
    # jsonpath returns False (not an empty list) when nothing matches.
    url_list = jsonpath.jsonpath(py_data, '$..arcurl') or []
    print(len(url_list), url_list)

    for page_url in url_list:
        try:
            fetch_and_merge(page_url)
        except (requests.RequestException, etree.LxmlError,
                ValueError, OSError) as err:
            # One broken page must not abort the remaining downloads.
            print(f'{page_url}>>>>下载失败: {err}')
        # Throttle requests to lower the chance of being blocked.
        time.sleep(1)
        # Visually separate the output of each video.
        print('*' * 80)
    print('视频全部抓取完毕.....')


if __name__ == '__main__':
    main()
>输入你想要搜索的关键字并回车,脚本会请求搜索结果的第一页(最多 30 个视频),并通过本篇代码把这些视频全部爬取下来
**本博客仅适用于小破站**