目录
1.酷6
2.快手
3.A站
4.B站(音频与视频分别保存,未合成)
5.抖音
6.虎牙
import json
import requests
def ku6_spider(page_count=11, out_dir='video'):
    """Download the videos listed by the ku6.com feed endpoint.

    For each page the feed URL is requested, the JSON body is parsed and
    every entry of its ``data`` list is saved as ``<title>.mp4`` inside
    *out_dir*. Progress is reported with ``print``.

    Args:
        page_count: Number of feed pages to fetch; ``pageNo`` runs from 0 to
            ``page_count - 1``. The default of 11 matches the previously
            hard-coded ``range(0, 11)``.
        out_dir: Directory the files are written to. It is created when it
            does not exist yet.
    """
    # Imported locally so the function works even when the module-level
    # imports of the surrounding file have not been executed yet.
    import os
    import re

    # 1. Request URL and headers are the same for every page, so build them once.
    base_url = 'https://www.ku6.com/video/feed'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    }
    # The original wrote to 'video\\<name>' without creating the directory.
    os.makedirs(out_dir, exist_ok=True)

    for page in range(page_count):
        print('正在抓取第{}页数据'.format(page + 1))
        params = {
            'pageNo': str(page),
            'pageSize': '40',
            'subjectId': '76',
        }
        # 2. Request the feed page with browser-like headers.
        response = requests.get(base_url, headers=headers, params=params)
        # 3. Parse the JSON body; the video entries live under 'data'.
        json_data = json.loads(response.text)
        data_list = json_data['data'] or []
        for data in data_list:
            # Titles can contain characters that are not valid in file names
            # (path separators, '?', '"', ...), so replace them.
            safe_title = re.sub(r'[\\/:*?"<>|\r\n]', '_', data['title'])
            video_name = safe_title + ".mp4"
            video_url = data['playUrl']
            print('下载中:', video_name)
            video_data = requests.get(video_url, headers=headers).content
            # 4. Save the bytes; os.path.join keeps the path portable instead
            # of relying on the Windows-only 'video\\' prefix.
            with open(os.path.join(out_dir, video_name), 'wb') as f:
                f.write(video_data)
            print('下载完成...')


if __name__ == '__main__':
    ku6_spider()
import os
import time
import requests
import json
import pprint
import re
# Create the directory (relative to the current working directory) that the
# downloaded videos are saved into.
dir_name = 'video'
# exist_ok=True replaces the exists()-then-mkdir() pair, which could fail if
# the directory appeared between the check and the creation.
os.makedirs(dir_name, exist_ok=True)
# Request headers copied wholesale from the browser. Editor tip from the
# original author: after pasting, use Ctrl+R with "regex" enabled, search for
# `(.*?): (.*)` and replace with `'$1': '$2',` to turn the raw header lines
# into dictionary entries.
#
# Two copied entries were removed on purpose:
#   * 'Content-Length' - the body length depends on the search keyword, so a
#     fixed value copied from one request is stale; the HTTP library computes
#     the real length from the body.
#   * 'br' in 'Accept-Encoding' - brotli responses can only be decoded when the
#     optional brotli package is installed; gzip/deflate always work.
# NOTE(review): the Cookie below is a literal value pasted from one browser
# session; presumably it expires and has to be replaced with a fresh one.
headers = {
    'accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'content-type': 'application/json',
    'Cookie': 'kpf=PC_WEB; kpn=KUAISHOU_VISION; clientid=3; did=web_acb408fff3a5f7cd020782d58bb9caa9; ktrace-context=1|MS43NjQ1ODM2OTgyODY2OTgyLjI4ODYxOTgxLjE2MzczNzIwMzc5NTkuMTQ1NDUxNA==|MS43NjQ1ODM2OTgyODY2OTgyLjI3NzMzOTY1LjE2MzczNzIwMzc5NTkuMTQ1NDUxNQ==|0|graphql-server|webservice|false|NA; client_key=65890b29; userId=1232368006; kuaishou.server.web_st=ChZrdWFpc2hvdS5zZXJ2ZXIud2ViLnN0EqABXhLnnN974NXDx7wxD7EXA0gUwiENGncAU1PMNvGRI8hgQVPES30K2a6e8FZ9L3yv89WVXIZ5I1HsDjjWJlzDijZgHPj64KgQ8dkTm8-Aq5monZejiGHAuenrIuDovugsUnncYRtFHLY_bmEtKpBDoaswti5UnDOkiVHAuhMMPlqdPBKYwV_LZ3SGFMeznHUrJv5Wg4o4C45yi-1iuOPyDRoSsmhEcimAl3NtJGybSc8y6sdlIiCHg_pUdXqAoXPplQJ-iHcM2h_MTI_3Wkdnw9ucUMR5UCgFMAE; kuaishou.server.web_ph=b3651a369fb9eb9f33d30ccc2cc691a5ecbf',
    'Host': 'www.kuaishou.com',
    'Origin': 'https://www.kuaishou.com',
    'Referer': 'https://www.kuaishou.com/search/video?searchKey=%E6%85%A2%E6%91%87',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}
# Kuaishou: search videos by keyword and download every hit of the first two
# result pages into `dir_name`.
#
# The request body below was copied from the "Payload" tab next to the request
# headers in the browser's network panel (values that are not valid Python had
# to be quoted by hand).
keyword = input("请输入你想要查询的关键词:")

# Endpoint found by searching for a video on the site and inspecting the
# captured request.
baseurl = "https://www.kuaishou.com/graphql"

# The GraphQL query text never changes between pages, so it is defined once
# instead of being rebuilt on every loop iteration.
search_query = "query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {\n visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n type\n author {\n id\n name\n following\n headerUrl\n headerUrls {\n cdn\n url\n __typename\n }\n __typename\n }\n tags {\n type\n name\n __typename\n }\n photo {\n id\n duration\n caption\n likeCount\n realLikeCount\n coverUrl\n photoUrl\n liked\n timestamp\n expTag\n coverUrls {\n cdn\n url\n __typename\n }\n photoUrls {\n cdn\n url\n __typename\n }\n animatedCoverUrl\n stereoType\n videoRatio\n __typename\n }\n canAddComment\n currentPcursor\n llsid\n status\n __typename\n }\n searchSessionId\n pcursor\n aladdinBanner {\n imgUrl\n link\n __typename\n }\n __typename\n }\n}\n"

for pcursor in range(0, 2):
    # "keyword" selects the search term, "pcursor" selects the result page
    # (scrolling down in the browser produces one request per page).
    pcursor = str(pcursor)
    data = {
        'operationName': "visionSearchPhoto",
        'query': search_query,
        'variables': {'keyword': keyword, 'pcursor': pcursor, 'page': "search"}
    }
    # The headers declare 'content-type: application/json', so the body has to
    # be sent as a JSON string rather than as a dict.
    data = json.dumps(data)
    time.sleep(2)
    # url: endpoint, headers: browser disguise, data: query payload.
    request = requests.post(url=baseurl, headers=headers, data=data)
    response = request.json()
    request.close()

    # Dictionaries are navigated by key, lists by position.
    feeds_list = response['data']['visionSearchPhoto']['feeds'] or []
    for feeds in feeds_list:
        # Each element of feeds_list is a dict describing one video.
        title = feeds['photo']['caption']
        print(title)
        # Previously stored in a variable named `list`, which shadowed the
        # builtin for the rest of the module.
        video_url = feeds['photo']['photoUrl']
        print(video_url)
        # Windows file names must not contain these characters; the backslash
        # was missing from the original character class.
        # (Very long titles could additionally be shortened by slicing.)
        new_title = re.sub(r'[\\/:*?"<>|\r\n]', '_', title)
        # Request the video address and keep the binary body.
        mp4_data = requests.get(video_url).content
        with open(dir_name + "/" + new_title + '.mp4', mode='wb') as f:
            f.write(mp4_data)
        print(new_title, "下载完成")
        # The original called mp4_data.close() here; mp4_data is a bytes
        # object without a close() method, so the script crashed after the
        # first download.
import requests
import re
import os
import zipfile
from urllib.parse import urljoin

# AcFun: download every HLS segment referenced by one video page and join the
# segments into a single file.
url = input('请输入视频网址:')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
# Take the first "backupUrl" entry found in the page source and strip the
# escaped-JSON wrapping (\":[\" and remaining backslashes) to get the playlist URL.
m3u8_url = re.findall('"backupUrl(.*?)"]', response.text)[0].replace('\\":[\\"', '').replace('\\', '')
# NOTE(review): this pattern is kept exactly as found (only made a raw string).
# It has no opening anchor and ends with a space, which looks like surrounding
# HTML tags were lost - confirm against a live page.
title = re.findall(r'(.*?)- AcFun弹幕视频网 - 认真你就输啦 \(\?\ω\?\)ノ- \( ゜- ゜\)つロ ', response.text)[0]

# The title names both the working directory and the output file, so remove
# characters that are not valid in file names.
safe_title = re.sub(r'[\\/:*?"<>|\r\n]', '_', title).strip()
# os.path.join(..., '') appends the platform's separator, replacing the
# Windows-only f'{title}\\' prefix.
filename = os.path.join(safe_title, '')
os.makedirs(filename, exist_ok=True)

m3u8_text = requests.get(url=m3u8_url, headers=headers).text
# In a playlist every non-empty line that does not start with '#' is a segment
# URI. The original removed a fixed list of tags with single-digit regexes,
# which left stray tokens behind for values such as a two-digit duration.
m3u8_data = [
    line.strip()
    for line in m3u8_text.splitlines()
    if line.strip() and not line.strip().startswith('#')
]

files = []
for number, index in enumerate(m3u8_data):
    # Relative segment URIs are resolved against the playlist URL instead of a
    # hard-coded CDN host.
    ts_url = urljoin(m3u8_url, index)
    # Zero-padded sequence number keeps the segments in playlist order.
    ts_name = '{:05d}'.format(number)
    # Binary content of the segment.
    ts_content = requests.get(url=ts_url, headers=headers).content
    with open(os.path.join(filename, ts_name + '.ts'), mode='wb') as f:
        f.write(ts_content)
    files.append(ts_name + '.ts')
    print(ts_name)
print('视频片段下载完成')
print('开始合并......')
print(files)
# The original packed the segments into a ZIP archive named *.mp4, which is
# not a playable video. Append the segments back to back, in playlist order.
# NOTE(review): assumes the segments are MPEG-TS, where concatenation yields a
# playable stream; the '.mp4' extension is kept from the original.
with open(os.path.join(filename, safe_title + '.mp4'), mode='wb') as merged:
    for i in files:
        file = os.path.join(filename, i)
        with open(file, mode='rb') as part:
            merged.write(part.read())
        os.remove(file)
print('爬取完成')
import json
import re
import subprocess
import requests
# Fetch a URL
def get_response(html_url):
    """Send a GET request to *html_url* and return the response object.

    The request carries a browser ``User-Agent`` and a fixed ``referer``; per
    the original author's note the referer was added because requests without
    it were answered with 403.
    """
    referer = 'https://www.bilibili.com/video/BV1TF411w7vv?spm_id_from=333.337.search-card.all.click&vd_source=415a9fdfbb14115b672b4063903571a0'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    return requests.get(
        url=html_url,
        headers={'referer': referer, 'User-Agent': user_agent},
    )
# Collect the title and the stream addresses of one video page
def get_video_info(html_url):
    """Return ``[title, audio_url, video_url]`` for the video page *html_url*.

    The page is fetched with ``get_response``; the title and an embedded JSON
    document are cut out of the HTML with regular expressions, and the first
    audio and first video ``baseUrl`` are read from ``data.dash`` of that JSON.

    Raises:
        ValueError: If the title or the embedded JSON cannot be located in
            the page, or if the located text is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``).
    """
    page = get_response(html_url=html_url).text

    # NOTE(review): both patterns were EMPTY strings in the recovered source
    # (the HTML tags appear to have been stripped), which made the function
    # fail in json.loads(''). They are reconstructed here as the <title> tag
    # (without the site-name suffix) and the `window.__playinfo__=` script
    # block - confirm against a live page.
    title_match = re.search(
        r'<title[^>]*>(.*?)(?:_哔哩哔哩_bilibili)?</title>', page, re.S)
    info_match = re.search(
        r'<script>window\.__playinfo__=(.*?)</script>', page, re.S)
    if title_match is None or info_match is None:
        raise ValueError(
            'could not locate the video title or play info in ' + html_url)

    title = title_match.group(1).strip()
    json_data = json.loads(info_match.group(1))
    # Key lookups: take the first entry of the audio and video stream lists.
    audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
    video_url = json_data['data']['dash']['video'][0]['baseUrl']
    video_info = [title, audio_url, video_url]
    return video_info
# Write the downloaded streams to disk
def save(title, audio_url, video_url):
    """Download both streams and store them as ``<title>.mp3`` / ``<title>.mp4``.

    Both downloads finish before anything is written. The audio stream goes to
    the ``.mp3`` file and the video stream to the ``.mp4`` file, in the
    current working directory.
    """
    downloads = (
        ('.mp3', get_response(html_url=audio_url).content),
        ('.mp4', get_response(html_url=video_url).content),
    )
    for suffix, payload in downloads:
        with open(title + suffix, mode='wb') as out:
            out.write(payload)
    print(title, '保存成功')
# def merge_data(vide_name):
# print('视频开始合成', vide_name)
# cmd = f"ffmpeg -i {vide_name}.mp4 -i {vide_name}.mp3 -c:a aac -strict experimental {vide_name}output.mp4"
# #print(cmd)
# subprocess.run(cmd, shell=True)
# print('视频合成完毕', vide_name)
def main(bv_id):
    """Look up the video with id *bv_id* and save its audio and video streams."""
    page_url = 'https://www.bilibili.com/video/{}'.format(bv_id)
    title, audio_url, video_url = get_video_info(page_url)
    save(title, audio_url, video_url)
    # Merging the two files is currently disabled:
    # merge_data(title)


# Ask for the BV id on the console and run the download for it.
keyword = input('请输入要下载的视频BV号:')
main(keyword)
# url = 'https://www.bilibili.com/video/BV1TF411w7vv'
# video_info = get_video_info(url)
# print(video_info)
import requests
import re
import os

# Douyin: download the single video behind the page URL below.
url = 'https://www.douyin.com/video/7114220525978668303'
headers = {
    'cookie': 'douyin.com; ttcid=444dfe8e89ff4d99b0662076ad171c8775; ttwid=1%7CTnFKlrGi3lHjKf5bshFdP9Nwu_Vsiwo-TxvX9NISgj8%7C1642083887%7Cfbfa904ea2900763eb6ac090bdd09014d80840da1ca485bbfea193d5401b330e; MONITOR_WEB_ID=c27b9f4a-4917-4256-be93-e948308467e3; odin_tt=0510c3c4196f54b541a96ac64e8b585b3a755be85057da8a1f3fa068e3f7b75ca2de4345e2b856f1e7b3f9455d86079731fe7d07a9f10890f26855d3674858e1; passport_csrf_token=e0b90cb756903c370592bd558c2b0cf5; passport_csrf_token_default=e0b90cb756903c370592bd558c2b0cf5; s_v_web_id=verify_l268jj46_kc7yYkD6_YHWW_4x4v_9snI_EDE0zro77uRn; AVATAR_FULL_LOGIN_GUIDE_COUNT=1; AVATAR_FULL_LOGIN_GUIDE_TIMESTAMP=1650982839652; AVATAR_FULL_LOGIN_GUIDE_ITA_COUNT=1; AVATAR_FULL_LOGIN_GUIDE_ITA_TIMESTAMP=1650982839652; __ac_nonce=0627ba36600d465d72261; __ac_signature=_02B4Z6wo00f01zrB8EAAAIDCWcswKSh.eLM65fTAAKzW8srQpmSjmL6YX9IsdmMSL4a9EBuyJvIwNMROqFQktniG-Ur-UDPK6wHInC8QKqRYUmyGnflwUXLpKzPgVt2FtREyprGmCDAZLrIpcc; douyin.com; strategyABtestKey=1652269927.635; AB_LOGIN_GUIDE_TIMESTAMP=1652269927510; AVATAR_LOGIN_GUIDE_COUNT=1; _tea_utm_cache_2285=undefined; _tea_utm_cache_6383=undefined; _tea_utm_cache_1300=undefined; pwa_guide_count=3; IS_HIDE_THEME_CHANGE=1; THEME_STAY_TIME=299808; msToken=XGPVAVUHDi9iTEQRjdXuQ0YyetxhHq0c9EH1dLLpttanbCXsNSD0DRxwk9oUB0vZ7LB9vKd-ABi2kAkzj2lCn1x98lJ4iTFbf260RcLav-G4QkhNyq8qV9i3oEJRyc8t; home_can_add_dy_2_desktop=1; msToken=3ALqenaebbJHw7kQDiDG6aRAgVYm5WM1pVGqmyyidbGgYpWRWKn-wQ9tcjoxWrHvwcqoYAx3tQ4IGE1qixdq2ei_fPrirMeeI6HeooU3sGR2wyWQ2OAAh2RejVJOrmpA; tt_scid=Gp0q0JW0LDreTqplgpajIZNHCB0.p1NcVv0hhZBgaGDw4SFxkXGlXfKafiCVmWAWc537',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
# Let requests guess the text encoding from the body.
response.encoding = response.apparent_encoding

# NOTE(review): the title pattern in the recovered source was '(.*?) ', which
# returns whatever precedes the first space of the page (HTML tags appear to
# have been stripped). Reconstructed as the content of the <title> tag -
# confirm against a live page.
title = re.findall(r'<title[^>]*>(.*?)</title>', response.text, re.S)[0].strip()
# NOTE(review): 'vRLmmJ' looks like a fragment specific to this one video's
# address; the pattern probably needs adjusting for any other video.
href = re.findall('src(.*?)vRLmmJ', response.text)[0]
# The captured text is percent-decoded, then the leading '":"' is swapped for
# the URL scheme to obtain the stream address.
video_url = requests.utils.unquote(href).replace('":"', 'https:')
video_content = requests.get(url=video_url).content

# Replace characters that are not valid in file names, and build the path with
# os.path.join instead of the Windows-only 'video\\' prefix. The directory is
# created here because this script does not create it anywhere else.
safe_title = re.sub(r'[\\/:*?"<>|\r\n]', '_', title)
os.makedirs('video', exist_ok=True)
with open(os.path.join('video', safe_title + '.mp4'), mode='wb') as f:
    f.write(video_content)
print(title, video_url)
import requests
import re
import os

# Huya: walk the first four pages of the video listing and download every
# video found on them into the 'video' directory.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
}
# The original wrote to 'video\\<title>.mp4' without creating the directory.
os.makedirs('video', exist_ok=True)

for page in range(1, 5):
    print(f'正在采集第{page}页的数据内容')
    link = f'https://v.huya.com/g/all?set_id=51&order=hot&page={page}'
    response = requests.get(url=link, headers=headers)
    # NOTE(review): the pattern in the recovered source was a single space
    # (the HTML tags appear to have been stripped); the author's comment says
    # it matched one or more digits (\d+). Reconstructed as the numeric
    # data-vid attribute of the list items - confirm against a live page.
    video_id_list = re.findall(r'<li data-vid="(\d+)"', response.text)
    print(video_id_list)
    for video_id in video_id_list:
        url = f'https://liveapi.huya.com/moment/getMomentContent?videoId={video_id}&uid=&_=1652789442223'
        # Parse the JSON body once instead of once per field.
        moment = requests.get(url=url, headers=headers).json()['data']['moment']
        title = moment['title']
        video_url = moment['videoInfo']['definitions'][0]['url']
        # Saving also needs a request: fetch the binary video data.
        video_content = requests.get(url=video_url, headers=headers).content
        # Titles may contain characters that are not valid in file names.
        safe_title = re.sub(r'[\\/:*?"<>|\r\n]', '_', title)
        with open(os.path.join('video', safe_title + '.mp4'), mode='wb') as f:
            f.write(video_content)
        print(title, '保存成功')
文章存在借鉴,如有侵权请联系修改删除!