python小程序——视频篇

目录

1.酷6

2.快手

3.A站

4.B站(音,视频未合成)

5.抖音

6.虎牙


import json
import requests


def ku6_spider():
    """Crawl 11 pages of the ku6.com video feed and save every video to ./video.

    Pure side-effect crawler: downloads each listed clip as <title>.mp4.
    """
    import os
    import re

    # Loop-invariant request parts, hoisted out of the page loop.
    base_url = 'https://www.ku6.com/video/feed'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    }
    # The original open() failed when ./video did not exist yet.
    os.makedirs('video', exist_ok=True)

    for page in range(0, 11):
        print('正在抓取第{}页数据'.format(page+1))
        params = {
            'pageNo': str(page),
            'pageSize': '40',
            'subjectId': '76',
        }

        # Fetch one feed page; the endpoint returns JSON text.
        response = requests.get(base_url, headers=headers, params=params)
        json_data = json.loads(response.text)

        for data in json_data['data']:
            # Replace characters Windows forbids in file names
            # (consistent with the kuaishou section of this article).
            safe_title = re.sub(r'[\\/:*?"<>|\n]', '_', data['title'])
            video_name = safe_title + ".mp4"
            video_url = data['playUrl']

            print('下载中:', video_name)
            video_data = requests.get(video_url, headers=headers).content

            # Persist the binary payload.
            with open(os.path.join('video', video_name), 'wb') as f:
                f.write(video_data)
                print('下载完成...')


# Run the crawler only when this file is executed directly (not on import).
if __name__ == '__main__':
    ku6_spider()

2.快手

import os
import time
import requests
import json
import pprint
import re

# Create a ./video directory in the current folder for the downloads,
# unless it already exists.
dir_name = 'video'
if not os.path.exists(dir_name):
    os.mkdir(dir_name)

# Request headers copied wholesale from the browser's dev tools. The author
# quoted them with a Ctrl+R regex replace: pattern `(.*?): (.*)` replaced by
# `'$1': '$2',` (note the space after each colon, trailing comma added).
# NOTE(review): Cookie and msToken values are session-bound and expire;
# they must be refreshed from a live browser session for this to work.
headers = {
    'accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Content-Length': '1380',
    'content-type': 'application/json',
    'Cookie': 'kpf=PC_WEB; kpn=KUAISHOU_VISION; clientid=3; did=web_acb408fff3a5f7cd020782d58bb9caa9; ktrace-context=1|MS43NjQ1ODM2OTgyODY2OTgyLjI4ODYxOTgxLjE2MzczNzIwMzc5NTkuMTQ1NDUxNA==|MS43NjQ1ODM2OTgyODY2OTgyLjI3NzMzOTY1LjE2MzczNzIwMzc5NTkuMTQ1NDUxNQ==|0|graphql-server|webservice|false|NA; client_key=65890b29; userId=1232368006; kuaishou.server.web_st=ChZrdWFpc2hvdS5zZXJ2ZXIud2ViLnN0EqABXhLnnN974NXDx7wxD7EXA0gUwiENGncAU1PMNvGRI8hgQVPES30K2a6e8FZ9L3yv89WVXIZ5I1HsDjjWJlzDijZgHPj64KgQ8dkTm8-Aq5monZejiGHAuenrIuDovugsUnncYRtFHLY_bmEtKpBDoaswti5UnDOkiVHAuhMMPlqdPBKYwV_LZ3SGFMeznHUrJv5Wg4o4C45yi-1iuOPyDRoSsmhEcimAl3NtJGybSc8y6sdlIiCHg_pUdXqAoXPplQJ-iHcM2h_MTI_3Wkdnw9ucUMR5UCgFMAE; kuaishou.server.web_ph=b3651a369fb9eb9f33d30ccc2cc691a5ecbf',
    'Host': 'www.kuaishou.com',
    'Origin': 'https://www.kuaishou.com',
    'Referer': 'https://www.kuaishou.com/search/video?searchKey=%E6%85%A2%E6%91%87',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}
# POST payload as seen in the browser's "payload" tab (keys quoted for Python).
keyword = input("请输入你想要查询的关键词:")
for pcursor in range(0, 2):
    pcursor = str(pcursor)
    # GraphQL request body: "keyword" is the search term, "pcursor" pages
    # through results (scrolling the page produces one request per batch).
    data = {
        'operationName': "visionSearchPhoto",
        'query': "query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {\n  visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {\n    result\n    llsid\n    webPageArea\n    feeds {\n      type\n      author {\n        id\n        name\n        following\n        headerUrl\n        headerUrls {\n          cdn\n          url\n          __typename\n        }\n        __typename\n      }\n      tags {\n        type\n        name\n        __typename\n      }\n      photo {\n        id\n        duration\n        caption\n        likeCount\n        realLikeCount\n        coverUrl\n        photoUrl\n        liked\n        timestamp\n        expTag\n        coverUrls {\n          cdn\n          url\n          __typename\n        }\n        photoUrls {\n          cdn\n          url\n          __typename\n        }\n        animatedCoverUrl\n        stereoType\n        videoRatio\n        __typename\n      }\n      canAddComment\n      currentPcursor\n      llsid\n      status\n      __typename\n    }\n    searchSessionId\n    pcursor\n    aladdinBanner {\n      imgUrl\n      link\n      __typename\n    }\n    __typename\n  }\n}\n",
        'variables': {'keyword': keyword, 'pcursor': pcursor, 'page': "search"}
    }
    baseurl = "https://www.kuaishou.com/graphql"
    # headers declare 'content-type': 'application/json', so the body must be
    # serialized to a JSON string before posting.
    data = json.dumps(data)
    time.sleep(2)  # be polite between page requests
    request = requests.post(url=baseurl, headers=headers, data=data)
    response = request.json()
    # Dict data is addressed by key, lists by index.
    feeds_list = response['data']['visionSearchPhoto']['feeds']
    for feeds in feeds_list:
        # Each `feeds` entry is one video's metadata dict.
        title = feeds['photo']['caption']
        print(title)
        # Renamed from `list`, which shadowed the builtin.
        video_url = feeds['photo']['photoUrl']
        print(video_url)
        # Windows forbids \ / : * ? " < > | in file names; also drop newlines.
        new_title = re.sub(r'[\/:*?"<>|\n]', '_', title)
        # Request the clip itself and save the binary payload.
        mp4_data = requests.get(video_url).content
        with open(dir_name + "/" + new_title + '.mp4', mode='wb') as f:
            f.write(mp4_data)
            print(new_title, "下载完成")
    # BUG fixed: the original called mp4_data.close(), but .content is a
    # bytes object with no close() — that raised AttributeError after the
    # first page's downloads completed.
    request.close()

3.A站

import requests
import re
import os
import zipfile


# Ask the user for an AcFun video page URL.
url = input('请输入视频网址:')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
}
response = requests.get(url=url,headers=headers)
#print(response.text)
# Pull the m3u8 playlist URL from the page's "backupUrl" JSON field:
# take the first match, then strip the JSON escaping (\":[\" prefix and
# remaining backslashes) to get a clean URL.
m3u8_url = re.findall('"backupUrl(.*?)"]',response.text)[0].replace('\\":[\\"','').replace('\\','')
# The page <title> ends with AcFun's slogan; capture everything before it
# as the video title.
title = re.findall('(.*?)- AcFun弹幕视频网 - 认真你就输啦 \(\?\ω\?\)ノ- \( ゜- ゜\)つロ',response.text)[0]
#print(m3u8_url)

# Create a folder named after the video to hold the .ts segments.
# NOTE(review): title is used verbatim as a directory name — characters that
# are illegal on Windows (\ / : * ? " < > |) would make mkdir fail; confirm
# titles are clean or sanitize before this point.
filename = f'{title}\\'
if not os.path.exists(filename):
    os.mkdir(filename)


# Fetch the m3u8 playlist (plain text).
m3u8_data = requests.get(url=m3u8_url, headers=headers).text
# Every playlist line starting with '#' is a directive (#EXTM3U, #EXTINF:...,
# #EXT-X-...); the remaining tokens are the segment names.
# BUG fixed: the original chain of re.sub calls used single-digit patterns
# (e.g. '#EXT-X-TARGETDURATION:\d', '#EXTINF:\d\.\d+,'), so multi-digit
# durations/sequence numbers left residue behind that was then mistaken for
# a segment name after split(). Filtering on the '#' prefix is robust.
ts_list = [token for token in m3u8_data.split() if not token.startswith('#')]
for index in ts_list:
    # Segment URLs are relative to AcFun's HLS CDN prefix.
    ts_url = 'https://ali-safety-video.acfun.cn/mediacloud/acfun/acfun_video/hls/' + index
    ts_name = ts_url.split('.')[3]
    # Binary segment payload.
    ts_content = requests.get(url=ts_url, headers=headers).content

    with open(filename + ts_name + '.ts', mode='wb') as f:
        f.write(ts_content)
        print(ts_name)

print('视频片段下载完成')
print('开始合并......')

# Concatenate the downloaded .ts segments into one file. MPEG-TS segments can
# simply be appended back-to-back.
# BUG fixed: the original used zipfile.ZipFile here, which produces a ZIP
# archive merely *named* .mp4 — not a playable video file.
# NOTE(review): lexicographic sort mis-orders purely numeric names
# (1, 10, 2, ...); the original unsorted os.listdir was worse, but confirm the
# segment naming scheme and use a numeric key if needed.
files = sorted(os.listdir(filename))
print(files)
with open(filename + title + '.mp4', mode='wb') as out:
    for i in files:
        file = filename + i
        with open(file, mode='rb') as part:
            out.write(part.read())
        os.remove(file)
print('爬取完成')



4.B站(音,视频未合成)

import json
import re
import subprocess
import requests


def get_response(html_url):
    """GET *html_url* with browser-like headers and return the Response.

    The Referer header is required — without it Bilibili's CDN answers 403.
    """
    request_headers = {
        'referer': 'https://www.bilibili.com/video/BV1TF411w7vv?spm_id_from=333.337.search-card.all.click&vd_source=415a9fdfbb14115b672b4063903571a0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    }
    return requests.get(url=html_url, headers=request_headers)


# NOTE(review): this whole section was collapsed onto one line and its regex
# string literals were destroyed when the article was scraped (the HTML tags
# inside the patterns were stripped). Reconstructed below from the readable
# collapsed text; the two patterns are the canonical ones for this tutorial
# family — verify them against the live page before relying on this.
def get_video_info(html_url):
    """Fetch a Bilibili video page, return [title, audio_url, video_url]."""
    response = get_response(html_url=html_url)
    # Title from the <h1> header; playinfo JSON embedded in a <script> tag.
    title = re.findall('<h1 title="(.*?)"', response.text)[0]
    html_data = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)[0]
    json_data = json.loads(html_data)
    # First entries are the default DASH audio/video streams.
    audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
    video_url = json_data['data']['dash']['video'][0]['baseUrl']
    video_info = [title, audio_url, video_url]
    return video_info


def save(title, audio_url, video_url):
    """Download the audio/video streams to <title>.mp3 and <title>.mp4."""
    audio_content = get_response(html_url=audio_url).content
    video_content = get_response(html_url=video_url).content
    with open(title + '.mp3', mode='wb') as f:
        f.write(audio_content)
    with open(title + '.mp4', mode='wb') as f:
        f.write(video_content)
    print(title, '保存成功')


# Kept disabled, as in the original: merges the two streams with ffmpeg.
# def merge_data(vide_name):
#     print('视频开始合成', vide_name)
#     cmd = f"ffmpeg -i {vide_name}.mp4 -i {vide_name}.mp3 -c:a aac -strict experimental {vide_name}output.mp4"
#     subprocess.run(cmd, shell=True)
#     print('视频合成完毕', vide_name)


def main(bv_id):
    """Download one video given its BV id (audio and video saved separately)."""
    url = f'https://www.bilibili.com/video/{bv_id}'
    video_info = get_video_info(url)
    save(video_info[0], video_info[1], video_info[2])
    # merge_data(video_info[0])


keyword = input('请输入要下载的视频BV号:')
main(keyword)

5.抖音

import requests
import re


import os

# Target video page (hard-coded example video id).
url = 'https://www.douyin.com/video/7114220525978668303'

headers = {
    'cookie': 'douyin.com; ttcid=444dfe8e89ff4d99b0662076ad171c8775; ttwid=1%7CTnFKlrGi3lHjKf5bshFdP9Nwu_Vsiwo-TxvX9NISgj8%7C1642083887%7Cfbfa904ea2900763eb6ac090bdd09014d80840da1ca485bbfea193d5401b330e; MONITOR_WEB_ID=c27b9f4a-4917-4256-be93-e948308467e3; odin_tt=0510c3c4196f54b541a96ac64e8b585b3a755be85057da8a1f3fa068e3f7b75ca2de4345e2b856f1e7b3f9455d86079731fe7d07a9f10890f26855d3674858e1; passport_csrf_token=e0b90cb756903c370592bd558c2b0cf5; passport_csrf_token_default=e0b90cb756903c370592bd558c2b0cf5; s_v_web_id=verify_l268jj46_kc7yYkD6_YHWW_4x4v_9snI_EDE0zro77uRn; AVATAR_FULL_LOGIN_GUIDE_COUNT=1; AVATAR_FULL_LOGIN_GUIDE_TIMESTAMP=1650982839652; AVATAR_FULL_LOGIN_GUIDE_ITA_COUNT=1; AVATAR_FULL_LOGIN_GUIDE_ITA_TIMESTAMP=1650982839652; __ac_nonce=0627ba36600d465d72261; __ac_signature=_02B4Z6wo00f01zrB8EAAAIDCWcswKSh.eLM65fTAAKzW8srQpmSjmL6YX9IsdmMSL4a9EBuyJvIwNMROqFQktniG-Ur-UDPK6wHInC8QKqRYUmyGnflwUXLpKzPgVt2FtREyprGmCDAZLrIpcc; douyin.com; strategyABtestKey=1652269927.635; AB_LOGIN_GUIDE_TIMESTAMP=1652269927510; AVATAR_LOGIN_GUIDE_COUNT=1; _tea_utm_cache_2285=undefined; _tea_utm_cache_6383=undefined; _tea_utm_cache_1300=undefined; pwa_guide_count=3; IS_HIDE_THEME_CHANGE=1; THEME_STAY_TIME=299808; msToken=XGPVAVUHDi9iTEQRjdXuQ0YyetxhHq0c9EH1dLLpttanbCXsNSD0DRxwk9oUB0vZ7LB9vKd-ABi2kAkzj2lCn1x98lJ4iTFbf260RcLav-G4QkhNyq8qV9i3oEJRyc8t; home_can_add_dy_2_desktop=1; msToken=3ALqenaebbJHw7kQDiDG6aRAgVYm5WM1pVGqmyyidbGgYpWRWKn-wQ9tcjoxWrHvwcqoYAx3tQ4IGE1qixdq2ei_fPrirMeeI6HeooU3sGR2wyWQ2OAAh2RejVJOrmpA; tt_scid=Gp0q0JW0LDreTqplgpajIZNHCB0.p1NcVv0hhZBgaGDw4SFxkXGlXfKafiCVmWAWc537',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
}

response = requests.get(url=url, headers=headers)
response.encoding = response.apparent_encoding
#print(response.text)

# BUG fixed: the scraped article reduced the title pattern to '(.*?)', which
# always matches the empty string, so every download was saved as 'video\.mp4'.
# NOTE(review): the original pattern was lost; the <title> tag is the usual
# source in this tutorial family — confirm against the live page.
title = re.findall(r'<title.*?>(.*?)</title>', response.text)[0]
# Strip characters Windows forbids in file names; fall back if empty.
title = re.sub(r'[\\/:*?"<>|\n]', '_', title).strip() or 'douyin_video'

# Video source URL is percent-encoded in the page; decode and fix the scheme.
href = re.findall('src(.*?)vRLmmJ', response.text)[0]
video_url = requests.utils.unquote(href).replace('":"', 'https:')
#print(video_url)

# Ensure the output directory exists (the original open() failed without it).
os.makedirs('video', exist_ok=True)

video_content = requests.get(url=video_url).content
with open('video\\' + title + '.mp4', mode='wb') as f:
    f.write(video_content)
    print(title, video_url)

6.虎牙

import requests  
import re  
import os

# Ensure the output directory exists (the original open() failed without it).
os.makedirs('video', exist_ok=True)

for page in range(1, 5):
    print(f'正在采集第{page}页的数据内容')
    # Hot-video list page, one request per page.
    link = f'https://v.huya.com/g/all?set_id=51&order=hot&page={page}'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
    }
    response = requests.get(url=link, headers=headers)
    # NOTE(review): the original pattern was destroyed when this article was
    # scraped; only the hint "\d+ matches one or more digits" survived. The
    # list page links each video as /play/<digits>.html, so extract those
    # ids — confirm against the live page markup.
    video_id_list = re.findall(r'/play/(\d+)\.html', response.text)
    print(video_id_list)
    for video_id in video_id_list:
        # Per-video metadata endpoint; the JSON carries title and stream URLs.
        url = f'https://liveapi.huya.com/moment/getMomentContent?videoId={video_id}&uid=&_=1652789442223'
        response = requests.get(url=url, headers=headers)
        moment = response.json()['data']['moment']
        title = moment['title']
        video_url = moment['videoInfo']['definitions'][0]['url']
        # Strip characters Windows forbids in file names.
        title = re.sub(r'[\\/:*?"<>|\n]', '_', title)
        # Saving also needs a request: fetch the binary stream and write it.
        video_content = requests.get(url=video_url, headers=headers).content
        with open('video\\' + title + '.mp4', mode='wb') as f:
            f.write(video_content)
            print(title, '保存成功')
  • 文章存在借鉴,如有侵权请联系修改删除!

    你可能感兴趣的:(python,python,json)