20231110_python练习_b站视频爬取音频组合

修改为批量下载:按搜索关键词抓取B站视频及其音频,并合并为带音频的 mp4。运行前请根据喜好设定搜索关键词 text_mw。

import json
import os
import re
import time
from urllib.parse import quote

import requests
from moviepy.editor import *

# A lazy group needs a closing delimiter: a bare r'(.*?)' can only ever match
# the empty string, so both patterns are anchored on the closing tag.
_TITLE_RE = re.compile(r'<title[^>]*>(.*?)</title>', re.S)
_PLAYINFO_RE = re.compile(r'window\.__playinfo__=(.*?)</script>', re.S)

# Characters replaced/removed so the page title can be used as a file name.
# The first five entries reproduce the original replacements; the rest cover
# characters that are commonly rejected in file names.
_TITLE_TRANSLATION = str.maketrans({
    '、': '_', ',': '_', '/': '_', ' ': '_', '|': None,
    '\\': '_', ':': '_', '*': '_', '?': '_', '"': '_',
    '<': '_', '>': '_', '\r': '_', '\n': '_', '\t': '_',
})


def _download(url, header, path):
    """Stream the resource at *url* into the file *path*."""
    with requests.get(url=url, headers=header, stream=True, timeout=30) as resp:
        resp.raise_for_status()
        with open(path, mode='wb') as f:
            # Write in 1 MiB chunks so a large video is never fully in memory.
            for chunk in resp.iter_content(chunk_size=1 << 20):
                f.write(chunk)


def requests_url(url, header):
    """Download one video page's video and audio streams and merge them.

    The page at *url* is fetched, its ``<title>`` is turned into a file-name
    stem, and the JSON assigned to ``window.__playinfo__`` is parsed to find
    the first video and first audio ``baseUrl``. Both streams are saved to
    temporary ``<title>.mp4`` / ``<title>.mp3`` files, combined with moviepy
    into ``<title>(含音频).mp4`` and the temporary files are then deleted.

    Args:
        url: Address of the video page.
        header: Request headers sent with every HTTP request.

    Returns:
        A ``(title, status)`` tuple; the status is the fixed string
        ``'爬取完成'``.

    Raises:
        ValueError: If the page has no ``<title>`` or no
            ``window.__playinfo__`` script (also covers invalid JSON).
        requests.RequestException: On network errors, timeouts or non-2xx
            responses.
        KeyError, IndexError: If the play-info JSON lacks the expected
            ``data.dash.video/audio`` entries.
    """
    resp = requests.get(url=url, headers=header, timeout=30)
    resp.raise_for_status()
    page = resp.text

    title_match = _TITLE_RE.search(page)
    if title_match is None:
        raise ValueError(f'no <title> found in {url}')
    biaoti = title_match.group(1).translate(_TITLE_TRANSLATION)
    print('html_biaoti:', biaoti)

    playinfo_match = _PLAYINFO_RE.search(page)
    if playinfo_match is None:
        raise ValueError(f'no window.__playinfo__ found in {url}')
    json_data = json.loads(playinfo_match.group(1))

    # The dash section lists video and audio streams separately; the first
    # entry of each list is used.
    dash = json_data['data']['dash']
    video_url = dash['video'][0]['baseUrl']
    audio_url = dash['audio'][0]['baseUrl']

    video_path = biaoti + '.mp4'
    audio_path = biaoti + '.mp3'
    try:
        _download(video_url, header, video_path)
        _download(audio_url, header, audio_path)

        # Close both clips explicitly so their readers release the temporary
        # files before those files are deleted below.
        audio = AudioFileClip(audio_path)
        try:
            video = VideoFileClip(video_path)
            try:
                video.set_audio(audio).write_videofile(f"{biaoti}(含音频).mp4")
            finally:
                video.close()
        finally:
            audio.close()
    finally:
        # Remove the intermediate files even when a step above failed.
        for path in (video_path, audio_path):
            if os.path.exists(path):
                os.remove(path)
    return biaoti, '爬取完成'

# Search keyword: change this to choose which videos are downloaded.
text_mw = '老高和小茉'
# Percent-encode the keyword as UTF-8 for use in the query string. quote()
# also handles ASCII letters, spaces and reserved characters correctly, which
# a repr()-based conversion does not.
text_mw = quote(text_mw, safe='')
text_url = 'https://search.bilibili.com/all?keyword=' + text_mw
print(text_url)
# Request headers
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.43",
    "Referer": "https://www.bilibili.com/",  # anti-hotlinking referer
}
resp = requests.get(url=text_url, headers=header, timeout=30)
resp.raise_for_status()

# Collect every link of the form href="//www.bilibili.com.../" from the
# search result page (findall returns a list of the captured paths).
obj = re.compile(r'href="//(www\.bilibili\.com.*?)/" ', re.S)
html_list = obj.findall(resp.text)
# Drop duplicates while keeping first-occurrence order.
html_list2 = list(dict.fromkeys(html_list))

for url in html_list2:
    url = 'https://' + url + '/'
    print(url)
    try:
        biaoti, requests_status = requests_url(url, header)
    except (requests.RequestException, OSError, ValueError, KeyError, IndexError) as exc:
        # One page that cannot be parsed or downloaded should not abort the
        # rest of the batch; report it and move on.
        print(url, '爬取失败', exc)
    else:
        print(url, biaoti, requests_status)
    # Pause between pages to avoid hammering the site.
    time.sleep(2)


你可能感兴趣的:(爬虫练习,python,音视频,开发语言)