Background: for ordinary .ts files, the code below is what I recommend. If you run into .ts segments disguised with a fake PNG header, see the first article on my homepage instead.
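A quick note on that case: a "fake PNG header" means the server prepends image-like bytes to every segment so it passes for a picture. The full fix is in that first article; purely as a generic sketch of my own (not from that article), one way to recover the real stream is to cut everything before the first plausible MPEG-TS packet boundary, since genuine TS data carries a 0x47 sync byte every 188 bytes:

def strip_fake_header(data: bytes, packet_size: int = 188) -> bytes:
    # Sketch only: scan for an offset where three consecutive packets
    # all start with the MPEG-TS sync byte 0x47, then drop the fake prefix.
    for off in range(len(data) - 2 * packet_size):
        if (data[off] == 0x47
                and data[off + packet_size] == 0x47
                and data[off + 2 * packet_size] == 0x47):
            return data[off:]
    return data  # no sync pattern found; return the data unchanged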
Enough preamble; on to the full script!
# Haul Lee, started Sunday, 2022-10-16
import os.path
import random
from multiprocessing import Pool

import pandas as pd
import requests

def main():
    if not os.path.exists('./银河护卫队_圣诞特别行动'):
        os.mkdir('./银河护卫队_圣诞特别行动')
    # m3u8 playlist URL (easy to grab with a browser extension)
    # To verify it, paste it into the address bar: an .m3u8 file downloads.
    # Rename it to .txt and check that it lists .ts URLs (a sample playlist
    # is shown after the script).
    url = 'https://new.qqaku.com/20221125/9HstxazX/1100kb/hls/playlist_up.m3u8'
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'channelid=0; sid=1670315916352926; _ga=GA1.2.1059162088.1670315989; _gid=GA1.2.845845072.1670315989',
        'DNT': '1',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.62',
        'sec-ch-ua': '"Microsoft Edge";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    # Pick a random proxy from a local pool file (see the note on ip_pool.csv after the script)
    df = pd.read_csv('ip_pool.csv')
    row = random.randint(2, len(df) - 1)  # skips the first two rows; needs at least three rows
    ip = df.iloc[row]['http']
    proxy = {
        'http': ip  # note: only http:// URLs are proxied; https requests go direct
    }
    url_ts_list = get_ts(url, header, proxy)
    pool = Pool(50)  # 50 worker processes; the whole download took about 30 seconds in my test
    pool.map(get_video_data, url_ts_list)  # fetch the segments in parallel with the process pool (downloads are I/O-bound)
    pool.close()  # stop accepting new tasks
    pool.join()   # block the main process until all workers have exited

def get_ts(url, header, proxy):
    # Download the playlist and save a local copy
    data = requests.get(url=url, headers=header, proxies=proxy).content
    with open('./m3u8文件', 'wb') as fp:
        fp.write(data)
    url_ts_list = []  # one dict per .ts segment
    i = 1
    # Parse the playlist and keep only the segment URLs
    with open('./m3u8文件', 'r', encoding='utf-8') as rf:
        for url_ts in rf:  # read the playlist line by line
            url_ts = url_ts.strip()  # drop surrounding whitespace and the trailing newline
            if not url_ts or url_ts.startswith('#'):  # skip blank lines and tag lines such as #EXTINF
                continue
            # Pack the segment URL and a zero-padded file name into a dict
            name_ts = '%04d' % i  # zero-padded so the files sort in playback order
            dic_ts = {
                'video_name': name_ts + '.ts',
                'video_url': url_ts
            }
            url_ts_list.append(dic_ts)  # collect the segment dicts
            i += 1
    return url_ts_list  # handed back to main()

def get_video_data(dic_ts):  # worker: download one .ts segment
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'channelid=0; sid=1670315916352926; _ga=GA1.2.1059162088.1670315989; _gid=GA1.2.845845072.1670315989',
        'DNT': '1',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.62',
        'sec-ch-ua': '"Microsoft Edge";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    # Each worker picks its own random proxy, same as in main()
    df = pd.read_csv('ip_pool.csv')
    row = random.randint(2, len(df) - 1)
    ip = df.iloc[row]['http']
    proxy = {
        'http': ip
    }
    url = dic_ts['video_url']  # the URL of this one segment
    data = requests.get(url=url, headers=header, proxies=proxy).content
    with open('./银河护卫队_圣诞特别行动/' + dic_ts['video_name'], 'wb') as wf:
        wf.write(data)
    print(dic_ts['video_name'], 'downloaded!')

if __name__ == '__main__':
    main()
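For reference, the playlist that get_ts() parses looks roughly like this (a generic HLS example, not the actual file): the lines starting with '#' are tags that the parser skips, and every other line is a segment URL.

#EXTM3U
#EXT-X-VERSION:3
#EXT-X-TARGETDURATION:10
#EXTINF:6.0,
https://example.com/hls/0000000.ts
#EXTINF:6.0,
https://example.com/hls/0000001.ts
#EXT-X-ENDLIST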
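One thing the script quietly assumes is a local ip_pool.csv holding your proxy pool, with a column named http (that is what df.iloc[row]['http'] reads). The exact file depends on where your proxies come from; the addresses below are placeholders, just to show a layout pandas would accept. Also note that random.randint(2, len(df) - 1) never picks the first two data rows and raises an error if the file has fewer than three rows.

http
http://10.0.0.1:8080
http://10.0.0.2:8080
http://10.0.0.3:8080
http://10.0.0.4:8080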
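After the pool finishes you are left with a folder of numbered segments (0001.ts, 0002.ts, ...), which is exactly why the names are zero-padded. This article stops at downloading, but since MPEG-TS segments can be joined by plain byte concatenation, a minimal sketch of the merge step (my own addition, with merged.ts as a placeholder output name) could look like:

import os

def merge_segments(folder='./银河护卫队_圣诞特别行动', out='merged.ts'):
    # Sketch only: concatenate the segments in sorted (playback) order.
    names = sorted(n for n in os.listdir(folder) if n.endswith('.ts'))
    with open(out, 'wb') as wf:
        for name in names:
            with open(os.path.join(folder, name), 'rb') as rf:
                wf.write(rf.read())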