This article is packed with practical takeaways.
The overall approach: from the page, locate the first m3u8 file, which points to a second m3u8. That second one is the important file: it records whether the stream is encrypted and lists every ts segment. Download all the ts files; if the stream is encrypted, fetch the KEY referenced by the second m3u8 and decrypt every single ts segment with it; finally, merge the decrypted ts files into one mp4. A rough picture of the second m3u8 is sketched below.
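A minimal sketch of what the second m3u8 typically contains (the key URI and segment names here are made-up placeholders, not taken from the actual site):

#EXTM3U
#EXT-X-VERSION:3
#EXT-X-TARGETDURATION:10
#EXT-X-KEY:METHOD=AES-128,URI="https://example.com/hls/key.key"
#EXTINF:10.0,
segment0000.ts
#EXTINF:10.0,
segment0001.ts
#EXT-X-ENDLIST

The #EXT-X-KEY line tells you the stream is AES-128 encrypted and where to fetch the key; every non-comment line is a ts segment to download.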
Installing the packages
For the coroutines:
pip install aiohttp
pip install aiofiles
(asyncio itself ships with the Python standard library, so there is no need to pip install it.)
For the decryption:
pip install pycryptodome
(pycryptodome already provides the Crypto namespace used below, so pip install crypto is unnecessary and can actually conflict with it.)
Note that on Windows, pycryptodome needs the Visual C++ 14 build tools installed. This one is a real pain; the easiest route is to install Visual Studio 2019 and add the C++ build tools from its installer.
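A quick way to check that the install worked (this just imports the Crypto namespace that pycryptodome provides; nothing here is specific to the site):

python -c "from Crypto.Cipher import AES; print(AES.block_size)"

If it prints 16, you're good to go.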
#!/usr/bin/python
import asyncio
import json
import os
import re
from urllib import parse
import aiofiles
import aiohttp
import requests
from Crypto.Cipher import AES
from bs4 import BeautifulSoup
def get_page_source(link):
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
    }
    response = requests.get(link, headers=head)
    response.encoding = 'utf-8'
    return response.text
def parse_page_source(html_content, video_files):
    # the player config is embedded in a <script> tag inside the #cms_player div
    soup = BeautifulSoup(html_content, 'lxml')
    script = soup.find('div', attrs={'id': 'cms_player'}).find('script').text
    # everything after the first '=' is a JSON blob; strip the trailing ';'
    m3_1_data = script.split('=', 1)[1].replace(';', '')
    m3_1_json_data = json.loads(m3_1_data)
    m3u8_one_url = m3_1_json_data['url']
    m3u8_one_page = get_page_source(m3u8_one_url)
    # the first m3u8's only non-comment line points at the second m3u8
    second_m3u8_url = ""
    for item in m3u8_one_page.split():
        if not item.startswith("#"):
            second_m3u8_url = parse.urljoin(m3u8_one_url, item)
            break
    second_m3u8 = get_page_source(second_m3u8_url)
    with open(video_files, mode="w") as f:
        f.write(second_m3u8)
def get_all_ts(file_path):
    ts_url_list = []
    with open(file_path, 'r', encoding='utf-8') as file_object:
        lines = file_object.readlines()
        for line in lines:
            new_line = line.strip()
            if not new_line.startswith('#'):
                ts_url_list.append(new_line)
    return ts_url_list
"""
协程下载
这个里面叫爬虫自省
去下载ts,for循环+try+协程 失败了再去重新下载,这个套路还是得好好学学一下
"""
async def aio_download_ts(save_path, ts_url, session):
    # self-healing download: up to 10 attempts per segment
    for c in range(10):
        try:
            async with session.get(ts_url) as resp:
                movie_content = await resp.content.read()
                # save the segment to disk
                async with aiofiles.open(save_path, mode="wb") as f:
                    await f.write(movie_content)
                print(save_path, "downloaded!")
                return ""
        except Exception:
            print(ts_url, "download failed, retrying...")
    # all 10 attempts failed; report this URL back to the caller
    return ts_url
async def aio_download(name, movie_file_list):
    tasks = []
    file_path = f"./{name}"
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    async with aiohttp.ClientSession() as session:
        for ts_url in movie_file_list:
            file_name = ts_url.split("/")[-1]
            movie_save_path = os.path.join(file_path, file_name)
            tasks.append(asyncio.create_task(aio_download_ts(movie_save_path, ts_url, session)))
        # kick off all download tasks concurrently
        result, pending = await asyncio.wait(tasks)
        # If any finished task returned a non-empty URL, that segment failed for good.
        # You could sleep for a while here and retry, or log the failures
        # to a file and come back to them later.
"""
从第二个m3u8获取key
"""
def get_key(second_m3u8):
    with open(second_m3u8, 'r', encoding='utf-8') as file_object:
        files = file_object.read()
    # pull the key URL out of the #EXT-X-KEY line
    obj = re.compile(r'URI="(?P<key_url>.*?)"')
    result = obj.search(files)
    key_url = result.group("key_url")
    return get_page_source(key_url)
"""
在服务器做协程解密,估计是性能不行,我就写了个下面的单线程的解密
"""
async def aio_decrypt_ts(file_path, new_file_path, key):
    async with aiofiles.open(file_path, mode="rb") as f1, \
            aiofiles.open(new_file_path, mode="wb") as f2:
        content = await f1.read()
        # a fresh cipher per file: AES-128-CBC, IV is 16 zero bytes (see the note below)
        aes = AES.new(key.encode("utf-8"), IV=b"0000000000000000", mode=AES.MODE_CBC)
        decrypt_content = aes.decrypt(content)
        await f2.write(decrypt_content)
    print(f"Decrypted, file saved to {new_file_path}")
async def aio_decrypt(name, decrypt_name, ts_url_list, key):
    file_path_dir = f"./{name}"
    new_file_path_dir = f"./{decrypt_name}"
    # new_file_path_dir = f"./{movie_name}/temp"
    if not os.path.exists(new_file_path_dir):
        os.makedirs(new_file_path_dir)
    tasks = []
    for ts_url in ts_url_list:
        ts_name = ts_url.split("/")[-1]
        file_path = os.path.join(file_path_dir, ts_name)
        new_file_path = os.path.join(new_file_path_dir, ts_name)
        tasks.append(asyncio.create_task(aio_decrypt_ts(file_path, new_file_path, key)))
    result = await asyncio.gather(*tasks)
    return result
"""
解密这个事
网站是aes-128进行加密的
aes = AES.new(key.encode("utf-8"), IV=b"0000000000000000", mode=AES.MODE_CBC)
IV 网站上没有,如果有的话,用网站的,所有我这里就16个0
网上还有个m3u8的包,这个我没研究过
"""
def decrypt(name, decrypt_name, ts_url_list, key):
    file_path_dir = f"{name}"
    new_file_path_dir = f"{decrypt_name}"
    if not os.path.exists(new_file_path_dir):
        os.makedirs(new_file_path_dir)
    for files in ts_url_list:
        ts_name = files.split("/")[-1]
        file_path = os.path.join(file_path_dir, ts_name)
        new_file_path = os.path.join(new_file_path_dir, ts_name)
        with open(file_path, 'rb') as f1:
            content = f1.read()
        with open(new_file_path, 'wb') as f2:
            aes = AES.new(key.encode("utf-8"), IV=b"0000000000000000", mode=AES.MODE_CBC)
            decrypt_content = aes.decrypt(content)
            f2.write(decrypt_content)
    print(f"Decryption done, files saved to {new_file_path_dir}")
"""
最后的合并,就是50个,50个的合并,形成一个大的ts,再把这个大的ts进行合并,调用了linux 系统的cat 多个 重定向到一个的 cat 1 2 3 > 4.mp4
windows是 copy /b 整个的.mp4
还有一个方法ffmpg 这个我没用过,
"""
def merge(name, new_file_path_dir, ts_url_list):
    # work inside the directory that holds the decrypted ts files
    cwd = os.getcwd()
    os.chdir(new_file_path_dir)
    # merge in batches of 50 (os.system waits for each cat to finish,
    # unlike os.popen, which can race ahead of the shell)
    part = 1
    last = []
    ts_list = []
    for i in range(len(ts_url_list)):
        ts_url = ts_url_list[i]
        ts_name = ts_url.split("/")[-1]
        ts_list.append(ts_name)
        if i != 0 and i % 50 == 0:
            # "cat {names} > movie.mp4"
            os.system(f"cat {' '.join(ts_list)} > big_movie_{part}.ts")
            last.append(f"big_movie_{part}.ts")
            part += 1
            ts_list = []
    # merge whatever is left over after the last full batch
    if ts_list:
        os.system(f"cat {' '.join(ts_list)} > big_movie_{part}.ts")
        last.append(f"big_movie_{part}.ts")
    os.system(f"cat {' '.join(last)} > {name}.mp4")
    os.chdir(cwd)
if __name__ == '__main__':
    movie_name = 'BlackWidow'
    movie_name_decrypt = 'New_BlackWidow'
    url = 'https://www.tudouyy.com/video/dongzuo/177708/2-1.html'
    second_m3u8_file = 'second_m3u8.txt'
    print(f'Starting... movie: {movie_name}, parsing URL: {url}')
    source = get_page_source(url)
    parse_page_source(source, second_m3u8_file)
    ts_list = get_all_ts(second_m3u8_file)
    asyncio.run(aio_download(movie_name, ts_list))
    print('All files downloaded... getting ready to decrypt...')
    movie_key = get_key(second_m3u8_file)
    # Coroutine decryption on the server kept erroring out even though the code
    # looks fine, so the plain decrypt() is used here instead.
    decrypt(movie_name, movie_name_decrypt, ts_list, movie_key)
    print(f'One last step: merge all the files into the movie: {movie_name}')
    merge(movie_name, movie_name_decrypt, ts_list)
    print('Time to watch the movie...')