Python多进程爬取正常ts视频文件-改进版(By初世小白)

背景:下载普通(未经伪装)的ts文件时,建议使用以下代码;如果遇到带伪png文件头的ts文件,请到本人主页查看第一篇文章。

废话不多说,上代码!

# Haul Lee 始于2022年10月16日 星期日
import os.path
import random
from multiprocessing import Pool
import pandas as pd
import requests


def main():
    """Download every ts segment of a hard-coded m3u8 playlist.

    Steps:
      1. Make sure the output directory exists.
      2. Pick one proxy from the 'http' column of ``ip_pool.csv``.
      3. Fetch the playlist and collect the segment URLs via ``get_ts``.
      4. Download the segments in parallel worker processes, one
         ``get_video_data`` call per segment.

    Raises:
        ValueError: if ``ip_pool.csv`` contains no rows.
    """
    # exist_ok=True removes the check-then-create race of exists() + mkdir().
    os.makedirs('./银河护卫队_圣诞特别行动', exist_ok=True)

    # URL of the m3u8 playlist (can be captured with a browser extension).
    # To verify it, open the URL in a browser: it downloads an m3u8 file that
    # can be renamed to .txt to check that it lists ts segments.
    url = 'https://new.qqaku.com/20221125/9HstxazX/1100kb/hls/playlist_up.m3u8'
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'channelid=0; sid=1670315916352926; _ga=GA1.2.1059162088.1670315989; _gid=GA1.2.845845072.1670315989',
        'DNT': '1',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.62',
        'sec-ch-ua': '"Microsoft Edge";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    df = pd.read_csv('ip_pool.csv')
    if df.empty:
        raise ValueError('ip_pool.csv contains no proxies')
    # The original code never picks rows 0 and 1; that is preserved whenever
    # the pool is large enough (reason not visible here - TODO confirm).
    # Clamping the lower bound keeps randint() from raising on a CSV with
    # fewer than three rows.
    last_row = len(df) - 1
    row = random.randint(min(2, last_row), last_row)
    ip = df.iloc[row]['http']
    # NOTE(review): requests selects a proxy by URL scheme and this mapping
    # only has an 'http' entry while `url` is https, so the proxy is
    # presumably not used for the playlist request - confirm the intent.
    proxy = {
        'http': ip
    }

    url_ts_list = get_ts(url, header, proxy)
    if not url_ts_list:
        return  # nothing to download; do not spawn any worker process

    # Process pool (not a thread pool). At most 50 workers, never more than
    # there are segments to fetch.
    with Pool(min(50, len(url_ts_list))) as pool:
        pool.map(get_video_data, url_ts_list)  # blocks until every segment is processed
        pool.close()  # no further tasks will be submitted
        pool.join()  # wait for the workers to exit
    # Leaving the `with` block terminates the pool, so workers are also
    # cleaned up when pool.map() raises.


def get_ts(url, header, proxy):
    """Fetch an m3u8 playlist and return its ts segments as download tasks.

    The raw playlist is also saved to ``./m3u8文件`` so it can be inspected.

    Args:
        url: URL of the m3u8 playlist.
        header: HTTP request headers passed to ``requests.get``.
        proxy: proxies mapping passed to ``requests.get``.

    Returns:
        A list of dicts, one per segment in playlist order, each with
        ``'video_name'`` (``'0001.ts'``, ``'0002.ts'``, ...) and
        ``'video_url'`` (absolute segment URL).

    Raises:
        requests.RequestException: on network failure, timeout or a
            non-2xx response.
        UnicodeDecodeError: if the playlist is not valid UTF-8.
    """
    from urllib.parse import urljoin

    # A timeout keeps a dead server or proxy from blocking forever.
    response = requests.get(url=url, headers=header, proxies=proxy, timeout=(10, 60))
    # Fail loudly instead of parsing an HTML error page as a list of URLs.
    response.raise_for_status()
    data = response.content
    with open('./m3u8文件', 'wb') as fp:
        fp.write(data)

    # Parse the bytes already in memory instead of re-reading the file.
    # 'utf-8-sig' drops a leading BOM, which would otherwise hide the '#'
    # of the first tag line and make it look like a segment URL.
    text = data.decode('utf-8-sig')

    url_ts_list = []  # collected segment tasks
    for raw_line in text.splitlines():
        line = raw_line.strip()
        # Skip blank lines and playlist tags/comments (lines starting with '#').
        if not line or line.startswith('#'):
            continue
        url_ts_list.append({
            'video_name': '%04d' % (len(url_ts_list) + 1) + '.ts',
            # Relative segment URIs are resolved against the playlist URL;
            # absolute ones pass through urljoin unchanged.
            'video_url': urljoin(url, line),
        })
    return url_ts_list


def get_video_data(dic_ts):  # download one ts segment
    """Download a single ts segment and store it in the output directory.

    Called once per segment by the worker processes of the pool.

    Args:
        dic_ts: dict with ``'video_url'`` (segment URL) and ``'video_name'``
            (file name to write inside ``./银河护卫队_圣诞特别行动``).

    Raises:
        ValueError: if ``ip_pool.csv`` contains no rows.
        requests.RequestException: on network failure or timeout.
    """
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'channelid=0; sid=1670315916352926; _ga=GA1.2.1059162088.1670315989; _gid=GA1.2.845845072.1670315989',
        'DNT': '1',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.62',
        'sec-ch-ua': '"Microsoft Edge";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    # NOTE: the proxy list is re-read from disk for every segment, so each
    # download picks its own random proxy.
    df = pd.read_csv('ip_pool.csv')
    if df.empty:
        raise ValueError('ip_pool.csv contains no proxies')
    # Rows 0 and 1 are never picked, as in the original code, whenever the
    # pool is large enough (reason not visible here - TODO confirm); the
    # clamp keeps randint() from raising on a CSV with fewer than three rows.
    last_row = len(df) - 1
    row = random.randint(min(2, last_row), last_row)
    ip = df.iloc[row]['http']
    proxy = {
        'http': ip
    }

    url = dic_ts['video_url']  # URL of this segment
    # Without a timeout a single stalled connection would block pool.map() forever.
    response = requests.get(url=url, headers=header, proxies=proxy, timeout=(10, 60))
    if not response.ok:
        # Do not save an HTTP error page as a .ts file: it would silently
        # corrupt the merged video. Report the failure and skip the segment.
        print(dic_ts['video_name'], '下载失败! HTTP状态码:', response.status_code)
        return

    with open(os.path.join('./银河护卫队_圣诞特别行动', dic_ts['video_name']), 'wb') as wf:
        wf.write(response.content)
    # Report success only after the file has been written and closed.
    print(dic_ts['video_name'], '下载完成!')


# Entry-point guard: run main() only when this file is executed as a script.
# main() creates a multiprocessing Pool; with the "spawn" start method (the
# default on Windows) worker processes re-import this module, and the guard
# keeps them from running main() again.
if __name__ == '__main__':
    main()

你可能感兴趣的:(python,开发语言)