Python爬虫之电影下载器

这几天在某奇艺上看电影,这某奇艺真是坑,只能试看6分钟。于是去网上随便找了个视频解析网址,通过它就可以完整观看了。但是这个网址没有提供下载接口,所以打算写个爬虫把电影下载下来。

视频解析网址:http://jx.618g.com/?url=

这里以《叶问4》(https://www.iqiyi.com/v_19rrc1di2c.html)为例

简单抓包后,发现了两个m3u8文件:第一个m3u8里只有指向第二个m3u8的链接,第二个m3u8里才列出了全部ts分片文件
在这里插入图片描述
Python爬虫之电影下载器_第1张图片
而m3u8就在网页源代码中

算了,不想比比了,先上源代码

import requests
import os
import shutil
import re
from multiprocessing import Pool
from lxml import etree


def get_ts_list(url):
    """Resolve a parser-page URL into the movie title and its .ts segment URLs.

    The page at *url* is downloaded and parsed as HTML. The title is taken
    from the first ``<title>`` text node, and the playlist address from the
    ``src`` attribute of the element whose id is ``player``. If that playlist
    lists no ``.ts`` entries, its first ``m3u8`` entry is followed once and
    the segments are read from the nested playlist instead.

    Every segment entry is resolved against the playlist it came from, so
    the returned list holds absolute URLs whether the playlist used relative
    or absolute entries.

    Args:
        url: Address of the parser page for the movie.

    Returns:
        A ``(movie_name, ts_list)`` tuple, or ``None`` when the page has no
        title, no ``player`` element, or the playlist contains neither
        segments nor a nested playlist link.
    """
    # Imported locally so this function does not depend on edits elsewhere
    # in the file.
    from urllib.parse import urljoin

    segment_pattern = re.compile(r'(.*\.ts)')
    page = etree.HTML(requests.get(url).text)
    try:
        movie_name = page.xpath('//title/text()')[0]
        # The first 17 characters of the src are dropped; presumably they are
        # the site's player-page prefix in front of the real playlist URL
        # -- TODO confirm against the live page.
        m3u8_url = page.xpath('//*[@id="player"]/@src')[0][17:]
    except IndexError:
        return None

    # Fetch the playlist once and reuse the text for both lookups below.
    playlist_url = m3u8_url
    playlist_text = requests.get(playlist_url).text
    segments = segment_pattern.findall(playlist_text)

    if not segments:
        # No segments here: treat this as a playlist that points to another.
        nested = re.findall(r'(.*m3u8)', playlist_text)
        if not nested:
            return None
        playlist_url = urljoin(playlist_url, nested[0])
        playlist_text = requests.get(playlist_url).text
        segments = segment_pattern.findall(playlist_text)

    # urljoin leaves absolute entries alone and resolves relative ones
    # against the playlist they were listed in.
    ts_list = [urljoin(playlist_url, entry) for entry in segments]
    return movie_name, ts_list


def download(number, url):
    """Save one video segment to disk and report its index.

    The file ``ts/<number>.ts`` is opened for binary writing, the body of a
    GET request to *url* is written into it, and *number* is printed once
    the file has been closed.

    Args:
        number: Integer position of the segment; used as the file name.
        url: Address the segment is fetched from.
    """
    target_path = r'ts/%d.ts' % number
    with open(target_path, 'wb') as segment_file:
        response = requests.get(url)
        segment_file.write(response.content)
    print(number)


def pinjie(segment_count=None, output_name=None, ts_dir='ts', output_dir='.'):
    """Concatenate the segments ``0.ts``, ``1.ts``, ... into one ``.mp4`` file.

    The segments are appended byte-for-byte in index order, which is what the
    shell command ``copy /b 0.ts+1.ts+... name.mp4`` does. Doing it in Python
    removes the limit on the number of segments, does not change the working
    directory, does not pass the movie title through a shell, and works on
    any operating system.

    Args:
        segment_count: Number of segments to merge. Defaults to the length of
            the module-level ``ts_list``.
        output_name: Title used for the output file and the completion
            message. Defaults to the module-level ``movie_name``.
        ts_dir: Directory that holds the numbered ``.ts`` files.
        output_dir: Directory the ``.mp4`` file is written to.

    Returns:
        The path of the merged file.
    """
    if segment_count is None:
        segment_count = len(ts_list)
    if output_name is None:
        output_name = movie_name

    # Page titles can contain characters that are not allowed in file names.
    safe_name = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', str(output_name)).strip()
    target = os.path.join(output_dir, (safe_name or 'movie') + '.mp4')

    missing = 0
    with open(target, 'wb') as merged:
        for number in range(segment_count):
            part = os.path.join(ts_dir, '%d.ts' % number)
            try:
                with open(part, 'rb') as segment:
                    shutil.copyfileobj(segment, merged)
            except FileNotFoundError:
                # A segment that was never downloaded is skipped and counted,
                # so the remaining segments are still merged.
                missing += 1

    if missing:
        print("有 %d 个分片缺失,已跳过" % missing)
    print("%s 下载完成" % output_name)
    return target


if __name__ == '__main__':
    # Interactive entry point: ask for a movie link, download every segment
    # in parallel, merge the segments, clean up, then ask whether to repeat.
    print("""**********电影下载器**********
支持下载付费电影
支持某奇艺,某酷等各大平台
仅供学习,禁止用于商业用途""")
    # The resolver prefix never changes, so it is set once before the loop.
    URL = 'http://jx.618g.com/?url='
    while True:
        MOVIE = input("\n请输入你要下载的电影链接>>>")
        print("已经开始下载,请等待(可能耗时较长)")
        result = get_ts_list(URL + MOVIE)
        if not result or not result[1]:
            # get_ts_list returns None when the page cannot be parsed;
            # report it instead of failing on result[0].
            print("解析失败,未找到可下载的视频分片")
        else:
            # pinjie() reads these two module-level names.
            movie_name, ts_list = result
            os.makedirs('ts', exist_ok=True)
            start_dir = os.getcwd()

            p = Pool(30)
            # enumerate gives each segment its own index even when the same
            # URL appears more than once in the list.
            pending = [
                p.apply_async(download, args=(number, item))
                for number, item in enumerate(ts_list)
            ]
            p.close()
            p.join()

            # apply_async keeps exceptions inside the result objects, so
            # failures have to be looked up explicitly.
            failed = sum(1 for job in pending if not job.successful())
            if failed:
                print("有 %d 个分片下载失败,视频可能不完整" % failed)

            pinjie()
            # The merge step may leave the process in another directory;
            # go back so the cleanup below targets the right paths.
            os.chdir(start_dir)

            shutil.rmtree('ts', ignore_errors=True)
            # Intermediate files that may sit next to the output; each one is
            # removed independently so a missing file does not skip the rest.
            for leftover in ('0.ts', '1000.ts', '2000.ts',
                             'new1.ts', 'new2.ts', 'new3.ts'):
                try:
                    os.remove(leftover)
                except OSError:
                    pass

        guess = input("继续下载(y/n)>>>").upper()
        if guess not in ('YES', 'Y'):
            # Same effect as exit(), without relying on the site module.
            raise SystemExit

思路:爬取网页源代码—>找到m3u8文件—>获得ts文件列表—>多进程下载ts文件—>拼接ts文件—>删除文件夹

删除文件夹

os.rmdir只能删除空文件夹;要删除里面还有文件的文件夹,就要用shutil模块的rmtree

用法

import shutil
shutil.rmtree("文件夹名")

拼接

使用copy指令进行拼接

copy /b 1.ts+2.ts+...+n.ts name.mp4

多进程

通过进程池创建进程加速下载ts文件

在使用前要先导入

from multiprocessing import Pool

实例

from multiprocessing import Pool
def task(invalte):
    """Print a one-line greeting that carries the given job number."""
    greeting = "I am %d" % invalte
    print(greeting)
if __name__ == '__main__':
    # Demo: hand ten jobs to a pool of five worker processes.
    print("start")
    pool = Pool(5)
    for job_number in range(10):
        # apply_async queues the call and returns without waiting for it.
        pool.apply_async(task, args=(job_number,))
    # No more jobs will be submitted; wait until every queued job is done.
    pool.close()
    pool.join()
    print("end")

运行结果

start
I am 0
I am 1
I am 2
I am 3
I am 4
I am 5
I am 6
I am 7
I am 8
I am 9
end

人生苦短,我用Python

你可能感兴趣的:(python)