这几天在某奇艺上看电影,这某奇艺真是坑,只能试看6分钟。于是去网上随便找了个解析网址,就可以观看了。但是这个网址没有提供下载接口,就打算写个爬虫把它下载下来。
视频解析网址:http://jx.618g.com/?url=
这里以《叶问4》(https://www.iqiyi.com/v_19rrc1di2c.html)为例
简单抓包,发现了两个m3u8文件:第一个里面是第二个的链接,第二个里面才是ts文件列表
而m3u8就在网页源代码中
算了,不想比比了,先上源代码
import requests
import os
import shutil
import re
from multiprocessing import Pool
from lxml import etree
def get_ts_list(url):
    """Resolve a parser-page URL into the movie title and its .ts segment URLs.

    The page at ``url`` is fetched; its ``<title>`` text becomes the movie
    name, and the ``src`` attribute of the element with id ``player`` is read.
    Everything after the first 17 characters of that attribute is used as the
    playlist URL.  If that playlist lists ``.ts`` entries they are used
    directly; otherwise its first ``m3u8`` entry is followed and the segments
    are taken from that second playlist.

    Args:
        url: Address of the parser page to scrape.

    Returns:
        A ``(movie_name, ts_urls)`` tuple, or ``None`` when the title, the
        player element, or the nested playlist entry cannot be found.
    """
    # Imported here because the module header does not import urllib.
    from urllib.parse import urljoin

    page = etree.HTML(requests.get(url).text)
    try:
        movie_name = page.xpath('//title/text()')[0]
        # Drop the 17-character prefix that precedes the playlist address.
        m3u8_url = page.xpath('//*[@id="player"]/@src')[0][17:]
        playlist_text = requests.get(m3u8_url).text
        playlist_url = m3u8_url
        segments = re.findall(r'(.*\.ts)', playlist_text)
        if not segments:
            # No segments here: this playlist only points at another one.
            # Reuse the text already downloaded instead of fetching it again.
            nested_entry = re.findall(r'(.*m3u8)', playlist_text)[0]
            playlist_url = urljoin(m3u8_url, nested_entry.strip())
            segments = re.findall(r'(.*ts)', requests.get(playlist_url).text)
        # Playlist entries may be relative; resolve each one against the
        # playlist that listed it (absolute entries pass through unchanged).
        ts_urls = [urljoin(playlist_url, entry.strip()) for entry in segments]
        return movie_name, ts_urls
    except IndexError:
        # A required element or playlist entry was missing.
        return None
def download(number, url, timeout=30):
    """Download one segment and store it as ``ts/<number>.ts``.

    The response is fetched and checked before the output file is opened, so
    a failed request does not leave an empty or bogus segment file behind.

    Args:
        number: Index of the segment; used as the file name inside ``ts``.
        url: Address of the segment to fetch.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot block a worker indefinitely.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Refuse error pages: writing them out would corrupt the merged video.
    response.raise_for_status()
    with open(r'ts/%d.ts' % number, 'wb') as f:
        f.write(response.content)
    print(number)
def pinjie(segment_count=None, output_name=None, ts_dir='ts'):
    """Concatenate the downloaded segments into ``<output_name>.mp4``.

    Segments ``0.ts`` .. ``<segment_count - 1>.ts`` are read from ``ts_dir``
    in numeric order and appended byte-for-byte to the output file, which is
    created in the current working directory.  The merge is done in Python
    rather than through a shell command, so it works for any number of
    segments, does not depend on the operating system, never changes the
    working directory, and never passes the movie title to a shell.

    Args:
        segment_count: Number of segments to merge.  Defaults to the length
            of the module-level ``ts_list``.
        output_name: Base name of the output file.  Defaults to the
            module-level ``movie_name``.
        ts_dir: Directory holding the numbered ``.ts`` files.
    """
    if segment_count is None:
        segment_count = len(ts_list)
    if output_name is None:
        output_name = movie_name
    if segment_count == 0:
        print("没有可拼接的分片")
        return

    # The name comes from a web page title: replace path separators and the
    # characters Windows forbids in file names so the output stays in the
    # current directory and can always be created.
    safe_name = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', str(output_name)).strip() or 'movie'

    missing = []
    with open('%s.mp4' % safe_name, 'wb') as merged:
        for index in range(segment_count):
            segment_path = os.path.join(ts_dir, '%d.ts' % index)
            try:
                with open(segment_path, 'rb') as segment:
                    shutil.copyfileobj(segment, merged)
            except FileNotFoundError:
                # A segment that failed to download is skipped, not fatal.
                missing.append(index)

    if missing:
        print("缺少 %d 个分片,已跳过" % len(missing))
    print("%s 下载完成" % output_name)
# Interactive entry point: ask for a movie link, download every segment in
# parallel, merge the segments, clean up, and offer to start over.
if __name__ == '__main__':
    print("""**********电影下载器**********
支持下载付费电影
支持某奇艺,某酷等各大平台
仅供学习,禁止用于商业用途""")
    URL = 'http://jx.618g.com/?url='
    while True:
        MOVIE = input("\n请输入你要下载的电影链接>>>")
        print("已经开始下载,请等待(可能耗时较长)")
        result = get_ts_list(URL + MOVIE)
        if result is None:
            # get_ts_list signals "nothing found" with None; report it
            # instead of crashing on result[0].
            print("解析失败,未找到可下载的视频")
        else:
            # pinjie() reads these two module-level names.
            movie_name, ts_list = result
            work_dir = os.getcwd()
            os.makedirs('ts', exist_ok=True)

            p = Pool(30)
            # enumerate() numbers the segments in O(n) and keeps duplicate
            # URLs distinct, unlike ts_list.index(item).
            jobs = [
                p.apply_async(download, args=(number, item))
                for number, item in enumerate(ts_list)
            ]
            p.close()
            p.join()

            # apply_async hides worker exceptions unless the results are
            # inspected, so count the failures explicitly.
            failed = sum(1 for job in jobs if not job.successful())
            if failed:
                print("有 %d 个分片下载失败,生成的视频可能不完整" % failed)

            pinjie()

            # Restore the working directory in case the merge step moved it,
            # so the cleanup below targets the right paths.
            os.chdir(work_dir)
            shutil.rmtree('ts', ignore_errors=True)
            # Remove each possible intermediate file independently so one
            # missing file does not stop the rest of the cleanup.
            for leftover in ('0.ts', '1000.ts', '2000.ts',
                             'new1.ts', 'new2.ts', 'new3.ts'):
                try:
                    os.remove(leftover)
                except OSError:
                    pass

        guess = input("继续下载(y/n)>>>").upper()
        if guess not in ('YES', 'Y'):
            break
思路:爬取网页源代码—>找到m3u8文件—>获得ts文件列表—>多进程下载ts文件—>拼接ts文件—>删除文件夹
os.rmdir删除文件夹只能删除空文件夹,这时候就要用更高级的shutil
用法
import shutil
shutil.rmtree("文件夹名")
使用copy指令进行拼接
copy /b 1.ts+2.ts+...+n.ts name.mp4
通过进程池创建进程加速下载ts文件
在使用前要先导入
from multiprocessing import Pool
实例
from multiprocessing import Pool
def task(invalte):
    """Print a line identifying this task by its number.

    Args:
        invalte: Integer identifying the task; formatted with ``%d``.
    """
    print("I am %d" % invalte)
# Demo entry point: run ten tasks on a pool of five worker processes.
if __name__ == '__main__':
    print("start")
    p = Pool(5)
    for i in range(10):
        # Queue the task without waiting for it to finish.
        p.apply_async(task, args=(i,))
    # No more tasks will be submitted; wait for the queued ones to finish.
    p.close()
    p.join()
    print("end")
运行结果(各任务由进程池并发执行,输出顺序可能与下面不同)
start
I am 0
I am 1
I am 2
I am 3
I am 4
I am 5
I am 6
I am 7
I am 8
I am 9
end
人生苦短,我用Python