Life is short, I use Python ----- scraping videos

Life is short, I learn Python!

The previous post was a small script for scraping images; today let's write a small script for scraping videos.

When I wrote my own video scraper, setting up the environment honestly took a long time. Following the various versions of code online, looking at this one and that one, many of them do work, but the library versions they use may not match yours, and then your environment simply cannot import the packages they rely on. This article uses Python 3.8, and the IDE is PyCharm.

Writing a crawler does take a bit of front-end background; knowing enough to put together a JSP page on your own is plenty. An earlier job of mine required writing both the front end and the back end; the front end was the now-outdated JSP, which at the time (and in second-tier cities) was still everywhere, while in Shanghai almost nobody used it anymore. Then comes the approach: every site has to be scraped differently. Based on what each request returns, find where the request is forwarded next, and you eventually reach the final file address (see the short sketch after the warning below). The site I scraped is one where I used to watch anime and movies; it seemed decent and did not have many ads, so I figured I would give it a try, heh.

Unauthorized scraping is against the law! Keep that in mind! It can get very serious~
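
To make the idea of "following the request to the final file address" concrete, here is a minimal sketch of how an HLS (.m3u8) video is laid out; the playlist URL is made up purely for illustration, the real address is dug out of the page by the full script below. An m3u8 playlist is plain text: lines starting with '#' are tags, and every other non-empty line is a media URI that points either to a second playlist or to the actual .ts segments.

import requests

# hypothetical URL, only to illustrate the format; the real one comes from the player markup
playlist_url = "http://example.com/video/index.m3u8"
text = requests.get(playlist_url).text
# drop the '#' tag lines (#EXTM3U, #EXTINF, ...); what is left are the media URIs,
# e.g. a second index.m3u8 or segment names like 0000.ts, resolved relative to the playlist URL
segments = [line for line in text.split('\n') if line and not line.startswith('#')]
print(segments)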

# -*- coding: UTF-8 -*-
import datetime
import os
from multiprocessing.pool import Pool

import requests
from bs4 import BeautifulSoup

"""
    这个是爬取 雅酷电影网站的demo,只能爬取视频链接地址是 .m3u8结尾的视频
"""
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

url_host = r"http://www.yakuhd.com/"

# Collect every downloaded segment under path, sorted by its numeric file name
def file_walker(path):
    file_list = []
    for root, dirs, files in os.walk(path):  # walk the whole segment directory
        for fn in files:
            file_list.append(os.path.join(root, fn))
    # sort by the numeric file name (0, 1, ..., 10) so the segments are merged in playback order;
    # plain alphabetical order would put 10 before 2 and scramble the video
    file_list.sort(key=lambda p: int(os.path.splitext(os.path.basename(p))[0]))
    print(file_list)
    return file_list


# Merge all segments in ts_path into a single file_name.mp4 under combine_path
def combine(ts_path, combine_path, file_name):
    file_list = file_walker(ts_path)
    os.makedirs(combine_path, exist_ok=True)  # make sure the output directory exists
    file_path = os.path.join(combine_path, file_name + '.mp4')
    # .ts segments are MPEG-TS chunks, so simply appending their bytes yields a playable file
    with open(file_path, 'wb+') as fw:
        for seg in file_list:
            with open(seg, 'rb') as fr:
                fw.write(fr.read())


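# Download a single .ts segment; item is [segment index, playlist base URL, segment file name, target directory]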
def down_load(item):
    print("正在执行任务:" + item[0])
    ts_url = item[1] + item[2]
    start = datetime.datetime.now().replace(microsecond=0)
    try:
        response = requests.get(ts_url, stream=True, verify=False)
    except Exception as e:
        print("异常请求:%s" % e.args)
        return
    ts_path = item[3] + '/' + str(item[0]) + '.mp4'
    with open(ts_path, "wb+") as file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)
    end = datetime.datetime.now().replace(microsecond=0)
    print("耗时:%s" % (end - start))
    print("执行任务结束:" + item[0])

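# Fetch the playlist page and return one "episode title=relative player URL" string per episode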
def get_pageurl():
    target_urls = []
    url = r"http://www.yakuhd.com/Playlist/7509"
    response = requests.get(url=url, headers=headers)
    if response.status_code != 200:
        return None
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    find_all = soup.find_all(class_='zhuanji_bottom_list')
    for item in find_all:
        target_urls.append(item.img.get('title') + '=' + item.a.get('href'))
    return target_urls

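# Handle one episode: locate the player's index.m3u8, follow it to the real segment playlist,
# download every .ts segment with a small process pool, then merge the pieces into one .mp4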
def download_episode(url):
    split = url.split("=")
    fileName = split[0]
    url = url_host + split[1]
    pool = Pool(4)
    response = requests.get(url=url, headers=headers)
    soup_html = BeautifulSoup(response.text, 'lxml')
    find_all = soup_html.find_all(attrs={'id': 'watchPlayer'})
    # the player element carries the first-level index.m3u8 address between single quotes
    index_m3u8_url = str(find_all).split("'")[1]
    response_m3u8 = requests.get(url=index_m3u8_url, headers=headers)
    # the third line of that playlist is the relative path of the real segment playlist
    st = str(response_m3u8.text).split('\n')[2]
    target_begin_url = index_m3u8_url.split("index.m3u8")[0]
    print("target_begin_url:", target_begin_url)
    target_complete_url = target_begin_url + st
    target_response = requests.get(url=target_complete_url, headers=headers)
    text_split_nums = target_response.text.split('\n')
    ts_begin_url = target_complete_url.split('index.m3u8')[0]
    dir_name = r'C:/Users/weiming.sun/Desktop/ts/' + fileName
    os.makedirs(dir_name, exist_ok=True)  # workers can only write segments into an existing directory
    paramsList = []
    count = 0
    for item in text_split_nums:
        if not item.endswith('.ts'):
            continue
        param = []
        param.append(str(count))
        param.append(ts_begin_url)
        param.append(item)
        param.append(dir_name)
        paramsList.append(param)
        count = count + 1
    print(paramsList)
    pool.map(down_load, paramsList)
    pool.close()  # stop accepting new tasks into the pool
    pool.join()   # wait until every segment download has finished
    combine(dir_name, r"C:/Users/weiming.sun/Desktop/movie/", fileName)

if __name__ == '__main__':
    target_urls = get_pageurl()
    for item in target_urls or []:  # get_pageurl() returns None when the page cannot be fetched
        download_episode(item)
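
After a run, every episode ends up as a directory of numbered segments under C:/Users/weiming.sun/Desktop/ts/ and as a single merged .mp4 under C:/Users/weiming.sun/Desktop/movie/; those two paths, the playlist URL, and the pool size of 4 processes are all specific to my setup, so adjust them before running.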


         
