西瓜视频破解js爬虫(截至2020-06-11可用)

声明:本文仅供学习交流使用,请勿用于商业用途或不正当行为!如果你实在要这样干,别找我背锅

主流程

# _*_ coding: utf-8 _*_
# @Author : lxx
# @time   : 2020/5/24 15:40
# @File   : xg_vedio_download.py
# Software: PyCharm

import requests
import time
import pymongo
import json
from xg_download.code.download_url import get_download_url
from xg_download.code.save_video import download_video

# Request headers passed to the requests.get calls in this module:
# a desktop Chrome user-agent plus a referer pointing at the ixigua
# "keji" channel page.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/70.0.3538.110 Safari/537.36",
    'referer': 'https://www.ixigua.com/channel/keji/'
}


def get_api_urls(crawl_list_pages):
    """
    Build the feed (list page) API URLs.

    Each URL carries a ``maxTime`` query value equal to the current Unix
    time (in whole seconds) plus 70 for every page index. The signature
    and channel id are hard-coded in the URL template.

    :param crawl_list_pages: number of list pages to crawl
    :return: list of feed API URLs, one per page
    """
    url_template = (
        'https://www.ixigua.com/api/feedv2/feedById?_signature=jmy8FAAgEAliUMcmRZJ1d45svAAANCt'
        '&channelId=94349546885&count=12&maxTime=%d&request_from=702&queryCount=1'
    )
    return [
        url_template % (int(time.time()) + page_index * 70)
        for page_index in range(crawl_list_pages)
    ]


def parse_detail(api_list_urls):
    """
    Fetch each feed API URL, save every video's metadata to MongoDB and
    collect the data needed to download the videos.

    The MongoDB connection (127.0.0.1:27017, database ``xigua``, collection
    ``data``) is opened for the duration of the call and always closed
    afterwards, even when a request or a JSON lookup fails.

    :param api_list_urls: iterable of feed API URLs
    :return: list of dicts ``{'title': ..., 'group_id': ...}``, one per video
    :raises requests.RequestException: if a feed request fails or times out
    :raises KeyError: if a response lacks one of the expected fields
    """
    client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
    try:
        collection = client.xigua['data']

        download_msgs = []
        for url in api_list_urls:
            # A timeout keeps one stalled connection from hanging the whole crawl.
            response_text = requests.get(url, headers=headers, timeout=60).text
            detail_json = json.loads(response_text)
            data_list = detail_json['data']['channelFeed']['Data']
            for video_msg in data_list:
                video_data = video_msg['data']
                user_info = video_data['user_info']

                title = video_data['title']  # video title
                play_num = video_data['playNum']  # play count
                author = user_info['name']  # author name
                author_id = user_info['user_id']  # author id
                author_url = 'https://www.ixigua.com/home/' + str(author_id)  # author home page
                author_desc = user_info['description']  # author description
                video_key = video_msg['key']
                video_url = 'https://www.ixigua.com/embed?group_id=%s' % video_key

                # NOTE(review): the key 'play_name' stores the play count; it looks
                # like a typo for 'play_num' but is kept so stored documents keep
                # the same field name.
                information = {'title': title, 'play_name': play_num, 'author': author, 'author_id': author_id,
                               'author_url': author_url,
                               'author_desc': author_desc, 'video_key': video_key, 'video_url': video_url}
                try:
                    collection.insert_one(information)
                except Exception as e:
                    # Best effort: a failed insert must not stop the crawl.
                    print('%s数据保存失败!原因:%s' % (video_url, e))
                download_msgs.append({'title': title, 'group_id': video_key})
    finally:
        # Release the connection pool; the original never closed the client.
        client.close()
    return download_msgs


def video_download_params(download_msgs):
    """
    Resolve the download URL of every video and attach it to its entry.

    For each entry the "brief details" API is queried with the entry's
    ``group_id`` to obtain the video id (``vid``), which is then passed to
    ``get_download_url``. The result is stored under ``'download_url'``.

    :param download_msgs: list of dicts ``{'title': ..., 'group_id': ...}``;
        the dicts are updated in place
    :return: the same list, each dict now also holding ``'download_url'``
    :raises requests.RequestException: if a details request fails or times out
    :raises KeyError: if a response lacks ``data.vid``
    """
    for msg in download_msgs:
        # e.g. https://www.ixigua.com/api/public/videov2/brief/details?group_id=6808039640256217614
        details_url = 'https://www.ixigua.com/api/public/videov2/brief/details?group_id=%s' % msg['group_id']
        # A timeout keeps one stalled connection from blocking all remaining videos.
        api_response = requests.get(details_url, headers=headers, timeout=60).json()
        video_id = api_response['data']['vid']
        msg['download_url'] = get_download_url(video_id)
    return download_msgs


def spider():
    """
    Interactive entry point: ask how many list pages to crawl, then run the
    whole pipeline (build feed URLs, parse and store metadata, resolve
    download URLs, download the videos) and print the elapsed time.

    Only ``ValueError`` (non-numeric input) triggers a re-prompt. Other
    errors such as ``EOFError`` on a closed stdin propagate; catching them
    would make the prompt loop spin forever.
    """
    while True:
        try:
            crawl_list_pages = int(input('请输入你想要获取多少页视频数据(注:每页12条数据):'))
        except ValueError as e:
            print('输入有误,请重新输入', e)
            continue
        if crawl_list_pages > 0:
            break
        # Tell the user why a non-positive number is rejected instead of
        # silently repeating the prompt.
        print('输入有误,请重新输入', crawl_list_pages)

    start = time.time()
    list_api_urls = get_api_urls(crawl_list_pages)
    download_msgs = parse_detail(list_api_urls)
    download_params = video_download_params(download_msgs)
    print(download_params)
    download_video(download_params)
    end = time.time()
    print('总用时:%s秒' % (int(end - start)))


# Start the interactive crawler only when this file is executed as a script.
if __name__ == '__main__':
    spider()

破解下载视频的url

# _*_ coding: utf-8 _*_
# @Author : lxx
# @time   : 2020/5/25 16:52
# @File   : download_url.py
# Software: PyCharm

import requests
import random
from zlib import crc32
from base64 import b64decode


# Browser-like User-Agent header sent by get_video_url.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}


def get_video_url_api(video_id):
    """
    Build the complete video-info API URL for a video id.

    The path and query string (containing a random ``r`` value) are
    checksummed with CRC32, and the checksum is appended as the ``s``
    parameter.

    :param video_id: video id to look up
    :return: full API URL on ``ib.365yg.com``
    """
    # str(random.random()) with its first two characters dropped
    # (normally the leading "0.").
    nonce = str(random.random())[2:]
    path_and_query = f'/video/urls/v/1/toutiao/mp4/{video_id}?r={nonce}'
    checksum = crc32(path_and_query.encode())
    return f'https://ib.365yg.com{path_and_query}&s={checksum}'


def get_video_url(url):
    """
    Fetch the video-info API and return the decoded download URL.

    Reads ``data.video_list.video_1.main_url`` from the JSON response and
    base64-decodes it.

    :param url: video-info API URL
    :return: decoded download URL as a string
    :raises requests.RequestException: if the request fails or times out
    :raises KeyError: if the response lacks the expected fields
    """
    # A timeout keeps a stalled connection from blocking the caller forever.
    resp = requests.get(url, headers=headers, timeout=60)
    encoded_url = resp.json()['data']['video_list']['video_1']['main_url']
    return b64decode(encoded_url.encode()).decode()


def get_download_url(video_id):
    """
    Resolve a video id (vid) to its download URL.

    :param video_id: video id to resolve
    :return: download URL returned by ``get_video_url``
    """
    return get_video_url(get_video_url_api(video_id))


if __name__ == '__main__':
    # Manual check when run as a script: decode and print the download URL
    # obtained from a hard-coded GetPlayInfo API URL (including its ptoken).
    # The commented lines below show the alternative of resolving by video id.
    # video_id = 'v02004da0000bptg5da6tgqbogvp0f1g'
    # get_download_url(video_id)
    url = 'https://vas.snssdk.com/video/openapi/v1/?aid=1768&barragemask=true&action=GetPlayInfo&video_id=v020046b0000bqd14qppjc2kj6fl3ab0&nobase64=false&ptoken=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE1OTA0NTU0ODEsInZlciI6InYxIiwiYWsiOiJjZmMwNjdiYjM5ZmVmZjU5MmFmODIwODViNDJlNmRjMyIsInN1YiI6InBnY19ub3JtYWwifQ.MDBPTGwBTEUAzsd8G1jLx6dsQbsNdk1NPjqtrViPeFE&vfrom=xgplayer'
    print(get_video_url(url))

# 'https://v3-tt.ixigua.com/977b803f3207b119e6b275c48ea0d95b/5ecba387/video/tos/hxsy/tos-hxsy-ve-0004/22bce6c5d543435d92306bb04b14ebd2/?a=1768&br=5238&bt=1746&cr=0&cs=0&dr=0&ds=3&er=&l=2020052517505801001708707922384282&lr=&mime_type=video%2Fmp4&qs=0&rc=amg6bGdtaDtybjMzNjczM0ApNmk2NmczZmVkN2lnPDU1O2dgNTZkZmVxYGhfLS1fLTBzczAzLl5jMDEuX2MwXjAtXzU6Yw%3D%3D&vl=&vr='

异步(同步)下载视频

# _*_ coding: utf-8 _*_
# @Author : lxx
# @time   : 2020/5/25 19:37
# @File   : save_video.py
# Software: PyCharm

import requests
from tqdm import tqdm
import os
import aiohttp
import asyncio
import re


def download_video(download_params):
    """
    Download all videos concurrently to the current directory.

    Each entry is saved as ``<title>.mp4``. A private event loop is created
    and closed per call, so the function can be called repeatedly (the
    original closed the shared default loop, breaking any second call).
    A failure of one download is printed with its entry and does not stop
    the others.

    :param download_params: list of dicts with ``'download_url'`` and ``'title'``
    :return: None
    """
    if not download_params:
        # Nothing to do; avoids creating a loop just to wait on nothing.
        return

    async def _download_all():
        # return_exceptions=True lets every download finish and hands the
        # errors back instead of losing them inside unfinished tasks.
        return await asyncio.gather(
            *(
                async_download_from_url(param['download_url'], f"{param['title']}.mp4")
                for param in download_params
            ),
            return_exceptions=True,
        )

    loop = asyncio.new_event_loop()
    try:
        results = loop.run_until_complete(_download_all())
        for param, result in zip(download_params, results):
            if isinstance(result, BaseException):
                print(result, param)
    except Exception as e:
        print(e, download_params)
    finally:
        try:
            loop.run_until_complete(loop.shutdown_asyncgens())
        finally:
            loop.close()


async def fetch(session, url, dst, p_bar=None, headers=None):
    """
    Perform a GET request on ``session``.

    Without ``headers`` the response object is returned as soon as its
    context is left, and nothing is written. With ``headers`` the body is
    read in 1024-byte chunks and appended to ``dst``.

    :param session: object whose ``get`` returns an async context manager
        yielding the response (e.g. an aiohttp ClientSession)
    :param url: URL to request
    :param dst: path of the file to append to (used only with ``headers``)
    :param p_bar: optional progress bar offering ``update(n)`` and ``close()``
    :param headers: request headers; a truthy value selects download mode
    :return: the response object when ``headers`` is falsy, otherwise None
    """
    if not headers:
        async with session.get(url) as req:
            return req

    async with session.get(url, headers=headers) as req:
        try:
            with open(dst, 'ab') as f:
                while True:
                    chunk = await req.content.read(1024)
                    if not chunk:
                        break
                    f.write(chunk)
                    if p_bar is not None:
                        # Advance by the bytes actually received; chunks may
                        # be shorter than the requested 1024.
                        p_bar.update(len(chunk))
        finally:
            # Close the bar even when the transfer fails part-way.
            if p_bar is not None:
                p_bar.close()


async def async_download_from_url(url, dst):
    """
    Download a video asynchronously, resuming a partially written file.

    A first request reads ``content-length``; if the local file is already
    at least that large nothing is downloaded. Otherwise the remaining
    bytes are requested with a ``Range`` header and appended to the file.

    :param url: download URL
    :param dst: target file name; characters invalid in file names are
        replaced with ``-``
    :return: total size of the video in bytes
    :raises KeyError: if the response has no ``content-length`` header
    """
    dst = re.sub(r'[\/:*?"<>|「」]', '-', dst)  # replace characters not allowed in file names
    async with aiohttp.ClientSession() as session:
        probe = await fetch(session, url, dst)

        file_size = int(probe.headers['content-length'])
        print(f"{dst}的视频总长度:{file_size}")
        first_byte = os.path.getsize(dst) if os.path.exists(dst) else 0
        if first_byte >= file_size:
            return file_size
        # The end of a byte range is inclusive, so the last valid offset is
        # file_size - 1.
        header = {"Range": f"bytes={first_byte}-{file_size - 1}"}
        pbar = tqdm(
            total=file_size, initial=first_byte,
            unit='B', unit_scale=True, desc=dst)
        await fetch(session, url, dst, p_bar=pbar, headers=header)
    # Same return value as the "already complete" path and as the
    # synchronous download_from_url.
    return file_size


def download_from_url(url, dst):
    """
    Download a video synchronously, resuming a partially written file.

    A first request reads ``content-length``; if the local file is already
    at least that large nothing is downloaded. Otherwise the remaining
    bytes are requested with a ``Range`` header and appended to the file.

    :param url: download URL
    :param dst: target file name; characters invalid in file names are
        replaced with ``-``
    :return: total size of the video in bytes
    :raises KeyError: if the response has no ``content-length`` header
    :raises requests.RequestException: if a request fails or times out
    """
    dst = re.sub(r'[\/:*?"<>|「」]', '-', dst)  # replace characters not allowed in file names

    # Probe request: only the headers are needed. The streamed response is
    # closed right away so its connection is not leaked.
    with requests.get(url, stream=True, timeout=60) as response:
        file_size = int(response.headers['content-length'])

    first_byte = os.path.getsize(dst) if os.path.exists(dst) else 0
    if first_byte >= file_size:
        return file_size

    # The end of a byte range is inclusive, so the last valid offset is
    # file_size - 1.
    header = {"Range": f"bytes={first_byte}-{file_size - 1}"}
    # NOTE(review): if the server ignores Range and sends the whole body,
    # appending would corrupt a partial file; this is not checked here.
    with requests.get(url, headers=header, timeout=60, stream=True) as req, \
            open(dst, 'ab') as f, \
            tqdm(total=file_size, initial=first_byte,
                 unit='B', unit_scale=True, desc=dst) as pbar:
        for chunk in req.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
                # Advance by the bytes actually received; the last chunk is
                # usually shorter than 1024.
                pbar.update(len(chunk))
    return file_size


# if __name__ == '__main__':
#     # 异步方式下载
#     url = "http://v11-tt.ixigua.com/7da2b219bc734de0f0d04706a9629b61/5c77ed4b/video/m" \
#           "/220d4f4e99b7bfd49efb110892d892bea9011612eb3100006b7bebf69d81/?rc" \
#           "=am12NDw4dGlqajMzNzYzM0ApQHRAbzU6Ojw8MzQzMzU4NTUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QHFubHBfZDJrbV8tLTYxL3NzLW8jbyMxLTEtLzEtLjMvLTUvNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D "
#     task = [asyncio.ensure_future(async_download_from_url(url, f"{i}.mp4")) for i in range(1, 12)]
#     loop = asyncio.get_event_loop()
#     loop.run_until_complete(asyncio.wait(task))
#     loop.close()

你可能感兴趣的:(爬虫,应用,python,爬虫,js逆向)