声明:本文仅供学习交流使用,请勿用于商业用途或不正当行为!如果你实在要这样干,别找我背锅
# _*_ coding: utf-8 _*_
# @Author : lxx
# @time : 2020/5/24 15:40
# @File : xg_vedio_download.py
# Software: PyCharm
import requests
import time
import pymongo
import json
from xg_download.code.download_url import get_download_url
from xg_download.code.save_video import download_video
# Request headers shared by the crawler's calls to ixigua.com: a desktop
# Chrome user agent plus the channel page as referer.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.110 Safari/537.36'
    ),
    'referer': 'https://www.ixigua.com/channel/keji/',
}
def get_api_urls(crawl_list_pages):
    """
    Build the feed-API URLs for the requested number of list pages.

    The current Unix time is sampled once; page ``i`` gets ``maxTime`` equal to
    that timestamp plus ``i * 70``, so consecutive URLs always differ by
    exactly 70.

    :param crawl_list_pages: number of list pages to crawl
    :return: list of feed-API URLs, one per page (empty when the count is <= 0)
    """
    base_list_url = ('https://www.ixigua.com/api/feedv2/feedById?_signature=jmy8FAAgEAliUMcmRZJ1d45svAAANCt'
                     '&channelId=94349546885&count=12&maxTime=%d&request_from=702&queryCount=1')
    # Sample the clock once: re-reading it per page lets a second boundary
    # crossed mid-loop skew the spacing between pages.
    timestamp = int(time.time())
    return [base_list_url % (timestamp + i * 70) for i in range(crawl_list_pages)]
def parse_detail(api_list_urls):
    """
    Fetch every feed-API page, store each video's metadata in MongoDB and
    collect what is needed to download the videos.

    Each response is expected to be JSON with the video entries under
    ``data -> channelFeed -> Data``. One document per video is inserted into
    the ``data`` collection of the ``xigua`` database on the local MongoDB
    server; a failed insert is reported and skipped.

    :param api_list_urls: feed-API URLs to request
    :return: list of ``{'title': ..., 'group_id': ...}`` dicts, one per video
    """
    # Connect to the local MongoDB server.
    client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
    download_msgs = []
    try:
        collection = client.xigua['data']
        for url in api_list_urls:
            # A timeout keeps one stalled request from hanging the whole crawl.
            response = requests.get(url, headers=headers, timeout=30)
            detail_json = json.loads(response.text)
            for video_msg in detail_json['data']['channelFeed']['Data']:
                video_data = video_msg['data']
                user_info = video_data['user_info']
                title = video_data['title']  # video title
                author_id = user_info['user_id']
                video_key = video_msg['key']
                video_url = 'https://www.ixigua.com/embed?group_id=%s' % video_key
                information = {
                    'title': title,
                    # NOTE(review): the key is 'play_name' but the value is the play
                    # count; kept as-is so stored documents keep the same schema.
                    'play_name': video_data['playNum'],
                    'author': user_info['name'],
                    'author_id': author_id,
                    'author_url': 'https://www.ixigua.com/home/' + str(author_id),  # author home page
                    'author_desc': user_info['description'],
                    'video_key': video_key,
                    'video_url': video_url,
                }
                try:
                    collection.insert_one(information)
                except Exception as e:
                    # Best effort: a failed insert must not stop the crawl.
                    print('%s数据保存失败!原因:%s' % (video_url, e))
                download_msgs.append({'title': title, 'group_id': video_key})
    finally:
        # Release the MongoDB connection even when a request or parse step fails.
        client.close()
    return download_msgs
def video_download_params(download_msgs):
    """
    Resolve the download link of every video and attach it to its message.

    For each message the brief-details API is queried with the ``group_id``,
    the ``vid`` from the response is passed to ``get_download_url`` and the
    result is stored under ``'download_url'``. The dicts are updated in place.

    :param download_msgs: list of ``{'title': ..., 'group_id': ...}`` dicts
    :return: the same list, each dict extended with ``'download_url'``
    """
    for msg in download_msgs:
        # e.g. https://www.ixigua.com/api/public/videov2/brief/details?group_id=6808039640256217614
        url = 'https://www.ixigua.com/api/public/videov2/brief/details?group_id=%s' % msg['group_id']
        # A timeout keeps one stalled request from hanging the whole batch.
        api_response = requests.get(url, headers=headers, timeout=30).json()
        video_id = api_response['data']['vid']
        msg['download_url'] = get_download_url(video_id)
    return download_msgs
def spider():
    """
    Run the whole crawl interactively.

    Prompts until a positive page count is entered, then builds the list-page
    URLs, parses and stores the video metadata, resolves the download links,
    downloads the videos and prints the elapsed time in whole seconds.

    :return: None
    """
    while True:
        try:
            crawl_list_pages = int(input('请输入你想要获取多少页视频数据(注:每页12条数据):'))
        except ValueError as e:
            # Only a non-numeric entry is retried; EOFError from a closed stdin
            # must propagate instead of looping forever.
            print('输入有误,请重新输入', e)
            continue
        if crawl_list_pages > 0:
            break
        print('输入有误,请重新输入', '页数必须大于0')
    start = time.time()
    list_api_urls = get_api_urls(crawl_list_pages)
    download_msgs = parse_detail(list_api_urls)
    download_params = video_download_params(download_msgs)
    print(download_params)
    download_video(download_params)
    end = time.time()
    print('总用时:%s秒' % (int(end - start)))
if __name__ == "__main__":
    # Start the interactive crawl only when this file is run as a script.
    spider()
# _*_ coding: utf-8 _*_
# @Author : lxx
# @time : 2020/5/25 16:52
# @File : download_url.py
# Software: PyCharm
import requests
import random
from zlib import crc32
from base64 import b64decode
# Request headers used when querying the video-URL API: a desktop Chrome user agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    ),
}
def get_video_url_api(video_id):
    """
    Build the complete video-URL API address for a video id.

    The path carries an ``r`` value taken from ``str(random.random())`` with
    its first two characters dropped, and the URL ends with ``s``, the CRC32
    of that path including its query string.

    :param video_id: the video's ``vid``
    :return: the API URL on ib.365yg.com
    """
    nonce = str(random.random())[2:]
    signed_path = f'/video/urls/v/1/toutiao/mp4/{video_id}?r={nonce}'
    checksum = crc32(signed_path.encode())
    return f'https://ib.365yg.com{signed_path}&s={checksum}'
def get_video_url(url):
    """
    Fetch the video-URL API response and decode the download link from it.

    The link is read from ``data -> video_list -> video_1 -> main_url`` of the
    JSON response and is base64-decoded before being returned.

    :param url: API URL as built by ``get_video_url_api``
    :return: the decoded video download URL
    """
    # A timeout keeps a stalled API call from blocking the caller indefinitely.
    resp = requests.get(url, headers=headers, timeout=30)
    encoded_url = resp.json()['data']['video_list']['video_1']['main_url']
    return b64decode(encoded_url.encode()).decode()
def get_download_url(video_id):
    """
    Resolve the download URL of a video from its ``vid``.

    :param video_id: the video's ``vid``
    :return: the video download URL
    """
    return get_video_url(get_video_url_api(video_id))
if __name__ == "__main__":
    # Manual check: decode the download link from a sample play-info API URL.
    # Alternative entry point, starting from a vid instead:
    #   get_download_url('v02004da0000bptg5da6tgqbogvp0f1g')
    url = (
        'https://vas.snssdk.com/video/openapi/v1/?aid=1768&barragemask=true'
        '&action=GetPlayInfo&video_id=v020046b0000bqd14qppjc2kj6fl3ab0'
        '&nobase64=false'
        '&ptoken=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9'
        '.eyJleHAiOjE1OTA0NTU0ODEsInZlciI6InYxIiwiYWsiOiJjZmMwNjdiYjM5ZmVmZjU5MmFmODIwODViNDJlNmRjMyIsInN1YiI6InBnY19ub3JtYWwifQ'
        '.MDBPTGwBTEUAzsd8G1jLx6dsQbsNdk1NPjqtrViPeFE'
        '&vfrom=xgplayer'
    )
    print(get_video_url(url))
# 'https://v3-tt.ixigua.com/977b803f3207b119e6b275c48ea0d95b/5ecba387/video/tos/hxsy/tos-hxsy-ve-0004/22bce6c5d543435d92306bb04b14ebd2/?a=1768&br=5238&bt=1746&cr=0&cs=0&dr=0&ds=3&er=&l=2020052517505801001708707922384282&lr=&mime_type=video%2Fmp4&qs=0&rc=amg6bGdtaDtybjMzNjczM0ApNmk2NmczZmVkN2lnPDU1O2dgNTZkZmVxYGhfLS1fLTBzczAzLl5jMDEuX2MwXjAtXzU6Yw%3D%3D&vl=&vr='
# _*_ coding: utf-8 _*_
# @Author : lxx
# @time : 2020/5/25 19:37
# @File : save_video.py
# Software: PyCharm
import requests
from tqdm import tqdm
import os
import aiohttp
import asyncio
import re
def download_video(download_params):
    """
    Download all videos concurrently and save them locally.

    Each video is saved as ``<title>.mp4`` via ``async_download_from_url``.
    A failed download is reported on stdout together with its parameters and
    does not stop the others. An empty list is a no-op.

    :param download_params: list of dicts with ``'download_url'`` and ``'title'`` keys
    :return: None
    """
    if not download_params:
        return
    # Read the dict keys up front so a malformed entry fails before any coroutine exists.
    targets = [(param['download_url'], f"{param['title']}.mp4") for param in download_params]

    async def _download_all():
        # return_exceptions=True lets the remaining downloads finish when one fails.
        return await asyncio.gather(
            *(async_download_from_url(url, dst) for url, dst in targets),
            return_exceptions=True,
        )

    # Use a private loop: closing the process-wide default loop would make a
    # second call to this function fail with "Event loop is closed".
    loop = asyncio.new_event_loop()
    try:
        results = loop.run_until_complete(_download_all())
        for param, result in zip(download_params, results):
            if isinstance(result, BaseException):
                print(result, param)
    finally:
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()
async def fetch(session, url, dst, p_bar=None, headers=None):
    """
    Either probe *url* or stream its body into the file *dst*.

    Without *headers* a plain GET is issued and the response object is
    returned so the caller can read its headers. With *headers* the body is
    read in 1024-byte chunks and appended to *dst*; the optional progress bar
    is advanced by the number of bytes actually written and is always closed.

    :param session: session object providing ``get`` (e.g. ``aiohttp.ClientSession``)
    :param url: download URL
    :param dst: path of the file to append to
    :param p_bar: optional progress bar providing ``update`` and ``close``
    :param headers: request headers; their presence selects download mode
    :return: the response when probing, otherwise None
    """
    if not headers:
        async with session.get(url) as req:
            return req
    try:
        async with session.get(url, headers=headers) as req:
            with open(dst, 'ab') as f:
                while True:
                    chunk = await req.content.read(1024)
                    if not chunk:
                        break
                    f.write(chunk)
                    if p_bar is not None:
                        # The last chunk is usually shorter than 1024 bytes.
                        p_bar.update(len(chunk))
    finally:
        if p_bar is not None:
            p_bar.close()
async def async_download_from_url(url, dst):
    """
    Download one video asynchronously, resuming a partial file if present.

    The file name is sanitised first. A probe request supplies the total size
    from the ``content-length`` header (a missing header raises ``KeyError``).
    If the local file already has at least that many bytes nothing is
    downloaded; otherwise the missing part is requested with a ``Range``
    header and appended to the file.

    :param url: download URL
    :param dst: target file name (video title plus extension)
    :return: total size of the video in bytes
    """
    # Replace characters that are not allowed in file names; the backslash is
    # included so a title can never be read as a Windows path.
    dst = re.sub(r'[\\/:*?"<>|「」]', '-', dst)
    async with aiohttp.ClientSession() as session:
        req = await fetch(session, url, dst)
        file_size = int(req.headers['content-length'])
        print(f"{dst}的视频总长度:{file_size}")
        # Bytes already on disk mark where the download resumes.
        first_byte = os.path.getsize(dst) if os.path.exists(dst) else 0
        if first_byte >= file_size:
            return file_size
        header = {"Range": f"bytes={first_byte}-{file_size}"}
        pbar = tqdm(
            total=file_size, initial=first_byte,
            unit='B', unit_scale=True, desc=dst)
        await fetch(session, url, dst, p_bar=pbar, headers=header)
    # Return the size on this path too, matching the already-complete case.
    return file_size
def download_from_url(url, dst):
    """
    Download one video synchronously, resuming a partial file if present.

    The file name is sanitised first. A probe request supplies the total size
    from the ``content-length`` header (a missing header raises ``KeyError``).
    If the local file already has at least that many bytes nothing is
    downloaded; otherwise the missing part is requested with a ``Range``
    header and appended to the file.

    :param url: download URL
    :param dst: target file name (video title plus extension)
    :return: total size of the video in bytes
    """
    # Replace characters that are not allowed in file names; the backslash is
    # included so a title can never be read as a Windows path.
    dst = re.sub(r'[\\/:*?"<>|「」]', '-', dst)
    # Only the headers are needed here; the context manager releases the
    # streamed connection instead of leaving it open.
    with requests.get(url, stream=True, timeout=60) as response:
        file_size = int(response.headers['content-length'])
    # Bytes already on disk mark where the download resumes.
    first_byte = os.path.getsize(dst) if os.path.exists(dst) else 0
    if first_byte >= file_size:
        return file_size
    header = {"Range": f"bytes={first_byte}-{file_size}"}
    pbar = tqdm(
        total=file_size, initial=first_byte,
        unit='B', unit_scale=True, desc=dst)
    try:
        with requests.get(url, headers=header, timeout=60, stream=True) as req, open(dst, 'ab') as f:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    # The last chunk is usually shorter than 1024 bytes.
                    pbar.update(len(chunk))
    finally:
        pbar.close()
    return file_size
# if __name__ == '__main__':
# # 异步方式下载
# url = "http://v11-tt.ixigua.com/7da2b219bc734de0f0d04706a9629b61/5c77ed4b/video/m" \
# "/220d4f4e99b7bfd49efb110892d892bea9011612eb3100006b7bebf69d81/?rc" \
# "=am12NDw4dGlqajMzNzYzM0ApQHRAbzU6Ojw8MzQzMzU4NTUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QHFubHBfZDJrbV8tLTYxL3NzLW8jbyMxLTEtLzEtLjMvLTUvNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D "
# task = [asyncio.ensure_future(async_download_from_url(url, f"{i}.mp4")) for i in range(1, 12)]
# loop = asyncio.get_event_loop()
# loop.run_until_complete(asyncio.wait(task))
# loop.close()