Python爬虫系列1-沙沙野视频多进程下载

任务需求:

  • 网站地址:https://www.ssyer.com/
  • 使用的库 requests,multiprocessing
  • 使用的技术点:
    • 使用多进程下载
    • 解决cookies过期问题
  • 使用工具:https://curl.trillworks.com/ (一种可以快速生成 headers 以及 cookies 的工具)
# -*- coding: utf-8 -*-
# @Time    : 2020/7/29 6:05 下午
# @Author  : livein80
# @Email   : [email protected]
# @File    : ssyer.py
# @Software : PyCharm
import requests
import os
# 多进程下载
from multiprocessing import Pool


# Presumably a directory for saved JSON responses; it is not referenced by any
# code visible in this file.
json_dir='./json_dir/'
# HTTP headers sent with the list-API POST in get_data(). They present the
# request as a same-origin CORS call made by desktop Chrome from the /cate/2 page.
headers = {
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Accept': 'application/json',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
    'Content-Type': 'application/json',
    'Origin': 'https://www.ssyer.com',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://www.ssyer.com/cate/2',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ko;q=0.7,und;q=0.6',
}
# Pre-serialised JSON request body, passed verbatim as the POST payload in
# get_data(): category 2, first page, 20 items per page.
data = '{"cateId":2,"order":2,"recommendType":1,"page":{"showCount":20,"currentPage":1}}'
# One module-level session shared by every get_data() call, so cookies set by
# earlier responses are kept for later requests.
session = requests.session()
def get_data():
    """POST the video-list query to the ssyer API and return the response.

    The request goes through the module-level ``session`` and carries the
    module-level ``headers`` and ``data`` plus the hard-coded cookie values
    below.

    Returns:
        The ``requests`` response object, unmodified. The status code is not
        checked here; that is left to the caller.
    """
    endpoint = 'https://www.ssyer.com/apis/20001'
    # Cookie values captured from a browser; passed explicitly on every call.
    browser_cookies = {
        'UM_distinctid': '17398ee35fb8eb-0679366a6e0d54-31627304-232800-17398ee35fc744',
        'CNZZDATA1278764889': '613028250-1595997139-https%253A%252F%252Fwww.google.com%252F%7C1595997139',
        '_dg_playback.7b6028a56aac520d.ce42': '1',
        '_dg_abtestInfo.7b6028a56aac520d.ce42': '1',
        '_dg_check.7b6028a56aac520d.ce42': '1',
        '_dg_antiBotFlag.7b6028a56aac520d.ce42': '1',
        '_dg_antiBotInfo.7b6028a56aac520d.ce42': '10%7C%7C%7C3600',
        'SESSION': 'ZTg3OGVjMGUtZjA0Ni00NmVkLTg2MjctMTY0ZWJhODRmYTc2',
        'Hm_lvt_8f50334c83664955c1a1a866dd168053': '1595998616,1595998662',
        'Hm_lpvt_8f50334c83664955c1a1a866dd168053': '1595998662',
        '_dg_id.7b6028a56aac520d.ce42': 'fc0bc167b752f00b%7C%7C%7C1595998616%7C%7C%7C0%7C%7C%7C1595998662%7C%7C%7C1595998616%7C%7C%7C%7C%7C%7Ce809b4e64783781d%7C%7C%7Chttps%3A%2F%2Fwww.google.com%2F%7C%7C%7Chttps%3A%2F%2Fwww.google.com%2F%7C%7C%7C1%7C%7C%7Cundefined',
    }
    return session.post(
        endpoint,
        headers=headers,
        cookies=browser_cookies,
        data=data,
    )

# Download a single video
def start_load_vid(vid_name,vid_url):
    """Download ``vid_url`` and save it as ``./vid/<vid_name>.mp4``.

    Intended to run inside a ``multiprocessing`` pool worker, so several
    copies of this function may execute at the same time.

    Args:
        vid_name: Base file name (without extension) for the saved video.
        vid_url: URL the video bytes are fetched from.

    Raises:
        requests.RequestException: If the HTTP request fails.
        OSError: If the output directory or file cannot be written.
    """
    # NOTE(review): verify=False disables TLS certificate verification. It is
    # kept to preserve existing behaviour; confirm whether it is still needed.
    res = requests.get(vid_url,verify=False).content
    # exist_ok=True replaces the former isdir()/mkdir() pair: with many workers
    # starting together, that check-then-create sequence could raise
    # FileExistsError in every worker that lost the race.
    os.makedirs('./vid/', exist_ok=True)
    with open('./vid/{}.mp4'.format(vid_name),'wb') as file:
        file.write(res)
    print('%s 视频下载完成'%vid_name)
# Build the list of videos to download
def get_vid_lis(list):
    """Convert raw API items into download descriptors.

    Args:
        list: Iterable of mappings, each providing a ``'title'`` and a
            ``'zip'`` entry. (The parameter name shadows the builtin; it is
            kept because it is part of the function's signature.)

    Returns:
        A new list of ``{'name': <title>, 'link': <zip>}`` dicts, in input order.

    Raises:
        KeyError: If an item lacks ``'title'`` or ``'zip'``.
    """
    return [
        {'name': entry['title'], 'link': entry['zip']}
        for entry in list
    ]

# === Start the crawl ===
def start_spider(callback):
    """Fetch the video list, retrying on non-200 responses, and dispatch it.

    ``get_data()`` is called up to five times. On the first HTTP 200 response
    the body is parsed as JSON, handed to ``callback``, and returned. Retries
    reuse the module-level ``session``; presumably the server refreshes its
    cookies on a failed response, so a later attempt can succeed -- confirm
    against the site's actual behaviour.

    Args:
        callback: Callable invoked with the parsed JSON object on success.

    Returns:
        The parsed JSON object, or ``None`` if every attempt returned a
        non-200 status.
    """
    max_attempts = 5
    # A loop replaces the former recursive retry, which discarded the result
    # of the nested call and so returned None even when a retry succeeded.
    for count in range(1, max_attempts + 1):
        print('check-->', count)
        response = get_data()
        print(response.status_code,session.cookies)
        if response.status_code == 200:
            json_obj = response.json()
            callback(json_obj)
            return json_obj
    print('cookies更新失败!')
    return None
def start_download(json_obj):
    """Download every video listed in ``json_obj['data']`` using a process pool.

    Args:
        json_obj: Parsed API response; its ``'data'`` entry is passed to
            ``get_vid_lis`` to build the download list.

    Raises:
        KeyError: If ``json_obj`` has no ``'data'`` entry.
    """
    items = json_obj['data']
    # === Build the download list ===
    vid_list = get_vid_lis(items)
    # Downloads run in 15 worker processes (not threads).
    with Pool(15) as pool:
        pending = [
            (obj['name'], pool.apply_async(start_load_vid, args=(obj['name'], obj['link'])))
            for obj in vid_list
        ]
        # No more tasks will be submitted; wait for the workers to finish.
        pool.close()
        pool.join()
        # apply_async stores a worker's exception in its AsyncResult and only
        # re-raises it from get(). Without this pass, failed downloads would
        # vanish silently.
        for name, result in pending:
            try:
                result.get()
            except Exception as exc:  # report and continue with the other videos
                print('%s 视频下载失败: %r' % (name, exc))
    print('所有视频现在完成!')

if __name__=='__main__':
    # Script entry point: fetch the video list and pass the parsed JSON to
    # start_download. Keeping this under the __main__ guard also stops worker
    # processes that re-import this module from starting the crawl again.
    start_spider(start_download)

你可能感兴趣的:(python爬虫系列1-沙沙野视频多线程)