Scraping short videos from a video site with Python

"""
实现步骤:
1.发送网络请求
2.获取数据
3.解析数据:提取视频地址及标题
4.发送网络请求:请求每一个视频地址,获取视频二进制数据
5.保存视频
6.通过关键词下载视频/指定一个用户的视频/翻页下载
"""

import os
import pprint
import time
import requests
import json
import re
# fake_useragent is a third-party library for generating random request
# headers: pip install fake-useragent
from fake_useragent import UserAgent

dir_name = 'videos'  # folder the downloaded videos are saved into
# Create the folder if it does not exist yet
if not os.path.exists(dir_name):
    os.mkdir(dir_name)
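# (os.makedirs(dir_name, exist_ok=True) would fold the existence check and
# the creation into a single call.)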

# 1. Send the request. This endpoint expects a POST whose body carries the
# query; requests returns a Response object, and status code 200 means the
# request succeeded.

# Random User-Agent; disable the UA cache server and skip SSL verification
# when fetching the UA list (note: these keyword arguments were removed in
# fake-useragent >= 1.0, where UserAgent() takes no such arguments)
ua = UserAgent(use_cache_server=False, verify_ssl=False).random
# Request headers: disguise the script as a normal browser so the server
# does not flag it as a crawler
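# (The values below are typically copied from a logged-in browser session
# via the DevTools Network panel; the sec-ch-ua / Sec-Fetch-* entries just
# mirror what Chrome sends.)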
headers = {
    'accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    # Content-Length is omitted on purpose: requests computes it from the
    # body, and a hard-coded value breaks as soon as the payload changes
    'content-type': 'application/json',
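    # The Cookie below is tied to one login session; replace it with a
    # fresh value copied from your own browser before running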
    'Cookie': 'kpf=PC_WEB; kpn=KUAISHOU_VISION; clientid=3; did=web_acb408fff3a5f7cd020782d58bb9caa9; ktrace-context=1|MS43NjQ1ODM2OTgyODY2OTgyLjI4ODYxOTgxLjE2MzczNzIwMzc5NTkuMTQ1NDUxNA==|MS43NjQ1ODM2OTgyODY2OTgyLjI3NzMzOTY1LjE2MzczNzIwMzc5NTkuMTQ1NDUxNQ==|0|graphql-server|webservice|false|NA; client_key=65890b29; userId=1232368006; kuaishou.server.web_st=ChZrdWFpc2hvdS5zZXJ2ZXIud2ViLnN0EqABXhLnnN974NXDx7wxD7EXA0gUwiENGncAU1PMNvGRI8hgQVPES30K2a6e8FZ9L3yv89WVXIZ5I1HsDjjWJlzDijZgHPj64KgQ8dkTm8-Aq5monZejiGHAuenrIuDovugsUnncYRtFHLY_bmEtKpBDoaswti5UnDOkiVHAuhMMPlqdPBKYwV_LZ3SGFMeznHUrJv5Wg4o4C45yi-1iuOPyDRoSsmhEcimAl3NtJGybSc8y6sdlIiCHg_pUdXqAoXPplQJ-iHcM2h_MTI_3Wkdnw9ucUMR5UCgFMAE; kuaishou.server.web_ph=b3651a369fb9eb9f33d30ccc2cc691a5ecbf',
    'Host': 'www.kuaishou.com',
    'Origin': 'https://www.kuaishou.com',
    'Referer': 'https://www.kuaishou.com/search/video?searchKey=%E6%85%A2%E6%91%87',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    "User-Agent": ua
}

# Search keyword
keyword = input("Enter the keyword to search for: ")
# Paginate over the first five result pages
for page_num in range(1, 6):
    data = {
        'operationName': "visionSearchPhoto",
        'query': "query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {\n  visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {\n    result\n    llsid\n    webPageArea\n    feeds {\n      type\n      author {\n        id\n        name\n        following\n        headerUrl\n        headerUrls {\n          cdn\n          url\n          __typename\n        }\n        __typename\n      }\n      tags {\n        type\n        name\n        __typename\n      }\n      photo {\n        id\n        duration\n        caption\n        likeCount\n        realLikeCount\n        coverUrl\n        photoUrl\n        liked\n        timestamp\n        expTag\n        coverUrls {\n          cdn\n          url\n          __typename\n        }\n        photoUrls {\n          cdn\n          url\n          __typename\n        }\n        animatedCoverUrl\n        stereoType\n        videoRatio\n        __typename\n      }\n      canAddComment\n      currentPcursor\n      llsid\n      status\n      __typename\n    }\n    searchSessionId\n    pcursor\n    aladdinBanner {\n      imgUrl\n      link\n      __typename\n    }\n    __typename\n  }\n}\n",
        'variables': {'keyword': keyword, 'pcursor': f'{page_num}', 'page': "search"}
    }
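    # The payload follows the usual GraphQL-over-HTTP shape: operationName,
    # the query text, and a variables dict. Only 'keyword' and 'pcursor'
    # change between requests.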
    # Because the content-type header is 'application/json', the request
    # body must be a JSON string, so serialize the dict before sending
    data = json.dumps(data)
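    # (Equivalent shortcut: requests.post(url, headers=headers, json=data)
    # serializes the dict and sets the Content-Type header by itself.)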
    time.sleep(2)  # throttle requests so we do not hammer the server
    try:
        url = "https://www.kuaishou.com/graphql"
        # 1. Send the network request: a POST with the disguised headers
        # (url: endpoint, headers: browser disguise, data: the query payload)
        response = requests.post(url=url, headers=headers, data=data)
        # 2. Get the data: decode the response body as JSON
        json_data = response.json()
        # pprint.pprint(json_data)  # uncomment to inspect the raw response
        # 3. Parse the data: the video URL and title sit inside nested
        # dictionaries, so pull them out key by key
        feeds_list = json_data['data']['visionSearchPhoto']['feeds']
        for feeds in feeds_list:
            # feeds is a dict describing a single video
            title = feeds['photo']['caption']      # video title
            photoUrl = feeds['photo']['photoUrl']  # direct video URL
            # Windows file names cannot contain \ / : * ? " < > | so map
            # those characters (and newlines) to underscores
            new_title = re.sub(r'[\\/:*?"<>|\n]', '_', title)
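            # Hedged extra (not in the original script): captions can be
            # empty or very long, so clamp the name to 50 characters (an
            # arbitrary cap) and fall back to the photo id when empty
            new_title = new_title[:50] or f"video_{feeds['photo']['id']}"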
            # 4. Request the video URL to get the binary video data (only
            # the User-Agent is forwarded; the Host/Cookie headers above
            # belong to the main site, not the video host)
            mp4_data = requests.get(photoUrl, headers={'User-Agent': ua}).content
            # 5. Save the video
            with open(os.path.join(dir_name, new_title + '.mp4'), mode='wb') as f:
                f.write(mp4_data)
            print(f'{new_title} -- download complete')
        # A full result page carries 20 feeds; fewer means this was the
        # last page, so stop paginating
        if len(feeds_list) < 20:
            break
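        # (Hedged note: the GraphQL response also returns a 'pcursor'
        # value -- the query above requests it -- and feeding that back as
        # the next request's 'pcursor' variable is the cursor-based way to
        # paginate; the plain page counter used here is a simplification.)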
    except Exception as e:
        print(f'Page {page_num} failed: {e}')
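
The loop above reads each whole video into memory via .content before
writing it out. For longer videos a streamed download is easier on memory;
here is a minimal sketch reusing the requests and os imports from the
script (the download_stream name, the 8192-byte chunk size, and the
30-second timeout are my own choices, not part of the original code):

def download_stream(url, path, user_agent):
    # Stream the response body to disk in chunks instead of buffering
    # the whole file in memory
    with requests.get(url, headers={'User-Agent': user_agent},
                      stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

With this helper, steps 4 and 5 inside the loop collapse into a single
call: download_stream(photoUrl, os.path.join(dir_name, new_title + '.mp4'), ua).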

Scraping results: [screenshots of the downloaded videos omitted]

