快手(非逆向)

前言

本文章非逆向破解,方法是PC端快手视频采集。

PC端的接口,比较好拿。只需登录快手账号,使用抓包工具,即能抓到接口。

爬取需要cookie,只需一个参数did=web_xxxxxxxxxxxxxxxx;获取方式很简单:随便复制一个短链接到浏览器打开,在浏览器开发者工具(或抓包工具)的请求头里找到并复制did=xxxxxxxxx即可。

直接上代码!

from lxml import etree
import re,requests,json,time

class KuaiShou(object):
    """Collect Kuaishou videos through the PC/web endpoints (no reverse engineering).

    Takes a list of share short-links; each link is resolved to its real
    (long) URL, then dispatched to either the single-video handler or the
    user-profile (video list) handler.  All results are printed.
    """

    def __init__(self, collect_urls_list):
        self.collect_urls_list = collect_urls_list
        # Cookie must be obtained by hand; observed to stay valid for at
        # least 4 days.  BUGFIX: this must be a plain string -- the original
        # code had a trailing comma here, turning the value into a 1-tuple
        # and producing an invalid Cookie header in the requests below.
        self.cookies = 'did=xxxxxxxxxx;'
        # Fill in a requests-style proxies mapping when crawling at volume.
        self.proxies = ''

        for short_url in self.collect_urls_list:
            url = self.getKuaiShouRealAddress(short_url)
            if not url:
                continue
            print(url)
            # Share link for a single video.
            if re.search('/fw/photo/', url):
                print('单个视频')
                self.requestsSingleVideo(url)

            # Share link for a user's video list.
            if re.search('/fw/user/', url):
                print('用户作品')
                # Raw strings so the regex escapes survive unmangled.
                user_id = re.findall(r"/fw/user/(.*)\?", url)[0]
                fid = re.findall(r'fid=(\d+)', url)[0]
                share_id = re.findall(r'shareId=(\d+)', url)[0]
                self.requestsUserVideo(user_id, fid, share_id)

    def getKuaiShouRealAddress(self, url):
        """Resolve a share short-link to its long URL.

        Returns the redirect's ``Location`` header, or ``None`` when the
        server does not answer with a redirect.
        """
        HEADERS = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36'
        }
        res = requests.get(url, headers=HEADERS, allow_redirects=False, verify=False)
        # Accept any redirect status, not just 302, so permanent/temporary
        # redirects resolve the same way.
        if res.status_code in (301, 302, 303, 307, 308):
            return res.headers.get('Location')
        return None

    def requestsSingleVideo(self, url):
        """Fetch one video page, print its metadata, then crawl its comments."""
        long_url = url
        try:
            headers2 = {
                'Host': 'npstianjin.s.kuaishouapp.com',
                'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36'
            }
            resp = requests.get(long_url, headers=headers2, proxies=self.proxies, verify=False)
            resp_html = etree.HTML(resp.text)
            # The page embeds its data as JSON in a hidden div; a missing div
            # (login wall, removed video) raises IndexError, handled below.
            video_content = resp_html.xpath('*//div[@id="hide-pagedata"]/@data-pagedata')[0]
            json_data = json.loads(video_content)
            # Video id
            aweme_id = json_data['photoId']
            # Presumably the clip duration -- confirm against the API payload.
            duration = json_data['rawPhoto']['ext_params']['video']
            # Caption / description
            desc = json_data['rawPhoto']['caption']
            # Publish timestamp
            create_time = json_data['rawPhoto']['timestamp']
            # Watermark-free source URL
            download_url = json_data['video']['srcNoMark']
            # Used as the file name when saving
            uri = aweme_id
            # Uploader's display name
            author = json_data['rawPhoto']['userName']
            # Comment count
            comment_count = json_data['rawPhoto']['commentCount']
            # Like count
            digg_count = json_data['rawPhoto']['likeCount']
            # Download count -- not exposed by this endpoint
            download_count = 0
            # Play/view count
            play_count = json_data['rawPhoto']['viewCount']
            # Share (forward) count
            share_count = json_data['rawPhoto']['forwardCount']
            # Background-music fields -- not exposed by this endpoint
            music_author = ''
            music_title = ''
            # Cover image URL
            picture_url = json_data['video']['poster']
            print(aweme_id, author, duration, desc, create_time, download_url, comment_count,
                  digg_count,
                  download_count, play_count, share_count, uri, music_author,
                  music_title, picture_url)
            self.get_comment(aweme_id)
        except (ValueError, KeyError, IndexError) as e:
            # IndexError covers a missing hide-pagedata div so one bad video
            # does not abort the whole crawl.
            print("Json Error", str(e))

    def requestsUserVideo(self, user_id, fid, share_id):
        """Page through a user's public feed and process each video.

        ``page`` caps the number of feed pages fetched (currently 1); raise
        the bound in the ``while`` condition to crawl more pages.
        """
        page = 0
        print(user_id)
        pcursor = ''
        while page < 1:
            data = {
                'operationName': 'publicFeedsQuery',
                'query': 'query publicFeedsQuery($principalId: String, $pcursor: String, $count: Int) {\r\n  publicFeeds(principalId: $principalId, pcursor: $pcursor, count: $count) {\r\n    pcursor\r\n    live {\r\n      user {\r\n        id\r\n        avatar\r\n        name\r\n        __typename\r\n      }\r\n      watchingCount\r\n      poster\r\n      coverUrl\r\n      caption\r\n      id\r\n      playUrls {\r\n        quality\r\n        url\r\n        __typename\r\n      }\r\n      quality\r\n      gameInfo {\r\n        category\r\n        name\r\n        pubgSurvival\r\n        type\r\n        kingHero\r\n        __typename\r\n      }\r\n      hasRedPack\r\n      liveGuess\r\n      expTag\r\n      __typename\r\n    }\r\n    list {\r\n      id\r\n      thumbnailUrl\r\n      poster\r\n      workType\r\n      type\r\n      useVideoPlayer\r\n      imgUrls\r\n      imgSizes\r\n      magicFace\r\n      musicName\r\n      caption\r\n      location\r\n      liked\r\n      onlyFollowerCanComment\r\n      relativeHeight\r\n      timestamp\r\n      width\r\n      height\r\n      counts {\r\n        displayView\r\n        displayLike\r\n        displayComment\r\n        __typename\r\n      }\r\n      user {\r\n        id\r\n        eid\r\n        name\r\n        avatar\r\n        __typename\r\n      }\r\n      expTag\r\n      __typename\r\n    }\r\n    __typename\r\n  }\r\n}\r\n',
                'variables': {"principalId": user_id, "pcursor": pcursor, "count": 24}
            }
            headers = {
                'Origin': 'https://live.kuaishou.com',
                'Host': 'live.kuaishou.com',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
                'content-type': 'application/json',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Accept-Encoding': 'gzip, deflate, br',
                'accept': '*/*',
                'Sec-Fetch-Mode': 'cors',
                'Sec-Fetch-Site': 'same-origin',
                'Cookie': self.cookies,
                'Referer': 'https://live.kuaishou.com/profile/{}'.format(user_id)
            }
            a_url = 'https://live.kuaishou.com/m_graphql'
            resp = requests.post(a_url, headers=headers, json=data, verify=False)
            json_data = json.loads(resp.content)
            user_list = json_data['data']['publicFeeds']['list']
            if not user_list:
                break
            for item in user_list:
                video_id = item['id']
                # BUGFIX: keep the feed item's owner id in a local name
                # instead of clobbering the ``user_id`` parameter, which the
                # Referer header above relies on for subsequent pages.
                owner_id = item['user']['id']
                every_url = 'https://npsshanghai.s.kuaishouapp.com/fw/photo/{}?fid={}&cc=share_copylink&shareMethod=TOKEN&docId=0&kpn=NEBULA&subBiz=PHOTO&photoId={}&shareId={}&shareToken=&shareResourceType=PHOTO_OTHER&userId={}&shareType=1&et=1_i%2F0_unknown0&groupName=&appType=22'.format(video_id, fid, video_id, share_id, owner_id)
                print(every_url)
                self.requestsSingleVideo(every_url)
            # BUGFIX: the original assigned the whole publicFeeds dict to
            # pcursor (then ``pcursor = pcursor``); extract the actual cursor
            # so the next iteration requests the next page.
            pcursor = json_data['data']['publicFeeds'].get('pcursor') or ''
            if not pcursor:
                break
            page += 1

    def get_comment(self, aweme_id):
        """Page through a video's comment list (at most 11 pages) and print each."""
        url = "https://live.kuaishou.com/m_graphql"
        pcursor = ''
        i = 0
        while i <= 10:
            headers = {
                'Origin': 'https://live.kuaishou.com',
                'Host': 'live.kuaishou.com',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
                'content-type': 'application/json',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Accept-Encoding': 'gzip, deflate, br',
                'accept': '*/*',
                'Sec-Fetch-Site': 'same-origin',
                'Sec-Fetch-Dest': 'empty',
                'Sec-Fetch-Mode': 'cors',
                'Cookie': self.cookies,
            }
            parm_data = {"operationName": "commentListQuery",
                         "variables": {"pcursor": pcursor, "photoId": aweme_id, "page": 1, "count": 20},
                         "query": 'query commentListQuery($photoId: String, $page: Int, $pcursor: String, $count: Int) {\n  shortVideoCommentList(photoId: $photoId, page: $page, pcursor: $pcursor, count: $count) {\n    commentCount\n    realCommentCount\n    pcursor\n    commentList {\n      commentId\n      authorId\n      authorName\n      content\n      headurl\n      timestamp\n      authorEid\n      status\n      subCommentCount\n      subCommentsPcursor\n      likedCount\n      liked\n      subComments {\n        commentId\n        authorId\n        authorName\n        content\n        headurl\n        timestamp\n        authorEid\n        status\n        replyToUserName\n        replyTo\n        replyToEid\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n'}
            resp = requests.post(url, json=parm_data, headers=headers, verify=False)
            json_data = json.loads(resp.content)
            pcursor = json_data['data']['shortVideoCommentList']['pcursor']
            for comments in json_data['data']['shortVideoCommentList']['commentList']:
                # Comment id
                cid = comments['commentId']
                text = comments['content']  # comment body
                create_time = comments['timestamp']  # posted at
                digg_count = comments['likedCount']  # like count
                nickname = comments['authorName']  # commenter's name
                print(cid, nickname, text, create_time, digg_count)
            print('.....................................')
            # Stop early once the server stops returning a cursor, instead of
            # re-requesting the same (empty) page.
            if not pcursor:
                break
            i += 1

if __name__ == '__main__':
    # Guard the example crawl so importing this module does not fire network
    # requests; run the file directly to start collecting.
    KuaiShou(['https://v.kuaishouapp.com/s/swoBNPWu'])

本文有参考其他文章,目前找不到链接,如有侵权,请联系本人。有冒犯之处,见谅。

你可能感兴趣的:(爬虫)