本文不涉及逆向破解,介绍的是通过 PC 端接口采集快手视频的方法。
PC 端的接口比较容易获取:只需登录快手账号,再使用抓包工具,就能抓到接口。
爬取需要 cookie,但只需其中一个参数 did=web_xxxxxxxxxxxxxxxx。获取方式很简单:随便复制一个短链接,在浏览器中打开,然后复制 cookie 中的 did=web_xxxxxxxxxxxxxxxx 即可。
直接上代码!
from lxml import etree
import re,requests,json,time
class KuaiShou(object):
    """Collect Kuaishou video metadata and comments from share links.

    Constructing an instance immediately processes every URL in
    ``collect_urls_list``: each short link is resolved to its redirect target,
    which is then handled either as a single video page (``/fw/photo/``) or as
    a user's public feed (``/fw/user/``). All collected fields are printed.

    NOTE(review): every request is sent with ``verify=False`` (TLS certificate
    verification disabled), exactly as in the original code. Consider enabling
    verification once the target hosts are known to present valid certificates.
    """

    # GraphQL endpoint used for both the public feed and the comment list.
    GRAPHQL_URL = 'https://live.kuaishou.com/m_graphql'
    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 10
    # HTTP status codes that carry a ``Location`` header worth following.
    _REDIRECT_CODES = (301, 302, 303, 307, 308)
    # Cursor values treated as "no further pages".
    # NOTE(review): 'no_more' is assumed to be the API's end-of-list cursor;
    # confirm against a live response.
    _END_CURSORS = (None, '', 'no_more')

    _PUBLIC_FEEDS_QUERY = 'query publicFeedsQuery($principalId: String, $pcursor: String, $count: Int) {\r\n publicFeeds(principalId: $principalId, pcursor: $pcursor, count: $count) {\r\n pcursor\r\n live {\r\n user {\r\n id\r\n avatar\r\n name\r\n __typename\r\n }\r\n watchingCount\r\n poster\r\n coverUrl\r\n caption\r\n id\r\n playUrls {\r\n quality\r\n url\r\n __typename\r\n }\r\n quality\r\n gameInfo {\r\n category\r\n name\r\n pubgSurvival\r\n type\r\n kingHero\r\n __typename\r\n }\r\n hasRedPack\r\n liveGuess\r\n expTag\r\n __typename\r\n }\r\n list {\r\n id\r\n thumbnailUrl\r\n poster\r\n workType\r\n type\r\n useVideoPlayer\r\n imgUrls\r\n imgSizes\r\n magicFace\r\n musicName\r\n caption\r\n location\r\n liked\r\n onlyFollowerCanComment\r\n relativeHeight\r\n timestamp\r\n width\r\n height\r\n counts {\r\n displayView\r\n displayLike\r\n displayComment\r\n __typename\r\n }\r\n user {\r\n id\r\n eid\r\n name\r\n avatar\r\n __typename\r\n }\r\n expTag\r\n __typename\r\n }\r\n __typename\r\n }\r\n}\r\n'

    _COMMENT_LIST_QUERY = 'query commentListQuery($photoId: String, $page: Int, $pcursor: String, $count: Int) {\n shortVideoCommentList(photoId: $photoId, page: $page, pcursor: $pcursor, count: $count) {\n commentCount\n realCommentCount\n pcursor\n commentList {\n commentId\n authorId\n authorName\n content\n headurl\n timestamp\n authorEid\n status\n subCommentCount\n subCommentsPcursor\n likedCount\n liked\n subComments {\n commentId\n authorId\n authorName\n content\n headurl\n timestamp\n authorEid\n status\n replyToUserName\n replyTo\n replyToEid\n __typename\n }\n __typename\n }\n __typename\n }\n}\n'

    def __init__(self, collect_urls_list):
        """Store the configuration and process every URL in ``collect_urls_list``.

        Args:
            collect_urls_list: iterable of share URLs (short links) to collect.
        """
        self.collect_urls_list = collect_urls_list
        # The cookie must be obtained manually; per the original author it
        # stays valid for at least 4 days in testing.
        # Fixed: a trailing comma previously turned this into a 1-tuple, which
        # is not a valid value for the ``Cookie`` header.
        self.cookies = 'did=xxxxxxxxxx;'
        # Large-scale crawling needs a proxy; an empty value means "no proxy".
        self.proxies = ''
        for short_url in self.collect_urls_list:
            url = self.getKuaiShouRealAddress(short_url)
            if not url:
                continue
            print(url)
            if '/fw/photo/' in url:
                # URL of a single video.
                print('单个视频')
                self.requestsSingleVideo(url)
            elif '/fw/user/' in url:
                # URL of a user's list of videos.
                print('用户作品')
                parsed = self._parse_user_share_url(url)
                if parsed is None:
                    # The link lacks the user id, fid or shareId that the
                    # feed request needs; skip it instead of crashing.
                    print('Unrecognised user share url', url)
                    continue
                self.requestsUserVideo(*parsed)

    @staticmethod
    def _parse_user_share_url(url):
        """Return ``(user_id, fid, share_id)`` parsed from a user share URL.

        Returns ``None`` when any of the three parts is missing.
        """
        user = re.search(r'/fw/user/([^/?#]+)', url)
        fid = re.search(r'fid=(\d+)', url)
        share = re.search(r'shareId=(\d+)', url)
        if not (user and fid and share):
            return None
        return user.group(1), fid.group(1), share.group(1)

    def _graphql_headers(self):
        """Build the request headers shared by both GraphQL queries."""
        return {
            'Origin': 'https://live.kuaishou.com',
            'Host': 'live.kuaishou.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
            'content-type': 'application/json',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'accept': '*/*',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'Cookie': self.cookies,
        }

    def _post_graphql(self, payload, headers):
        """POST ``payload`` to the GraphQL endpoint and return its ``data`` dict.

        Returns ``None`` (after printing the reason) when the request fails or
        the response is not a JSON object containing a ``data`` object.
        """
        try:
            resp = requests.post(self.GRAPHQL_URL, headers=headers, json=payload,
                                 proxies=self.proxies or None, verify=False,
                                 timeout=self.REQUEST_TIMEOUT)
            data = json.loads(resp.content)['data']
        except requests.RequestException as e:
            print("Request Error", str(e))
            return None
        except (ValueError, KeyError, TypeError) as e:
            print("Json Error", str(e))
            return None
        return data if isinstance(data, dict) else None

    def getKuaiShouRealAddress(self, url):
        """Resolve a share link to the address it redirects to.

        Args:
            url: the share URL to resolve.

        Returns:
            The ``Location`` header of the redirect response, or ``None`` when
            the request fails or the response is not a redirect.
        """
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36'
        }
        try:
            # Redirects are not followed: the redirect target itself is the result.
            res = requests.get(url, headers=headers, allow_redirects=False,
                               proxies=self.proxies or None, verify=False,
                               timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print("Request Error", str(e))
            return None
        if res.status_code in self._REDIRECT_CODES:
            return res.headers.get('Location')
        return None

    def requestsSingleVideo(self, url):
        """Fetch one video page, print its metadata, then print its comments.

        The metadata is read from the JSON stored in the ``data-pagedata``
        attribute of the ``div#hide-pagedata`` element of the page.

        Args:
            url: full URL of the video page.
        """
        # No explicit Host header: requests derives it from ``url``, so pages
        # served from different hosts are requested correctly.
        headers = {
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36'
        }
        try:
            resp = requests.get(url, headers=headers, proxies=self.proxies or None,
                                verify=False, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print("Request Error", str(e))
            return

        page = etree.HTML(resp.text) if resp.text.strip() else None
        nodes = page.xpath('*//div[@id="hide-pagedata"]/@data-pagedata') if page is not None else []
        if not nodes:
            print("Json Error", "hide-pagedata element not found")
            return

        try:
            json_data = json.loads(nodes[0])
            raw_photo = json_data['rawPhoto']
            video = json_data['video']
            # Video ID
            aweme_id = json_data['photoId']
            # Duration (label from the original author; read from ext_params.video)
            duration = raw_photo['ext_params']['video']
            # Video description
            desc = raw_photo['caption']
            # Video creation time
            create_time = raw_photo['timestamp']
            # Video URL
            download_url = video['srcNoMark']
            # Name used for the video
            uri = aweme_id
            # Video publisher
            author = raw_photo['userName']
            # Number of comments
            comment_count = raw_photo['commentCount']
            # Number of likes
            digg_count = raw_photo['likeCount']
            # Number of downloads (not read from the page; always 0)
            download_count = 0
            # Number of plays
            play_count = raw_photo['viewCount']
            # Number of shares
            share_count = raw_photo['forwardCount']
            # Background music author (not read from the page; always empty)
            music_author = ''
            # Background music title (not read from the page; always empty)
            music_title = ''
            # Cover image URL
            picture_url = video['poster']
        except (ValueError, KeyError, TypeError) as e:
            print("Json Error", str(e))
            return

        print(aweme_id, author, duration, desc, create_time, download_url, comment_count,
              digg_count,
              download_count, play_count, share_count, uri, music_author,
              music_title, picture_url)
        self.get_comment(aweme_id)

    def requestsUserVideo(self, user_id, fid, share_id, max_pages=1):
        """Walk a user's public feed and process every video found.

        Args:
            user_id: id of the user whose feed is requested.
            fid: ``fid`` value copied into each generated video URL.
            share_id: ``shareId`` value copied into each generated video URL.
            max_pages: maximum number of feed pages (24 items each) to request.
                Defaults to 1, the limit the original code hard-coded.
        """
        print(user_id)
        headers = self._graphql_headers()
        headers['Referer'] = 'https://live.kuaishou.com/profile/{}'.format(user_id)
        pcursor = ''
        for _ in range(max_pages):
            payload = {
                'operationName': 'publicFeedsQuery',
                'query': self._PUBLIC_FEEDS_QUERY,
                'variables': {"principalId": user_id, "pcursor": pcursor, "count": 24}
            }
            data = self._post_graphql(payload, headers)
            feeds = (data or {}).get('publicFeeds') or {}
            videos = feeds.get('list') or []
            if not videos:
                break
            for item in videos:
                try:
                    video_id = item['id']
                    # Kept in its own name so the ``user_id`` used for the
                    # next page request is never overwritten.
                    author_id = item['user']['id']
                except (KeyError, TypeError) as e:
                    print("Json Error", str(e))
                    continue
                every_url = 'https://npsshanghai.s.kuaishouapp.com/fw/photo/{}?fid={}&cc=share_copylink&shareMethod=TOKEN&docId=0&kpn=NEBULA&subBiz=PHOTO&photoId={}&shareId={}&shareToken=&shareResourceType=PHOTO_OTHER&userId={}&shareType=1&et=1_i%2F0_unknown0&groupName=&appType=22'.format(video_id, fid, video_id, share_id, author_id)
                print(every_url)
                self.requestsSingleVideo(every_url)
            # Fixed: take the cursor field, not the whole ``publicFeeds`` object.
            pcursor = feeds.get('pcursor')
            if pcursor in self._END_CURSORS:
                break

    def get_comment(self, aweme_id, max_pages=11):
        """Print the comments of a video, page by page.

        Args:
            aweme_id: id of the video (sent as ``photoId``).
            max_pages: maximum number of comment pages (20 comments each) to
                request. Defaults to 11, the number of requests the original
                loop issued.
        """
        headers = self._graphql_headers()
        headers['Sec-Fetch-Dest'] = 'empty'
        pcursor = ''
        for _ in range(max_pages):
            payload = {"operationName": "commentListQuery",
                       "variables": {"pcursor": pcursor, "photoId": aweme_id, "page": 1, "count": 20},
                       "query": self._COMMENT_LIST_QUERY}
            data = self._post_graphql(payload, headers)
            comment_page = (data or {}).get('shortVideoCommentList') or {}
            comments = comment_page.get('commentList') or []
            for comment in comments:
                try:
                    # Comment id, author name, text, time and like count.
                    print(comment['commentId'], comment['authorName'], comment['content'],
                          comment['timestamp'], comment['likedCount'])
                except (KeyError, TypeError) as e:
                    print("Json Error", str(e))
            print('.....................................')
            pcursor = comment_page.get('pcursor')
            # Stop as soon as there is nothing more to fetch instead of
            # repeating the same request for the remaining iterations.
            if not comments or pcursor in self._END_CURSORS:
                break
if __name__ == '__main__':
    # Run the collector on a sample share link only when this file is executed
    # as a script, so that importing the module performs no network requests.
    KuaiShou(['https://v.kuaishouapp.com/s/swoBNPWu'])
本文参考了其他文章,但目前找不到原文链接;如有侵权,请联系本人。如有冒犯之处,敬请见谅。