# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import time
import json
import random
import re
import math
# url = 'https://www.bilibili.com/video/BV1tu411Z7Kb'  # alternative test video
# url = 'https://www.bilibili.com/video/BV1fM4y1V75r'  # alternative test video
url = 'https://www.bilibili.com/video/BV16g411G7T3'
user_agent = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/94.0.4606.81 Safari/537.36')
cookie = 'sn-assist={"show":false,"audio":false,"speed":"middle","zomm":1,"cursor":false,"pointer":false,"bigtext":false,"overead":false,"bgcolor":false}; isScale=false; _snvd=1634609426115AdhC1LXO4xC; tradeMA=137; cityId=9051; districtId=10346; hm_guid=fc453f86-178f-4108-963d-c6de8332e851; totalProdQty=0; SN_CITY=190_755_1000051_9051_01_10346_1_1_99_7550199; _df_ud=462966b2-f8b5-4d70-8607-852014a9a6e3; _snmc=1; _snsr=baidu|brand||title|suning:brand; authId=sin5aINSpE0u6Rl0Pc9aBk7S5MrM8WCoPa; secureToken=56DA93357439DB6E8EDFB00E413167DB; ssotbrd=TGTVyLKJDzszqNUH6W1V0CpV0pEm0IAfcVz3FSoW1eW; streetCode=7550199; _snzwt=THFA3x17c9b4393975Lmu34b3; _snms=16346924830053819; route=866d5a3e6e0a7894cd5f6be4e51b1c51; _snadtp=1; _snadid=HT_40097935_100000006_12203835633_20211020; smhst=12203835633|0071201482a12290322739|0071201482a12122946310|0000000000a12289097817|0071201482; _snma=1|163460942575786614|1634609425757|1634692548010|1634692575751|10|3; _snmp=163469257503650408; _snmb=163469246909449769|1634692575778|1634692575757|8; token=64466f3f-496d-4aad-ada1-dd1bcbc95d80'
headers = {'User-Agent': user_agent, 'Cookie': cookie}
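# Note: the hard-coded cookie above looks like it was pasted from another site's
# session (the key names reference suning.com). The public comment API generally
# answers without it; if requests start failing, substitute a cookie from your own
# logged-in bilibili session.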
wb_data = requests.get(url, headers=headers)
soup = BeautifulSoup(wb_data.text, 'html.parser')
# print(soup)  # debug: dump the fetched page
oid_raw = re.findall(r'window\.__INITIAL_STATE__={"aid":(\d+)', soup.text)
if oid_raw:
    oid = oid_raw[0]  # the video's numeric ID (aid), used to build the comment-API URLs
else:
    raise SystemExit('Failed to extract oid!')
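# `aid` is the video's legacy numeric ID. bilibili pages embed it, together with the
# engagement stats read out further down, in a window.__INITIAL_STATE__ JSON blob
# inside the HTML, which is why plain regexes over soup.text can pick it up.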
taskName = ''  # task name
platform_name = 'B站'  # channel name
item_id = url.rsplit('/', 1)[-1]  # page ID (the BV id)
print('item_id', item_id)
title_raw = soup.select('span[class="tit"]') or soup.select('span[class="tit tr-fix"]')  # the title node varies between page layouts
title = title_raw[0].get_text()  # content title
print('title', title)
article_url = url  # content link
content = soup.select('div[class="desc-info desc-v2 open"]')[0].text  # video description (body text)
if content == '':
    content = '无'
print('content', content)
media_name_raw = soup.select('div[class="name"]')
if media_name_raw:
    media_name = media_name_raw[0].get_text().strip().split(' ')[0]  # single uploader's account name
    print('media_name', media_name)
else:
    # multi-member creation team: each member has its own avatar-name container
    media_name_raw = soup.select('div[id="member-container"]')[0].find_all(
        'div', class_="avatar-name__container")
    for item in media_name_raw:
        media_name = item.get_text()
        print('media_name', media_name)
read_count = soup.select('span[class="view"]')[0].get_text().split('播放')[0]  # view (play) count
print('read_count', read_count)
# Note: the original patterns captured a single digit (\d), which silently truncated
# any count above 9 and broke the pagination math below; (\d+) fixes that.
comments_count = re.findall(r'"stat":{\S+"reply":(\d+)', soup.text)[0]  # total comment count
print('comments_count', comments_count)
share_count = re.findall(r'"stat":{\S+"share":(\d+)', soup.text)[0]  # share count
print('share_count', share_count)
like_count = re.findall(r'"stat":{\S+"like":(\d+)', soup.text)[0]  # like count
print('like_count', like_count)
publish_time = soup.select('div[class="video-data"]')[0].find_all('span')[2].get_text()  # upload time
print('publish_time', publish_time)
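# A sturdier alternative (sketch, commented out): parse the __INITIAL_STATE__ blob once
# with json instead of separate regexes. It assumes the blob is terminated by the
# `;(function()` seen on these pages and that the stats live under videoData.stat;
# both were observed at the time of writing and may change.
# state_match = re.search(r'window\.__INITIAL_STATE__=(\{.+?\});\(function', soup.text)
# if state_match:
#     stat = json.loads(state_match.group(1))['videoData']['stat']
#     comments_count, share_count, like_count = stat['reply'], stat['share'], stat['like']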
def get_comments_json_data(comments_url):
    """
    Fetch a page of asynchronously loaded comment data and decode it as JSON.
    :param comments_url:
    :return: dict decoded from the response body
    """
    time.sleep(random.random() + 1)  # pause 1-2 s between requests to go easy on the API
    comments_data = requests.get(comments_url, headers=headers)
    return comments_data.json()
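# A defensive wrapper (sketch): bilibili's JSON envelope carries a `code` field that
# is 0 on success and a `message` field describing failures. Checking it here is an
# addition for illustration, not part of the original script.
def get_comments_json_data_checked(comments_url):
    data = get_comments_json_data(comments_url)
    if data.get('code') != 0:  # a non-zero code means the API rejected the request
        raise RuntimeError('bilibili API error: %s' % data.get('message'))
    return data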
def comment_time_handle(para_time):
    """
    Convert the numeric timestamp found in the page into a readable date.
    :param para_time:
    :return: date-time string in %Y-%m-%d %H:%M:%S format
    """
    tup_time = time.localtime(int(para_time))  # seconds-precision Unix timestamp
    return time.strftime("%Y-%m-%d %H:%M:%S", tup_time)  # comment time
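# Example (assuming a UTC+8 local timezone; time.localtime() is timezone-dependent):
# comment_time_handle(1634692575) -> '2021-10-20 09:16:15'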
def variable_time_stamp():
    """
    Return the millisecond timestamp string appended to the API URLs.
    :return: 13-digit string such as '1634692575000'
    """
    return str(int(time.time() * 1000))
comments_url_base = 'https://api.bilibili.com/x/v2/reply/main?jsonp=jsonp&next={page}&type=1&oid={oid}&mode=3&plat=1&_={time_stamp}'
fold_url_base = 'https://api.bilibili.com/x/v2/reply/reply?jsonp=jsonp&pn={sub_page}&type=1&oid={oid}&ps=10&root={rpid}&_={time_stamp}'
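# The trailing `_` parameter is the usual cache-busting timestamp. A formatted
# first-page URL looks like this (the oid value is illustrative):
# https://api.bilibili.com/x/v2/reply/main?jsonp=jsonp&next=1&type=1&oid=931676146&mode=3&plat=1&_=1634692575000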
comments_num = 0
page = 1
print('estimated pages of top-level comments:', math.ceil(int(comments_count) / 20))
# The original wrapped this in a `for page in range(...)` that never advanced because
# the inner `while 1` only left via exit(1); a single loop-until-empty is what it meant.
while True:  # the API pages via `next`; keep going until a page comes back empty
    print('page——————>', page)
    comments_url = comments_url_base.format(page=page, oid=oid, time_stamp=variable_time_stamp())
    comments_json_data = get_comments_json_data(comments_url)  # one page of top-level comments as JSON
    comments_list = comments_json_data.get('data').get('replies')
    if comments_list:
        time_stamp = variable_time_stamp()
        for item in comments_list:  # walk this page's top-level comments
            user_name = item.get('member').get('uname')  # top-level commenter's account name
            comment_like_count = item.get('like')  # top-level comment like count
            comment_time = comment_time_handle(item.get('ctime'))  # top-level comment time
            text = item.get('content').get('message')  # top-level comment body
            print(user_name, comment_like_count, comment_time, text)
            comments_num += 1
            rpid = item.get('rpid')  # this comment's reply ID, passed as `root` to fetch its sub-replies
            fold_url = fold_url_base.format(sub_page=1, oid=oid, rpid=rpid, time_stamp=time_stamp)
            fold_json_data = get_comments_json_data(fold_url)  # folded (second-level) replies
            sub_comment_count = int(fold_json_data.get('data').get('page').get('count'))  # reply count under this comment
            if sub_comment_count > 0:  # the comment has replies
                sub_comment_page = math.ceil(sub_comment_count / 10)  # folded replies come 10 per page, round up
                for sub_page in range(1, sub_comment_page + 1):  # walk every page of folded replies
                    fold_url = fold_url_base.format(sub_page=sub_page, oid=oid, rpid=rpid, time_stamp=time_stamp)
                    fold_json_data = get_comments_json_data(fold_url)  # folded replies
                    fold_comments_list = fold_json_data.get('data').get('replies') or []  # `replies` can be null
                    for sub_item in fold_comments_list:  # walk the folded replies on this page
                        sub_user_name = sub_item.get('member').get('uname')  # second-level commenter's account name
                        sub_comment_like_count = sub_item.get('like')  # second-level comment like count
                        sub_comment_time = comment_time_handle(sub_item.get('ctime'))  # second-level comment time
                        sub_text = sub_item.get('content').get('message')  # second-level comment body
                        print(sub_user_name, sub_comment_like_count, sub_comment_time, sub_text)
                        comments_num += 1
        page += 1
    else:
        print('total comments:', comments_count, 'comments scraped:', comments_num, 'last page:', page)
        print('Comment scraping finished.')
        break
# if int(comments_count) * 0.99 <= comments_num <= int(comments_count) * 1.1:
#     print('Comment scraping finished.')
# else:
#     deviation_raw = (comments_num - int(comments_count)) / int(comments_count) * 100
#     deviation = '%.2f' % deviation_raw
#     print('Scraped comment count deviates by more than one percent! Deviation:', str(deviation) + '%')
#     exit(1)
create_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # time this scrape was run
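# A minimal persistence sketch: the original script only prints the article-level
# fields, so flushing them to a one-row CSV here is an illustrative addition
# (the filename is made up, not part of the original).
import csv

with open('bilibili_article.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['taskName', 'platform_name', 'item_id', 'title', 'article_url',
                     'content', 'media_name', 'read_count', 'comments_count',
                     'share_count', 'like_count', 'publish_time', 'create_time'])
    writer.writerow([taskName, platform_name, item_id, title, article_url,
                     content, media_name, read_count, comments_count,
                     share_count, like_count, publish_time, create_time])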