B站 (Bilibili) Crawler

# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import requests
import time
import json
import random
import re
import math

# url = 'https://www.bilibili.com/video/BV1tu411Z7Kb'  # alternate test video
# url = 'https://www.bilibili.com/video/BV1fM4y1V75r'  # alternate test video
url = 'https://www.bilibili.com/video/BV16g411G7T3'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
cookie = 'sn-assist={"show":false,"audio":false,"speed":"middle","zomm":1,"cursor":false,"pointer":false,"bigtext":false,"overead":false,"bgcolor":false}; isScale=false; _snvd=1634609426115AdhC1LXO4xC; tradeMA=137; cityId=9051; districtId=10346; hm_guid=fc453f86-178f-4108-963d-c6de8332e851; totalProdQty=0; SN_CITY=190_755_1000051_9051_01_10346_1_1_99_7550199; _df_ud=462966b2-f8b5-4d70-8607-852014a9a6e3; _snmc=1; _snsr=baidu|brand||title|suning:brand; authId=sin5aINSpE0u6Rl0Pc9aBk7S5MrM8WCoPa; secureToken=56DA93357439DB6E8EDFB00E413167DB; ssotbrd=TGTVyLKJDzszqNUH6W1V0CpV0pEm0IAfcVz3FSoW1eW; streetCode=7550199; _snzwt=THFA3x17c9b4393975Lmu34b3; _snms=16346924830053819; route=866d5a3e6e0a7894cd5f6be4e51b1c51; _snadtp=1; _snadid=HT_40097935_100000006_12203835633_20211020; smhst=12203835633|0071201482a12290322739|0071201482a12122946310|0000000000a12289097817|0071201482; _snma=1|163460942575786614|1634609425757|1634692548010|1634692575751|10|3; _snmp=163469257503650408; _snmb=163469246909449769|1634692575778|1634692575757|8; token=64466f3f-496d-4aad-ada1-dd1bcbc95d80'
headers = {'User-Agent': user_agent, 'Cookie': cookie}
wb_data = requests.get(url, headers=headers)
soup = BeautifulSoup(wb_data.text, 'html.parser')

# print(soup)  # debug: dump the fetched page

oid_raw = re.findall(r'window\.__INITIAL_STATE__={"aid":(\d+)', soup.text)
if oid_raw:
    oid = oid_raw[0]  # presumably the video's unique numeric ID (aid), used to build the comment API URLs
else:
    print('Failed to extract oid!')

taskName = ''  # task name
platform_name = 'B站'  # channel name
item_id = url.rsplit('/', 1)[-1]  # page ID (the BV id)
print('item_id', item_id)
if soup.select('span[class="tit"]'):
    title_raw = soup.select('span[class="tit"]')
else:
    title_raw = soup.select('span[class="tit tr-fix"]')  # content title
title = title_raw[0].get_text()  # content title
print('title', title)
article_url = url  # content link
content = soup.select('div[class="desc-info desc-v2 open"]')[0].text  # video description (body text)
if content == '':
    content = '无'  # placeholder for an empty description
print('content', content)
media_name_raw = soup.select('div[class="name"]')
if media_name_raw:
    media_name = media_name_raw[0].get_text().strip().split(' ')[0]  # single uploader's account name
    print('media_name', media_name)
else:
    media_name_raw = soup.select('div[id="member-container"]')[0].findAll(
        'div', class_="avatar-name__container")  # creator team (multiple uploaders)
    for item in media_name_raw:
        media_name = item.get_text()
        print('media_name', media_name)

read_count = soup.select('span[class="view"]')[0].get_text().split('播放')[0]  # view (play) count
print('read_count', read_count)
comments_count = re.findall(r'"stat":{\S+?"reply":(\d+)', soup.text)[0]  # total comment count
print('comments_count', comments_count)
share_count = re.findall(r'"stat":{\S+?"share":(\d+)', soup.text)[0]  # share count
print('share_count', share_count)
like_count = re.findall(r'"stat":{\S+?"like":(\d+)', soup.text)[0]  # like count
print('like_count', like_count)
datetime = soup.select('div[class="video-data"]')[0].findAll('span')[2].get_text()  # upload time
print('datetime', datetime)
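# Aside: each regex above re-scans the same embedded JSON blob. A more robust
# sketch, assuming the page still embeds `window.__INITIAL_STATE__={...};(function...`
# on a single line and keeps a videoData.stat layout (both are assumptions),
# parses the blob once instead of running one regex per field:
#
#     state_match = re.search(r'__INITIAL_STATE__=(.*?);\(function', soup.text)
#     if state_match:
#         stat = json.loads(state_match.group(1))['videoData']['stat']
#         # stat['view'], stat['reply'], stat['share'], stat['like']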

def get_comments_json_data(comments_url):
    """
    Fetch an asynchronously loaded comment page and decode it as JSON.
    :param comments_url: comment API URL
    :return: decoded JSON data
    """
    time.sleep(random.random() + 1)  # throttle: wait 1-2 s between requests
    comments_data = requests.get(comments_url, headers=headers)
    comments_soup = BeautifulSoup(comments_data.text, 'html.parser')
    return json.loads(comments_soup.text)
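# Note: requests can decode JSON responses directly, so the BeautifulSoup
# round-trip above is not strictly needed for a pure JSON endpoint. Equivalent:
#
#     return requests.get(comments_url, headers=headers).json()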

def comment_time_handle(para_time):
    """
    Convert a numeric timestamp from the page into a readable date.
    :param para_time: Unix timestamp in seconds
    :return: datetime string in %Y-%m-%d %H:%M:%S format
    """
    tupTime = time.localtime(int(para_time))  # seconds-based Unix timestamp
    return time.strftime("%Y-%m-%d %H:%M:%S", tupTime)  # comment time
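# Example: comment_time_handle(1634692575) returns '2021-10-20 09:16:15' on a
# machine in UTC+8, since time.localtime() uses the local timezone.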

def variable_time_stamp():
    """
    Return the millisecond timestamp used in the comment API URLs.
    :return: 13-digit timestamp string
    """
    time_stamp_raw = str(time.time()).split('.')
    return time_stamp_raw[0] + time_stamp_raw[1][0:3]
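# Equivalent one-liner without the string surgery (milliseconds since epoch):
#
#     return str(int(time.time() * 1000))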

comments_url_base = 'https://api.bilibili.com/x/v2/reply/main?jsonp=jsonp&next={page}&type=1&oid={oid}&mode=3&plat=1&_={time_stamp}'
fold_url_base = 'https://api.bilibili.com/x/v2/reply/reply?jsonp=jsonp&pn={sub_page}&type=1&oid={oid}&ps=10&root={rpid}&_={time_stamp}'
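# Example of a fully formatted URL (the oid value here is hypothetical):
#
#     comments_url_base.format(page=1, oid=123456, time_stamp='1634692575123')
#     # -> 'https://api.bilibili.com/x/v2/reply/main?jsonp=jsonp&next=1&type=1'
#     #    '&oid=123456&mode=3&plat=1&_=1634692575123'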
comments_num = 0
page = 1

# Rough page estimate: the main comment endpoint returns 20 top-level comments per page.
print('math.ceil(int(comments_count) / 20)+1', math.ceil(int(comments_count) / 20) + 1)

while True:  # paginate until the API stops returning replies
    print('page——————>', page)
    comments_url = comments_url_base.format(page=page, oid=oid, time_stamp=variable_time_stamp())
    comments_json_data = get_comments_json_data(comments_url)  # one page of top-level comments, as JSON
    comments_list = comments_json_data.get('data').get('replies')
    if comments_list:
        time_stamp = variable_time_stamp()
        for item in comments_list:  # iterate over this page's top-level comments
            user_name = item.get('member').get('uname')  # commenter's account name
            comment_like_count = item.get('like')  # like count on the top-level comment
            comment_time = comment_time_handle(item.get('ctime'))  # top-level comment time
            text = item.get('content').get('message')  # top-level comment text
            print(user_name, comment_like_count, comment_time, text)
            comments_num += 1

            rpid = item.get('rpid')  # the comment's reply ID, passed as root= when fetching its sub-replies
            fold_url = fold_url_base.format(sub_page=1, oid=oid, rpid=rpid, time_stamp=time_stamp)
            fold_json_data = get_comments_json_data(fold_url)  # folded (collapsed) replies
            sub_comment_count = int(fold_json_data.get('data').get('page').get('count'))  # number of replies to this comment
            if sub_comment_count > 0:  # the comment has replies
                # print(fold_json_data)
                sub_comment_page = math.ceil(sub_comment_count / 10)  # 10 folded replies per page, rounded up
                for sub_page in range(1, sub_comment_page + 1):  # iterate over each page of folded replies
                    fold_url = fold_url_base.format(sub_page=sub_page, oid=oid, rpid=rpid, time_stamp=time_stamp)
                    fold_json_data = get_comments_json_data(fold_url)  # folded replies
                    fold_comments_list = fold_json_data.get('data').get('replies') or []
                    for sub_item in fold_comments_list:  # iterate over the current page's folded replies
                        sub_user_name = sub_item.get('member').get('uname')  # replier's account name
                        sub_comment_like_count = sub_item.get('like')  # like count on the reply
                        sub_comment_time = comment_time_handle(sub_item.get('ctime'))  # reply time
                        sub_text = sub_item.get('content').get('message')  # reply text
                        print(sub_user_name, sub_comment_like_count, sub_comment_time, sub_text)
                        comments_num += 1
        page += 1
    else:
        print('Total comments:', comments_count, 'Comments crawled:', comments_num, 'Current page:', page)
        print('Finished crawling comments.')
        break
        # if int(comments_count) * 0.99 <= comments_num <= int(comments_count) * 1.1:
        #     print('Finished crawling comments.')
        # else:
        #     deviation_raw = (comments_num - int(comments_count)) / int(comments_count) * 100
        #     deviation = '%.2f' % deviation_raw
        #     print('Crawled comment count deviates from the total by more than 1%! Deviation:', str(deviation) + '%')
        #     exit(1)

create_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # crawl record creation time

print(item_id, title, content, media_name, read_count, comments_count, like_count, datetime, sep='\n')
