#bilibili弹幕抓取
import requests
import re
from bs4 import BeautifulSoup
import operator#排序
def getHTMLText(url):
try:
print("获取url中...")
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}
r = requests.get(url,headers=headers)
print("获取url完成")
return r.text
except:
print("获取Url失败")
def parsePage(text):
try:
print("解析文本...")
keyStr = re.findall(r'upgcxcode/78/39/[\d]*',text)#B站有两种寻址方式,第二种多一些
key = eval(keyStr[0].split('/')[3])
print(key)
commentUrl = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + str(key) # 弹幕存储地址
res=requests.get(commentUrl)
res.encoding = 'utf-8'
commentText=res.text
soup = BeautifulSoup(commentText, "html.parser")
soup2=BeautifulSoup(text,"html.parser")
commentList={}
title=soup2.find('h1').get_text().strip()#find()方法,获取文本,去掉空格
for comment in soup.find_all('d'):
time=float(comment.attrs['p'].split(',')[0])#tag.attrs(标签属性,字典类型)
commentList[time]=comment.string
newDict=sorted(commentList.items(),key=operator.itemgetter(0))#字典排序
commentList=dict(newDict)
print("解析文本完成")
return commentList,title
except:
print("解析失败")
def float2time(f):
timePlus=int(f)
m=timePlus//60
s=timePlus-m*60
return str(m)+':'+str(s).zfill(2)
def ioFunc(commentList,title,root):
print("写入文本中...")
path = root + "\\" + title + '.txt'
print(path)
f = open(path, 'w',encoding='utf-8')#windows默认gbk编码输出,与网络编码“utf-8”不符
begin = "{}\n共有{}条弹幕\n".format(title, len(commentList))
f.write(begin)
ws = "{:7}\t{}\n".format('time', 'comment')
f.write(ws)
lastTime=0
for time,string in commentList.items():#记得items()
lastTime = float2time(time)
ws = "{:7}\t{}\n".format(lastTime,string)
f.write(ws) # 手动换行
f.close()
def main():
# av =input('Put in av number: ') # 视频地址
av = 77282475
url=r"https://www.bilibili.com/video/av"+str(av)
root = r"E:\1"
text=getHTMLText(url)
commentList,title=parsePage(text)
ioFunc(commentList, title, root)
print("Finish.")
main()