# B站的弹幕的api是: "https://api.bilibili.com/x/v1/dm/list.so?oid=26495963"
# oid是视频的id
# 这个api用谷歌的开发者工具是看不到内容的,建议都用火狐来找api
from selenium import webdriver
from lxml import etree
import requests

# --- Bilibili danmaku (bullet-comment) scraper ---
# The comment list for a video is served as XML from this endpoint;
# each <d> element's text is one danmaku message.
url = "https://api.bilibili.com/x/v1/dm/list.so?oid=26495963"

# NOTE(review): the original also launched a PhantomJS browser here just to
# print type(driver.page_source) and then threw it away — requests alone
# downloads the same XML, so the redundant browser launch (and the unused
# {"cid": ...} dict) were removed.
response = requests.get(url)

# response.content is bytes; lxml reads the encoding from the XML
# declaration itself, so no manual decode is needed.
html = etree.HTML(response.content)
danmu = html.xpath('//d/text()')

# Write one danmaku message per line.
with open("danmu.txt", "w", encoding="utf-8") as fp:
    for message in danmu:
        fp.write(message + "\n")
# --- Douyu live-stream listing scraper ---
# Pages through the LOL category with a headless browser, saving a
# screenshot of every page.
from selenium import webdriver
from lxml import etree
import time
from bs4 import BeautifulSoup

driver = webdriver.PhantomJS(
    executable_path=r'D:\ysc桌面\Desktop\phantomjs-2.1.1-windows\bin\phantomjs.exe')
url = "https://www.douyu.com/g_LOL"
driver.get(url)
driver.save_screenshot("douyu.png")

page_no = 1
while True:
    html_tree = etree.HTML(driver.page_source)
    # On the last page the "next" anchor carries the
    # shark-pager-disable-next class; a non-empty match means we are done.
    disabled_next = html_tree.xpath(
        '//div[@id="J-pager"]//a[@class="shark-pager-next shark-pager-disable shark-pager-disable-next"]')
    if disabled_next:
        # Bug fix: the original commented this check out, leaving an
        # infinite loop that crashed with NoSuchElementException once the
        # "下一页" link disappeared on the final page.
        break
    driver.find_element_by_link_text("下一页").click()
    driver.implicitly_wait(3)
    print(page_no)
    driver.save_screenshot("douyu%d.png" % page_no)
    page_no += 1
    time.sleep(2)
# --- Build a word cloud from the scraped danmaku ---
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Mask image (binarized): white pixels are ignored, the rest receive words.
background_img = plt.imread(r'D:\ysc桌面\Desktop\timg.jpg')

# Bug fix: the original did open(...).read() without ever closing the file,
# leaking the handle — a with-block closes it deterministically.
with open("danmu.txt", "r", encoding="utf-8") as fp:
    text = fp.read()

wordcloud = WordCloud(
    background_color="white",                    # canvas colour behind the words
    mask=background_img,                         # shape mask for the cloud
    font_path=r'C:\Windows\Fonts\STFANGSO.TTF',  # CJK font so Chinese glyphs render
    width=1000,
    height=800,
    margin=2,
    min_font_size=10,
    max_words=100,
).generate(text)

plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# Persist the rendered cloud as an image file.
wordcloud.to_file("ciyun2.png")