使用 Selenium 爬取 B 站弹幕并制作词云

B站的弹幕的api是:"https://api.bilibili.com/x/v1/dm/list.so?oid=26495963"
oid是视频的id
这个 API 的返回内容在谷歌浏览器的开发者工具里看不到,建议用火狐浏览器来找 API。

from selenium import webdriver
from lxml import etree
import requests

# Scrape the Bilibili danmaku (bullet comments) of one video and save them,
# one per line, to danmu.txt.
url="https://api.bilibili.com/x/v1/dm/list.so?oid=26495963"

# cid == oid according to the original note.
# NOTE(review): this dict is never used below and its value differs from the
# oid in `url` -- confirm which id is intended.
a={"cid":26495965}

driver=webdriver.PhantomJS(executable_path=r'D:\ysc桌面\Desktop\phantomjs-2.1.1-windows\bin\phantomjs.exe')
try:
    driver.get(url)
    # A timeout keeps the script from hanging forever on a stalled connection.
    response=requests.get(url, timeout=10)
    # Fail loudly on an HTTP error instead of writing an error page to disk.
    response.raise_for_status()
    print(type(driver.page_source))  # str
    print(type(response.content))    # bytes
finally:
    # The browser is only needed for the type comparison above; always shut
    # it down so the PhantomJS process does not linger.
    driver.quit()

html=etree.HTML(response.content)

# Each danmaku is the text content of a <d> element in the response.
danmu=html.xpath('//d/text()')
with open("danmu.txt","w",encoding="utf-8") as fp:
    fp.writelines(line+"\n" for line in danmu)




#这个是斗鱼直播爬取
from selenium import webdriver
from lxml import etree
import time
from bs4 import BeautifulSoup


# Page through the Douyu LOL directory with PhantomJS, saving a screenshot
# after every click on the "next page" link, and stop on the last page.
driver=webdriver.PhantomJS(executable_path=r'D:\ysc桌面\Desktop\phantomjs-2.1.1-windows\bin\phantomjs.exe')
try:
    # The implicit wait is a session-wide setting, so set it once up front
    # rather than on every loop iteration.
    driver.implicitly_wait(3)

    url="https://www.douyu.com/g_LOL"
    driver.get(url)
    driver.save_screenshot("douyu.png")

    i=1
    while True:
        html_tree = etree.HTML(driver.page_source)
        # Presumably the pager's "next" link gains this class once it is
        # disabled on the last page (taken from the original XPath) --
        # verify against the live markup.
        ables = html_tree.xpath(
            '//div[@id="J-pager"]//a[contains(@class, "shark-pager-disable-next")]')
        if ables:
            # Without this exit the loop would keep clicking forever.
            break

        driver.find_element_by_link_text("下一页").click()
        print(i)
        driver.save_screenshot("douyu%d.png"%i)
        i += 1
        # Give the next page time to render before inspecting it again.
        time.sleep(2)
finally:
    # Always terminate the PhantomJS process, even if a lookup fails.
    driver.quit()



#根据我们爬下来的弹幕做一个词云

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Build a word cloud from the danmaku saved in danmu.txt.

# Mask image that shapes the cloud; the original note says it is a binarized
# (black/white) picture.
background_img=plt.imread(r'D:\ysc桌面\Desktop\timg.jpg')

# Read the downloaded danmaku text; the context manager closes the file
# handle instead of leaking it.
with open("danmu.txt","r",encoding="utf-8") as danmu_file:
    f = danmu_file.read()

# Generate the word cloud.
wordcloud=WordCloud(
    background_color="white",  # canvas background colour
    mask=background_img,  # shape mask
    font_path=r'C:\Windows\Fonts\STFANGSO.TTF',  # font file used to render the words
    # NOTE: per the wordcloud documentation, width/height are ignored when a
    # mask is supplied (the mask's shape is used); kept as in the original.
    width=1000,
    height=800,
    margin=2,
    min_font_size=10,
    max_words=100,
).generate(f)

# Save the image first: plt.show() blocks until the window is closed, and the
# file should exist even if the script is interrupted while it is open.
wordcloud.to_file("ciyun2.png")

plt.imshow(wordcloud)
plt.axis("off")
plt.show()

你可能感兴趣的:(python)