python爬取B站弹幕制作词云

思路:

待续

 

效果:

python爬取B站弹幕制作词云_第1张图片python爬取B站弹幕制作词云_第2张图片

代码:

# -*- coding: utf-8 -*- 
# @Time : 2019/9/13 20:45 
# @Author : Zhao HL
# @File : jackMa.py 

'''
cid获取方法:
heartbeat-->Form Data-->cid
'''
import requests
import xml.sax
import xml.sax.handler
import pprint
import xml.etree.ElementTree as Etree
import jieba
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np




xml_url = 'http://comment.bilibili.com/116788897.xml'
xml_savePath = 'jackMa.txt'
# pic_path= 'jackMa.jpg'
pic_path= 'jackMa.png'

def getHTMLText(url):
    try:
        r = requests.get(url, timeout = 30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        #print(r.status_code)
        return r.text
    except:
        return "产生异常"

def xmlParse(xml):
    root = Etree.fromstring(xml)
    barrages = root.findall('d')
    with open('jackMa.txt', "w",encoding='utf-8') as f:
        for barrage in barrages:
            f.write(barrage.text+'\n')

def get_cloud():
    f = open(xml_savePath, 'r', encoding='UTF-8').read()
    cut_text = " ".join(jieba.cut(f))
    # print(cut_text)
    alice_mask = np.array(Image.open(pic_path))
    stopwords = set(STOPWORDS)
    stopwords.add("said")

    wordcloud = WordCloud(
        # 设置字体,不然会出现口字乱码,文字的路径是电脑的字体一般路径,可以换成别的
        font_path="C:/Windows/Fonts/simfang.ttf",
        # 设置了背景,宽高
        background_color="white", width=1000, height=880,mask=alice_mask,stopwords=stopwords).generate(cut_text)

    wordcloud.generate(cut_text)
    wordcloud.to_file(r"wordcloud.png")

    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# txt_xml = getHTMLText(xml_url)
# print(txt_xml)
# xmlParse(txt_xml)
get_cloud()

 

你可能感兴趣的:(pythonTrick)