爬取B站弹幕生成词云

一、爬取弹幕

import requests
import json
import re

# Fetch a page with a browser-like User-Agent header.
def download_page(url):
    """
    :param url: URL to fetch
    :return: requests.Response for the GET request
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # Bug fix: the original `requests.get(url, headers)` passed the dict
    # positionally, which binds it to the `params` argument — the UA was sent
    # as a query string, not as a request header. Use the `headers` keyword.
    # A timeout keeps the scraper from hanging forever on a stalled connection.
    res = requests.get(url, headers=headers, timeout=10)
    return res

# Resolve a video's cid (danmaku pool id) from its av number.
def get_cid(av):
    """
    :param av: Bilibili av id, e.g. 'av95811021'
               (as in https://www.bilibili.com/video/av95811021)
    :return: cid of the first page (part) of the video
    """
    # Bug fix: str.strip('av') removes ANY leading/trailing 'a'/'v'
    # characters, not the 'av' prefix as a unit — strip the prefix explicitly.
    if av.startswith('av'):
        av = av[len('av'):]
    url = f'https://api.bilibili.com/x/player/pagelist?aid={av}&jsonp=jsonpa'
    res = download_page(url)
    res_dict = json.loads(res.text)
    # The pagelist endpoint returns one entry per video part; use the first.
    cid = res_dict['data'][0]['cid']
    return cid


# Download the danmaku XML for a cid and extract the comment texts.
def get_danmu(cid):
    """
    :param cid: danmaku pool id of the video
    :return: list of danmaku text strings
    """
    url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
    res = download_page(url)
    res_xml = res.content.decode('utf-8')
    # Bug fix: the pattern '(.*?)' can only match empty strings, so nothing
    # was ever extracted. In the danmaku XML every comment is wrapped as
    # <d p="...">text</d>; capture the tag content instead.
    pattern = re.compile(r'<d p=".*?">(.*?)</d>')
    danmu_list = pattern.findall(res_xml)
    return danmu_list

# Persist the danmaku list to disk, one comment per line.
def save_to_file(danmu_list, filename):
    """
    :param danmu_list: list of danmaku strings
    :param filename: path of the output text file
    """
    with open(filename, mode='w', encoding='utf-8') as out:
        out.writelines(danmu + '\n' for danmu in danmu_list)


def main(av):
    """Fetch all danmaku for the given av id and save them to '<av>.txt'."""
    save_to_file(get_danmu(get_cid(av)), f'{av}.txt')

if __name__ == '__main__':
    # Example video to scrape.
    main('av95811021')

二、生成词云

import jieba
from wordcloud import WordCloud

# Load the danmaku file as a single string.
def read_file(filename):
    """
    :param filename: path of the danmaku text file
    :return: the entire file contents as one string
    """
    with open(filename, mode='r', encoding='utf-8') as fh:
        return fh.read()


# Tokenize the danmaku text with jieba.
def jieba_cut(text):
    """
    :param text: full danmaku text to tokenize
    :return: list of tokens
    """
    # Idiom fix: the parameter was named `str`, shadowing the builtin;
    # renamed to `text` (callers invoke it positionally).
    # Keep these phrases as single tokens instead of letting jieba split them.
    jieba.suggest_freq('原声大碟', tune=True)
    jieba.suggest_freq('前方高能', tune=True)
    cut_list = jieba.lcut(text)
    return cut_list


# Render a word-cloud image from the token list.
def gen_word_cloud(cut_list, output='经典对线.png'):
    """
    :param cut_list: list of tokens produced by jieba
    :param output: path of the PNG to write
                   (defaulted to preserve the original behavior)
    """
    # WordCloud splits on whitespace, so join the tokens with spaces.
    word_str = '  '.join(cut_list)
    wc_settings = {
        'font_path' : 'msyh.ttc',        # font with CJK glyph support
        'width' : 800,                   # image width
        'height' : 600,                  # image height
        'max_words' : 200,               # maximum number of words shown
        'background_color' : 'white'     # background color
    }
    # Build the word-cloud object and save the rendered image.
    wc = WordCloud(**wc_settings).generate(word_str)
    wc.to_file(output)

def main(av):
    """Build a word cloud from the danmaku file previously saved for *av*."""
    danmu_text = read_file(f'{av}.txt')
    gen_word_cloud(jieba_cut(danmu_text))


if __name__ == '__main__':
    # Must match the av id used when scraping, so '<av>.txt' exists.
    main('av95811021')


爬取B站弹幕生成词云_第1张图片
改进:
备注:需要一张背景为白色的图片

import jieba
from wordcloud import WordCloud
import numpy as np
from PIL import Image

# Read the saved danmaku file back into memory.
def read_file(filename):
    """
    :param filename: path of the danmaku text file
    :return: the entire file contents as one string
    """
    with open(filename, encoding='utf-8') as source:
        contents = source.read()
    return contents


# Tokenize the danmaku text with jieba.
def jieba_cut(text):
    """
    :param text: full danmaku text to tokenize
    :return: list of tokens
    """
    # Idiom fix: the parameter was named `str`, shadowing the builtin;
    # renamed to `text` (callers invoke it positionally).
    # Keep these phrases as single tokens instead of letting jieba split them.
    jieba.suggest_freq('原声大碟', tune=True)
    jieba.suggest_freq('前方高能', tune=True)
    cut_list = jieba.lcut(text)
    return cut_list


# Render a masked word-cloud image from the token list.
def gen_word_cloud(cut_list, mask_image='ysg.jpg', output='经典对线.png'):
    """
    :param cut_list: list of tokens produced by jieba
    :param mask_image: white-background image whose non-white area shapes
                       the cloud (defaulted to preserve the original behavior)
    :param output: path of the PNG to write
                   (defaulted to preserve the original behavior)
    """
    # WordCloud splits on whitespace, so join the tokens with spaces.
    word_str = '  '.join(cut_list)
    # White pixels in the mask are treated as background by WordCloud.
    mask = np.array(Image.open(mask_image))
    wc_settings = {
        'font_path' : 'msyh.ttc',    # font with CJK glyph support
        'width' : 800,
        'height' : 600,
        'max_words' : 600,
        'background_color' : 'white',
        'mask' : mask,               # constrain layout to the mask shape
        'colormap' : 'Reds',         # red color scheme
        'contour_width' : 1,         # draw the mask outline
        'contour_color' : 'red',
        'collocations' : False       # suppress duplicated bigrams
    }
    # Build the word-cloud object and save the rendered image.
    wc = WordCloud(**wc_settings).generate(word_str)
    wc.to_file(output)

def main(av):
    """Build a masked word cloud from the danmaku file saved for *av*."""
    gen_word_cloud(jieba_cut(read_file(f'{av}.txt')))


if __name__ == '__main__':
    # Must match the av id used when scraping, so '<av>.txt' exists.
    main('av95811021')


爬取B站弹幕生成词云_第2张图片

你可能感兴趣的:(词云,爬虫)