python 网页爬取数据生成文字云图

1. 需要的三个包:

from wordcloud import WordCloud        #词云库
import matplotlib.pyplot as plt        #数学绘图库
import jieba;

2. 定义变量(将对应的变量放到一个全局的文件中,即下文从 GrabData 导入的 Param;抓取代码会用到 Param.pdurl_first、Param.head、Param.cookies 和 Param.reg):

3. 抓取数据

import requests;
import re;
from GrabData import Param;
import pandas as pd;
from bs4 import BeautifulSoup;

class GrabComent:
    """Scrape comment pages and append the regex matches to a CSV file.

    Instantiating the class runs the whole crawl: it fetches
    ``Param.pdurl_first``, extracts every match of ``ren`` from the page,
    appends the matches to ``csv_path`` and then follows the next-page link
    until a request fails or no further link is found.

    ``Param`` comes from the project-local ``GrabData`` module; the code
    reads ``Param.pdurl_first``, ``Param.head``, ``Param.cookies`` and
    ``Param.reg`` from it.
    """

    # NOTE(review): this pattern looks damaged -- the HTML tags that
    # presumably separated the capture groups seem to have been stripped
    # when the article was extracted. It is kept exactly as published
    # (the blank lines are spelled ``\n``); restore it from the real page
    # markup before relying on the captured columns.
    ren = re.compile(
        r'(.*?).*?comment">.*?.*?(.*?).*?(.*?).*?title="(.*?)">.*?title="(.*?)">'
        r'\n\n(.*?)\n\n',
        re.S)

    # Comment-list URL that the next-page fragment is appended to.
    base_url = 'https://movie.douban.com/subject/26363254/comments'
    # Output file; rows are appended so repeated runs accumulate data.
    csv_path = 'H:\\python_projects\\ticket\\zhanlangpinglun.csv'
    # Seconds to wait for a response; without it a stalled connection
    # would block the crawl forever.
    request_timeout = 10

    def __init__(self):
        """Run the crawl, appending one batch of rows per fetched page."""
        print('开始抓取数据')
        html = requests.get(Param.pdurl_first, headers=Param.head,
                            cookies=Param.cookies,
                            timeout=self.request_timeout)
        while html.status_code == 200:
            zhanlang = self.ren.findall(html.text)
            print(zhanlang)
            # Save the current page first so that the last page is not lost
            # when no next-page link can be found.
            # 'a+' opens the CSV in append mode.
            pd.DataFrame(zhanlang).to_csv(self.csv_path, header=False,
                                          index=False, mode='a+')

            # presumably Param.reg captures the next page's query string --
            # verify against GrabData.Param
            next_links = re.findall(Param.reg, html.text)
            if not next_links:
                # No further page: stop instead of raising IndexError.
                break
            url_next = self.base_url + next_links[0]
            print("下一页地址:"+url_next)
            html = requests.get(url_next, cookies=Param.cookies,
                                headers=Param.head,
                                timeout=self.request_timeout)


if __name__ == '__main__':
    GrabComent()

 

4. 生成云图

from wordcloud import WordCloud        #词云库
import matplotlib.pyplot as plt        #数学绘图库
import jieba;

class WordYun:
    """Build and display a word cloud from the scraped comments file.

    Instantiating the class runs the whole pipeline: read the CSV, split
    its text into words, and show the resulting cloud in a matplotlib
    window.
    """

    # Default input file (single backslashes; the original raw string
    # doubled them inside the path itself).
    csv_path = 'H:\\python_projects\\ticket\\zhanlangpinglun.csv'
    # Font handed to WordCloud; without it the words render as empty boxes.
    font_path = "D:\\Windows\\Fonts\\STFANGSO.ttf"

    def __init__(self):
        """Announce the start and immediately run ``main``."""
        print("开始读取文件!")
        self.main()

    def main(self):
        """Read the words from the default file and display the cloud."""
        text = self.readFile()
        self.showTitle(text)

    def showTitle(self, text1):
        """Render ``text1`` as a word cloud and show it.

        Args:
            text1: The text to draw, passed as a single ``str`` to
                ``WordCloud.generate``.
        """
        cloud = WordCloud(
            background_color="white",
            width=1000,
            height=860,
            font_path=self.font_path,
            margin=2)

        # generate() returns the WordCloud object itself, ready for imshow.
        image = cloud.generate(text1)
        plt.imshow(image)
        plt.axis("off")
        plt.show()

    def readFile(self, path=None, tokenizer=None):
        """Return the words of the comments file joined by single spaces.

        Args:
            path: File to read. Defaults to ``self.csv_path``.
            tokenizer: Callable that takes the file's text and returns an
                iterable of words. Defaults to ``jieba.cut``.

        Returns:
            The words longer than one character, separated by spaces.
        """
        if path is None:
            path = self.csv_path
        if tokenizer is None:
            tokenizer = jieba.cut

        # pandas writes CSV as UTF-8 by default, so decode it explicitly
        # instead of relying on the platform's locale encoding; the
        # ``with`` block also guarantees the handle is closed.
        with open(path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Single-character tokens (punctuation, particles) are dropped.
        txt = ' '.join(word for word in tokenizer(content) if len(word) > 1)
        print("readFile返回的结果:"+txt)
        return txt

# Run the word-cloud pipeline only when this file is executed as a script.
if __name__ == "__main__":
    WordYun()

 

转载于:https://www.cnblogs.com/wangshunyao/p/7534883.html

你可能感兴趣的:(python 网页爬取数据生成文字云图)