前言:前几天爬取了起点中文网的大部分排行榜数据,今天使用 matplotlib、wordcloud 和 jieba 对这些数据做了可视化展示。
以下为具体代码实现:
import matplotlib.pyplot as plt
import numpy as np
# Global matplotlib configuration for the charts below:
#   * 'SimHei' is used as the sans-serif font so Chinese labels display.
#   * 'axes.unicode_minus' is disabled so the minus sign displays correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# The one figure that hosts every subplot added afterwards.
figure = plt.figure()
# Subplot 1 (cell 1 of a 3x2 grid): number of authors at each level,
# drawn as a red line on top of a bar chart.
p1 = figure.add_subplot(3, 2, 1)
# Each pair is (author count, level); transposing the array gives one row
# per field, which is unpacked directly.
author_counts, levels = np.array(
    ((519, 1), (78, 2), (17, 3), (19, 4), (11, 5), (25, 6), (6, 7), (15, 8))
).T
bar_colors = np.array(
    ['#E9967A', '#FFDAB9', '#87CEEB', '#FF1493',
     '#778899', '#008000', '#F0FFF0', '#40E0D0']
)
p1.plot(levels, author_counts, 'r-', label="不同等级作家人数图")  # line chart
p1.bar(levels, author_counts, width=0.3, color=bar_colors)  # bar chart
p1.grid(linestyle='--')
p1.legend(loc="best")
p1.set_ylabel("人数")
p1.set_xlabel("等级")
p1.set_xticks(levels)
# Echo every data point and annotate it slightly above its bar.
for level, count in zip(levels, author_counts):
    print(level, count)
    plt.text(level, count + 0.05, "{0}人".format(count),
             ha='center', va='bottom')
# Subplot 2 (cell 2 of a 3x2 grid): average score of each novel category,
# drawn as a green dashed line on top of a bar chart.
p2 = figure.add_subplot(3, 2, 2)
# (average score, category name) pairs.
category_scores = (
    (4.62755905511811, '仙侠'), (1.0, '体育'), (4.585, '军事'),
    (5.538461538461537, '历史'), (2.397297297297298, '奇幻'),
    (7.187499999999999, '悬疑'), (1.362962962962963, '武侠'),
    (4.510389610389609, '游戏'), (5.8419213973799105, '玄幻'),
    (2.25, '现实'), (0.0, '短篇'), (4.866901408450705, '科幻'),
    (1.2220689655172414, '轻小说'), (4.422267206477732, '都市'),
)
# Scores and names are kept in separate containers so the numbers are never
# coerced to strings; the builtin float replaces the np.float alias, which
# was removed in NumPy 1.24.
category_names = [name for _, name in category_scores]
yData = np.array([score for score, _ in category_scores], dtype=float)
xData = np.arange(1, len(category_scores) + 1)
p2.plot(xData, yData, 'g--', label="不同类别小说的平均分数")
color = ['#00BFFF', '#FAA460', '#FFFFE0', '#FFFFF0', '#BA55D3', '#A52A2A',
         '#EEE8AA', '#1E90FF', '#FDF5E6', '#FFFFFF', '#ADD8E6', '#FFEBCD',
         '#6A5ACD', '#FFB6C1']
p2.bar(xData, yData, width=0.3, color=color)
p2.grid(linestyle='--')
p2.legend(loc="best")
p2.set_ylabel("小说平均分数")
p2.set_xlabel("小说类别")
p2.set_xticks(xData)
p2.set_xticklabels(category_names)
# Echo every data point and annotate it with its score rounded to three
# decimals. The text is attached to p2 explicitly rather than to whichever
# axes pyplot currently considers active.
for x, y in zip(xData, yData):
    print(x, y)
    p2.text(x, y + 0.05, "{0}分".format(np.around(y, decimals=3)),
            ha='center', va='bottom')
# Subplot 3 (cell 3 of a 3x2 grid): number of novels in each category,
# drawn as a blue dashed line on top of a bar chart.
p3 = figure.add_subplot(3, 2, 3)
# (novel count, category name) pairs.
category_counts = (
    (127, '仙侠'), (10, '体育'), (20, '军事'), (78, '历史'), (37, '奇幻'),
    (24, '悬疑'), (27, '武侠'), (77, '游戏'), (229, '玄幻'), (4, '现实'),
    (2, '短篇'), (142, '科幻'), (145, '轻小说'), (247, '都市'),
)
category_names = [name for _, name in category_counts]
# Counts stay integers: the removed np.float alias (gone since NumPy 1.24)
# is no longer needed, and the annotations read "127本" rather than
# "127.0本".
yData = np.array([count for count, _ in category_counts])
xData = np.arange(1, len(category_counts) + 1)
p3.plot(xData, yData, 'b--', label="不同类别小说的数量")
color = ['#5F9EA0', '#ADD8E6', '#FFFACD', '#E0FFFF', '#B22222', '#BC8F8F',
         '#9370DB', '#FFFFE0', '#F0FFFF', '#FAFAD2', '#B8860B', '#9ACD32',
         '#191970', '#696969']
p3.bar(xData, yData, width=0.3, color=color)
p3.grid(linestyle='--')
p3.legend(loc="best")
p3.set_ylabel("小说数量")
p3.set_xlabel("小说类别")
p3.set_xticks(xData)
p3.set_xticklabels(category_names)
# Echo every data point and annotate it with its count. The text is
# attached to p3 explicitly rather than to pyplot's current axes.
for x, y in zip(xData, yData):
    print(x, y)
    p3.text(x, y + 0.05, "{0}本".format(y), ha='center', va='bottom')
# Subplot 4 (cell 4 of a 3x2 grid): pie chart of the share of authors
# holding each title.
p4 = figure.add_subplot(3, 2, 4)
# (author count, title) pairs.
title_counts = ((201, '大神'), (838, '暂无称号'), (130, '白金'))
# Normalise the counts to fractions of the total. The builtin float
# replaces the np.float alias, which was removed in NumPy 1.24.
data1 = np.array([count for count, _ in title_counts], dtype=float)
data1 = data1 / np.sum(data1)
# Wedge labels look like "<title>-<percent> %". They are built in one
# comprehension so no variable named `list` shadows the builtin.
labels = [
    "{0}-{1}".format(title, '%.2f %%' % (share * 100))
    for (_, title), share in zip(title_counts, data1)
]
p4.pie(data1, colors=['#FFFF00', '#00FF7F', '#ADD8E6'], labels=labels)  # pie chart
p4.legend(loc="best")
# Subplot 5 (cell 6 of a 3x2 grid): pie chart of each category's share of
# all novels, followed by displaying the finished figure.
p5 = figure.add_subplot(3, 2, 6)
# (novel count, category name) pairs.
category_counts = (
    (127, '仙侠'), (10, '体育'), (20, '军事'), (78, '历史'), (37, '奇幻'),
    (24, '悬疑'), (27, '武侠'), (77, '游戏'), (229, '玄幻'), (2, '短篇'),
    (142, '科幻'), (145, '轻小说'), (247, '都市'), (4, '现实'),
)
# Normalise the counts to fractions of the total. The builtin float
# replaces the np.float alias, which was removed in NumPy 1.24.
data1 = np.array([count for count, _ in category_counts], dtype=float)
data1 = data1 / np.sum(data1)
# Wedge labels look like "<category>-<percent> %". They are built in one
# comprehension so no variable named `list` shadows the builtin.
labels = [
    "{0}-{1}".format(name, '%.2f %%' % (share * 100))
    for (_, name), share in zip(category_counts, data1)
]
colors = ['#556B2F', '#808000', '#5F9EA0', '#FFDAB9', '#F8F8FF', '#FFF5EE',
          '#FFA07A', '#FF00FF', '#FFEFD5', '#00008B', '#F0FFFF', '#FFFF00',
          '#A52A2A', '#FFC0CB']
p5.pie(data1, colors=colors, labels=labels)
# Title this axes explicitly instead of relying on pyplot's current axes.
p5.set_title("起点各类小说占比图")
plt.show()
我将爬取到的所有小说简介保存为一个 .txt 文件,并对其做了词云分析。
以下为代码实现:
#删除停用词和分词
import jieba
# Build the stop-word list
def stopwordslist():
    """Return the stop words read from ``stopwords.txt``.

    The file is read from the current working directory, one entry per
    line; surrounding whitespace is stripped from every line.

    Returns:
        list[str]: The stripped lines, in file order (blank lines become
        empty strings).

    Raises:
        OSError: If ``stopwords.txt`` cannot be opened.
    """
    # NOTE(review): ISO-8859-1 cannot represent Chinese characters. If
    # stopwords.txt holds UTF-8 Chinese words, the decoded entries will not
    # match the tokens produced by jieba -- confirm the file's real encoding.
    # The context manager closes the handle, which the bare open() never did.
    with open('stopwords.txt', encoding='ISO-8859-1') as stopword_file:
        stopwords = [line.strip() for line in stopword_file]
    return stopwords
# Segment a sentence into Chinese words
def seg_depart(sentence):
    """Segment *sentence* with jieba and drop unwanted tokens.

    The stripped sentence is cut with ``jieba.cut(..., cut_all=True)``.
    A token is discarded when it appears in ``stopwordslist()`` or in the
    hard-coded exclusion set below.

    Args:
        sentence: One line of text to segment.

    Returns:
        str: The kept tokens, each followed by a single space (so a
        non-empty result ends with a trailing space); ``''`` when no token
        is kept.
    """
    print("正在分词")
    sentence_depart = jieba.cut(sentence.strip(), cut_all=True)
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list for every token.
    stopwords = set(stopwordslist())
    # Tokens that are always dropped. '\u3000' is the ideographic space the
    # original filter meant to remove; it was written as the literal text
    # 'u3000', which is kept here as well so previous behaviour still holds.
    excluded = {'\t', '\u3000', 'u3000', '2019', '2018',
                '暂无', '累积', '获得', '月票'}
    kept = [word for word in sentence_depart
            if word not in stopwords and word not in excluded]
    # Every kept token is followed by one space, as in the original output.
    return "".join(word + " " for word in kept)
# Paths of the raw input document and of the segmented output.
filename = "a.txt"
outfilename = "out.txt"
# Segment the input line by line and write each result to out.txt.
# The with-statement closes both files even if seg_depart raises midway;
# the explicit close() calls were skipped in that case.
with open(filename, 'r', encoding='UTF-8') as inputs, \
        open(outfilename, 'w', encoding='UTF-8') as outputs:
    for line in inputs:
        line_seg = seg_depart(line)
        outputs.write(line_seg + '\n')
        print("-------------------正在分词和去停用词-----------")
print("删除停用词和分词成功!!!")
#使用wordcloud进行词云展示
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud,ImageColorGenerator
# Directory containing this script; out.txt is looked up next to it.
d = path.dirname(__file__)
# Read the segmented text; the context manager closes the handle, which
# the bare open(...).read() never did.
with open(path.join(d, 'out.txt'), encoding='utf-8') as text_file:
    text = text_file.read()
import jieba
# Cut the text with jieba and join the tokens with spaces.
wordlist = jieba.cut(text, cut_all=True)
wl = " ".join(wordlist)
# NOTE(review): 'c.jpg' is resolved against the current working directory,
# whereas out.txt is resolved against the script directory -- confirm this
# difference is intended.
coloring = np.array(Image.open('c.jpg'))
# The mask parameter sets the shape of the word cloud.
wc = WordCloud(background_color="white", max_words=100, mask=coloring,
               max_font_size=50, random_state=50,
               font_path='C:/Windows/Fonts/simkai.ttf').generate(wl)
# Create a colour generator from the image.
# NOTE(review): image_colors is built but never applied to wc; presumably a
# wc.recolor(color_func=image_colors) call was intended -- confirm.
image_colors = ImageColorGenerator(coloring)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()