Python analysis of 《倚天屠龙记》 (Heaven Sword and Dragon Saber): character relationships, word cloud, bar charts, and word frequency
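The complete script is below. It reads the novel text, counts word and character-name frequencies with jieba part-of-speech tagging, renders bar charts and word clouds with pyecharts and the wordcloud package, and draws a character co-occurrence graph with networkx and matplotlib.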
import os
import re
import random
import codecs

import networkx as nx
import matplotlib.pyplot as plt
import jieba.posseg as pseg
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.charts import WordCloud
import wordcloud
import imageio
keshihuaTop = 10  # how many top items to show in the bar charts
mainTop = 100     # how many top characters to print to the console
peopleTop = 10    # how many top characters to use in the relationship graph
# Read the full novel text
fn = open('prepare/yitiantulongji.txt', encoding="utf-8")
string_data = fn.read()
fn.close()
# Strip punctuation; newlines are deliberately kept so the text can later be
# split into paragraphs when building the co-occurrence relationship graph
pattern = re.compile(u'\t|\.|-|:|;|\)|\(|\?|"')
txt = re.sub(pattern, '', string_data)
print('预处理完毕')
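# --- Word/POS frequency: count every non-stopword token, keyed as "word_flag" ---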
def stopwordslist(filepath):
    # Load one stopword per line
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords

stopwords = stopwordslist('prepare/tingyong.txt')
counts1 = {}  # "word_POS" -> frequency
counts2 = {}  # character name -> frequency
def getWordTimes1():
    cutFinal = pseg.cut(txt)
    for w in cutFinal:
        if w.word in stopwords or w.word is None:
            continue
        real_word = w.word + '_' + w.flag
        counts1[real_word] = counts1.get(real_word, 0) + 1

getWordTimes1()
items1 = list(counts1.items())
items1.sort(key=lambda x: x[1], reverse=True)
def wordFreq1(filepath, topn1):
    # Write the top-N "word_POS:count" pairs to filepath
    topn1 = min(topn1, len(items1))
    with codecs.open(filepath, "w", "utf-8") as f:
        for i in range(topn1):
            word, count = items1[i]
            f.write("{}:{}\n".format(word, count))

wordFreq1("output/倚天屠龙记词频词性.txt", 300)
# Read the frequency file back into an ordered dict of word -> count
fr1 = open('output/倚天屠龙记词频词性.txt', 'r', encoding='utf-8')
dic1 = {}
keys1 = []
for line in fr1:
    v1 = line.strip().split(':')
    dic1[v1[0]] = int(v1[1])
    keys1.append(v1[0])
fr1.close()
list_name1 = list(dic1.keys())
list_name_times1 = list(dic1.values())
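# --- Bar chart of the top word/POS frequencies (pyecharts) ---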
def create_wordproperties():
    bar1 = Bar()
    bar1.add_xaxis(list_name1[0:keshihuaTop])
    # Slice the y values to the same top-N as the x axis
    bar1.add_yaxis("词语出现次数", list_name_times1[0:keshihuaTop])
    bar1.set_global_opts(title_opts=opts.TitleOpts(title="词频词性可视化图", subtitle="词频词性top10"),
                         xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 45}))
    bar1.set_series_opts(label_opts=opts.LabelOpts(position="top"))
    bar1.render("output/倚天屠龙记词频词性可视化图.html")
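# --- Character-name frequency: keep only tokens tagged 'nr' (person names) ---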
def getWordTimes2():
    poss = pseg.cut(txt)
    for w in poss:
        # Keep only person names (POS tag 'nr') of at least two characters
        if w.flag != 'nr' or len(w.word) < 2:
            continue
        real_word = w.word
        counts2[real_word] = counts2.get(real_word, 0) + 1

getWordTimes2()
items2 = list(counts2.items())
items2.sort(key=lambda x: x[1], reverse=True)
def wordFreq2(filepath, topn):
    # Write the top-N "name:count" pairs to filepath
    topn = min(topn, len(items2))
    with codecs.open(filepath, "w", "utf-8") as f:
        for i in range(topn):
            word, count = items2[i]
            f.write("{}:{}\n".format(word, count))

wordFreq2("output/倚天屠龙记词频_人名.txt", 300)
fr = open('output/倚天屠龙记词频_人名.txt', 'r', encoding='utf-8')
dic = {}
keys = []
for line in fr:
    v = line.strip().split(':')
    dic[v[0]] = int(v[1])
    keys.append(v[0])
fr.close()
print("人物出现次数TOP", mainTop)
print(list(dic.items())[:mainTop])
list_name = list(dic.keys())
list_name_times = list(dic.values())
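# --- Bar chart of character appearances, plus word clouds (wordcloud + pyecharts) ---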
def creat_people_view():
    bar = Bar()
    bar.add_xaxis(list_name[0:keshihuaTop])
    # Slice the y values to the same top-N as the x axis
    bar.add_yaxis("人物出场次数", list_name_times[0:keshihuaTop])
    bar.set_global_opts(title_opts=opts.TitleOpts(title="人物出场次数可视化图", subtitle="倚天屠龙记人物TOP10"),
                        xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 45}))
    bar.set_series_opts(label_opts=opts.LabelOpts(position="top"))
    bar.render("output/倚天屠龙记人物出场次数可视化图.html")
def creat_wordcloud():
    # Use the mask image to shape the word cloud
    bg_pic = imageio.imread('prepare/setting.png')
    wc = wordcloud.WordCloud(
        font_path=r'C:\Windows\Fonts\simhei.ttf',  # a Chinese-capable font is required
        background_color=None,
        mode="RGBA",  # a transparent background needs RGBA mode
        width=480, height=853,
        contour_color="red", contour_width=8,
        max_words=500,
        max_font_size=100,
        mask=bg_pic
    )
    wc.generate_from_frequencies(counts2)
    wc.to_file('output/倚天屠龙记词云_人名.png')
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
def creat_wordcloud_pyecharts():
    wordsAndTimes = list(dic.items())
    (
        WordCloud()
        .add(series_name="人物次数", data_pair=wordsAndTimes,
             word_size_range=[20, 100], textstyle_opts=opts.TextStyleOpts(font_family="cursive"))
        .set_global_opts(title_opts=opts.TitleOpts(title="倚天屠龙记词云"))
        .render("output/倚天屠龙记词云_人名.html")
    )
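# --- Relationship graph: paragraph-level co-occurrence of the top characters, drawn with networkx ---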
colorNum = len(list_name[0:peopleTop])
def randomcolor():
    colorArr = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F']
    color = ""
    for i in range(6):
        color += colorArr[random.randint(0, 14)]
    return "#" + color

def color_list():
    colorList = []
    for i in range(colorNum):
        colorList.append(randomcolor())
    return colorList
plt.rcParams['font.sans-serif'] = ['SimHei']  # matplotlib needs a Chinese-capable font for the node labels
def creat_relationship():
    colors = color_list()
    Names = list_name[0:peopleTop]
    relations = {}
    # Count how often each pair of top characters appears in the same paragraph
    lst_para = txt.split('\n')
    for text in lst_para:
        for name_0 in Names:
            if name_0 in text:
                for name_1 in Names:
                    if name_1 in text and name_0 != name_1 and (name_1, name_0) not in relations:
                        relations[(name_0, name_1)] = relations.get((name_0, name_1), 0) + 1
    # Normalise the co-occurrence counts to [0, 1]
    maxRela = max([v for k, v in relations.items()])
    relations = {k: v / maxRela for k, v in relations.items()}
    plt.figure(figsize=(15, 15))
    G = nx.Graph()
    for k, v in relations.items():
        G.add_edge(k[0], k[1], weight=v)
    # Split the edges into strong / medium / weak ties by weight
    elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > 0.6]
    emidle = [(u, v) for (u, v, d) in G.edges(data=True) if (d['weight'] > 0.3) & (d['weight'] <= 0.6)]
    esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] <= 0.3]
    pos = nx.spring_layout(G)
    # Slice the colour list in case some of the top characters have no edges
    nx.draw_networkx_nodes(G, pos, alpha=0.8, node_size=1300, node_color=colors[:len(G.nodes())])
    nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2.5, alpha=0.9, edge_color='g')
    nx.draw_networkx_edges(G, pos, edgelist=emidle, width=1.5, alpha=0.6, edge_color='y')
    nx.draw_networkx_edges(G, pos, edgelist=esmall, width=1, alpha=0.4, edge_color='b', style='dashed')
    nx.draw_networkx_labels(G, pos, font_size=14)
    plt.title("《倚天屠龙记》主要人物社交关系网络图")
    plt.axis('off')
    plt.savefig('output/《倚天屠龙记》主要人物社交关系网络图.png', bbox_inches='tight')
    plt.show()
def main():
    # Make sure the output directory exists before any file is written
    os.makedirs('output', exist_ok=True)
    create_wordproperties()
    creat_people_view()
    creat_wordcloud()
    creat_wordcloud_pyecharts()
    creat_relationship()

if __name__ == '__main__':
    main()
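Assumed inputs: prepare/yitiantulongji.txt (the novel text), prepare/tingyong.txt (a stopword list, one word per line) and prepare/setting.png (the word-cloud mask image). All charts and frequency files are written to the output/ directory, which main() creates if it is missing.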