下面的代码使用 Python 的 jieba 分词和 wordcloud 词云库，统计《水浒传》中出现次数最多的前 20 个人物，并把结果绘制成词云。
from wordcloud import WordCloud
import jieba
import imageio
# Image whose shape the word cloud is drawn into (passed to WordCloud as
# its ``mask`` argument further down).
mask = imageio.imread('./su/1.png')

# Read the whole novel into memory as a single string.
with open('./su/水浒.txt', 'r', encoding='utf-8') as f:
    words = f.read()
# counts maps each token to the number of times it occurs in the text,
# i.e. {token: occurrences}.  It starts empty and is filled while counting.
counts = {}

# Frequent tokens that are not a character's name (or do not identify one
# specific person).  They are removed from counts before ranking.
excludes = {
    # numerals and quantifiers
    "一个", "两个", "三个", "一条", "一声", "一齐", "一面", "许多",
    # speech verbs
    "说道", "问道", "喝道", "答道",
    # narrative connectives and adverbs
    "只见", "只得", "只是", "于是", "因此", "如此", "如何", "如今",
    "便是", "却是", "正是", "却说", "且说", "且慢", "当下", "原来",
    "随即", "看时", "怎地", "甚么",
    # negations
    "不能", "不敢", "不知", "不是", "不曾", "不得", "不见", "不要",
    # places and directions
    "那里", "这里", "里面", "前面", "背后", "左右", "梁山泊", "山寨",
    "城中", "东京",
    # time words
    "次日", "今日", "将来",
    # pronouns and demonstratives
    "我们", "这个", "那个",
    # generic people, titles and forms of address
    "哥哥", "兄弟", "弟兄", "兄长", "小弟", "小人", "洒家", "那厮",
    "众人", "妇人", "好汉", "和尚", "公人", "喽罗", "头领", "将军",
    "先锋", "都督", "知府", "太尉", "天子", "军士",
    # other common nouns
    "军马", "人马", "性命",
    # verbs and actions
    "引兵", "出来", "起来", "来到", "商议", "下山", "上山", "收拾",
    "大喜",
}
# Segment the novel text into a list of word tokens with jieba.
words_list = jieba.lcut(words)

# Tally how many times each token occurs, accumulating into counts.
for word in words_list:
    # Tokens of length 0 or 1 are skipped: they are treated as not being
    # a character's name.
    if len(word) <= 1:
        continue
    # dict.get returns 0 for a token seen for the first time, so the first
    # occurrence does not raise KeyError the way counts[word] + 1 would.
    counts[word] = counts.get(word, 0) + 1
# Drop the tokens that are known not to be character names.  pop() with a
# default is used instead of `del` so that an excluded word which never
# occurred in the text is simply ignored rather than raising KeyError.
for word in excludes:
    counts.pop(word, None)

# Snapshot the (token, count) pairs as a list so they can be sorted.
items = list(counts.items())
def sort_by_count(x):
    """Return the second element of *x*, the count of a ``(token, count)`` pair.

    Intended as a ``key`` function for ordering such pairs by count.
    """
    return x[1]
# Order the pairs from most frequent to least frequent.
items.sort(key=sort_by_count, reverse=True)
print(items)

# Collect the 20 most frequent tokens.  Slicing (instead of indexing
# positions 0..19) also works when the text yields fewer than 20 distinct
# tokens, where indexing would raise IndexError.
li = []
for role, count in items[:20]:  # role: the name, count: its occurrences
    print(role, count)
    # Repeat the name `count` times so the joined text built from li
    # contains each name as often as it occurred in the novel.
    li.extend([role] * count)
# Render the collected top-20 names as a word cloud and save it as an image.
text = ' '.join(li)  # all repeated names as one space-separated string

cloud = WordCloud(
    # Original author's note: without this font the Chinese text is not
    # displayed.
    font_path='msyh.ttc',
    background_color='white',
    width=800,
    height=600,
    mask=mask,
    # Original author's note: this concerns repetition of two adjacent
    # values; disabling collocations keeps adjacent words from being paired.
    collocations=False,
)
cloud.generate(text)
cloud.to_file('水浒Top20.png')
效果图