jieba:优秀的中文分词第三方库
ThreeKingdoms.txt(三国演义.txt):https://python123.io/resources/pye/threekingdoms.txt
# CalThreeKingdomsV1.py
import jieba
# Read the whole text; the context manager closes the file handle
# instead of leaving it open until garbage collection.
with open("ThreeKingdoms.txt", encoding="utf-8") as f:
    txt = f.read()

words = jieba.lcut(txt)  # segment the text into a list of words

# Tally how often each word occurs, skipping single-character tokens.
counts = {}
for word in words:
    if len(word) == 1:
        continue
    # Existing words are incremented; new words start at 1.
    counts[word] = counts.get(word, 0) + 1

# Sort by frequency, highest first. list.sort is stable, so words with
# equal counts stay in first-seen order.
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Print the 20 most frequent words. Slicing (rather than indexing
# 0..19) avoids an IndexError when fewer than 20 words were counted.
for word, count in items[:20]:
    print("{0:<10}{1:>5}".format(word, count))
曹操 953
孔明 836
将军 772
却说 656
玄德 585
关公 510
丞相 491
二人 469
不可 440
荆州 425
玄德曰 390
孔明曰 390
不能 384
如此 378
张飞 358
商议 344
如何 338
主公 331
军士 317
吕布 300
Process finished with exit code 0
很明显,输出的结果中有些词不是人名,需要去掉;还有一些名字指的是同一个人,需要合并。所以就有了下面的第二版:
# CalThreeKingdomsV2.py
import jieba
# Read the whole text; the context manager closes the file handle
# instead of leaving it open until garbage collection.
with open("ThreeKingdoms.txt", encoding="utf-8") as f:
    txt = f.read()

# Frequent words that are not character names, collected by running the
# script repeatedly and inspecting the output.
excludes = {"将军", "却说", "荆州", "二人", "不可", "不能", "如此", "商议", "如何", "军士", "主公", "左右", "军马",
            "次日", "引兵", "大喜"}

# Alternative names mapped to the single name they are counted under.
# For now every "丞相" is assumed to refer to "曹操".
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}

words = jieba.lcut(txt)  # segment the text into a list of words

# Tally word frequencies, skipping single-character tokens and merging
# aliases into one entry per person.
counts = {}
for word in words:
    if len(word) == 1:
        continue
    r_word = aliases.get(word, word)
    counts[r_word] = counts.get(r_word, 0) + 1

# Drop the non-name words. pop() with a default tolerates excluded words
# that never occur in the text, where `del` would raise KeyError.
for word in excludes:
    counts.pop(word, None)

# Sort by frequency, highest first. list.sort is stable, so words with
# equal counts stay in first-seen order.
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Print the top 8 entries. Slicing (rather than indexing 0..7) avoids an
# IndexError when fewer than 8 entries remain.
for word, count in items[:8]:
    print("{0:<10}{1:>5}".format(word, count))
曹操 1451
孔明 1383
刘备 1252
关羽 784
张飞 358
吕布 300
赵云 278
孙权 264
Process finished with exit code 0