# Characters treated as word separators; each one is replaced by a space
# so that a single str.split() call can tokenize the text.
PUNCTUATION = ',./?;:\'"<>=+-[]{}!~%@()#'


def count_words(text):
    """Return ``(word, count)`` pairs for *text*, most frequent first.

    The text is lower-cased and every character in ``PUNCTUATION`` is
    replaced by a space before splitting on whitespace. Words with equal
    counts keep the order in which they first appear in the text.
    """
    # Lower-case first so that "The" and "the" are counted as one word.
    normalized = text.lower().translate(
        str.maketrans(dict.fromkeys(PUNCTUATION, ' '))
    )
    counts = {}
    for word in normalized.split():
        counts[word] = counts.get(word, 0) + 1
    # Sort by frequency, highest first.
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)


# Guarded so that count_words() can be imported without hamlet.txt present.
if __name__ == '__main__':
    with open('hamlet.txt', 'r') as f:
        txt = f.read()
    counts = count_words(txt)
    # Slicing (instead of indexing 0..9) tolerates texts with < 10 words.
    for word, count in counts[:10]:
        print('{0:<10}:{1:>5}'.format(word, count))
运行之后发现,高频单词大多数是冠词、代词、连接词等语法型词汇,并不能代表文章含义。
因此建立一个排除词库 excludes,把这些词从统计结果中剔除。
# Grammatical words that dominate the raw ranking and are left out of it.
excludes = {'the', 'and', 'of', 'you', 'a', 'i', 'my', 'in'}

# Characters treated as word separators; each one is replaced by a space
# so that a single str.split() call can tokenize the text.
PUNCTUATION = ',./?;:\'"<>=+-[]{}!~%@()#'


def count_words_excluding(text, excluded=frozenset()):
    """Return ``(word, count)`` pairs for *text*, skipping *excluded* words.

    The text is lower-cased and every character in ``PUNCTUATION`` is
    replaced by a space before splitting on whitespace. Words found in
    *excluded* are not counted; an excluded word that never occurs in the
    text is simply ignored. The result is sorted by count, highest first,
    and words with equal counts keep their order of first appearance.
    """
    # Lower-case first so that "The" and "the" are counted as one word.
    normalized = text.lower().translate(
        str.maketrans(dict.fromkeys(PUNCTUATION, ' '))
    )
    counts = {}
    for word in normalized.split():
        if word in excluded:
            continue
        counts[word] = counts.get(word, 0) + 1
    # Sort by frequency, highest first.
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)


# Guarded so that the helper can be imported without hamlet.txt present.
if __name__ == '__main__':
    with open('hamlet.txt', 'r') as f:
        txt = f.read()
    counts = count_words_excluding(txt, excludes)
    # Slicing (instead of indexing 0..9) tolerates texts with < 10 words.
    for word, count in counts[:10]:
        print('{:<10}:{:>5}'.format(word, count))
1)jieba 是重要的第三方中文分词函数库
2)安装:pip3 install jieba
3)常用函数:jieba.lcut(s),精确模式分词,返回列表类型
import jieba
def count_terms(words):
    """Return ``(term, count)`` pairs for the tokens in *words*.

    Tokens whose length is exactly 1 are skipped so that single
    characters do not crowd out real terms. The result is sorted by
    count, highest first; terms with equal counts keep their order of
    first appearance.
    """
    counts = {}
    for word in words:
        if len(word) == 1:  # ignore single-character tokens
            continue
        counts[word] = counts.get(word, 0) + 1
    # Sort by frequency, highest first.
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)


# Guarded so that count_terms() can be imported without the novel's text
# file being present.
if __name__ == '__main__':
    with open('三国演义.txt', 'r', encoding='utf-8') as f:
        txt = f.read()
    words = jieba.lcut(txt)  # precise mode, returns a list
    counts = count_terms(words)
    # Slicing (instead of indexing 0..4) tolerates results with < 5 terms.
    for word, count in counts[:5]:
        print('{:<10}{:>5}'.format(word, count))
【代码改进】
1、排除与人名无关的词汇
2、同一个人有不同称谓,统计时需要合并为同一个名字
# Frequent terms that are not character names and are left out of the ranking.
excludes = {'将军', '却说', '荆州', '二人', '不可', '不能', '如此'}

# Alternative forms of address mapped to the single name they are counted as.
aliases = {
    '诸葛亮': '孔明',
    '孔明曰': '孔明',
    '关公': '关羽',
    '云长': '关羽',
    '玄德': '刘备',
    '玄德曰': '刘备',
    '孟德': '曹操',
    '丞相': '曹操',
}


def count_names(words, excluded=frozenset(), alias_map=None):
    """Return ``(name, count)`` pairs for the tokens in *words*.

    Tokens whose length is exactly 1 are skipped. Each remaining token is
    replaced by ``alias_map[token]`` when present, so different forms of
    address are counted under one name. Names found in *excluded* are not
    counted; an excluded name that never occurs is simply ignored. The
    result is sorted by count, highest first; names with equal counts keep
    their order of first appearance.
    """
    if alias_map is None:
        alias_map = {}
    counts = {}
    for word in words:
        if len(word) == 1:  # ignore single-character tokens
            continue
        # A dict lookup replaces the `word == a or b` chain, which was
        # always truthy and mapped every token to the first name.
        name = alias_map.get(word, word)
        if name in excluded:
            continue
        counts[name] = counts.get(name, 0) + 1
    # Sort by frequency, highest first.
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)


# Guarded so that count_names() can be imported without jieba or the
# novel's text file being present.
if __name__ == '__main__':
    import jieba  # third-party Chinese word-segmentation library

    with open('三国演义.txt', 'r', encoding='utf-8') as f:
        txt = f.read()
    words = jieba.lcut(txt)  # precise mode, returns a list
    counts = count_names(words, excludes, aliases)
    # Slicing (instead of indexing 0..9) tolerates results with < 10 names.
    for word, count in counts[:10]:
        print('{:<10}:{:>5}'.format(word, count))