用 Python 写了一个汉字词频统计程序,适用于已经分好词(词与词之间用空格隔开)的文本。
# Python 3.4.3
# Purpose: count word frequencies in text that has already been segmented
# into whitespace-separated words, and write them out as a tab-separated table.
# Tokens consisting solely of punctuation are excluded from the counts.
import unicodedata
from collections import Counter


def _is_punctuation(token):
    """Return True if every character of *token* is Unicode punctuation."""
    # All Unicode punctuation categories (Pc, Pd, Ps, Pe, Pi, Pf, Po)
    # start with 'P'; this covers both ASCII and full-width CJK marks.
    return all(unicodedata.category(ch).startswith('P') for ch in token)


def count_words(text, skip_punctuation=True):
    """Count how often each whitespace-separated word occurs in *text*.

    Args:
        text: Pre-segmented text; words are separated by any run of
            whitespace (spaces, tabs, newlines).
        skip_punctuation: When true, tokens made up entirely of
            punctuation characters are not counted.

    Returns:
        A collections.Counter mapping each word to its frequency.
    """
    # str.split() with no argument collapses runs of whitespace, never yields
    # empty strings, and keeps the final word even without a trailing space.
    tokens = text.split()
    if skip_punctuation:
        tokens = (token for token in tokens if not _is_punctuation(token))
    return Counter(tokens)


def main():
    """Read test.txt, count its words and write the table to WordFreq.xls."""
    # 'utf-8-sig' reads plain UTF-8 as well, but drops a leading BOM so it
    # does not become part of the first word.
    with open('test.txt', mode='r', encoding='utf-8-sig') as in_file:
        counts = count_words(in_file.read())

    # The output is tab-separated text, GBK-encoded (with an .xls name) so a
    # Chinese-locale spreadsheet program opens it directly.
    # NOTE: a word containing characters outside GBK raises UnicodeEncodeError.
    with open('WordFreq.xls', mode='w', encoding='gbk') as out_file:
        # most_common() lists the most frequent words first.
        for word, freq in counts.most_common():
            out_file.write('{0}\t{1}\n'.format(word, freq))


if __name__ == '__main__':
    main()
功能尚不完善,欢迎提出改进建议。