Hamlet词频统计(含Hamlet原文文本)
#CalHamletV1.py
def getText(path="hamlet.txt"):
    """Return the content of a text file, lower-cased and with punctuation blanked.

    Args:
        path: File to read. Defaults to "hamlet.txt", so a bare
            ``getText()`` call keeps reading that file.

    Returns:
        The whole file as one lower-case string in which every character
        of the punctuation set below has been replaced by a single space.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(path, "r") as f:
        txt = f.read()
    txt = txt.lower()
    # Characters that are turned into spaces so that split() sees clean words.
    # The apostrophe is not in the set, so contractions stay intact.
    # NOTE(review): the '‘' looks like a typeset backtick - confirm before
    # changing it; the literal is kept exactly as it was.
    special_chars = '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~'
    # One translate() pass instead of one replace() call per character.
    return txt.translate(str.maketrans(dict.fromkeys(special_chars, " ")))
# Count how often each whitespace-separated word occurs in the cleaned text
# and print the ten most frequent ones.
hamletTxt = getText()
words = hamletTxt.split()
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1
# Highest count first; the sort is stable, so ties keep first-seen order.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
# Slicing (rather than indexing range(10)) avoids an IndexError when the
# text has fewer than ten distinct words.
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))
《三国演义》人物出场统计(上)(含《三国演义》原文文本)
#CalThreeKingdomsV1.py
import jieba

# Read the whole novel; the context manager closes the file afterwards.
with open("threekingdoms.txt", "r", encoding='utf-8') as f:
    txt = f.read()
# jieba.lcut returns the segmented tokens as a list (per jieba's documentation).
words = jieba.lcut(txt)
counts = {}
for word in words:
    # Single-character tokens are skipped and never counted.
    if len(word) == 1:
        continue
    counts[word] = counts.get(word, 0) + 1
# Highest count first; the sort is stable, so ties keep first-seen order.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
# Slicing (rather than indexing range(15)) avoids an IndexError when fewer
# than fifteen distinct tokens were counted.
for word, count in items[:15]:
    print("{0:<10}{1:>5}".format(word, count))
《三国演义》人物出场统计(下)(含《三国演义》原文文本)
#CalThreeKingdomsV2.py
import jieba

# Frequent tokens that are removed from the tally before ranking.
excludes = {"将军", "却说", "荆州", "二人", "不可", "不能", "如此"}
# Alternate tokens that are folded into one canonical name before counting.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}

# Read the whole novel; the context manager closes the file afterwards.
with open("threekingdoms.txt", "r", encoding='utf-8') as f:
    txt = f.read()
# jieba.lcut returns the segmented tokens as a list (per jieba's documentation).
words = jieba.lcut(txt)
counts = {}
for word in words:
    # Single-character tokens are skipped and never counted.
    if len(word) == 1:
        continue
    # Tokens without an alias are counted under their own text.
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1
for word in excludes:
    # pop() with a default tolerates excluded words that never occurred,
    # where a plain `del` would raise KeyError.
    counts.pop(word, None)
# Highest count first; the sort is stable, so ties keep first-seen order.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
# Slicing (rather than indexing range(10)) avoids an IndexError when fewer
# than ten distinct tokens remain.
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))