在英文文本和中文文本中,分别出现了哪些词,各出现了多少次?
# CalHamletV1.py
def getText(path=r"E:\python\learn\hamlet.txt", encoding=None):
    """Read a text file and return it normalized for word counting.

    The text is lower-cased and every character in the punctuation set
    below is replaced by a single space, so the result can be tokenized
    with a plain ``str.split()``.

    Args:
        path: File to read. Defaults to the Hamlet text used by this script.
        encoding: Passed to ``open``; ``None`` keeps the platform default.

    Returns:
        The normalized text as one string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Same character set as before (the apostrophe is deliberately not in
    # it, so contractions stay one token).
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_{|}~"'
    # One translation table replaces all symbols in a single pass instead
    # of one full-string replace() per character.
    to_space = str.maketrans(punctuation, " " * len(punctuation))
    # The context manager guarantees the file handle is closed.
    with open(path, "r", encoding=encoding) as f:
        txt = f.read()
    return txt.lower().translate(to_space)
# Count word frequencies in the normalized Hamlet text and print the top 10.
hamletTxt = getText()
# Splitting on whitespace yields the individual words as a list.
words = hamletTxt.split()
counts = {}
for word in words:
    # Tally occurrences; a missing key starts from 0.
    counts[word] = counts.get(word, 0) + 1
# Convert the dict to a list of (word, count) pairs ordered by count,
# largest first. The sort is stable, so ties keep first-seen order.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
# Slicing (rather than indexing range(10)) avoids an IndexError when the
# text contains fewer than 10 distinct words.
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))
the 137
to 99
he 71
and 65
hamlet 57
of 52
his 52
was 49
that 36
king 31
#CalThreeKingdomsV1.py 三国演义人物出场文本统计
import jieba
# Count how often each multi-character token appears in the novel and
# print the 15 most frequent ones.
# The context manager guarantees the file handle is closed after reading.
with open("E:/python/learn/threekingdoms.txt", "r", encoding="utf-8") as f:
    txt = f.read()
# jieba.lcut segments the Chinese text into a list of tokens.
words = jieba.lcut(txt)
counts = {}
for word in words:
    # Single-character tokens are skipped.
    if len(word) == 1:
        continue
    counts[word] = counts.get(word, 0) + 1
# (token, count) pairs ordered by count, largest first; ties keep
# first-seen order because the sort is stable.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
# Slicing avoids an IndexError when fewer than 15 distinct tokens exist.
for word, count in items[:15]:
    print("{0:<10}{1:>5}".format(word, count))
曹操 934
孔明 831
将军 761
却说 647
玄德 569
关公 509
丞相 488
二人 465
不可 435
荆州 420
孔明曰 385
玄德曰 383
不能 383
如此 376
张飞 348
#CalThreeKingdomsV2.py 三国演义人物出场文本统计
import jieba
# Count token frequencies in the novel, merging several spellings of the
# same name into one key and dropping a fixed set of unwanted tokens,
# then print the 15 most frequent remaining tokens.
# The context manager guarantees the file handle is closed after reading.
with open("E:/python/learn/threekingdoms.txt", "r", encoding="utf-8") as f:
    txt = f.read()
# Tokens removed from the tally after counting.
excludes = {"将军","却说","荆州","二人","不可","不能","如此"}
# Alternate token -> the key it is counted under.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}
# jieba.lcut segments the Chinese text into a list of tokens.
words = jieba.lcut(txt)
counts = {}
for word in words:
    # Single-character tokens are skipped.
    if len(word) == 1:
        continue
    # Tokens without an alias are counted under their own text.
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1
for word in excludes:
    # pop() with a default tolerates excluded tokens that never occurred;
    # `del` would raise KeyError in that case.
    counts.pop(word, None)
# (token, count) pairs ordered by count, largest first; ties keep
# first-seen order because the sort is stable.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
# Slicing avoids an IndexError when fewer than 15 distinct tokens remain.
for word, count in items[:15]:
    print("{0:<10}{1:>5}".format(word, count))
曹操 1429
孔明 1373
刘备 1223
关羽 779
张飞 348
商议 344
如何 336
主公 327
军士 310
吕布 300
左右 291
军马 288
次日 270
引兵 269
大喜 265
点赞,关注,收藏,➕,点赞,关注,收藏,➕,点赞,关注,收藏,➕,