源码
def getText(path="hmlt.txt"):
    """Return the text of a file, lower-cased, with listed symbols blanked out.

    Args:
        path: File to read. Defaults to "hmlt.txt", the file the original
            zero-argument version always read.

    Returns:
        The file contents in lower case, with every character of the
        symbol string below replaced by a single space.
    """
    # The with-block closes the file handle even if read() raises.
    with open(path, "r") as source:
        txt = source.read().lower()
    # One translate() pass replaces every listed symbol with a space.
    blank_out = str.maketrans({ch: " " for ch in '`!@#~$%^&*()_+-=*/{}[];,./?<>'})
    return txt.translate(blank_out)
# Split the normalised text on whitespace and tally every word.
hmltTxt = getText()
words = hmltTxt.split()
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1

# Order the (word, count) pairs by count, most frequent first.
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Slice rather than index over range(100): a text with fewer than 100
# distinct words then prints what it has instead of raising IndexError.
for word, count in items[:100]:
    print("{0:<10}{1:>5}".format(word, count))
带解析
def getText(path="hmlt.txt"):
    """Return the text of a file, lower-cased, with listed symbols blanked out.

    Args:
        path: File to read. Defaults to "hmlt.txt", the file the original
            zero-argument version always read.

    Returns:
        The file contents in lower case, with every character of the
        symbol string below replaced by a single space.
    """
    # Open and read the file; the with-block closes the handle afterwards.
    with open(path, "r") as source:
        txt = source.read()
    # Lower-case everything so upper/lower case does not split the counts.
    txt = txt.lower()
    # Go through every special symbol ...
    for ch in '`!@#~$%^&*()_+-=*/{}[];,./?<>':
        # ... and replace it with a space, i.e. remove it.
        txt = txt.replace(ch, " ")
    return txt
hmltTxt = getText()  # read and normalise the file
# The words are now separated only by whitespace, so split() breaks the
# text on whitespace and returns the words as a list.
words = hmltTxt.split()
counts = {}  # create a dictionary
for word in words:
    # Use the current word as the key: if it is already present, take its
    # count and add one; otherwise start from 0 and add one.
    counts[word] = counts.get(word, 0) + 1
# counts.items() yields iterable (key, value) tuples; list() turns them
# into a list.
items = list(counts.items())
# list.sort() sorts in place, so the list itself is modified.
items.sort(key=lambda x: x[1], reverse=True)
# Slice rather than index over range(100): a text with fewer than 100
# distinct words then prints what it has instead of raising IndexError.
for word, count in items[:100]:
    print("{0:<10}{1:>5}".format(word, count))
# Judging by the output, the most frequent words are mostly articles,
# pronouns, conjunctions and the like, which do not convey what the text
# is about.
# As a further step, a set can be used to build a vocabulary of words to
# exclude, and the entries in it can be left out of the output.
excludes = {
    "the",
    "and",
    "of",
    "you",
    "a",
    "with",
    "but",
    "as",
    "be",
    "in",
    "or",
    "are",
}
def getText(path="hmlt.txt"):
    """Return the text of a file, lower-cased, with listed symbols blanked out.

    Args:
        path: File to read. Defaults to "hmlt.txt", the file the original
            zero-argument version always read.

    Returns:
        The file contents in lower case, with every character of the
        symbol string below replaced by a single space.
    """
    # The with-block closes the file handle even if read() raises.
    with open(path, "r") as source:
        txt = source.read().lower()
    # One translate() pass replaces every listed symbol with a space.
    blank_out = str.maketrans({ch: " " for ch in '`!@#~$%^&*()_+-=*/{}[];,./?<>'})
    return txt.translate(blank_out)
# Split the normalised text on whitespace and tally every word.
hmltTxt = getText()
words = hmltTxt.split()
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1

# Drop the excluded words. pop() with a default tolerates an excluded word
# that never occurs in the text, where `del` would raise KeyError.
for word in excludes:
    counts.pop(word, None)

# Order the remaining (word, count) pairs by count, most frequent first.
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Slice rather than index over range(10) so that fewer than 10 remaining
# words does not raise IndexError.
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))
/**************中文文本********************/
import jieba
# Read the whole text; the with-block closes the file handle afterwards.
with open("threekingdoms.txt", "r", encoding='utf-8') as source:
    txt = source.read()
# jieba.lcut segments the text and returns the tokens as a list.
words = jieba.lcut(txt)
counts = {}
for word in words:
    if len(word) == 1:  # skip single-character segmentation results
        continue
    counts[word] = counts.get(word, 0) + 1

# Order the (word, count) pairs by count, most frequent first.
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Slice rather than index over range(15) so that fewer than 15 distinct
# tokens does not raise IndexError.
for word, count in items[:15]:
    print("{0:<10}{1:>5}".format(word, count))
# The output contains both "玄德" and "玄德曰"; they should count as the same
# person, but jieba splits them into two separate words, so cases like this
# need to be merged.
excludes = {
    "将军",
    "却说",
    "二人",
    "不可",
    "荆州",
    "不能",
    "如此",
}
import jieba
# Tokens that should be tallied under one shared name.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}

# Read the whole text; the with-block closes the file handle afterwards.
with open("threekingdoms.txt", "r", encoding='utf-8') as source:
    txt = source.read()
# jieba.lcut segments the text and returns the tokens as a list.
words = jieba.lcut(txt)
counts = {}
for word in words:
    if len(word) == 1:  # skip single-character segmentation results
        continue
    # Tally under the merged name. The original computed rword but then
    # incremented counts[word], so the aliases were never actually merged.
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

# Drop the excluded words. pop() with a default tolerates an excluded word
# that never occurs in the text, where `del` would raise KeyError.
for word in excludes:
    counts.pop(word, None)

# Order the remaining (word, count) pairs by count, most frequent first.
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Slice rather than index over range(15) so that fewer than 15 remaining
# tokens does not raise IndexError.
for word, count in items[:15]:
    print("{0:<10}{1:>5}".format(word, count))
其中sort()的使用
1.方法sort用于对列表就地排序。就地排序意味着对原来的列表进行修改,使其元素按顺序排列,而不是返回排序后的列表的副本
x = [4, 6, 2, 1, 7, 9]
x.sort()  # sorts x in place
# print() is a function in Python 3; the original `print x` statement is
# Python 2 syntax and is inconsistent with the rest of the examples.
print(x)
# [1, 2, 4, 6, 7, 9]
如果需要一个排序好的副本,同时保持原有列表不变,怎么实现呢?
①
>>> x = [4, 6, 2, 1, 7, 9]
>>> y=x[ : ]
>>> y.sort()
>>> print(y)
[1, 2, 4, 6, 7, 9]
>>> print(x)
[4, 6, 2, 1, 7, 9]
注意:y = x[:] 通过分片操作将列表x的元素全部拷贝给y,如果简单的把x赋值给y:y = x,y和x还是指向同一个列表,并没有产生新的副本。
②
>>> x = [4, 6, 2, 1, 7, 9]
>>> y=x.copy()
>>> y.sort()
>>> print(y)
[1, 2, 4, 6, 7, 9]
>>> print(x)
[4, 6, 2, 1, 7, 9]
先产生一个副本赋予y然后再对y排序