词频统计

from collections import Counter

# Characters that are turned into spaces before the text is split into words.
REPLACE_CHARS = ("'", '\n')


def _normalize(text):
    """Return *text* with every character in REPLACE_CHARS replaced by a space."""
    for ch in REPLACE_CHARS:
        text = text.replace(ch, ' ')
    return text


def _split_words(text):
    """Split *text* on single spaces and drop the empty tokens.

    Adjacent separators (e.g. a space followed by a replaced newline) make
    ``str.split(' ')`` emit ``''``; those are not words and must not be counted.
    """
    return [word for word in text.split(' ') if word]


def _count_words(words):
    """Return a dict mapping each distinct word to its number of occurrences."""
    # Counter tallies in a single pass instead of calling list.count per word.
    return dict(Counter(words))


def main(source_path='wake.txt', output_path='wakecount.txt', top_n=20):
    """Count word frequencies in a text file and append the ranking to a file.

    Each intermediate result is printed, mirroring the numbered steps below.

    Args:
        source_path: UTF-8 text file to analyse.
        output_path: UTF-8 file that receives one ``"<count> <word>"`` line per
            distinct word; opened in append mode, so existing content is kept.
        top_n: how many of the most frequent entries to print in step 6.

    Returns:
        The list of ``(word, count)`` tuples sorted by descending count.
    """
    #1. Read the whole source text.
    with open(source_path, mode='r', encoding='utf-8') as wake_file:
        wake_text = wake_file.read()
    print(wake_text)

    #2. Replace apostrophes and newlines with spaces.
    wake_text = _normalize(wake_text)
    print(wake_text)

    #3. Split the text into a list of words.
    wake_list = _split_words(wake_text)
    print(wake_list)

    #4. Build the set of distinct words and the word -> count dictionary.
    print(set(wake_list))
    wake_dict = _count_words(wake_list)
    print(wake_dict)
    for word, count in wake_dict.items():
        print(word, count)

    #5. Convert to a list of (word, count) pairs and sort by count, descending.
    word_count_list = list(wake_dict.items())
    print(word_count_list)
    word_count_list.sort(key=lambda pair: pair[1], reverse=True)
    print(word_count_list)

    #6. Print the most frequent entries; slicing copes with fewer than top_n.
    for pair in word_count_list[:top_n]:
        print(pair)

    #7. Append the full ranking to the output file.
    with open(output_path, mode='a', encoding='utf-8') as wake_count_file:
        wake_count_file.writelines(
            str(count) + ' ' + word + '\n' for word, count in word_count_list
        )

    return word_count_list


if __name__ == '__main__':
    main()

 

你可能感兴趣的:(词频统计)