- 比如在中文分词前,将中文逗号、中文句号、中文冒号、中文引号和英文空格替换为空字符串。
replace方法
import jieba
# Method 1: strip punctuation with chained str.replace() calls, segment each
# line with jieba, count the resulting words, and print the five most
# frequent ones separated by "、".
d = {}
with open("sgld.txt", "r", encoding="utf-8") as f:
    lssgld = f.readlines()

for line in lssgld:
    # Remove separators before segmentation so they are not counted as words.
    # The comma and colon are removed in both their full-width (Chinese) and
    # ASCII forms; the trailing newline of each line is removed as well.
    line = (
        line.replace(',', '')
        .replace(',', '')
        .replace('。', '')
        .replace('“', '')
        .replace('”', '')
        .replace(':', '')
        .replace(':', '')
        .replace(' ', '')
        .replace('\n', '')
    )
    for w in jieba.lcut(line):
        d[w] = d.get(w, 0) + 1

ls = list(d.items())
# list.sort is stable, so words with equal counts keep first-seen order.
ls.sort(key=lambda x: x[1], reverse=True)
# Slice rather than index over range(5): a text with fewer than five
# distinct words must not raise IndexError.
for w, _count in ls[:5]:
    print(w, end='、')
for循环的方法
import jieba
# Method 2: strip punctuation by looping over the characters to remove,
# segment each line with jieba, count the resulting words, and print the
# five most frequent ones separated by "、".
with open("sgld.txt", "r", encoding="utf-8") as f:
    lssgld = f.readlines()

d = {}
for line in lssgld:
    line = line.replace("\n", "")
    # Characters dropped before segmentation: Chinese full stop, comma and
    # colon (full-width and ASCII forms), Chinese quotation marks, and the
    # ASCII space.
    for c in "。,,::”“ ":
        line = line.replace(c, "")
    for word in jieba.cut(line):
        d[word] = d.get(word, 0) + 1

# A separate name for the ranked list avoids reusing the per-line variable.
ls = list(d.items())
# list.sort is stable, so words with equal counts keep first-seen order.
ls.sort(key=lambda x: x[1], reverse=True)
# Slice rather than index over range(5): a text with fewer than five
# distinct words must not raise IndexError.
for word, _count in ls[:5]:
    print(word, end="、")