NLTK统计中文词频并输出

# -*- coding: utf-8 -*-
'''
使用NLTK对中文进行词频统计并输出
'''
from nltk import FreqDist


def delblankline(infile, outfile, top_n=15000):
    """Write the most frequent lines of ``infile`` to ``outfile``.

    Every line of ``infile`` is treated as one token (for example one
    pre-segmented word per line). Tokens are counted and the ``top_n`` most
    frequent ones are written to ``outfile``, one per line, most frequent
    first. Tokens with equal counts keep their first-seen order. Blank lines
    are counted like any other token.

    Args:
        infile: Path of the UTF-8 text file to read.
        outfile: Path of the UTF-8 text file to create or overwrite.
        top_n: Maximum number of distinct tokens to write. ``None`` writes
            all of them.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If ``infile`` is not valid UTF-8.
    """
    # Imported locally because the module's top-level imports do not
    # provide Counter.
    from collections import Counter

    # Read the input completely before opening the output, so passing the
    # same path for both does not truncate the data before it is read.
    with open(infile, 'r', encoding="utf-8") as src:
        # Strip only the line terminator: a final line without a trailing
        # newline must be counted together with identical earlier lines.
        counts = Counter(line.rstrip("\n") for line in src)

    with open(outfile, 'w', encoding="utf-8") as dst:
        dst.writelines(
            token + "\n" for token, _ in counts.most_common(top_n)
        )

# Script entry: the two literals are placeholders for the source file path
# and the destination file path; replace them before running.
delblankline(infile="源文件路径", outfile="目标文件路径")

你可能感兴趣的:(技术之路,python,NLTK)