# coding: utf8
'''
@Author: LCY
@Contact: [email protected]
@blog: http://blog.csdn.net/lcyong_
@Date: 2018-01-09
@Time: 23:06
'''
import jieba
from collections import Counter
def main():
    """Segment fenci.txt with jieba (search mode), keep tokens of length >= 2,
    and for each input line print and append its 50 most common tokens to
    fenci_result.txt.

    Side effects:
        Reads ``fenci.txt`` (UTF-8) and appends to ``fenci_result.txt``
        (UTF-8); mode 'a' is kept from the original, so repeated runs
        accumulate output.
    """
    # `with` guarantees both files are closed even if segmentation fails.
    with open("fenci.txt", 'r', encoding='UTF-8') as fenci, \
         open("fenci_result.txt", 'a', encoding='UTF-8') as fenci_result:
        for line in fenci:  # iterate lazily instead of readlines()
            # str.replace returns a new string; the original discarded it,
            # so tabs/newlines/spaces were never actually stripped.
            cleaned = line.replace('\t', '').replace('\n', '').replace(' ', '')
            # Search-mode segmentation; keep only tokens of length >= 2.
            words = [w for w in jieba.cut_for_search(cleaned) if len(w) >= 2]
            # Top 50 by frequency -> list of (word, count) tuples.
            top = Counter(words).most_common(50)
            print(top)
            # most_common yields tuples, not strings: the original
            # " ".join(seg_list) raised TypeError here. Format each pair
            # explicitly and terminate the line.
            fenci_result.write(
                " ".join("{}:{}".format(w, n) for w, n in top) + "\n")


if __name__ == "__main__":
    main()
# Blog post on jieba usage (关于jieba具体用法的博客文章):
# http://blog.csdn.net/john_xyz/article/details/54645527