jieba word segmentation in Python

The script below reads fenci.txt, segments each line with jieba's search-engine mode, and writes each line's 50 most frequent words (of length 2 or more) to fenci_result.txt.

# coding: utf8
'''
    @Author: LCY
    @Contact: [email protected]
    @blog: http://blog.csdn.net/lcyong_
    @Date: 2018-01-09
    @Time: 23:06
'''
import jieba
from collections import Counter

fenci = open("fenci.txt",'r',encoding='UTF-8') # 待分词的文件
fenci_result = open("fenci_result.txt", 'a',encoding='UTF-8') # 分词后的文件
lines = fenci.readlines()  # 读取全部内容
for line in lines: # 按行读取
    line.replace('\t', '').replace('\n', '').replace(' ', '') 
    # seg_list = jieba.cut(line, cut_all=False) # 全部分词
    seg_list =[ x for x in jieba.cut_for_search(line) if len(x) >= 2] # 只要前长度大于2的词
    seg_list = Counter(seg_list).most_common(50)  # 获取频率前五十
    print(seg_list) 
    fenci_result.write(" ".join(seg_list))
fenci.close()
fenci_result.close()
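
The commented-out jieba.cut call above uses precise mode, while the script itself uses search-engine mode. As a minimal sketch of how the three modes differ (the example sentence is an arbitrary choice):

import jieba

sentence = "我来到北京清华大学"  # arbitrary example sentence

# Precise mode: splits the sentence into the most likely word sequence
print("/".join(jieba.cut(sentence, cut_all=False)))

# Full mode: emits every word the dictionary recognizes, with overlaps
print("/".join(jieba.cut(sentence, cut_all=True)))

# Search-engine mode (used in the script above): precise mode first,
# then long words are re-split into shorter ones for better recall
print("/".join(jieba.cut_for_search(sentence)))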


A blog post covering jieba usage in more detail:

http://blog.csdn.net/john_xyz/article/details/54645527
