# K-means clustering of Chinese text (jieba + TF-IDF + scikit-learn)

#coding:utf-8
# Read one document per line from sk.txt and tokenize each line with
# jieba, joining tokens with spaces so CountVectorizer (which splits on
# whitespace) can consume Chinese text.

import jieba
import numpy as np

# `with` + explicit encoding: the original leaked the file handle and
# relied on the platform default encoding, which breaks CJK text on
# some systems.
with open("sk.txt", encoding="utf-8") as fr:
    fr_list = fr.read()

# dataList keeps the raw, un-tokenized lines for display later;
# data holds the space-joined jieba tokens fed to the vectorizer.
dataList = fr_list.split("\n")
data = [" ".join(jieba.cut(oneline)) for oneline in dataList]

from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

# Tokens to drop before counting: punctuation plus domain-specific
# boilerplate words that carry no cluster signal.
stop_words = [",","。","(",")"," 来话人","来话","2017",
        '希望','部门','核实','处理',"认为"," 不合理"]

freWord = CountVectorizer(stop_words=stop_words)
# Only the first 50 documents are vectorized (quick experiment size).
fre = freWord.fit_transform(data[:50])

# get_feature_names() was removed in scikit-learn 1.2; the _out variant
# is the supported replacement.  Wrap in list() so `word` stays a plain
# list as before.
word = list(freWord.get_feature_names_out())
# BUG FIX: the original did repr(word).decode('unicode-escape'), which
# is Python 2 only — on Python 3 `str` has no .decode() and the line
# raises AttributeError.  Python 3's print shows CJK strings directly.
print(word)
print(len(word))
# Re-weight the raw term counts with TF-IDF.  The resulting matrix has
# the same shape as `fre`; integer frequencies simply become fractional
# weights, so rare-but-distinctive terms count for more.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(fre)

# Dense per-document weight vectors — this is what KMeans clusters on.
weight = tfidf.toarray()
from sklearn.cluster import KMeans

# Single source of truth for the cluster count (the original hard-coded
# 5 in two places, which drift apart easily).
N_CLUSTERS = 5

# random_state fixed for reproducible assignments; n_init pinned to the
# historical default of 10 because scikit-learn changed the default to
# "auto" in 1.4, which would silently alter results across versions.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0, n_init=10).fit(weight)
label = kmeans.labels_
print(label)
print(len(label))

# Print each cluster's members using the original (un-tokenized) lines.
# `label` only covers the first 50 documents, matching data[:50] above,
# so indexing dataList with the same positions is consistent.
for i in range(N_CLUSTERS):
    print("")
    print("")
    print("第"+str(i)+"类有:")
    for doc_idx, cluster_id in enumerate(label):
        if cluster_id == i:
            print(dataList[doc_idx])

# (removed scraped blog footer: "你可能感兴趣的" recommendation link)