from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# Cluster the lines of a text file with TF-IDF features and KMeans, printing
# the highest-weighted terms of every cluster for a range of cluster counts.

corpus = []
tfidfdict = {}  # not used below; kept so the module-level name still exists

# Each stripped line of the input file is treated as one document.
# NOTE(review): the file is opened with the platform default encoding, as in
# the original; confirm the file's real encoding and pass encoding=... if needed.
with open('E:\\kmeans.txt', 'r') as seg_ty:
    for line in seg_ty:
        corpus.append(line.strip())

# Term counts -> TF-IDF weights (rows are documents, columns are terms).
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name only on releases that predate it.
_get_names = getattr(vectorizer, "get_feature_names_out", None)
if _get_names is None:
    _get_names = vectorizer.get_feature_names
word = list(_get_names())
weight = tfidf.toarray()

# The vocabulary does not change between runs, so look it up once.
terms = word

# KMeans raises ValueError when n_clusters exceeds the number of samples, so
# cap the upper bound at the number of documents.
K = range(1, min(15, len(corpus) + 1))
for k in K:
    print(f"第几次聚类:{k}\n")
    clf = KMeans(n_clusters=k)
    clf.fit(weight)
    # For each centroid, column indices sorted by descending weight.
    order_centroids = clf.cluster_centers_.argsort()[:, ::-1]
    for ss in range(k):
        # Ends the previous cluster's line (terms are printed with end='')
        # and leaves a blank line before the next cluster.
        print("\n")
        print(f"Cluster {ss}:", end='')
        # Top 10 terms of this cluster.
        for ind in order_centroids[ss, :10]:
            print(f" {terms[ind]}", end='')