首先感谢Eastmount写的内容http://blog.csdn.net/Eastmount/article/details/50545937。点击打开链接
在此基础上,主要实现以下改进及结果
1.改用sklearn.feature_extraction.text.TfidfVectorizer,将corpus文本转换为以TF-IDF值表示的VSM(向量空间模型)向量。
2.通过PCA降维和Matplotlib显示聚类的3D三维图像,更容易区分各类。
3.用jieba为每篇文档提取30个关键词作为特征,而非使用全文内容,再转换为VSM向量。
目前正在找工作,希望大家看到后,能够把建议发送到[email protected],感谢!
具体代码如下:
# coding:utf-8
"""
Created on 2016-05-03
@author:[email protected]

Cluster a one-document-per-line text corpus.

Pipeline: strip punctuation and ASCII letters from each line, keep the top
30 TextRank keywords (jieba), turn the keyword strings into TF-IDF vectors,
cluster them with KMeans (4 clusters), write one "index,label" row per
document to ``lable.txt`` and finally show a 3-D scatter plot of the
PCA-reduced vectors.
"""
import codecs
import io
import re

import jieba.analyse
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# Imported only for its side effect: older matplotlib releases need this
# import to make projection='3d' available.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

# Input corpus, one document per line, UTF-8 encoded.
INPUT_PATH = '01_All_BHSpider_Content_Result.txt'
# Output file receiving "<1-based line number>,<cluster label>" rows.
LABEL_PATH = "lable.txt"
# Number of TextRank keywords kept per document.
TOP_K = 30
# Number of KMeans clusters (original note: scenic spots, animals, people,
# countries).
N_CLUSTERS = 4

# Matches runs of whitespace/punctuation (first two character classes) or
# runs of ASCII letters.  Compiled once instead of on every line; the
# backslashes are doubled so the literal holds no invalid string escapes
# while the regex itself is unchanged.
_NOISE_PATTERN = re.compile(
    u"[\\s+\\.\\!\\/_,$%^*(+\"']+|[+——!,。?、~@#¥%……&*()]+|[a-zA-Z]+"
)

# (start row, stop row, colour) of each group drawn in the 3-D plot.
# NOTE(review): these fixed row ranges presumably mirror how the categories
# are laid out in the input file -- confirm against the data set.
_PLOT_GROUPS = (
    (0, 400, 'b'),
    (400, 600, 'r'),
    (600, 800, 'k'),
    (800, 1000, 'g'),
)


def _extract_keywords(line):
    """Return the top TextRank keywords of *line* joined with "/"."""
    cleaned = _NOISE_PATTERN.sub(u"/", line.strip())
    keywords = jieba.analyse.textrank(cleaned, topK=TOP_K)
    return u"/".join(keywords)


def _load_corpus(path):
    """Read *path* as UTF-8 and return one keyword string per line.

    Blank lines are kept (as empty documents) so that positions in the
    returned list keep matching the line numbers of the input file.
    """
    # io.open decodes while reading (works on Python 2 and 3) and the
    # with-block guarantees the handle is closed.
    with io.open(path, 'r', encoding='utf-8') as source:
        return [_extract_keywords(line) for line in source]


def _write_labels(labels, path):
    """Write "<1-based index>,<label>" rows, CRLF-terminated, to *path*."""
    # codecs.open keeps the file in binary mode underneath, so the explicit
    # "\r\n" is written as-is on every platform.
    with codecs.open(path, 'w', 'utf-8') as result:
        for index, label in enumerate(labels, 1):
            result.write(u"{0},{1}\r\n".format(index, label))


def _plot_groups(points):
    """Scatter-plot the rows of the (n, 3) array *points*, one colour per group."""
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    for start, stop, colour in _PLOT_GROUPS:
        # Slicing clamps to the available rows, so a corpus shorter than
        # 1000 lines no longer raises IndexError.
        group = points[start:stop]
        if group.shape[0] == 0:
            continue
        ax.scatter(group[:, 0], group[:, 1], group[:, 2], c=colour)
    plt.show()


def main():
    """Run the whole pipeline: load, vectorise, cluster, save labels, plot."""
    corpus = _load_corpus(INPUT_PATH)

    # Convert the keyword strings into a dense TF-IDF matrix.
    weight = TfidfVectorizer().fit_transform(corpus).toarray()

    clf = KMeans(n_clusters=N_CLUSTERS)
    clf.fit(weight)
    _write_labels(clf.labels_, LABEL_PATH)

    # Reduce to three dimensions for plotting.
    reduced = PCA(n_components=3).fit_transform(weight)
    _plot_groups(reduced)


if __name__ == "__main__":
    main()