import re
import os
import codecs
import logging
import traceback
import jieba
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
from sklearn import metrics
# ======== Data encoding conversion ========
def get_text():
    base_path = "D:\\yuliao\\"
    filelist = os.listdir(base_path)  # list every corpus file under base_path
    f2 = open("D:\\data.txt", "w", encoding="utf-8")
    for files in filelist:
        f = open(base_path + files, "r", encoding="utf-8")
        text = f.read().replace(" ", "")
        f.close()
        data = "".join(re.findall(u"[一-鿿]+", text))  # keep only the Chinese characters
        f2.write(data + "\n")  # one document per line
    f2.close()
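
# Note on the filter above: u"[一-鿿]" covers U+4E00..U+9FFF, the CJK Unified
# Ideographs block, so everything but Chinese characters is dropped. An
# equivalent, more explicit spelling of the same pattern:
#   data = "".join(re.findall(u"[\u4e00-\u9fff]+", text))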
class TextCluster(object):
    # Constructor (nothing to initialize yet)
    def __init__(self):
        pass

    # Segment a sentence into words
    def seg_words(self, sentence):
        seg_list = jieba.cut(sentence)  # accurate mode by default
        return " ".join(seg_list)  # join the token generator into a space-separated string
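
    # Example (a small sketch; the exact segmentation depends on the jieba
    # version and the dictionaries loaded):
    #   >>> " ".join(jieba.cut("我来到北京清华大学"))
    #   '我 来到 北京 清华大学'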

    # Load a user-defined dictionary
    def load_userdictfile(self, dict_file):
        jieba.load_userdict(dict_file)
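
    # jieba's documented user-dictionary format is one entry per line,
    # "word [frequency] [POS tag]" with the last two fields optional, e.g.
    # (sample entries as shown in jieba's README; the file must be UTF-8):
    #   云计算 5
    #   创新办 3 i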

    # Read the corpus file, one document per line
    def load_processfile(self, process_file):
        corpus_list = []
        try:
            fp = open(process_file, "r", encoding="utf-8")
            for line in fp:
                conline = line.strip()
                corpus_list.append(conline)
            fp.close()
            return True, corpus_list
        except Exception:
            logging.error(traceback.format_exc())
            return False, "get process file fail"

    def output_file(self, out_file, item):
        try:
            fw = open(out_file, "a", encoding="utf-8")
            fw.write("%s" % item)
            fw.close()
        except Exception:
            logging.error(traceback.format_exc())
            return False, "out file fail"

    # Destructor (release resources; nothing held at the moment)
    def __del__(self):
        pass

    def process(self, process_file, tf_ResFileName, tfidf_ResFileName, num_clusters, cluster_ResFileName):
        try:
            sen_seg_list = []
            flag, lines = self.load_processfile(process_file)
            if not flag:
                logging.error("load error")
                return False, "load error"
            for line in lines:
                sen_seg_list.append(self.seg_words(line))
            # CountVectorizer builds the term-frequency matrix: element a[i][j]
            # is the frequency of term j in document i
            tf_vectorizer = CountVectorizer()
            # fit_transform turns the segmented texts into the term-frequency matrix
            tf_matrix = tf_vectorizer.fit_transform(sen_seg_list)
            tf_weight = tf_matrix.toarray()
            # TfidfTransformer computes the tf-idf weight of every term
            tfidf_transformer = TfidfTransformer()
            # fit_transform computes the tf-idf matrix from the term frequencies
            tfidf_matrix = tfidf_transformer.fit_transform(tf_matrix)
            # All terms in the bag-of-words vocabulary
            # (get_feature_names() was removed in scikit-learn 1.2)
            word_list = tf_vectorizer.get_feature_names_out()
            # Extract the tf-idf matrix: element a[i][j] is the tf-idf weight
            # of term j in document i
            tfidf_weight = tfidf_matrix.toarray()
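            # With TfidfTransformer's defaults (smooth_idf=True, norm="l2"),
            # scikit-learn computes idf(t) = ln((1 + n) / (1 + df(t))) + 1,
            # multiplies it by the raw term frequency, then L2-normalizes each
            # document row, so every row of tfidf_weight has unit length.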
            # Write the feature terms, then each document's raw term frequencies
            tf_Res = codecs.open(tf_ResFileName, "w", "utf-8")
            word_list_len = len(word_list)
            for num in range(word_list_len):
                if num == word_list_len - 1:
                    tf_Res.write(word_list[num])
                else:
                    tf_Res.write(word_list[num] + " ")
            tf_Res.write("\n")
            # One row per document: the outer loop walks the documents, the
            # inner loop walks the term frequencies within one document
            for i in range(len(tf_weight)):
                for j in range(word_list_len):
                    if j == word_list_len - 1:
                        tf_Res.write(str(tf_weight[i][j]))
                    else:
                        tf_Res.write(str(tf_weight[i][j]) + " ")
                tf_Res.write("\n")
            tf_Res.close()
            # Write the tf-idf matrix in the same layout
            tfidf_Res = codecs.open(tfidf_ResFileName, "w", "utf-8")
            for num in range(word_list_len):
                if num == word_list_len - 1:
                    tfidf_Res.write(word_list[num])
                else:
                    tfidf_Res.write(word_list[num] + " ")
            tfidf_Res.write("\n")
            # One row of tf-idf weights per document
            for i in range(len(tfidf_weight)):
                for j in range(word_list_len):
                    if j == word_list_len - 1:
                        tfidf_Res.write(str(tfidf_weight[i][j]))
                    else:
                        tfidf_Res.write(str(tfidf_weight[i][j]) + " ")
                tfidf_Res.write("\n")
            tfidf_Res.close()
            # ======== Cluster analysis ========
            km = KMeans(n_clusters=num_clusters)
            km.fit(tfidf_matrix)
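            # Note: because the tf-idf rows are L2-normalized, k-means with
            # euclidean distance behaves much like cosine-based clustering
            # (||x - y||^2 = 2 - 2*cos(x, y) for unit vectors).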
print ("===== 总体轮廓系数 =====")
print (metrics.silhouette_score(tfidf_matrix, km.labels_, metric="euclidean")) #轮廓系数[-1,1] kas=metrics.silhouette_samples(tfidf_matrix, km.labels_, metric="euclidean") #生成每个文件轮廓系数 print ("===== 每类样本个数 =====")
print (Counter(km.labels_)) # 每类分到多少样本 # 中心点 # print(km.cluster_centers_) # 每个样本所属的簇 clusterRes = codecs.open(cluster_ResFileName, "w", "utf-8")
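            # For sample i with mean intra-cluster distance a(i) and mean
            # distance to the nearest other cluster b(i), the silhouette is
            # s(i) = (b(i) - a(i)) / max(a(i), b(i)).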
            # Write one line per document: index, file name, cluster label,
            # and the per-document silhouette coefficient
            count = 1
            filenames = os.listdir("D:\\yuliao\\")
            kas1 = kas.tolist()
            while count <= len(km.labels_):
                clusterRes.write(str(count) + " " + filenames[count - 1] + " "
                                 + str(km.labels_[count - 1]) + " " + str(kas1[count - 1]))
                clusterRes.write("\n")
                count = count + 1
            clusterRes.close()
            return True, "process success"
        except Exception:
            logging.error(traceback.format_exc())
            return False, "process fail"

# ======== Output clustering results ========
if __name__ == "__main__":
    rootdir = "D:\\"
    get_text()
    tc = TextCluster()
    tc.process(rootdir + "data.txt", rootdir + "tf_Result.txt",
               rootdir + "tfidf_Result.txt", 3, rootdir + "cluster_Result.txt")
    # data.txt           the whole corpus, one document per line
    # tf_Result.txt      vocabulary terms and their per-document frequencies
    # tfidf_Result.txt   vocabulary terms and their per-document tf-idf weights
    # cluster_Result.txt file name + assigned cluster for every document
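
    # A minimal sketch for picking num_clusters (hypothetical range; assumes a
    # tfidf_matrix computed as inside process()): fit KMeans for several k and
    # keep the k with the highest silhouette score, e.g.:
    #   for k in range(2, 10):
    #       labels = KMeans(n_clusters=k).fit(tfidf_matrix).labels_
    #       print(k, metrics.silhouette_score(tfidf_matrix, labels))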