Text Clustering in Python

import re
import os
import jieba
import logging
import codecs
import traceback

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from collections import Counter
from sklearn import metrics

# ======== Corpus collection and encoding normalization ========
def get_text():
    base_path = "D:\\yuliao\\"
    # os.listdir works like running `dir` in base_path: it returns the file names as a list
    filelist = os.listdir(base_path)
    f2 = open("D:\\data.txt", "w", encoding="utf-8")
    for files in filelist:
        f = open(base_path + files, "r", encoding="utf-8")
        text = f.read().replace("\n", "")
        f.close()
        # Keep only the Chinese characters; the pattern must be applied to unicode text
        data = "".join(re.findall(u"[\u4e00-\u9fff]+", text))
        f2.write(data + "\n")  # one document per line
    f2.close()
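# For instance (illustrative strings, not from the original corpus):
#   re.findall(u"[\u4e00-\u9fff]+", u"Python文本聚类123示例") returns [u'文本聚类', u'示例'],
# so data.txt ends up holding only the Chinese text of each source file, one file per line.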

class TextCluster(object):
    # Initializer; overrides the parent class, but nothing needs to be set up
    def __init__(self):
        pass

    # Segment a sentence with jieba (precise mode by default) and join the
    # resulting token list into a single space-separated string
    def seg_words(self, sentence):
        seg_list = jieba.cut(sentence)
        return " ".join(seg_list)
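    # Illustrative example (a sketch; the exact tokens depend on jieba's
    # bundled dictionary and any loaded user dictionary):
    #   >>> " ".join(jieba.cut(u"我来到北京清华大学"))
    #   u'我 来到 北京 清华大学'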

    # Load a user dictionary for jieba
    def load_userdictfile(self, dict_file):
        jieba.load_userdict(dict_file)

    # Read the corpus file, one document per line
    def load_processfile(self, process_file):
        corpus_list = []
        try:
            fp = open(process_file, "r", encoding="utf-8")
            for line in fp:
                conline = line.strip()
                corpus_list.append(conline)
            fp.close()
            return True, corpus_list
        except Exception:
            logging.error(traceback.format_exc())
            return False, "get process file fail"

    def output_file(self, out_file, item):
        try:
            # the encoding argument belongs to open(), not to write()
            fw = open(out_file, "a", encoding="utf-8")
            fw.write("%s" % item)
            fw.close()
            return True, ""
        except Exception:
            logging.error(traceback.format_exc())
            return False, "out file fail"

    # Destructor; no resources need to be released explicitly
    def __del__(self):
        pass

    def process(self, process_file, tf_ResFileName, tfidf_ResFileName, num_clusters, cluster_ResFileName):
        try:
            sen_seg_list = []
            flag, lines = self.load_processfile(process_file)
            if not flag:
                logging.error("load error")
                return False, "load error"
            for line in lines:
                sen_seg_list.append(self.seg_words(line))

            # CountVectorizer turns the corpus into a term-frequency matrix;
            # element a[i][j] is the frequency of word j in document i
            tf_vectorizer = CountVectorizer()
            # fit_transform builds the term-frequency matrix
            tf_matrix = tf_vectorizer.fit_transform(sen_seg_list)
            tf_weight = tf_matrix.toarray()
            # print(tf_weight)

            # TfidfTransformer computes the tf-idf weight of every word
            tfidf_transformer = TfidfTransformer()
            # fit_transform computes tf-idf from the term-frequency matrix
            tfidf_matrix = tfidf_transformer.fit_transform(tf_matrix)

            # All words in the bag-of-words vocabulary
            # (on scikit-learn < 1.0 use get_feature_names() instead)
            word_list = tf_vectorizer.get_feature_names_out()

            # Extract the tf-idf matrix; element a[i][j] is the tf-idf weight
            # of word j in document i
            tfidf_weight = tfidf_matrix.toarray()
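            # Note: the CountVectorizer + TfidfTransformer pair above is
            # equivalent to a single TfidfVectorizer; a minimal one-step
            # sketch, if the raw term-frequency matrix is not needed:
            #   from sklearn.feature_extraction.text import TfidfVectorizer
            #   tfidf_matrix = TfidfVectorizer().fit_transform(sen_seg_list)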

            # Write the feature vocabulary and the term-frequency matrix
            # print("Features length: " + str(len(word_list)))
            tf_Res = codecs.open(tf_ResFileName, "w", "utf-8")
            word_list_len = len(word_list)
            for num in range(word_list_len):
                if num == word_list_len - 1:
                    tf_Res.write(word_list[num])
                else:
                    tf_Res.write(word_list[num] + " ")
            tf_Res.write("\n")

            # Write each document's term frequencies: the outer loop walks the
            # documents, the inner loop walks the word weights of one document
            for i in range(len(tf_weight)):
                # print(u"------- term frequencies of document", i, u"-------")
                for j in range(word_list_len):
                    if j == word_list_len - 1:
                        tf_Res.write(str(tf_weight[i][j]))
                    else:
                        tf_Res.write(str(tf_weight[i][j]) + " ")
                tf_Res.write("\n")
            tf_Res.close()
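            # tf_Result.txt therefore holds the vocabulary on its first line and
            # one row of term frequencies per document, e.g. (illustrative values):
            #   北京 聚类 文本
            #   2 0 1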

            # Write the tf-idf matrix in the same layout
            tfidf_Res = codecs.open(tfidf_ResFileName, "w", "utf-8")
            for num in range(word_list_len):
                if num == word_list_len - 1:
                    tfidf_Res.write(word_list[num])
                else:
                    tfidf_Res.write(word_list[num] + " ")
            tfidf_Res.write("\n")

            # Write each document's tf-idf weights, again one document per row
            for i in range(len(tfidf_weight)):
                for j in range(word_list_len):
                    if j == word_list_len - 1:
                        tfidf_Res.write(str(tfidf_weight[i][j]))
                    else:
                        tfidf_Res.write(str(tfidf_weight[i][j]) + " ")
                tfidf_Res.write("\n")
            tfidf_Res.close()

            # ======== Cluster analysis ========
            km = KMeans(n_clusters=num_clusters)
            km.fit(tfidf_matrix)

            print("===== overall silhouette coefficient =====")
            # The silhouette coefficient lies in [-1, 1]; higher is better
            print(metrics.silhouette_score(tfidf_matrix, km.labels_, metric="euclidean"))
            # Per-document silhouette coefficients
            kas = metrics.silhouette_samples(tfidf_matrix, km.labels_, metric="euclidean")
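            # The silhouette score can also guide the choice of num_clusters;
            # a hypothetical scan (not part of the original pipeline):
            #   for k in range(2, 10):
            #       labels = KMeans(n_clusters=k).fit(tfidf_matrix).labels_
            #       print(k, metrics.silhouette_score(tfidf_matrix, labels, metric="euclidean"))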

            print("===== samples per cluster =====")
            print(Counter(km.labels_))  # how many documents fell into each cluster
            # Cluster centers: print(km.cluster_centers_)

            # Write each document's cluster assignment
            clusterRes = codecs.open(cluster_ResFileName, "w", "utf-8")
            count = 1
            a = os.listdir("D:\\yuliao\\")
            kas1 = kas.tolist()
            while count <= len(km.labels_):
                clusterRes.write(str(count) + " " + a[count - 1] + " "
                                 + str(km.labels_[count - 1]) + " " + str(kas1[count - 1]))
                clusterRes.write("\n")
                count = count + 1
            clusterRes.close()
            return True, ""

        except Exception:
            logging.error(traceback.format_exc())
            return False, "process fail"

# Run the pipeline and write out the clustering results
if __name__ == "__main__":
    rootdir = "D:\\"
    get_text()
    tc = TextCluster()
    tc.process(rootdir + "data.txt", rootdir + "tf_Result.txt",
               rootdir + "tfidf_Result.txt", 3, rootdir + "cluster_Result.txt")
    # data.txt           the whole corpus, one document per line
    # tf_Result.txt      vocabulary plus each document's term frequencies
    # tfidf_Result.txt   vocabulary plus each document's tf-idf weights
    # cluster_Result.txt per document: file name, cluster label, silhouette coefficient
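Each line of cluster_Result.txt holds an index, a file name, a cluster label and a silhouette coefficient, so the results can be loaded back for a quick per-cluster summary. A minimal sketch, assuming pandas is installed and the corpus file names contain no spaces:

import pandas as pd

df = pd.read_csv("D:\\cluster_Result.txt", sep=" ", header=None,
                 names=["index", "file", "label", "silhouette"])
print(df.groupby("label")["silhouette"].mean())  # mean silhouette per cluster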
