import jieba
import jieba.analyse
import math
import operator
from sklearn.cluster import KMeans, MiniBatchKMeans, AffinityPropagation, DBSCAN
from sklearn.cluster import MeanShift, estimate_bandwidth
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import os
#np.set_printoptions(threshold=np.inf)
# Load manually specified idf values for certain words from a file
def load_idf_file(path):
    idf_dict = {}
    handle = open(path, 'r', encoding='utf8')
    line = handle.readline()  # read the file line by line
    while line:
        line = line.strip()
        if len(line) > 0:
            line_arr = line.split(' ')
            idf_dict[line_arr[0]] = float(line_arr[1])
        line = handle.readline()
    handle.close()
    return idf_dict
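# The idf file is assumed to contain one "word idf_value" pair per line,
# separated by a single space, for example (values are illustrative only):
#   北京 3.21
#   机器学习 7.54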
# Compute the idf of every word over the tokenized corpus; words listed in the
# manual idf file keep their manually specified value instead of the computed one.
def cal_idf(data_set, idf_dict):
    doc_num = len(data_set)
    word_doc_count = defaultdict(int)
    # count the number of documents each word appears in
    for word_str in data_set:
        word_list = word_str.split(' ')
        word_list = list(set(word_list))
        for item in word_list:
            if item and item.strip() != '':
                word_doc_count[item] += 1
    word_idf = {}
    default_idf_keys = idf_dict.keys()
    for k, v in word_doc_count.items():
        idf = math.log(doc_num * 1.0 / v)
        if k in default_idf_keys:
            word_idf[k] = idf_dict[k]
        else:
            word_idf[k] = idf
    #path = "idf.txt"
    #save(word_idf, path)
    return word_idf
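# Note: cal_idf uses the plain idf form idf(w) = log(N / df(w)), where N is the
# number of documents and df(w) the number of documents containing w.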
# Compute tf-idf weights for every word of every document
def cal_tfidf(data_set, idf_ret):
    doc_word_tfidf = []
    for word_str in data_set:
        word_list = word_str.split(' ')
        doc_word_total = len(word_list)
        doc_word_dict = defaultdict(int)
        doc_word_tfidf_dict = defaultdict(int)
        # term counts within the document
        for item in word_list:
            if item and item.strip() != '':
                doc_word_dict[item] += 1
        # relative term frequency times corpus idf
        for k, v in doc_word_dict.items():
            doc_word_tfidf_dict[k] = (v / doc_word_total) * idf_ret[k]
        doc_word_tfidf.append(doc_word_tfidf_dict)
    return doc_word_tfidf
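# Note: each per-document dict is a defaultdict, so words that never occur in a
# document simply yield weight 0 when the tf-idf matrix is built further below.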
# Persist an idf dictionary to disk as "word idf_value" lines
def save(idf_dict, path):
    f = open(path, 'w', encoding='utf8')
    for key in idf_dict.keys():
        f.write(str(key) + " " + str(idf_dict[key]) + "\n")
    f.close()
# Tokenization: keep only the strongest keywords of each document.
# Relies on the module-level `datas` (raw documents) and `lines` (stop words).
def jieba_tokenize():
    jieba_need = []
    for item in datas:
        # extract the top-10 keywords ranked by jieba's tf-idf, then keep the top 60%
        temp_list1 = jieba.analyse.extract_tags(item, topK=10)
        w_len = len(temp_list1)
        if w_len > 2:
            w_num = math.ceil(float(w_len) * 0.6)
            temp_list1 = temp_list1[0:w_num]
        # drop stop words, blanks and single-character tokens
        temp_list = [e for e in temp_list1 if e not in lines + [' '] and len(e) > 1]
        jieba_need.append(" ".join(temp_list))
    return jieba_need
# Dimensionality reduction: LSA (truncated SVD to 50 components) followed by L2 normalization
def reduction(matrix):
    svd = TruncatedSVD(50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(matrix)
    return X
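# Optional sketch, not part of the original pipeline: the already imported TSNE and
# matplotlib could be used to project the reduced vectors to 2-D for a visual check
# of the cluster structure, e.g. plot_embedding(X, result) after clustering.
# Parameter choices below are illustrative.
def plot_embedding(matrix, labels):
    # project to 2 dimensions with t-SNE and scatter-plot the points, coloured by cluster label
    points = TSNE(n_components=2, random_state=0).fit_transform(matrix)
    plt.scatter(points[:, 0], points[:, 1], c=labels, s=5, cmap='tab20')
    plt.title("t-SNE projection of the document vectors")
    plt.show()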
jieba.analyse.set_idf_path("./jieba/idf.txt")
data_num = 10000
k_num = 200
i = 0
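# ../data.txt is assumed to hold the raw corpus: one document per line, UTF-8 encoded.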
output = open('../data.txt', 'r', encoding='utf8')
line = output.readline()  # read the corpus line by line, at most data_num documents
datas = []
while line:
    line = line.strip()
    if len(line) > 0:
        i = i + 1
        datas.append(line)
        if i >= data_num:
            break
    line = output.readline()
output.close()
npyfile = "data.npy"
if os.path.exists(npyfile):
    # reuse the cached tf-idf matrix
    X = np.load("data.npy")
else:
    # load the stop words
    with open('./stop_words.txt', encoding='utf-8') as f:
        entities = list(f)
    lines = []
    for line in entities:
        line1 = line.strip()
        lines.append(line1)
    # load the manually specified idf values
    default_idf_dict = load_idf_file("./jieba/idf.txt")
    my_train = jieba_tokenize()
    idf_ret = cal_idf(my_train, default_idf_dict)
    tfidf_ret = cal_tfidf(my_train, idf_ret)
    doc_rows = len(datas)
    word_rows = len(idf_ret)
    # turn the per-document tf-idf weights into a matrix of shape [num_documents, num_words]
    X = np.zeros([doc_rows, word_rows])
    for i in range(doc_rows):
        j = 0
        for k, v in idf_ret.items():
            X[i][j] = tfidf_ret[i][k]
            j = j + 1
    np.save("data.npy", X)
# dimensionality reduction
X = reduction(X)
#cl = MiniBatchKMeans(n_clusters=k_num, init='k-means++', n_init=1,init_size=1000, batch_size=10000, verbose=False)
#cl = KMeans(n_clusters=k_num, init='k-means++', random_state=30, n_init=1,verbose=False)
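# Another possible alternative (sketch only, parameters are illustrative), using the
# imported MeanShift with an estimated bandwidth:
#bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=1000)
#cl = MeanShift(bandwidth=bandwidth, bin_seeding=True)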
cl = DBSCAN(eps=0.2, min_samples=30)
result = cl.fit_predict(X)
num_clusters = len(set(result))
# Write out the clustering result: one file per cluster label
# (DBSCAN marks noise points with the label -1, which gets its own file)
ret = defaultdict(list)
for i in range(len(datas)):
    classid = result[i]
    ret[classid].append(datas[i])
for m in ret.keys():
    file = "./result/result_" + str(m) + ".txt"
    handle = open(file, 'w', encoding='utf8')
    for doc in ret[m]:
        handle.write(doc + "\n")
    handle.close()