Contents
一、Canopy algorithm
二、Canopy code
三、K-means
四、Overall approach
4.1、Full code:
一、Canopy algorithm

[Schematic figure: canopy coarse clustering]

Canopy can be understood as a "coarse clustering" step: the algorithm forms cluster centers by comparing each sample point's distance against two thresholds, t1 and t2. The steps are simple:
1) Choose t1 and t2, making sure that t1 > t2.
2) Randomly pick a sample from the sample set and treat it as a new canopy.
3) Compare the distance from each remaining point to this canopy. If the distance falls between t2 and t1, the point is assigned to this canopy but stays in the sample set for the next iteration (it may later become a new canopy center, which is why two canopies can end up sharing points). If the distance is less than t2, the point is considered highly similar to this canopy, so it is removed from the sample set.
4) Repeat steps 2-3 until the sample set is empty. The algorithm has then converged, yielding the coarse clustering and the number of cluster centers.
The only drawback is choosing t1 and t2: as the steps above show, their values matter a great deal, and picking them usually requires some heuristic or expert experience (I am not aware of a principled method for determining t1 and t2). A minimal code sketch of the steps above follows.
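To make the steps concrete, here is a minimal sketch of the procedure, assuming Euclidean distance and illustrative threshold values; it is not the Canopy class imported in the full code later in this post.

import numpy as np

def canopy(points, t1, t2):
    # points: array-like of shape (n_samples, n_features); requires t1 > t2
    points = [np.asarray(p, dtype=float) for p in points]
    canopies = []
    rng = np.random.default_rng(0)
    while points:
        # 2) pick a random remaining point as a new canopy center
        idx = rng.integers(len(points))
        center = points.pop(idx)
        members = [center]
        remaining = []
        for p in points:
            d = np.linalg.norm(p - center)
            if d < t1:
                members.append(p)    # 3) within t1: assigned to this canopy
            if d >= t2:
                remaining.append(p)  #    only points at distance >= t2 stay for later rounds
        points = remaining           # points closer than t2 are dropped from the sample set
        canopies.append((center, members))
    # 4) the number of canopies is a rough estimate of the number of clusters
    return canopies

# quick check on two well-separated blobs (thresholds are illustrative)
data = np.vstack([np.random.randn(50, 2), np.random.randn(50, 2) + 8])
print(len(canopy(data, t1=4.0, t2=2.0)))  # rough cluster-count estimate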
二、Canopy code

You can pull a ready-made implementation here: GitHub - AlanConstantine/CanopyByPython. It works well and runs out of the box.
三、K-means

This algorithm is fairly basic, so I won't go over it here; if you are not familiar with it, you can easily look it up (e.g., on Baidu).
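For reference, a minimal scikit-learn K-means call looks like this (the toy data and parameter values are illustrative only):

from sklearn.cluster import KMeans
import numpy as np

# toy 2-D data with two obvious groups
X = np.array([[1, 2], [1, 4], [1, 0],
              [10, 2], [10, 4], [10, 0]])

km = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=0)
labels = km.fit_predict(X)    # cluster index for each sample
print(labels)
print(km.cluster_centers_)    # the two learned centroids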
四、Overall approach

First use canopy to get a rough estimate of k, then feed that k into k-means for the finer clustering. This works around k-means' sensitivity to the choice of k.
4.1、Full code:

Below is a complete canopy + kmeans text-clustering example:
from __future__ import print_function
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, MiniBatchKMeans
from bs4 import BeautifulSoup
import pandas as pd
import sys
import os
import jieba.posseg as pseg
import operator
import random
import joblib  # on sklearn < 0.21 this was: from sklearn.externals import joblib
from Kmeans.canopy import Canopy
import numpy as np


def save_model_file(model, save_model_name):
    joblib.dump(model, save_model_name)


def jieba_postag(text):
    words = pseg.cut(text)
    return words


# Stop words
def defined_stop_words():
    all_stop_words = []
    for i, file in enumerate(os.listdir(r'D:\Gitlab\extract_key\stop_words')):
        # read one stop-word file per iteration
        filepath = fr'D:\Gitlab\extract_key\stop_words\{file}'
        with open(filepath, 'r', encoding='utf-8') as fp:
            all_line = fp.readlines()
            for line in all_line:
                all_stop_words.append(line.replace('\n', ''))
    return all_stop_words


# Load the data
def loadDataset(filepath):
    """Load the text dataset."""
    dataset = []
    # only keep sentences that contain at least one of these keywords
    key_list = ['公司', '项目', '专业投资团队', '元宇宙投资项目', '养老项目', '养老服务', '老年产品',
                '高回报', '理财', '募集', '贷款', '抵押', '利息', '保险', '包赔', '高利贷']
    c = 0
    with open(filepath, 'r', encoding='utf-8') as fp:
        all_line = fp.readlines()
        for line in all_line:
            dataset.append(line.replace('\n', ''))
            if c == 10000:
                break
            c += 1
    # print(len(dataset))
    # # randomly sample 10,000 of the lines
    # dataset = random.sample(dataset, 10000)
    # print(len(dataset))
    stop_words = defined_stop_words()
    all_sen = []
    original_sen = []
    for sen in list(set(dataset)):
        # check whether the sentence contains any keyword
        for key in key_list:
            if operator.contains(sen, key):
                sentence = ""
                # segment with jieba
                word = jieba_postag(sen)
                for w in word:
                    # skip stop words
                    if w.word not in stop_words:
                        sentence += w.word + ' '
                all_sen.append(sentence)
                original_sen.append(sen)
                break
    return original_sen, all_sen


def transform(dataset, n_features=1000):
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features, min_df=2, use_idf=True)
    X = vectorizer.fit_transform(dataset)
    return X, vectorizer


def train(X, vectorizer, true_k=10, minibatch=False):
    if minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                    verbose=False)
    km.fit(X)
    # save the fitted model
    save_model_file(km, 'Kmeans.pkl')
    result = list(km.predict(X))
    return result


if __name__ == '__main__':
    filepath = r'D:\Gitlab\extract_key\all.txt'
    original_data, dataset = loadDataset(filepath)
    print("%d documents" % len(dataset))

    X, vectorizer = transform(dataset, n_features=500)

    # use canopy coarse clustering to estimate the number of clusters
    canopy_k = Canopy(X.toarray())
    canopy_k.setThreshold(0.3, 0.2)
    clusters_k = len(canopy_k.clustering())
    print(f"Canopy pre-clustering found {clusters_k} cluster centers")

    class_result = train(X, vectorizer, true_k=clusters_k)
    abc_dict = {
        'original_sentence': original_data,
        'class': class_result,
        'cut_words': dataset
    }
    result = pd.DataFrame(abc_dict)
    # print(result)
    result.to_csv('result.csv', index=False)
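Once result.csv is written, a quick sanity check is to count how many sentences landed in each cluster and skim a few per cluster. A hypothetical inspection snippet, based on the columns written above:

import pandas as pd

result = pd.read_csv('result.csv')
# number of sentences per cluster
print(result['class'].value_counts())
# skim a few original sentences from one cluster
print(result.loc[result['class'] == 0, 'original_sentence'].head())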