Contents
1. Data
2. Code
2.1 Loading stop words
2.2 Loading the data
2.3 Computing TF-IDF vectors
2.4 Training
3. Full code
The data was crawled from Tieba (how the crawling was done is not covered here) and saved to a txt file, one sentence per line. Each sentence is then segmented and converted to a vector, and finally KMeans clusters the sentences and outputs the result.
The stop_words directory contains several stop-word lists; we loop over them and merge all stop words into one list.
def defined_stop_words():
    all_stop_words = []
    for i, file in enumerate(os.listdir(r'D:\Gitlab\extract_key\stop_words')):
        # read one stop-word file
        filepath = fr'D:\Gitlab\extract_key\stop_words\{file}'
        with open(filepath, 'r', encoding='utf-8') as fp:
            all_line = fp.readlines()
            for line in all_line:
                all_stop_words.append(line.replace('\n', ''))
    return all_stop_words
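Since every token is later checked against this list, it can help to wrap the result in a set so the membership test is O(1) instead of a linear scan; a small usage sketch of the function above:

# Convert the merged list to a set for constant-time lookups during filtering.
stop_words = set(defined_stop_words())
print(len(stop_words), 'unique stop words loaded')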
This step mainly filters the raw data, segments each sentence with jieba, and removes stop words, which is a fairly standard preprocessing pipeline.
def loadDataset(filepath):
    '''Load the text dataset'''
    dataset = []
    key_list = ['公司','项目','专业投资团队','元宇宙投资项目','养老项目','养老服务','老年产品','高回报','理财','募集','贷款','抵押','利息','保险','包赔','高利贷']
    with open(filepath, 'r', encoding='utf-8') as fp:
        all_line = fp.readlines()
        for line in all_line:
            dataset.append(line.replace('\n', ''))
    # print(len(dataset))
    # # randomly sample a subset
    # dataset = random.sample(dataset, 10000)
    # print(len(dataset))

    # load stop words
    stop_words = defined_stop_words()
    all_sen = []
    original_sen = []
    for sen in list(set(dataset)):
        # keep only sentences that contain at least one keyword
        for key in key_list:
            if operator.contains(sen, key):
                sentence = ""
                # segment the sentence with jieba
                word = jieba_postag(sen)
                for w in word:
                    # drop stop words
                    if w.word not in stop_words:
                        sentence += w.word + ' '
                all_sen.append(sentence)
                original_sen.append(sen)
                break
    # original sentences and their segmented counterparts
    return original_sen, all_sen
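A quick sanity check of the two parallel lists that loadDataset returns (the path is the same one used in the full script below):

original_sen, all_sen = loadDataset(r'D:\Gitlab\extract_key\all.txt')
print(len(original_sen) == len(all_sen))   # aligned index by index
print(original_sen[0])                     # raw sentence containing a keyword
print(all_sen[0])                          # space-joined tokens fed to TfidfVectorizer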
transform returns X, the TF-IDF vectors of the input dataset; choose the parameters that suit your data.
def transform(dataset, n_features=1000):
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features, min_df=2, use_idf=True)
    X = vectorizer.fit_transform(dataset)
    return X, vectorizer
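Continuing from the loadDataset call above, a small sketch for inspecting the output; get_feature_names_out is the name in scikit-learn 1.0+ (older releases use get_feature_names):

X, vectorizer = transform(all_sen, n_features=500)
print(X.shape)                                    # (number of sentences, number of features), sparse matrix
print(vectorizer.get_feature_names_out()[:20])    # a peek at the learned vocabulary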
Here KMeans is used with a manually chosen k, which is a rather under-considered approach.
def train(X, vectorizer, true_k=10, minibatch=False):
    # train k-means either on mini-batches or on the full data
    if minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                    verbose=False)
    km.fit(X)
    # save the model
    save_model_file(km, 'Kmeans.pkl')
    result = list(km.predict(X))
    print('Cluster distribution:')
    print(dict([(i, result.count(i)) for i in result]))
    # km.score returns the negative inertia, so negate it to get a positive value
    return -km.score(X), result
Parts of this need to be adjusted to your actual data; the task here is text clustering, and blindly fixing k at 100 is not very reasonable. If you are interested, have a look at the Canopy algorithm, which can derive a suitable k from your dataset; combining Canopy + KMeans may work better.
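The standard Canopy pass is easy to sketch on top of the TF-IDF matrix X. The cosine distance and the thresholds t1/t2 below are assumptions that you would need to tune for your own data, and the number of canopies only gives a rough starting value for k:

from sklearn.metrics.pairwise import cosine_distances

def canopy(X, t1=0.9, t2=0.6):
    # One pass of the Canopy idea over the TF-IDF rows (cosine distance).
    # t1 is the loose threshold for canopy membership, t2 the tight one for removing
    # points from further consideration; t1 > t2 and both need tuning for your data.
    assert t1 > t2
    remaining = list(range(X.shape[0]))
    canopies = []
    while remaining:
        center = remaining.pop(0)                 # an arbitrary point becomes a canopy center
        if not remaining:
            canopies.append([center])
            break
        dist = cosine_distances(X[center], X[remaining]).ravel()
        members = [p for p, d in zip(remaining, dist) if d < t1]
        canopies.append([center] + members)
        # points inside the tight threshold t2 are dropped and cannot start new canopies
        remaining = [p for p, d in zip(remaining, dist) if d >= t2]
    return canopies

# k_estimate = len(canopy(X))   # then pass it as true_k to train()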
from __future__ import print_function
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
import pandas as pd
import sys
import os
import jieba.posseg as pseg
import operator
import random
# sklearn.externals.joblib was removed from newer scikit-learn; use the standalone package
import joblib


def save_model_file(model, save_model_name):
    joblib.dump(model, save_model_name)


def jieba_postag(text):
    words = pseg.cut(text)
    return words


def defined_stop_words():
    all_stop_words = []
    for i, file in enumerate(os.listdir(r'D:\Gitlab\extract_key\stop_words')):
        # read one stop-word file
        filepath = fr'D:\Gitlab\extract_key\stop_words\{file}'
        with open(filepath, 'r', encoding='utf-8') as fp:
            all_line = fp.readlines()
            for line in all_line:
                all_stop_words.append(line.replace('\n', ''))
    return all_stop_words


def loadDataset(filepath):
    '''Load the text dataset'''
    dataset = []
    key_list = ['公司','项目','专业投资团队','元宇宙投资项目','养老项目','养老服务','老年产品','高回报','理财','募集','贷款','抵押','利息','保险','包赔','高利贷']
    with open(filepath, 'r', encoding='utf-8') as fp:
        all_line = fp.readlines()
        for line in all_line:
            dataset.append(line.replace('\n', ''))
    # print(len(dataset))
    # # randomly sample a subset
    # dataset = random.sample(dataset, 10000)
    # print(len(dataset))

    # load stop words
    stop_words = defined_stop_words()
    all_sen = []
    original_sen = []
    for sen in list(set(dataset)):
        # keep only sentences that contain at least one keyword
        for key in key_list:
            if operator.contains(sen, key):
                sentence = ""
                # segment the sentence with jieba
                word = jieba_postag(sen)
                for w in word:
                    # drop stop words
                    if w.word not in stop_words:
                        sentence += w.word + ' '
                all_sen.append(sentence)
                original_sen.append(sen)
                break
    return original_sen, all_sen


def transform(dataset, n_features=1000):
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features, min_df=2, use_idf=True)
    X = vectorizer.fit_transform(dataset)
    return X, vectorizer


def train(X, vectorizer, true_k=10, minibatch=False):
    # train k-means either on mini-batches or on the full data
    if minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                    verbose=False)
    km.fit(X)
    # save the model
    save_model_file(km, 'Kmeans.pkl')
    result = list(km.predict(X))
    print('Cluster distribution:')
    print(dict([(i, result.count(i)) for i in result]))
    # km.score returns the negative inertia, so negate it to get a positive value
    return -km.score(X), result


def test():
    '''Test run for choosing suitable parameters'''
    # load the data
    filepath = r'D:\Gitlab\extract_key\all.txt'
    original_data, dataset = loadDataset(filepath)
    X, vectorizer = transform(dataset, n_features=500)
    train_score, class_result = train(X, vectorizer, true_k=100)
    score = train_score / len(dataset)
    print(score)
    abc_dict = {
        'original_sentence': original_data,
        'class': class_result,
        'cut_words': dataset
    }
    result = pd.DataFrame(abc_dict)
    # print(result)
    result.to_csv('result.csv', index=False)


if __name__ == '__main__':
    test()
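Note that the script only persists Kmeans.pkl; to assign clusters to new sentences later you also need the fitted vectorizer. A minimal sketch, assuming you additionally call save_model_file(vectorizer, 'tfidf.pkl') inside test() (that file name is hypothetical):

import joblib
import jieba.posseg as pseg

km = joblib.load('Kmeans.pkl')
vectorizer = joblib.load('tfidf.pkl')     # hypothetical: only exists if you also save the vectorizer

new_texts = ['养老项目高回报理财']          # example input in the same domain
# segment with jieba as in loadDataset (stop-word removal omitted here for brevity)
cut = [' '.join(w.word for w in pseg.cut(t)) for t in new_texts]
print(km.predict(vectorizer.transform(cut)))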