Code reference: [Hands-on Text Analysis] BERTopic, a tool for short-text topic modeling
• https://github.com/MaartenGr/BERTopic
(Paper: https://arxiv.org/abs/2203.05794)
• Deep semantic embeddings + a traditional clustering pipeline (a library-level sketch follows this list):
(1) Compute deep semantic sentence embeddings with BERT
(2) Cluster the embeddings with HDBSCAN
(3) Use c-TF-IDF to adjust the cluster granularity and extract topic keywords
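The walkthrough below rebuilds this pipeline by hand. For comparison, here is a minimal sketch of the same workflow through the BERTopic library itself (assuming the bertopic package is installed; the parameter choices are illustrative and not taken from the original post):
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
# docs: the list of tokenized news documents produced in step 1 below
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
topic_model = BERTopic(embedding_model=embedding_model, calculate_probabilities=False)
topics, _ = topic_model.fit_transform(docs)
print(topic_model.get_topic_info().head())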
Link: https://pan.baidu.com/s/1I8HLhNvFQCj5ogNpa0qDyw
Extraction code: 0lwt
2.5 million news articles (9 GB raw, 3.6 GB compressed; the articles span 2014-2016 and come from roughly 63,000 media sources)
JSON format: title (headline), content (body), keywords, desc (description), source
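Each line of the raw file is a standalone JSON object. A quick check of the field names before preprocessing (the actual key set may contain more fields than the five listed above):
import json
with open('./data/news2016zh_train.json', 'r', encoding='utf-8') as f:
    first_record = json.loads(f.readline())
print(first_record.keys())  # should include at least: keywords, title, desc, content, source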
1. Data preprocessing: read the JSON file, tokenize, remove punctuation and stop words, and save the result to a CSV file (keeping "keywords", "title", "desc", "content")
# Unzip the raw data
import zipfile
zf = zipfile.ZipFile('./data/new2016zh.zip')
print(zf.namelist())
zf.extractall()
zf.close()
# Read the JSON file (one JSON object per line)
import json
with open('./data/news2016zh_train.json', 'r', encoding="utf-8") as f:
    lines = f.readlines()
# Preprocess the data and write it to a CSV file
import csv
import os
import jieba
import re
stopwords = [i.strip() for i in open('./cn_stop_words.txt', "r", encoding="utf-8").readlines()]
# Tokenize (keeping only Chinese characters) and remove stop words
def pretty_cut(sentence):
    # cut_all=True is jieba's full mode, which emits overlapping segments
    cut_list = jieba.lcut(''.join(re.findall('[\u4e00-\u9fa5]', sentence)), cut_all=True)
    for i in range(len(cut_list) - 1, -1, -1):
        if cut_list[i] in stopwords:
            del cut_list[i]
    return cut_list
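# Quick sanity check of pretty_cut on a short made-up sentence
# (illustrative only; the output depends on the stop-word list):
print(pretty_cut("今天北京的天气很好"))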
with open(os.path.join("./data/news2016zh_valid.csv"), "w", encoding="utf-8", newline='') as g:
    writer = csv.writer(g)
    writer.writerow(["keywords", "title", "desc", "content"])
    for line in lines:
        news = json.loads(line)
        keywords = news["keywords"].strip(" ")
        title = news["title"].strip(" ")
        desc = news["desc"].strip(" ")
        content = news["content"].strip(" ")
        cut_keywords = " ".join(pretty_cut(keywords))
        cut_title = " ".join(pretty_cut(title))
        cut_desc = " ".join(pretty_cut(desc))
        cut_content = " ".join(pretty_cut(content))
        writer.writerow([cut_keywords, cut_title, cut_desc, cut_content])
2. Import the required libraries
import sys
import numpy as np
import pandas as pd
import jieba
import umap
import hdbscan
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import matplotlib.pyplot as plt
3. Read the preprocessed text
data = pd.read_csv('/content/drive/MyDrive/news2016zh_valid.csv')
data.head()
4. Handle missing values
print("There are %d null values in the content column." % data['content'].isnull().sum())
data[data.isnull().values == True]  # isnull returns a boolean array
data = data[pd.notnull(data['content'])]  # keep only rows whose content is not null
data['content'] = data['content'].astype(str)  # make sure every entry is a string
5. Create the sentence embeddings
%%time
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
embeddings = model.encode(data['content'].tolist(), show_progress_bar=True)
embeddings.shape
# (75787, 384)
6. Reduce the dimensionality of the sentence embeddings
sys.setrecursionlimit(1000000)
umap_embeddings = umap.UMAP(n_neighbors=25,
                            n_components=10,
                            min_dist=0.0,
                            metric='cosine',
                            random_state=2020).fit_transform(embeddings)
7. Cluster the documents with HDBSCAN
cluster = hdbscan.HDBSCAN(min_cluster_size=100,
                          metric='euclidean',
                          cluster_selection_method='eom',
                          prediction_data=True).fit(umap_embeddings)
# Project the embeddings to 2D for plotting
umap_data = umap.UMAP(n_neighbors=15,
                      n_components=2,
                      min_dist=0.0,
                      metric='cosine').fit_transform(embeddings)
result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_
# Visualize clusters (grey points are the -1 noise cluster)
fig, ax = plt.subplots(figsize=(25, 15))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
plt.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
plt.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.05, cmap='hsv_r')
plt.colorbar()
plt.savefig("result1.png", dpi=300)
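Before extracting keywords, it is worth checking how many clusters HDBSCAN found and what fraction of documents were labeled as noise (a small sketch using the fitted cluster object; the exact numbers depend on the run):
labels = cluster.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)  # exclude the -1 noise label
noise_ratio = float(np.mean(labels == -1))
print("clusters: {}, noise ratio: {:.2%}".format(n_clusters, noise_ratio))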
8. Define the c-TF-IDF helper functions
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Calculate a class-based TF-IDF, where m is the total number of documents."""
    my_stopwords = [i.strip() for i in open('/content/drive/MyDrive/cn_stop_words.txt', encoding='utf-8').readlines()]
    count = CountVectorizer(ngram_range=ngram_range,
                            # tokenizer=lambda x: ' '.join(jieba.lcut(x)),
                            stop_words=my_stopwords).fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)                                 # total word count per class (topic)
    tf = np.divide(t.T, w)                            # class-level term frequencies
    sum_t = t.sum(axis=0)                             # frequency of each term across all classes
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)  # class-based IDF
    tf_idf = np.multiply(tf, idf)
    return tf_idf, count
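In the function above, the score of term t in topic (class) c is tf_{t,c} * log(m / f_t), where tf_{t,c} is the count of t in c normalized by the total word count of c, m is the total number of original documents, and f_t is the total count of t across all classes. The BERTopic paper uses a smoothed variant, tf_{t,c} * log(1 + A / f_t), with A the average number of words per class, so the keywords extracted here may differ slightly from what the bertopic package would return.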
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    words = count.get_feature_names()  # on scikit-learn >= 1.0, use count.get_feature_names_out()
    labels = list(docs_per_topic.Topic)
    tf_idf_transposed = tf_idf.T
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1] for i, label in enumerate(labels)}
    return top_n_words
def extract_topic_sizes(df):
    topic_sizes = (df.groupby(['Topic'])
                     .Doc
                     .count()
                     .reset_index()
                     .rename({"Topic": "Topic", "Doc": "Size"}, axis='columns')
                     .sort_values("Size", ascending=False))
    return topic_sizes
9. Compute the top keywords for each topic
docs_df = pd.DataFrame(data['content'].tolist(), columns=["Doc"])
docs_df['Topic'] = cluster.labels_
docs_df['Doc_ID'] = range(len(docs_df))
docs_per_topic = docs_df.groupby(['Topic'], as_index=False).agg({'Doc': ' '.join})
# Number of topics produced (including the -1 noise cluster)
len(docs_per_topic.Doc.tolist())
# 238
# For each cluster, keep the words with the highest c-TF-IDF scores as its topic keywords
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(data))
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)
10. Inspect the topics and their keywords
# Keywords of the topic with index 10:
top_n_words[10]
# The cluster with index -1 is the "noise" cluster; it is excluded from the per-topic listing below:
top_n_words[-1]
# Print every topic and its keyword list
from pprint import pprint
for i in list(range(len(top_n_words) - 1)):
    print('Top 20 most important words in TOPIC {} :\n'.format(i))
    pprint(top_n_words[i])
    pprint('***' * 20)
11. Topic reduction: iteratively merge the smallest topic into its most similar topic
for i in tqdm(range(20)):
    # Calculate cosine similarity between the topic c-TF-IDF vectors
    similarities = cosine_similarity(tf_idf.T)
    np.fill_diagonal(similarities, 0)
    # Pick the smallest topic and the topic it should be merged into
    topic_sizes = docs_df.groupby(['Topic']).count().sort_values("Doc", ascending=False).reset_index()
    topic_to_merge = topic_sizes.iloc[-1].Topic
    topic_to_merge_into = np.argmax(similarities[topic_to_merge + 1]) - 1
    # Adjust topics and renumber them so the labels stay contiguous
    docs_df.loc[docs_df.Topic == topic_to_merge, "Topic"] = topic_to_merge_into
    old_topics = docs_df.sort_values("Topic").Topic.unique()
    map_topics = {old_topic: index - 1 for index, old_topic in enumerate(old_topics)}
    docs_df.Topic = docs_df.Topic.map(map_topics)
    docs_per_topic = docs_df.groupby(['Topic'], as_index=False).agg({'Doc': ' '.join})
    # Recalculate the topic keywords
    m = len(data)
    tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m)
    top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)
# Number of topics after merging
len(docs_per_topic.Doc.tolist())
# 218
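As a final sanity check, the helpers defined above can be reused to print the keywords of the largest remaining topics (a minimal sketch; the topic ids shown will depend on the merge order):
# Print the top 10 keywords of the 5 largest topics after reduction, skipping the -1 noise cluster
for topic_id in topic_sizes.Topic.head(6):
    if topic_id == -1:
        continue
    words = [w for w, _ in top_n_words[topic_id][:10]]
    print("Topic {}: {}".format(topic_id, " / ".join(words)))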