NLP Practical Learning (2): News Topic Modeling with BERTopic

Code reference: "[Hands-on Text Analytics] BERTopic, a handy tool for short-text topic modeling"

An out-of-the-box tool: BERTopic

• https://github.com/MaartenGr/BERTopic
(paper: https://arxiv.org/abs/2203.05794)
• Deep semantic embeddings + traditional clustering (see the usage sketch below):
(1) Compute deep semantic sentence embeddings with BERT
(2) Cluster the embeddings with HDBSCAN
(3) Adjust cluster granularity and extract topic keywords with c-TF-IDF
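
For comparison, the packaged bertopic library wraps all three steps behind a single class. A minimal sketch, assuming bertopic is installed and `docs` holds the (pre-tokenized, space-separated) news texts; the rest of this post builds the same pipeline by hand with sentence-transformers, UMAP, HDBSCAN, and c-TF-IDF:

from bertopic import BERTopic

# `docs` is assumed to be a list of Chinese news strings, already tokenized
# and joined with spaces so the default CountVectorizer can split them
topic_model = BERTopic(embedding_model="paraphrase-multilingual-MiniLM-L12-v2")
topics, probs = topic_model.fit_transform(docs)
print(topic_model.get_topic_info().head())   # topic ids, sizes, and labels
print(topic_model.get_topic(0))              # top words of topic 0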

Dataset:

Link: https://pan.baidu.com/s/1I8HLhNvFQCj5ogNpa0qDyw
Extraction code: 0lwt
2.5 million news articles (9 GB raw, 3.6 GB compressed; the articles span 2014-2016 and come from about 63,000 media outlets)
JSON fields: title, content (body text), keywords, desc (description), source


1. Data preprocessing: read the JSON file, tokenize, strip punctuation and other non-Chinese characters, remove stopwords, and save the result to a CSV file (keeping the "keywords", "title", "desc", and "content" fields)

# Unzip the dataset archive
import zipfile
zf = zipfile.ZipFile('./data/new2016zh.zip')
print(zf.namelist())
zf.extractall()
zf.close()


# Read the JSON file (one JSON object per line); the smaller validation split
# is used here, matching the CSV written below and the row counts reported later
import json
with open('./data/news2016zh_valid.json', 'r', encoding="utf-8") as f:
    lines = f.readlines()

# Preprocess the data and save it to a CSV file
import csv
import os
import jieba
import re

stopwords = [i.strip() for i in open('./cn_stop_words.txt',"r", encoding="utf-8").readlines()]

# Tokenize (keeping only Chinese characters) and remove stopwords
def pretty_cut(sentence):
    cut_list = jieba.lcut(''.join(re.findall('[\u4e00-\u9fa5]', sentence)), cut_all=True)
    for i in range(len(cut_list) - 1, -1, -1):
        if cut_list[i] in stopwords:
            del cut_list[i]
    return cut_list

with open(os.path.join("./data/news2016zh_valid.csv"), "w", encoding="utf-8", newline='') as g:
    writer = csv.writer(g)
    writer.writerow(["keywords", "title", "desc", "content"])
    for line in lines:
        news = json.loads(line)
        keywords = news["keywords"].strip(" ")
        title = news["title"].strip(" ")
        desc = news["desc"].strip(" ")
        content = news["content"].strip(" ")
        cut_keywords = " ".join(pretty_cut(keywords))
        cut_title = " ".join(pretty_cut(title))
        cut_desc = " ".join(pretty_cut(desc))
        cut_content = " ".join(pretty_cut(content))
        writer.writerow([cut_keywords, cut_title, cut_desc, cut_content])
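
Before running the full file through this loop, it can help to spot-check the tokenizer on a single made-up sentence (a small sketch, not part of the original script):

# Digits, punctuation and stopwords should all be stripped from the output
print(pretty_cut("2016年6月3日,某新闻网站发布了一篇关于人工智能的报道。"))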

2. Import the required libraries

import sys
import numpy as np
import pandas as pd
import jieba
import umap
import hdbscan
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import matplotlib.pyplot as plt

3. Load the text preprocessed above

data = pd.read_csv('/content/drive/MyDrive/news2016zh_valid.csv')
data.head()
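
A quick sanity check of what was loaded (a sketch; the exact row count depends on the split processed above):

print(data.shape)
print(data.columns.tolist())   # expected: ['keywords', 'title', 'desc', 'content']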

4. Handle invalid values: nulls and floats

# Handle invalid values
print("There are %d null values in the content column." % data['content'].isnull().sum())
data[data.isnull().values==True]  # isnull returns a boolean array
data = data[pd.notnull(data['content'])]  # keep only rows whose content is not null

data['content'] = data['content'].astype(str)  # cast every value to str
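
Because the cleaning step keeps only Chinese characters, some rows may have ended up as empty strings rather than NaN. A hedged extra check (not in the original); such rows could also be dropped if they appear:

# Count rows whose content is now an empty string; they carry no signal
empty_rows = (data['content'].str.strip().str.len() == 0).sum()
print("Empty-content rows:", empty_rows)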


5. Create sentence embeddings

%%time
# Create sentence embeddings with a multilingual Sentence-BERT model
# (the %%time cell magic must be the first line of the notebook cell)
# Alternative wrapper: model = SBert('paraphrase-multilingual-MiniLM-L12-v2')
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
embeddings = model.encode(data['content'].tolist(), show_progress_bar=True)


embeddings.shape
#(75787, 384)
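
Encoding roughly 76k documents is the slowest step, so it is worth caching the result; a small sketch using NumPy (the file name is arbitrary):

# Cache the embeddings so the expensive encode() call is not repeated
np.save("news_valid_embeddings.npy", embeddings)
# Later runs can reload them with: embeddings = np.load("news_valid_embeddings.npy")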

6. Reduce the dimensionality of the embeddings with UMAP

sys.setrecursionlimit(1000000)
umap_embeddings = umap.UMAP(n_neighbors=25,
                            n_components=10,
                            min_dist=0.0,
                            metric='cosine',
                            random_state=2020).fit_transform(embeddings)
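
The choices here: n_components=10 keeps enough structure for HDBSCAN while being far smaller than the 384-dimensional embeddings, min_dist=0.0 lets points pack tightly (which suits clustering), the cosine metric matches how sentence embeddings are usually compared, and a fixed random_state makes the run reproducible. A quick shape check (sketch):

# One 10-dimensional row per document is expected after reduction
print(umap_embeddings.shape)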

7. Cluster the documents with HDBSCAN

cluster = hdbscan.HDBSCAN(min_cluster_size=100,
                          metric='euclidean',
                          cluster_selection_method='eom', 
                          prediction_data=True).fit(umap_embeddings)

# Prepare a separate 2-D UMAP projection, used only for visualization
umap_data = umap.UMAP(n_neighbors=15, 
                      n_components=2, 
                      min_dist=0.0,
                      metric='cosine').fit_transform(embeddings)

result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_

# Visualize clusters
fig, ax = plt.subplots(figsize=(25, 15))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
plt.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
plt.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.05, cmap='hsv_r')
plt.colorbar()
plt.savefig("result1.png", dpi = 300)
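
HDBSCAN labels documents it cannot confidently assign with -1 ("noise"). A quick check (sketch) of how many clusters were found and how many documents were left unassigned:

# Label -1 is noise, not a cluster, so it is excluded from the cluster count
n_clusters = cluster.labels_.max() + 1
n_noise = int((cluster.labels_ == -1).sum())
print("clusters:", n_clusters, "noise docs:", n_noise,
      "({:.1%})".format(n_noise / len(cluster.labels_)))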

8. c-TF-IDF

def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """ Calculate a class-based TF-IDF where m is the number of total documents. """
    my_stopwords = [i.strip() for i in open('/content/drive/MyDrive/cn_stop_words.txt', encoding='utf-8').readlines()]
    count = CountVectorizer(ngram_range=ngram_range,
                            # tokenizer=lambda x: ' '.join(jieba.lcut(x)),
                            stop_words=my_stopwords).fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    tf = np.divide(t.T, w)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    words = count.get_feature_names()
    labels = list(docs_per_topic.Topic)
    tf_idf_transposed = tf_idf.T
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1] for i, label in enumerate(labels)}
    return top_n_words

def extract_topic_sizes(df):
    topic_sizes = (df.groupby(['Topic'])
                     .Doc
                     .count()
                     .reset_index()
                     .rename({"Topic": "Topic", "Doc": "Size"}, axis='columns')
                     .sort_values("Size", ascending=False))
    return topic_sizes
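
The idea behind c-TF-IDF: all documents in a cluster are concatenated into one "class document". For term t and class c, the score is tf(t, c) * idf(t) = (count of t in c / total tokens in c) * log(m / count of t across all classes), where m is the number of original documents; this is exactly what c_tf_idf computes above. A toy usage sketch with two invented "class documents" (it reuses the stopword file path hard-coded in the function):

# Two pretend clusters, already concatenated and space-tokenized, with m = 4
# pretend source documents in total
toy_classes = ["股市 上涨 股票 基金 上涨", "球队 比赛 进球 球迷 比赛"]
toy_tf_idf, toy_count = c_tf_idf(toy_classes, m=4)
toy_words = toy_count.get_feature_names()
for col in range(toy_tf_idf.shape[1]):
    best = toy_tf_idf[:, col].argmax()          # highest-scoring term per class
    print("class", col, "->", toy_words[best])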

9. Compute the top topic words for each topic

docs_df = pd.DataFrame(data['content'].tolist(), columns=["Doc"]) 
docs_df['Topic'] = cluster.labels_
docs_df['Doc_ID'] = range(len(docs_df))
docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})
# Number of topics produced (including the -1 "noise" cluster)
len(docs_per_topic.Doc.tolist())
# 238

# Take the words with the highest c-TF-IDF scores in each cluster as its topic words
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m = len(data))

top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)


10. Inspect the topic words of the topic with index 10:

top_n_words[10]

11. View all current topics and their topic word lists

# Topic -1 is HDBSCAN's "noise" cluster; look at its words here and skip it in the loop below
top_n_words[-1]
# List all topics and their corresponding topic words
from pprint import pprint
for i in list(range(len(top_n_words) - 1)):
    print('Top 20 important words in TOPIC {} :\n'.format(i))
    pprint(top_n_words[i])
    pprint('***'*20)
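
With a couple of hundred topics, printing everything to the console is unwieldy. A hedged sketch (the file name is arbitrary) that writes each topic's word list to a text file for easier review:

# Write the top words of every topic except the -1 noise cluster to a file
with open("topic_words.txt", "w", encoding="utf-8") as f:
    for topic_id in sorted(k for k in top_n_words if k != -1):
        words = [w for w, _ in top_n_words[topic_id]]
        f.write("TOPIC {}: {}\n".format(topic_id, " ".join(words)))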

12. Merge similar topics

for i in tqdm(range(20)):
    # Calculate cosine similarity
    similarities = cosine_similarity(tf_idf.T)
    np.fill_diagonal(similarities, 0)

    # Find the smallest topic and the topic it is most similar to
    # (the +1 / -1 offsets exist because topic -1 occupies the first row/column
    # of the class-based TF-IDF matrix, i.e. class index = topic id + 1)
    topic_sizes = docs_df.groupby(['Topic']).count().sort_values("Doc", ascending=False).reset_index()
    topic_to_merge = topic_sizes.iloc[-1].Topic
    topic_to_merge_into = np.argmax(similarities[topic_to_merge + 1]) - 1

    # Adjust topics
    docs_df.loc[docs_df.Topic == topic_to_merge, "Topic"] = topic_to_merge_into
    old_topics = docs_df.sort_values("Topic").Topic.unique()
    map_topics = {old_topic: index - 1 for index, old_topic in enumerate(old_topics)}
    docs_df.Topic = docs_df.Topic.map(map_topics)
    docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

    # Calculate new topic words
    m = len(data)
    tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m)
    top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)

topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)

# Number of topics after merging
len(docs_per_topic.Doc.tolist())
# 218
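
Finally, a hedged sketch (column and file names are arbitrary) of exporting each document's merged topic assignment together with its topic words, so the results can be joined back to the original news data:

# Attach each topic's top words and save the assignments to CSV
docs_df["Top_Words"] = docs_df.Topic.map(
    lambda t: " ".join(w for w, _ in top_n_words[t]))
docs_df[["Doc_ID", "Topic", "Top_Words"]].to_csv("news_topics.csv",
                                                 index=False, encoding="utf-8")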
