如题,本文是LDA(Latent Dirichlet Allocation)主题模型的第二篇,第一篇是 折肘法+困惑度确定LDA主题模型的主题数。在上一篇文章中,简单介绍LDA模型的概念和LDA主题模型的主题数的确定方法-折肘法+困惑度折线法。本文将基于上一篇文章中的方法,确定给定文本中的主题数=5,进而提取关键词(key word)。
LDA主题模型如何提取文本中的关键词?本文采用如下方案:
- 使用 tf-idf 对数据集中的每个词进行加权,得到加权后的向量表示;
- 通过词空间构建和向量化方法,得到给定数据集的主题-词分布;
- 计算词的分布和文档的分布的相似度,取相似度最高的keyword num个词作为关键词。
import gensim
import math
import jieba
import jieba.posseg as posseg
from jieba import analyse
from gensim import corpora, models
import functools
import numpy as np
import os
import time
from tqdm import tqdm
class TopicModel(object):
"""使用gensim的接口,将文本转为向量化表示"""
def __init__(self, doc_list, keyword_num, model='LSI', num_topics=5):
"""形参:给定数据集,关键词数,候选主题模型(可选LSI、LDA等),主题数"""
"""构建词空间,BOW模型向量化"""
self.dictionary = corpora.Dictionary(doc_list)
corpus = [self.dictionary.doc2bow(doc) for doc in doc_list]
"""对于文本中的每个单词,使用tf-idf进行加权,得到加权后的向量表示"""
self.tfidf_model = models.TfidfModel(corpus)
self.tfidf_corpus = self.tfidf_model[corpus]
self.keyword_num = keyword_num
self.num_topics = num_topics
"""加载主题模型"""
if model == 'LSI':
self.model = self.train_lsi()
else:
self.model = self.train_lda()
"""给定数据集的主题-词分布"""
word_dic = self.word_dictionary(doc_list)
self.wordtopic_dic = self.get_wordtopic(word_dic)
"""BOW向量化"""
def doc2bowvec(self, word_list):
vec_list = [1 if word in word_list else 0 for word in self.dictionary]
print("vec_list", vec_list)
return vec_list
"""词空间构建方法和向量化方法"""
def word_dictionary(self, doc_list):
dictionary = []
for doc in doc_list:
dictionary.extend(doc)
dictionary = list(set(dictionary))
return dictionary
"""给定数据集的主题-词分布"""
def get_wordtopic(self, word_dic):
wordtopic_dic = {}
for word in word_dic:
singlist = [word]
# 计算每个词的加权向量
word_corpus = self.tfidf_model[self.dictionary.doc2bow(singlist)]
# 计算每个词的主题向量
word_topic = self.model[word_corpus]
wordtopic_dic[word] = word_topic
return wordtopic_dic
"""加载主题模型"""
def train_lsi(self):
lsi = models.LsiModel(self.tfidf_corpus, id2word=self.dictionary, num_topics=self.num_topics)
return lsi
def train_lda(self):
lda = models.LdaModel(self.tfidf_corpus, id2word=self.dictionary, num_topics=self.num_topics)
return lda
"""计算词的分布和文档的分布的相似度,将相似度最高的keyword_num个词作为关键词"""
def get_simword(self, word_list):
# 文档的加权向量
sentcorpus = self.tfidf_model[self.dictionary.doc2bow(word_list)]
# 文档主题
senttopic = self.model[sentcorpus]
def calsim(l1, l2):
a, b, c = 0.0, 0.0, 0.0
for t1, t2 in zip(l1, l2):
x1 = t1[1]
x2 = t2[1]
a += x1 * x1
b += x1 * x1
c += x2 * x2
sim = a / math.sqrt(b * c) if not (b * c) == 0.0 else 0.0
return sim
# 输入文本和每个词的主题分布相似度
sim_dic = {}
for k, v in self.wordtopic_dic.items():
# 计算每个文档中的词和文档的相识度
if k not in word_list:
continue
sim = calsim(v, senttopic)
sim_dic[k] = sim
counts = {}
keyWordDict = []
for k, v in sorted(sim_dic.items(), key=functools.cmp_to_key(cmp), reverse=True)[:self.keyword_num]:
if k is not None:
keyWordDict.append(k)
return keyWordDict
# 加载数据
def load_whole_dataSet(datafolder_path):
prepared_data =[]
files = os.listdir(datafolder_path)
for file in files:
if not os.path.isdir(datafolder_path + file):
for line in open(datafolder_path+"/"+file, 'r', encoding='utf-8'):
prepared_data.append(line)
return prepared_data
# 数据预处理
def pre_dataSet(prepared_data, pos=False):
doc_list = []
for line in prepared_data:
content = line.strip()
seg_list = seg_to_list(content, pos)
filetr_list = word_filter(seg_list, pos)
doc_list.append(filetr_list)
return doc_list
# 停用词
def get_stopword_list(stopword_path):
stopword_list = [stopword.replace('\n', ' ') for stopword in open(stopword_path, encoding='gbk').readlines()]
return stopword_list
# jieba分词
def seg_to_list(sentence, pos=False):
if not pos:
seg_list = jieba.cut(sentence)
else:
seg_list = posseg.cut(sentence)
return seg_list
# 干扰词
def word_filter(seg_list, stopword_path, pos=False):
stopword_list = get_stopword_list(stopword_path)
filter_list = []
for seg in seg_list:
if not pos:
word = seg
flag = 'n'
else:
word = seg.word
flag = seg.flag
if not flag.startswith('n'):
continue
if word not in stopword_list and len(word)>1:
filter_list.append(word)
return filter_list
# 排序函数,抽取topK关键词
def cmp(e1, e2):
res = np.sign(e1[1] - e2[1])
if res != 0:
return res
else:
a = e1[0] + e2[0]
b = e2[0] + e1[0]
if a > b:
return 1
elif a == b:
return 0
else:
return -1
# 构建关键词字典,提取给定数据集的关键词
def load_whole_dataSet(datafolder_path, pos=False, model='LDA', keyword_num=100):
prepared_data =[]
files = os.listdir(datafolder_path)
counts = {}
for file in tqdm(files):
prepared_data =[]
if not os.path.isdir(file):
for line in open(datafolder_path+"/"+file, 'r', encoding='utf-8'):
content = line.strip()
seg_list = seg_to_list(content, pos)
filetr_list = word_filter(seg_list, pos)
prepared_data.append(filetr_list)
if prepared_data:
try:
topic_model = TopicModel(prepared_data, keyword_num, model=model)
except ValueError as e:
pass
for doc_list_i in prepared_data:
keyWordDict = topic_model.get_simword(doc_list_i)
for word in keyWordDict:
if len(word) == 1:
continue
else:
counts[word] = counts.get(word, 0) + 1
return counts
if __name__ == '__main__':
# 设置停用词路径
stopword_path = '~/dataSet/stop_words/stop_words.txt'
keyword_num = 5
# text为需要提取关键词的文本
text = "^ ^"
prepared_data = []
content = text.strip()
seg_list = seg_to_list(content, pos=False)
filetr_list = word_filter(seg_list, pos=False)
prepared_data.append(filetr_list)
topic_model = TopicModel(prepared_data, keyword_num, model='LDA')
counts = {}
for item in prepared_data:
keyWordDict = topic_model.get_simword(item)
for word in keyWordDict:
if len(word) == 1:
continue
else:
counts[word] = counts.get(word, 0) + 1
print(counts)