I. Using the gensim Python package
1. Install gensim, sklearn, and nltk
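For reference, assuming a pip-based environment: pip install gensim scikit-learn nltk. The NLTK stopword list used in the preprocessing step below also has to be downloaded once, e.g.:

import nltk
nltk.download('stopwords')  # one-time download of the stopword corpus used in preprocess_texts()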
2. LDA implementation in Python:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__title__ = 'topic model - build lda - 20news dataset'
__author__ = 'pi'
__mtime__ = '12/26/2014-026'
"""
from Colors import *  # the author's local module with terminal colour constants (REDH, REDL, RED, GREENL, DEFAULT)
from collections import defaultdict
import re
import datetime
from sklearn import datasets
import nltk
from gensim import corpora
from gensim import models
import numpy as np
from scipy import spatial
from CorePyPro.Fun.TimeStump import totalTime  # the author's local timing helper (unused below)


def load_texts(dataset_type='train', groups=None):
    """
    load datasets to bytes list
    :return: train_dataset_bunch.data bytes list
    """
    if groups == 'small':
        groups = ['comp.graphics', 'comp.os.ms-windows.misc']  # for quick tests on a small subset  #1368
    elif groups == 'medium':
        groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
                  'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']  # medium-sized subset  #3414
    train_dataset_bunch = datasets.load_mlcomp('20news-18828', dataset_type, mlcomp_root='./datasets',
                                               categories=groups)  # 13180
    return train_dataset_bunch.data


def preprocess_texts(texts, test_doc_id=1):
    """
    texts preprocessing
    :param texts: bytes list
    :return: list of token lists
    """
    texts = [t.decode(errors='ignore') for t in texts]  # bytes2str
    # print(REDH, 'original texts[%d]: ' % test_doc_id, DEFAULT, '\n', texts[test_doc_id])
    # split_texts = [t.lower().split() for t in texts]
    # print(REDH, 'split texts[%d]: #%d' % (test_doc_id, len(split_texts)), DEFAULT, '\n', split_texts[test_doc_id])

    # lowercase, split on separators and drop empty tokens
    SEPS = r'[\s()-/,:.?!]\s*'
    texts = [re.split(SEPS, t.lower()) for t in texts]
    for t in texts:
        while '' in t:
            t.remove('')
    # print(REDH, 'texts[%d] lower & split(seps=%s) & delete None: #%d'
    #       % (test_doc_id, SEPS, len(texts[test_doc_id])), DEFAULT, '\n', texts[test_doc_id])

    # nltk.download()  # then choose the stopwords corpus
    stopwords = set(nltk.corpus.stopwords.words('english'))  # #127
    stopwords.update(['from', 'subject', 'writes'])  # #129
    word_usage = defaultdict(int)
    for t in texts:
        for w in t:
            word_usage[w] += 1
    COMMON_LINE = len(texts) / 10
    # words occurring in more than a tenth of the documents are treated as stopwords
    too_common_words = [w for w in word_usage if word_usage[w] > COMMON_LINE]
    # print('too_common_words: #', len(too_common_words), '\n', too_common_words)  # 68
    stopwords.update(too_common_words)
    # print('stopwords: #', len(stopwords), '\n', stopwords)  # #147

    english_stemmer = nltk.SnowballStemmer('english')
    MIN_WORD_LEN = 3  # 4
    texts = [[english_stemmer.stem(w) for w in t
              if not set(w) & set('@+>0123456789*') and w not in stopwords and len(w) >= MIN_WORD_LEN]
             for t in texts]  # set('+-.?!()>@0123456789*/')
    # print(REDH, 'texts[%d] delete ^alphanum & stopwords & len<%d & stemmed: #' % (test_doc_id, MIN_WORD_LEN),
    #       len(texts[test_doc_id]), DEFAULT, '\n', texts[test_doc_id])
    return texts


def build_corpus(texts):
    """
    build corpora
    :param texts: list of token lists
    :return: corpus DirectTextCorpus(corpora.TextCorpus)
    """
    class DirectTextCorpus(corpora.TextCorpus):
        def get_texts(self):
            return self.input

        def __len__(self):
            return len(self.input)

    corpus = DirectTextCorpus(texts)
    return corpus


def build_id2word(corpus):
    """
    from corpus build id2word=dict
    :param corpus:
    :return: dict = corpus.dictionary
    """
    dict = corpus.dictionary  # gensim.corpora.dictionary.Dictionary
    # print(dict.id2token)
    try:
        dict['anything']  # looking up any key forces the id2token mapping to be built
    except:
        pass
    # print("dict.id2token is not {} now")
    # print(dict.id2token)
    return dict


def save_corpus_dict(dict, corpus, dictDir='./LDA/id_word.dict', corpusDir='./LDA/corpus.mm'):
    dict.save(dictDir)
    print(GREENL, 'dict saved into %s successfully ...' % dictDir, DEFAULT)
    corpora.MmCorpus.serialize(corpusDir, corpus)
    print(GREENL, 'corpus saved into %s successfully ...' % corpusDir, DEFAULT)
    # corpus.save(fname='./LDA/corpus.mm')  # stores only the (tiny) iteration object


def load_ldamodel(modelDir='./lda.pkl'):
    model = models.LdaModel.load(fname=modelDir)
    print(GREENL, 'ldamodel load from %s successfully ...' % modelDir, DEFAULT)
    return model


def load_corpus_dict(dictDir='./LDA/id_word.dict', corpusDir='./LDA/corpus.mm'):
    dict = corpora.Dictionary.load(fname=dictDir)
    print(GREENL, 'dict load from %s successfully ...' % dictDir, DEFAULT)
    # dict = corpora.Dictionary.load_from_text('./id_word.txt')
    corpus = corpora.MmCorpus(corpusDir)  # corpora.mmcorpus.MmCorpus
    print(GREENL, 'corpus load from %s successfully ...' % corpusDir, DEFAULT)
    return dict, corpus


def build_doc_word_mat(corpus, model, num_topics):
    """
    build the document representation in topic space
    :param corpus:
    :param model:
    :param num_topics: int
    :return: doc_word_mat np.array (num_docs * num_topics)
    """
    topics = [model[c] for c in corpus]  # each doc becomes a (topic_id, weight) list
    doc_word_mat = np.zeros((len(topics), num_topics))
    for doc, topic in enumerate(topics):
        for topic_id, weight in topic:
            doc_word_mat[doc, topic_id] += weight
    return doc_word_mat


def compute_pairwise_dist(doc_word_mat):
    """
    compute pairwise dist
    :param doc_word_mat: np.array (num_docs * num_topics)
    :return: pairwise_dist <class 'numpy.ndarray'>
    """
    pairwise_dist = spatial.distance.squareform(spatial.distance.pdist(doc_word_mat))
    max_weight = pairwise_dist.max() + 1
    for i in list(range(len(pairwise_dist))):
        pairwise_dist[i, i] = max_weight  # keep a document from being its own nearest neighbour
    return pairwise_dist


def closest_texts(corpus, model, num_topics, original_texts, test_doc_id=1, topn=5):
    """
    find the closest_doc_ids for doc[test_doc_id]
    :param original_texts: the raw texts, used only for printing the matches
    """
    doc_word_mat = build_doc_word_mat(corpus, model, num_topics)
    pairwise_dist = compute_pairwise_dist(doc_word_mat)
    # print(REDH, 'original texts[%d]: ' % test_doc_id, DEFAULT, '\n', original_texts[test_doc_id])
    closest_doc_ids = pairwise_dist[test_doc_id].argsort()
    # return closest_doc_ids[:topn]
    for closest_doc_id in closest_doc_ids[:topn]:
        print(RED, 'closest doc[%d]' % closest_doc_id, DEFAULT, '\n', original_texts[closest_doc_id])


def evaluate_model(model):
    """
    compute the model's perplexity on the test data
    :param model:
    :return: model.log_perplexity float
    """
    test_texts = load_texts(dataset_type='test', groups='small')
    test_texts = preprocess_texts(test_texts)
    test_corpus = build_corpus(test_texts)
    return model.log_perplexity(test_corpus)


def test_num_topics():
    dict, corpus = load_corpus_dict()
    print("#corpus_items:", len(corpus))
    for num_topics in [3, 5, 10, 30, 50, 100, 150, 200, 300]:
        start_time = datetime.datetime.now()
        model = models.LdaModel(corpus, num_topics=num_topics, id2word=dict)
        end_time = datetime.datetime.now()
        print("total running time = ", end_time - start_time)
        print(REDL, 'model.log_perplexity for test_texts with num_topics=%d : ' % num_topics,
              evaluate_model(model), DEFAULT)


def test():
    texts = load_texts(dataset_type='train', groups='small')
    original_texts = texts
    test_doc_id = 1

    # texts = preprocess_texts(texts, test_doc_id=test_doc_id)
    # corpus = build_corpus(texts=texts)  # corpus DirectTextCorpus(corpora.TextCorpus)
    # dict = build_id2word(corpus)
    # save_corpus_dict(dict, corpus)
    dict, corpus = load_corpus_dict()
    # print(len(corpus))

    num_topics = 100
    model = models.LdaModel(corpus, num_topics=num_topics, id2word=dict)  # results differ from run to run
    model.show_topic(0)
    # model.save(fname='./lda.pkl')
    # model = load_ldamodel()
    # closest_texts(corpus, model, num_topics, original_texts, test_doc_id=1, topn=3)

    print(REDL, 'model.log_perplexity for test_texts', evaluate_model(model), DEFAULT)


if __name__ == '__main__':
    test()
    # test_num_topics()
II. Issues to consider when implementing LDA in Python
1. How to choose alpha and beta
Guide 1: Appropriate values for ALPHA and BETA depend on the number of topics and the number of words in the vocabulary. For most applications, good results can be obtained by setting ALPHA = 50 / T and BETA = 200 / W.
Guide 2: Dirichlet parameters greater than 1 seem to offer no advantage for topic models; I always choose parameters as small as possible, e.g. ALPHA = 0.01 / T and so on. (See the sketch below.)
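For reference, a minimal sketch of how guide 1 might be expressed in gensim, assuming the corpus and dict objects built in part one; note that gensim calls the topic-word prior eta rather than beta, and a scalar value is expanded to a symmetric prior:

num_topics = 100                    # T
num_words = len(dict)               # W, the vocabulary size
model = models.LdaModel(corpus, id2word=dict, num_topics=num_topics,
                        alpha=50.0 / num_topics,  # ALPHA = 50 / T
                        eta=200.0 / num_words)    # BETA = 200 / W (called eta in gensim)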
2. Choosing the number of topics
The choice of topic count above was fairly arbitrary. If topic extraction is only an intermediate step in your pipeline, then for most users the exact number of topics makes little difference to the final result; in other words, as long as you extract enough topics, the end result is essentially the same.
HDP
Even so, you sometimes still need to decide how many topics to extract. The hierarchical Dirichlet process (HDP) can infer this automatically, and it is implemented in Gensim:
hdp = gensim.models.hdpmodel.HdpModel(mm, id2word)

The rest of the workflow is the same as with LDA, although this method takes somewhat longer to run.
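For illustration, a hedged sketch of that workflow with the corpus and dictionary from part one (argument names may differ slightly between gensim versions):

from gensim import models

hdp = models.hdpmodel.HdpModel(corpus, id2word=dict)        # the number of topics is inferred from the data
for topic in hdp.print_topics(num_topics=10, num_words=8):  # inspect a few of the inferred topics
    print(topic)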
Why not just use HDP?
A hierarchical Dirichlet process learns the number of topics from the data. In practice, however, the inferred topic counts and the resulting topics are often not what you expect. The number of topics that is optimal from a structural/syntactic point of view is not necessarily optimal from a semantic point of view.
Thus in practice, running LDA with a number of different topic counts, evaluating the learned topics, and deciding whether the number of topics should be increased or decreased usually gives better results.
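In that spirit, a compact version of the sweep that test_num_topics() above performs, assuming a held-out test_corpus built the same way as the training corpus; the perplexity numbers are only half the story, the learned topics still need to be inspected by hand:

for num_topics in [10, 30, 50, 100, 200]:
    model = models.LdaModel(corpus, num_topics=num_topics, id2word=dict)
    print(num_topics, model.log_perplexity(test_corpus))  # per-word likelihood bound on held-out data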
3. LDA produces different topics on every run
This is because LDA's training and inference steps are randomized.
How do you get stable topics?
By fixing the numpy.random seed before training:
SOME_FIXED_SEED = 42
np.random.seed(SOME_FIXED_SEED)
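A hedged sketch of what this looks like in context, using the corpus and dict from part one; newer gensim releases also expose a random_state argument on LdaModel that serves the same purpose (an assumption, check your installed version):

import numpy as np
from gensim import models

SOME_FIXED_SEED = 42
np.random.seed(SOME_FIXED_SEED)  # fix numpy's global RNG before training

model = models.LdaModel(corpus, num_topics=100, id2word=dict)
# in newer gensim versions, the seed can be passed directly instead:
# model = models.LdaModel(corpus, num_topics=100, id2word=dict, random_state=SOME_FIXED_SEED)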
Note: trying lots of random seeds until you hit the right model (as tested on a validation set) is a pretty standard trick.
[lda-model-generates-different-topics-everytime-i-train-on-the-same-corpus]
[http://blog.csdn.net/pipisorry/article/details/42129099]
4. Naming the topics
1> The LDA topics are distributions over words, which naturally lends itself to a naming scheme: just take a number (for example 5-10) of the most probable words in the distribution as the topic descriptor. This typically works quite well (see the sketch after this list).
2> There are interesting approaches to improving topic naming, for example taking into account word centrality in the word network of the corpus, etc.
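A minimal sketch of scheme 1>, assuming the trained model from part one; show_topic returns the most probable words of a topic (as (word, probability) pairs in recent gensim versions), which can serve directly as the topic's descriptor:

for topic_id in range(model.num_topics):
    top_words = model.show_topic(topic_id, topn=8)    # most probable words for this topic
    name = ' / '.join(str(w) for w, _ in top_words)   # crude but usually readable topic name
    print('topic %d: %s' % (topic_id, name))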
ps:
Corpora and Vector Spaces
Converting documents represented as strings into document vectors represented by word ids:
documents = ["Human machine interface for lab abc computer applications", "A survey of user opinion of computer system response time", "The EPS user interface management system", "System and human system engineering testing of EPS", "Relation of user perceived response time to error measurement", "The generation of random binary unordered trees", "The intersection graph of paths in trees", "Graph minors IV Widths of trees and well quasi ordering", "Graph minors A survey"] """ #use StemmedCountVectorizer to get stemmed without stop words corpus Vectorizer = StemmedCountVectorizer # Vectorizer = CountVectorizer vectorizer = Vectorizer(stop_words='english') vectorizer.fit_transform(documents) texts = vectorizer.get_feature_names() # print(texts) """ texts = [doc.lower().split() for doc in documents] # print(texts) dict = corpora.Dictionary(texts) #自建词典 # print dict, dict.token2id #通过dict将用字符串表示的文档转换为用id表示的文档向量 corpus = [dict.doc2bow(text) for text in texts] print(corpus)[http://www.52nlp.cn/ %E]
from: http://blog.csdn.net/pipisorry/article/details/42129099
ref: http://radimrehurek.com/gensim/tutorial.html
Gensim and LDA: a quick tour
http://blog.sina.com.cn/s/blog_4c9dc2a10102v9a8.html