Topic Model: A Python Implementation of LDA and Parameter Selection

http://blog.csdn.net/pipisorry/article/details/42129099

一、Using the gensim Python package

1. Install gensim, sklearn, and nltk
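For example, with pip (package names as published on PyPI), plus a one-off download of the NLTK stopword corpus used by the script below:

pip install gensim scikit-learn nltk
python -m nltk.downloader stopwords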

2. LDA implementation in Python:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__title__ = 'topic model - build lda - 20news dataset'
__author__ = 'pi'
__mtime__ = '12/26/2014-026'
# code is far away from bugs with the god animal protecting
    I love animals. They taste delicious.
              ┏┓      ┏┓
            ┏┛┻━━━┛┻┓
            ┃      ☃      ┃
            ┃  ┳┛  ┗┳  ┃
            ┃      ┻      ┃
            ┗━┓      ┏━┛
                ┃      ┗━━━┓
                ┃  神兽保佑    ┣┓
                ┃ 永无BUG!   ┏┛
                ┗┓┓┏━┳┓┏┛
                  ┃┫┫  ┃┫┫
                  ┗┻┛  ┗┻┛
"""
from Colors import *  # local helper module defining ANSI color constants (REDH, RED, REDL, GREENL, DEFAULT, ...)
from collections import defaultdict
import re
import datetime
from sklearn import datasets
import nltk
from gensim import corpora
from gensim import models
import numpy as np
from scipy import spatial


def load_texts(dataset_type='train', groups=None):
    """
    load datasets to bytes list
    :return:train_dataset_bunch.data bytes list
    """
    if groups == 'small':
        groups = ['comp.graphics', 'comp.os.ms-windows.misc']  # small subset for quick tests, #1368 docs
    elif groups == 'medium':
        groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                  'comp.windows.x', 'sci.space']  # medium-sized subset, #3414 docs
    train_dataset_bunch = datasets.load_mlcomp('20news-18828', dataset_type, mlcomp_root='./datasets',
                                               categories=groups)  # full dataset: 13180 docs
    return train_dataset_bunch.data


def preprocess_texts(texts, test_doc_id=1):
    """
    texts preprocessing
    :param texts: bytes list
    :return: list of token lists (str)
    """
    texts = [t.decode(errors='ignore') for t in texts]  # bytes2str
    # print(REDH, 'original texts[%d]: ' % test_doc_id, DEFAULT, '\n',texts[test_doc_id])
    # split_texts = [t.lower().split() for t in texts]
    # print(REDH, 'split texts[%d]: #%d' % (test_doc_id, len(split_texts)), DEFAULT, '\n',split_texts[test_doc_id])


    # lowercase, split into word lists on the separators below, and drop empty strings
    SEPS = r'[\s()\-/,:.?!]\s*'
    texts = [re.split(SEPS, t.lower()) for t in texts]
    texts = [[w for w in t if w] for t in texts]
    # print(REDH, 'texts[%d] lower & split(seps= %s) & delete None: #%d' % (test_doc_id, SEPS, len(texts[test_doc_id])), DEFAULT, '\n',texts[test_doc_id])


    # nltk.download()   #then choose the corpus.stopwords
    stopwords = set(nltk.corpus.stopwords.words('english'))  # #127
    stopwords.update(['from', 'subject', 'writes'])  # #129
    word_usage = defaultdict(int)
    for t in texts:
        for w in t:
            word_usage[w] += 1
    COMMON_LINE = len(texts) / 10
    # words whose total count exceeds 10% of the number of documents are treated as too common
    too_common_words = [w for w in word_usage if word_usage[w] > COMMON_LINE]
    # print('too_common_words: #', len(too_common_words), '\n', too_common_words)   #68
    stopwords.update(too_common_words)
    # print('stopwords: #', len(stopwords), '\n', stopwords)  #   #147

    english_stemmer = nltk.SnowballStemmer('english')
    MIN_WORD_LEN = 3  # 4
    texts = [[english_stemmer.stem(w) for w in t if
              not set(w) & set('@+>0123456789*') and w not in stopwords and len(w) >= MIN_WORD_LEN] for t in
             texts]  # set('+-.?!()>@0123456789*/')
    # print(REDH, 'texts[%d] delete ^alphanum & stopwords & len<%d & stemmed: #' % (test_doc_id, MIN_WORD_LEN),
    # len(texts[test_doc_id]), DEFAULT, '\n', texts[test_doc_id])
    return texts


def build_corpus(texts):
    """
    build corpora
    :param texts: list of token lists
    :return: corpus DirectTextCorpus(corpora.TextCorpus)
    """

    class DirectTextCorpus(corpora.TextCorpus): 
        def get_texts(self):
            return self.input

        def __len__(self):
            return len(self.input)

    corpus = DirectTextCorpus(texts)
    return corpus


def build_id2word(corpus):
    """
    from corpus build id2word=dict
    :param corpus:
    :return:dict = corpus.dictionary
    """
    dict = corpus.dictionary  # gensim.corpora.dictionary.Dictionary
    # print(dict.id2token)
    try:
        dict['anything']  # any lookup forces the Dictionary to build its id2token mapping
    except KeyError:
        pass
        # print("dict.id2token is not {} now")
    # print(dict.id2token)
    return dict


def save_corpus_dict(dict, corpus, dictDir='./LDA/id_word.dict', corpusDir='./LDA/corpus.mm'):
    dict.save(dictDir)
    print(GREENL, 'dict saved into %s successfully ...' % dictDir, DEFAULT)
    corpora.MmCorpus.serialize(corpusDir, corpus)
    print(GREENL, 'corpus saved into %s successfully ...' % corpusDir, DEFAULT)
    # corpus.save(fname='./LDA/corpus.mm')  # stores only the (tiny) iteration object


def load_ldamodel(modelDir='./lda.pkl'):
    model = models.LdaModel.load(fname=modelDir)
    print(GREENL, 'ldamodel load from %s successfully ...' % modelDir, DEFAULT)
    return model


def load_corpus_dict(dictDir='./LDA/id_word.dict', corpusDir='./LDA/corpus.mm'):
    dict = corpora.Dictionary.load(fname=dictDir)
    print(GREENL, 'dict load from %s successfully ...' % dictDir, DEFAULT)
    # dict = corpora.Dictionary.load_from_text('./id_word.txt')
    corpus = corpora.MmCorpus(corpusDir)  # corpora.mmcorpus.MmCorpus
    print(GREENL, 'corpus load from %s successfully ...' % corpusDir, DEFAULT)
    return dict, corpus


def build_doc_word_mat(corpus, model, num_topics):
    """
    build the document representation in topic space
    :param corpus:
    :param model:
    :param num_topics: int
    :return: doc_topic_mat np.array of shape (len(corpus), num_topics)
    """
    topics = [model[c] for c in corpus]  # per document: list of (topic_id, weight) pairs
    doc_topic_mat = np.zeros((len(topics), num_topics))
    for doc, topic in enumerate(topics):
        for topic_id, weight in topic:
            doc_topic_mat[doc, topic_id] += weight
    return doc_topic_mat


def compute_pairwise_dist(doc_word_mat):
    """
    compute pairwise dist
    :param doc_word_mat: np.array (len(topics) * num_topics)
    :return:pairwise_dist <class 'numpy.ndarray'>
    """
    pairwise_dist = spatial.distance.squareform(spatial.distance.pdist(doc_word_mat))
    max_weight = pairwise_dist.max() + 1
    for i in list(range(len(pairwise_dist))):
        pairwise_dist[i, i] = max_weight
    return pairwise_dist


def closest_texts(corpus, model, num_topics, original_texts, test_doc_id=1, topn=5):
    """
    find the closest_doc_ids for doc[test_doc_id]
    :param corpus:
    :param model:
    :param num_topics:
    :param original_texts: raw texts, used only for printing the matches
    :param test_doc_id:
    :param topn:
    :return:
    """
    doc_word_mat = build_doc_word_mat(corpus, model, num_topics)
    pairwise_dist = compute_pairwise_dist(doc_word_mat)
    # print(REDH, 'original texts[%d]: ' % test_doc_id, DEFAULT, '\n', original_texts[test_doc_id])
    closest_doc_ids = pairwise_dist[test_doc_id].argsort()
    # return closest_doc_ids[:topn]
    for closest_doc_id in closest_doc_ids[:topn]:
        print(RED, 'closest doc[%d]' % closest_doc_id, DEFAULT, '\n', original_texts[closest_doc_id])


def evaluate_model(model):
    """
    compute the model's perplexity on the test data
    :param model:
    :return: model.log_perplexity float
    """
    test_texts = load_texts(dataset_type='test', groups='small')
    test_texts = preprocess_texts(test_texts)
    test_corpus = build_corpus(test_texts)
    # NB: build_corpus builds a new Dictionary from the test texts; strictly, the test documents
    # should be converted to bag-of-words with the training dictionary so that ids match the model
    return model.log_perplexity(test_corpus)


def test_num_topics():
    dict, corpus = load_corpus_dict()
    print("#corpus_items:", len(corpus))
    for num_topics in [3, 5, 10, 30, 50, 100, 150, 200, 300]:
        start_time = datetime.datetime.now()
        model = models.LdaModel(corpus, num_topics=num_topics, id2word=dict)
        end_time = datetime.datetime.now()
        print("total running time = ", end_time - start_time)
        print(REDL, 'model.log_perplexity for test_texts with num_topics=%d : ' % num_topics, evaluate_model(model),
              DEFAULT)


def test():
    texts = load_texts(dataset_type='train', groups='small')
    original_texts = texts
    test_doc_id = 1

    # texts = preprocess_texts(texts, test_doc_id=test_doc_id)
    # corpus = build_corpus(texts=texts)  # corpus DirectTextCorpus(corpora.TextCorpus)
    # dict = build_id2word(corpus)
    # save_corpus_dict(dict, corpus)
    dict, corpus = load_corpus_dict()
    # print(len(corpus))

    num_topics = 100
    model = models.LdaModel(corpus, num_topics=num_topics, id2word=dict)  # topics differ between runs
    model.show_topic(0)
    # model.save(fname='./lda.pkl')

    # model = load_ldamodel()
    # closest_texts(corpus, model, num_topics, original_texts, test_doc_id=1, topn=3)

    print(REDL, 'model.log_perplexity for test_texts', evaluate_model(model), DEFAULT)


if __name__ == '__main__':
    test()
    # test_num_topics()




二、Issues to consider when implementing LDA in Python

1. How to choose alpha and beta

guide1: Appropriate values for ALPHA and BETA depend on the number of topics and the number of words in the vocabulary. For most applications, good results can be obtained by setting ALPHA = 50 / T and BETA = 200 / W.

guide2: Dirichlet parameters greater than 1 seem to offer no advantage for topic models. I always choose the parameters as small as possible, e.g. ALPHA = 0.01 / T.
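In gensim's LdaModel these priors are passed as alpha and eta (gensim's name for BETA); a minimal sketch wiring in the heuristics above, reusing the corpus and dict built by the script in section 一:

num_topics = 100
num_terms = len(dict)  # vocabulary size W
model = models.LdaModel(corpus, num_topics=num_topics, id2word=dict,
                        alpha=50.0 / num_topics,   # ALPHA = 50 / T
                        eta=200.0 / num_terms)     # BETA = 200 / W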

2. Choosing the number of topics

The choices of the number of topics above are fairly arbitrary. If topic extraction is only an intermediate step in the pipeline, then for most users the exact number of topics has little effect on the final result; in other words, as long as enough topics are extracted, the end result is essentially the same.

HDP

Even so, you sometimes still need to decide how many topics to extract. One way is the hierarchical Dirichlet process (HDP), which is implemented in Gensim.

   hdp = gensim.models.hdpmodel.HdpModel(mm, id2word)

The rest of the workflow is the same as with LDA, although this method takes longer to run.
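To peek at what HDP inferred, its topics can be listed much like LDA's (a sketch, assuming gensim's HdpModel API; mm and id2word are the bag-of-words corpus and dictionary built earlier):

hdp = models.HdpModel(mm, id2word=id2word)
for topic in hdp.show_topics(formatted=True):   # human-readable topic strings
    print(topic)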

Why not just use HDP

The hierarchical Dirichlet process learns the number of topics from the data. In practice, however, the inferred topic counts and the resulting topics are often not what you expect. The number of topics that is optimal from a structural/syntactic point of view is not necessarily optimal from a semantic point of view.

Thus in practice, running LDA with several different topic counts, evaluating the learned topics, and deciding whether the number of topics should be increased or decreased usually gives better results.
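Besides the perplexity loop in test_num_topics above, newer gensim releases ship models.CoherenceModel, which often tracks human judgement of topic quality better; a hedged sketch, assuming the preprocessed token lists texts and the dictionary dict from section 一:

from gensim.models import CoherenceModel

for num_topics in [10, 30, 50, 100]:
    lda = models.LdaModel(corpus, num_topics=num_topics, id2word=dict)
    cm = CoherenceModel(model=lda, texts=texts, dictionary=dict, coherence='c_v')
    print(num_topics, cm.get_coherence())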

3. Each LDA run produces different topics

This is because LDA uses randomized training and inference steps.

How can stable topics be produced?

By fixing the numpy.random seed:

import numpy as np

SOME_FIXED_SEED = 42

# before training/inference:
np.random.seed(SOME_FIXED_SEED)

Note: trying lots of random seeds until you hit the right model (as tested on a validation set) is a pretty standard trick.
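Newer gensim versions also expose a random_state argument on LdaModel, which pins the randomness of a single model directly (a sketch; the parameter does not exist in very old releases):

model = models.LdaModel(corpus, num_topics=num_topics, id2word=dict,
                        random_state=SOME_FIXED_SEED)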

[lda-model-generates-different-topics-everytime-i-train-on-the-same-corpus]

[http://blog.csdn.net/pipisorry/article/details/42129099]

4. Naming the topics

1> The LDA topics are distributions over words, which naturally lends itself to a naming scheme: just take a number (for example 5-10) of the most probable words in the distribution as the topic descriptor. This typically works quite well.

2> There are interesting approaches to improving topic naming, for example taking into account word centrality in the word network of the corpus, etc.
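A minimal sketch of scheme 1> with gensim: take the top-n words of each topic as its label (show_topic is gensim's API and in recent versions returns (word, probability) pairs; the underscore-joined label format is just one choice):

def name_topic(model, topic_id, topn=5):
    top_words = model.show_topic(topic_id, topn=topn)  # [(word, probability), ...]
    return '_'.join(word for word, prob in top_words)

for topic_id in range(model.num_topics):
    print(topic_id, name_topic(model, topic_id))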



ps:

Corpora and Vector Spaces

Convert documents represented as strings into document vectors represented by word ids:

documents = ["Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey"]
"""
#use StemmedCountVectorizer to get a stemmed corpus with stop words removed
Vectorizer = StemmedCountVectorizer
# Vectorizer = CountVectorizer
vectorizer = Vectorizer(stop_words='english')
vectorizer.fit_transform(documents)
texts = vectorizer.get_feature_names()
# print(texts)
"""
texts = [doc.lower().split() for doc in documents]
# print(texts)
dict = corpora.Dictionary(texts)    # build the dictionary
# print(dict, dict.token2id)
# use the dictionary to convert each string document into an id-based bag-of-words vector
corpus = [dict.doc2bow(text) for text in texts]
print(corpus)
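Once the documents are in this bag-of-words form they can be fed directly into the models discussed above, for example (a quick sketch on the tiny corpus just built):

lda = models.LdaModel(corpus, num_topics=2, id2word=dict)
print(lda[corpus[0]])      # topic distribution of the first document
print(lda.show_topic(0))   # top words of topic 0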
[http://www.52nlp.cn/ %E]

from: http://blog.csdn.net/pipisorry/article/details/42129099

ref: http://radimrehurek.com/gensim/tutorial.html

Gensim and LDA: a quick tour

http://blog.sina.com.cn/s/blog_4c9dc2a10102v9a8.html

