#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import lda
import lda.datasets
from pprint import pprint
def top_words(topic_dist, vocab, n):
    """Return the ``n`` highest-weighted words of one topic, best first.

    Args:
        topic_dist: 1-D array of per-word weights for a single topic.
        vocab: Sequence (tuple, list or array) of words aligned with
            ``topic_dist``.
        n: Maximum number of words to return.

    Returns:
        NumPy array with at most ``n`` words ordered by descending weight.
    """
    # argsort is ascending, so reverse it to put the heaviest words first.
    ranked = np.argsort(topic_dist)[::-1]
    return np.asarray(vocab)[ranked[:n]]


def describe_corpus():
    """Load the Reuters data from ``lda.datasets`` and print a short summary.

    Returns:
        Tuple ``(X, vocab)``: the document-term matrix and the vocabulary.
    """
    # document-term matrix
    X = lda.datasets.load_reuters()
    print("type(X): {}".format(type(X)))
    print("shape: {}\n".format(X.shape))
    print(X[:10, :10])

    # the vocab
    vocab = lda.datasets.load_reuters_vocab()
    print("type(vocab): {}".format(type(vocab)))
    print("len(vocab): {}\n".format(len(vocab)))
    print(vocab[:10])

    # titles for each story (only displayed, not used by the model)
    titles = lda.datasets.load_reuters_titles()
    print("type(titles): {}".format(type(titles)))
    print("len(titles): {}\n".format(len(titles)))
    pprint(titles[:10])

    return X, vocab


def report_topic_words(topic_word, vocab, n=7):
    """Print the topic-word matrix summary and the top ``n`` words per topic.

    Args:
        topic_word: 2-D array with one row of word weights per topic.
        vocab: Sequence of words aligned with the columns of ``topic_word``.
        n: Number of words listed for each topic.
    """
    print("type(topic_word): {}".format(type(topic_word)))
    print("shape: {}".format(topic_word.shape))
    print(vocab[:5])
    print(topic_word[:, :5])

    # Convert the vocabulary once instead of once per topic.
    vocab_arr = np.array(vocab)
    for i, topic_dist in enumerate(topic_word):
        topic_words = top_words(topic_dist, vocab_arr, n)
        print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))


def report_doc_topics(doc_topic, n_docs=10):
    """Print the dominant topic of the first ``n_docs`` documents.

    Args:
        doc_topic: 2-D array with one row of topic weights per document.
        n_docs: How many leading documents to report; slicing keeps this
            safe when the corpus holds fewer documents.
    """
    print("type(doc_topic): {}".format(type(doc_topic)))
    print("shape: {}".format(doc_topic.shape))
    for i, dist in enumerate(doc_topic[:n_docs]):
        topic_most_pr = dist.argmax()
        print(u"文档: {} 主题: {} value: {}".format(i, topic_most_pr, dist[topic_most_pr]))


def plot_topic_word(topic_word, topics=(0, 5, 9, 14, 19)):
    """Plot the word distribution of the selected topics, one subplot each.

    Args:
        topic_word: 2-D array with one row of word weights per topic.
        topics: Row indices of the topics to draw.
    """
    plt.figure(figsize=(8, 9))
    for i, k in enumerate(topics):
        ax = plt.subplot(len(topics), 1, i + 1)
        ax.plot(topic_word[k, :], 'r-')
        ax.set_xlim(-50, 4350)   # word indices span [0, 4258]
        ax.set_ylim(0, 0.08)
        ax.set_ylabel(u"概率")
        ax.set_title(u"主题 {}".format(k))
    # Only the bottom subplot carries the shared x-axis label.
    plt.xlabel(u"词", fontsize=14)
    plt.tight_layout()
    plt.suptitle(u'主题的词分布', fontsize=18)
    plt.subplots_adjust(top=0.9)  # leave room for the figure title
    plt.show()


def plot_doc_topic(doc_topic, topic_num, docs=(1, 3, 4, 8, 9)):
    """Plot the topic distribution of the selected documents as stem plots.

    Args:
        doc_topic: 2-D array with one row of topic weights per document.
        topic_num: Number of topics; used to size the x-axis.
        docs: Row indices of the documents to draw.
    """
    plt.figure(figsize=(8, 9))
    for i, k in enumerate(docs):
        ax = plt.subplot(len(docs), 1, i + 1)
        ax.stem(doc_topic[k, :], linefmt='g-', markerfmt='ro')
        ax.set_xlim(-1, topic_num + 1)
        ax.set_ylim(0, 1)
        ax.set_ylabel(u"概率")
        ax.set_title(u"文档 {}".format(k))
    # Only the bottom subplot carries the shared x-axis label.
    plt.xlabel(u"主题", fontsize=14)
    plt.suptitle(u'文档的主题分布', fontsize=18)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)  # leave room for the figure title
    plt.show()


def main():
    """Fit an LDA model on the Reuters data, print the results and plot them."""
    X, vocab = describe_corpus()

    print('LDA start ----')
    topic_num = 20
    model = lda.LDA(n_topics=topic_num, n_iter=500, random_state=1)
    model.fit(X)

    # topic-word
    topic_word = model.topic_word_
    report_topic_words(topic_word, vocab, n=7)

    # Document - topic
    doc_topic = model.doc_topic_
    report_doc_topics(doc_topic, n_docs=10)

    # Use a font with CJK glyphs for the Chinese labels and keep the minus
    # sign renderable with that font.
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # Topic - word
    plot_topic_word(topic_word)

    # Document - Topic
    plot_doc_topic(doc_topic, topic_num)


if __name__ == "__main__":
    main()
打印结果 (printed output):
INFO:lda:n_documents: 395 INFO:lda:vocab_size: 4258 INFO:lda:n_words: 84010 INFO:lda:n_topics: 20 INFO:lda:n_iter: 500
type(X): <class 'numpy.ndarray'>
shape: (395, 4258)

[[ 1  0  1  0  0  0  1  0  0  1]
 [ 7  0  2  0  0  0  0  1  0  0]
 [ 0  0  0  1 10  0  4  1  1  0]
 [ 6  0  1  0  0  0  1  1  1  0]
 [ 0  0  0  2 14  1  1  0  2  1]
 [ 0  0  2  2 24  0  2  0  2  1]
 [ 0  0  0  2  7  1  1  0  1  0]
 [ 0  0  2  2 20  0  2  0  3  1]
 [ 0  1  0  2 17  2  2  0  0  0]
 [ 2  0  2  0  0  2  0  1  0  3]]
type(vocab): <class 'tuple'>
len(vocab): 4258

('church', 'pope', 'years', 'people', 'mother', 'last', 'told', 'first', 'world', 'year')
type(titles): <class 'tuple'>
len(titles): 395

('0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20',
 '1 GERMANY: Historic Dresden church rising from WW2 ashes. DRESDEN, Germany '
 '1996-08-21',
 "2 INDIA: Mother Teresa's condition said still unstable. CALCUTTA 1996-08-23",
 '3 UK: Palace warns British weekly over Charles pictures. LONDON 1996-08-25',
 '4 INDIA: Mother Teresa, slightly stronger, blesses nuns. CALCUTTA 1996-08-25',
 "5 INDIA: Mother Teresa's condition unchanged, thousands pray. CALCUTTA "
 '1996-08-25',
 '6 INDIA: Mother Teresa shows signs of strength, blesses nuns. CALCUTTA '
 '1996-08-26',
 "7 INDIA: Mother Teresa's condition improves, many pray. CALCUTTA, India "
 '1996-08-25',
 '8 INDIA: Mother Teresa improves, nuns pray for "miracle". CALCUTTA '
 '1996-08-26',
 '9 UK: Charles under fire over prospect of Queen Camilla. LONDON 1996-08-26')
LDA start ----
INFO:lda:<0> log likelihood: -1051748 INFO:lda:<10> log likelihood: -719800 INFO:lda:<20> log likelihood: -699115 INFO:lda:<30> log likelihood: -689370 INFO:lda:<40> log likelihood: -684918 INFO:lda:<50> log likelihood: -681322 INFO:lda:<60> log likelihood: -678979 INFO:lda:<70> log likelihood: -676598 INFO:lda:<80> log likelihood: -675383 INFO:lda:<90> log likelihood: -673316 INFO:lda:<100> log likelihood: -672761 INFO:lda:<110> log likelihood: -671320 INFO:lda:<120> log likelihood: -669744 INFO:lda:<130> log likelihood: -669292 INFO:lda:<140> log likelihood: -667940 INFO:lda:<150> log likelihood: -668038 INFO:lda:<160> log likelihood: -667429 INFO:lda:<170> log likelihood: -666475 INFO:lda:<180> log likelihood: -665562 INFO:lda:<190> log likelihood: -664920 INFO:lda:<200> log likelihood: -664979 INFO:lda:<210> log likelihood: -664722 INFO:lda:<220> log likelihood: -664459 INFO:lda:<230> log likelihood: -664360 INFO:lda:<240> log likelihood: -663600 INFO:lda:<250> log likelihood: -664164 INFO:lda:<260> log likelihood: -663826 INFO:lda:<270> log likelihood: -663458 INFO:lda:<280> log likelihood: -663393 INFO:lda:<290> log likelihood: -662904 INFO:lda:<300> log likelihood: -662294 INFO:lda:<310> log likelihood: -662031 INFO:lda:<320> log likelihood: -662430 INFO:lda:<330> log likelihood: -661601 INFO:lda:<340> log likelihood: -662108 INFO:lda:<350> log likelihood: -662152 INFO:lda:<360> log likelihood: -661899 INFO:lda:<370> log likelihood: -661012 INFO:lda:<380> log likelihood: -661278 INFO:lda:<390> log likelihood: -661085 INFO:lda:<400> log likelihood: -660418 INFO:lda:<410> log likelihood: -660510 INFO:lda:<420> log likelihood: -660343 INFO:lda:<430> log likelihood: -659789 INFO:lda:<440> log likelihood: -659336 INFO:lda:<450> log likelihood: -659039 INFO:lda:<460> log likelihood: -659329 INFO:lda:<470> log likelihood: -658707 INFO:lda:<480> log likelihood: -658879 INFO:lda:<490> log likelihood: -658819 INFO:lda:<499> log likelihood: -658407
type(topic_word): <class 'numpy.ndarray'>
shape: (20, 4258)
('church', 'pope', 'years', 'people', 'mother')
[[2.72436509e-06 2.72436509e-06 2.72708945e-03 2.72436509e-06 2.72436509e-06]
 [2.29518860e-02 1.08771556e-06 7.83263973e-03 1.15308726e-02 1.08771556e-06]
 [3.97404221e-03 4.96135108e-06 2.98177200e-03 4.96135108e-06 4.96135108e-06]
 [3.27374625e-03 2.72585033e-06 2.72585033e-06 2.45599115e-03 2.72585033e-06]
 [8.26262882e-03 8.56893407e-02 1.61980569e-06 4.87561512e-04 1.61980569e-06]
 [1.30107788e-02 2.95632328e-06 2.95632328e-06 2.95632328e-06 2.95632328e-06]
 [2.80145003e-06 2.80145003e-06 2.80145003e-06 2.80145003e-06 2.80145003e-06]
 [2.42858077e-02 4.66944966e-06 4.66944966e-06 4.66944966e-06 2.42858077e-02]
 [6.84655429e-03 1.90129250e-06 6.84655429e-03 1.90129250e-06 1.90129250e-06]
 [3.48361655e-06 3.48361655e-06 3.48361655e-06 3.48361655e-06 3.48361655e-06]
 [2.98781661e-03 3.31611166e-06 3.31611166e-06 8.29359526e-03 3.31611166e-06]
 [4.27062069e-06 4.27062069e-06 4.27062069e-06 1.19620086e-02 4.27062069e-06]
 [1.50994982e-02 1.64107142e-06 1.64107142e-06 1.59200339e-02 2.95556963e-03]
 [7.73480150e-07 7.73480150e-07 1.70946848e-02 7.73480150e-07 7.73480150e-07]
 [2.82280146e-06 2.82280146e-06 2.82280146e-06 6.77754631e-03 7.28311005e-02]
 [5.15309856e-06 5.15309856e-06 4.64294180e-03 5.15309856e-06 5.15309856e-06]
 [3.41695768e-06 3.41695768e-06 3.41695768e-06 1.29878561e-02 3.41695768e-06]
 [3.90980357e-02 1.70316633e-03 4.42279319e-03 3.39953358e-06 3.39953358e-06]
 [2.39373034e-06 2.39373034e-06 2.39373034e-06 2.39612407e-03 2.39373034e-06]
 [3.32493234e-06 3.32493234e-06 3.32493234e-06 3.32493234e-06 3.32493234e-06]]
*Topic 0
- government british minister west group letters party
*Topic 1
- church first during people political country ceremony
*Topic 2
- elvis king wright fans presley concert life
*Topic 3
- yeltsin russian russia president kremlin michael romania
*Topic 4
- pope vatican paul surgery pontiff john hospital
*Topic 5
- family police miami versace cunanan funeral home
*Topic 6
- south simpson born york white north african
*Topic 7
- order church mother successor since election religious
*Topic 8
- charles prince diana royal queen king parker
*Topic 9
- film france french against actor paris bardot
*Topic 10
- germany german war nazi christian letter book
*Topic 11
- east prize peace timor quebec belo indonesia
*Topic 12
- n't told life people church show very
*Topic 13
- years world time year last say three
*Topic 14
- mother teresa heart charity calcutta missionaries sister
*Topic 15
- city salonika exhibition buddhist byzantine vietnam swiss
*Topic 16
- music first people tour including off opera
*Topic 17
- church catholic bernardin cardinal bishop death cancer
*Topic 18
- harriman clinton u.s churchill paris president ambassador
*Topic 19
- century art million museum city churches works
type(doc_topic): <class 'numpy.ndarray'>
shape: (395, 20)
文档: 0 主题: 8 value: 0.4830434782608696
文档: 1 主题: 1 value: 0.29057971014492756
文档: 2 主题: 14 value: 0.6656903765690377
文档: 3 主题: 8 value: 0.5076555023923446
文档: 4 主题: 14 value: 0.7789667896678965
文档: 5 主题: 14 value: 0.844097222222222
文档: 6 主题: 14 value: 0.8035353535353537
文档: 7 主题: 14 value: 0.877474402730375
文档: 8 主题: 14 value: 0.8196153846153844
文档: 9 主题: 8 value: 0.5342105263157892