# LDA topic modeling with scikit-learn

import pandas as pd
import numpy as np
import mglearn

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
#from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Load the full data set from disk and pick the 'AB' column as the text
# input for vectorization.
# An earlier experiment (now disabled) restricted the data to rows 0-999
# and to the columns TI, JI, PY, C1, AB, AU.
# NOTE(review): presumably 'AB' holds the abstract text and contains no
# missing values -- confirm against final_data.csv.
abstract = pd.read_csv('final_data.csv')
input_data = abstract.loc[:, 'AB']


# Turn the input texts into a TF-IDF weighted document-term matrix.
# Vocabulary limits: at most 10000 terms, each occurring in at least 10
# documents and in no more than 95% of them; English stop words dropped.
# Alternative tried earlier (raw term counts instead of TF-IDF):
#   CountVectorizer(max_features=10000, max_df=0.8, min_df=10,
#                   stop_words='english')
vect = TfidfVectorizer(
    stop_words='english',
    max_df=0.95,
    min_df=10,
    max_features=10000,
)
X = vect.fit_transform(input_data)

# Fit a 10-topic LDA model with batch variational inference (25 passes,
# fixed seed for reproducibility). fit_transform returns the
# document-topic matrix, one row per input document.
# `n_components` is the supported name for the number of topics; the old
# `n_topics` keyword was deprecated in scikit-learn 0.19 and removed in
# 0.21, where passing it raises TypeError.
# NOTE(review): X is TF-IDF weighted, while LDA is formulated over term
# counts -- consider feeding CountVectorizer output instead; confirm intent.
lda = LatentDirichletAllocation(n_components=10, learning_method='batch',
                                max_iter=25, random_state=0)
lda_topics = lda.fit_transform(X)

# For every topic (row of components_), order the vocabulary indices from
# the highest-weighted term to the lowest.
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older installations.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = np.asarray(vect.get_feature_names_out())
else:
    feature_names = np.array(vect.get_feature_names())

# Print the top 20 terms of every fitted topic, five topics per row of
# output. The topic count is read from the model so this stays in sync
# with however many topics were actually fitted.
mglearn.tools.print_topics(topics=range(lda.components_.shape[0]),
                           feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=5, n_words=20)



# TfidfVectorizer.fit_transform returns a TF-IDF weighted document-term
# matrix (X). The statements below inspect which terms get extreme weights.

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older installations.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = np.asarray(vect.get_feature_names_out())
else:
    feature_names = np.array(vect.get_feature_names())

# Largest TF-IDF value each term reaches in any document, then the term
# indices ordered from the smallest such maximum to the largest.
max_tfidf_per_term = X.max(axis=0).toarray().ravel()
sorted_by_tfidf = np.argsort(max_tfidf_per_term)

# The 20 terms with the lowest and the 20 with the highest maximum TF-IDF.
# (The original sliced [20:], i.e. everything *except* the 20 lowest.)
lowest_tfidf_terms = feature_names[sorted_by_tfidf[:20]]
highest_tfidf_terms = feature_names[sorted_by_tfidf[-20:]]
# Results are printed explicitly: a bare expression only shows output in an
# interactive session and is silently discarded when run as a script.
print("Features with lowest tfidf:\n{}".format(lowest_tfidf_terms))
print("Features with highest tfidf:\n{}".format(highest_tfidf_terms))

# The 20 terms with the lowest inverse document frequency, i.e. the terms
# that occur in the largest share of documents.
sorted_by_idf = np.argsort(vect.idf_)
lowest_idf_terms = feature_names[sorted_by_idf[:20]]
print("Features with lowest idf:\n{}".format(lowest_idf_terms))

# lda.fit_transform returns the document-topic matrix.
# lda.components_ is the topic-term matrix (one row per topic, one column
# per vocabulary term); keep a descriptively named reference to it.
topic_term_matrix = lda.components_ 

# Related topic: LDA