# LDA + 可视化 (LDA topic modeling + visualization)

from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import csv
import jieba
import codecs
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
import pyLDAvis.gensim
from gensim import corpora
from gensim.models import LdaModel

def is_number(s):
    """Return True if string *s* represents a number.

    Accepts anything ``float()`` can parse, plus single Unicode numeric
    characters (e.g. CJK numerals like '七') via ``unicodedata.numeric``.
    """
    try:
        float(s)
    except ValueError:
        pass
    else:
        return True
    import unicodedata
    try:
        unicodedata.numeric(s)
    except (TypeError, ValueError):
        return False
    return True

info = []  # raw text of the ninth CSV column (index 8) from every data row

def data_g(filename):
    """Read *filename* (CSV) and append each row's ninth column to `info`.

    The first row is treated as a header and skipped.  Rows whose ninth
    column is empty or missing are ignored.
    """
    # `with` guarantees the file handle is closed; the original opened the
    # file inline and leaked the handle.
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row (replaces the `sta` flag)
        for row in reader:
            # Guard row length too: short rows would raise IndexError.
            if len(row) > 8 and row[8]:
                info.append(row[8])

data_g('C:\\Users\\imac\\Desktop\\2018\\bigdata\\py\\data.csv')

# Stop-word list: a hand-picked set of high-frequency domain words, plus the
# per-line contents of a GBK-encoded stop-word file.
stw = [',', '\n', ' ', '―', '副', '专业', '学习', '主任', '市', "委员",
       "书记", "其间", "干部", "成员", "工作", "政府"]

# `with` closes the handle (the original leaked it).  `.strip()` removes the
# line terminator on any platform; the old slice `wd[:-2]` assumed '\r\n'
# and silently chopped the last character off '\n'-terminated lines.
with codecs.open('C:\\Users\\imac\\Desktop\\2018\\bigdata\\py\\stop.txt',
                 'r', encoding='GBK') as f:
    stopwords = f.readlines()
for wd in stopwords:
    stw.append(wd.strip())

#print(stw)
#print(stopwords[7])
train = []  # one token list per document

def data_p():
    """Tokenize every document in `info` with jieba and append the filtered
    token list (stop words and numbers removed) to `train`."""
    for doc in info:
        tokens = jieba.cut(doc)
        kept = [t for t in tokens if t not in stw and not is_number(t)]
        train.append(kept)

data_p()
print(train)


# Build the gensim dictionary from the tokenized documents.
dictionary = corpora.Dictionary(train)
# Convert tokenized documents into a document-term (bag-of-words) matrix.
corpus = [dictionary.doc2bow(text) for text in train]
# Train the LDA model.
lda = gensim.models.ldamodel.LdaModel(corpus, num_topics=10,
                                      id2word=dictionary, passes=10)
# print_topics() re-formats the topics on every call; compute once and
# reuse (the original called it three extra times below).
topic_list = lda.print_topics(20)

print(type(topic_list))
print(len(topic_list))

for topic in topic_list:
    print(topic)


print('输出其主题分布')

# For every training document: convert its tokens to bag-of-words, then
# print the topic distribution the trained model infers for it.
for doc_tokens in train:
    bow = dictionary.doc2bow(doc_tokens)
    doc_lda = lda[bow]
    print(doc_lda)


#print(train)

def test_lda():
    """Render the trained LDA model in the pyLDAvis interactive browser.

    Uses the module-level `lda`, `corpus` and `dictionary` built above.
    """
    # prepare() computes inter-topic distances and per-topic term bars.
    data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    # Serves a local web page and blocks until the server is stopped.
    pyLDAvis.show(data, open_browser=True)

if __name__ == "__main__":
    test_lda()

# 你可能感兴趣的:(数据挖掘)  (scraped blog footer: "You may also be interested in: data mining")