For my master's thesis paper I want to combine LDA with deep learning. I've been reading papers back and forth until my head hurts, and I still don't have a good angle: I came up with a pile of small, miscellaneous ideas, but either I rejected them myself or later found someone had already published them (even points that minor get written up). I'd like a contribution that is actually worth something, but after going around in circles I just went back to reading papers.
Back to the point. I've read a good few dozen English papers by now (they sound impressive, but few are actually good), and only one or two offer a worthwhile way to combine the two. A few papers with real novelty that I'd recommend: "A Hybrid Document Feature Extraction Method Using Latent Dirichlet Allocation and Word2Vec"; "Improving topic models with latent feature word representations"; "Topic2Vec: learning distributed representations of topics". Most of the rest stay at the application level, and genuinely novel work is rare. If you know of good papers, please recommend them (ten thousand thanks in advance).
Below is an implementation of the method from "A Hybrid Document Feature Extraction Method Using Latent Dirichlet Allocation and Word2Vec". Briefly, the idea is:
1. Take the top N words of each topic T as its topic words.
2. Normalize the topic words, i.e., compute the weight of each word w within the topic.
3. Map each topic into the word2vec space: sum, over its topic words, w's word2vec vector times w's weight within topic T.
4. Compute each document's position in word2vec space: sum the word2vec vectors of its words and divide by the total number of words.
5. Compute the distance doc_t between each document and each topic (a short sketch of steps 2 to 5 follows this list).
6. Use a KNN classifier on these distance features as the classification model.
7. For test documents, compute their distances to each topic as in steps 4 and 5, and feed the resulting matrix into the model for prediction.
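Before the full script, here is a compact vectorized sketch of steps 2 to 5. This is my own illustration, not code from the paper; the helper names topic_vector / doc_vector / doc_topic_distances and the dict-based word-vector lookup are just for exposition:

import numpy as np

def topic_vector(top_words, word_vectors):
    # top_words: list of (word, prob) pairs for one topic; word_vectors: dict word -> vector
    words, probs = zip(*top_words)
    weights = np.asarray(probs, dtype=float)
    weights /= weights.sum()                       # step 2: normalize weights within the topic
    vecs = np.asarray([word_vectors[w] for w in words])
    return weights.dot(vecs)                       # step 3: weighted sum in word2vec space

def doc_vector(tokens, word_vectors):
    # step 4: mean of the word vectors of a document
    return np.mean([word_vectors[t] for t in tokens], axis=0)

def doc_topic_distances(doc_vec, topic_vecs):
    # step 5: Euclidean distance from one document vector to every topic vector
    return np.linalg.norm(np.asarray(topic_vecs) - doc_vec, axis=1)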
Now the code:
from gensim import models, corpora, similarities
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in older sklearn versions
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.models import LdaModel
import numpy as np
import os
import random
from sklearn.neighbors import KNeighborsClassifier
from gensim.models import word2vec
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
doc = []       # each document as a list of tokens
label_c = []   # class label of each document (one class per file)
ii = 0
for x in os.listdir("D:\Documents\data\zhaiyao\\1\\"):
    print x
    for line in open("D:\Documents\data\zhaiyao\\1\\"+x, "r").readlines():
        doc.append(line.strip().split(" "))   # one line = one pre-tokenized document
        label_c.append(ii)
    ii += 1
size_word = 100   # word2vec vector dimension
size_lda = 200    # number of LDA topics
print "training word vectors"
model_wv = word2vec.Word2Vec(doc, size=size_word, workers=2, min_count=1, iter=10)  # size: vector dimension, min_count: minimum term frequency
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(doc, label_c, test_size=0.2)
dictionary = corpora.Dictionary(doc)
corpus = [dictionary.doc2bow(text) for text in x_train_1]
print "training LDA"
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=size_lda)
#lda.get_document_topics
for doc_num in x_test_1:
    doc_bow = dictionary.doc2bow(doc_num)   # convert the test document to bag-of-words
    doc_lda = lda[doc_bow]
    print "topic distribution of a new document", doc_lda
    break
for topic in doc_lda:
    print "share of this topic", "%s\t%f\n" % (lda.print_topic(topic[0]), topic[1])
print lda.get_topic_terms(0, topn=10)   # top topn words of topic 0, most relevant first
aa = [x[0] for x in lda.get_topic_terms(0, topn=10)]
for it in aa:
    print "topic %d, topic word id %d:" % (0, it), dictionary[it]
# over the whole corpus, the topics a given word is likely to belong to; word_id is the word id, minimum_probability the threshold
print lda.get_term_topics(0, minimum_probability=0.05)
# with per_word_topics=True you also get the topic list of every word in the document; minimum_probability is the topic threshold, minimum_phi_value the word-topic threshold
print lda.get_document_topics(corpus[0], minimum_probability=None, minimum_phi_value=0.1, per_word_topics=False)
print "mapping topics into the word2vec space"
the_id = []   # ids of the top words of each topic
the_vl = []   # values of the top words of each topic (unused below)
the_w  = []   # normalized weight of each top word within its topic
print "computing word weights within each topic"
for x in range(size_lda):
    the_id.append([xx[0] for xx in lda.get_topic_terms(x, topn=5)])
    the_sum = sum([xx[1] for xx in lda.get_topic_terms(x, topn=5)])
    the_w.append([xx[1]/the_sum for xx in lda.get_topic_terms(x, topn=5)])
    #print x, "topic", the_sum, the_w
print "mapping topics to coordinates"
m = 0
the_wv = np.zeros([size_lda, size_word])   # one word2vec-space vector per topic: (num_topics, word2vec dimension)
# weighted sum of the word2vec vectors of each topic's top words
for it in the_id:
    n = 0
    for it_id in it:
        word_t = dictionary[it_id]
        #print word_t + "**", np.shape(model_wv[word_t]), the_w[m][n]
        the_wv[m] += [x_word*the_w[m][n] for x_word in model_wv[word_t.encode("utf-8")]]
        n += 1
    m += 1
doc_word = np.zeros([len(x_train_1), size_word])
print "mapping training documents into word2vec"
m = 0
for each_doc in x_train_1:
    for each_word in each_doc:
        #print each_word
        doc_word[m] += model_wv[each_word]
    doc_word[m] = doc_word[m]/len(each_doc)   # step 4: average over the number of words in the document
    m += 1
print "computing distances between training documents and each topic"
def destince(a, b):   # Euclidean distance between a and b
    dt = 0
    for each_dt in range(len(a)):
        dt += (a[each_dt]-b[each_dt])*(a[each_dt]-b[each_dt])
    return np.sqrt(dt)
doc_t = np.zeros([len(doc_word), size_lda])
m = 0
for each_d in doc_word:
    n = 0
    for each_t in the_wv:
        doc_t[m][n] = destince(each_d, each_t)
        n += 1
    m += 1
doc_word_test = np.zeros([len(x_test_1), size_word])
print "mapping test documents into word2vec"
m = 0
for each_doc in x_test_1:
    for each_word in each_doc:
        #print each_word
        doc_word_test[m] += model_wv[each_word]
    doc_word_test[m] = doc_word_test[m]/len(each_doc)   # average over the number of words, as for the training set
    m += 1
print "computing distances between test documents and each topic"
doc_t_test = np.zeros([len(doc_word_test), size_lda])
m = 0
for each_d in doc_word_test:
    n = 0
    for each_t in the_wv:
        doc_t_test[m][n] = destince(each_d, each_t)
        n += 1
    m += 1
KN = KNeighborsClassifier(n_neighbors=10, algorithm='brute', metric='cosine')
KN.fit(doc_t, y_train_1)
test_labels_KN = KN.predict(doc_t_test)
print "accuracy", metrics.accuracy_score(test_labels_KN, y_test_1)    # accuracy
print "confusion matrix", confusion_matrix(test_labels_KN, y_test_1)  # this is the confusion matrix, not recall
The run prints quite a lot; I'll just paste the final part:
Prediction results
training set size (5932L, 100L)
accuracy 0.433962264151
confusion matrix [[428 83 63 389]
[ 11 0 2 9]
[ 14 0 2 25]
[193 25 26 214]]
2017-11-20 16:15:07,Mon doc2vec.py INFO precomputing L2-norms of doc weight vectors
similarity-vote accuracy 0.927223719677
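By the way, confusion_matrix returns the full confusion matrix, not recall. If you want per-class precision and recall for these predictions, something along these lines would do; this snippet is my addition, not part of the original run, and it reuses the y_test_1 and test_labels_KN variables from the script above:

from sklearn.metrics import classification_report
# per-class precision, recall and F1 for the KNN predictions
print(classification_report(y_test_1, test_labels_KN))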
I haven't done any parameter tuning yet; my laptop is weak and I didn't feel like waiting. With more runs and a larger corpus the results should improve.
You can also substitute doc2vec for word2vec here; in my runs it worked better.
The doc2vec model is simply the variable model; the training code is in my previous post and can be run as-is, so I haven't repeated it here.
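For completeness, here is a minimal sketch of how such a doc2vec model could be trained on the same corpus. The parameters, and size_doc2 (the doc2vec vector dimension used in the snippet below), are my assumptions, not the exact code of the previous post; it uses the old gensim API to match the Word2Vec call above (newer gensim uses vector_size/epochs):

from gensim.models.doc2vec import TaggedDocument, Doc2Vec
size_doc2 = 100   # assumed doc2vec dimension; must match size_doc2 in the code below
# tag each document with its index so model.docvecs returns indices into doc / label_c
tagged = [TaggedDocument(words=d, tags=[i]) for i, d in enumerate(doc)]
model = Doc2Vec(tagged, size=size_doc2, min_count=1, iter=10, workers=2)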
The code:
#doc2vec + lda
print "mapping topics to coordinates"
m = 0
the_wv = np.zeros([size_lda, size_doc2])   # one doc2vec-space vector per topic
# weighted sum of the doc2vec word vectors of each topic's top words
for it in the_id:
    n = 0
    for it_id in it:
        word_t = dictionary[it_id]
        #print word_t + "**", np.shape(model[word_t.encode("utf-8")]), the_w[m][n]
        the_wv[m] += [x_word*the_w[m][n] for x_word in model[word_t.encode("utf-8")]]
        n += 1
    m += 1
doc_word = np.zeros([len(x_train_1), size_doc2])   # size_doc2 (doc2vec dimension) so shapes match the_wv
print "mapping training documents into doc2vec"
m = 0
for each_doc in x_train_1:
    for each_word in each_doc:
        #print each_word
        doc_word[m] += model[each_word]
    doc_word[m] = doc_word[m]/len(each_doc)   # average over the number of words in the document
    m += 1
print "computing distances between training documents and each topic"
def destince(a, b):   # Euclidean distance between a and b (same helper as above)
    dt = 0
    for each_dt in range(len(a)):
        dt += (a[each_dt]-b[each_dt])*(a[each_dt]-b[each_dt])
    return np.sqrt(dt)
doc_t = np.zeros([len(doc_word), size_lda])
m = 0
for each_d in doc_word:
    n = 0
    for each_t in the_wv:
        doc_t[m][n] = destince(each_d, each_t)
        n += 1
    m += 1
doc_word_test = np.zeros([len(x_test_1), size_doc2])
print "mapping test documents into doc2vec"
m = 0
for each_doc in x_test_1:
    for each_word in each_doc:
        #print each_word
        doc_word_test[m] += model[each_word]
    doc_word_test[m] = doc_word_test[m]/len(each_doc)   # average over the number of words
    m += 1
print "computing distances between test documents and each topic"
doc_t_test = np.zeros([len(doc_word_test), size_lda])
m = 0
for each_d in doc_word_test:
    n = 0
    for each_t in the_wv:
        doc_t_test[m][n] = destince(each_d, each_t)
        n += 1
    m += 1
KN = KNeighborsClassifier(n_neighbors=10, algorithm='brute', metric='cosine')
KN.fit(doc_t, y_train_1)
test_labels_KN = KN.predict(doc_t_test)
print "accuracy", metrics.accuracy_score(test_labels_KN, y_test_1)    # accuracy
print "confusion matrix", confusion_matrix(test_labels_KN, y_test_1)  # confusion matrix, not recall
pret = 0.0
for doc_num in range(len(x_test_1)):
    inferred_vector = model.infer_vector(x_test_1[doc_num])
    sims = model.docvecs.most_similar([inferred_vector], topn=3)   # sims is a list of (document index, similarity) tuples
    pre = [to[0] for to in sims]
    sims_doc = [label_c[ind] for ind in pre]                       # labels of the 3 most similar training documents
    label_ = dict([(sims_doc.count(i), i) for i in sims_doc])      # majority vote over those labels
    if y_test_1[doc_num] == label_[max(label_.keys())]:
        pret += 1
print "similarity-vote accuracy", pret/len(x_test_1)