Modeling with gensim.doc2vec and using similarity for text classification

I wanted to see how well doc2vec actually performs, so I reproduced the experiment from 基于gensim的Doc2Vec简析. With a random forest on top of the vectors, binary classification accuracy was about 50%; switching to sklearn's KNN, the result was also around 50%. After checking the process for a long time without finding anything wrong, I also consulted 情感分析利器——Doc2vec, which describes essentially the same approach. In the end, using the model below with only casual parameter tuning, classification accuracy reached 95%+, which is finally satisfying. The code follows (I didn't bother to tidy it up; it works, and that's enough):

#DOC2VEC
import os
import random
import numpy as np
from gensim.models.doc2vec import TaggedDocument,Doc2Vec
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer versions
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics
size_doc2=100
all_doc=[]
corpora_documents = []
label_c=[]
doc=[]            # raw token lists, kept without the TaggedDocument wrapping (printing a TaggedDocument shows u'' prefixes)
ii=0
for x in os.listdir("D:\Documents\data\zhaiyao\\1\\"):  # one file per class
    print x
    for line in open("D:\Documents\data\zhaiyao\\1\\"+x,"r").readlines():
        label_c.append(ii)  # one document per line, labelled with the file's class index
        all_doc.append(line)
    ii=ii+1
print "building paragraph vectors, number of classes:", ii
for i, item_text in enumerate(all_doc):
    words_list=item_text.strip().split("  ")  # tokens are separated by two spaces
    document = TaggedDocument(words=words_list,tags=[i])
    corpora_documents.append(document)
    doc.append(words_list)
print "building the model"
model= Doc2Vec(min_count=1, window=5, size=size_doc2, sample=1e-3, negative=5, workers=3,dm=0)  # dm=0 selects DBOW
model.build_vocab(corpora_documents)  # build_vocab returns None, so there is nothing useful to assign or print
print "splitting train/test sets"
x_train, x_test, y_train, y_test = train_test_split(corpora_documents, label_c, test_size=0.2)
print "doc2vec 训练"
for epoch in range(10): #进行多次重复训练,每一次都需要对训练数据重新打乱,以提高精度
    random.shuffle(x_train)
    model.train(corpora_documents,total_examples=len(corpora_documents),epochs=model.iter)
print '#########', model.vector_size

print "预测结果"
def getVecs(model, corpus, size):
    vecs = [np.array(model.docvecs[z.tags[0]]).reshape((1, size)) for z in corpus]
    return np.concatenate(vecs)
train_arrays=getVecs(model, x_train, size_doc2)
test_vecs=[]
for doc_num in x_test:
    # infer a fresh vector for each test document from its raw words
    inferred_vector = model.infer_vector(doc_num.words)
    test_vecs.append(inferred_vector)
print "training set shape", np.shape(train_arrays)

knn = KNeighborsClassifier(n_neighbors=4, algorithm='brute', metric='cosine')
knn.fit(train_arrays, y_train)
test_labels = knn.predict(test_vecs)
print "accuracy", metrics.accuracy_score(y_test, test_labels)
print "confusion matrix"
print confusion_matrix(y_test, test_labels)
pret=0.0
for doc_num in x_test:
    inferred_vector = model.infer_vector(doc_num.words)
    sims = model.docvecs.most_similar([inferred_vector], topn=3)  # sims is a list of (document_index, similarity) tuples
    pre=[to[0] for to in sims]
    sims_doc=[label_c[ind] for ind in pre]  # labels of the three most similar training documents
    label_=dict([(sims_doc.count(i),i) for i in sims_doc])  # map vote count -> label
    if label_c[doc_num.tags[0]]==label_[max(label_.keys())]:  # does the majority label match the true one?
        pret+=1
print "similarity-vote accuracy", pret/len(x_test)
On the Doc2vec model parameters: DBOW (dm=0) clearly outperforms DM (dm=1) here.
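
If you want to check that comparison yourself, only the dm flag changes; a minimal sketch reusing the corpora_documents list built above (the names model_dbow and model_dm are just illustrative):

# DBOW (dm=0): predicts corpus words from the paragraph vector alone
model_dbow = Doc2Vec(corpora_documents, dm=0, size=100, window=5,
                     min_count=1, sample=1e-3, negative=5, workers=3)
# DM (dm=1): combines the paragraph vector with context word vectors
model_dm = Doc2Vec(corpora_documents, dm=1, size=100, window=5,
                   min_count=1, sample=1e-3, negative=5, workers=3)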

No cross-validation, which is a shortcoming; F1 and recall weren't computed either. But the results are there, and that's enough for now.
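
For the record, both gaps take only a few lines to close with sklearn; a hedged sketch, assuming the knn classifier, train_arrays/y_train and test_labels/y_test from the script above:

from sklearn.cross_validation import cross_val_score  # sklearn.model_selection in newer versions
from sklearn.metrics import classification_report

scores = cross_val_score(knn, train_arrays, y_train, cv=5)  # 5-fold cross-validated accuracy
print "cv accuracy", scores.mean()
print classification_report(y_test, test_labels)  # per-class precision, recall and F1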

As for the data: 7000+ abstracts I downloaded from CNKI (知网), already segmented into words with stopwords removed, one document per line and two spaces between tokens. That's all it takes.
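
If you want to reproduce that preprocessing, here is a minimal sketch; jieba as the segmenter and all the file names are my assumptions, not part of the original pipeline:

# -*- coding: utf-8 -*-
import codecs
import jieba  # assumed tokenizer; any segmenter that yields the same format works

stopwords = set(codecs.open("stopwords.txt", "r", "utf-8").read().split())  # hypothetical stopword list
out = codecs.open("abstracts_tokenized.txt", "w", "utf-8")                  # hypothetical output file
for line in codecs.open("abstracts_raw.txt", "r", "utf-8"):                 # hypothetical input: one abstract per line
    words = [w for w in jieba.cut(line.strip()) if w not in stopwords]
    out.write("  ".join(words) + "\n")  # two spaces between tokens, one document per line
out.close()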

