doc2vec训练模型zhiwiki_news.doc2vec下载

1.doc2vec训练

train_model.py

import gensim.models as g
from gensim.corpora import WikiCorpus
import logging
from langconv import *

# Enable INFO-level logging; each record shows timestamp, level name and message.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Dimensionality of the document vectors; passed to Doc2Vec as vector_size in my_function().
docvec_size=192
class TaggedWikiDocument(object):
    """Iterable adapter that turns a wiki corpus into tagged documents.

    Each article produced by ``wiki.get_texts()`` becomes one
    ``TaggedDocument``: its words are the jieba segmentation of every
    article token after conversion with ``Converter('zh-hans')``, and its
    single tag is the article title.
    """

    def __init__(self, wiki):
        """Keep a reference to *wiki* and set its ``metadata`` flag to True.

        NOTE(review): the flag is presumably what makes ``get_texts()``
        return ``(content, (page_id, title))`` pairs, which ``__iter__``
        unpacks -- confirm against the gensim WikiCorpus documentation.
        """
        self.wiki = wiki
        self.wiki.metadata = True

    def __iter__(self):
        """Yield one ``TaggedDocument`` per article in the wrapped corpus."""
        # Local import: jieba is not imported at module level in this script.
        import jieba

        # Build the converter once per pass instead of once per token.
        # NOTE(review): assumes Converter.convert() resets its internal state
        # on every call, so a shared instance is safe -- confirm in langconv.
        converter = Converter('zh-hans')
        for content, (page_id, title) in self.wiki.get_texts():
            words = [
                w
                for c in content
                for w in jieba.cut(converter.convert(c))
            ]
            yield g.doc2vec.TaggedDocument(words=words, tags=[title])

def my_function():
    """Train a doc2vec model on the Chinese Wikipedia dump and save it.

    Reads ``./data/zhwiki-latest-pages-articles.xml.bz2``, streams the
    articles through ``TaggedWikiDocument`` and writes the trained model
    to ``data/zhiwiki_news.doc2vec``.
    """
    # An explicit (empty) dictionary is supplied; per the gensim docs,
    # omitting it makes WikiCorpus scan the whole dump to build a vocabulary.
    corpus = WikiCorpus('./data/zhwiki-latest-pages-articles.xml.bz2', dictionary={})
    tagged_docs = TaggedWikiDocument(corpus)

    # Training hyper-parameters, collected in one place for readability.
    training_params = dict(
        dm=0,
        dbow_words=1,
        vector_size=docvec_size,
        window=8,
        min_count=19,
        epochs=5,
        workers=8,
    )
    trained = g.Doc2Vec(tagged_docs, **training_params)
    trained.save('data/zhiwiki_news.doc2vec')

# Run the training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    my_function()

训练所需的语料库:zhwiki-latest-pages-articles.xml.bz2(中文维基百科转储文件,需放在 ./data/ 目录下)

训练所需的时间较长。训练完成后可以得到如下几个文件(见下图);训练所得的文件也可以从 https://pan.baidu.com/s/1oP3WBySVw7WgA1l6J5bHtA 下载,提取码为 0604。

doc2vec训练模型zhiwiki_news.doc2vec下载_第1张图片

2.计算网页文本相似度

doc2vec_sim.py

import gensim.models as g
import codecs
import numpy
import numpy as np

# Path of the trained model file loaded in the __main__ block.
model_path = 'data/zhiwiki_news.doc2vec'
# Passed as `alpha` to model.infer_vector() in doc2vec().
start_alpha = 0.01
# Passed as `epochs` to model.infer_vector() in doc2vec().
infer_epoch = 1000
# NOTE(review): not referenced in the visible part of this script; same value as in train_model.py.
docvec_size = 192


def simlarityCalu(vector1, vector2):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        vector1: NumPy array or any array-like (list, tuple) of numbers.
        vector2: Array-like of the same length as *vector1*.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)`` as a floating-point scalar, or
        ``0.0`` when either vector has zero length (the cosine is
        undefined there, so 0 is used as a neutral value).
    """
    # asarray is a no-op for ndarrays (dtype preserved) and lets plain
    # sequences be used as well.
    v1 = np.asarray(vector1)
    v2 = np.asarray(vector2)

    norm1 = np.sqrt(v1.dot(v1))
    norm2 = np.sqrt(v2.dot(v2))
    if norm1 == 0 or norm2 == 0:
        # Guard against division by zero; return a float so both branches
        # yield a floating-point result.
        return 0.0
    return v1.dot(v2) / (norm1 * norm2)


def doc2vec(file_name, model):
    """Infer a document vector for the UTF-8 text file *file_name*.

    Every line of the file is stripped and segmented with jieba; the
    resulting tokens of all lines are concatenated into one word list and
    passed to ``model.infer_vector`` together with the module-level
    ``start_alpha`` and ``infer_epoch`` settings.

    Args:
        file_name: Path of the text file to embed (read as UTF-8).
        model: Object exposing ``infer_vector(words, alpha=..., epochs=...)``;
            the script passes a loaded ``Doc2Vec`` model.

    Returns:
        Whatever ``model.infer_vector`` returns for the token list.
    """
    # Local import: jieba is not imported at module level in this script.
    import jieba

    # The context manager guarantees the file handle is closed, even if
    # segmentation raises part-way through.
    with codecs.open(file_name, 'r', 'utf-8') as handle:
        doc = [w for x in handle.readlines() for w in jieba.cut(x.strip())]
    return model.infer_vector(doc, alpha=start_alpha, epochs=infer_epoch)


if __name__ == '__main__':
    # Load the trained model once, embed the two sample files in order,
    # then print the cosine similarity of their inferred vectors.
    trained = g.Doc2Vec.load(model_path)
    first_vec, second_vec = (
        doc2vec(path, trained) for path in ('data/P1.txt', 'data/P2.txt')
    )
    print(simlarityCalu(first_vec, second_vec))

两个文本经过运行所得的相似度结果如下:

doc2vec训练模型zhiwiki_news.doc2vec下载_第2张图片

 

 

你可能感兴趣的:(深度学习,python,人工智能)