Notes on Using Doc2vec

"生物沙文主义,在这样一个小范围中,如果它影响到对知识的处理,是可笑的。将任何你担忧的专家交给我,我就会将他的价值发挥到最基本机魂的十倍以上。"
——摘录自《组织思想的问题》,第七章

 

Doc2vec is an extension of word2vec. Compared with word2vec, doc2vec is better suited to training and modeling on whole articles or short sentences, because it learns a vector for each document in addition to the word vectors.

Unlike word2vec, doc2vec requires a tag to be passed in for every training sentence or document.

import os
import gensim
import jieba

TaggededDocument = gensim.models.doc2vec.TaggedDocument
model = gensim.models.Doc2Vec(vector_size=contact_word_num, dm=1, window=1, min_count=1, workers=10) #vector_size replaces the old size argument; dm=0-distributed bag of words, dm=1-distributed memory
if os.path.exists(save_path):
    model = gensim.models.Doc2Vec.load(save_path)
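
For reference, each element appended to the training list below is a TaggedDocument that pairs a token list with one or more tags; a minimal sketch (the tokens are made up for illustration):

example_doc = TaggededDocument(words=['praise', 'the', 'omnissiah'], tags=[0])
print(example_doc.words, example_doc.tags) #TaggedDocument is a namedtuple of (words, tags)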


#tokenized sentences collected for training
sentence_cut_list = []

#learn from short phrases stored in an Excel sheet
def learn_excel_word():

    global sentence_cut_list

    sheet_name = sheet_names[0]
    sheet_ = workbook.sheet_by_name(sheet_name)

    for docIndex_ in range(MaxWordCount):
        #read the phrase stored in this row (already a str in Python 3, no encode needed)
        text_ = sheet_.row_values(docIndex_)[0]

        #one TaggedDocument per row, tagged with the row index
        word_seg_list = list(jieba.cut(text_))
        sentence_cut_list.append(TaggededDocument(word_seg_list, tags=[docIndex_]))
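
The workbook, sheet_names and MaxWordCount used above are not defined in this excerpt; one way they might be prepared with xlrd (the file name and the row-count assumption are hypothetical):

import xlrd

workbook = xlrd.open_workbook('corpus.xls') #hypothetical path
sheet_names = workbook.sheet_names()
#assume every row of the first sheet holds one short phrase
MaxWordCount = workbook.sheet_by_name(sheet_names[0]).nrows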


#learn from a txt document (whole article)
def learn_txt_document():

    global sentence_cut_list

    #TaggedLineDocument takes a file path and assigns each line a tag based on its line number;
    #the file must already be tokenized with spaces, and each line is converted to TaggedDocument format automatically
    sentence_cut_list = gensim.models.doc2vec.TaggedLineDocument(filePath) #load the corpus
    #recommended for larger corpora (or use TaggedBrownCorpus)
#    for document in sentence_cut_list:
#        #TaggedDocument(['1805年,在', '拿破仑', '率兵', '征服', '欧洲', '之后', '法国', '和', '俄国', '之间', '也', '发生了', '战争'], [0])
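
Since TaggedLineDocument expects the file to be pre-tokenized, here is a small sketch of preparing such a file with jieba (the file names are assumptions):

import jieba

with open('raw_corpus.txt', 'r', encoding='utf-8') as fin, \
     open('tokenized_corpus.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        line = line.strip()
        if line:
            #one document per line, tokens separated by spaces
            fout.write(' '.join(jieba.cut(line)) + '\n')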


#learn from short sentences in a txt file, one per line
def learn_txt_word_line():

    global sentence_cut_list

    file_object = open(filePath, 'r', encoding='utf-8')

    docIndex_ = 0
    #for a continuous article whose lines are contextually related, Doc2Vec is a good fit
    #both short sentences and full articles can be handled this way
    for line in file_object.readlines():

        seg_list = jieba.cut(line)

        word_list_ = [] #tokens kept for this line
        for word_ in seg_list:
            #strip punctuation characters
            for punc_ in all_punc:
                word_ = word_.replace(punc_, '')
            #drop special strings (stop words)
            if word_ not in all_specialWord:
                word_list_.append(word_)

        if len(word_list_) > 0:
            #TaggededDocument(words=['赞美', '万机神', '欧姆尼赛亚'], tags=[0])
            sentence_cut_list.append(TaggededDocument(word_list_, tags=[docIndex_] ) )
            docIndex_ = docIndex_ +1
            print(docIndex_)
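
all_punc and all_specialWord are not defined in this excerpt; one plausible shape for these filter tables (the exact contents here are assumptions, not the author's originals):

import string

#punctuation characters to strip, ASCII plus common full-width marks
all_punc = string.punctuation + ',。、!?;:“”‘’《》()【】…'
#strings dropped entirely after stripping
all_specialWord = {'', ' ', '\n', '\t', '的', '了', '是'}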

#learn from a notebook of unfamiliar words, looking up and learning each definition
def learn_txt_note_word():

    global sentence_cut_list

    file_object = open(filePath, 'r', encoding='utf-8')

    docIndex_ = 0

    for line in file_object.readlines():
        line = line.strip()
        #full segmentation mode
        word_seg_list = jieba.cut(line, cut_all = True)
        for word_ in word_seg_list:
            if word_ not in model.wv:
                #definition fetched from the web
                mean_ = w2v_learn_from_web(word_)
                #learn the definition with doc2vec
                if len(mean_) > 0:
                    #tokenize with jieba
                    mean_cut_ = sentence_cut_to_list(mean_)
                    #sentence_cut_to_list returns one token list per sub-sentence, so add one TaggedDocument each
                    for mean_words_ in mean_cut_:
                        sentence_cut_list.append(TaggededDocument(mean_words_, tags=[docIndex_]))
                        docIndex_ = docIndex_ +1
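
w2v_learn_from_web is the author's own helper and its implementation is not shown here; the code above only relies on it taking a word and returning a plain-text definition (an empty string when nothing is found). A placeholder stub with that interface:

def w2v_learn_from_web(word_):
    #placeholder only: the real version queries an online dictionary,
    #which is not part of this excerpt
    cached_definitions_ = {} #e.g. filled from a web request or a local cache
    return cached_definitions_.get(word_, '')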


#split a fetched definition into a list of tokenized sub-sentences
def sentence_cut_to_list(sentence):
    #numbering markers that separate senses in a fetched definition
    separator_list_ = ['1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.']
    for separator_ in separator_list_:
        sentence = sentence.replace(separator_, '&&')

    mean_cut_list_ = []

    #if markers were present, split into a list of short sub-sentences
    sentence_list_ = sentence.split('&&')

    for part_sentence_ in sentence_list_:

        #strip punctuation characters
        for punc_ in all_punc:
            part_sentence_ = part_sentence_.replace(punc_, '')

        #skip special strings (otherwise seg_list would leak in from the previous iteration)
        if part_sentence_ in all_specialWord:
            continue

        seg_list = jieba.cut(part_sentence_)
        word_list_ = []
        for word_ in seg_list:
            word_list_.append(word_)
        if len(word_list_) > 0:
            mean_cut_list_.append(word_list_)

    return mean_cut_list_
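
A usage example: a fetched definition that contains numbered senses is split on the '1.', '2.', ... markers and each fragment is tokenized separately (the sample string is invented):

mean_ = '1.机器中的灵 2.万机之神的化身'
for sense_tokens_ in sentence_cut_to_list(mean_):
    print(sense_tokens_) #one jieba token list per fragment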

model.build_vocab(sentence_cut_list, update=is_vocab_update)
model.train(sentence_cut_list, total_examples=model.corpus_count, epochs=2000)
model.save(save_path)
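
After training, a sketch of how the model can be queried, continuing the script above: infer a vector for a new, already-tokenized sentence and look up the most similar training tags (the sample sentence is made up):

new_tokens_ = list(jieba.cut('赞美万机之神'))
vector_ = model.infer_vector(new_tokens_)

#nearest training documents by tag; gensim 4.x exposes them via model.dv
#(older releases used model.docvecs instead)
print(model.dv.most_similar([vector_], topn=5))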

 

Praise the Machine Spirit! Praise the Omnissiah! Recite verses 5 to 8 of the Holy Hymn of the Machine Spirit!
