Python实现短文本相似度计算——word2vec对文本编码、LSTM计算距离

# Path of the corpus file; each line is used as one sentence.
path='./data/qa_test.txt'
# Path of the word2vec model loaded later in this script.
path_word2vec='/home/ruben/data/nlp/word2vec_wx'

# Build fake training pairs: every line is paired with every other line.
# The context manager closes the file handle, and the explicit encoding keeps
# reading independent of the platform default.
# NOTE(review): assumes the corpus is UTF-8 encoded -- confirm.
with open(path,'r',encoding='utf-8') as corpus_file:
    fake_data=corpus_file.readlines()
# Drop the newline once per line instead of on every use inside the O(n^2) loop.
sentences=[raw_line.replace('\n','') for raw_line in fake_data]
tain_data_l=[]
tain_data_r=[]
for i,left in enumerate(sentences):
    for j,right in enumerate(sentences):
        # Compare positions rather than object identity: whether two equal
        # strings are the same object is an interpreter implementation detail.
        if i!=j:
            print(left,right)
            tain_data_l.append(left)
            tain_data_r.append(right)
print('left length:',len(tain_data_l))
print('right length:',len(tain_data_r))
import gensim
import jieba
import numpy as np
# Build the vocabulary and the embedding weight matrix.
list_word=['UNK']
dict_word={}
tain_data_l_n=[]  # input of the left LSTM
tain_data_r_n=[]  # input of the right LSTM

# A sentence can occur many times across both lists; tokenize each distinct
# sentence only once. Words are still numbered in first-seen order.
seen_lines=set()
for data in [tain_data_l,tain_data_r]:
    for line in data:
        if line in seen_lines:
            continue
        seen_lines.add(line)
        for word in jieba.cut(line):
            if word not in dict_word:
                # Ids start at 1 because row 0 of the embedding matrix is
                # reserved (it is zeroed below) and must not belong to a word.
                dict_word[word]=len(dict_word)+1
print(dict_word)  # vocabulary is complete
id2w={dict_word[w]:w for w in dict_word}  # index -> word lookup
embedding_size=256
# One row per word plus the reserved row 0; rows start random so that words
# missing from the word2vec model still get a non-zero vector.
embedding_arry=np.random.randn(len(dict_word)+1,embedding_size)
embedding_arry[0]=0
word2vector=gensim.models.Word2Vec.load(path_word2vec)
# Address each row by the word's own id instead of by iteration order, so the
# matrix cannot drift out of sync with dict_word.
# NOTE(review): assumes the loaded vectors have embedding_size dimensions -- confirm.
for word,index in dict_word.items():
    # `in` / `[]` on the keyed vectors work on both gensim 3.x and 4.x,
    # whereas `wv.vocab` was removed in gensim 4.
    if word in word2vector.wv:
        embedding_arry[index]=word2vector.wv[word]
print('embedding_arry shape:',embedding_arry.shape)
del word2vector  # release the loaded model; only the matrix is needed from here on
# Replace every token with its vocabulary index.
# Left-hand sentences first.
tain_data_l_n.extend(
    [dict_word[token] for token in jieba.cut(sentence)]
    for sentence in tain_data_l
)
print('tain_data_l_n length:',len(tain_data_l_n))
# One label per left-hand sample, all set to 1.
sample_count=len(tain_data_l_n)
y_train=np.ones((sample_count,))
# Same conversion for the right-hand sentences.
tain_data_r_n.extend(
    [dict_word[token] for token in jieba.cut(sentence)]
    for sentence in tain_data_r
)
print('tain_data_r_n length:',len(tain_data_r_n))
#得到语料中句子的最大长度
max_length=0
for line in tain_data_r_n:
    if max_length

你可能感兴趣的:(python深度学习)