WMD source code: a Python implementation

The script below computes the Word Mover's Distance between Chinese sentences: words are embedded with Word2Vec, weighted by tf-idf, and the resulting transportation problem is solved numerically with scipy.optimize.

from gensim.models import Word2Vec
from gensim import corpora, models, similarities
import numpy as np
import scipy.optimize
from scipy import spatial
import time
import jieba
import itertools
jieba.load_userdict('/export/user/shizhengxin/word2vec/jiebaUserWord.txt')
# Constraints

def positivity(f):
    '''
    Constraint 1:
    Keeps every flow entry non-negative
    '''
    return f

def fromSrc(f, wp, i, shape):
    """
    Constraint 2:
    Limits supply for source according to weight
    """
    fr = np.reshape(f, shape)
    f_sumColi = np.sum(fr[i, :])
    return wp[i] - f_sumColi
def toTgt(f, wq, j, shape):
    """
    Constraint 3:
    Limits demand for target according to weight
    """
    fr = np.reshape(f, shape)
    f_sumRowj = np.sum(fr[:, j])
    return wq[j] - f_sumRowj
def maximiseTotalFlow(f, wp, wq):
    """
    Constraint 4:
    Forces the total flow to equal the smaller of total supply
    and total demand (used as an equality constraint below)
    """
    return f.sum() - np.minimum(wp.sum(), wq.sum())
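
Together, these four constraints encode the standard EMD transportation problem. Writing wp for the source weights and wq for the target weights, the optimizer below solves:

    minimize    sum_ij F_ij * D_ij
    subject to  F_ij >= 0                                  (positivity)
                sum_j F_ij <= wp_i                          (fromSrc)
                sum_i F_ij <= wq_j                          (toTgt)
                sum_ij F_ij = min(sum_i wp_i, sum_j wq_j)   (maximiseTotalFlow)
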
# Objective function
def flow(f, D):
    """
    The objective function
    The flow represents the amount of goods to be moved
    from source to target
    """
    f = np.reshape(f, D.shape)
    return (f * D).sum()
# Distance
def groundDistance(x1, x2, norm=2):
    """
    L-norm ground distance (default norm = 2).
    Note: kept for reference; getDistMatrix below uses a
    thresholded cosine distance instead.
    """
    return np.linalg.norm(x1 - x2, norm)
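
A quick check of the default Euclidean ground distance (made-up vectors):

# two orthogonal unit vectors are sqrt(2) apart
print(groundDistance(np.array([1.0, 0.0]), np.array([0.0, 1.0])))  # ≈ 1.4142
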
def get_normlize_data(common_list, model):
    """
    Look up each word's embedding and scale it to unit L2 norm,
    returning one row per word.
    """
    common_list = list(common_list)
    vector = []
    for word in common_list:
        vec = model.wv[word]
        vector.append(vec / np.linalg.norm(vec))
    # Return an ndarray: np.mat would turn '*' in flow()/EMD() into
    # matrix multiplication instead of the element-wise product.
    return np.array(vector)
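
Since get_normlize_data needs a loaded Word2Vec model, here is its core step in isolation, a minimal sketch with a made-up vector:

v = np.array([3.0, 4.0])
print(v / np.linalg.norm(v))  # [0.6 0.8] — unit L2 norm
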

def get_standard_ques(lists, model, dict_sim):
    """
    Pre-compute, for every standard question, the normalised embedding
    matrix, the tf-idf weights and the (words, weights) signature.
    """
    all_standard_ques = []
    all_weights = []
    all_signature = []
    for i in lists:
        list2 = throw_stopwords(list(jieba.cut(i)), stopwords)
        list2 = list(set(list2))
        list2 = [sed for sed in list2 if sed in model.wv.vocab]
        list2_new, weights2, signature2 = get_signature(list2, dict_sim)

        all_standard_ques.append(get_normlize_data(list2_new, model))
        all_weights.append(weights2)
        all_signature.append(signature2)

    return all_standard_ques , all_weights , all_signature

# Distance matrix
def getDistMatrix(s1, s2, model):
    """
    Computes the distance matrix between the source and target word
    lists as 1 - cosine similarity of their unit-norm embeddings.
    Distances >= 0.9 are snapped to 1 and distances <= 0.4 to 0, so
    near-synonyms cost nothing and unrelated words cost the maximum.
    """
    s1 = get_normlize_data(s1, model)
    s2 = get_normlize_data(s2, model)
    # rows are unit vectors, so the dot product is the cosine similarity
    distance_matrix = 1 - s1.dot(s2.T)
    distance_matrix = np.where(distance_matrix >= 0.9, 1, distance_matrix)
    distance_matrix = np.where(distance_matrix <= 0.4, 0, distance_matrix)
    return distance_matrix
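
A small illustration of the snapping step with made-up cosine distances:

d = np.array([[0.05, 0.95], [0.45, 0.30]])
d = np.where(d >= 0.9, 1, d)
d = np.where(d <= 0.4, 0, d)
print(d)  # [[0.   1.  ], [0.45 0.  ]]
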
# Flow matrix
def getFlowMatrix(P, Q, D):
    """
    Computes the optimal flow matrix between signatures P and Q by
    numerically solving the transportation problem
    """
    numFeats1 = P[0].shape[0]
    numFeats2 = Q[0].shape[0]
    shape = (numFeats1, numFeats2)

    # Constraints
    cons1 = [{'type': 'ineq', 'fun': positivity},
             {'type': 'eq', 'fun': maximiseTotalFlow, 'args': (P[1], Q[1],)}]

    cons2 = [{'type': 'ineq', 'fun': fromSrc, 'args': (P[1], i, shape,)} for i in range(numFeats1)]
    cons3 = [{'type': 'ineq', 'fun': toTgt, 'args': (Q[1], j, shape,)} for j in range(numFeats2)]

    cons = cons1 + cons2 + cons3

    # Solve for F (solve the transportation problem numerically);
    # minimize expects a 1-D x0, so start from a flat zero vector
    F_guess = np.zeros(D.size)
    F = scipy.optimize.minimize(flow, F_guess, args=(D,), constraints=cons)
    F = np.reshape(F.x, (numFeats1, numFeats2))
    return F
# Normalised EMD
def EMD(F, D):
    """
    EMD formula, normalised by the flow
    """
    return (F * D).sum() / F.sum()
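
A toy sanity check with made-up two-word signatures: when all of the mass has to move across a unit distance, the normalised EMD should come out as 1.

P_toy = (np.array(['a', 'b']), np.array([1.0, 0.0]))
Q_toy = (np.array(['a', 'b']), np.array([0.0, 1.0]))
D_toy = np.array([[0.0, 1.0], [1.0, 0.0]])
F_toy = getFlowMatrix(P_toy, Q_toy, D_toy)
print(EMD(F_toy, D_toy))  # ≈ 1.0
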

# Runs EMD program
def getEMD(P, Q, model, F, D):
    """
    EMD computes the Earth Mover's Distance between the
    distributions P and Q, given the pre-computed flow matrix F
    and distance matrix D.
    P and Q are (features, weights) signatures of shape (2, N):
    the first row holds the N features, the second their weights.
    """
    return EMD(F, D)
# Stopwords
path = 'stopWords.txt'
def get_stopwords(path):
    w = open(path, 'r', encoding='utf-8')
    stopword = []
    for line in w.readlines():
        newline = line.strip()
        stopword.append(list(jieba.cut(newline)))
    w.close()
    # Materialise as a set: an itertools.chain iterator would be exhausted
    # after the first membership scan, silently disabling the filter.
    return set(itertools.chain(*stopword))
stopwords = get_stopwords(path)

def throw_stopwords(words, stopwords):
    new_list = []
    for word in words:
        if word not in stopwords:
            new_list.append(word)
    return new_list
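
For instance:

print(throw_stopwords(['今天', '的', '天气'], {'的'}))  # ['今天', '天气']
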

wp = open('question_from_scrawl_200.txt','r',encoding='utf-8')
all_ques = []
for line in wp.readlines():
    newline = line.strip()
    all_ques.append(newline)

wp.close()
tf_idfmodels = models.TfidfModel.load('./tf_idf_v1.model')
dictionary = corpora.Dictionary.load('./dictionary_v2')

def get_similar():
    """
    Load the synonym table: each line holds a canonical word followed by
    its synonyms, tab-separated; map every synonym to the canonical word.
    """
    ws = open('similary_word.txt', 'r', encoding='utf-8')
    dict_sim = {}
    for line in ws.readlines():
        newline = line.strip().split('\t')
        for word in newline[1:]:
            dict_sim[word] = newline[0]
    ws.close()
    return dict_sim

def get_sub_list(lists, dict_sim):
    """Replace each word by its canonical synonym where one exists."""
    new_list = []
    for word in lists:
        if word in dict_sim:
            new_list.append(dict_sim[word])
        else:
            new_list.append(word)
    return new_list
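
For example, with a hypothetical one-entry synonym table:

print(get_sub_list(['移动电话', '价格'], {'移动电话': '手机'}))  # ['手机', '价格']
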


def get_signature(listss, dict_sim):
    """
    Build a (words, tf-idf weights) signature for a token list;
    the weights are normalised to sum to 1.
    """
    listss = get_sub_list(listss, dict_sim)
    corpus = [dictionary.doc2bow(listss)]
    corpus_tfidf = list(tf_idfmodels[corpus])[0]
    weights1 = []
    list1_new = []
    for term_id, score in corpus_tfidf:
        weights1.append(score)
        list1_new.append(dictionary.get(term_id))
    total = sum(weights1)
    weights1 = np.array([w / total for w in weights1])
    list1_new = np.array(list1_new)
    signature1 = (list1_new, weights1)
    return list1_new, weights1, signature1

def get_standard_data():
    """Read the standard FAQ questions (third '----' field of each line)."""
    ws = open('new_faqs_data_v2.txt', 'r', encoding='utf-8')
    ques_standard = [line.strip().split('----')[2] for line in ws.readlines()]
    ws.close()
    return ques_standard

def getExampleSignatures():
    """
    Scores every crawled question against the standard FAQ questions
    with WMD and appends the five closest matches to the output file.
    """
    dict_sim = get_similar()
    wk = open('question_score_tfidf_weight_v3.txt', 'a', encoding='utf-8')
    model = Word2Vec.load("./word2vec_test_v7.model")
    ques_standard = get_standard_data()
    ti = 0
    all_standard_ques, all_weights, all_signature = get_standard_ques(ques_standard, model, dict_sim)
    for list1_1 in all_ques:
        times1 = time.time()
        list1 = throw_stopwords(list(jieba.cut(list1_1)), stopwords)
        list1 = list(set(list1))
        list1 = [i for i in list1 if i in model.wv.vocab]
        list1_new, weights1, signature1 = get_signature(list1, dict_sim)
        raw_distance = {}
        for newline in range(len(all_standard_ques)):
            # Pass the word list (all_signature[newline][0]), not the
            # pre-normalised matrix: getDistMatrix normalises internally.
            D = getDistMatrix(list1_new, all_signature[newline][0], model)
            F = getFlowMatrix(signature1, all_signature[newline], D)
            ti += 1
            raw_distance[newline] = getEMD(signature1, all_signature[newline], model, F, D)
            if ti % 1000 == 0:
                times2 = time.time()
                print('time spent', times2 - times1)
        new_sort = sorted(raw_distance.items(), key=lambda x: x[1], reverse=False)
        wk.writelines(list1_1 + '****')
        # new_sort keys are indices into ques_standard; write the question text
        for idx, score in new_sort[:4]:
            wk.writelines(ques_standard[idx] + '----' + str(score) + '----')
        wk.writelines(ques_standard[new_sort[4][0]] + '----' + str(new_sort[4][1]) + '\n')
    wk.close()



# getExampleSignatures()

def get_percent(F):
    """Prune the flow matrix: zero out everything at or below the 70th percentile."""
    threshold = np.percentile(F, 70)
    F = np.where(F <= threshold, 0, F)
    return F
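
A toy illustration with made-up flow values: the 70th percentile here is 0.31, so only the strongest flow survives.

F_demo = np.array([[0.1, 0.4], [0.2, 0.3]])
print(get_percent(F_demo))  # [[0.  0.4], [0.  0. ]]
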

def get_common_list(list1, list2, dict_sim):
    """
    Map both sentences onto canonical synonyms, then build the shared
    vocabulary as the union of their word sets.
    """
    list1 = get_sub_list(list1, dict_sim)
    list2 = get_sub_list(list2, dict_sim)
    return list1, list2, list(set(list1).union(set(list2)))  # union of the two sentences' words
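
For example, with an empty synonym table (the union's order is arbitrary because it comes from a set):

l1, l2, common = get_common_list(['苹果', '手机'], ['手机', '价格'], {})
print(sorted(common))  # ['价格', '手机', '苹果']
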


def get_vector(list1, common_list):
    """
    Build a normalised tf-idf weight vector for list1 over the shared
    vocabulary (a bag-of-words layout).
    """
    list1 = list(set(list1))
    corpus = [dictionary.doc2bow(list1)]
    corpus_tfidf = list(tf_idfmodels[corpus])[0]
    list1_new = {}
    for term_id, score in corpus_tfidf:
        list1_new[dictionary.get(term_id)] = score
    vector = [0] * len(common_list)
    for ele in list1_new.keys():
        vector[common_list.index(ele)] = list1_new[ele]  # bag-of-words weights
    if sum(vector) == 0:
        # degenerate case: no word received any tf-idf weight
        return np.array(common_list), np.array(vector), (np.array(common_list), np.array(vector))
    total = sum(vector)
    vector = [x / total for x in vector]
    common_list = np.array(common_list)
    weights = np.array(vector)
    return common_list, weights, (common_list, weights)
def get_vector2(list1, common_list):
    """
    Like get_vector, but scores the whole shared vocabulary with tf-idf
    and keeps only the weights of words that appear in list1.
    """
    corpus = [dictionary.doc2bow(common_list)]
    corpus_tfidf = list(tf_idfmodels[corpus])[0]
    list1 = list(set(list1))
    list1_new = {}
    for term_id, score in corpus_tfidf:
        list1_new[dictionary.get(term_id)] = score
    weights = [0] * len(common_list)
    for ele in list1:
        if ele in list1_new:
            weights[common_list.index(ele)] = list1_new[ele]  # bag-of-words weights
    common_list = np.array(common_list)
    weights = np.array(weights)
    return common_list, weights, (common_list, weights)

def get_F():
    """
    Interactive demo: read two sentences, build signatures over their
    shared vocabulary and print the raw and pruned WMD scores.
    """
    dict_sim = get_similar()
    model = Word2Vec.load("./word2vec_test_v7.model")
    while True:
        print('input line:')
        list1_1 = input()
        list1 = throw_stopwords(list(jieba.cut(list1_1)), stopwords)
        list1 = list(set(list1))
        list1 = [i for i in list1 if i in model.wv.vocab]

        print('input line:')
        list2 = input()
        list2 = throw_stopwords(list(jieba.cut(list2)), stopwords)
        list2 = list(set(list2))
        list2 = [sed for sed in list2 if sed in model.wv.vocab]
        list1, list2, common_list = get_common_list(list1, list2, dict_sim)
        print(common_list)
        list1_new, weights1, signature1 = get_vector(list1, common_list)
        list2_new, weights2, signature2 = get_vector(list2, common_list)

        D = getDistMatrix(list1_new, list2_new, model)
        F = getFlowMatrix(signature1, signature2, D)

        distance = getEMD(signature1, signature2, model, F, D)
        print('raw flow matrix', F)
        print('raw WMD', distance)
        F = get_percent(F)
        distance = getEMD(signature1, signature2, model, F, D)
        print('pruned flow matrix', F)
        print('pruned WMD', distance)


get_F()
