from gensim.models import Word2Vec
from gensim import corpora, models
import numpy as np
import scipy.optimize
import time
import jieba
import itertools
jieba.load_userdict('/export/user/shizhengxin/word2vec/jiebaUserWord.txt')
def positivity(f):
    '''
    Constraint 1:
    Flow must be non-negative (goods only move from source to target)
    '''
    return f
def fromSrc(f, wp, i, shape):
"""
Constraint 2:
Limits supply for source according to weight
"""
fr = np.reshape(f, shape)
f_sumColi = np.sum(fr[i, :])
return wp[i] - f_sumColi
def toTgt(f, wq, j, shape):
"""
Constraint 3:
Limits demand for target according to weight
"""
fr = np.reshape(f, shape)
f_sumRowj = np.sum(fr[:, j])
return wq[j] - f_sumRowj
def maximiseTotalFlow(f, wp, wq):
"""
Constraint 4:
Forces maximum supply to move from source to target
"""
return f.sum() - np.minimum(wp.sum(), wq.sum())
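# Used below as an 'eq' constraint, so any feasible flow matrix must move
# exactly min(total supply, total demand). Illustrative sketch with
# hypothetical weights: if wp sums to 1.0 and wq sums to 0.8, the solver
# only accepts flow matrices F with F.sum() == 0.8.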
def flow(f, D):
"""
The objective function
The flow represents the amount of goods to be moved
from source to target
"""
f = np.reshape(f, D.shape)
return (f * D).sum()
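# Illustrative sketch (hypothetical values): the flat flow vector is
# reshaped to D's shape, and the objective is the total transport cost.
# >>> flow(np.ones(4), np.array([[0., 1.], [1., 0.]]))
# 2.0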
def groundDistance(x1, x2, norm=2):
"""
L-norm distance
Default norm = 2
"""
return np.linalg.norm(x1 - x2, norm)
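# Illustrative sketch (hypothetical vectors), using the default L2 norm:
# >>> groundDistance(np.array([0., 0.]), np.array([3., 4.]))
# 5.0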
def get_normlize_data(common_list, model):
    """
    Returns a matrix whose rows are the L2-normalised word vectors
    of the words in common_list.
    """
    common_list = list(common_list)
    vector = [model.wv[word] / np.linalg.norm(model.wv[word]) for word in common_list]
    return np.mat(vector)
def get_standard_ques(lists, model, dict_sim):
    """
    Pre-processes every standard question: tokenise, drop stopwords,
    keep only in-vocabulary words, and build the tf-idf signature.
    """
    all_standard_ques = []
    all_weights = []
    all_signature = []
    for i in lists:
        list2 = throw_stopwords(list(jieba.cut(i)), stopwords)
        list2 = list(set(list2))
        list2 = [sed for sed in list2 if sed in model.wv.vocab]
        list2_new, weights2, signature2 = get_signature(list2, dict_sim)
        # Keep the word list itself; getDistMatrix normalises the vectors later.
        all_standard_ques.append(list2_new)
        all_weights.append(weights2)
        all_signature.append(signature2)
    return all_standard_ques, all_weights, all_signature
def getDistMatrix(s1, s2, model):
    """
    Computes the distance matrix between the source
    and target distributions.
    The ground distance is the cosine distance (1 - cosine similarity)
    over L2-normalised word vectors.
    """
    s1 = get_normlize_data(s1, model)
    s2 = get_normlize_data(s2, model)
    # Rows are unit vectors, so the dot product is the cosine similarity.
    distance_matrix = s1.dot(s2.T)
    distance_matrix = np.ones(distance_matrix.shape) - distance_matrix
    # Snap near-orthogonal pairs to 1 and near-identical pairs to 0.
    distance_matrix = np.where(distance_matrix >= 0.9, 1, distance_matrix)
    distance_matrix = np.where(distance_matrix <= 0.4, 0, distance_matrix)
    return distance_matrix
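# Sketch of the thresholding with hypothetical similarities: a word pair
# with cosine similarity 0.95 gets distance 0.05, which is snapped to 0
# (treated as identical), while a pair with similarity 0.05 gets distance
# 0.95, which is snapped to 1 (treated as unrelated).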
def getFlowMatrix(P, Q, D):
"""
Computes the flow matrix between P and Q
"""
numFeats1 = P[0].shape[0]
numFeats2 = Q[0].shape[0]
shape = (numFeats1, numFeats2)
cons1 = [{'type': 'ineq', 'fun': positivity},
{'type': 'eq', 'fun': maximiseTotalFlow, 'args': (P[1], Q[1],)}]
cons2 = [{'type': 'ineq', 'fun': fromSrc, 'args': (P[1], i, shape,)} for i in range(numFeats1)]
cons3 = [{'type': 'ineq', 'fun': toTgt, 'args': (Q[1], j, shape,)} for j in range(numFeats2)]
cons = cons1 + cons2 + cons3
F_guess = np.zeros(D.shape)
F = scipy.optimize.minimize(flow, F_guess, args=(D,), constraints=cons)
F = np.reshape(F.x, (numFeats1, numFeats2))
return F
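# Minimal sketch with hypothetical single-word signatures: unit weights on
# both sides force all mass to move along the only available edge.
# >>> P_toy = (np.array(['a']), np.array([1.0]))
# >>> Q_toy = (np.array(['b']), np.array([1.0]))
# >>> getFlowMatrix(P_toy, Q_toy, np.array([[0.7]]))  # approximately [[1.0]]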
def EMD(F, D):
"""
EMD formula, normalised by the flow
"""
return (F * D).sum() / F.sum()
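# Illustrative sketch (hypothetical flow and distance matrices):
# >>> F_toy = np.array([[0.3, 0.2], [0.0, 0.5]])
# >>> D_toy = np.array([[0., 1.], [1., 0.]])
# >>> EMD(F_toy, D_toy)  # (0.2 * 1.0) / 1.0
# 0.2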
def getEMD(P, Q, model, F, D):
    """
    Earth Mover's Distance between the distributions P and Q.
    P and Q are signatures of shape (2, N): the first row is the set of
    N features, the second row the corresponding N weights.
    (P, Q and model are kept for interface compatibility; the distance is
    computed from the precomputed flow F and distance matrix D.)
    """
    return EMD(F, D)
path = 'stopWords.txt'
def get_stopwords(path):
    w = open(path, 'r', encoding='utf-8')
    stopword = []
    for line in w.readlines():
        newline = line.strip()
        stopword.append(list(jieba.cut(newline)))
    w.close()
    # Materialise as a set: itertools.chain is a one-shot iterator and
    # would be exhausted after the first membership test.
    return set(itertools.chain(*stopword))
stopwords = get_stopwords(path)
def throw_stopwords(word_list, stopwords):
    """Removes stopwords from a token list."""
    new_list = []
    for word in word_list:
        if word not in stopwords:
            new_list.append(word)
    return new_list
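# Usage sketch (hypothetical tokens): keeps only non-stopword tokens.
# >>> throw_stopwords(['我', '想', '退款'], {'我', '想'})
# ['退款']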
wp = open('question_from_scrawl_200.txt','r',encoding='utf-8')
all_ques = []
for line in wp.readlines():
newline = line.strip()
all_ques.append(newline)
wp.close()
tf_idfmodels = models.TfidfModel.load('./tf_idf_v1.model')
dictionary = corpora.Dictionary.load('./dictionary_v2')
def get_similar():
ws = open('similary_word.txt','r',encoding='utf-8')
dict_sim = {}
for line in ws.readlines():
newline = line.strip().split('\t')
for word in newline[1:]:
dict_sim[word] = newline[0]
ws.close()
return dict_sim
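# Assumed format of similary_word.txt, judging from the parsing above:
# one tab-separated synonym group per line, head word first, e.g.
#   退款<TAB>退钱<TAB>返款
# Every synonym is mapped back to the head word of its group.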
def get_sub_list(lists, dict_sim):
    """Replaces every word that appears in dict_sim by its head word."""
    new_list = []
    for word in lists:
        if word in dict_sim:
            new_list.append(dict_sim[word])
        else:
            new_list.append(word)
    return new_list
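# Usage sketch (hypothetical synonym dict): words found in dict_sim are
# replaced by their head word, everything else passes through unchanged.
# >>> get_sub_list(['手机', '退钱'], {'退钱': '退款'})
# ['手机', '退款']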
def get_signature(listss, dict_sim):
    """
    Builds a signature (features, weights) from a word list: synonyms are
    collapsed first, then tf-idf scores are normalised to sum to 1.
    """
    listss = get_sub_list(listss, dict_sim)
    corpus = [dictionary.doc2bow(listss)]
    corpus_tfidf = list(tf_idfmodels[corpus])[0]
    weights1 = []
    list1_new = []
    for id, score in corpus_tfidf:
        weights1.append(score)
        list1_new.append(dictionary.get(id))
    total = sum(weights1)
    weights1 = np.array([score / total for score in weights1])
    list1_new = np.array(list1_new)
    signature1 = (list1_new, weights1)
    return list1_new, weights1, signature1
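# Sketch of the weight normalisation (hypothetical tf-idf scores): if the
# surviving words score 0.2 and 0.6, the signature weights become
# [0.25, 0.75], i.e. each score divided by the total 0.8.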
def get_standard_data():
ws = open('new_faqs_data_v2.txt', 'r', encoding='utf-8')
ques_standard = [line.strip().split('----')[2] for line in ws.readlines()]
ws.close()
return ques_standard
def getExampleSignatures():
    """
    Scores every crawled question against all standard questions and
    writes the five closest matches (by EMD) to the output file.
    """
    dict_sim = get_similar()
    wk = open('question_score_tfidf_weight_v3.txt', 'a', encoding='utf-8')
    model = Word2Vec.load("./word2vec_test_v7.model")
    ques_standard = get_standard_data()
    ti = 0
    all_standard_ques, all_weights, all_signature = get_standard_ques(ques_standard, model, dict_sim)
    for list1_1 in all_ques:
        times1 = time.time()
        list1 = throw_stopwords(list(jieba.cut(list1_1)), stopwords)
        list1 = list(set(list1))
        list1 = [i for i in list1 if i in model.wv.vocab]
        list1_new, weights1, signature1 = get_signature(list1, dict_sim)
        raw_distance = {}
        for newline in range(len(all_standard_ques)):
            D = getDistMatrix(list1_new, all_standard_ques[newline], model)
            F = getFlowMatrix(signature1, all_signature[newline], D)
            ti += 1
            raw_distance[newline] = getEMD(signature1, all_signature[newline], model, F, D)
            if ti % 1000 == 0:
                times2 = time.time()
                print('time spent', times2 - times1)
        new_sort = sorted(raw_distance.items(), key=lambda x: x[1], reverse=False)
        wk.write(list1_1 + '****')
        # Each entry is the index of a standard question and its EMD score.
        for index, score in new_sort[:4]:
            wk.write(str(index) + '----' + str(score) + '----')
        wk.write(str(new_sort[4][0]) + '----' + str(new_sort[4][1]) + '\n')
    wk.close()
def get_percent(F):
    """Zeroes out all flows at or below the 70th percentile."""
    threshold = np.percentile(F, 70)
    F = np.where(F <= threshold, 0, F)
    return F
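# Usage sketch (hypothetical flows): entries at or below the 70th
# percentile are zeroed, keeping only the largest flows.
# >>> get_percent(np.array([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.]))
# array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  8.,  9., 10.])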
def get_common_list(list1, list2, dict_sim):
    list1 = get_sub_list(list1, dict_sim)
    list2 = get_sub_list(list2, dict_sim)
    # Build the common vocabulary: the union of both word lists
    return list1, list2, list(set(list1).union(set(list2)))
def get_vector(list1, common_list):
    list1 = list(set(list1))
    corpus = [dictionary.doc2bow(list1)]
    corpus_tfidf = list(tf_idfmodels[corpus])[0]
    list1_new = {}
    for id, score in corpus_tfidf:
        list1_new[dictionary.get(id)] = score
    # Build the tf-idf weight vector over the common vocabulary
    vector = [0] * len(common_list)
    for ele in list1_new.keys():
        vector[common_list.index(ele)] = list1_new[ele]
    if sum(vector) == 0:
        # No word received any weight; fall back to the raw lists
        return common_list, vector, (list1, vector)
    vector = list(map(lambda x: x / sum(vector), vector))
    common_list = np.array(common_list)
    weights = np.array(vector)
    return common_list, weights, (common_list, weights)
def get_vector2(list1, common_list):
    corpus = [dictionary.doc2bow(common_list)]
    corpus_tfidf = list(tf_idfmodels[corpus])[0]
    list1 = list(set(list1))
    list1_new = {}
    for id, score in corpus_tfidf:
        list1_new[dictionary.get(id)] = score
    # Build the tf-idf weight vector over the common vocabulary
    weights = [0] * len(common_list)
    for ele in list1:
        if ele in list1_new:
            weights[common_list.index(ele)] = list1_new[ele]
    common_list = np.array(common_list)
    weights = np.array(weights)
    return common_list, weights, (common_list, weights)
def get_F():
    """
    Interactive demo: reads two sentences, builds their signatures over
    the common vocabulary, and prints the flow matrix and WMD before and
    after pruning small flows.
    """
    dict_sim = get_similar()
    model = Word2Vec.load("./word2vec_test_v7.model")
    while True:
        print('input line:')
        list1_1 = input()
        list1 = throw_stopwords(list(jieba.cut(list1_1)), stopwords)
        list1 = list(set(list1))
        list1 = [i for i in list1 if i in model.wv.vocab]
        print('input line:')
        list2 = input()
        list2 = throw_stopwords(list(jieba.cut(list2)), stopwords)
        list2 = list(set(list2))
        list2 = [sed for sed in list2 if sed in model.wv.vocab]
        list1, list2, common_list = get_common_list(list1, list2, dict_sim)
        print(common_list)
        list1_new, weights1, signature1 = get_vector(list1, common_list)
        list2_new, weights2, signature2 = get_vector(list2, common_list)
        D = getDistMatrix(list1_new, list2_new, model)
        F = getFlowMatrix(signature1, signature2, D)
        distance = getEMD(signature1, signature2, model, F, D)
        print('original flow weights', F)
        print('original wmd', distance)
        F = get_percent(F)
        distance = getEMD(signature1, signature2, model, F, D)
        print('pruned flow weights', F)
        print('improved wmd', distance)
get_F()