#!/usr/bin/python3
import kashgari
from kashgari.embeddings import BERTEmbedding
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import jieba

# Path to the pre-trained Chinese BERT checkpoint (chinese_L-12_H-768_A-12).
chinese_bert_file = './data/chinese_L-12_H-768_A-12'
def cal_cosine():
    """Embed two sentences with BERT and print their cosine similarity."""
    bert = BERTEmbedding(chinese_bert_file,
                         task=kashgari.CLASSIFICATION,
                         sequence_length=10)
    sen1 = input('sentence1:')
    sen2 = input('sentence2:')
    # Segment each sentence into words with jieba (precise mode).
    seg_list1 = list(jieba.cut(sen1, cut_all=False))
    seg_list2 = list(jieba.cut(sen2, cut_all=False))
    # embed_one returns one (sequence_length, embedding_dim) tensor per sentence.
    embed_tensor1 = bert.embed_one(seg_list1)
    embed_tensor2 = bert.embed_one(seg_list2)
    # Pool the token embeddings into a single sentence vector by summing over
    # the sequence axis; deriving the width from the tensor itself avoids
    # hard-coding the embedding size.
    embedding1 = embed_tensor1.sum(axis=0).reshape(1, -1)
    embedding2 = embed_tensor2.sum(axis=0).reshape(1, -1)
    cos_value = cosine_similarity(embedding1, embedding2)
    print('cos_value =', str(cos_value[0][0]))
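
# For reference, the cosine similarity used above is
#   cos(a, b) = (a . b) / (||a|| * ||b||)
# e.g. cosine_similarity([[1, 0]], [[1, 1]])[0][0] ~= 1/sqrt(2) ~= 0.7071.
# Mean pooling (embed_tensor.mean(axis=0)) is a common alternative to the
# sum pooling above; since cosine similarity is scale-invariant, both
# yield the same score here.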
def compute_similar():
    """Compare two sentences as bag-of-words count vectors and print
    their cosine similarity."""
    sen1 = input('sentence1:')
    sen2 = input('sentence2:')
    wordlist_1 = list(jieba.cut(sen1))
    wordlist_2 = list(jieba.cut(sen2))
    # Vocabulary: every distinct word appearing in either sentence.
    word_dict = set(wordlist_1 + wordlist_2)
    word_count_vec_1 = []
    word_count_vec_2 = []
    for word in word_dict:
        # Count raw occurrences of each vocabulary word in each sentence.
        word_count_vec_1.append(sen1.count(word))
        word_count_vec_2.append(sen2.count(word))
    vec_1 = np.array(word_count_vec_1).reshape(1, -1)
    vec_2 = np.array(word_count_vec_2).reshape(1, -1)
    result = cosine_similarity(vec_1, vec_2)
    print('cos_result =', str(result[0][0]))
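
# Worked example (hypothetical input, assuming jieba's default segmentation):
# for sen1 = '我喜欢苹果' and sen2 = '我喜欢香蕉', jieba yields
# ['我', '喜欢', '苹果'] and ['我', '喜欢', '香蕉']. Over the vocabulary
# {我, 喜欢, 苹果, 香蕉} the count vectors are [1, 1, 1, 0] and [1, 1, 0, 1],
# so the cosine similarity is 2 / (sqrt(3) * sqrt(3)) = 2/3 ~= 0.667.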
if __name__ == '__main__':
    cal_cosine()
    compute_similar()