Computing Text Similarity with Word Vectors (Full Code + Test Data)


1. Test data:

Link: https://pan.baidu.com/s/1fXJjcujAmAwTfsuTg2CbWA
Extraction code: f4vx

2. Code:

import math
import os
import pandas as pd


# Cosine similarity between two vectors of dimension vec_dim
def cos_similarity(vec_dim, vector_1, vector_2):
    # dot product of the two vectors
    x = 0
    for i in range(vec_dim):
        x += vector_1[i] * vector_2[i]

    # squared norms of the two vectors
    sq_1 = 0
    sq_2 = 0
    for i in range(vec_dim):
        sq_1 += vector_1[i] * vector_1[i]
        sq_2 += vector_2[i] * vector_2[i]

    return float(x) / (math.sqrt(sq_1) * math.sqrt(sq_2))
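
# Quick sanity checks with made-up vectors (not from the test data):
#   cos_similarity(2, [3.0, 4.0], [3.0, 4.0]) -> 1.0  (identical vectors)
#   cos_similarity(2, [1.0, 0.0], [0.0, 1.0]) -> 0.0  (orthogonal vectors)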


def get_embeddings(path):
    with open(path, encoding='utf8') as f:
        data = f.read().splitlines()  # read the file line by line, stripping newlines

    # header line: number of vectors and vector dimension
    row = int(data[0].split()[0])
    dim = int(data[0].split()[1])

    embeddings = []
    for i in range(1, row + 1):  # data[0] is the header, so the vectors are data[1]..data[row]
        item = data[i].split(' ')
        word = item[0]  # the word itself; not used downstream
        embedding = list(map(float, item[1:]))  # parse with float instead of eval
        embeddings.append(embedding)
    return embeddings, dim
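
# Expected input (word2vec text format): the header line gives the number of
# vectors and their dimension, and each following line holds a word plus its vector:
#   2 3
#   python 0.1 0.2 0.3
#   java 0.3 0.1 0.2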


def find_each(path):
    path_list = []
    for file in os.listdir(path):
        path_list.append(os.path.join(path, file))  # join path components portably
    return path_list


def get_sim_matrix(path_1, path_2):  # takes the paths of two embedding files
    # load the two sets of embeddings
    embeddings_1, vec_dim_1 = get_embeddings(path_1)
    embeddings_2, vec_dim_2 = get_embeddings(path_2)

    # build the pairwise similarity matrix
    if vec_dim_1 != vec_dim_2:
        raise ValueError('input error: the dimensions are different')
    matrix = []
    for em_1 in embeddings_1:
        score = []
        for em_2 in embeddings_2:
            cos_sim = cos_similarity(vec_dim_1, em_1, em_2)
            score.append(cos_sim)  # similarity of the i-th embedding in embeddings_1 to every embedding in embeddings_2
        matrix.append(score)
    return matrix
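
# The matrix has len(embeddings_1) rows and len(embeddings_2) columns; entry
# [i][j] is the cosine similarity between the i-th vector of file 1 and the
# j-th vector of file 2.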


# Convolution layer with a 2x2 receptive field; the argument is the input word-vector matrix
def cnn_folding(dict_vec):
    c = len(dict_vec[0])  # width of the input matrix
    r = len(dict_vec)  # height of the input matrix
    result = [[0 for col in range(c-1)] for row in range(r-1)]  # 2-D list built in plain Python
    for i in range(r-1):  # slide the window over the whole matrix
        for j in range(c-1):
            re = (dict_vec[i][j] + dict_vec[i][j+1] + dict_vec[i+1][j] +
                  dict_vec[i+1][j+1])/4  # the convolution step; the kernel defaults to [[1,1],[1,1]], averaged
            result[i][j] = re
    return result
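
# Example with a made-up 2x2 input: cnn_folding([[1, 2], [3, 4]]) -> [[2.5]]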


# Pooling layer using max-pooling; the argument is the input matrix
def cnn_pooling(dict_pooling):
    c = len(dict_pooling[0])
    r = len(dict_pooling)
    result = [[0 for col in range(c - 1)] for row in range(r - 1)]  # 2-D list built in plain Python
    for i in range(r - 1):
        for j in range(c - 1):
            re = max(dict_pooling[i][j], dict_pooling[i][j + 1], dict_pooling[i + 1][j],
                     dict_pooling[i + 1][j + 1])  # max-pooling over a 2x2 window
            result[i][j] = re
    return result
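
# Example with the same made-up input: cnn_pooling([[1, 2], [3, 4]]) -> [[4]]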


# Chains the convolution and pooling layers, flattening the final output
def pooling_folding(matrix):
    res = []
    data_list = matrix
    while 1:  # alternate between the pooling and convolution layers
        c = len(data_list[0])
        r = len(data_list)
        if c == 1 or r == 1:  # exit condition before the pooling layer
            for i in range(len(data_list)):
                for j in data_list[i]:
                    res.append(j)
            break
        pool = cnn_pooling(data_list)  # pooling layer
        if len(pool) == 1 or len(pool[0]) == 1:  # exit condition after the pooling layer
            data_list = pool
            for i in range(len(data_list)):
                for j in data_list[i]:
                    res.append(j)
            break
        else:
            data_list = cnn_folding(pool)  # convolution layer
    return res
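
# Example trace with a made-up 3x3 matrix:
#   pooling_folding([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
#   cnn_pooling -> [[5, 6], [8, 9]], cnn_folding -> [[7.0]], flattened -> [7.0]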


jd_path = r'D:\thesis\0811\jd_graph\graph_embeddings'
user_path = r'D:\thesis\0811\user_graph\graph_embeddings'
jd_em_paths = find_each(jd_path)  # collect the embedding file paths under each directory
user_em_paths = find_each(user_path)
job_list = []
sim_lists = []
for jd_file in jd_em_paths:
    sim_dict = {}

    jd_file_name = os.path.basename(jd_file)
    jd_name = jd_file_name.split('.')[0]  # the job type name
    job_list.append(jd_name)

    for user_file in user_em_paths:
        sim_matrix = get_sim_matrix(jd_file, user_file)  # rows: the job's embeddings; columns: the user's; values: their similarities
        sim_res = pooling_folding(sim_matrix)  # feed into the convolution/pooling layers and flatten
        sim_score = sum(sim_res)/len(sim_res)  # average the flattened values

        user_file_name = os.path.basename(user_file)
        user_name = user_file_name.split('.')[0]  # user id

        sim_dict.update({user_name: sim_score})  # similarity of this job to each user
    sim_list = sorted(zip(sim_dict.values(), sim_dict.keys()), reverse=True)  # sort in descending order
    sim_list = sim_list[:100]  # keep the top 100

    sim_lists.append(sim_list)

df = pd.DataFrame()
df['jd_sub_type'] = job_list
df['sim_users'] = sim_lists
df.to_csv("../data/jd_user_sim_2.csv", encoding="utf8", index=False, header=True)  # write out the similarity of every job to every user

# df = pd.read_csv("../data/jd_user_sim.csv", encoding='utf8', header=0)  # read the file back

print('end')
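
The resulting CSV has one row per job type; the sim_users column stores the sorted (score, user id) pairs as a string. An illustrative row (the job name, user ids, and scores below are made up):

jd_sub_type,sim_users
java_developer,"[(0.93, 'u_001'), (0.91, 'u_007')]"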

3. Notes

The code computes the pairwise similarity between the text embeddings in two folders. The test data only provides two embedding files, so change the paths to suit your setup before running.
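
To try the pipeline without the test data, here is a minimal sketch: it writes two tiny embedding files in the format described above and runs them through the similarity matrix and pooling steps. All file names, words, and vector values here are made up, and the functions from the listing above are assumed to be defined.

import os

os.makedirs('demo/jd', exist_ok=True)
os.makedirs('demo/user', exist_ok=True)
with open('demo/jd/job1.emb', 'w', encoding='utf8') as f:
    f.write('2 3\npython 0.1 0.2 0.3\njava 0.3 0.1 0.2\n')
with open('demo/user/user1.emb', 'w', encoding='utf8') as f:
    f.write('2 3\nsql 0.2 0.2 0.2\nlinux 0.1 0.3 0.1\n')

sim_matrix = get_sim_matrix('demo/jd/job1.emb', 'demo/user/user1.emb')  # 2x2 similarity matrix
sim_res = pooling_folding(sim_matrix)  # flattens to a single value here
print(sum(sim_res) / len(sim_res))  # one averaged similarity score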

Reference: https://blog.csdn.net/Mr_carry/article/details/80996454 (includes a detailed explanation of the core code.)
