实体链接

假设已经从一段文本中找到了实体序列,接下来要将序列链接到某一实体。

链接策略:

  1. 计算序列和每个实体名的字频(TF)余弦相似度 tf_similarity,召回相似度大于0.5的实体(按得分从高到低最多取8个)
  2. 对每个召回的实体计算综合得分:0.4*simi(序列,实体) + 0.6*top_simi(序列,别名s),其中 simi 为 nlutools 的 simiscore
  3. 以上得分的top1若不低于0.7,就是序列最终链接到的实体;否则视为无法链接

code:

import ast
import logging
import math
import os
from collections import Counter

import distance
import nlutools
import numpy as np
from nlutools import tools as nlu
from scipy.linalg import norm
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

def get_logger(filename):
    """Return the shared 'logger' logger, also writing records to *filename*.

    The named logger is set to DEBUG. Root logging is configured through
    ``logging.basicConfig`` with a bare ``%(message)s`` format, and a
    timestamped DEBUG-level ``FileHandler`` for *filename* is attached to the
    root logger, so records from the returned logger reach it by propagation.

    Calling this function again with the same file does not attach a second
    handler; otherwise every record would be written to the file once per
    call.

    Args:
        filename: Path of the log file to append to.

    Returns:
        The ``logging.Logger`` registered under the name ``'logger'``.
    """
    logger = logging.getLogger('logger')
    logger.setLevel(logging.DEBUG)
    logging.basicConfig(format='%(message)s', level=logging.DEBUG)

    root = logging.getLogger()
    # FileHandler stores its target as an absolute path in ``baseFilename``.
    target = os.path.abspath(filename)
    already_attached = any(
        isinstance(existing, logging.FileHandler)
        and existing.baseFilename == target
        for existing in root.handlers
    )
    if not already_attached:
        handler = logging.FileHandler(filename)
        handler.setLevel(logging.DEBUG)
        handler.setFormatter(
            logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
        root.addHandler(handler)
    return logger

def add_space(s):
    """Return *s* with a single space inserted between consecutive items.

    For a string this separates every character, e.g. ``"abc"`` becomes
    ``"a b c"``, so that a whitespace tokenizer sees one token per character.
    """
    separator = ' '
    return separator.join(s)

def edit_distance(s1, s2):
    """Return the Levenshtein distance between *s1* and *s2*.

    Delegates to ``distance.levenshtein`` from the third-party ``distance``
    package.
    """
    levenshtein = distance.levenshtein
    return levenshtein(s1, s2)

def tf_similarity(s1, s2):
    """Return the cosine similarity of the character-frequency vectors.

    Each string is lower-cased and split into one token per non-whitespace
    character. This is the same tokenisation the previous
    ``CountVectorizer(tokenizer=str.split)`` pipeline produced, but without
    building a vectorizer on every call.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A float in ``[0, 1]``. Returns ``0.0`` when either string has no
        non-whitespace characters, instead of dividing by a zero norm.
    """
    # Space the characters out first, then lower-case, then split: this
    # keeps the token boundaries identical to the vectorizer-based version.
    counts1 = Counter(' '.join(s1).lower().split())
    counts2 = Counter(' '.join(s2).lower().split())
    if not counts1 or not counts2:
        return 0.0

    dot = sum(freq * counts2[char] for char, freq in counts1.items())
    norm1 = math.sqrt(sum(freq * freq for freq in counts1.values()))
    norm2 = math.sqrt(sum(freq * freq for freq in counts2.values()))
    return dot / (norm1 * norm2)

def jaccard_similarity(s1, s2):
    """Return the multiset Jaccard coefficient of the characters.

    Each string is lower-cased and split into one token per non-whitespace
    character. The score is the sum of per-character minimum counts divided
    by the sum of per-character maximum counts.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A float in ``[0, 1]``. Returns ``0.0`` when neither string has any
        non-whitespace character, instead of dividing by zero.
    """
    counts1 = Counter(' '.join(s1).lower().split())
    counts2 = Counter(' '.join(s2).lower().split())

    # Counter's ``&`` keeps the minimum count per key and ``|`` the maximum,
    # i.e. multiset intersection and union.
    numerator = sum((counts1 & counts2).values())
    denominator = sum((counts1 | counts2).values())
    if denominator == 0:
        return 0.0
    return numerator / denominator

# a = nlu.sensimi('移动端在线购物平台','网上购物平台',100)
# a = nlu.simiscore("移动端在线购物平台",'网上购物平台',type='ifchange')

def all_entity_dict(path="merge_all_adj_use.txt"):
    """Load the entity -> aliases table from a tab-separated text file.

    Every non-blank line has the form ``entity<TAB>alias1,alias2,...``. When
    an entity appears on several lines its alias lists are concatenated in
    file order.

    Args:
        path: File to read. Defaults to ``merge_all_adj_use.txt`` (the
            filtered version of ``merge_all_adj.txt``) in the current
            working directory.

    Returns:
        A dict mapping each entity name to the list of its aliases.

    Raises:
        ValueError: If a non-blank line does not contain exactly one tab.
    """
    entity_dict = {}
    # Read as UTF-8, matching the encoding used for the output file; the
    # ``with`` block closes the handle even if a line fails to parse.
    with open(path, encoding="utf8") as f:
        for line in f:
            line = line.rstrip()
            if not line:
                continue  # tolerate blank lines between records
            entity, joined_aliases = line.split("\t")
            entity_dict.setdefault(entity, []).extend(joined_aliases.split(","))
    return entity_dict


def recall_entity(entity_dict, ner, simi_type="tf"):
    """Recall the entities whose names are most similar to a mention.

    Only the entity names (the keys of *entity_dict*) are compared with
    *ner*; aliases are not consulted here.

    Args:
        entity_dict: Mapping of entity name to its aliases, e.g.
            ``{"互动平台": ["窗帘互动平台", "校友互动平台"], ...}``.
        ner: The recognised mention to link, e.g. ``"高铁项目"``.
        simi_type: Similarity measure used for recall:
            ``"nlu"`` uses ``nlu.simiscore`` and keeps scores > 0.6,
            ``"tf"`` uses ``tf_similarity`` and keeps scores > 0.5,
            ``"jaccard"`` uses ``jaccard_similarity`` and keeps scores > 0.3.
            Any other value recalls nothing.

    Returns:
        Up to 8 ``(entity, score)`` tuples, scores rounded to 3 decimals,
        sorted by descending score.
    """
    thresholds = {"nlu": 0.6, "tf": 0.5, "jaccard": 0.3}
    if simi_type not in thresholds:
        return []
    threshold = thresholds[simi_type]

    candidates = {}
    for entity in entity_dict:
        if simi_type == "nlu":
            score = nlu.simiscore(entity, ner, type='ifchange')
        elif simi_type == "tf":
            score = tf_similarity(entity, ner)
        else:
            # Previously assigned to a different name than the one tested
            # below, so the jaccard mode raised NameError.
            score = jaccard_similarity(entity, ner)
        if score > threshold:
            candidates[entity] = round(score, 3)

    # Highest score first; ties keep entity_dict iteration order.
    ranked = sorted(candidates.items(), key=lambda x: x[1], reverse=True)
    return ranked[:8]

def ner_simiscore(entity_dict, ner):
    """Link a recognised mention to its best-matching entity, if any.

    Candidates come from ``recall_entity`` (at most 8). Each candidate is
    re-scored as ``0.4 * simi(ner, entity) + 0.6 * max(simi(ner, alias))``,
    where ``simi`` is ``nlu.simiscore(..., type='ifchange')``. The best
    candidate is returned only if its score reaches 0.7.

    Args:
        entity_dict: Mapping of entity name to its aliases, e.g.
            ``{"互动平台": ["窗帘互动平台", "校友互动平台"], ...}``.
        ner: The recognised mention to link, e.g. ``"高铁项目"``.

    Returns:
        ``[(entity, score)]`` with the score rounded to 3 decimals, e.g.
        ``[('培训项目', 0.89)]``, or ``[]`` when nothing was recalled or the
        best score is below 0.7.
    """
    candidates = recall_entity(entity_dict, ner)
    if not candidates:
        print("无召回实体")
        return []

    scores = {}
    for entity, _recall_score in candidates:
        entity_simi = nlu.simiscore(ner, entity, type='ifchange')
        alias_simis = [
            nlu.simiscore(ner, alias, type='ifchange')
            for alias in entity_dict[entity]
        ]
        # An entity without aliases is scored on its name alone. Before,
        # alias_max was only set inside the alias loop, so such an entity
        # reused the previous candidate's value or raised NameError.
        alias_max = max(alias_simis, default=entity_simi)
        scores[entity] = round((0.4 * entity_simi + 0.6 * alias_max), 3)

    # Highest score first, e.g. [('培训项目', 0.89), ('教育培训项目', 0.817)]
    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    if ranked[0][1] >= 0.7:
        return ranked[:1]
    return []


def ttt(ners, entity_dict):
    """Link every mention in *ners* and return the results in order.

    Args:
        ners: Recognised mentions, e.g. ``["高铁项目", "集成系统"]``. A falsy
            value (empty list, ``None``) yields an empty result.
        entity_dict: Mapping of entity name to its aliases, e.g.
            ``{"互动平台": ["窗帘互动平台", "校友互动平台"], ...}``.

    Returns:
        A list with one ``ner_simiscore`` result per mention.
    """
    if not ners:
        return []
    # Mentions are lower-cased so English text is matched case-insensitively.
    return [ner_simiscore(entity_dict, mention.lower()) for mention in ners]
    


if __name__ == "__main__":
    # Batch mode: each line of "pm_temp" holds a Python list literal of
    # recognised mentions (e.g. ['考试系统', '考试管理系统']). The linking
    # result for every line is printed and written to "pm_temp_pre".
    entity_dict = all_entity_dict()
    # Both files are managed by ``with`` so they are closed even if a line
    # fails to parse; input is read as UTF-8 to match the output encoding.
    with open("pm_temp", encoding="utf8") as fin, \
            open("pm_temp_pre", mode="w", encoding="utf8") as fout:
        for line in fin:
            # literal_eval only accepts Python literals, so a malformed or
            # malicious input line cannot execute arbitrary code as it could
            # with eval().
            ners = ast.literal_eval(line.rstrip())
            result = ttt(ners, entity_dict)
            print("{}\t{}".format(ners, result))
            fout.write("{}\n".format(result))

merge_all_adj_use:

邮件营销系统	edm邮件营销系统,电子邮件营销系统,邮件营销系统
数据营销系统	数据营销系统,大数据营销系统,数字管理营销系统
车险营销系统	阳光车险营销系统,车险营销系统
航空公司营销项目	航空公司营销项目,航空公司营销系统,航空公司营销业务
电力营销系统	南方电网营销系统,电力营销系统
智能营销系统	智能营销系统,智慧营销系统

pm_temp:

[]
['移动端在线购物平台']
[]
['培训项目']
['通信系统网络规划项目']
['云清洗平台', '电信自营系统']
['子系统']
['智能平台']
['宝洁集团化妆品品']
['综合性平台']
['交付勘测业务']
['公司高铁系统', '集成系统']
['考试系统', '考试管理系统']
['开放性平台']
[]

pm_temp_pre:

[]
[[('跨境购物平台', 0.813)]]
[]
[[('培训项目', 1.0)]]
[[('无线网络规划项目', 0.842)]]
[[], [('电信级网上运营系统', 0.764)]]
[[]]
[[('智能移动平台', 0.706)]]
[[]]
[[]]
[[]]
[[], [('系统集成系统', 0.83)]]
[[('考试系统', 1.0)], [('管理系统', 0.912)]]
[[]]
[]

你可能感兴趣的:(nlp)