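假设已经通过序列标注从一段文本中识别出若干实体指称(mention),接下来要把每个指称链接到实体库中的某一个标准实体。
链接策略:先用字频(TF)余弦相似度从实体库中召回得分大于 0.5 的候选实体(最多 8 个);再用 nlutools 的语义相似度精排,得分 = 0.4 × 与实体名的相似度 + 0.6 × 与该实体各别名相似度的最大值;若最高分不低于 0.7,则链接到该实体,否则不链接(返回空)。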
code:
import logging, os
from tqdm import tqdm
import distance
import numpy as np
import nlutools
from nlutools import tools as nlu
from sklearn.feature_extraction.text import CountVectorizer
from scipy.linalg import norm
def get_logger(filename):
    """Return the shared ``'logger'`` logger and also log to *filename*.

    Console output is configured through ``logging.basicConfig`` with a bare
    ``%(message)s`` format; a DEBUG-level file handler with a timestamped
    format is attached to the root logger.

    The file handler is attached only once per target file, so calling this
    function repeatedly does not make every record appear several times in
    the log file.

    Args:
        filename: Path of the log file to write to.

    Returns:
        The ``logging.Logger`` named ``'logger'``, set to DEBUG level.
    """
    logger = logging.getLogger('logger')
    logger.setLevel(logging.DEBUG)
    logging.basicConfig(format='%(message)s', level=logging.DEBUG)

    root = logging.getLogger()
    # FileHandler stores the absolute path in ``baseFilename``; compare on
    # that to detect a handler already writing to the same file.
    target = os.path.abspath(filename)
    already_attached = any(
        isinstance(h, logging.FileHandler) and h.baseFilename == target
        for h in root.handlers
    )
    if not already_attached:
        handler = logging.FileHandler(filename)
        handler.setLevel(logging.DEBUG)
        handler.setFormatter(
            logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
        root.addHandler(handler)
    return logger
def add_space(s):
    """Return *s* with a single space inserted between consecutive characters."""
    separator = ' '
    return separator.join(s)
def edit_distance(s1, s2):
    """Return the Levenshtein distance between *s1* and *s2*.

    Delegates to ``distance.levenshtein``.
    """
    levenshtein = distance.levenshtein
    return levenshtein(s1, s2)
def tf_similarity(s1, s2):
    """Return the cosine similarity of the character-frequency vectors.

    Each string is split into single characters (lower-cased, whitespace
    dropped) and counted; the two count vectors are then compared with the
    cosine measure.  Counting is done with ``collections.Counter`` instead of
    fitting a new ``CountVectorizer`` for every pair, which produces the same
    counts without the per-call fitting cost.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A float in ``[0, 1]``.  ``0.0`` is returned when either string has no
        countable characters, instead of dividing by a zero norm (nan) or
        raising on an empty vocabulary.
    """
    from collections import Counter

    # Space-separate the characters, lower-case, then split on whitespace:
    # one token per non-whitespace character.
    counts1 = Counter(' '.join(s1).lower().split())
    counts2 = Counter(' '.join(s2).lower().split())
    if not counts1 or not counts2:
        return 0.0

    # Build aligned term-frequency vectors over the shared, sorted vocabulary.
    vocab = sorted(set(counts1) | set(counts2))
    vec1 = np.array([counts1[token] for token in vocab])
    vec2 = np.array([counts2[token] for token in vocab])

    # Cosine similarity of the two TF vectors.
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))
def jaccard_similarity(s1, s2):
    """Return the multiset Jaccard coefficient of the characters of two strings.

    Each string is split into single characters (lower-cased, whitespace
    dropped) and counted.  The coefficient is the sum of the per-character
    minimum counts divided by the sum of the per-character maximum counts.
    Counting is done with ``collections.Counter`` instead of fitting a new
    ``CountVectorizer`` for every pair.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A float in ``[0, 1]``.  ``0.0`` is returned when neither string has
        any countable character, instead of raising on an empty vocabulary.
    """
    from collections import Counter

    # One token per non-whitespace character, case-insensitive.
    counts1 = Counter(' '.join(s1).lower().split())
    counts2 = Counter(' '.join(s2).lower().split())

    # Multiset union: per-character maximum of the two counts.
    denominator = sum((counts1 | counts2).values())
    if denominator == 0:
        return 0.0
    # Multiset intersection: per-character minimum of the two counts.
    numerator = sum((counts1 & counts2).values())
    return numerator / denominator
# a = nlu.sensimi('移动端在线购物平台','网上购物平台',100)
# a = nlu.simiscore("移动端在线购物平台",'网上购物平台',type='ifchange')
def all_entity_dict(path="merge_all_adj_use.txt"):
    """Load the entity -> aliases table from a tab-separated file.

    Every non-blank line has the form ``entity<TAB>alias1,alias2,...``.
    When an entity occurs on several lines, its alias lists are concatenated
    in file order.

    Args:
        path: File to read.  Defaults to ``merge_all_adj_use.txt`` (the
            filtered version of ``merge_all_adj.txt``) in the current
            working directory.

    Returns:
        A dict mapping each entity name to the list of its aliases.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields.
    """
    entity_dict = {}
    # ``with`` guarantees the handle is closed; the encoding is explicit so
    # the Chinese text does not depend on the platform default.
    with open(path, encoding="utf8") as f:
        for line_no, raw_line in enumerate(f, 1):
            line = raw_line.rstrip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            try:
                entity, alias_field = line.split("\t")
            except ValueError:
                raise ValueError(
                    "{}:{}: expected 'entity<TAB>alias1,alias2,...', got {!r}"
                    .format(path, line_no, line)
                ) from None
            entity_dict.setdefault(entity, []).extend(alias_field.split(","))
    return entity_dict
def recall_entity(entity_dict, ner, simi_type="tf", top_k=8):
    """Recall the candidate entities most similar to a tagged mention.

    Every entity name in *entity_dict* is scored against *ner*; entities
    whose score exceeds the threshold of the chosen measure are kept and the
    best *top_k* are returned.

    Args:
        entity_dict: All entities and their aliases, e.g.
            ``{"互动平台": ["窗帘互动平台", "校友互动平台"],
               "次世代项目": ["次世代项目", "标准次世代项目"], ...}``.
            Only the keys (entity names) are used here.
        ner: Mention produced by sequence labelling, e.g. ``"高铁项目"``.
        simi_type: Similarity measure: ``"nlu"`` (nlutools score, kept if
            > 0.6), ``"tf"`` (character-frequency cosine, kept if > 0.5) or
            ``"jaccard"`` (character Jaccard, kept if > 0.3).
        top_k: Maximum number of candidates to return (default 8).

    Returns:
        A list of ``(entity, score)`` tuples sorted by score in descending
        order, scores rounded to 3 decimals.  Empty if nothing passes the
        threshold or *simi_type* is not one of the values above.
    """
    # measure name -> (scoring function, minimum score to keep a candidate)
    scorers = {
        "nlu": (lambda entity: nlu.simiscore(entity, ner, type='ifchange'), 0.6),
        "tf": (lambda entity: tf_similarity(entity, ner), 0.5),
        "jaccard": (lambda entity: jaccard_similarity(entity, ner), 0.3),
    }
    if simi_type not in scorers:
        return []
    score_of, threshold = scorers[simi_type]

    candidates = {}
    for entity in entity_dict:
        # The jaccard branch used to store its result under a different name
        # than the one it tested; every measure now goes through ``score``.
        score = score_of(entity)
        if score > threshold:
            candidates[entity] = round(score, 3)

    # Sort from highest to lowest score.
    ranked = sorted(candidates.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_k]
def ner_simiscore(entity_dict, ner, threshold=0.7):
    """Link a tagged mention to its best-matching entity, if any.

    Candidates are first recalled with ``recall_entity`` (at most 8, e.g.
    ``[('培训项目', 1.0), ('教育培训项目', 0.839)]``).  Each candidate is then
    re-scored with ``nlu.simiscore`` as
    ``0.4 * sim(ner, entity) + 0.6 * max(sim(ner, alias))`` and the
    top-scoring one is returned when its score reaches *threshold*.

    Args:
        entity_dict: All entities and their aliases, e.g.
            ``{"互动平台": ["窗帘互动平台", "校友互动平台"],
               "次世代项目": ["次世代项目", "标准次世代项目"], ...}``.
        ner: Mention produced by sequence labelling, e.g. ``"高铁项目"``.
        threshold: Minimum combined score for accepting the top candidate
            (default 0.7).

    Returns:
        ``[(entity, score)]`` for the best candidate (score rounded to 3
        decimals), or ``[]`` when nothing is recalled or the best score is
        below *threshold*.
    """
    candidates = recall_entity(entity_dict, ner)
    if not candidates:
        print("无召回实体")
        return []

    # Precise re-scoring of every recalled candidate.
    scores = {}
    for entity, _recall_score in candidates:
        entity_simi = nlu.simiscore(ner, entity, type='ifchange')
        alias_simis = [
            nlu.simiscore(ner, alias, type='ifchange')
            for alias in entity_dict[entity]
        ]
        # An entity without aliases is scored on its own name alone rather
        # than crashing on max() of an empty list.
        alias_max = max(alias_simis) if alias_simis else entity_simi
        scores[entity] = round(0.4 * entity_simi + 0.6 * alias_max, 3)

    # max() keeps the first of equally scored candidates, i.e. the one that
    # ranked higher at recall time.
    best = max(scores.items(), key=lambda item: item[1])
    if best[1] >= threshold:
        return [best]
    return []
def ttt(ners, entity_dict):
    """Link every tagged mention in *ners* to an entity.

    Args:
        ners: Mentions produced by sequence labelling, e.g.
            ``["高铁项目", "集成系统"]``.  May be empty or ``None``.
        entity_dict: All entities and their aliases, e.g.
            ``{"互动平台": ["窗帘互动平台", "校友互动平台"],
               "次世代项目": ["次世代项目", "标准次世代项目"], ...}``.

    Returns:
        A list with one ``ner_simiscore`` result per mention, in input
        order; an empty list when *ners* is empty or ``None``.
    """
    if not ners:
        return []
    # Mentions are lower-cased first so that any English text is normalised.
    return [ner_simiscore(entity_dict, mention.lower()) for mention in ners]
if __name__ == "__main__":
    # Batch-link the mentions listed in ``pm_temp`` (one Python list literal
    # per line) and write one result line per input line to ``pm_temp_pre``.
    # For a quick manual check of a single mention, call e.g.
    # ``recall_entity(entity_dict, "培训项目")`` or
    # ``ner_simiscore(entity_dict, "培训项目")`` instead.
    import ast

    entity_dict = all_entity_dict()
    # Both files are managed by ``with`` so they are closed even if a line
    # fails to parse; the input encoding now matches the output encoding.
    with open("pm_temp", encoding="utf8") as fin, \
            open("pm_temp_pre", mode="w", encoding="utf8") as fout:
        for line in fin:
            # SECURITY: this used to call eval() on raw file content, which
            # executes arbitrary code.  literal_eval only accepts Python
            # literals, which is all a list of strings needs.
            ners = ast.literal_eval(line.rstrip())
            result = ttt(ners, entity_dict)
            print("{}\t{}".format(ners, result))
            fout.write("{}\n".format(result))
merge_all_adj_use:
邮件营销系统 edm邮件营销系统,电子邮件营销系统,邮件营销系统
数据营销系统 数据营销系统,大数据营销系统,数字管理营销系统
车险营销系统 阳光车险营销系统,车险营销系统
航空公司营销项目 航空公司营销项目,航空公司营销系统,航空公司营销业务
电力营销系统 南方电网营销系统,电力营销系统
智能营销系统 智能营销系统,智慧营销系统
pm_temp:
[]
['移动端在线购物平台']
[]
['培训项目']
['通信系统网络规划项目']
['云清洗平台', '电信自营系统']
['子系统']
['智能平台']
['宝洁集团化妆品品']
['综合性平台']
['交付勘测业务']
['公司高铁系统', '集成系统']
['考试系统', '考试管理系统']
['开放性平台']
[]
pm_temp_pre:
[]
[[('跨境购物平台', 0.813)]]
[]
[[('培训项目', 1.0)]]
[[('无线网络规划项目', 0.842)]]
[[], [('电信级网上运营系统', 0.764)]]
[[]]
[[('智能移动平台', 0.706)]]
[[]]
[[]]
[[]]
[[], [('系统集成系统', 0.83)]]
[[('考试系统', 1.0)], [('管理系统', 0.912)]]
[[]]
[]