./data/city.txt
北京
上海
广州
深圳
海西
海西蒙古族藏族自治州
./data/scene.txt
故宫
长城
圆明园
外滩
白云山
欢乐谷
input:
我想去北京的圆明园玩
output:
[{'start': 3, 'end': 5, 'value': '北京', 'entity': 'city', 'confidence': 1}, {'start': 6, 'end': 9, 'value': '圆明园', 'entity': 'scene', 'confidence': 1}]
# -*- coding: utf-8 -*-
# @Author : XerCis
# @Time : 2020/5/15 17:00
# @Function: 绝对匹配提取实体词
import os
def read(file_path):
    """Lazily yield the dictionary words stored in a text file.

    The file is consumed line by line, so only one line is held in memory
    at a time.

    Args:
        file_path: Path of a UTF-8 text file holding one word per line.

    Yields:
        Each non-blank line with surrounding whitespace (including the
        trailing newline) removed.
    """
    with open(file_path, mode="r", encoding="utf-8") as f:
        for line in f:
            word = line.strip()
            # A blank line would yield "", and str.find("") is always 0, so
            # a caller matching words against a message would get a bogus
            # zero-length hit. Skip blanks instead.
            if word:
                yield word
def extract(message, dictionary_path):
    """Extract entities from a message by exact dictionary matching.

    Every ``*.txt`` file directly inside ``dictionary_path`` is treated as a
    dictionary: the file name without its extension is the entity type and
    each line is one word of that type. The files are streamed through
    ``read`` on every call, which keeps memory use small at the cost of
    re-reading them each time.

    Args:
        message: Text to search.
        dictionary_path: Directory containing the dictionary files.

    Returns:
        A list with one dict per dictionary word found in ``message``. Each
        dict has the keys ``start`` and ``end`` (half-open character span of
        the word's first occurrence), ``value`` (the word), ``entity`` (the
        entity type) and ``confidence`` (always 1).
    """
    entities = []
    # os.listdir returns names in arbitrary order; sort for a stable result.
    for file_name in sorted(os.listdir(dictionary_path)):
        if not file_name.endswith(".txt"):
            continue
        entity = os.path.splitext(file_name)[0]  # file name is the entity type
        for word in read(os.path.join(dictionary_path, file_name)):
            if not word:
                # "" is found at index 0 of any message; never a real entity.
                continue
            start = message.find(word)  # only the first occurrence is reported
            if start != -1:
                entities.append({
                    "start": start,
                    "end": start + len(word),
                    "value": word,
                    "entity": entity,
                    "confidence": 1,
                })
    return entities
if __name__ == "__main__":
    # Demo: match every dictionary under ./data against a sample sentence.
    sample = "我想去北京的圆明园玩"
    print(extract(sample, dictionary_path="./data"))
优点:逐行读取文件,内存消耗极小
缺点:每次提取都要重新遍历并读取全部词典文件,运行效率低
# -*- coding: utf-8 -*-
# @Author : XerCis
# @Time : 2020/5/15 17:00
# @Function: 绝对匹配提取实体词
import os
def read(dictionary_path):
    """Load every dictionary file of a directory into memory.

    Args:
        dictionary_path: Directory whose ``*.txt`` files each hold one word
            per line (UTF-8). Other files are ignored.

    Returns:
        A dict mapping entity type (the file name without ``.txt``) to the
        list of words read from that file, in file order. Surrounding
        whitespace is stripped from each word and blank lines are dropped.
    """
    data = {}
    # os.listdir returns names in arbitrary order; sort so the dict (and
    # anything derived from iterating it) is ordered deterministically.
    for file_name in sorted(os.listdir(dictionary_path)):
        if not file_name.endswith(".txt"):
            continue
        entity = os.path.splitext(file_name)[0]  # file name is the entity type
        full_path = os.path.join(dictionary_path, file_name)
        with open(full_path, mode="r", encoding="utf-8") as f:
            stripped = (line.strip() for line in f.read().splitlines())
            # Drop blanks: an empty word would match any message at index 0.
            data[entity] = [word for word in stripped if word]
    return data
def extract(data, message):
    """Extract entities from a message by exact dictionary matching.

    Args:
        data: Mapping of entity type to an iterable of words of that type.
        message: Text to search.

    Returns:
        A list with one dict per dictionary word found in ``message``. Each
        dict has the keys ``start`` and ``end`` (half-open character span of
        the word's first occurrence), ``value`` (the word), ``entity`` (the
        entity type) and ``confidence`` (always 1).
    """
    entities = []
    for entity, words in data.items():
        for word in words:
            if not word:
                # "" is found at index 0 of any message; never a real entity.
                continue
            start = message.find(word)  # only the first occurrence is reported
            if start == -1:
                continue
            entities.append({
                "start": start,
                "end": start + len(word),
                "value": word,
                "entity": entity,
                "confidence": 1,
            })
    return entities
if __name__ == "__main__":
    # Load the dictionaries once, then match against the in-memory copy.
    data = read("./data")
    sample = "我想去北京的圆明园玩"
    print(extract(data, message=sample))
# -*- coding: utf-8 -*-
# @Author : XerCis
# @Time : 2020/5/15 17:00
# @Function: 绝对匹配提取实体词
import os
from itertools import combinations
def read(dictionary_path):
    """Load every dictionary file of a directory into memory.

    Args:
        dictionary_path: Directory whose ``*.txt`` files each hold one word
            per line (UTF-8). Other files are ignored.

    Returns:
        A dict mapping entity type (the file name without ``.txt``) to the
        list of words read from that file, in file order. Surrounding
        whitespace is stripped from each word and blank lines are dropped.
    """
    data = {}
    # os.listdir returns names in arbitrary order; sort so the dict (and
    # anything derived from iterating it) is ordered deterministically.
    for file_name in sorted(os.listdir(dictionary_path)):
        if not file_name.endswith(".txt"):
            continue
        entity = os.path.splitext(file_name)[0]  # file name is the entity type
        full_path = os.path.join(dictionary_path, file_name)
        with open(full_path, mode="r", encoding="utf-8") as f:
            stripped = (line.strip() for line in f.read().splitlines())
            # Drop blanks: an empty word is a substring of every other word
            # and would match any message at index 0.
            data[entity] = [word for word in stripped if word]
    return data
def extract(data, message, take_long=False, take_short=False):
    """Extract entities by exact dictionary matching, optionally pruning nested words.

    Two matched words are considered nested when the text of one contains
    the text of the other; their positions in ``message`` are not compared.

    Args:
        data: Mapping of entity type to an iterable of words of that type.
        message: Text to search.
        take_long: If true, of every nested pair keep only the longer word.
        take_short: If true, of every nested pair keep only the shorter word.

    Returns:
        A list with one dict per retained match. Each dict has the keys
        ``start`` and ``end`` (half-open character span of the word's first
        occurrence), ``value`` (the word), ``entity`` (the entity type) and
        ``confidence`` (always 1).

    Raises:
        ValueError: If both ``take_long`` and ``take_short`` are true.
    """
    if take_long and take_short:
        raise ValueError("take_long and take_short can not be both True")

    entities = []
    for entity, words in data.items():
        for word in words:
            if not word:
                # "" is found at index 0 of any message and is a substring of
                # every word, so with take_short it would evict all real
                # matches. It is never a real entity.
                continue
            start = message.find(word)  # only the first occurrence is reported
            if start != -1:
                entities.append({
                    "start": start,
                    "end": start + len(word),
                    "value": word,
                    "entity": entity,
                    "confidence": 1,
                })

    if not (take_long or take_short):
        # No pruning requested: skip the quadratic pair scan entirely.
        return entities

    dropped = set()  # indexes into ``entities`` of matches to discard
    for (i, first), (j, second) in combinations(enumerate(entities), 2):
        v0, v1 = first["value"], second["value"]
        if v0 in v1 or v1 in v0:
            # When the two values are identical, the later match is treated
            # as the "long" one and the earlier as the "short" one.
            longer, shorter = (i, j) if len(v0) > len(v1) else (j, i)
            dropped.add(shorter if take_long else longer)
    return [match for k, match in enumerate(entities) if k not in dropped]
if __name__ == "__main__":
    # Demo: the same sentence pruned first towards the longer word, then
    # towards the shorter one.
    data = read("./data")
    sample = "海西全称为海西蒙古族藏族自治州"
    for flags in ({"take_long": True}, {"take_short": True}):
        print(extract(data, message=sample, **flags))
# Expected output:
# [{'start': 5, 'end': 15, 'value': '海西蒙古族藏族自治州', 'entity': 'city', 'confidence': 1}]
# [{'start': 0, 'end': 2, 'value': '海西', 'entity': 'city', 'confidence': 1}]