To extract triple relations from text with pyltp, we first need word segmentation; POS tagging is then performed on top of the segmentation result.
LTP_DIR is the path to the downloaded model folder; the cws.model file inside it is the segmentation model.
Load the segmentation model:
import os
from pyltp import Segmentor
sentence = '我想听一首周杰伦的歌'
LTP_DIR = "./ltp_data_v3.4.0"
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DIR, "cws.model"))
words = list(segmentor.segment(sentence))
segmentor.release()  # release the model
A quick comparison with the jieba segmenter, however, suggests that jieba segments a bit more accurately in some cases:
import os
import jieba
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
sentence = '我想听一首周杰伦的歌'
LTP_DIR = "D:\Downloads\ltp_data_v3.4.0\ltp_data_v3.4.0"
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DIR, "cws.model"))
words = list(segmentor.segment(sentence))
segmentor.release()  # release the model
print(words)
seg_list = jieba.lcut(sentence)
print(seg_list)
pyltp segmentation result -----> ['我', '想', '听', '一', '首', '周杰伦', '的', '歌']
jieba segmentation result -----> ['我', '想', '听', '一首', '周杰伦', '的', '歌']
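If pyltp's default segmentation is too fine-grained for your data, one common remedy is to load the model together with a user dictionary via load_with_lexicon (documented in the pyltp API). A minimal sketch, assuming a hypothetical lexicon.txt with one word per line (e.g. 一首) and reusing LTP_DIR and sentence from above:
segmentor = Segmentor()
# load cws.model together with a user dictionary; lexicon.txt is a plain-text file, one word per line
segmentor.load_with_lexicon(os.path.join(LTP_DIR, "cws.model"), "lexicon.txt")
words = list(segmentor.segment(sentence))
print(words)
segmentor.release()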
POS tagging is performed on top of the already segmented text.
Pass the segmented word list into the POS tagging model to tag each word:
postagger = Postagger()
postagger.load(os.path.join(LTP_DIR,"pos.model"))
postags = list(postagger.postag(words))
postagger.release()
for x, y in zip(words, postags):
    print(x, y)
Dependency parsing builds on segmentation and POS tagging.
It determines the grammatical structure of a sentence by analyzing the dependency relations between its components: it identifies grammatical elements such as subject, predicate and object, as well as attributives, adverbials and complements, and analyzes the relations between them.
Dependency parsing uses 14 relation labels; a quick reference follows (see the LTP appendix linked in the comments further down for the authoritative table):
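The mapping below is only a convenience for reading the parse output; it is transcribed from the LTP appendix (the English glosses are mine) and is not used by any of the code in this post:
# The 14 dependency relations used by LTP 3.4
DEP_RELATIONS = {
    'SBV': 'subject-verb',           # 主谓关系
    'VOB': 'verb-object',            # 动宾关系
    'IOB': 'indirect object',        # 间宾关系
    'FOB': 'fronting object',        # 前置宾语
    'DBL': 'double',                 # 兼语
    'ATT': 'attribute',              # 定中关系
    'ADV': 'adverbial',              # 状中结构
    'CMP': 'complement',             # 动补结构
    'COO': 'coordinate',             # 并列关系
    'POB': 'preposition-object',     # 介宾关系
    'LAD': 'left adjunct',           # 左附加关系
    'RAD': 'right adjunct',          # 右附加关系
    'IS':  'independent structure',  # 独立结构
    'HED': 'head',                   # 核心关系
}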
parser = Parser()
parser.load(os.path.join(LTP_DIR, "parser.model"))
arcs = parser.parse(words, postags)
for i, (word, arc) in enumerate(zip(words, arcs), start=1):
    print(str(i) + '/' + word + '/' + str(arc.head) + '/' + str(arc.relation))
parser.release()
1/我/2/SBV
2/想/0/HED
3/听/2/VOB
4/一/5/ATT
5/首/8/ATT
6/周杰伦/8/ATT
7/的/6/RAD
8/歌/3/VOB
arc.head is the index of the parent node of the dependency arc, and arc.relation is the relation on that arc.
The sentence root (Root) has index 0; the words are numbered from 1, starting at '我', through to the end of the sentence.
In the output we can see that '想' carries the head relation (HED) of the whole sentence, so its parent is index 0.
'我' and '想' are in an SBV (subject-verb) relation, '想' is the head, '听' and '想' are in a VOB (verb-object) relation, and so on.
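To make the arc output easier to read, each head index can be resolved back to its word. A minimal sketch, reusing the words and arcs variables from the parser snippet above (run it before parser.release() so the arcs are still valid):
# map each arc's head index back to the head word ('Root' for index 0)
for word, arc in zip(words, arcs):
    head_word = 'Root' if arc.head == 0 else words[arc.head - 1]
    print(word, arc.relation, head_word)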
labeller = SementicRoleLabeller()  # initialize the labeller
labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))
roles = labeller.label(words, postags, arcs)  # semantic role labelling
for role in roles:
    print(role.index, "".join(["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
labeller.release()
Note that the pisrl_win.model file has to be downloaded separately from the official site; the default model package does not include it and only ships pisrl.model.
role.index is the index of the predicate, and role.arguments are the semantic roles attached to that predicate.
arg.name is the type of the semantic role, arg.range.start is the index of the first word of the role, and arg.range.end is the index of its last word.
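To see which words each role actually covers, the index ranges can be mapped back to the words. A minimal sketch, reusing words and roles from the snippet above (run it before labeller.release() so the role objects are still valid):
# for every predicate, print each role's type and the words it spans
for role in roles:
    predicate = words[role.index]
    for arg in role.arguments:
        span = ''.join(words[arg.range.start:arg.range.end + 1])
        print(predicate, arg.name, span)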
# For every word, collect its dependent children grouped by dependency relation,
# then build format_parse_list: [relation, word, index, postag, head_word, head_index, head_postag]
child_dict_list = []
format_parse_list = []
for index in range(len(words)):
    child_dict = dict()
    for arc_index in range(len(arcs)):
        # arc.head is the 1-based index of the arc's parent (the ROOT node has index 0, the first word index 1, and so on);
        # arc.relation is the dependency relation on the arc
        if arcs[arc_index].head == index + 1:
            if arcs[arc_index].relation in child_dict:
                child_dict[arcs[arc_index].relation].append(arc_index)  # append to an existing relation
            else:
                child_dict[arcs[arc_index].relation] = []  # first child with this relation
                child_dict[arcs[arc_index].relation].append(arc_index)
    child_dict_list.append(child_dict)  # children of each word, grouped by relation
rely_id = [arc.head for arc in arcs]       # parent index of each word
relation = [arc.relation for arc in arcs]  # dependency relation of each word
heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # parent word of each word
for i in range(len(words)):
    a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1]]
    format_parse_list.append(a)
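As a quick sanity check of these two structures, the sketch below (only an illustration, reusing the variables just built) locates the head verb via format_parse_list and reads off its subject and object children from child_dict_list:
# find the sentence head (HED) and look up its SBV/VOB children, if any
for i in range(len(words)):
    if format_parse_list[i][0] == 'HED':
        children = child_dict_list[i]
        subjects = [words[j] for j in children.get('SBV', [])]
        objects = [words[j] for j in children.get('VOB', [])]
        print(words[i], 'SBV:', subjects, 'VOB:', objects)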
sentence_parser.py
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
# pip install pyltp -i https://pypi.tuna.tsinghua.edu.cn/simple  (you can also download the .whl file first)
# LTP platform: http://ltp.ai/index.html
# The toolkit we use, pyltp: https://pyltp.readthedocs.io/zh_CN/latest/api.html
# LTP appendix: https://ltp.readthedocs.io/zh_CN/latest/appendix.html#id3
# Installation: https://github.com/HIT-SCIR/pyltp
class LtpParser:
    def __init__(self):
        LTP_DIR = "./ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))

    '''semantic role labelling'''
    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {arg.name: [arg.name, arg.range.start, arg.range.end] for arg in role.arguments}
        return roles_dict

    '''dependency parsing --- keep, for every word in the sentence, a dict of its dependent children'''
    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                # arc.head is the 1-based index of the arc's parent (the ROOT node has index 0);
                # arc.relation is the dependency relation on the arc
                if arcs[arc_index].head == index + 1:
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)  # children of each word, grouped by relation
        rely_id = [arc.head for arc in arcs]       # parent index of each word
        relation = [arc.relation for arc in arcs]  # dependency relation of each word
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # parent word of each word
        for i in range(len(words)):
            a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1]]
            format_parse_list.append(a)
        return child_dict_list, format_parse_list

    '''main parser entry point'''
    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list


if __name__ == '__main__':
    parse = LtpParser()
    #sentence = '我想听一首迪哥的歌'
    sentence = '奥巴马昨晚在白宫发表了演说'
    words, postags, child_dict_list, roles_dict, format_parse_list = parse.parser_main(sentence)
    print(words, len(words))
    print(postags, len(postags))
    print(child_dict_list, len(child_dict_list))
    print(roles_dict)
    print(format_parse_list, len(format_parse_list))
from sentence_parser import *
import re
# LTP platform: http://ltp.ai/index.html
# The toolkit we use, pyltp: https://pyltp.readthedocs.io/zh_CN/latest/api.html
# LTP appendix: https://ltp.readthedocs.io/zh_CN/latest/appendix.html#id3
# Installation: https://github.com/HIT-SCIR/pyltp
class TripleExtractor:
    def __init__(self):
        self.parser = LtpParser()

    '''split the text into sentences; colons, semicolons, exclamation marks, etc. act as delimiters'''
    def split_sents(self, content):
        return [sentence for sentence in re.split(r'[??!!。;;::\n\r]', content) if sentence]

    '''use semantic role labelling to extract subject-verb-object triples directly, based on A0 and A1'''
    def ruler1(self, words, postags, roles_dict, role_index):
        v = words[role_index]
        role_info = roles_dict[role_index]
        if 'A0' in role_info.keys() and 'A1' in role_info.keys():
            s = ''.join([words[word_index] for word_index in range(role_info['A0'][1], role_info['A0'][2] + 1) if
                         postags[word_index][0] not in ['w', 'u', 'x'] and words[word_index]])
            o = ''.join([words[word_index] for word_index in range(role_info['A1'][1], role_info['A1'][2] + 1) if
                         postags[word_index][0] not in ['w', 'u', 'x'] and words[word_index]])
            if s and o:
                return '1', [s, v, o]
        return '4', []

    '''main triple-extraction rules'''
    def ruler2(self, words, postags, child_dict_list, arcs, roles_dict):
        svos = []
        for index in range(len(postags)):
            tmp = 1
            # first try to extract a triple from the semantic role labelling result
            if index in roles_dict:
                flag, triple = self.ruler1(words, postags, roles_dict, index)
                if flag == '1':
                    svos.append(triple)
                    tmp = 0
            if tmp == 1:
                # if no semantic roles were found, fall back to the dependency parse
                # if postags[index] == 'v':
                if postags[index]:
                    # extract fact triples centred on the predicate
                    child_dict = child_dict_list[index]
                    # subject-verb-object
                    if 'SBV' in child_dict and 'VOB' in child_dict:
                        r = words[index]
                        e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                        svos.append([e1, r, e2])
                    # postposed attributive with a verb-object relation
                    relation = arcs[index][0]
                    head = arcs[index][2]
                    if relation == 'ATT':
                        if 'VOB' in child_dict:
                            e1 = self.complete_e(words, postags, child_dict_list, head - 1)
                            r = words[index]
                            e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                            temp_string = r + e2
                            if temp_string == e1[:len(temp_string)]:
                                e1 = e1[len(temp_string):]
                            if temp_string not in e1:
                                svos.append([e1, r, e2])
                    # subject-verb-complement that contains a preposition-object
                    if 'SBV' in child_dict and 'CMP' in child_dict:
                        e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                        cmp_index = child_dict['CMP'][0]
                        r = words[index] + words[cmp_index]
                        if 'POB' in child_dict_list[cmp_index]:
                            e2 = self.complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0])
                            svos.append([e1, r, e2])
        return svos

    '''expand the extracted subject or object'''
    def complete_e(self, words, postags, child_dict_list, word_index):
        child_dict = child_dict_list[word_index]
        prefix = ''
        if 'ATT' in child_dict:
            for i in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i])
        postfix = ''
        if postags[word_index] == 'v':
            if 'VOB' in child_dict:
                postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix
        return prefix + words[word_index] + postfix

    '''main entry point'''
    def triples_main(self, content):
        sentences = self.split_sents(content)
        svos = []
        for sentence in sentences:
            words, postags, child_dict_list, roles_dict, arcs = self.parser.parser_main(sentence)
            svo = self.ruler2(words, postags, child_dict_list, arcs, roles_dict)
            svos += svo
        return svos


'''test'''
def test():
    content5 = '我购买了一件玩具,孩子非常喜欢这个玩具,但是质量不太好。希望商家能够保障商品质量,不要再出现类似问题。'
    extractor = TripleExtractor()
    svos = extractor.triples_main(content5)
    print('svos', svos)

test()