【pyltp】Knowledge extraction with NLP in Python

Prerequisite: for how to install pyltp, see https://blog.csdn.net/oqqHun123/article/details/86767537

 

Topic 1: knowledge extraction with NLP in Python

Goal: extract the triples contained in an article, e.g. 中国 首都 北京 (China / capital / Beijing).

Approach from an initial survey: use NER to pull the entities out of an article, use pyltp for relation classification, and process the article sentence by sentence after splitting it. The code below walks through exactly this pipeline.

Recent ACL papers are a good source of further ideas.

 

# -*- coding: utf-8 -*-
from pyltp import SentenceSplitter
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import NamedEntityRecognizer
from pyltp import Parser


def sentence_splitter(sentence):
    """
    Sentence splitting: break a piece of text into individual sentences.
    :param sentence: a text containing several sentences
    :return: prints each sentence on its own line
    """
    single_sentence = SentenceSplitter.split(sentence)  # split into sentences
    print('\n'.join(single_sentence))
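
A quick check of the splitter; the sample text is made up for illustration:

text = '中国的首都是北京。北京是一座历史悠久的城市。'
sentence_splitter(text)
# expected output, one sentence per line:
# 中国的首都是北京。
# 北京是一座历史悠久的城市。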


def word_splitter(sentence):
    """
    Word segmentation.
    :param sentence: a raw sentence string
    :return: the segmented words as a list
    """
    segmentor = Segmentor()  # initialize the segmentor
    segmentor.load('D:/Program Files/ltp-models/3.3.1/ltp-data-v3.3.1/ltp_data/cws.model')  # load the model
    words = segmentor.segment(sentence)  # segment the sentence
    # the raw result can be printed directly:
    # print('\t'.join(words))
    # or converted to a list:
    words_list = list(words)
    # for word in words_list:
    #     print(word)
    segmentor.release()  # release the model
    return words_list


def word_tag(words):
    """
    Part-of-speech tagging.
    :param words: the already segmented words
    :return: one POS tag per word
    """
    postagger = Postagger()  # initialize the tagger
    postagger.load('D:/Program Files/ltp-models/3.3.1/ltp-data-v3.3.1/ltp_data/pos.model')  # load the model
    postags = postagger.postag(words)  # POS tagging
    # for word, tag in zip(words, postags):
    #     print(word + '/' + tag)
    postagger.release()  # release the model
    return postags
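
Segmentation and tagging chain together. A minimal sketch, assuming the model paths above are valid on your machine; the sentence and the expected tags are illustrative:

words = word_splitter('中国的首都是北京。')  # e.g. ['中国', '的', '首都', '是', '北京', '。']
postags = word_tag(words)
for word, tag in zip(words, postags):
    print(word + '/' + tag)  # e.g. 中国/ns 的/u 首都/n 是/v 北京/ns 。/wp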


def name_recognition(words, postags):
    """
    Named entity recognition.
    :param words: segmented words
    :param postags: the POS tags for those words
    :return: one NE tag per word
    """
    recognizer = NamedEntityRecognizer()  # initialize the recognizer
    recognizer.load('D:/Program Files/ltp-models/3.3.1/ltp-data-v3.3.1/ltp_data/ner.model')  # load the model
    netags = recognizer.recognize(words, postags)  # named entity recognition

    # Place names carry the entity type Ns (full tags look like S-Ns, B-Ns, I-Ns, E-Ns).
    # The original loop body was cut off here; collecting the place names is an
    # assumed completion.
    result = ''
    for i in range(len(netags)):
        if 'Ns' in netags[i]:
            result += words[i]
    print('place names: ' + result)
    recognizer.release()  # release the model
    return netags
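
The Parser imported at the top is what actually turns entities into triples: LTP's dependency parser labels subject-verb (SBV) and verb-object (VOB) arcs, so a (subject, predicate, object) triple can be read off whenever a verb has both. A minimal sketch follows; it assumes a parser.model file sits next to the other models, and the SBV/VOB rule is a deliberately simple heuristic, not the only way to extract triples:

def extract_triples(sentence):
    """
    Sketch of a (subject, predicate, object) extractor built on the
    dependency parser. parser.model is assumed to live alongside the
    other models; the SBV/VOB rule is a simplification.
    """
    words = word_splitter(sentence)
    postags = word_tag(words)

    parser = Parser()  # initialize the parser
    parser.load('D:/Program Files/ltp-models/3.3.1/ltp-data-v3.3.1/ltp_data/parser.model')  # load the model
    arcs = parser.parse(words, postags)  # arc.head is 1-based; 0 is the virtual root
    parser.release()  # release the model

    triples = []
    for i, arc in enumerate(arcs):
        if arc.relation == 'SBV':  # word i is the subject of its head verb
            verb = arc.head - 1    # index of the governing verb
            for j, other in enumerate(arcs):
                if other.relation == 'VOB' and other.head - 1 == verb:
                    triples.append((words[i], words[verb], words[j]))
    return triples


if __name__ == '__main__':
    for triple in extract_triples('中国的首都是北京。'):
        print(triple)  # something along the lines of ('首都', '是', '北京')

For a whole article, split it with sentence_splitter first and run extract_triples on each sentence, which matches the sentence-by-sentence approach noted in the survey above.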

 

 
