# 实体识别 (CoreNLP) — named-entity recognition with Stanford CoreNLP

# coding=utf-8
import json
import logging
from stanfordcorenlp import StanfordCoreNLP

# nlp_entity = StanfordCoreNLP(r'stanford-corenlp-full-2018-02-27', lang='zh', quiet=False, logging_level=logging.DEBUG)


def combineByNer(nerType, tokens):
    """Merge consecutive tokens carrying the same NER tag into mentions.

    Args:
        nerType: The NER label to look for (compared against ``token['ner']``).
        tokens: Sequence of token dicts; each is read for the keys ``'ner'``,
            ``'word'``, ``'characterOffsetBegin'`` and ``'characterOffsetEnd'``.

    Returns:
        A list with one dict per maximal run of adjacent tokens tagged
        ``nerType``. Each dict has ``'word'`` (the run's words concatenated
        with no separator), ``'ner'`` (``nerType``), ``'begin'`` (start offset
        of the first token) and ``'end'`` (end offset of the last token).
    """
    mentions = []
    total = len(tokens)
    cursor = 0
    while cursor < total:
        if tokens[cursor]['ner'] != nerType:
            cursor += 1
            continue
        # cursor sits on the first token of a run; advance `stop` to one past
        # the last token that still carries the requested tag.
        stop = cursor + 1
        while stop < total and tokens[stop]['ner'] == nerType:
            stop += 1
        pieces = [tokens[k]['word'] for k in range(cursor, stop)]
        mentions.append({
            'word': "".join(pieces),
            'ner': nerType,
            'begin': tokens[cursor]['characterOffsetBegin'],
            'end': tokens[stop - 1]['characterOffsetEnd'],
        })
        # Resume scanning right after the run just consumed.
        cursor = stop
    return mentions


def getNerCN(text, nlp_entity, nerType="PERSON,ORGANIZATION,DATE,TIME,LOCATION"):
    """Annotate ``text`` and return the requested named entities as JSON.

    Args:
        text: The text handed to the annotator.
        nlp_entity: Client object exposing ``annotate(text, properties=...)``
            whose return value is a JSON string (in this file's usage, a
            ``StanfordCoreNLP`` instance).
        nerType: Comma-separated NER labels to extract. ``LOCATION`` is
            expanded to ``CITY``, ``STATE_OR_PROVINCE`` and ``COUNTRY`` before
            matching. Whitespace around entries is ignored, and empty or
            repeated entries are dropped.

    Returns:
        A JSON string (``indent=4``, non-ASCII kept verbatim) encoding a list
        of ``{"word", "ner", "begin", "end"}`` objects, ordered sentence by
        sentence and, within a sentence, by the order of the requested labels.
        Returns ``''`` when the annotator's response decodes to JSON ``null``.

    Raises:
        ValueError: If the annotator's response is not valid JSON
            (propagated from ``json.loads``).
        KeyError: If the decoded response lacks ``'sentences'`` or a sentence
            lacks ``'tokens'``.
    """
    location_subtypes = ('CITY', 'STATE_OR_PROVINCE', 'COUNTRY')

    # Normalise the requested labels: trim whitespace, expand LOCATION on an
    # exact match (not a substring replace), and keep each label only once so
    # the same entity is never emitted twice.
    # NOTE(review): presumably the zh model tags places with the finer-grained
    # labels rather than LOCATION — confirm against the CoreNLP model in use.
    wanted = []
    for raw in nerType.split(','):
        label = raw.strip()
        expanded = location_subtypes if label == 'LOCATION' else (label,)
        for tag in expanded:
            if tag and tag not in wanted:
                wanted.append(tag)

    props = {'annotators': 'tokenize,pos,ner', 'pipelineLanguage': 'zh', 'outputFormat': 'json'}
    resp = nlp_entity.annotate(text, properties=props)
    # strict=False lets control characters inside JSON strings through.
    jsonObj = json.loads(resp, strict=False)
    if jsonObj is None:
        return ''

    results = []
    for sentence in jsonObj['sentences']:
        tokens = sentence['tokens']
        for tag in wanted:
            results.extend(combineByNer(tag, tokens))

    # Fold the fine-grained place labels back into LOCATION for the caller.
    # NOTE: this also relabels entities when CITY/STATE_OR_PROVINCE/COUNTRY
    # were requested explicitly; kept for backward compatibility.
    for result in results:
        if result["ner"] in location_subtypes:
            result["ner"] = "LOCATION"
    return json.dumps(results, ensure_ascii=False, indent=4)


# print(getNerCN(text="密西西比州在美国,北京在中国。", nlp_entity=nlp_entity))

# 你可能感兴趣的:(nlp)