基于spacy的中文命名实体识别

找了大半天,在GIT、CSDN上都没找到一篇完整介绍用spacy做NER的项目。这么棒的工业级框架,竟然没有详细的介绍。经本人用医疗数据初步测试,标注1000条数据,测试集F1值竟然可以达到90%。附官网链接:https://spacy.io/

1、spacy版本号2.3.2

2、训练数据格式

TRAIN_DATA = [ ("TEXT", {'entities': [(START_index, END_index, 'LABEL'), (START_index, END_index, 'LABEL')]})]

3、训练模块

# Train a spaCy 2.x NER component from scratch on TRAIN_DATA and save the
# resulting pipeline to ./test_model.
import random

import spacy

nlp = spacy.blank('zh')  # use 'en' for English

# Create the NER component when it is missing; otherwise fetch the existing
# one so that `ner` is always bound before labels are registered below.
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

# Register every entity label that occurs in the training data.
for _, annotations in TRAIN_DATA:
    # Default to an empty list so an example without 'entities' is skipped
    # instead of raising TypeError.
    for ent in annotations.get('entities', []):
        ner.add_label(ent[2])  # ent is (start, end, label)

# Disable every other pipe so that only the NER weights are updated.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(10):
        print("Starting iteration " + str(itn))
        # Shuffles TRAIN_DATA in place, so each epoch sees a different order.
        random.shuffle(TRAIN_DATA)
        losses = {}  # accumulated by nlp.update over this epoch
        for text, annotations in TRAIN_DATA:
            nlp.update(
                [text],  # batch of texts
                [annotations],  # batch of annotations
                drop=0.2,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print(losses)

# Persist the trained pipeline.
nlp.to_disk('./test_model')

4、测试模块

# Evaluate the trained pipeline: for every example, dump the predicted
# entities to resume<N>.txt and print token-level accuracy / precision /
# recall / F-score for each predicted entity label.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from spacy.gold import GoldParse  # spaCy 2.x location of GoldParse

# NOTE(review): this evaluates on TRAIN_DATA itself; substitute a held-out
# set here to obtain a meaningful test score.
examples = TRAIN_DATA  # evaluation data

for c, (text, annot) in enumerate(examples):
    doc_to_test = nlp(text)

    # Group the predicted entity texts by label.
    texts_by_label = {}
    for ent in doc_to_test.ents:
        texts_by_label.setdefault(ent.label_, []).append(ent.text)

    # The context manager guarantees the file is closed; UTF-8 is explicit
    # because the entity texts are Chinese.
    with open("resume" + str(c) + ".txt", "w", encoding="utf-8") as out_file:
        for label, texts in texts_by_label.items():
            out_file.write("\n\n")
            out_file.write(label + ":" + "\n")
            # dict.fromkeys de-duplicates while keeping first-seen order.
            for entity_text in dict.fromkeys(texts):
                out_file.write(entity_text.replace('\n', '') + "\n")

    # The gold annotation depends only on the example, so build it once
    # rather than once per predicted entity.
    gold = GoldParse(nlp.make_doc(text), entities=annot.get("entities"))

    # label -> (precision, recall, f-score, accuracy), one entry per label
    # that the model predicted in this document.
    d = {}
    for label in texts_by_label:
        # Reduce both sequences to a binary "label" / "Not label" tagging
        # per token; gold.ner holds one tag string per token and the label
        # is matched as a substring of it.
        y_true = [label if label in x else 'Not ' + label for x in gold.ner]
        y_pred = [x.ent_type_ if x.ent_type_ == label else 'Not ' + label
                  for x in doc_to_test]
        # `fscore` rather than `f`, so no file-handle name is shadowed.
        p, r, fscore, _ = precision_recall_fscore_support(
            y_true, y_pred, average='weighted')
        a = accuracy_score(y_true, y_pred)
        d[label] = (p, r, fscore, a)

    for i in d:
        p, r, fscore, a = d[i]
        print("\n For Entity " + i + "\n")
        print("Accuracy : " + str(a * 100) + "%")
        print("Precision : " + str(p))
        print("Recall : " + str(r))
        print("F-score : " + str(fscore))

5、模型加载调用
# Reload the pipeline saved by the training step and run it on one sentence.
nlp1 = spacy.load("./test_model")
text = "测试句子"
doc = nlp1(text)

# Print each recognised entity as "<text> <label>".
for entity in doc.ents:
    print(entity.text, entity.label_)

你可能感兴趣的:(自然语言处理)