实体识别入门代码实战

最近从CV转到了NLP,记录下NLP相关学习知识。

概念

命名实体识别(Named Entity Recognition,NER)是NLP中一项非常基础的任务,NER是信息提取、问答系统、句法分析、机器翻译等众多NLP任务的重要基础工具。

什么是实体,我理解就是在实际的任务中,你希望在句子中获取到的有用词语。在新闻中可能是事件的主体,如人物、地点、机构。在医疗文档里面可能是症状名称、药物名称,商品描述中的品牌词、物品属性词等。

数据标注

命名实体标注即是对一个文本序列中的每一个词(字)打上对应的标签,表示这个词(字)是否为命名实体的一部分。命名实体常见的标注方式分为 BIO、BIOES,其含义如下:

  • BIO: B 即 begin ,表示实体开始的字符,I 即 inside,表示为实体的一部分,O 即 outside,表示不是实体字符。
  • BIOES:在 BIO 的基础上增加 E 和 S 两种标签,E 即 end,表示实体的结束字符,S 即 single,表示单个字符独立构成一个实体。
    当然还有其他的标注形式,这里就不展开了。

标注示例:

文本:
BIOES O O O O O B I I E O O O O O O O O O
BIO O O O O O B I I I O O O O O O O O O

模型

NER 的发展大体经过了以下几个发展阶段:

  1. 早期阶段: 基于规则、基于字典的方法
  2. 统计模型: HMM、CRF
  3. 传统深度学习模型:主要是 LSTM、BILSTM 等深度学习模型 + CRF 模型
  4. 基于预训练模型: BERT + CRF
  5. 学习框架: BERT + MRC
    第 2、3、4 种方法本质上是一样的,都是不断地使用更好的特征提取模型。随着 BERT(或者说 Transformer)开始一统 NLP 领域,现在基本都是基于 BERT 去做 NER,所以后面我们主要介绍第 4、5 种方法。
基于预训练模型
BERT + CRF

这里直接用 bert4keras 的示例代码

#! -*- coding: utf-8 -*-

# 用CRF做中文命名实体识别

# 数据集 http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz

# 实测验证集的F1可以到96.48%,测试集的F1可以到95.38%

import numpy as np

from bert4keras.backend import keras, K

from bert4keras.models import build_transformer_model

from bert4keras.tokenizers import Tokenizer

from bert4keras.optimizers import Adam

from bert4keras.snippets import sequence_padding, DataGenerator

from bert4keras.snippets import open, ViterbiDecoder, to_array

from bert4keras.layers import ConditionalRandomField

from keras.layers import Dense

from keras.models import Model

from tqdm import tqdm

# Training hyper-parameters.
# maxlen is passed to tokenizer.tokenize() when building training batches.
maxlen = 256

epochs = 10

batch_size = 32

# Used to pick which transformer layer's output feeds the classifier head.
bert_layers = 12

learning_rate = 2e-5 # the smaller bert_layers is, the larger the learning rate should be

crf_lr_multiplier = 1000 # enlarge the CRF layer's learning rate when necessary

# Entity labels; filled in by load_data() and later frozen into a sorted list.
categories = set()

# BERT configuration (config file, checkpoint and vocabulary paths)

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'

checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'

dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

def load_data(filename):
    """Load a character-level NER corpus file.

    The file holds one ``<char> <tag>`` pair per line, with samples separated
    by a blank line. A tag starting with ``B`` opens a new entity whose label
    is ``tag[2:]``; a tag starting with ``I`` extends the most recently
    opened entity of the current sample; any other tag is ignored.

    Every label seen on a ``B`` tag is also added to the module-level
    ``categories`` set.

    Args:
        filename: Path of the corpus file; it is read as UTF-8.

    Returns:
        A list of samples. Each sample has the form
        ``[text, [start, end, label], [start, end, label], ...]``, meaning
        that ``text[start:end + 1]`` is an entity of type ``label``.
        Offsets are line indices within the sample, so they match character
        offsets only when every line carries a single character.

    Raises:
        ValueError: If a non-empty line contains no space separator.
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        content = f.read()

    for sample in content.split('\n\n'):
        # Drop empty lines (e.g. from a trailing newline or extra blank
        # lines) so they neither crash the unpacking below nor shift the
        # offsets of the following characters.
        lines = [line for line in sample.split('\n') if line]
        if not lines:
            continue

        chars, entities = [], []
        for i, line in enumerate(lines):
            # Split on the LAST space so a line whose character is itself a
            # space still parses.
            char, flag = line.rsplit(' ', 1)
            chars.append(char)
            if flag.startswith('B'):
                entities.append([i, i, flag[2:]])
                categories.add(flag[2:])
            elif flag.startswith('I') and entities:
                # Only extend when an entity is open in this sample; an
                # orphan I tag is skipped instead of raising.
                entities[-1][1] = i
        D.append([''.join(chars)] + entities)
    return D

# Annotated data: training, validation and test splits, loaded in that order.
train_data, valid_data, test_data = (
    load_data(path) for path in (
        '/root/ner/china-people-daily-ner-corpus/example.train',
        '/root/ner/china-people-daily-ner-corpus/example.dev',
        '/root/ner/china-people-daily-ner-corpus/example.test',
    )
)

# Freeze the collected label set into a sorted list so each label gets a
# stable index.
categories = sorted(categories)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

class data_generator(DataGenerator):
    """Batch generator that turns annotated samples into model inputs.

    Each sample ``[text, (start, end, label), ...]`` is tokenized and its
    entities are converted into a per-token label sequence:

    * ``0``        -- token is outside any entity,
    * ``2 * k + 1`` -- first token of an entity of ``categories[k]``,
    * ``2 * k + 2`` -- subsequent token of an entity of ``categories[k]``.
    """

    def __iter__(self, random=False):
        """Yield ``([token_ids, segment_ids], labels)`` padded batches.

        Args:
            random: Forwarded to ``self.sample`` (presumably enables
                shuffling -- confirm against ``DataGenerator``).
        """
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, d in self.sample(random):
            text, spans = d[0], d[1:]
            tokens = tokenizer.tokenize(text, maxlen=maxlen)
            # rematch presumably gives, per token, the positions it covers
            # in the original text (empty for special tokens) -- confirm
            # against the bert4keras Tokenizer.
            mapping = tokenizer.rematch(text, tokens)
            start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
            end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
            token_ids = tokenizer.tokens_to_ids(tokens)
            segment_ids = [0] * len(token_ids)
            labels = np.zeros(len(token_ids))
            for start, end, label in spans:
                # Entities whose boundaries do not align with a token
                # (e.g. presumably those cut off by maxlen) are skipped.
                if start in start_mapping and end in end_mapping:
                    first = start_mapping[start]
                    last = end_mapping[end]
                    # Look the category index up once per entity.
                    begin_id = categories.index(label) * 2 + 1
                    labels[first] = begin_id
                    labels[first + 1:last + 1] = begin_id + 1
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(labels)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []

# The bare string below is a no-op note kept from the original example (in
# Chinese). It says: the code that follows targets BERT-type models; if you
# use ALBERT, replace the first few lines with the snippet shown inside it.
"""

后面的代码使用的是bert类型的模型,如果你用的是albert,那么前几行请改为:

model = build_transformer_model(

config_path,

checkpoint_path,

model='albert',

)

output_layer = 'Transformer-FeedForward-Norm'

output = model.get_layer(output_layer).get_output_at(bert_layers - 1)

"""

# Build the pretrained transformer from the config and checkpoint paths.
model = build_transformer_model(

config_path,

checkpoint_path,

)

# Name of the layer whose output is used as token features; the index is
# bert_layers - 1.
output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1)

output = model.get_layer(output_layer).output

# One score per label id: 0 for "outside" plus a begin/inside pair for each
# entity category (matches the label encoding used by data_generator).
output = Dense(len(categories) * 2 + 1)(output)

# CRF layer on top of the per-token scores; lr_multiplier scales its
# learning rate relative to the rest of the model.
CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)

output = CRF(output)

# Rebind `model` to the full network: transformer inputs -> CRF output.
model = Model(model.input, output)

model.summary()

model.compile(

loss=CRF.sparse_loss,

optimizer=Adam(learning_rate),

metrics=[CRF.sparse_accuracy]

)

class NamedEntityRecognizer(ViterbiDecoder):
    """Named entity recognizer built on top of Viterbi decoding."""

    def recognize(self, text):
        """Extract the entities found in ``text``.

        Args:
            text: The raw input string.

        Returns:
            A list of ``(start, end, label)`` tuples, where ``start`` and
            ``end`` are taken from the first and last entries that
            ``tokenizer.rematch`` reports for the entity's first and last
            token, and ``label`` is an element of ``categories``.
        """
        tokens = tokenizer.tokenize(text, maxlen=512)
        mapping = tokenizer.rematch(text, tokens)
        token_ids = tokenizer.tokens_to_ids(tokens)
        segment_ids = [0] * len(token_ids)
        token_ids, segment_ids = to_array([token_ids], [segment_ids])
        # Batch of one: keep the scores of the only sequence.
        nodes = model.predict([token_ids, segment_ids])[0]
        labels = self.decode(nodes)

        # Label ids: 0 = outside, odd = begin of an entity, even (> 0) =
        # continuation. `starting` is True while an entity is open.
        entities, starting = [], False
        for i, label in enumerate(labels):
            if label > 0 and label % 2 == 1:
                starting = True
                entities.append([[i], categories[(label - 1) // 2]])
            elif label > 0 and starting:
                entities[-1][0].append(i)
            else:
                # Outside label, or a continuation with no open entity.
                starting = False

        return [
            (mapping[w[0]][0], mapping[w[-1]][-1], category)
            for w, category in entities
        ]

# Shared recognizer instance, seeded with the CRF layer's current transition
# matrix. NOTE(review): starts=[0] / ends=[0] presumably restrict the first
# and last positions to label 0 -- confirm against ViterbiDecoder.
NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])

def evaluate(data):
    """Score the recognizer on annotated samples.

    Args:
        data: Iterable of samples ``[text, (start, end, label), ...]``.

    Returns:
        A ``(f1, precision, recall)`` tuple computed over exact matches of
        ``(start, end, label)`` between predictions and annotations.
    """
    # Counters start at a tiny epsilon so the divisions below never hit zero.
    X, Y, Z = 1e-10, 1e-10, 1e-10
    for d in tqdm(data, ncols=100):
        R = set(NER.recognize(d[0]))
        T = {tuple(i) for i in d[1:]}
        X += len(R & T)  # correctly predicted entities
        Y += len(R)      # predicted entities
        Z += len(T)      # annotated entities
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall

class Evaluator(keras.callbacks.Callback):
    """Callback that evaluates after each epoch and keeps the best weights."""

    def __init__(self):
        # Run the base Callback initializer, which the original skipped.
        super(Evaluator, self).__init__()
        self.best_val_f1 = 0

    def on_epoch_end(self, epoch, logs=None):
        """Refresh the decoder, then report validation and test scores.

        The model weights are written to ``./best_model.weights`` whenever
        the validation F1 is at least as high as the best seen so far.
        """
        # Sync the decoder with the CRF layer's current transition matrix.
        trans = K.eval(CRF.trans)
        NER.trans = trans
        print(NER.trans)
        f1, precision, recall = evaluate(valid_data)
        # Save the best model
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            model.save_weights('./best_model.weights')
        print(
            'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
            (f1, precision, recall, self.best_val_f1)
        )
        f1, precision, recall = evaluate(test_data)
        print(
            'test: f1: %.5f, precision: %.5f, recall: %.5f\n' %
            (f1, precision, recall)
        )

if __name__ == '__main__':
    # Run as a script: train, evaluating after every epoch.
    evaluator = Evaluator()
    train_generator = data_generator(train_data, batch_size)

    model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=epochs,
        callbacks=[evaluator]
    )

else:
    # Imported as a module: restore the saved weights and sync the decoder
    # with the loaded CRF transition matrix.
    model.load_weights('./best_model.weights')
    NER.trans = K.eval(CRF.trans)

你可能感兴趣的:(自然语言处理(nlp),自然语言处理,深度学习,人工智能,nlp)