BERT in Practice: Chinese Named Entity Recognition

A 10-label NER task implemented with BERT.

  • GitHub notebook: nlp-code/bert命名实体识别.ipynb at main · cshmzin/nlp-code (github.com)
  • BERT introduction blog: Simple to Bert | Ripshun Blog
  • Dataset source: CLUE website (fine-grained NER task)

Load the data:

# Load the data from the CLUE JSONL files (one JSON record per line)
import json

def load_jsonl(path):
    with open(path, 'r', encoding='UTF-8') as f:
        return [json.loads(line) for line in f]

train_data = load_jsonl('train.json')
dev_data = load_jsonl('dev.json')
test_data = load_jsonl('test.json')

print(f'sizes: train:{len(train_data)}, dev:{len(dev_data)}, test:{len(test_data)}')
print(train_data[0])
print(dev_data[0])
print(test_data[0])
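For reference, each record is a JSON object with a text string and a nested label dict mapping entity type → entity mention → a list of [start, end] character spans. The record below is illustrative (made-up text and spans), not an actual example from the dataset:

# Illustrative record in the CLUE fine-grained NER format (hypothetical spans)
example = {
    'text': '普京是俄罗斯的总统',
    'label': {
        'name': {'普京': [[0, 1]]},      # character span 0-1, inclusive
        'address': {'俄罗斯': [[3, 5]]}  # character span 3-5, inclusive
    }
}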

Label processing:

Build a label dictionary that maps each entity type to an integer id, then convert each record's nested label dict into a per-character label sequence:

# Label categories for the CLUE fine-grained NER task
# (0 is reserved for 'o', i.e. non-entity characters)
import re

label_type = {'o':0,'address':1,'book':2,'company':3,'game':4,'government':5,'movie':6,'name':7,'organization':8,'position':9,'scene':10}

def decode_label(d):
  # Convert the nested label dict into a per-character label list
  text_len = len(d['text'])
  label = [0]*text_len
  types = d['label'].keys()
  for t in types:
    values = d['label'][t].values()
    # flatten the list of [start, end] spans for this entity type
    si = [v for value in values for v in value]
    for i in si:
      # spans are inclusive on both ends
      for j in range(i[0],i[1]+1):
        label[j] = label_type[t]
  return label


def transform_data(data, mode):
  # Replace digits with '&' so multi-digit numbers are not merged into one token
  # and the character/label alignment is preserved
  data_texts = [re.sub(r'\d', '&', d['text']) for d in data]

  if mode == 'train':
    data_labels = [decode_label(d) for d in data]
    return (data_texts, data_labels)
  else:
    return data_texts

train_texts, train_labels = transform_data(train_data, 'train')
dev_texts, dev_labels = transform_data(dev_data, 'train')
test_texts = transform_data(test_data, 'test')
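As a quick check of the span parsing, decode_label on a small hypothetical record (the same made-up example as above) behaves as expected:

# Hypothetical record: '普京' is a name (label 7), '俄罗斯' an address (label 1)
demo = {'text': '普京是俄罗斯的总统',
        'label': {'name': {'普京': [[0, 1]]}, 'address': {'俄罗斯': [[3, 5]]}}}
print(decode_label(demo))
# expected: [7, 7, 0, 1, 1, 1, 0, 0, 0]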

Data processing:

Use BERT's tokenizer to convert each text into a sequence of token ids.

! pip install transformers
from transformers import BertTokenizer
from IPython.display import clear_output

# Use BERT's tokenizer to convert characters into token ids.
PRETRAINED_MODEL_NAME = "bert-base-chinese"  # Chinese BERT
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
clear_output()

train_ids = []
dev_ids = []
test_ids = []
for train_text in train_texts:
  train_ids.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_text)))

for dev_text in dev_texts:
  dev_ids.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(dev_text)))

for test_text in test_texts:
  test_ids.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(test_text)))

print(train_ids[0])
print(dev_texts[66])
print(dev_labels[66])
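The labels are aligned per original character, while BertTokenizer can in principle split or merge characters (e.g., map rare ones to [UNK]); the small sketch below counts samples whose token sequence length no longer matches the label length:

# Sanity check (sketch): token ids and per-character labels should have equal lengths
mismatched = [i for i, (ids, lab) in enumerate(zip(train_ids, train_labels))
              if len(ids) != len(lab)]
print(f'samples with token/label length mismatch: {len(mismatched)}')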

Build the DataLoader

Use PyTorch's Dataset and DataLoader utilities.

import torch
from torch.utils.data import DataLoader,Dataset
from torch.nn.utils.rnn import pad_sequence

MaxLen = 40  # intended maximum sequence length (not actually used below)

class NewDataset(Dataset):
    def __init__(self, ids, labels):
        self.ids = ids
        self.labels = labels
        self.len = len(ids)

    def __getitem__(self, item):
        tokens_tensor = torch.tensor(self.ids[item])
        label_tensor = torch.tensor(self.labels[item])
        return (tokens_tensor, label_tensor)

    def __len__(self):
        return self.len

trainset = NewDataset(train_ids, train_labels)
devset = NewDataset(dev_ids, dev_labels)

BATCH_SIZE = 64

def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    label_tensors = [s[1] for s in samples]

    # zero-pad every sequence in the batch to the same length;
    # tokens and labels have identical lengths per sample, so they stay aligned
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    label_tensors = pad_sequence(label_tensors, batch_first=True, padding_value=0)

    # attention masks: 1 wherever tokens_tensors is not zero padding
    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)

    return tokens_tensors, masks_tensors, label_tensors


trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch, drop_last=False)
devloader = DataLoader(devset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch, drop_last=False)
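As a quick check that the collate function behaves as intended, peeking at one batch (a small sketch) should show all three tensors sharing the shape (batch_size, max_len_in_batch):

# Inspect one batch produced by create_mini_batch
batch_tokens, batch_masks, batch_labels = next(iter(trainloader))
print(batch_tokens.shape, batch_masks.shape, batch_labels.shape)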

Train the model

Use the BertForTokenClassification model.

from transformers import BertForTokenClassification

# num_labels must match the label dictionary: 'o' plus 10 entity types = 11
model = BertForTokenClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=len(label_type))
model.cuda()

model.train()
optimizer = torch.optim.Adam(model.parameters(),lr=1e-5)
Epochs = 10
for epoch in range(Epochs):
    losses = 0.0
    for data in trainloader:
        tokens_tensors, masks_tensors, label_tensors = [t.cuda() for t in data]
        optimizer.zero_grad()
        outputs = model(input_ids = tokens_tensors,attention_mask = masks_tensors,labels = label_tensors)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        losses += loss.item()
    print(f'epoch {epoch + 1}: total loss {losses:.4f}')

After only 10 epochs the model is still under-trained, so the results are not great yet.

Evaluation:

import numpy as np

def flat_accuracy(preds, labels):
    # preds: (seq_len, num_labels) logits; labels: (seq_len,) label ids
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

nb_eval_steps = 0
model.eval()
eval_loss, eval_accuracy = 0, 0
predictions, true_labels = [], []

for data in devloader:
    tokens_tensors, masks_tensors, label_tensors = [t.cuda() for t in data]
    with torch.no_grad():
        outputs = model(input_ids=tokens_tensors, attention_mask=masks_tensors, labels=label_tensors)
        loss = outputs[0]    # cross-entropy loss for the batch
        logits = outputs[1]  # (batch, seq_len, num_labels)

    for logit, label_tensor in zip(logits, label_tensors):
        # detach/cpu: take the tensor out of the computation graph before converting to numpy
        logit = logit.detach().cpu().numpy()
        label_ids = label_tensor.cpu().numpy()

        predictions.extend(np.argmax(logit, axis=1))
        true_labels.extend(label_ids)
        # accumulate accuracy and loss
        tmp_eval_accuracy = flat_accuracy(logit, label_ids)

        eval_loss += loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

print("Validation loss: {}".format(eval_loss/nb_eval_steps))
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

from sklearn.metrics import f1_score

pred_tags = list(np.array(predictions).flatten())
valid_tags = list(np.array(true_labels).flatten())
print(pred_tags[0:20])
print(valid_tags[0:20])
# f1_score expects (y_true, y_pred); the tags passed in are the raw label ids
print("F1-Score: {}".format(f1_score(valid_tags, pred_tags, average='weighted')))

Test on a single sentence:

text = '普京是俄罗斯的总统'
test_tokens = tokenizer.tokenize(text)
test_ids = tokenizer.convert_tokens_to_ids(test_tokens)
test_tokens_tensor = torch.tensor(test_ids)

# no padding here, so the attention mask is all ones
test_masks_tensor = torch.zeros(test_tokens_tensor.shape, dtype=torch.long)
test_masks_tensor = test_masks_tensor.masked_fill(test_tokens_tensor != 0, 1)

outputs = model(input_ids=test_tokens_tensor.unsqueeze(0).cuda(), attention_mask=test_masks_tensor.unsqueeze(0).cuda())
logits = outputs[0]
preds = []
for logit in logits:
  preds.extend(np.argmax(logit.detach().cpu().numpy(), axis=1))

# map predicted label ids back to label names
inverse_dict = dict([val, key] for key, val in label_type.items())
preds = [inverse_dict[i] for i in preds]

print(test_tokens)
print(preds)
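To make the output easier to read, the per-character predictions can be grouped into entity spans. The sketch below simply merges consecutive characters that share the same non-'o' label (it does not handle sub-word tokens or a B/I scheme):

# Merge consecutive characters with the same non-'o' label into entity spans
entities = []
current_chars, current_label = [], None
for ch, lab in zip(test_tokens, preds):
    if lab != 'o' and lab == current_label:
        current_chars.append(ch)
    else:
        if current_label is not None and current_label != 'o':
            entities.append((''.join(current_chars), current_label))
        current_chars, current_label = [ch], lab
if current_label is not None and current_label != 'o':
    entities.append((''.join(current_chars), current_label))

print(entities)  # e.g. [('普京', 'name'), ('俄罗斯', 'address')] if the model predicted perfectly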

Results:

Overall, BERT is strong enough that even with this limited training it reaches a fairly high F1 score.

However, the trained model still has several obvious shortcomings:

  • Label scheme: each entity type is mapped to a single label, but each type could instead be split into begin/inside/end positions (a BIO-style scheme; see the sketch after this list). Without this, the model's entity boundaries are sometimes slightly off (e.g. 【普京是】 was grouped together in the output).
  • Too many labels: with this many classes the model struggles to tell some of them apart; e.g. 【普京】 was assigned the company label when it should be a person name, so the model cannot yet fully separate the semantics of person and company names.
  • Insufficient training: the training loss has clearly not converged yet.
  • The model is too simple: no LSTM or CRF layer is added on top of BERT.
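For the first point, a BIO-style label dictionary and a correspondingly adapted decode_label might look roughly like the sketch below; the B-/I- prefixes and the resulting 21 classes are assumptions, not part of the original notebook:

# Hypothetical BIO tagging scheme: 'o' plus B-/I- tags for each of the 10 entity types
entity_types = ['address','book','company','game','government','movie','name','organization','position','scene']
bio_label_type = {'o': 0}
for t in entity_types:
    bio_label_type[f'B-{t}'] = len(bio_label_type)
    bio_label_type[f'I-{t}'] = len(bio_label_type)

def decode_label_bio(d):
    # First character of each span gets the B- tag, the rest get I- tags
    label = [0] * len(d['text'])
    for t, mentions in d['label'].items():
        for spans in mentions.values():
            for start, end in spans:
                label[start] = bio_label_type[f'B-{t}']
                for j in range(start + 1, end + 1):
                    label[j] = bio_label_type[f'I-{t}']
    return label

With such a scheme, num_labels would need to be len(bio_label_type) = 21 when loading BertForTokenClassification.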
