实体识别(4) -基于Bert进行商品标题实体识别[很详细]

基于Bert进行实体识别任务微调

致Great,ChallengeHub公众号,微信:1185918903,备注NLP技术交流

和鲸主页:https://www.heywhale.com/home/user/profile/58f387e7a686fb29e425d133

所需要的pip包

  • pandas

  • numpy

  • scikit-learn(导入名为 sklearn)

  • torch(PyTorch)

  • transformers: https://github.com/huggingface/transformers
    https://huggingface.co/models

  • seqeval

#!pip install transformers seqeval[gpu]

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda

数据处理

比赛数据下载地址:商品标题实体识别 https://www.heywhale.com/home/competition/620b34ed28270b0017b823ad

pd.DataFrame([[1,2,3],
             [4,5,6]])

| | 0 | 1 | 2 |


# Parse the training file: one "<word> <tag>" pair per line, with blank lines
# separating sentences. The result is a flat list of [sentence_id, word, tag].
with open('train_500.txt', 'r', encoding='utf-8') as f:
    tmp = []
    cnt = 1  # 1-based sentence counter; advanced on every blank line
    for line in tqdm(f.read().split('\n')):
        sentence_id = f'train_{cnt}'
        if not line.strip():
            # A blank (or whitespace-only) line closes the current sentence.
            cnt += 1
            continue
        word_tags = line.split(' ')
        # Keep only well-formed "<word> <tag>" lines. Lines that split into
        # more pieces (e.g. when the word itself is a space) are skipped on
        # purpose: the former fallback branch repeated the `== 2` test and
        # could never run, and a word containing a space cannot survive the
        # later ' '.join(...) / sentence.split() round trip without shifting
        # the labels of the rest of the sentence.
        if len(word_tags) == 2:
            tmp.append([sentence_id] + word_tags)

100%|████████████████████████████████████████████████████████████████████████| 28307/28307 [00:00<00:00, 886249.33it/s]
data=pd.DataFrame(tmp,columns=['sentence_id','words','tags'])
data

| | sentence_id | words | tags |

26674 rows × 3 columns

data[data['sentence_id']=='train_1']

| | sentence_id | words | tags |

65 rows × 3 columns

# For every token row, attach the full text and the full tag sequence of the
# sentence it belongs to (words joined by spaces, tags joined by commas).
tokens_by_sentence = data[['sentence_id', 'words', 'tags']].groupby(['sentence_id'])
sentence_text = tokens_by_sentence['words'].transform(lambda words: ' '.join(words))
sentence_tags = tokens_by_sentence['tags'].transform(lambda tags: ','.join(tags))
data['sentence'] = sentence_text
data['word_labels'] = sentence_tags
data.head()

| | sentence_id | words | tags | sentence | word_labels |

data.shape

(26674, 5)
data['sentence_id'].nunique()

501
# Build the tag <-> integer id lookup tables; ids follow the order in which
# each tag first appears in the data.
unique_tags = data.tags.unique()
labels_to_ids = {tag: idx for idx, tag in enumerate(unique_tags)}
ids_to_labels = dict(enumerate(unique_tags))
labels_to_ids

{'B-40': 0,
 'I-40': 1,
 'B-4': 2,
 'I-4': 3,
 'B-14': 4,
 'I-14': 5,
 'B-5': 6,
 'I-5': 7,
 'B-7': 8,
 'I-7': 9,
 'B-11': 10,
 'I-11': 11,
 'B-13': 12,
 'I-13': 13,
 'B-8': 14,
 'I-8': 15,
 'O': 16,
 'B-16': 17,
 'I-16': 18,
 'B-29': 19,
 'I-29': 20,
 'B-9': 21,
 'I-9': 22,
 'B-12': 23,
 'I-12': 24,
 'B-18': 25,
 'I-18': 26,
 'B-1': 27,
 'I-1': 28,
 'B-3': 29,
 'I-3': 30,
 'B-22': 31,
 'I-22': 32,
 'B-37': 33,
 'I-37': 34,
 'B-39': 35,
 'I-39': 36,
 'B-10': 37,
 'I-10': 38,
 'B-36': 39,
 'I-36': 40,
 'B-34': 41,
 'I-34': 42,
 'B-31': 43,
 'I-31': 44,
 'B-38': 45,
 'I-38': 46,
 'B-54': 47,
 'I-54': 48,
 'B-6': 49,
 'I-6': 50,
 'B-30': 51,
 'I-30': 52,
 'B-15': 53,
 'I-15': 54,
 'B-2': 55,
 'I-2': 56,
 'B-49': 57,
 'I-49': 58,
 'B-21': 59,
 'I-21': 60,
 'B-47': 61,
 'I-47': 62,
 'B-23': 63,
 'I-23': 64,
 'B-20': 65,
 'I-20': 66,
 'B-50': 67,
 'I-50': 68,
 'B-46': 69,
 'I-46': 70,
 'B-41': 71,
 'I-41': 72,
 'B-43': 73,
 'I-43': 74,
 'B-48': 75,
 'I-48': 76,
 'B-19': 77,
 'I-19': 78,
 'B-52': 79,
 'I-52': 80}
len(labels_to_ids)

81
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
# 也可以根据sentence_id去重
data.head()

| | sentence | word_labels |

len(data)

501
data.iloc[1].sentence

'牛 皮 纸 袋 手 提 袋 定 制 l o g o 烘 焙 购 物 服 装 包 装 外 卖 打 包 袋 子 礼 品 袋 纸 质 黑 色 3 2 * 1 1 * 2 5 大 横 1 0 0 个'
data.iloc[1].word_labels

'B-4,I-4,I-4,I-4,B-4,I-4,I-4,B-29,I-29,I-29,I-29,I-29,I-29,B-9,I-9,B-5,I-5,B-40,I-40,B-4,I-4,B-40,I-40,B-5,I-5,B-4,I-4,B-4,I-4,I-4,B-12,I-12,B-16,I-16,B-18,I-18,I-18,I-18,I-18,I-18,I-18,I-18,B-13,I-13,B-18,I-18,I-18,I-18'
len(data['sentence'][0].split(' '))

65
data['sentence'].apply(lambda x:len(x.split(' '))).describe()

count    501.000000
mean      53.241517
std       12.810135
min        8.000000
25%       44.000000
50%       53.000000
75%       62.000000
max       91.000000
Name: sentence, dtype: float64

构建DataLoader

# Fixed sequence length (in tokens) that every example is truncated/padded to.
# NOTE(review): the dataset adds [CLS] and [SEP] before truncating, so the
# longest sentences do not fit completely at this length - confirm intended.
MAX_LEN = 91 # 120
# Batch sizes for the training and evaluation DataLoaders.
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
# Number of passes over the training set.
EPOCHS = 5
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 1e-05
# Maximum gradient norm used for gradient clipping in the training loop.
MAX_GRAD_NORM = 5
# MODEL_NAME='chinese-roberta-wwm-ext'
# Checkpoint identifier used for both the tokenizer and the model weights.
MODEL_NAME='hfl/chinese-roberta-wwm-ext'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME) # encode_plus() encodes a whole text in one call

BERT做NER 一个棘手部分是 BERT 依赖于 wordpiece tokenization,而不是 word tokenization。

比如:Washington 的标签为 “b-gpe”,分词之后得到 “Wash”、“##ing”、“##ton” 三个 subword,它们的标签分别设为 “b-gpe”、“b-gpe”、“b-gpe”。

def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """Tokenize a sentence word by word and repeat each word's label per sub-token.

    Word-piece tokenization makes it hard to line word-level labels up with
    sub-tokens, so every word is tokenized on its own and its label is copied
    onto each piece it produces. This is slower than tokenizing the sentence
    in one call, but keeps tokens and labels aligned.

    Args:
        sentence: Whitespace-separated words.
        text_labels: Comma-separated labels, one per word.
        tokenizer: Object exposing ``tokenize(word)`` that returns a list of
            sub-tokens.

    Returns:
        A ``(tokens, labels)`` tuple of two lists of equal length. Words and
        labels are paired with ``zip``, so surplus items on either side are
        ignored.
    """
    words = sentence.strip().split()
    word_tags = text_labels.split(",")

    pieces = []
    piece_tags = []
    for word, tag in zip(words, word_tags):
        sub_tokens = tokenizer.tokenize(word)
        pieces += sub_tokens
        # One copy of the word's tag for each sub-token it was split into.
        piece_tags += [tag] * len(sub_tokens)

    return pieces, piece_tags

data.iloc[0]

sentence       手 机 三 脚 架 网 红 直 播 支 架 桌 面 自 拍 杆 蓝 牙 遥 控 三 脚 架 ...
word_labels    B-40,I-40,B-4,I-4,I-4,B-14,I-14,B-5,I-5,B-4,I-...
Name: 0, dtype: object
# tokenize_and_preserve_labels(data.iloc[0]['sentence'],data.iloc[0]['word_labels'],tokenizer)

这里有其他的处理方式,比如只有第一个subword给定原始标签,其他subword给定一个无关标签

# BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding

# https://arxiv.org/abs/1810.04805

encoding_result=tokenizer.encode_plus('这里有其他的处理方式,比如只有第一个subword给定原始标签,其他subword给定一个无关标签')
encoding_result.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
encoding_result

{'input_ids': [101, 6821, 7027, 3300, 1071, 800, 4638, 1905, 4415, 3175, 2466, 8024, 3683, 1963, 1372, 3300, 5018, 671, 702, 11541, 8204, 10184, 5314, 2137, 1333, 1993, 3403, 5041, 8024, 1071, 800, 11541, 8204, 10184, 5314, 2137, 671, 702, 3187, 1068, 3403, 5041, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
# tokenizer.convert_ids_to_tokens([101, 6821, 7027, 3300, 1071, 800, 4638, 1905, 4415, 3175, 2466, 8024, 3683, 1963, 1372, 3300, 5018, 671, 702, 11541, 8204, 10184, 5314, 2137, 1333, 1993, 3403, 5041, 8024, 1071, 800, 11541, 8204, 10184, 5314, 2137, 671, 702, 3187, 1068, 3403, 5041, 102])

class dataset(Dataset):
    """Map-style dataset yielding fixed-length tensors for BERT token classification.

    Each row of ``dataframe`` provides a ``sentence`` (space-separated words)
    and ``word_labels`` (comma-separated tags, one per word). Every item is a
    dict with ``ids``, ``mask`` and ``targets`` tensors of length ``max_len``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Keep the data source, the tokenizer and the fixed sequence length."""
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _add_special_tokens_and_pad(tokens, labels, max_len):
        """Wrap a sequence in [CLS]/[SEP] and bring it to exactly ``max_len`` items.

        Args:
            tokens: Sub-tokens of one sentence, without special tokens.
            labels: One label per entry of ``tokens``.
            max_len: Target length of the returned lists.

        Returns:
            ``(tokens, labels, attention_mask)`` lists, each of length
            ``max_len``; the mask is 1 for real tokens and 0 for ``[PAD]``.
        """
        # Truncate the content *before* adding the special tokens so that an
        # over-long sentence still ends with [SEP].
        room = max(max_len - 2, 0)
        tokens = ["[CLS]"] + list(tokens[:room]) + ["[SEP]"]
        # Both special tokens are labelled "O". The [SEP] label has to be
        # appended: list.insert(-1, ...) would place it before the last real
        # label and shift that label onto [SEP].
        labels = ["O"] + list(labels[:room]) + ["O"]

        # Only has an effect for degenerate max_len values below 2.
        tokens = tokens[:max_len]
        labels = labels[:max_len]

        # Pad up to max_len; padded positions are labelled "O" and masked out.
        tokens = tokens + ['[PAD]'] * (max_len - len(tokens))
        labels = labels + ["O"] * (max_len - len(labels))
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokens]
        return tokens, labels, attn_mask

    def __getitem__(self, index):
        """Return the encoded example at ``index`` as a dict of long tensors."""
        # Step 1: tokenize word by word, repeating labels per sub-token.
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # Steps 2-4: special tokens, truncation/padding and attention mask.
        tokenized_sentence, labels, attn_mask = self._add_special_tokens_and_pad(
            tokenized_sentence, labels, self.max_len
        )

        # Step 5: map tokens and labels to their integer ids.
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [labels_to_ids[label] for label in labels]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        """Return the number of rows captured at construction time."""
        return self.len

按照 0.8:0.2 的比例将数据集划分为训练集和测试集

from sklearn.model_selection import train_test_split
# train_dataset,test_dataset=train_test_split(data,test_size=0.2,random_state=42)

# Randomly pick 80% of the sentences for training (fixed seed for
# reproducibility); the remaining rows form the test set. Both frames get a
# fresh 0..n-1 index.
train_size = 0.8
sampled_rows = data.sample(frac=train_size, random_state=200)
test_dataset = data.drop(index=sampled_rows.index).reset_index(drop=True)
train_dataset = sampled_rows.reset_index(drop=True)

for split_name, frame in (("FULL", data), ("TRAIN", train_dataset), ("TEST", test_dataset)):
    print(f"{split_name} Dataset: {frame.shape}")

training_set, testing_set = (
    dataset(frame, tokenizer, MAX_LEN) for frame in (train_dataset, test_dataset)
)

FULL Dataset: (501, 2)
TRAIN Dataset: (401, 2)
TEST Dataset: (100, 2)

下面为第一个样本的分词id与标签:

training_set[0]

{'ids': tensor([ 101, 3345, 2533, 1164, 2137, 1169, 5011, 6381, 3315, 4851, 4665, 1947,
         6163, 7770, 3440, 4851, 1501, 1215, 1062, 6381,  752, 3315, 1555, 1218,
         1062, 1385, 6843, 4851, 3136, 2360, 5688, 4851, 4289,  143,  126, 1217,
         1331, 2339,  868,  833, 6379, 5011, 6381, 3315, 2094, 2137,  976,  143,
          126, 5273, 5682,  523, 1285, 5277, 5436, 4667,  163, 4669, 4851, 4665,
          524,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0]),
 'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'targets': tensor([16, 27, 28, 28, 19, 20,  2,  3,  3,  2,  3, 12, 13,  4,  5,  2,  3,  6,
          7,  2,  3,  3,  4,  5,  8,  9,  6,  7,  2,  3,  3,  3,  3, 25, 26, 12,
         13,  6,  7,  6,  7,  2,  3,  3,  3, 19, 20, 25, 26, 17, 18, 16, 16, 16,
         12, 13,  2,  3,  2,  3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
         16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
         16])}
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"]), training_set[0]["targets"]):
  print('{0:10}  {1}   {2}'.format(token, label,ids_to_labels[label.numpy().tolist()]))

[CLS]       16   O
杰           27   B-1
得           28   I-1
利           28   I-1
定           19   B-29
制           20   I-29
笔           2   B-4
记           3   I-4
本           3   I-4
礼           2   B-4
盒           3   I-4
套           12   B-13
装           13   I-13
高           4   B-14
档           5   I-14
礼           2   B-4
品           3   I-4
办           6   B-5
公           7   I-5
记           2   B-4
事           3   I-4
本           3   I-4
商           4   B-14
务           5   I-14
公           8   B-7
司           9   I-7
送           6   B-5
礼           7   I-5
教           2   B-4
师           3   I-4
节           3   I-4
礼           3   I-4
物           3   I-4
a           25   B-18
5           26   I-18
加           12   B-13
厚           13   I-13
工           6   B-5
作           7   I-5
会           6   B-5
议           7   I-5
笔           2   B-4
记           3   I-4
本           3   I-4
子           3   I-4
定           19   B-29
做           20   I-29
a           25   B-18
5           26   I-18
红           17   B-16
色           18   I-16
【           16   O
升           16   O
级           16   O
翻           12   B-13
盖           13   I-13
u           2   B-4
盘           3   I-4
礼           2   B-4
盒           3   I-4
】           16   O
[SEP]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O

创建Pytorch的DataLoader

# Keyword arguments for the two DataLoaders; they differ only in batch size.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

定义网络

  • 模型结构:BertForTokenClassification

  • 预训练权重: “hfl/chinese-roberta-wwm-ext”(即上文的 MODEL_NAME)

len(labels_to_ids)

81
model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(labels_to_ids))
model.to(device)

训练模型

ids.shape

torch.Size([1, 91])
# Sanity check: push the first training example through the untrained model
# as a batch of one and look at the initial loss.
first_example = training_set[0]
ids, mask, targets = (
    first_example[key].unsqueeze(0).to(device) for key in ("ids", "mask", "targets")
)
# With labels supplied, the model output holds the loss first and the logits second.
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss

tensor(4.5096, device='cuda:0', grad_fn=)

模型输出logits大小为 (batch_size, sequence_length, num_labels):

tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 91, 81])

设置优化器Adam

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# 训练函数
def train(epoch):
    """Run one training epoch over ``training_loader`` and print loss and accuracy.

    Relies on the module-level ``model``, ``optimizer``, ``training_loader``,
    ``device`` and ``MAX_GRAD_NORM``.

    Args:
        epoch: Index of the current epoch. It is not used inside the function
            and is kept so existing callers keep working.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # Put the model in training mode.
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs[0], outputs[1]
        tr_loss += loss.item()

        nb_tr_steps += 1

        if idx % 50 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 50 training steps: {loss_step}")

        # Token-level accuracy over non-padding positions only.
        flattened_targets = targets.view(-1)  # (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels)  # (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)  # (batch_size * seq_len,)
        active_accuracy = mask.view(-1) == 1  # True where the token is not padding
        active_targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tmp_tr_accuracy = accuracy_score(active_targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # Backward pass. Clipping must happen after backward() has produced
        # this step's gradients and before the optimizer consumes them;
        # clipping before zero_grad()/backward() has no effect on the update.
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

训练模型

for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 50 training steps: 4.573911666870117
Training loss per 50 training steps: 3.5836149757983637
Training loss per 50 training steps: 3.146424697177245
Training loss epoch: 3.146424697177245
Training accuracy epoch: 0.28337175397646347
Training epoch: 2
Training loss per 50 training steps: 2.3866159915924072
Training loss per 50 training steps: 2.211251039131015
Training loss per 50 training steps: 2.0536219070453456
Training loss epoch: 2.0536219070453456
Training accuracy epoch: 0.49648706430276834
Training epoch: 3
Training loss per 50 training steps: 1.8235304355621338
Training loss per 50 training steps: 1.6210375042522656
Training loss per 50 training steps: 1.5436867876808242
Training loss epoch: 1.5436867876808242
Training accuracy epoch: 0.6369489455144468
Training epoch: 4
Training loss per 50 training steps: 1.3719302415847778
Training loss per 50 training steps: 1.254675311200759
Training loss per 50 training steps: 1.2525309105910878
Training loss epoch: 1.2525309105910878
Training accuracy epoch: 0.7013529778539404
Training epoch: 5
Training loss per 50 training steps: 1.2091379165649414
Training loss per 50 training steps: 1.0707006524590885
Training loss per 50 training steps: 1.0643499292949639
Training loss epoch: 1.0643499292949639
Training accuracy epoch: 0.7417508051186237

评估模型

验证集评估

def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and return gold and predicted tags.

    Prints a running loss on every 100th batch and the mean loss and mean
    per-batch token accuracy at the end. Padding positions (attention mask 0)
    are excluded from the accuracy and from the returned sequences.

    Args:
        model: Token-classification model returning ``(loss, logits, ...)``
            when called with labels.
        testing_loader: Iterable of batches with ``ids``, ``mask`` and
            ``targets`` tensors.

    Returns:
        ``(labels, predictions)``: two flat lists of tag strings covering all
        non-padding tokens of all batches, in iteration order.
    """
    # Switch to evaluation mode.
    model.eval()

    running_loss, running_accuracy = 0, 0
    step_count = 0
    gold_ids, predicted_ids = [], []

    with torch.no_grad():
        for step, batch in enumerate(testing_loader):
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attention = batch['mask'].to(device, dtype=torch.long)
            gold = batch['targets'].to(device, dtype=torch.long)

            outputs = model(input_ids=input_ids, attention_mask=attention, labels=gold)
            batch_loss, logits = outputs[0], outputs[1]
            running_loss += batch_loss.item()
            step_count += 1

            if step % 100 == 0:
                loss_step = running_loss / step_count
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Flatten to (batch_size * seq_len,) and keep non-padding positions.
            keep = attention.view(-1) == 1
            batch_gold = gold.view(-1)[keep]
            batch_pred = logits.view(-1, model.num_labels).argmax(dim=1)[keep]

            gold_ids.extend(batch_gold.tolist())
            predicted_ids.extend(batch_pred.tolist())

            running_accuracy += accuracy_score(batch_gold.cpu().numpy(), batch_pred.cpu().numpy())

    # Map integer ids back to tag strings.
    labels = [ids_to_labels[label_id] for label_id in gold_ids]
    predictions = [ids_to_labels[label_id] for label_id in predicted_ids]

    eval_loss = running_loss / step_count
    eval_accuracy = running_accuracy / step_count
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.8113014698028564
Validation Loss: 1.1529839837551117
Validation Accuracy: 0.7087672360508763
# len(predictions),len(labels)

tmp=[]
for tags in data['word_labels']:
    tmp.extend(tags.split(','))
pd.Series(tmp).value_counts()

I-4     3856
O       2970
B-4     2061
I-18    1777
I-38    1487
        ... 
I-48       1
I-23       1
B-23       1
B-52       1
B-46       1
Length: 81, dtype: int64
ids_to_labels[18]

'I-16'
from seqeval.metrics import classification_report

print(classification_report([labels], [predictions])) # [] 避免报错TypeError: Found input variables without list of list.

precision    recall  f1-score   support

           1       0.65      0.72      0.69        68
          10       0.00      0.00      0.00        24
          11       0.67      0.71      0.69       145
          12       0.38      0.38      0.38        21
          13       0.41      0.58      0.48       137
          14       0.57      0.90      0.70        51
          15       0.00      0.00      0.00         5
          16       0.68      0.72      0.70        78
          18       0.48      0.52      0.50       157
          19       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         4
          21       0.00      0.00      0.00         1
          22       0.00      0.00      0.00        13
          29       0.00      0.00      0.00        13
           3       0.13      0.20      0.16        25
          30       0.00      0.00      0.00         2
          34       0.00      0.00      0.00         1
          36       0.00      0.00      0.00         2
          37       0.34      0.56      0.42        34
          38       0.28      0.40      0.33        82
          39       0.00      0.00      0.00        10
           4       0.68      0.79      0.73       417
          40       0.51      0.56      0.54       108
          46       0.00      0.00      0.00         1
          47       0.00      0.00      0.00         2
           5       0.49      0.68      0.57        81
          50       0.00      0.00      0.00         2
          54       0.50      0.57      0.53        14
           6       0.00      0.00      0.00        10
           7       0.69      0.90      0.78        59
           8       0.69      0.83      0.76        41
           9       0.20      0.04      0.06        27

   micro avg       0.54      0.62      0.58      1636
   macro avg       0.26      0.31      0.28      1636
weighted avg       0.53      0.62      0.57      1636

F:\ProgramData\Anaconda3\lib\site-packages\seqeval\metrics\v1.py:57: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

预测

''.join(data.iloc[0]['sentence'].split())

'手机三脚架网红直播支架桌面自拍杆蓝牙遥控三脚架摄影拍摄拍照抖音看电视神器三角架便携伸缩懒人户外支撑架【女神粉】自带三脚架+蓝牙遥控'
sentence = "手机三脚架网红直播支架桌面自拍杆蓝牙遥控三脚架摄影拍摄拍照抖音看电视神器三角架便携伸缩懒人户外支撑架【女神粉】自带三脚架+蓝牙遥控"

inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# Move the inputs to the model's device.
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)

# Inference only: evaluation mode and no gradient tracking, so the result
# does not depend on whether a training or evaluation cell ran last.
model.eval()
with torch.no_grad():
    outputs = model(input_ids=ids, attention_mask=mask)
# Without labels, the first output element holds the logits.
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels)  # (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1)  # (batch_size * seq_len,)

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.tolist()]
wp_preds = list(zip(tokens, token_predictions))  # (wordpiece, prediction) pairs

special_tokens = ('[CLS]', '[SEP]', '[PAD]')

# Keep one prediction per word: drop special tokens and continuation pieces.
# Continuation pieces start with "##" (no leading space), matching the
# " ##" -> "" merge applied to the text below.
word_level_predictions = [
    label
    for token, label in wp_preds
    if not token.startswith("##") and token not in special_tokens
]

# Rebuild the text, gluing continuation pieces back onto their word.
str_rep = " ".join(token for token, _ in wp_preds if token not in special_tokens).replace(" ##", "")
print(str_rep)
print(word_level_predictions)

手 机 三 脚 架 网 红 直 播 支 架 桌 面 自 拍 杆 蓝 牙 遥 控 三 脚 架 摄 影 拍 摄 拍 照 抖 音 看 电 视 神 器 三 角 架 便 携 伸 缩 懒 人 户 外 支 撑 架 【 女 神 粉 】 自 带 三 脚 架 + 蓝 牙 遥 控
['B-40', 'I-40', 'B-4', 'I-4', 'I-4', 'B-14', 'I-8', 'B-5', 'I-5', 'B-4', 'I-4', 'B-7', 'I-7', 'B-4', 'I-4', 'I-4', 'B-11', 'I-11', 'B-11', 'I-11', 'B-4', 'I-4', 'I-4', 'B-5', 'I-5', 'B-5', 'I-5', 'B-5', 'I-5', 'B-5', 'I-5', 'B-5', 'I-5', 'I-5', 'O', 'O', 'B-4', 'I-4', 'I-4', 'B-11', 'I-11', 'B-11', 'I-11', 'B-8', 'I-8', 'B-7', 'I-7', 'B-4', 'I-4', 'I-4', 'O', 'B-8', 'I-8', 'O', 'O', 'B-13', 'I-11', 'B-4', 'I-4', 'I-4', 'O', 'B-11', 'I-11', 'B-11', 'O']

保存模型

保存模型词汇表、模型权重和配置文件,之后可以用 from_pretrained() 重新加载

import os

directory = "./model"

# Create the output directory if needed; exist_ok avoids the race between
# checking for the directory and creating it.
os.makedirs(directory, exist_ok=True)

# Save the tokenizer vocabulary.
tokenizer.save_vocabulary(directory)
# Save the model weights and configuration file.
model.save_pretrained(directory)
print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed

其他

def prepare_sentence(sentence, tokenizer, maxlen):
    """Encode a raw sentence into fixed-length model inputs.

    Args:
        sentence: Text to encode.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        maxlen: Length of the returned sequences.

    Returns:
        A dict with two long tensors of length ``maxlen``: ``ids`` (token ids)
        and ``mask`` (1 for real tokens, 0 for ``[PAD]``).
    """
    # Steps 1-2: tokenize, truncate the content so that [SEP] still fits,
    # then add the special tokens. Truncating after adding them would cut
    # [SEP] off over-long inputs.
    room = max(maxlen - 2, 0)
    tokenized_sentence = ["[CLS]"] + list(tokenizer.tokenize(sentence)[:room]) + ["[SEP]"]
    # Only has an effect for degenerate maxlen values below 2.
    tokenized_sentence = tokenized_sentence[:maxlen]

    # Step 3: pad up to maxlen.
    tokenized_sentence = tokenized_sentence + ['[PAD]'] * (maxlen - len(tokenized_sentence))

    # Step 4: attention mask, 0 on padding positions.
    attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

    # Step 5: convert tokens to vocabulary ids.
    ids = tokenizer.convert_tokens_to_ids(tokenized_sentence)

    return {
        'ids': torch.tensor(ids, dtype=torch.long),
        'mask': torch.tensor(attn_mask, dtype=torch.long),
        #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
    }

# Bert:
- Bert CRF
- Bert BiLSTM+CRF
- Lex-Bert
- FLAT-NER:FLAT: Chinese NER Using Flat-Lattice Transformer
- Unified Named Entity Recognition as Word-Word Relation Classification
  https://github.com/ljynlp/W2NER
# 数据

- 数据增强:https://github.com/425776024/nlpcda
- 语义增强:embedding 拼音 偏旁 
- 伪标签学习

你可能感兴趣的:(NLP,实体识别,bert,深度学习,自然语言处理)