致Great,ChallengeHub公众号,微信:1185918903,备注NLP技术交流
和鲸主页:https://www.heywhale.com/home/user/profile/58f387e7a686fb29e425d133
pandas
numpy
sklearn
pytorch
transformers: https://github.com/huggingface/transformers
https://huggingface.co/models
seqeval
#!pip install transformers seqeval[gpu]
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification
from torch import cuda
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)
cuda
比赛数据下载地址:商品标题实体识别 https://www.heywhale.com/home/competition/620b34ed28270b0017b823ad
pd.DataFrame([[1,2,3],
[4,5,6]])
| | 0 | 1 | 2 |
# Parse the CoNLL-style training file: one "<word> <tag>" pair per line, with
# blank lines separating sentences. Every kept token becomes a row
# [sentence_id, word, tag] in `tmp`.
with open('train_500.txt', 'r', encoding='utf-8') as f:
    tmp = []
    cnt = 1  # sentence counter; incremented on every blank line
    for line in tqdm(f.read().split('\n')):
        if not line.strip():
            # Blank (or whitespace-only) line: the current sentence has ended.
            cnt += 1
            continue
        # Split on the LAST space only, so the tag is always the final field
        # whatever the word part looks like.
        word, sep, tag = line.rpartition(' ')
        if not sep:
            # No space at all: the line carries no tag.
            continue
        if len(word.split()) != 1:
            # The word is empty, pure whitespace (e.g. the line "  O") or has
            # inner whitespace. Such tokens are dropped on purpose: sentences
            # are later rebuilt with ' '.join(...) and re-split on whitespace,
            # so keeping them would shift every following label by one.
            continue
        tmp.append([f'train_{cnt}', word, tag])
100%|████████████████████████████████████████████████████████████████████████| 28307/28307 [00:00<00:00, 886249.33it/s]
# One row per token: the sentence it belongs to, the token text and its tag.
data=pd.DataFrame(tmp,columns=['sentence_id','words','tags'])
data
| | sentence_id | words | tags |
26674 rows × 3 columns
data[data['sentence_id']=='train_1']
| | sentence_id | words | tags |
65 rows × 3 columns
# Attach to every token row the full sentence it belongs to (words joined by
# single spaces) and the matching comma-separated tag sequence.
data['sentence'] = data.groupby('sentence_id')['words'].transform(' '.join)
data['word_labels'] = data.groupby('sentence_id')['tags'].transform(','.join)
data.head()
| | sentence_id | words | tags | sentence | word_labels |
data.shape
(26674, 5)
data['sentence_id'].nunique()
501
# Lookup tables between tag strings and integer ids; ids follow the order in
# which each tag first appears in the data.
unique_tags = data.tags.unique()
labels_to_ids = {tag: idx for idx, tag in enumerate(unique_tags)}
ids_to_labels = dict(enumerate(unique_tags))
labels_to_ids
{'B-40': 0,
'I-40': 1,
'B-4': 2,
'I-4': 3,
'B-14': 4,
'I-14': 5,
'B-5': 6,
'I-5': 7,
'B-7': 8,
'I-7': 9,
'B-11': 10,
'I-11': 11,
'B-13': 12,
'I-13': 13,
'B-8': 14,
'I-8': 15,
'O': 16,
'B-16': 17,
'I-16': 18,
'B-29': 19,
'I-29': 20,
'B-9': 21,
'I-9': 22,
'B-12': 23,
'I-12': 24,
'B-18': 25,
'I-18': 26,
'B-1': 27,
'I-1': 28,
'B-3': 29,
'I-3': 30,
'B-22': 31,
'I-22': 32,
'B-37': 33,
'I-37': 34,
'B-39': 35,
'I-39': 36,
'B-10': 37,
'I-10': 38,
'B-36': 39,
'I-36': 40,
'B-34': 41,
'I-34': 42,
'B-31': 43,
'I-31': 44,
'B-38': 45,
'I-38': 46,
'B-54': 47,
'I-54': 48,
'B-6': 49,
'I-6': 50,
'B-30': 51,
'I-30': 52,
'B-15': 53,
'I-15': 54,
'B-2': 55,
'I-2': 56,
'B-49': 57,
'I-49': 58,
'B-21': 59,
'I-21': 60,
'B-47': 61,
'I-47': 62,
'B-23': 63,
'I-23': 64,
'B-20': 65,
'I-20': 66,
'B-50': 67,
'I-50': 68,
'B-46': 69,
'I-46': 70,
'B-41': 71,
'I-41': 72,
'B-43': 73,
'I-43': 74,
'B-48': 75,
'I-48': 76,
'B-19': 77,
'I-19': 78,
'B-52': 79,
'I-52': 80}
len(labels_to_ids)
81
# Collapse the per-token rows to one row per sentence: all token rows of a
# sentence share the same (sentence, word_labels) pair, so dropping duplicates
# keeps a single copy. The index is reset so rows are numbered from 0.
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
# De-duplicating on sentence_id would work as well.
data.head()
| | sentence | word_labels |
len(data)
501
data.iloc[1].sentence
'牛 皮 纸 袋 手 提 袋 定 制 l o g o 烘 焙 购 物 服 装 包 装 外 卖 打 包 袋 子 礼 品 袋 纸 质 黑 色 3 2 * 1 1 * 2 5 大 横 1 0 0 个'
data.iloc[1].word_labels
'B-4,I-4,I-4,I-4,B-4,I-4,I-4,B-29,I-29,I-29,I-29,I-29,I-29,B-9,I-9,B-5,I-5,B-40,I-40,B-4,I-4,B-40,I-40,B-5,I-5,B-4,I-4,B-4,I-4,I-4,B-12,I-12,B-16,I-16,B-18,I-18,I-18,I-18,I-18,I-18,I-18,I-18,B-13,I-13,B-18,I-18,I-18,I-18'
len(data['sentence'][0].split(' '))
65
data['sentence'].apply(lambda x:len(x.split(' '))).describe()
count 501.000000
mean 53.241517
std 12.810135
min 8.000000
25% 44.000000
50% 53.000000
75% 62.000000
max 91.000000
Name: sentence, dtype: float64
# Fixed length (in tokens) every sample is truncated or padded to. 91 equals
# the longest sentence in the describe() output above; because [CLS] and [SEP]
# are added on top, sentences of 90+ tokens end up truncated.
MAX_LEN = 91 # alternative: 120
# Batch sizes of the training and evaluation DataLoaders.
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
# Number of passes over the training set.
EPOCHS = 5
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 1e-05
# Maximum gradient norm passed to clip_grad_norm_ during training.
MAX_GRAD_NORM = 5
# Alternative model name without the "hfl/" prefix (presumably a local copy — TODO confirm):
# MODEL_NAME='chinese-roberta-wwm-ext'
MODEL_NAME='hfl/chinese-roberta-wwm-ext'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME) # encode_plus() encodes a whole text at once
用 BERT 做 NER 的一个棘手之处在于:BERT 依赖 wordpiece 分词,而不是按词(word)分词,因此需要把词级标签对齐到每个 subword 上。
比如:Washington 的标签为 “b-gpe”,分词之后得到 “Wash”、“##ing”、“##ton” 三个 subword,它们的标签依次为 “b-gpe”、“b-gpe”、“b-gpe”。
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """Tokenize a sentence word by word and repeat each word's label per sub-token.

    WordPiece tokenization makes it hard to match word-level labels to
    individual sub-words, so every word is tokenized on its own and its label
    is copied onto each resulting piece. This is slower than tokenizing the
    whole sentence at once, but keeps tokens and labels aligned.

    Args:
        sentence: Whitespace-separated words.
        text_labels: Comma-separated labels, one per word.
        tokenizer: Object whose ``tokenize(word)`` returns a list of sub-tokens.

    Returns:
        A ``(tokens, labels)`` tuple of two equally long lists. Words and
        labels are paired with ``zip``, so surplus items on either side are
        silently ignored.
    """
    words = sentence.strip().split()
    word_tags = text_labels.split(",")

    pieces = []
    piece_tags = []
    for word, tag in zip(words, word_tags):
        # A word may yield zero, one or several pieces; each one gets the
        # label of the word it came from.
        for piece in tokenizer.tokenize(word):
            pieces.append(piece)
            piece_tags.append(tag)
    return pieces, piece_tags
data.iloc[0]
sentence 手 机 三 脚 架 网 红 直 播 支 架 桌 面 自 拍 杆 蓝 牙 遥 控 三 脚 架 ...
word_labels B-40,I-40,B-4,I-4,I-4,B-14,I-14,B-5,I-5,B-4,I-...
Name: 0, dtype: object
# tokenize_and_preserve_labels(data.iloc[0]['sentence'],data.iloc[0]['word_labels'],tokenizer)
这里有其他的处理方式,比如只有第一个subword给定原始标签,其他subword给定一个无关标签
# BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
# https://arxiv.org/abs/1810.04805
encoding_result=tokenizer.encode_plus('这里有其他的处理方式,比如只有第一个subword给定原始标签,其他subword给定一个无关标签')
encoding_result.keys()
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
encoding_result
{'input_ids': [101, 6821, 7027, 3300, 1071, 800, 4638, 1905, 4415, 3175, 2466, 8024, 3683, 1963, 1372, 3300, 5018, 671, 702, 11541, 8204, 10184, 5314, 2137, 1333, 1993, 3403, 5041, 8024, 1071, 800, 11541, 8204, 10184, 5314, 2137, 671, 702, 3187, 1068, 3403, 5041, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
# tokenizer.convert_ids_to_tokens([101, 6821, 7027, 3300, 1071, 800, 4638, 1905, 4415, 3175, 2466, 8024, 3683, 1963, 1372, 3300, 5018, 671, 702, 11541, 8204, 10184, 5314, 2137, 1333, 1993, 3403, 5041, 8024, 1071, 800, 11541, 8204, 10184, 5314, 2137, 671, 702, 3187, 1068, 3403, 5041, 102])
class dataset(Dataset):
    """Torch dataset turning (sentence, word_labels) rows into fixed-length inputs.

    Every item is a dict of three ``torch.long`` tensors of length ``max_len``:
    ``ids`` (vocabulary ids), ``mask`` (1 for real tokens, 0 for ``[PAD]``) and
    ``targets`` (label ids; ``[CLS]``, ``[SEP]`` and padding are labelled "O").

    Args:
        dataframe: Frame with a ``sentence`` column (space-separated words) and
            a ``word_labels`` column (comma-separated tags); rows are looked
            up by index label.
        tokenizer: Tokenizer providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_len: Output length including ``[CLS]`` and ``[SEP]``; expected >= 2.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # Step 1: tokenize the sentence word by word, keeping labels aligned.
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # Step 2: truncate the body so that both special tokens always fit,
        # then wrap it in [CLS] ... [SEP]. The special tokens are labelled "O".
        # (`labels.insert(-1, "O")` would place the "O" *before* the last real
        # label instead of after it, so the labels are rebuilt explicitly.)
        maxlen = self.max_len
        body_len = max(maxlen - 2, 0)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence[:body_len] + ["[SEP]"]
        labels = ["O"] + labels[:body_len] + ["O"]

        # Step 3: pad up to maxlen (a non-positive count adds nothing).
        n_pad = maxlen - len(tokenized_sentence)
        tokenized_sentence = tokenized_sentence + ["[PAD]"] * n_pad
        labels = labels + ["O"] * n_pad

        # Step 4: attention mask, 0 on padding and 1 everywhere else.
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # Step 5: map tokens and labels to their integer ids.
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [labels_to_ids[label] for label in labels]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
            'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len
按照 0.8:0.2 的比例将数据集划分为训练集和测试集
from sklearn.model_selection import train_test_split

# Alternative split via scikit-learn:
# train_dataset,test_dataset=train_test_split(data,test_size=0.2,random_state=42)
train_size = 0.8
# Sample the training rows with a fixed seed; every row that was not sampled
# goes to the test set. Both frames are re-indexed from 0 afterwards.
train_dataset = data.sample(frac=train_size, random_state=200)
held_out = ~data.index.isin(train_dataset.index)
test_dataset = data.loc[held_out].reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)
FULL Dataset: (501, 2)
TRAIN Dataset: (401, 2)
TEST Dataset: (100, 2)
下面为第一个样本的分词id与标签:
training_set[0]
{'ids': tensor([ 101, 3345, 2533, 1164, 2137, 1169, 5011, 6381, 3315, 4851, 4665, 1947,
6163, 7770, 3440, 4851, 1501, 1215, 1062, 6381, 752, 3315, 1555, 1218,
1062, 1385, 6843, 4851, 3136, 2360, 5688, 4851, 4289, 143, 126, 1217,
1331, 2339, 868, 833, 6379, 5011, 6381, 3315, 2094, 2137, 976, 143,
126, 5273, 5682, 523, 1285, 5277, 5436, 4667, 163, 4669, 4851, 4665,
524, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0]),
'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
'targets': tensor([16, 27, 28, 28, 19, 20, 2, 3, 3, 2, 3, 12, 13, 4, 5, 2, 3, 6,
7, 2, 3, 3, 4, 5, 8, 9, 6, 7, 2, 3, 3, 3, 3, 25, 26, 12,
13, 6, 7, 6, 7, 2, 3, 3, 3, 19, 20, 25, 26, 17, 18, 16, 16, 16,
12, 13, 2, 3, 2, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16])}
# Show every token of the first training sample next to its label id and tag.
first_sample = training_set[0]
sample_tokens = tokenizer.convert_ids_to_tokens(first_sample["ids"])
for token, label in zip(sample_tokens, first_sample["targets"]):
    tag = ids_to_labels[label.item()]
    print('{0:10} {1} {2}'.format(token, label, tag))
[CLS] 16 O
杰 27 B-1
得 28 I-1
利 28 I-1
定 19 B-29
制 20 I-29
笔 2 B-4
记 3 I-4
本 3 I-4
礼 2 B-4
盒 3 I-4
套 12 B-13
装 13 I-13
高 4 B-14
档 5 I-14
礼 2 B-4
品 3 I-4
办 6 B-5
公 7 I-5
记 2 B-4
事 3 I-4
本 3 I-4
商 4 B-14
务 5 I-14
公 8 B-7
司 9 I-7
送 6 B-5
礼 7 I-5
教 2 B-4
师 3 I-4
节 3 I-4
礼 3 I-4
物 3 I-4
a 25 B-18
5 26 I-18
加 12 B-13
厚 13 I-13
工 6 B-5
作 7 I-5
会 6 B-5
议 7 I-5
笔 2 B-4
记 3 I-4
本 3 I-4
子 3 I-4
定 19 B-29
做 20 I-29
a 25 B-18
5 26 I-18
红 17 B-16
色 18 I-16
【 16 O
升 16 O
级 16 O
翻 12 B-13
盖 13 I-13
u 2 B-4
盘 3 I-4
礼 2 B-4
盒 3 I-4
】 16 O
[SEP] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
[PAD] 16 O
创建Pytorch的DataLoader
# DataLoader settings; both loaders shuffle and load in the main process.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
模型结构:BertForTokenClassification
预训练权重:“hfl/chinese-roberta-wwm-ext”(即上文的 MODEL_NAME)
len(labels_to_ids)
81
# Load the pretrained checkpoint as a token-classification model with one
# output class per tag, then move it to the selected device.
model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(labels_to_ids))
model.to(device)
ids.shape
torch.Size([1, 91])
# Sanity check: run the first training sample through the model once.
first_sample = training_set[0]
# unsqueeze(0) adds the batch dimension; `targets` holds the gold labels.
ids, mask, targets = (
    first_sample[key].unsqueeze(0).to(device) for key in ("ids", "mask", "targets")
)
# With labels supplied, the output holds the loss first and the logits second.
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss
tensor(4.5096, device='cuda:0', grad_fn=)
模型输出logits大小为 (batch_size, sequence_length, num_labels):
tr_logits = outputs[1]
tr_logits.shape
torch.Size([1, 91, 81])
设置优化器Adam
# Adam over all model parameters, using the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
# Training function
def train(epoch):
    """Run one training epoch over ``training_loader``.

    Uses the module-level ``model``, ``optimizer``, ``device`` and
    ``MAX_GRAD_NORM``. Prints the running mean loss every 50 steps and, at the
    end, the mean loss and the mean per-batch token accuracy (computed on
    non-padding positions only).

    Args:
        epoch: Index of the current epoch; not used inside the function.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # Put the model in training mode.
    model.train()

    for idx, batch in enumerate(training_loader):
        ids = batch['ids'].to(device, dtype=torch.long)          # (batch_size, seq_len)
        mask = batch['mask'].to(device, dtype=torch.long)        # (batch_size, seq_len)
        targets = batch['targets'].to(device, dtype=torch.long)  # (batch_size, seq_len)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs[0], outputs[1]
        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 50 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 50 training steps: {loss_step}")

        # Accuracy of this batch.
        flattened_targets = targets.view(-1)                          # (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels)          # (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)   # best label index per token
        # Only positions with attention mask 1 (i.e. not [PAD]) are scored.
        active_accuracy = mask.view(-1) == 1
        step_targets = torch.masked_select(flattened_targets, active_accuracy)
        step_predictions = torch.masked_select(flattened_predictions, active_accuracy)
        tr_accuracy += accuracy_score(step_targets.cpu().numpy(), step_predictions.cpu().numpy())

        # Backward pass. Clipping must come after backward(), on the gradients
        # of this step; clipping before zero_grad() would only touch the
        # previous step's gradients, which are then discarded.
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    epoch_loss = tr_loss / max(nb_tr_steps, 1)
    tr_accuracy = tr_accuracy / max(nb_tr_steps, 1)
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")
训练模型
# Train for EPOCHS full passes over the training set.
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)
Training epoch: 1
Training loss per 50 training steps: 4.573911666870117
Training loss per 50 training steps: 3.5836149757983637
Training loss per 50 training steps: 3.146424697177245
Training loss epoch: 3.146424697177245
Training accuracy epoch: 0.28337175397646347
Training epoch: 2
Training loss per 50 training steps: 2.3866159915924072
Training loss per 50 training steps: 2.211251039131015
Training loss per 50 training steps: 2.0536219070453456
Training loss epoch: 2.0536219070453456
Training accuracy epoch: 0.49648706430276834
Training epoch: 3
Training loss per 50 training steps: 1.8235304355621338
Training loss per 50 training steps: 1.6210375042522656
Training loss per 50 training steps: 1.5436867876808242
Training loss epoch: 1.5436867876808242
Training accuracy epoch: 0.6369489455144468
Training epoch: 4
Training loss per 50 training steps: 1.3719302415847778
Training loss per 50 training steps: 1.254675311200759
Training loss per 50 training steps: 1.2525309105910878
Training loss epoch: 1.2525309105910878
Training accuracy epoch: 0.7013529778539404
Training epoch: 5
Training loss per 50 training steps: 1.2091379165649414
Training loss per 50 training steps: 1.0707006524590885
Training loss per 50 training steps: 1.0643499292949639
Training loss epoch: 1.0643499292949639
Training accuracy epoch: 0.7417508051186237
验证集评估
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` without updating it.

    Prints the running mean loss every 100 steps and, at the end, the mean
    loss and the mean per-batch token accuracy on non-padding positions.

    Args:
        model: Token-classification model returning (loss, logits) when
            called with ``labels``.
        testing_loader: Iterable of batches with 'ids', 'mask' and 'targets'.

    Returns:
        ``(labels, predictions)``: two flat lists of tag strings covering all
        non-padding tokens of all batches, in iteration order.
    """
    # Put the model in evaluation mode.
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs[0], outputs[1]
            eval_loss += loss.item()
            nb_eval_steps += 1

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Accuracy of this batch, on positions with attention mask 1 only.
            flattened_targets = targets.view(-1)                          # (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels)        # (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)   # (batch_size * seq_len,)
            active_accuracy = mask.view(-1) == 1
            step_targets = torch.masked_select(flattened_targets, active_accuracy)
            step_predictions = torch.masked_select(flattened_predictions, active_accuracy)

            # Store plain Python ints: one transfer per batch instead of one
            # .item() call per token later on.
            eval_labels.extend(step_targets.tolist())
            eval_preds.extend(step_predictions.tolist())

            eval_accuracy += accuracy_score(step_targets.cpu().numpy(), step_predictions.cpu().numpy())

    labels = [ids_to_labels[label_id] for label_id in eval_labels]
    predictions = [ids_to_labels[label_id] for label_id in eval_preds]

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    eval_loss = eval_loss / max(nb_eval_steps, 1)
    eval_accuracy = eval_accuracy / max(nb_eval_steps, 1)
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions
labels, predictions = valid(model, testing_loader)
Validation loss per 100 evaluation steps: 0.8113014698028564
Validation Loss: 1.1529839837551117
Validation Accuracy: 0.7087672360508763
# len(predictions),len(labels)
# Flatten the per-sentence tag strings into one list and count each tag.
tmp = [tag for tags in data['word_labels'] for tag in tags.split(',')]
pd.Series(tmp).value_counts()
I-4 3856
O 2970
B-4 2061
I-18 1777
I-38 1487
...
I-48 1
I-23 1
B-23 1
B-52 1
B-46 1
Length: 81, dtype: int64
ids_to_labels[18]
'I-16'
from seqeval.metrics import classification_report
# The flat label lists are wrapped in one outer list; passing them bare raises
# "TypeError: Found input variables without list of list."
print(classification_report([labels], [predictions]))
precision recall f1-score support
1 0.65 0.72 0.69 68
10 0.00 0.00 0.00 24
11 0.67 0.71 0.69 145
12 0.38 0.38 0.38 21
13 0.41 0.58 0.48 137
14 0.57 0.90 0.70 51
15 0.00 0.00 0.00 5
16 0.68 0.72 0.70 78
18 0.48 0.52 0.50 157
19 0.00 0.00 0.00 1
2 0.00 0.00 0.00 4
21 0.00 0.00 0.00 1
22 0.00 0.00 0.00 13
29 0.00 0.00 0.00 13
3 0.13 0.20 0.16 25
30 0.00 0.00 0.00 2
34 0.00 0.00 0.00 1
36 0.00 0.00 0.00 2
37 0.34 0.56 0.42 34
38 0.28 0.40 0.33 82
39 0.00 0.00 0.00 10
4 0.68 0.79 0.73 417
40 0.51 0.56 0.54 108
46 0.00 0.00 0.00 1
47 0.00 0.00 0.00 2
5 0.49 0.68 0.57 81
50 0.00 0.00 0.00 2
54 0.50 0.57 0.53 14
6 0.00 0.00 0.00 10
7 0.69 0.90 0.78 59
8 0.69 0.83 0.76 41
9 0.20 0.04 0.06 27
micro avg 0.54 0.62 0.58 1636
macro avg 0.26 0.31 0.28 1636
weighted avg 0.53 0.62 0.57 1636
F:\ProgramData\Anaconda3\lib\site-packages\seqeval\metrics\v1.py:57: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
''.join(data.iloc[0]['sentence'].split())
'手机三脚架网红直播支架桌面自拍杆蓝牙遥控三脚架摄影拍摄拍照抖音看电视神器三角架便携伸缩懒人户外支撑架【女神粉】自带三脚架+蓝牙遥控'
# Predict the tags of one raw sentence with the trained model.
sentence = "手机三脚架网红直播支架桌面自拍杆蓝牙遥控三脚架摄影拍摄拍照抖音看电视神器三角架便携伸缩懒人户外支撑架【女神粉】自带三脚架+蓝牙遥控"
inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# Move the inputs to the model's device.
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)

# Forward pass: evaluation mode and no autograd bookkeeping for inference.
model.eval()
with torch.no_grad():
    outputs = model(ids, attention_mask=mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels)  # (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1)  # (batch_size * seq_len,)

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions))  # (wordpiece, prediction) pairs

# Keep one prediction per word: skip special tokens and WordPiece
# continuation pieces. Continuation pieces start with "##" (no leading
# space), so the prefix test must be "##" to ever match.
word_level_predictions = []
for pair in wp_preds:
    if pair[0].startswith("##") or pair[0] in ['[CLS]', '[SEP]', '[PAD]']:
        continue
    word_level_predictions.append(pair[1])

# Rebuild the text, gluing continuation pieces back onto their word.
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)
手 机 三 脚 架 网 红 直 播 支 架 桌 面 自 拍 杆 蓝 牙 遥 控 三 脚 架 摄 影 拍 摄 拍 照 抖 音 看 电 视 神 器 三 角 架 便 携 伸 缩 懒 人 户 外 支 撑 架 【 女 神 粉 】 自 带 三 脚 架 + 蓝 牙 遥 控
['B-40', 'I-40', 'B-4', 'I-4', 'I-4', 'B-14', 'I-8', 'B-5', 'I-5', 'B-4', 'I-4', 'B-7', 'I-7', 'B-4', 'I-4', 'I-4', 'B-11', 'I-11', 'B-11', 'I-11', 'B-4', 'I-4', 'I-4', 'B-5', 'I-5', 'B-5', 'I-5', 'B-5', 'I-5', 'B-5', 'I-5', 'B-5', 'I-5', 'I-5', 'O', 'O', 'B-4', 'I-4', 'I-4', 'B-11', 'I-11', 'B-11', 'I-11', 'B-8', 'I-8', 'B-7', 'I-7', 'B-4', 'I-4', 'I-4', 'O', 'B-8', 'I-8', 'O', 'O', 'B-13', 'I-11', 'B-4', 'I-4', 'I-4', 'O', 'B-11', 'I-11', 'B-11', 'O']
保存模型词汇表、模型权重和配置文件,之后可以用 from_pretrained() 重新加载
import os

directory = "./model"
# exist_ok=True creates the directory only when needed and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(directory, exist_ok=True)

# Save the tokenizer vocabulary.
tokenizer.save_vocabulary(directory)
# Save the model weights and the configuration file.
model.save_pretrained(directory)
print('All files saved')
print('This tutorial is completed')
All files saved
This tutorial is completed
def prepare_sentence(sentence, tokenizer, maxlen):
    """Turn a raw sentence into fixed-length model inputs.

    The sentence is tokenized, cut to at most ``maxlen - 2`` tokens so that
    ``[CLS]`` and ``[SEP]`` always fit (plain truncation to ``maxlen`` would
    cut off the closing ``[SEP]``), wrapped in the special tokens and padded
    with ``[PAD]`` up to ``maxlen``.

    Args:
        sentence: Raw input text.
        tokenizer: Tokenizer providing ``tokenize`` and ``convert_tokens_to_ids``.
        maxlen: Output length including both special tokens; expected >= 2.

    Returns:
        Dict with two ``torch.long`` tensors of length ``maxlen``:
        ``'ids'`` (token ids) and ``'mask'`` (1 for real tokens, 0 for padding).
    """
    # Step 1: tokenize the sentence, leaving room for the two special tokens.
    body = tokenizer.tokenize(sentence)[:max(maxlen - 2, 0)]

    # Step 2: add the special tokens.
    tokenized_sentence = ["[CLS]"] + body + ["[SEP]"]

    # Step 3: pad up to maxlen (a non-positive count adds nothing).
    tokenized_sentence = tokenized_sentence + ["[PAD]"] * (maxlen - len(tokenized_sentence))

    # Step 4: attention mask, 0 on padding and 1 everywhere else.
    attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

    # Step 5: convert tokens to input ids.
    ids = tokenizer.convert_tokens_to_ids(tokenized_sentence)

    return {
        'ids': torch.tensor(ids, dtype=torch.long),
        'mask': torch.tensor(attn_mask, dtype=torch.long),
        #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
    }
# Bert:
- Bert CRF
- Bert BiLSTM+CRF
- Lex-Bert
- FLat-NER:FLAT: Chinese NER Using Flat-Lattice Transformer
- Unified Named Entity Recognition as Word-Word Relation Classification
https://github.com/ljynlp/W2NER
# 数据
- 数据增强:https://github.com/425776024/nlpcda
- 语义增强:embedding 拼音 偏旁
- 伪标签学习