### 1. Raw data format
海 O
钓 O
比 O
赛 O
地 O
点 O
在 O
厦 B-LOC
门 I-LOC
与 O
金 B-LOC
门 I-LOC
之 O
间 O
的 O
海 O
域 O
。 O
日 B-LOC
俄 B-LOC
两 O
国 O
国 O
内 O
政 O
局 O
都 O
充 O
满 O
变 O
数 O
, O
尽 O
管 O
日 B-LOC
俄 B-LOC
关 O
系 O
目 O
前 O
是 O
历 O
史 O
最 O
佳 O
时 O
期 O
, O
但 O
其 O
脆 O
弱 O
性 O
不 O
言 O
自 O
明 O
。 O
Read the text and the corresponding labels from the data into the parallel-list format shown below (one list of characters and one list of tags):
['海', '钓', '比', '赛', '地', '点', '在', '厦', '门', '与', '金', '门', '之', '间', '的', '海', '域', '。']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O']
First, build a dictionary (vocabulary) of the labels:
VOCAB = ("[PAD]", "O", "B-TIM", "I-TIM", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "[CLS]", "[SEP]")
Here [PAD] is a padding token; you can name it however you like (calling it X would work just as well). The dictionary corresponding to VOCAB is:
{0: '[PAD]', 1: 'O', 2: 'B-TIM', 3: 'I-TIM', 4: 'B-PER', 5: 'I-PER', 6: 'B-ORG', 7: 'I-ORG', 8: 'B-LOC', 9: 'I-LOC', 10: '[CLS]', 11: '[SEP]'}
Then, using this dictionary, process the data into the form BERT expects:
input_ids: the text converted into token IDs from BERT's own vocabulary
input_mask: the attention mask; real tokens (including the [CLS] and [SEP] markers) are marked with 1, padding positions with 0
segment_ids: token type IDs that distinguish sentence A (0) from sentence B (1); for a single sentence they are all 0
label_ids: the position of each entity tag in the label dictionary above
'input_ids': [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102]
'input_mask':[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
'segment_ids':[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
'label_ids':[10, 1, 1, 1, 1, 1, 1, 1, 8, 9, 1, 8, 9, 1, 1, 1, 1, 1, 1, 11]
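These values can be reproduced directly with the Hugging Face tokenizer. A minimal sketch (assuming the bert-base-chinese weights live under ./bert-base-chinese, as in the full code further below):

from transformers import BertTokenizer

VOCAB = ("[PAD]", "O", "B-TIM", "I-TIM", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "[CLS]", "[SEP]")
tag2idx = {tag: idx for idx, tag in enumerate(VOCAB)}   # the dictionary shown above

tokenizer = BertTokenizer.from_pretrained('./bert-base-chinese')

chars = ['海', '钓', '比', '赛', '地', '点', '在', '厦', '门', '与', '金', '门', '之', '间', '的', '海', '域', '。']
tags  = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O']

# A list of characters is treated as an already-tokenized sequence, so
# encode_plus looks each character up in the vocabulary and adds [CLS]/[SEP].
enc = tokenizer.encode_plus(chars)
input_ids   = enc['input_ids']        # [101, 3862, 7157, ..., 511, 102]
input_mask  = enc['attention_mask']   # all 1s: no padding yet
segment_ids = enc['token_type_ids']   # all 0s: a single sentence
# The labels get [CLS]/[SEP] as well, then are mapped through tag2idx.
label_ids   = [tag2idx[t] for t in ['[CLS]'] + tags + ['[SEP]']]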
Because all sequences fed into BERT within a batch must have the same length (and may not exceed 512 tokens), the shorter ones have to be padded:
input_ids: append 0s at the end of the list; the sentence itself is padded with [PAD], and [PAD] sits at position 0 in BERT's vocabulary
input_mask: pad with 0
segment_ids: pad with 0
label_ids: pad with 0, because in my own tag dictionary the padding tag [PAD] also sits at position 0 (this is configurable)
I set the maximum length to 100; after padding everything to the same length, the result is:
input_ids=[101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
input_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
segment_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
label_ids=[10, 1, 1, 1, 1, 1, 1, 1, 8, 9, 1, 8, 9, 1, 1, 1, 1, 1, 1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
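The padding rule itself is just "append zeros until the sequence reaches the maximum length". A minimal sketch of that step (pad_to is only an illustrative name; the post's own version is the lambda inside convert_to_example below):

def pad_to(seq, max_len):
    # 0 is at once the [PAD] token id, the mask value, the segment id and the [PAD] tag id
    return seq + [0] * (max_len - len(seq))

input_ids   = pad_to(input_ids, 100)
input_mask  = pad_to(input_mask, 100)
segment_ids = pad_to(segment_ids, 100)
label_ids   = pad_to(label_ids, 100)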
To make each step easier to follow, I wrapped every stage in its own function; the code is below.
from transformers import BertTokenizer
bert_model='./bert-base-chinese'
tokenizer=BertTokenizer.from_pretrained(bert_model)
# custom label dictionary
VOCAB = (
"[PAD]",
"O",
'B-TIM',
'I-TIM',
"B-PER",
"I-PER",
"B-ORG",
"I-ORG",
"B-LOC",
"I-LOC",
"[CLS]",
"[SEP]"
)
# tag -> index and index -> tag mappings
tag2idx = {tag: idx for idx, tag in enumerate(VOCAB)}
idx2tag = {idx: tag for idx, tag in enumerate(VOCAB)}
MAX_LEN=510
# container classes that hold the processed data
class Input_futures(object):
    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids


class Input_example(object):
    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().strip().split('\n\n')
    sents, tags_li = [], []  # list of lists
    for i, entry in enumerate(lines):
        words = [line.split()[0] for line in entry.splitlines()]
        tags = [line.split()[-1] for line in entry.splitlines()]
        sents.append(words[:MAX_LEN])
        tags_li.append(tags[:MAX_LEN])
    return zip(sents, tags_li)
def convert_to_feature(entitys):
    input_ids = []
    input_mask = []
    segment_ids = []
    label_ids = []
    for entity in entitys:
        # entity[0] is the list of characters, entity[1] the list of tags
        input = tokenizer.encode_plus(entity[0])
        # add [CLS] at the front of the labels and [SEP] at the end
        label = list(entity[1])
        label.insert(0, '[CLS]')
        label.append('[SEP]')
        label_id = [tag2idx[each] for each in label]
        input_ids.append(input['input_ids'])
        input_mask.append(input['attention_mask'])
        segment_ids.append(input['token_type_ids'])
        label_ids.append(label_id)
    feature = Input_futures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids
    )
    return feature
def convert_to_example(feature, max_len):
    # pad every sequence with 0 up to max_len
    f = lambda feature, max_len: [sample + [0] * (max_len - len(sample)) for sample in feature]
    input_ids = f(feature.input_ids, max_len)
    input_mask = f(feature.input_mask, max_len)
    segment_ids = f(feature.segment_ids, max_len)
    label_ids = f(feature.label_ids, max_len)
    example = Input_example(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids
    )
    return example
if __name__ == '__main__':
    print('程序开始'.center(50, '#'))
    # 1. read the data
    path = './data/train.txt'
    entitys = load_data(path)
    # 2. convert the data into BERT's input format
    feature = convert_to_feature(entitys)
    # 3. pad the data
    example = convert_to_example(feature, 100)
    print('程序结束'.center(50, '#'))
Because writing this code from scratch every time is tedious, I wrapped it up as a Dataset class. As long as the data follows the format below, it can be dropped into a project and used directly, which saves rewriting the same code.
['海', '钓', '比', '赛', '地', '点', '在', '厦', '门', '与', '金', '门', '之', '间', '的', '海', '域', '。']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O']
Reading and packing the data:
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().strip().split('\n\n')
    sents, tags_li = [], []  # list of lists
    for i, entry in enumerate(lines):
        words = [line.split()[0] for line in entry.splitlines()]
        tags = [line.split()[-1] for line in entry.splitlines()]
        sents.append(words[:MAX_LEN])
        tags_li.append(tags[:MAX_LEN])
    return zip(sents, tags_li)
Note: the data here is packed into a zip, so each sentence and its tags come out together as one (sent, tags) pair. Since a zip iterator itself cannot be pickled, convert it to a list before saving.
Next, create two utility functions: one that saves data in pkl format and one that loads it back:
import pickle as pkl
import codecs

def save_pkl(path, obj):
    print(f'save in {path}')
    with codecs.open(path, 'wb') as f:
        pkl.dump(obj, f)

def load_pkl(path):
    print(f'load in {path}')
    with codecs.open(path, 'rb') as f:
        data = pkl.load(f)
    return data
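The pickle file that the Dataset below expects (./data/train.pkl) can then be produced from the raw training file. A minimal sketch, reusing load_data and save_pkl from above:

# Materialize the zip as a list (a zip iterator itself cannot be pickled), then save it.
train_entities = list(load_data('./data/train.txt'))
save_pkl('./data/train.pkl', train_entities)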
Once these two steps are done and the data is saved in the format above, the packaged code below can be dropped straight into a project and used as-is, which saves a lot of work.
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
from utils.util import load_pkl

bert_model = './bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(bert_model)


class NerDataset(Dataset):
    def __init__(self, file_path, tag2idx):
        self.tag2idx = tag2idx
        self.entities = list(load_pkl(file_path))

    def __getitem__(self, item):
        entity = self.entities[item]
        feature = {}
        input = tokenizer.encode_plus(entity[0])
        # add [CLS] at the front of the labels and [SEP] at the end
        # (copy the list first so the stored sample is not modified in place)
        label = list(entity[1])
        label.insert(0, '[CLS]')
        label.append('[SEP]')
        label_id = [self.tag2idx[each] for each in label]
        feature['input_ids'] = input['input_ids']
        feature['attention_mask'] = input['attention_mask']
        feature['token_type_ids'] = input['token_type_ids']
        feature['label_ids'] = label_id
        feature['len'] = len(input['input_ids'])
        return feature

    def __len__(self):
        return len(self.entities)


def collate_fn(batch):
    all_input_ids = []
    all_attention_mask = []
    all_token_type_ids = []
    all_labels = []
    # compute the maximum length within this batch
    lens = [data['len'] for data in batch]
    max_len = max(lens)
    # pad every sequence with 0 up to max_len
    f = lambda feature, max_len: feature + [0] * (max_len - len(feature))
    for feature in batch:
        input_ids = f(feature['input_ids'], max_len)
        attention_mask = f(feature['attention_mask'], max_len)
        token_type_ids = f(feature['token_type_ids'], max_len)
        label_ids = f(feature['label_ids'], max_len)
        all_input_ids.append(input_ids)
        all_attention_mask.append(attention_mask)
        all_token_type_ids.append(token_type_ids)
        all_labels.append(label_ids)
    # finally convert everything to tensors
    return torch.tensor(all_input_ids), torch.tensor(all_token_type_ids), torch.tensor(all_attention_mask), torch.tensor(all_labels)
from torch.utils.data import DataLoader
from NER.NerDataSet import NerDataset, collate_fn
VOCAB = (
"[PAD]",
"O",
'B-TIM',
'I-TIM',
"B-PER",
"I-PER",
"B-ORG",
"I-ORG",
"B-LOC",
"I-LOC",
"[CLS]",
"[SEP]"
)
# tag -> index and index -> tag mappings
tag2idx = {tag: idx for idx, tag in enumerate(VOCAB)}
idx2tag = {idx: tag for idx, tag in enumerate(VOCAB)}
MAX_LEN=510
if __name__ == "__main__":
    print("程序开始".center(40, '#'))
    save_path = './data/train.pkl'
    train_data = NerDataset(file_path=save_path, tag2idx=tag2idx)
    train_iter = DataLoader(dataset=train_data,
                            batch_size=4,
                            shuffle=True,
                            collate_fn=collate_fn)
    for i, batch in enumerate(train_iter):
        print(f'第{i}个batch')
        input_ids, token_type_ids, attention_mask, labels_ids = batch
        print(input_ids)
        print(token_type_ids)
        print(attention_mask)
        print(labels_ids)
第0个batch
tensor([[ 101, 1046, 7716, 2209, 4638, 1957, 1036, 6375, 2025, 791, 2399, 6438,
758, 2399, 5277, 8024, 1961, 2792, 1762, 4638, 4408, 677, 3300, 124,
121, 1914, 1399, 1398, 2110, 8024, 6421, 4408, 4638, 100, 2157, 1999,
833, 100, 4507, 122, 121, 1399, 2157, 7270, 5299, 2768, 511, 102,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 101, 3189, 915, 697, 1744, 1744, 1079, 3124, 2229, 6963, 1041, 4007,
1359, 3144, 8024, 2226, 5052, 3189, 915, 1068, 5143, 4680, 1184, 3221,
1325, 1380, 3297, 881, 3198, 3309, 8024, 852, 1071, 5546, 2483, 2595,
679, 6241, 5632, 3209, 511, 102, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 101, 852, 868, 711, 671, 702, 1066, 772, 1054, 1447, 510, 782,
3696, 1062, 789, 8024, 2418, 2496, 5541, 2577, 2160, 7333, 8024, 4696,
3633, 976, 1168, 100, 1044, 1921, 678, 722, 2569, 5445, 2569, 8024,
1400, 1921, 678, 722, 727, 5445, 727, 100, 8024, 3909, 1265, 702,
782, 4638, 1399, 1164, 2533, 1927, 1469, 2143, 6802, 2650, 1599, 8024,
2828, 3121, 7484, 1920, 689, 3030, 1762, 7674, 855, 8024, 6821, 3416,
2798, 5543, 6631, 6632, 5632, 2769, 8024, 3030, 5564, 686, 921, 8024,
3300, 2792, 868, 711, 511, 102],
[ 101, 3763, 4294, 7339, 3136, 5298, 877, 7440, 2861, 8038, 697, 3118,
7339, 6963, 2682, 5526, 8024, 1728, 3634, 6963, 868, 1139, 749, 3297,
1920, 4638, 1222, 1213, 511, 102, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[10, 4, 5, 5, 1, 1, 1, 4, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 7,
7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 11, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[10, 8, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8,
8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 11],
[10, 6, 7, 7, 1, 1, 4, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 11, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
第1个batch
tensor([[ 101, 6821, 2429, 898, 2255, 988, 3717, 4638, 1300, 4289, 7667, 4507,
1744, 1079, 671, 3837, 4638, 6392, 6369, 2360, 712, 2898, 6392, 6369,
8024, 3146, 702, 2456, 5029, 5408, 5125, 5401, 5445, 2612, 2131, 511,
102],
[ 101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032,
7305, 722, 7313, 4638, 3862, 1818, 511, 102, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0],
[ 101, 1762, 1355, 6809, 1744, 2157, 8024, 2593, 3131, 924, 7372, 1282,
1146, 3249, 1350, 8024, 2347, 2768, 711, 4852, 833, 924, 7397, 860,
5143, 4638, 7028, 6206, 5299, 2768, 6956, 1146, 511, 102, 0, 0,
0],
[ 101, 1346, 1217, 3635, 6121, 4638, 3300, 4511, 3300, 1957, 8024, 3300,
2399, 6768, 782, 8024, 738, 3300, 704, 2399, 782, 511, 102, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
11],
[10, 1, 1, 1, 1, 1, 1, 1, 8, 9, 1, 8, 9, 1, 1, 1, 1, 1,
1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0],
[10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 11, 0, 0,
0],
[10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0]])
In named-entity recognition every character gets its own label, so when we pad the inputs we must pad the labels as well. The BERT input format itself is fixed, so the same preprocessing applies to other tasks; the only thing that changes is the ground-truth output.
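To show how these tensors would actually be consumed (this is my own illustration, not part of the post's pipeline), here is a minimal sketch that feeds one batch into Hugging Face's BertForTokenClassification, using the attention mask to keep padded positions out of the loss. It assumes the train_iter, VOCAB, and ./bert-base-chinese path defined above:

import torch.nn as nn
from transformers import BertForTokenClassification

model = BertForTokenClassification.from_pretrained('./bert-base-chinese', num_labels=len(VOCAB))
loss_fct = nn.CrossEntropyLoss()

for input_ids, token_type_ids, attention_mask, label_ids in train_iter:
    out = model(input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask)
    logits = out.logits                       # (batch_size, seq_len, num_labels)
    active = attention_mask.view(-1) == 1     # drop the padded positions
    loss = loss_fct(logits.view(-1, len(VOCAB))[active],
                    label_ids.view(-1)[active])
    print(loss)
    break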
More to come in a later update.