def word2id_func(raw_dataset):
    # returns a dictionary of words and their ids
    # print('raw_dataset:', raw_dataset)
    words = []
    for entry in raw_dataset:
        words.extend(entry['utterance'].split())
    words = list(set(words))
    words_dict = {'[PAD]': PAD_TOKEN}
    words_dict.update({w: i + 1 for i, w in enumerate(words)})
    words_dict['[UNK]'] = len(words_dict)
    print('words_dict:', words_dict)
    return words_dict
The training, validation and test data are all passed in together, and the function returns a dictionary whose keys are words and whose values are the corresponding indices. The first entry is '[PAD]': 0 and the last is '[UNK]': 12135.

words_dict.update({w:i+1 for i, w in enumerate(words)})

This line uses a dictionary comprehension to build a mapping whose keys are the elements of the list words and whose values are each element's position in the list plus one, so that index 0 stays reserved for '[PAD]'.
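A tiny illustration of the resulting mapping (the toy data below is made up, and PAD_TOKEN is assumed to be 0; the exact ids of the middle entries depend on set() ordering):

toy_data = [{'utterance': 'book a flight'}, {'utterance': 'book a hotel'}]
w2id = word2id_func(toy_data)
# One possible result (order of the middle entries is not deterministic):
# {'[PAD]': 0, 'flight': 1, 'book': 2, 'a': 3, 'hotel': 4, '[UNK]': 5}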
Building the slot index
def slot2id_func(raw_dataset):
    # returns a dictionary of slots and their ids
    slots = []
    for entry in raw_dataset:
        slots.extend(entry['slots'].split())
    slots = list(set(slots))
    # '[PAD]' must map to PAD_TOKEN (0) so that padded positions in the slot
    # sequences line up with the padding value used later in collate_fn
    slots_dict = {'[PAD]': PAD_TOKEN}
    slots_dict.update({s: i + 1 for i, s in enumerate(slots)})
    print('slots_dict:', slots_dict)
    return slots_dict
'[PAD]' is pinned to index 0 (PAD_TOKEN) so that it matches the padding value used in collate_fn; set() removes duplicate slot labels collected from the dataset, and enumerate() assigns the remaining labels consecutive ids starting from 1.

Building the intent label index
def intent2id_func(raw_dataset):
    # returns a dictionary of intents and their ids
    intents = [entry['intent'] for entry in raw_dataset]
    intents = list(set(intents))
    intents_dict = {inte: i for i, inte in enumerate(intents)}
    # print('intents_dict:', intents_dict)
    return intents_dict
Building the vocabulary
def vocab_func(raw_dataset):
    vocab = set()
    for entry in raw_dataset:
        vocab = vocab.union(set(entry['utterance'].split()))
    print('list(vocab):', list(vocab))
    return ['[PAD]'] + list(vocab) + ['[UNK]']
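The lang object used by the Dataset class below is not defined in this section. A minimal sketch of a hypothetical container that bundles the mappings above together with the inverse maps used during evaluation (the actual class may differ):

class Lang:
    # Hypothetical container: bundles the mappings produced by the helper
    # functions above plus the inverse maps used during evaluation.
    # raw_dataset entries are expected to have 'utterance', 'slots' and 'intent' keys.
    def __init__(self, raw_dataset):
        self.word2id = word2id_func(raw_dataset)
        self.slot2id = slot2id_func(raw_dataset)
        self.intent2id = intent2id_func(raw_dataset)
        # Deriving vocab from word2id keeps lang.vocab[i] consistent with the word ids
        # (vocab_func builds its own set, whose ordering may not match word2id).
        self.vocab = sorted(self.word2id, key=self.word2id.get)
        self.id2word = {v: k for k, v in self.word2id.items()}
        self.id2slot = {v: k for k, v in self.slot2id.items()}
        self.id2intent = {v: k for k, v in self.intent2id.items()}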
class IntentsAndSlots(data.Dataset):
    # Mandatory methods are __init__, __len__ and __getitem__
    def __init__(self, dataset, lang, unk='[UNK]'):
        self.utterances = []
        self.intents = []
        self.slots = []
        self.unk = unk
        for x in dataset:
            self.utterances.append(x['utterance'])
            self.slots.append(x['slots'])
            self.intents.append(x['intent'])
        self.utt_ids = self.mapping_seq(self.utterances, lang.word2id)
        self.slot_ids = self.mapping_seq(self.slots, lang.slot2id)
        self.intent_ids = self.mapping_lab(self.intents, lang.intent2id)

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, idx):
        utt = torch.Tensor(self.utt_ids[idx])
        slots = torch.Tensor(self.slot_ids[idx])
        intent = self.intent_ids[idx]
        sample = {'utterance': utt, 'slots': slots, 'intent': intent}
        return sample

    # Auxiliary methods
    def mapping_lab(self, data, mapper):
        return [mapper[x] if x in mapper else mapper[self.unk] for x in data]

    def mapping_seq(self, data, mapper):  # Map sequences to numbers
        res = []
        for seq in data:
            tmp_seq = []
            for x in seq.split():
                if x in mapper:
                    tmp_seq.append(mapper[x])
                else:
                    tmp_seq.append(mapper[self.unk])
            res.append(tmp_seq)
        return res
__init__ calls mapping_seq to convert the utterances and slots into id sequences and mapping_lab to convert the intents into ids; the results are stored in utt_ids, slot_ids and intent_ids.

The __getitem__ method takes an index idx, looks up the corresponding entries of utt_ids, slot_ids and intent_ids, converts the two sequences with torch.Tensor, and returns them as key-value pairs in a dictionary sample.

mapping_lab maps the given list data through the dictionary mapper. It uses a list comprehension to iterate over the elements x of data: if x is present in mapper it is mapped to its value, otherwise it is mapped to the value of the unknown token unk.
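A quick usage sketch (assuming train_raw and a lang object, as constructed later in this section, are available):

train_dataset = IntentsAndSlots(train_raw, lang)
sample = train_dataset[0]
print(sample['utterance'])  # 1-D float tensor of word ids
print(sample['slots'])      # 1-D float tensor of slot ids, same length as the utterance
print(sample['intent'])     # a single intent id (int)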
Data loader
def collate_fn(data):
    def merge(sequences):
        '''
        merge from batch * sent_len to batch * max_len
        '''
        lengths = [len(seq) for seq in sequences]
        max_len = 1 if max(lengths) == 0 else max(lengths)
        # Pad token is zero in our case, so we create a matrix full of
        # PAD_TOKEN (i.e. 0) with shape batch_size x maximum sequence length
        padded_seqs = torch.LongTensor(len(sequences), max_len).fill_(PAD_TOKEN)
        for i, seq in enumerate(sequences):
            end = lengths[i]
            padded_seqs[i, :end] = seq  # We copy each sequence into the matrix
        # print(padded_seqs)
        padded_seqs = padded_seqs.detach()  # We remove these tensors from the computational graph
        return padded_seqs, lengths

    # Sort data by sequence length (longest first)
    data.sort(key=lambda x: len(x['utterance']), reverse=True)
    new_item = {}
    for key in data[0].keys():
        new_item[key] = [d[key] for d in data]

    # We just need one length for packed pad seq, since len(utt) == len(slots)
    src_utt, _ = merge(new_item['utterance'])
    y_slots, y_lengths = merge(new_item["slots"])
    intent = torch.LongTensor(new_item["intent"])

    src_utt = src_utt.to(device)  # We load the tensors on our selected device
    y_slots = y_slots.to(device)
    intent = intent.to(device)
    y_lengths = torch.LongTensor(y_lengths).to(device)

    new_item["utterances"] = src_utt
    new_item["intents"] = intent
    new_item["y_slots"] = y_slots
    new_item["slots_len"] = y_lengths
    return new_item
The collate_fn function pads and merges the samples in a batch so that sequences of different lengths can be batched together during training.
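To see the padding in action, here is a small illustrative batch (the id values are made up and PAD_TOKEN is assumed to be 0):

batch = [
    {'utterance': torch.Tensor([5, 3, 7, 2]), 'slots': torch.Tensor([1, 1, 2, 1]), 'intent': 0},
    {'utterance': torch.Tensor([4, 9]), 'slots': torch.Tensor([1, 3]), 'intent': 2},
]
out = collate_fn(batch)
print(out['utterances'])  # tensor([[5, 3, 7, 2], [4, 9, 0, 0]]) on the selected device; the shorter sequence is padded with PAD_TOKEN
print(out['slots_len'])   # tensor([4, 2])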
hid_size = 200
emb_size = 300
lr = 0.0001  # learning rate
clip = 5  # gradient clipping

train_raw = load_data(os.path.join('data', dataset, 'train.json'))
test_raw = load_data(os.path.join('data', dataset, 'test.json'))
dev_raw = load_data(os.path.join('data', dataset, 'valid.json'))

out_slot = len(lang.slot2id)
out_int = len(lang.intent2id)
vocab_len = len(lang.word2id)
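The lang object referenced above (and used by the Dataset, the model and the evaluation code) is assumed to have been built from the three splits beforehand. A possible sketch, reusing the hypothetical Lang container from earlier, together with the datasets and loaders consumed by the training loop below (batch sizes are assumptions, not taken from the text):

lang = Lang(train_raw + dev_raw + test_raw)  # the helpers above expect all splits together

train_dataset = IntentsAndSlots(train_raw, lang)
dev_dataset = IntentsAndSlots(dev_raw, lang)
test_dataset = IntentsAndSlots(test_raw, lang)

train_loader = data.DataLoader(train_dataset, batch_size=128, collate_fn=collate_fn, shuffle=True)
dev_loader = data.DataLoader(dev_dataset, batch_size=64, collate_fn=collate_fn)
test_loader = data.DataLoader(test_dataset, batch_size=64, collate_fn=collate_fn)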
class JERNIE(nn.Module):
    def __init__(self, out_int, out_slot):
        super(JERNIE, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-base-en")
        self.ERNIE = AutoModel.from_pretrained("nghuyong/ernie-2.0-base-en")
        self.ERNIE.to(device)
        # Two linear heads: one for intent classification, one for slot filling
        self.intent_classifier = nn.Linear(768, out_int)
        self.slot_classifier = nn.Linear(768, out_slot)

    def forward(self, input, lang):
        # Reconstruct the input sentences from the word ids (skipping padding)
        utterance = []
        for element in input:
            utterance.append(' '.join(lang.vocab[i] for i in element if i > 0))
        tokenized = self.tokenizer(utterance, return_tensors='pt', add_special_tokens=True, padding=True).to(device)
        output = self.ERNIE(**tokenized)
        intent = output.pooler_output
        slots = output.last_hidden_state[:, :input.size(1), :]
        intent = self.intent_classifier(intent)
        slots = self.slot_classifier(slots)
        slots = slots.permute(0, 2, 1)
        return intent, slots
slots = slots.permute(0, 2, 1)

This line uses PyTorch's permute function to reorder the tensor dimensions: slots goes from (batch_size, sequence_length, num_slot_classes) to (batch_size, num_slot_classes, sequence_length), which is the layout nn.CrossEntropyLoss expects when computing a token-level loss.
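A quick self-contained shape check of why the permute is needed (illustrative only; nn.CrossEntropyLoss expects class scores on dimension 1 when the targets have shape (batch, seq_len)):

batch, seq_len, num_slots = 2, 6, 10
logits = torch.randn(batch, seq_len, num_slots)          # as produced by self.slot_classifier
targets = torch.randint(0, num_slots, (batch, seq_len))  # one slot label per token
loss = nn.CrossEntropyLoss()(logits.permute(0, 2, 1), targets)  # scores must be (batch, num_slots, seq_len)
print(loss.item())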
from conll import evaluate
from sklearn.metrics import classification_report

def evaluation_loop(data, criterion_slots, criterion_intents, model, lang):
    model.eval()
    loss_array = []
    ref_intents = []
    hyp_intents = []
    ref_slots = []
    hyp_slots = []
    with torch.no_grad():  # Avoids building the computational graph
        for sample in data:
            intents, slots = model(sample['utterances'], lang)
            loss_intent = criterion_intents(intents, sample['intents'])
            loss_slot = criterion_slots(slots, sample['y_slots'])
            loss = loss_intent + loss_slot
            loss_array.append(loss.item())

            # Intent inference: take the most probable class
            out_intents = [lang.id2intent[x]
                           for x in torch.argmax(intents, dim=1).tolist()]
            gt_intents = [lang.id2intent[x] for x in sample['intents'].tolist()]
            ref_intents.extend(gt_intents)
            hyp_intents.extend(out_intents)

            # Slot inference
            output_slots = torch.argmax(slots, dim=1)
            for id_seq, seq in enumerate(output_slots):
                length = sample['slots_len'].tolist()[id_seq]
                utt_ids = sample['utterance'][id_seq][:length].tolist()
                gt_ids = sample['y_slots'][id_seq].tolist()
                gt_slots = [lang.id2slot[elem] for elem in gt_ids[:length]]
                utterance = [lang.id2word[elem] for elem in utt_ids]
                to_decode = seq[:length].tolist()
                ref_slots.append([(utterance[id_el], elem) for id_el, elem in enumerate(gt_slots)])
                tmp_seq = []
                for id_el, elem in enumerate(to_decode):
                    tmp_seq.append((utterance[id_el], lang.id2slot[elem]))
                hyp_slots.append(tmp_seq)
    try:
        results = evaluate(ref_slots, hyp_slots)
    except Exception as ex:
        # Sometimes the model predicts a class that is not in the references
        print(ex)
        ref_s = set([x[1] for x in ref_slots])
        hyp_s = set([x[1] for x in hyp_slots])
        print(hyp_s.difference(ref_s))
        results = {"total": {"f": 0}}  # fall back so the caller still gets a result

    report_intent = classification_report(ref_intents, hyp_intents,
                                           zero_division=False, output_dict=True)
    return results, report_intent, loss_array
The evaluation loop: it runs the model on a given dataset and returns the slot evaluation results, the intent classification report and the losses.
def training_loop(data, optimizer, criterion_slots, criterion_intents, model, lang):
    model.train()
    loss_array = []
    for sample in data:
        optimizer.zero_grad()  # Zeroing the gradients
        intent, slots = model(sample['utterances'], lang)
        loss_intent = criterion_intents(intent, sample['intents'])
        loss_slot = criterion_slots(slots, sample['y_slots'])
        loss = loss_intent + loss_slot  # In joint training we sum the losses. Is there another way to do that?
        loss_array.append(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()  # Update the weights
    return loss_array
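The loop below relies on several objects that are not created in this section (the model, optimizer, the two loss functions and a few bookkeeping variables). A minimal sketch of one possible setup; the concrete values are assumptions, not taken from the text:

import torch.optim as optim

model = JERNIE(out_int, out_slot).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion_slots = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)  # do not compute loss on padded slot positions
criterion_intents = nn.CrossEntropyLoss()

n_epochs = 100   # assumed number of epochs
patience = 3     # early-stopping budget
best_f1 = 0
losses_train, losses_dev, sampled_epochs = [], [], []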
from tqdm import tqdm

for x in tqdm(range(1, n_epochs)):
    loss = training_loop(train_loader, optimizer, criterion_slots,
                         criterion_intents, model, lang)
    if x % 5 == 0:
        sampled_epochs.append(x)
        losses_train.append(np.asarray(loss).mean())
        results_dev, intent_res, loss_dev = evaluation_loop(dev_loader, criterion_slots,
                                                            criterion_intents, model, lang)
        losses_dev.append(np.asarray(loss_dev).mean())
        f1 = results_dev['total']['f']
        if f1 > best_f1:
            best_f1 = f1
        else:
            # Halve the optimizer learning rate
            if patience % 3 == 0:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = param_group['lr'] / 2
            patience -= 1
        if patience <= 0:  # Early stopping with patience
            break  # Not nice but it keeps the code clean

results_test, intent_test, _ = evaluation_loop(test_loader, criterion_slots,
                                               criterion_intents, model, lang)
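The scores reported below are read from the returned structures, the slot F1 from the conll evaluation and the intent accuracy from sklearn's classification report, for example:

print('Slot F1:', results_test['total']['f'])
print('Intent accuracy:', intent_test['accuracy'])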
Results of the complete training runs:
| num | measure | model | score |
|---|---|---|---|
| 1 | Slot F1 | ERNIE | 0.9454038997214484 |
| 1 | Intent Accuracy | ERNIE | 0.8628571428571429 |
| 2 | Slot F1 | ERNIE | 0.9372222222222222 |
| 2 | Intent Accuracy | ERNIE | 0.8642857142857143 |