In my previous post, 召唤神龙打造自己的ChatGPT, I described how to train a GPT-2 model with supervised fine-tuning (SFT) so that it can answer questions in a conversational way. In OpenAI's InstructGPT paper, SFT is only the first training stage. The second stage trains a reward model that scores the model's answers according to human preference, and the third stage uses reinforcement learning, driven by the rewards from that model, to train the model further so that it meets requirements such as safety and controllability. In this post I will describe how to train such a reward model.
In the InstructGPT paper, OpenAI describes how the data is prepared. Using the SFT model from the first stage, a set of prompts is collected, and for each prompt the model generates several answers, for example nine. Human labelers then rank these nine answers by quality. The reason for ranking rather than asking labelers to score each answer directly is that everyone has a different scoring standard, whereas it is much easier to agree on which answer is better, so ranking is used instead. Given the ranking, we can compute a pair-wise loss: answers are compared two at a time, and the gap between their scores should be as large as possible. For example, if answer A is better than answer B, the quality gap between the two answers can be written as $r_\theta(x, y_w) - r_\theta(x, y_l)$, where $x$ is the prompt, $y_w$ the better answer (here A), $y_l$ the worse answer (here B), and $r_\theta(x, y)$ the scalar score the reward model assigns to answer $y$ for prompt $x$. The loss for a single pair is then $-\log\sigma\bigl(r_\theta(x, y_w) - r_\theta(x, y_l)\bigr)$.
If there are $K = 9$ answers, pairing them two by two gives $\binom{K}{2} = \binom{9}{2} = 36$ pairs in total. The overall loss is therefore:

$$\operatorname{loss}(\theta) = -\frac{1}{\binom{K}{2}}\,E_{(x,\,y_w,\,y_l)\sim D}\Bigl[\log\Bigl(\sigma\bigl(r_\theta(x, y_w) - r_\theta(x, y_l)\bigr)\Bigr)\Bigr]$$
Minimizing this loss pushes the model to separate the scores of high-quality and low-quality answers as far apart as possible.
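As a quick illustration, here is a minimal sketch (plain PyTorch; the tensor names and numbers are made up) of how this pair-wise loss is computed from a batch of chosen/rejected scores:

import torch
import torch.nn.functional as F

# Hypothetical scalar scores r_theta(x, y) for a batch of 4 prompts:
# one score for the preferred answer, one for the rejected answer.
chosen_scores = torch.tensor([1.2, 0.3, -0.5, 2.0])
rejected_scores = torch.tensor([0.4, 0.9, -1.0, 0.5])

# Pair-wise loss: -log(sigmoid(r(x, y_w) - r(x, y_l))), averaged over the batch.
loss = -F.logsigmoid(chosen_scores - rejected_scores).mean()
print(loss.item())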
OpenAI did not release the corresponding dataset, and ranking answers by hand is time-consuming, so I will train the reward model on an open-source dataset instead. Hugging Face hosts a dataset from Anthropic, Anthropic/hh-rlhf · Datasets at Hugging Face. Anthropic was founded by former OpenAI employees who were unhappy with how closed OpenAI had become about its technology. Each record in the dataset contains a chosen field and a rejected field, holding a higher-quality and a lower-quality answer to the same prompt, which is exactly what we need for training.
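For reference, the records can be inspected directly with the datasets library, as in the minimal sketch below (the code later in this post loads a copy that was previously saved locally with load_from_disk):

from datasets import load_dataset

# Download Anthropic's helpful/harmless preference data from the Hugging Face Hub.
ds = load_dataset("Anthropic/hh-rlhf")
print(ds)                        # train and test splits

sample = ds['train'][0]
print(sample['chosen'][:200])    # the preferred conversation for a prompt
print(sample['rejected'][:200])  # the rejected conversation for the same prompt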
The following code converts the dataset: the Human and Assistant markers are replaced with Prompt and Response, because that is the format my earlier SFT model was trained on, and an <|endoftext|> token is appended at the end of each example:
from datasets import load_from_disk
import re
from tqdm import trange
from transformers import GPT2Tokenizer
import pickle
regex_human = re.compile(r'(\nHuman:)+')
regex_assistant = re.compile(r'(\nAssistant:)+')
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
ds = load_from_disk('rlhf')
train_data = []
# Convert the training split: replace the dialogue markers, append <|endoftext|>,
# and tokenize both the chosen and the rejected answer.
for i in trange(ds['train'].__len__()):
    chosen = ds['train'][i]['chosen']
    rejected = ds['train'][i]['rejected']
    chosen = re.sub(regex_human, '### Prompt:', chosen)
    chosen = re.sub(regex_assistant, '### Response:', chosen)
    chosen += '<|endoftext|>'
    rejected = re.sub(regex_human, '### Prompt:', rejected)
    rejected = re.sub(regex_assistant, '### Response:', rejected)
    rejected += '<|endoftext|>'
    chosen_ids = tokenizer.encode(chosen)
    rejected_ids = tokenizer.encode(rejected)
    train_data.append((chosen_ids, rejected_ids))
with open('reward_train.pkl', 'wb') as f:
    pickle.dump(train_data, f)

# Same conversion for the test split.
test_data = []
for i in trange(ds['test'].__len__()):
    chosen = ds['test'][i]['chosen']
    rejected = ds['test'][i]['rejected']
    chosen = re.sub(regex_human, '### Prompt:', chosen)
    chosen = re.sub(regex_assistant, '### Response:', chosen)
    chosen += '<|endoftext|>'
    rejected = re.sub(regex_human, '### Prompt:', rejected)
    rejected = re.sub(regex_assistant, '### Response:', rejected)
    rejected += '<|endoftext|>'
    chosen_ids = tokenizer.encode(chosen)
    rejected_ids = tokenizer.encode(rejected)
    test_data.append((chosen_ids, rejected_ids))
with open('reward_test.pkl', 'wb') as f:
    pickle.dump(test_data, f)
According to the InstructGPT paper, the reward model is best initialized from the SFT model, so that is what I do here: training starts from the SFT model trained previously. To turn the input text into a single score, the final (hidden_dim, vocab_size) linear layer of the original model is removed and replaced with a (hidden_dim, 1) linear layer, which projects the hidden state output by the model to a scalar score.
The following code defines a reward model that follows the structure of the GPT-2 model:
import torch
from torch import nn
from torch.nn import functional as F
import math
import inspect
class MHA(nn.Module):
    def __init__(self, d_model, num_heads, attn_pdrop, resid_pdrop):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.attn_pdrop = attn_pdrop
        self.resid_dropout = nn.Dropout(resid_pdrop)
        self.c_attn = nn.Linear(d_model, d_model*3)
        self.c_proj = nn.Linear(d_model, d_model)

    def forward(self, x, attn_mask):
        B, T, C = x.size()
        x_qkv = self.c_attn(x)
        q, k, v = x_qkv.split(self.d_model, dim=2)
        q = q.view(B, T, self.num_heads, C//self.num_heads).transpose(1, 2)
        k = k.view(B, T, self.num_heads, C//self.num_heads).transpose(1, 2)
        v = v.view(B, T, self.num_heads, C//self.num_heads).transpose(1, 2)
        if attn_mask is None:
            y = F.scaled_dot_product_attention(q, k, v, dropout_p=self.attn_pdrop if self.training else 0, is_causal=True)
        else:
            # scaled_dot_product_attention does not allow an explicit attn_mask together with
            # is_causal=True, so the causal constraint is folded into the padding mask here.
            causal = torch.tril(torch.ones(T, T, dtype=torch.bool, device=x.device))
            mask = attn_mask.bool() & causal
            # Let every position attend to itself so fully padded query rows do not produce NaNs.
            mask = mask | torch.eye(T, dtype=torch.bool, device=x.device)
            y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=self.attn_pdrop if self.training else 0)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        y = self.resid_dropout(y)
        return y
class FeedForward(nn.Module):
    def __init__(self, d_model, dff, dropout):
        super().__init__()
        self.c_fc = nn.Linear(d_model, dff)
        self.c_proj = nn.Linear(dff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.gelu = nn.GELU()

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):
    def __init__(self, d_model, num_heads, dff, attn_pdrop, resid_pdrop, dropout):
        super().__init__()
        self.ln_1 = nn.LayerNorm(d_model)
        self.attn = MHA(d_model, num_heads, attn_pdrop, resid_pdrop)
        self.ln_2 = nn.LayerNorm(d_model)
        self.mlp = FeedForward(d_model, dff, dropout)

    def forward(self, x, attn_mask):
        x = x + self.attn(self.ln_1(x), attn_mask)
        x = x + self.mlp(self.ln_2(x))
        return x
class RewardModel(nn.Module):
    def __init__(self, vocab_size, d_model, block_size, embed_pdrop, num_heads, dff, attn_pdrop, resid_pdrop, dropout, num_layer):
        super().__init__()
        self.wte = nn.Embedding(vocab_size, d_model, sparse=False)
        self.wpe = nn.Embedding(block_size, d_model, sparse=False)
        self.dropout_embed = nn.Dropout(embed_pdrop)
        self.h = nn.ModuleList([Block(d_model, num_heads, dff, attn_pdrop, resid_pdrop, dropout) for _ in range(num_layer)])
        self.num_layer = num_layer
        self.block_size = block_size
        #self.lm_head = nn.Linear(d_model, vocab_size, bias=False)
        #self.wte.weight = self.lm_head.weight
        # The GPT-2 LM head is replaced with a linear layer that outputs a single scalar reward.
        self.reward_head = nn.Linear(d_model, 1, bias=False)
        self.ln_f = nn.LayerNorm(d_model)
        # Token id used to locate the end of each sequence; with the standard GPT-2 vocabulary
        # this is the <|endoftext|> token appended to every example.
        self.PAD_ID = vocab_size - 1
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * num_layer))

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    def forward(self, input_ids, reward_pos, attn_mask, return_loss=False):
        # reward_pos is kept in the signature but not used; the end of each sequence is
        # located below via PAD_ID.
        device = input_ids.device
        b, t = input_ids.size()
        pos = torch.arange(0, t, dtype=torch.long, device=device)
        x = self.wte(input_ids) + self.wpe(pos)
        x = self.dropout_embed(x)
        for block in self.h:
            x = block(x, attn_mask)
        x = self.ln_f(x)
        # One scalar reward per position; the score of an answer is read from its last valid token.
        rewards = self.reward_head(x).squeeze(-1)
        #x = torch.reshape(x, [b,t])
        #scores = torch.gather(x, dim=-1, index=reward_pos)
        # The batch is the concatenation of the chosen examples followed by the rejected ones.
        chosen_end_scores = []
        rejected_end_scores = []
        bs = input_ids.shape[0] // 2
        chosen = input_ids[:bs]
        rejected = input_ids[bs:]
        chosen_rewards = rewards[:bs]
        rejected_rewards = rewards[bs:]
        loss = 0
        for i in range(bs):
            # Check if there is any padding, otherwise take the length of the sequence
            c_inds = (chosen[i] == self.PAD_ID).nonzero()
            c_ind = c_inds[0].item() if len(c_inds) > 0 else chosen.shape[1]
            if torch.all(torch.eq(chosen[i], rejected[i])).item():
                # Identical pair: only report the end score of the chosen sequence.
                chosen_end_scores.append(chosen_rewards[i, c_ind - 1])
                continue
            r_inds = (rejected[i] == self.PAD_ID).nonzero()
            r_ind = r_inds[0].item() if len(r_inds) > 0 else rejected.shape[1]
            if return_loss:
                end_ind = max(c_ind, r_ind)
                # Retrieve first index where the two sequences diverge
                divergence_ind = (chosen[i] != rejected[i]).nonzero()[0]
                assert divergence_ind > 0
                # Only the rewards of the divergent part contribute to the loss
                c_truncated_reward = chosen_rewards[i][divergence_ind:end_ind]
                r_truncated_reward = rejected_rewards[i][divergence_ind:end_ind]
                # Append the last rewards to the list of end scores
                chosen_end_scores.append(c_truncated_reward[-1])
                rejected_end_scores.append(r_truncated_reward[-1])
                # Compute loss based on truncated rewards (ignore padding)
                loss += -F.logsigmoid(c_truncated_reward - r_truncated_reward).mean()
                #loss += -F.logsigmoid(chosen_rewards[i][c_ind-1]-rejected_rewards[i][r_ind-1])
            else:
                # Inference: report the reward at the last valid token of each sequence.
                chosen_end_scores.append(chosen_rewards[i][c_ind - 1])
                rejected_end_scores.append(rejected_rewards[i][r_ind - 1])
        loss = loss / bs
        return chosen_end_scores, rejected_end_scores, loss
    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        # start with all of the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameter that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")
        return optimizer
    # Note: generate() is kept from the original GPT-2 model definition but is not used by the
    # reward model (its forward() no longer returns logits over the vocabulary).
    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None, block_size=512):
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= block_size else idx[:, -block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
The overall structure of the model is the same as GPT-2; the main change is the added reward_head linear layer, which maps the final hidden state to a single value. When computing the loss, only the part where the chosen and rejected answers diverge is used, and the loss follows the formula introduced above.
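To make the index bookkeeping in forward() concrete, here is a small self-contained example with made-up token ids (PAD_ID is set to 9 purely for illustration) showing how the divergence index, the end index and the truncated rewards are obtained:

import torch
import torch.nn.functional as F

PAD_ID = 9
# A toy pair: the prompt (tokens 5, 3) is shared, the answers diverge from position 2,
# and both sequences are right-padded with PAD_ID to the same length.
chosen = torch.tensor([5, 3, 7, 8, 2, 9, 9, 9])
rejected = torch.tensor([5, 3, 6, 2, 9, 9, 9, 9])
# Pretend these are the per-position rewards predicted by the model.
chosen_rewards = torch.tensor([0.1, 0.2, 0.5, 0.9, 1.1, 0.0, 0.0, 0.0])
rejected_rewards = torch.tensor([0.1, 0.2, -0.3, -0.6, 0.0, 0.0, 0.0, 0.0])

c_ind = (chosen == PAD_ID).nonzero()[0].item()             # 5: first padding position of chosen
r_ind = (rejected == PAD_ID).nonzero()[0].item()           # 4: first padding position of rejected
end_ind = max(c_ind, r_ind)                                # 5
divergence_ind = (chosen != rejected).nonzero()[0].item()  # 2: first position where they differ

c_truncated = chosen_rewards[divergence_ind:end_ind]       # rewards of the divergent part only
r_truncated = rejected_rewards[divergence_ind:end_ind]
loss = -F.logsigmoid(c_truncated - r_truncated).mean()
print(c_ind, r_ind, divergence_ind, loss.item())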
For model training, the code itself is nothing special; the main point is that when loading the data, the chosen and rejected examples must be combined into one batch. The dataset is defined as follows:
import torch
from torch.utils.data import Dataset
import pickle
class RewardDataset(Dataset):
    def __init__(self, dataset_file, block_size):
        with open(dataset_file, 'rb') as f:
            self.data = pickle.load(f)
        self.block_size = block_size

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        chosen = self.data[index][0]
        rejected = self.data[index][1]
        # Pad the chosen sequence (with token id 0) or truncate it to block_size, and record
        # the position of its last valid token together with an attention mask.
        delta_len = self.block_size - len(chosen)
        if delta_len >= 0:
            reward_pos_chosen = torch.IntTensor([len(chosen)-1])
            attn_mask_chosen = [1 for _ in range(len(chosen))]
            attn_mask_chosen.extend([0 for _ in range(delta_len)])
            chosen.extend([0 for _ in range(delta_len)])
        else:
            reward_pos_chosen = torch.IntTensor([self.block_size-1])
            chosen = chosen[:self.block_size]
            attn_mask_chosen = [1 for _ in range(self.block_size)]
        # Same treatment for the rejected sequence.
        delta_len = self.block_size - len(rejected)
        if delta_len >= 0:
            reward_pos_rejected = torch.IntTensor([len(rejected)-1])
            attn_mask_rejected = [1 for _ in range(len(rejected))]
            attn_mask_rejected.extend([0 for _ in range(delta_len)])
            rejected.extend([0 for _ in range(delta_len)])
        else:
            reward_pos_rejected = torch.IntTensor([self.block_size-1])
            rejected = rejected[:self.block_size]
            attn_mask_rejected = [1 for _ in range(self.block_size)]
        chosen = torch.IntTensor(chosen)
        rejected = torch.IntTensor(rejected)
        attn_mask_chosen = torch.FloatTensor(attn_mask_chosen)
        attn_mask_rejected = torch.FloatTensor(attn_mask_rejected)
        return chosen, rejected, reward_pos_chosen, reward_pos_rejected, attn_mask_chosen, attn_mask_rejected
Next, define a function that loads the parameters of the previously trained SFT model:
def load_sft(checkpointname, vocab_size, device):
    # GPT2 is the model class used for SFT training in the previous post.
    checkpoint = torch.load(checkpointname)
    config = checkpoint['config']
    config['num_heads'] = config['num_head']
    config.pop('num_head')
    config['vocab_size'] = vocab_size
    model_sft = GPT2(**config)
    model_sft = torch.compile(model_sft)
    model_sft.load_state_dict(checkpoint['model_state_dict'])
    sd_sft = model_sft.state_dict()
    sd_keys_sft = sd_sft.keys()
    sd_keys_sft = [k for k in sd_keys_sft if not k.endswith('lm_head.weight')] # ignore these, just a buffer
    #sd_keys_sft = [k for k in sd_keys_sft if not k.endswith('.attn.bias')] # same, just the mask (buffer)
    # Copy all remaining SFT weights into the reward model; the new reward_head keeps its random init.
    model = RewardModel(**config)
    model.to(device)
    model = torch.compile(model)
    sd = model.state_dict()
    for k in sd_keys_sft:
        assert sd_sft[k].shape == sd[k].shape
        with torch.no_grad():
            sd[k].copy_(sd_sft[k])
    del model_sft, sd_sft
    return model
Finally we train the model; there is nothing special about this code:
dataset = RewardDataset(args.dataset, 1024)
dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, num_workers=4)
total_loss = 0
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
for epoch in range(start_epoch, start_epoch+args.num_epoch):
    start = time.time()
    for batch, (chosen, rejected, pos_chosen, pos_rejected, attn_mask_chosen, attn_mask_rejected) in enumerate(dataloader):
        optimizer.zero_grad()
        lr = get_lr(batch+epoch*args.steps_epoch, args.warmup_steps, args.learning_rate, args.steps_epoch*args.total_epochs)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        # Concatenate the chosen and rejected halves into one batch, chosen first.
        input_data = torch.cat((chosen, rejected), 0)
        pos = torch.cat((pos_chosen, pos_rejected), 0)
        input_data = input_data.to(args.device)
        pos = pos.long().to(args.device)
        # Build a (B, 1, T, T) padding mask from the per-token masks.
        attn_mask_temp = torch.cat((attn_mask_chosen, attn_mask_rejected), 0)
        attn_mask_temp = attn_mask_temp.to(args.device)
        attn_mask_temp = torch.unsqueeze(attn_mask_temp, -1)
        attn_mask = torch.bmm(attn_mask_temp, attn_mask_temp.transpose(1,2))
        attn_mask = torch.unsqueeze(attn_mask, 1)
        if mixed:
            with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
                reward_c, reward_r, loss = model(input_data, pos, None, return_loss=True)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            reward_c, reward_r, loss = model(input_data, pos, attn_mask, return_loss=True)
            loss.backward()
            optimizer.step()
        total_loss += loss.item()
        #total_accuracy += accuracy(logits, y)
        total_accuracy = 0
        if batch%100 == 0 and batch>0:
            line = f'Batch: {batch+epoch*args.steps_epoch}, Loss: {total_loss/100:.4f}, Learning_rate: {lr:.7f}'
            with open(args.logfile, 'a') as logfile:
                logfile.write(line+'\n')
            print(line)
            total_loss = 0
            total_accuracy = 0
        if batch%args.steps_epoch == 0 and batch > 0:
            break
That is the full implementation of reward model training. Once the reward model is trained, it can be used to score the answers produced by the SFT model; these scores serve as the reward signal in the final reinforcement learning stage, so that the model learns to adjust the quality of its answers according to human preference.
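As a quick sanity check once training has finished, a sketch like the following could be used to compare two candidate answers to the same prompt with the trained reward model. This is only an illustration: the checkpoint handling is omitted, the encode/score_pair helpers are my own, and the text format simply mirrors the RewardDataset padding convention so that forward() can find the end of each answer.

import torch
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
device = 'cuda'
block_size = 1024

def encode(prompt, response):
    # Same marker format and right-padding (token id 0) as RewardDataset.
    text = '### Prompt: ' + prompt + '\n### Response: ' + response + '<|endoftext|>'
    ids = tokenizer.encode(text)[:block_size]
    ids = ids + [0] * (block_size - len(ids))
    return torch.tensor(ids, dtype=torch.int32)

@torch.no_grad()
def score_pair(model, prompt, response_a, response_b):
    # Put the two candidates in the "chosen" and "rejected" halves of the batch; with
    # return_loss=False the model returns the reward at the last valid token of each answer.
    batch = torch.stack([encode(prompt, response_a), encode(prompt, response_b)]).to(device)
    scores_a, scores_b, _ = model(batch, None, None, return_loss=False)
    return scores_a[0].item(), scores_b[0].item()

# model = ...  # a RewardModel restored from the training checkpoint and moved to `device`
# print(score_pair(model, 'How do I boil an egg?',
#                  'Put it in boiling water for about 8 minutes.',
#                  'I have no idea.'))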