Large language models (such as GPT, BERT, LLaMA, and Grok) are deep neural networks, mainly based on the Transformer architecture, trained on large-scale text corpora to learn the statistical regularities, semantics, and contextual relationships of language. They can perform a wide range of tasks, including text generation, classification, translation, and question answering. Training and inference are the two core stages of a model's lifecycle.

Training a large language model is a complex process involving data preparation, model architecture design, optimization algorithms, distributed computing, and fine-tuning. The stages break down as follows:
Data is the foundation of LLM training; its quality and scale directly determine model performance.

- Collection: gather raw text from the web with tools such as wget, BeautifulSoup (Python crawling), or Apache Nutch.
- Cleaning: strip HTML markup with regular expressions (e.g. `<.*?>`) or libraries such as lxml and html2text, and remove boilerplate like URLs, e.g. `Buy now for $99! Visit www.example.com` → `Buy now for $99!`.
- Deduplication: drop near-duplicate documents, for example with datasketch (a MinHash implementation).
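The snippet below is a minimal sketch of how these steps might be wired together: regex-based cleaning plus MinHash deduplication with datasketch. The similarity threshold, number of permutations, and example documents are illustrative assumptions, not values from the text.

```python
import re
from datasketch import MinHash, MinHashLSH

def clean(html: str) -> str:
    text = re.sub(r"<.*?>", " ", html)                # strip HTML tags
    text = re.sub(r"(https?://|www\.)\S+", "", text)  # drop URLs
    return re.sub(r"\s+", " ", text).strip()

def minhash(text: str, num_perm: int = 128) -> MinHash:
    m = MinHash(num_perm=num_perm)
    for token in text.lower().split():                # word-level shingles for simplicity
        m.update(token.encode("utf-8"))
    return m

docs = [
    "<p>Buy now for $99! Visit www.example.com</p>",  # hypothetical example documents
    "Buy now for $99! Visit our store today",
]
lsh = MinHashLSH(threshold=0.5, num_perm=128)         # assumed similarity threshold
kept = []
for i, doc in enumerate(docs):
    text = clean(doc)
    m = minhash(text)
    if not lsh.query(m):                              # skip documents similar to one already kept
        lsh.insert(f"doc{i}", m)
        kept.append(text)
print(kept)
```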
Tokenization splits text into subword units, typically with the sentencepiece library or tokenizers (Hugging Face). Special tokens are added, such as [CLS] (classification), [SEP] (separator), and <|start|> (sequence start). Example of training a BPE tokenizer with tokenizers:
```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE())
trainer = BpeTrainer(vocab_size=30000, special_tokens=["<|start|>", "<|end|>"])
tokenizer.train(files=["corpus.txt"], trainer=trainer)

encoded = tokenizer.encode("Hello, world!")
print(encoded.tokens)  # e.g. ['Hel', 'lo', ',', 'wor', 'ld', '!']
```
Each token is then mapped to an integer ID, e.g. Hello → 1001, world → 1002, and sequences are converted to tensors (torch.tensor). Shorter sequences are padded with [PAD] and accompanied by an attention mask, e.g. [Hello, world, [PAD]] has mask [1, 1, 0].
```python
import torch

tokens = [1001, 1002, 1003]                     # token IDs (e.g. "Hello", ",", "world")
padded = tokens + [0] * (512 - len(tokens))     # pad to a fixed length of 512
attention_mask = [1] * len(tokens) + [0] * (512 - len(tokens))
input_tensor = torch.tensor([padded])           # shape [1, 512]
mask_tensor = torch.tensor([attention_mask])    # shape [1, 512]
```
LLMs are usually based on the Transformer architecture. Its core components are as follows.

Self-attention. Given an input X ∈ ℝ^{n×d}, where n is the sequence length and d the embedding dimension, compute Q = XW_Q, K = XW_K, V = XW_V with W_Q, W_K, W_V ∈ ℝ^{d×d_k}. Then Attention(Q, K, V) = softmax(QK^T / √d_k)V, where √d_k is a scaling factor that keeps the dot products from growing too large.

```python
import torch
import torch.nn as nn

class SelfAttention(nn.Module):
    def __init__(self, d_model, d_k):
        super().__init__()
        self.d_k = d_k
        self.W_q = nn.Linear(d_model, d_k)
        self.W_k = nn.Linear(d_model, d_k)
        self.W_v = nn.Linear(d_model, d_k)

    def forward(self, x):
        Q = self.W_q(x)  # [batch, seq_len, d_k]
        K = self.W_k(x)
        V = self.W_v(x)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        attn_weights = torch.softmax(scores, dim=-1)
        output = torch.matmul(attn_weights, V)
        return output
```
Multi-head attention runs h heads in parallel: head_i = Attention(QW_Q^i, KW_K^i, VW_V^i) and MultiHead(Q, K, V) = Concat(head_1, ..., head_h)W_O, with each head using d_k = d_model / h. A sketch follows.
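The following is a minimal multi-head attention sketch consistent with the formulas above; the fused per-head projections and the MultiHeadAttention class name are implementation choices of this example, not prescribed by the text.

```python
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.h = num_heads
        self.d_k = d_model // num_heads            # d_k = d_model / h
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x):
        batch, seq_len, _ = x.shape
        def split(t):                              # [batch, seq_len, d_model] -> [batch, h, seq_len, d_k]
            return t.view(batch, seq_len, self.h, self.d_k).transpose(1, 2)
        Q, K, V = split(self.W_q(x)), split(self.W_k(x)), split(self.W_v(x))
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        attn = torch.softmax(scores, dim=-1)
        out = torch.matmul(attn, V)                # [batch, h, seq_len, d_k]
        out = out.transpose(1, 2).reshape(batch, seq_len, -1)  # Concat(head_1, ..., head_h)
        return self.W_o(out)                       # final W_O projection
```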
The position-wise feed-forward network applies FFN(x) = max(0, xW_1 + b_1)W_2 + b_2; the hidden dimension d_ff is typically 4 * d_model (e.g. 3072 in BERT-base).

```python
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.linear2(self.relu(self.linear1(x)))
```
Layer normalization, LayerNorm(x) = γ * (x - μ) / σ + β, where μ and σ are the mean and standard deviation and γ, β are learnable parameters, stabilizes training; residual connections, x = x + Sublayer(x), help avoid vanishing gradients. Positional encodings inject order information:

PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
```python
import math
import torch

def get_positional_encoding(seq_len, d_model):
    pos = torch.arange(seq_len).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
    pe = torch.zeros(seq_len, d_model)
    pe[:, 0::2] = torch.sin(pos * div_term)
    pe[:, 1::2] = torch.cos(pos * div_term)
    return pe
```
Typical scales: GPT-3 uses d_model=12288 with h=96 attention heads, while BERT uses d_model=768 (base) or 1024 (large). In terms of cost, self-attention is O(n^2 * d), where n is the sequence length, and the feed-forward layers are O(n * d * d_ff).
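As a quick sanity check of these complexity terms, the back-of-the-envelope calculation below plugs in the GPT-3-like dimensions above with an assumed context length of n=2048 (constant factors dropped):

```python
# Rough per-layer cost comparison; n=2048 is an assumption for illustration.
n, d = 2048, 12288
d_ff = 4 * d
attention_cost = n * n * d     # O(n^2 * d)
ffn_cost = n * d * d_ff        # O(n * d * d_ff)
print(f"attention ~ {attention_cost:.2e}, feed-forward ~ {ffn_cost:.2e}")
# At this length the feed-forward term dominates; attention overtakes it only when n > d_ff.
```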
The training objective defines what the model optimizes. The main objectives:
Autoregressive (causal) language modeling predicts the next token from the preceding context, P(x_t | x_1, ..., x_{t-1}), with loss L = -∑_{t=1}^T log P(x_t | x_1, ..., x_{t-1}), where P(x_t) = softmax(W_o * h_t) and h_t is the Transformer output at position t.

```python
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.transformer = TransformerDecoder(...)  # placeholder for a simplified Transformer decoder stack
        self.output = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        h = self.transformer(x)
        logits = self.output(h)
        return logits
```
```python
criterion = nn.CrossEntropyLoss()
logits = model(input_ids)  # [batch, seq_len, vocab_size]
loss = criterion(logits.view(-1, vocab_size), target_ids.view(-1))
```
The targets are the inputs shifted left by one position (target_ids = input_ids[1:]).

Masked language modeling (MLM), used by BERT-style models, masks random tokens and predicts them from the full context: given "The cat [MASK] on the mat.", the model should predict sat. The loss runs only over masked positions, L = -∑_{i∈masked} log P(x_i | x), with roughly 15% of tokens replaced by [MASK]:
```python
def mask_tokens(inputs, tokenizer, mlm_prob=0.15):
    labels = inputs.clone()
    mask = torch.rand(inputs.shape) < mlm_prob
    inputs[mask] = tokenizer.mask_token_id
    labels[~mask] = -100  # ignore unmasked positions in the cross-entropy loss
    return inputs, labels
```
Sequence-to-sequence modeling (encoder-decoder) models P(y_1, ..., y_m | x_1, ..., x_n), with loss L = -∑_{t=1}^m log P(y_t | y_1, ..., y_{t-1}, x). Training uses teacher forcing: the decoder is fed the ground-truth target sequence rather than its own predictions, as sketched below.
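Here is a minimal teacher-forcing sketch built on PyTorch's nn.Transformer; the toy model, its dimensions, and the ToySeq2Seq name are assumptions for illustration (positional encodings omitted for brevity).

```python
import torch
import torch.nn as nn

class ToySeq2Seq(nn.Module):
    def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.transformer = nn.Transformer(d_model, nhead, num_layers, num_layers, batch_first=True)
        self.out = nn.Linear(d_model, vocab_size)

    def forward(self, src_ids, tgt_ids):
        # Teacher forcing: the decoder input is the ground-truth target, shifted right.
        tgt_in = tgt_ids[:, :-1]
        causal_mask = self.transformer.generate_square_subsequent_mask(tgt_in.size(1))
        h = self.transformer(self.embed(src_ids), self.embed(tgt_in), tgt_mask=causal_mask)
        return self.out(h)  # compare against tgt_ids[:, 1:] with cross-entropy
```

The loss is then computed as criterion(logits.reshape(-1, vocab_size), tgt_ids[:, 1:].reshape(-1)), exactly as in the autoregressive case.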
Optimization typically uses Adam, which maintains exponential moving averages of the gradient and its square:

m_t = β_1 m_{t-1} + (1 - β_1) g_t
v_t = β_2 v_{t-1} + (1 - β_2) g_t^2
θ_t = θ_{t-1} - η * m_t / (√v_t + ε)

with common defaults β_1=0.9, β_2=0.999, ε=1e-8.

```python
optimizer = torch.optim.Adam(model.parameters(), lr=6e-4, betas=(0.9, 0.999))
```
The learning rate is warmed up linearly, lr_t = lr_max * t / T_warmup, and then decayed, commonly with a cosine schedule lr_t = lr_min + 0.5 * (lr_max - lr_min) * (1 + cos(π * t / T)):

```python
from torch.optim.lr_scheduler import CosineAnnealingLR

scheduler = CosineAnnealingLR(optimizer, T_max=100000, eta_min=1e-6)
```
Gradient accumulation simulates a larger batch by stepping the optimizer only every accumulation_steps mini-batches:

```python
optimizer.zero_grad()
for i, batch in enumerate(data):
    loss = model(batch).loss / accumulation_steps  # scale so the accumulated gradient matches one large batch
    loss.backward()
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
```
Mixed-precision training (FP16/BF16) reduces memory use and speeds up computation; a GradScaler guards against underflow:

```python
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()
for batch in data:
    optimizer.zero_grad()
    with autocast():
        loss = model(batch).loss
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
```
Data parallelism replicates the model across GPUs and aggregates gradients with All-Reduce:

```python
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group(backend="nccl")
model = DDP(model)
```
For very large models, frameworks such as DeepSpeed add sharding and memory optimizations:

```python
import deepspeed

model_engine, optimizer, _, _ = deepspeed.initialize(model=model, config=ds_config)
```
Gradient clipping keeps the global gradient norm bounded:

```python
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
```
Supervised fine-tuning (SFT) adapts the pretrained model to task- or instruction-specific data, e.g. with the Hugging Face Trainer:

```python
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./sft",
    per_device_train_batch_size=8,
    num_train_epochs=3,
)
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
trainer.train()
```
Reinforcement learning from human feedback (RLHF) further aligns the model, commonly via PPO with the trl library:

```python
from trl import PPOTrainer

ppo_trainer = PPOTrainer(model=model, ref_model=ref_model, config=ppo_config)
ppo_trainer.train()
```
LoRA freezes the pretrained weights and learns a low-rank update W = W_0 + BA, where B and A are small matrices:

```python
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(r=8, lora_alpha=16, target_modules=["q", "v"])
model = get_peft_model(model, lora_config)
```
Inference is the process of running the trained model on an input to generate output. The steps:

Preprocessing. The input is tokenized, e.g. "我爱学习人工智能" ("I love studying artificial intelligence") becomes ["我", "爱", "学", "习", "人", "工", "智", "能"]; tokens are mapped to IDs with token_ids = tokenizer.convert_tokens_to_ids(tokens); sequences are padded with [PAD] and paired with an attention mask such as attention_mask = [1, 1, ..., 0, 0].

Embedding. Token IDs are embedded, x = embedding(token_ids) with x ∈ ℝ^{seq_len×d_model}, and positional encodings are added: x = x + positional_encoding(seq_len, d_model).
Forward pass. Each Transformer layer applies

h = MultiHeadAttention(x)
x = x + h
x = LayerNorm(x)
h = FeedForward(x)
x = LayerNorm(x + h)

and the final hidden states are projected to the vocabulary: logits = W_o * h, probs = softmax(logits).
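A minimal sketch of this preprocessing-plus-forward-pass path, using GPT-2 from Hugging Face purely as a stand-in model:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("I love studying artificial intelligence", return_tensors="pt")  # tokenize -> IDs -> tensors
with torch.no_grad():
    logits = model(**inputs).logits              # [batch, seq_len, vocab_size]
probs = torch.softmax(logits[0, -1], dim=-1)     # probs = softmax(logits) for the next token
top5 = probs.topk(5)
print([tokenizer.decode(i) for i in top5.indices.tolist()])  # five most likely next tokens
```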
Decoding. Given the context x_1, ..., x_t, the model draws the next token from x_{t+1} ~ P(x_{t+1} | x_1, ..., x_t). Common strategies:

- Greedy decoding: x_{t+1} = argmax(P(x_{t+1})).
- Beam search: keep the highest-scoring candidate sequences by cumulative score = ∑ log P(x_t), typically with beam_size=5.
- Sampling: draw x_{t+1} ~ P(x_{t+1}), often restricted to the top-k or top-p (nucleus) tokens.
- Temperature: reshape the distribution with P'(x) = P(x)^(1/T) / ∑ P(x)^(1/T); lower T is more deterministic, higher T more diverse.

Example with Hugging Face generate:
```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

inputs = tokenizer("Hello, world!", return_tensors="pt")
outputs = model.generate(
    inputs["input_ids"],
    max_length=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
)
print(tokenizer.decode(outputs[0]))
```
For masked language models such as BERT, inference fills in the [MASK] positions:

```python
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

inputs = tokenizer("The cat [MASK] on the mat.", return_tensors="pt")
logits = model(**inputs).logits
masked_idx = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
predicted_token = logits[0, masked_idx].argmax(dim=-1)
print(tokenizer.decode(predicted_token))  # e.g. "sat"
```
Several techniques reduce inference cost. Quantization maps weights to lower precision, e.g. w_int8 = round(w_fp32 / scale), using torch.quantization; pruning removes low-magnitude weights, e.g. with torch.nn.utils.prune.
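The following sketch shows dynamic INT8 quantization and L1 unstructured pruning on a hypothetical toy model; the layer sizes and pruning amount are assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune

# Hypothetical toy model standing in for one Transformer feed-forward block.
model = nn.Sequential(nn.Linear(768, 3072), nn.ReLU(), nn.Linear(3072, 768))

# Dynamic quantization: Linear weights stored as INT8, activations quantized on the fly.
quantized = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)

# Pruning: zero out the 30% of weights with the smallest magnitude in the first layer.
prune.l1_unstructured(model[0], name="weight", amount=0.3)
print(quantized)
```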
Knowledge distillation trains a smaller student model to mimic a larger teacher, combining the task loss with a divergence term: L = α * L_CE + (1 - α) * L_KL(teacher_logits, student_logits).
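A minimal sketch of that combined loss; the alpha weighting and the softmax temperature T are assumed hyperparameters.

```python
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, alpha=0.5, T=2.0):
    ce = F.cross_entropy(student_logits, labels)            # L_CE against the ground-truth labels
    kl = F.kl_div(
        F.log_softmax(student_logits / T, dim=-1),
        F.softmax(teacher_logits / T, dim=-1),
        reduction="batchmean",
    ) * (T * T)                                             # L_KL between teacher and student distributions
    return alpha * ce + (1 - alpha) * kl
```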
KV caching speeds up autoregressive generation by reusing the keys and values of past tokens: at step t the cache holds K_t = [K_1, ..., K_t] and V_t = [V_1, ..., V_t], so only the newest token's projections are computed. A simplified single-head sketch:

```python
class CachedSelfAttention(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.k_cache, self.v_cache = [], []

    def forward(self, x_new):
        # x_new: [batch, 1, d_model], only the newest token is processed.
        q = self.W_q(x_new)
        self.k_cache.append(self.W_k(x_new))
        self.v_cache.append(self.W_v(x_new))
        K = torch.cat(self.k_cache, dim=1)  # keys/values for all tokens generated so far
        V = torch.cat(self.v_cache, dim=1)
        scores = torch.matmul(q, K.transpose(-2, -1)) / (K.size(-1) ** 0.5)
        return torch.matmul(torch.softmax(scores, dim=-1), V)
```
End-to-end example. Consider a hypothetical model with d_model=8192 and h=64 heads, pretrained with lr=3e-4 and cosine decay. At inference time the prompt "请解释量子力学" ("Please explain quantum mechanics") is tokenized into ["请", "解", "释", "量", "子", "力", "学"], each token is embedded into an 8192-dimensional vector, and the reply is generated with nucleus sampling (p=0.9, temperature=0.8).
Finally, a simplified Transformer encoder built from PyTorch's own modules:

```python
import torch.nn as nn

class Transformer(nn.Module):
    def __init__(self, d_model, nhead, num_layers):
        super().__init__()
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead), num_layers
        )

    def forward(self, src):
        return self.encoder(src)
```