It is quite simple: like a production line, the hooks run in a fixed order:
Initialization: def __init__(self) -->
Training: def training_step(self, batch, batch_idx) --> training_step_end(self, batch_parts) --> training_epoch_end(self, training_outputs)
Validation: def validation_step(self, batch, batch_idx) --> …
Testing: def test_step(self, batch, batch_idx) --> …
import torch
import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy  # accuracy metric used below

class MyModule(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.loss = ...

    def forward(self, x):
        # write my model layers...
        # dropout, residual connections, etc. can be added here
        ...
        return out

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())  # Adam here; any other optimizer can be swapped in

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)              # forward
        loss = self.loss(y_hat, y)   # compute the loss
        acc = accuracy(y_hat, y)     # compute the accuracy
        return {"loss": loss, "acc": acc}  # return this batch's loss and acc

    def training_epoch_end(self, training_outputs):
        # training_outputs is a list with one dict per batch, holding that batch's loss and acc;
        # training_outputs[0]["loss"] is the loss of the first batch
        avg_loss = torch.stack([x["loss"] for x in training_outputs]).mean()  # epoch-average loss
        avg_acc = torch.stack([x["acc"] for x in training_outputs]).mean()    # epoch-average acc

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)              # forward
        loss = self.loss(y_hat, y)   # compute the loss
        acc = accuracy(y_hat, y)     # compute the accuracy
        return {"loss": loss, "acc": acc}  # return this batch's loss and acc

    def validation_epoch_end(self, validation_outputs):
        # validation_outputs has the same structure as training_outputs above
        avg_loss = torch.stack([x["loss"] for x in validation_outputs]).mean()  # epoch-average loss
        avg_acc = torch.stack([x["acc"] for x in validation_outputs]).mean()    # epoch-average acc
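The epoch averages computed in the two *_epoch_end hooks above are never used as written. A common pattern (my addition, not part of the original snippet) is to log them so they appear in the progress bar and TensorBoard, and so a ModelCheckpoint can monitor them later:

        # inside validation_epoch_end, after computing avg_loss / avg_acc (hypothetical addition):
        self.log("val_loss", avg_loss, prog_bar=True)
        self.log("val_acc", avg_acc, prog_bar=True)

With the module defined, training takes only a few lines: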
from pytorch_lightning import Trainer

train_loader, val_loader = get_dataloaders()  # a helper I wrote in a separate dataset.py (see the sketch below)
model = MyModule()  # instantiate the model
trainer = Trainer(gpus=-1, max_epochs=5, progress_bar_refresh_rate=21)
# train and validate
trainer.fit(model, train_dataloader=train_loader, val_dataloaders=val_loader)
# test (option 1: reuse the fitted trainer)
trainer.test(test_dataloaders=test_dataloader)
# test (option 2: load the model from a checkpoint first)
model = MyModule.load_from_checkpoint(PATH)
trainer = Trainer()
trainer.test(model, test_dataloaders=test_dataloader)
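The original get_dataloaders() lives in a separate dataset.py and is not shown here; a minimal sketch of what such a helper could look like (the random TensorDataset and batch size are purely illustrative):

import torch
from torch.utils.data import DataLoader, TensorDataset

def get_dataloaders(batch_size=32):
    # hypothetical stand-in data; replace with your real Dataset
    x, y = torch.randn(1000, 28 * 28), torch.randint(0, 10, (1000,))
    train_ds = TensorDataset(x[:800], y[:800])
    val_ds = TensorDataset(x[800:], y[800:])
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size)
    return train_loader, val_loader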
# TensorBoard is the default logger; run these commands to browse every logged version
conda activate pytorch38
tensorboard --logdir ./lightning_logs
# or
tensorboard --logdir ./
######################################################################
from pytorch_lightning.loggers import TensorBoardLogger
# create the logger
logger = TensorBoardLogger('tb_logs', name='my_model')
# attach it to the trainer
model = LightningMNISTClassifier(outdim=outdim)
trainer = pl.Trainer(gpus=-1, max_epochs=2, logger=logger)
trainer.fit(model)

# write records inside the LightningModule, e.g. in training_step
def training_step(self, batch, batch_idx):
    ...
    self.log('my_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
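When several values need to be logged with the same settings, self.log_dict does it in one call; a minimal sketch (the metric names here are illustrative, reusing self.loss and accuracy from the module above):

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss(y_hat, y)
        acc = accuracy(y_hat, y)
        # log both metrics at once, per step and aggregated per epoch
        self.log_dict({'train_loss': loss, 'train_acc': acc},
                      on_step=True, on_epoch=True, prog_bar=True)
        return loss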
# launch TensorBoard from the command line
conda activate pytorch38
tensorboard --logdir ./tb_logs
Lightning automatically saves a checkpoint of the most recent training epoch to the current working directory. On top of this automatic saving, you can also pick a quantity to monitor and save checkpoints based on it. The steps are:
1. Compute the quantity to monitor, e.g. a loss.
2. Mark it with the log() function.
3. Initialize the ModelCheckpoint callback and set the quantity to monitor (described in detail below).
4. Pass the callback to the Trainer.
Example code for steps 3 and 4 (a sketch covering steps 1 and 2 follows the snippet):
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import Trainer

# initialize the `ModelCheckpoint` callback and set the quantity to monitor
# - monitor: the quantity to monitor, a string,
#   e.g. 'val_loss' (marked in training_step() or validation_step() via self.log('val_loss', loss));
#   defaults to None, which keeps only the last epoch's weights (my understanding: a checkpoint is
#   still written after every epoch, but each one overwrites the previous one)
checkpoint_callback = ModelCheckpoint(monitor='val_loss')
# put the callback into the Trainer's list of callbacks
trainer = Trainer(callbacks=[checkpoint_callback])
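For completeness, here is a hedged sketch of steps 1 and 2 (logging the monitored quantity) together with a few optional ModelCheckpoint arguments; the 'val_loss' name, dirpath and filename pattern below are illustrative choices, not from the original:

# steps 1-2: compute and log the quantity inside the LightningModule
def validation_step(self, batch, batch_idx):
    x, y = batch
    loss = self.loss(self(x), y)
    self.log('val_loss', loss)          # mark it so ModelCheckpoint can monitor it
    return loss

# step 3 with a few extra optional arguments
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',                 # quantity logged above
    mode='min',                         # smaller val_loss is better
    save_top_k=3,                       # keep the 3 best checkpoints
    dirpath='checkpoints/',             # where to write them (illustrative path)
    filename='mymodel-{epoch:02d}-{val_loss:.2f}',
)
trainer = Trainer(callbacks=[checkpoint_callback])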