Learning pytorch-lightning

The PL workflow

The flow is simple: it is a production pipeline with a fixed order:
Initialization: def __init__(self) -->
Training: def training_step(self, batch, batch_idx) --> training_step_end(self, batch_parts) --> training_epoch_end(self, training_outputs)
Validation: def validation_step(self, batch, batch_idx) --> …
Testing: def test_step(self, batch, batch_idx) --> …

import torch
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.metrics.functional import accuracy

class MyModule(pl.LightningModule):
    def __init__(self):
        super().__init__()  # required before assigning submodules/attributes
        self.loss = ...

    def forward(self, x):
        # write my model layers...
        # dropout, residual connections, etc. can be added here
        ...
        return out

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())  # I use Adam; any other optimizer can be substituted

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)  # forward
        loss = self.loss(y_hat, y)  # compute the loss
        acc = accuracy(y_hat, y)  # compute the accuracy (accuracy comes from pytorch_lightning.metrics.functional)
        return {"loss": loss, "acc": acc}  # return this batch's loss and acc

    def training_epoch_end(self, training_outputs):
        # training_outputs is a list with one dict per batch, holding each batch's loss and acc;
        # training_outputs[0]["loss"] is the loss of the first batch
        avg_loss = torch.stack([x["loss"] for x in training_outputs]).mean()  # average loss over the epoch
        avg_acc = torch.stack([x["acc"] for x in training_outputs]).mean()  # average acc over the epoch

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)  # forward
        loss = self.loss(y_hat, y)  # compute the loss
        acc = accuracy(y_hat, y)  # compute the accuracy
        return {"loss": loss, "acc": acc}  # return this batch's loss and acc

    def validation_epoch_end(self, validation_outputs):
        # validation_outputs has the same structure as training_outputs above
        avg_loss = torch.stack([x["loss"] for x in validation_outputs]).mean()  # average loss over the epoch
        avg_acc = torch.stack([x["acc"] for x in validation_outputs]).mean()  # average acc over the epoch
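
    # Hedged sketch (not in the original post): trainer.test() below needs a test_step;
    # this one is assumed to mirror validation_step, reusing self.loss and accuracy from above.
    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss(y_hat, y)
        acc = accuracy(y_hat, y)
        return {"loss": loss, "acc": acc}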
        
train_loader, val_loader = get_dataloaders()  # a helper I wrote in a separate dataset.py file
model = MyModule()  # instantiate the model
trainer = Trainer(gpus=-1, max_epochs=5, progress_bar_refresh_rate=21)

# train and val
trainer.fit(model, train_dataloader=train_loader, val_dataloaders=val_loader)

# test (method 1)
trainer.test(test_dataloaders=test_dataloader)

# test (method 2)
model = MyModule.load_from_checkpoint(PATH)
trainer = Trainer()
trainer.test(model, test_dataloaders=test_dataloader)	

How to plot with TensorBoard in pytorch-lightning

# TensorBoard is the default logger. Just run this from the command line to browse all run versions
conda activate pytorch38
tensorboard --logdir ./lightning_logs
# or
tensorboard --logdir ./


######################################################################
from pytorch_lightning.loggers import TensorBoardLogger

# create the logger
logger = TensorBoardLogger('tb_logs', name='my_model')

# attach it to the trainer
model = LightningMNISTClassifier(outdim=outdim)
trainer = pl.Trainer(gpus=-1, max_epochs=2, logger=logger)
trainer.fit(model)

# log values (this fragment lives inside the LightningModule; loss is computed earlier in the step)
    def training_step(self, batch, batch_idx):
        self.log('my_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

# open tensorboard from the command line
conda activate pytorch38
tensorboard --logdir ./tb_logs
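
Beyond scalar logging, the TensorBoardLogger exposes the underlying torch.utils.tensorboard SummaryWriter through self.logger.experiment, so richer TensorBoard plots (histograms, images, etc.) can be written directly. A minimal sketch, not from the original post, assuming these lines are appended at the end of the training_epoch_end defined earlier:

        # self.logger.experiment is the SummaryWriter when TensorBoardLogger is used
        self.logger.experiment.add_scalar("avg_loss", avg_loss, self.current_epoch)
        for name, param in self.named_parameters():
            self.logger.experiment.add_histogram(name, param, global_step=self.current_epoch)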

Saving models

Lightning automatically saves a checkpoint of the most recently trained epoch to the current working directory. On top of this automatic saving, you can also choose a quantity to monitor and save the model based on it. The steps are:
1. Compute the quantity to monitor, e.g. the loss
2. Mark that quantity with the log() function
3. Initialize the ModelCheckpoint callback and set the quantity to monitor (described in detail below)
4. Pass the callback to the Trainer
Example code for these steps:

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import Trainer
    
# initialize the `ModelCheckpoint` callback and set the quantity to monitor
# --monitor: the quantity to monitor, a string.
# For example 'val_loss' (marked via self.log('val_loss', loss) in training_step() or validation_step());
# defaults to None, which keeps only the last epoch's weights (my understanding: only the last epoch's
# checkpoint is retained, but a checkpoint is still written after every epoch, overwriting the previous one)
checkpoint_callback = ModelCheckpoint(monitor='val_loss')
    
# put this callback into the list of callbacks together with any others
trainer = Trainer(callbacks=[checkpoint_callback])
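
A hedged, more complete sketch (not in the original post): monitor, mode, save_top_k, dirpath, filename and best_model_path are standard ModelCheckpoint options/attributes, while the dirpath and filename values, and the reuse of MyModule, train_loader and val_loader from the first section, are assumptions for illustration:

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',      # quantity logged via self.log('val_loss', ...)
    mode='min',              # a smaller val_loss is better
    save_top_k=1,            # keep only the best checkpoint
    dirpath='checkpoints/',  # hypothetical output directory
    filename='mymodule-{epoch:02d}-{val_loss:.2f}',  # hypothetical filename pattern
)
trainer = Trainer(callbacks=[checkpoint_callback])
trainer.fit(model, train_dataloader=train_loader, val_dataloaders=val_loader)

# after training, the best checkpoint can be located and reloaded
best_model = MyModule.load_from_checkpoint(checkpoint_callback.best_model_path)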
