import torch
import torchvision
from torchvision import datasets, transforms
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
import copy
import matplotlib.pyplot as plt
import numpy as np
root = "xxx"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize(mean=(0.5,), std=(0.5,))
])
dataset = {
"train": datasets.MNIST(
root=root,
transform=transform,
train=True,
download=True
),
"test": datasets.MNIST(
root=root,
transform=transform,
train=False
)
}
dataset_size = {x: len(dataset[x]) for x in ["train", "test"]}
data_loader = {
x: DataLoader(
dataset=dataset[x], batch_size=256, shuffle=True
) for x in ["train", "test"]
}
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(stride=2, kernel_size=2),
nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
nn.BatchNorm2d(128),
nn.ReLU(),
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
nn.BatchNorm2d(128),
nn.ReLU(),
nn.MaxPool2d(stride=2, kernel_size=2)
)
self.classifier = nn.Sequential(
nn.Linear(7 * 7 * 128, 256),
nn.ReLU(),
nn.Dropout(p=0.5),
nn.Linear(256, 128),
nn.ReLU(),
nn.Dropout(p=0.5),
nn.Linear(128, 10)
)
self._initialize_weights()
def forward(self, x):
x = self.features(x)
x = torch.flatten(x, start_dim=1)
x = self.classifier(x)
return x
def _initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(tensor=m.weight, mode="fan_out",
nonlinearity="relu")
if m.bias is not None:
nn.init.constant_(tensor=m.bias, val=0)
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(tensor=m.weight, val=1)
nn.init.constant_(tensor=m.bias, val=0)
elif isinstance(m, nn.Linear):
nn.init.normal_(tensor=m.weight, mean=0, std=0.01)
nn.init.constant_(tensor=m.bias, val=0)
net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.01)
num_epochs = 100
net.to(device)
Net(
(features): Sequential(
(0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU()
(3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(5): ReLU()
(6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(9): ReLU()
(10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(12): ReLU()
(13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)
(classifier): Sequential(
(0): Linear(in_features=6272, out_features=256, bias=True)
(1): ReLU()
(2): Dropout(p=0.5)
(3): Linear(in_features=256, out_features=128, bias=True)
(4): ReLU()
(5): Dropout(p=0.5)
(6): Linear(in_features=128, out_features=10, bias=True)
)
)
def train(net, optimizer, criterion, num_epochs=100, lr_reduce=5, early_stop=10, logger="./logger.txt"):
best_wts = copy.deepcopy(net.state_dict())
best_acc = 0
cnt_for_lr_reduce = 0
cnt_for_early_stop = 0
for epoch in range(num_epochs):
print("epoch {}/{}".format(epoch + 1, num_epochs))
for phase in ["train", "test"]:
running_loss = 0.0
running_corrects = 0
if phase == "train":
net.train()
else:
net.eval()
for inputs, labels in data_loader[phase]:
optimizer.zero_grad()
inputs = inputs.to(device)
labels = labels.to(device)
with torch.set_grad_enabled(phase == "train"):
outputs = net(inputs)
_, labels_hat = outputs.data.max(dim=1)
loss = criterion(outputs, labels)
if phase == "train":
loss.backward()
optimizer.step()
running_loss += loss.item()
running_corrects += sum(labels_hat == labels.data).item()
loss = running_loss / dataset_size[phase]
acc = running_corrects / dataset_size[phase] * 100
print("{} loss: {:.6f}, acc: {:.4f}%".format(phase, loss, acc))
with open(logger, mode="a+") as f:
f.write("{} epoch: {}, loss: {:.6f}, acc: {:.4f}%, lr: {:.6f}\n".format(
phase, epoch, loss, acc, optimizer.param_groups[0]["lr"]
))
if phase == "test":
if acc > best_acc:
best_acc = acc
best_wts = copy.deepcopy(net.state_dict())
torch.save(
net.state_dict(),
"best_net-epoch_{}-val_loss_{:.6f}-val_acc_{:.4f}.pth".format(
epoch, loss, acc
)
)
cnt_for_lr_reduce = 0
cnt_for_early_stop = 0
else:
cnt_for_lr_reduce += 1
cnt_for_early_stop += 1
if cnt_for_lr_reduce > lr_reduce:
for p in optimizer.param_groups:
p['lr'] *= 0.9
cnt_for_lr_reduce = 0
print("lr reduced")
if cnt_for_early_stop > early_stop:
break
torch.save(net.state_dict(), "last.pth")
train(net, optimizer, criterion, logger="adam_log.txt")
epoch 1/100
train loss: 0.005627, acc: 46.5583%
test loss: 0.002554, acc: 80.7300%
epoch 2/100
train loss: 0.004166, acc: 60.8683%
test loss: 0.002243, acc: 82.7300%
...
epoch 28/100
train loss: 0.000821, acc: 94.6800%
test loss: 0.000467, acc: 97.6400%
epoch 29/100
train loss: 0.000803, acc: 94.7450%
test loss: 0.000326, acc: 98.2100%
optimizer = optim.SGD(net.parameters(), lr=0.0001)
train(net, optimizer, criterion, logger="sgd_log.txt")
epoch 1/100
train loss: 0.000756, acc: 95.0150%
test loss: 0.000322, acc: 98.1900%
epoch 2/100
train loss: 0.000764, acc: 94.9900%
test loss: 0.000347, acc: 98.1800%
...
epoch 13/100
train loss: 0.000755, acc: 95.0933%
test loss: 0.000330, acc: 98.0900%
epoch 14/100
train loss: 0.000744, acc: 95.0883%
test loss: 0.000328, acc: 95.5200%