理论参见:高效深度学习软硬件设计——神经网络压缩、 Pruning模型剪枝、权值共享、低秩近似_iwill323的博客-CSDN博客
目录
任务和数据集
任务描述
数据集
导包
辅助函数
数据处理
显示文件夹和文件数量
transforms
Dataset
数据加载函数
分类模型
训练
加载数据集
训练函数
进行训练
推断
加载数据
architecture design
概念
Depthwise Separable Convolution
建立模型
训练
knowledge distillation
teacher网络
损失函数
训练函数
训练
知识精馏另一种思路
训练函数
训练
模型剪枝
导包
teacher函数
模型剪枝函数
剪枝效果
模型参数数量
模型运行时间
剪枝之后的准确度
总结
拓展
查看分层抽样结果
计时
取小数点后n位并保存为str
统计文件夹下的文件夹和文件
● 网络压缩:让模型更小,同时不损失性能
● 训练一个非常小的模型来完成HW3的任务
数据集来自HW3的food-11数据集,总共有11个类别。
● Training set:9866 labeled images
● Validation set:3430 labeled images
● Evaluation set:3347 images
import numpy as np
import pandas as pd
import torch
import os
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import ConcatDataset, DataLoader, Subset, Dataset # "ConcatDataset" and "Subset" are possibly useful.
from torchvision.datasets import DatasetFolder, VisionDataset
from sklearn.model_selection import StratifiedShuffleSplit
from torchsummary import summary
from tqdm.auto import tqdm
import random
from d2l import torch as d2l
same_seeds用于固定随机种子,log函数用于日志记录
def same_seeds(seed):
    """Seed every random number generator used in this file.

    Seeds the ``random`` module, NumPy and PyTorch (CPU, plus all CUDA
    devices when CUDA is available) and puts cuDNN in deterministic mode.
    """
    # cuDNN: turn off auto-tuning and request deterministic kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
def log(log_fw, text):
    """Print ``text`` and append it, as one line, to the open log file.

    Args:
        log_fw: Writable text file object; flushed after every entry.
        text: Any object; its ``str()`` form is what gets written.
    """
    print(text)
    entry = str(text)
    log_fw.write(entry + '\n')
    log_fw.flush()
# Walk the input directory and report how many files each folder contains.
for dirname, _, filenames in os.walk('/kaggle/input'):
    if not filenames:
        continue  # folders that hold only sub-folders are not reported
    print(f"{dirname}: {len(filenames)} files.")  # Show the .jpg file amounts in each split.
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/ml2022spring-hw13/food11-hw13: 1 files. /kaggle/input/ml2022spring-hw13/food11-hw13/validation: 3430 files. /kaggle/input/ml2022spring-hw13/food11-hw13/training: 9866 files. /kaggle/input/ml2022spring-hw13/food11-hw13/evaluation: 3347 files.
# Deterministic preprocessing: resize, centre-crop to 224x224, convert to a
# tensor and normalise each channel.
test_tfm = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
# Training preprocessing: random crop / flip / rotation / affine augmentation,
# followed by the same tensor conversion and normalisation as test_tfm.
train_tfm = transforms.Compose([
    # add some useful transform or augmentation here, according to your experience in HW3.
    transforms.RandomResizedCrop((224, 224), scale=(0.5, 1), ratio=(0.5, 2)),
    # You can change this, but be aware that the given teacher model's input size is (3, 224, 224).
    # Thus, an input size other than 224 might hurt the performance. Please be careful.
    transforms.RandomHorizontalFlip(0.5),
    transforms.RandomRotation(180),
    transforms.RandomAffine(30),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
传统机器学习阶段(数据集在万这个数量级),一般分配比例为6:2:2。当数据量非常大(百万级)时,即使拿1%的数据做test也有一万之多,已经足够了。可以拿更多的数据做训练。因此常见的比例可以达到98:1:1 ,甚至可以达到99.5:0.3:0.2等。可以参考训练集、验证集和测试集 - 知乎
本作业数据集中,训练集和验证集数据比例约为3:1。既然不用划分出测试集,我想提高训练集的比例,所以改动了一下原代码,重新划分训练集和验证集,将两者的数据比例设置为9:1
class FoodDataset(Dataset):
    """Image dataset backed by a sequence of file paths and optional labels.

    Args:
        files: Sequence of image file paths.
        labels: Sequence of labels aligned with ``files``, or ``None`` for
            unlabeled data (every item then gets the label ``-1``).
        tfm: Callable applied to each opened PIL image.
    """

    def __init__(self, files, labels, tfm=test_tfm):
        super().__init__()
        self.files = files
        self.labels = labels
        # Print one path as a sanity check. ``len()`` is used because
        # ``files`` may be a NumPy array; an empty file list is tolerated
        # instead of failing with IndexError.
        if len(self.files) > 0:
            print("One sample", self.files[0])
        self.transform = tfm

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        """Return ``(transformed_image, label)`` for the item at ``idx``."""
        fname = self.files[idx]
        im = Image.open(fname)
        im = self.transform(im)
        if self.labels is not None:
            label = self.labels[idx]
        else:
            label = -1  # test has no label
        return im, label
使用StratifiedShuffleSplit进行分层采样。效果见最后一个部分的讨论
def _list_labeled_jpgs(folder):
    """Return the sorted ``.jpg`` paths in ``folder`` and their integer labels.

    The label is the integer before the first underscore of the file name.
    """
    paths = sorted(os.path.join(folder, name) for name in os.listdir(folder) if name.endswith(".jpg"))
    # basename instead of split('/') so the parsing does not depend on the
    # path separator.
    labels = [int(os.path.basename(path).split('_')[0]) for path in paths]
    return paths, labels


def loadData(dataset_dir, batch_train, batch_valid, num_workers, valid_ratio, train_tfm, test_tfm):
    """Pool the ``training`` and ``validation`` folders and re-split them.

    The pooled files are split once with ``StratifiedShuffleSplit``
    (``random_state=0``), so the split is stratified by label.

    Args:
        dataset_dir: Directory containing ``training`` and ``validation``.
        batch_train: Batch size of the training loader.
        batch_valid: Batch size of the validation loader.
        num_workers: Worker count for both loaders.
        valid_ratio: Fraction of the pooled data used for validation.
        train_tfm: Transform applied to training images.
        test_tfm: Transform applied to validation images.

    Returns:
        ``(train_loader, valid_loader)``.
    """
    # Paths carry their folder prefix because the two splits live in
    # different directories.
    train_files, train_labels = _list_labeled_jpgs(os.path.join(dataset_dir, 'training'))
    val_files, val_labels = _list_labeled_jpgs(os.path.join(dataset_dir, 'validation'))
    files = train_files + val_files
    labels = train_labels + val_labels

    stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=valid_ratio, random_state=0)
    # split() yields (train_indices, test_indices); n_splits=1 gives one pair.
    train_split_id, val_split_id = next(stratified_split.split(files, labels))

    df = pd.DataFrame({'files': files, 'labels': labels})
    train_part = df.iloc[train_split_id]
    valid_part = df.iloc[val_split_id]

    train_set = FoodDataset(train_part['files'].values, train_part['labels'].values, tfm=train_tfm)
    train_loader = DataLoader(train_set, batch_size=batch_train, shuffle=True, num_workers=num_workers, pin_memory=True)

    # Validation uses the deterministic transform and a fixed order; the
    # previous code applied the random training augmentation here and left
    # the test_tfm argument unused.
    valid_set = FoodDataset(valid_part['files'].values, valid_part['labels'].values, tfm=test_tfm)
    valid_loader = DataLoader(valid_set, batch_size=batch_valid, shuffle=False, num_workers=num_workers, pin_memory=True)

    print('train集总长度是 {:d}, batch数量是 {:.2f}'.format(len(train_set), len(train_set)/ batch_train))
    print('valid集总长度是 {:d}, batch数量是 {:.2f}'.format(len(valid_set), len(valid_set)/ batch_valid))
    return train_loader, valid_loader
下面是原代码给出的分类模型
class StudentNet(nn.Module):
    """Small plain CNN: four conv/BN/ReLU stages, global average pooling and
    an 11-way linear classifier."""

    def __init__(self):
        super().__init__()

        def conv_stage(in_channels, out_channels):
            # 3x3 convolution (default stride/padding), batch norm, ReLU.
            return [
                nn.Conv2d(in_channels, out_channels, 3),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(),
            ]

        feature_layers = [
            *conv_stage(3, 32),
            *conv_stage(32, 32),
            nn.MaxPool2d(2, 2, 0),
            *conv_stage(32, 64),
            nn.MaxPool2d(2, 2, 0),
            *conv_stage(64, 100),
            nn.MaxPool2d(2, 2, 0),
            # Global average pooling, so the input size may vary.
            nn.AdaptiveAvgPool2d((1, 1)),
        ]
        self.cnn = nn.Sequential(*feature_layers)
        self.fc = nn.Sequential(nn.Linear(100, 11))

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        features = self.cnn(x)
        flat = features.view(features.size()[0], -1)
        return self.fc(flat)
def get_student_model():
    """Build and return a freshly initialised ``StudentNet``."""
    model = StudentNet()
    return model
该模型参数特征如下:
Total params: 87,907 Trainable params: 87,907 Non-trainable params: 0 ---------------------------------------------------------------- Input size (MB): 0.57 Forward/backward pass size (MB): 99.72 Params size (MB): 0.34 Estimated Total Size (MB): 100.62
# Dataset location and loader settings, then build the two data loaders.
dataset_dir = '/kaggle/input/ml2022spring-hw13/food11-hw13'
batch_train = 256
batch_valid = 256
num_workers = 4
valid_ratio = 0.1  # fraction of the pooled data passed to loadData as the validation share
train_loader, valid_loader = loadData(dataset_dir, batch_train, batch_valid, num_workers, valid_ratio, train_tfm, test_tfm)
之前数据集加载都是使用drop_last=True,训练的数据量就可以使用训练的batch数量和每个batch含有的样本数量相乘得到,然后用于loss和accuracy的计算。本例采用drop_last=False,那么训练的数据量用列表train_lens来记录,最后sum一下。模型保存的时候,将验证集上的精度保存进入了名字中,感觉这样更好。
def trainer(train_loader, valid_loader, config, devices):
    """Train a fresh student model with cross-entropy and keep the best epoch.

    Args:
        train_loader: Iterable yielding ``(imgs, labels)`` training batches.
        valid_loader: Iterable yielding ``(imgs, labels)`` validation batches.
        config: Dict read for ``lr``, ``weight_decay``, ``best_acc``,
            ``n_epochs``, ``grad_norm_max``, ``patience``, ``save_dir`` and
            ``exp_name``. Each epoch's validation accuracy and loss are
            appended to ``config['accs']`` / ``config['loss']`` (created when
            absent) and ``config['best_acc']`` is overwritten at the end.
        devices: Sequence of devices; only ``devices[0]`` is used.

    Side effects:
        Writes ``log.txt`` and the best checkpoint below
        ``<save_dir>/<exp_name>``.
    """
    device = devices[0]
    student_model = get_student_model()
    student_model.to(device)
    optimizer = torch.optim.Adam(student_model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])
    loss_fn = nn.CrossEntropyLoss()

    stale, best_acc = 0, config['best_acc']
    n_epochs, max_norm, patience = config['n_epochs'], config['grad_norm_max'], config['patience']

    legend = ['train loss', 'train acc', 'valid_loss', 'valid_acc']
    animator = d2l.Animator(xlabel='epoch', xlim=[0, n_epochs], legend=legend)

    save_path = os.path.join(config['save_dir'], config['exp_name'])  # create saving directory
    os.makedirs(save_path, exist_ok=True)

    # The context manager closes the log file even when training raises.
    with open(f"{save_path}/log.txt", 'w') as log_fw:
        log(log_fw, config)  # log your configs to the log file

        for epoch in range(n_epochs):
            # ---------- Training ----------
            student_model.train()
            train_loss, train_accs, train_lens = [], [], []

            for imgs, labels in train_loader:
                imgs, labels = imgs.to(device), labels.to(device)
                logits = student_model(imgs)
                loss = loss_fn(logits, labels)

                optimizer.zero_grad()
                loss.backward()
                # Clip the gradient norms for stable training.
                nn.utils.clip_grad_norm_(student_model.parameters(), max_norm=max_norm)
                optimizer.step()

                # Keep per-batch sums so that a smaller last batch is
                # weighted by its real size.
                acc = (logits.argmax(dim=-1) == labels).float().sum()
                train_batch_len = len(imgs)
                train_loss.append(loss.item() * train_batch_len)
                train_accs.append(acc.item())
                train_lens.append(train_batch_len)

            train_loss = sum(train_loss) / sum(train_lens)
            train_acc = sum(train_accs) / sum(train_lens)
            log(log_fw, f"[ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")
            animator.add(epoch, (train_loss, train_acc, None, None))

            # ---------- Validation ----------
            student_model.eval()
            valid_loss, valid_accs, valid_lens = [], [], []

            for imgs, labels in valid_loader:
                imgs, labels = imgs.to(device), labels.to(device)
                with torch.no_grad():
                    logits = student_model(imgs)
                loss = loss_fn(logits, labels)

                acc = (logits.argmax(dim=-1) == labels).float().sum()
                batch_len = len(imgs)
                valid_loss.append(loss.item() * batch_len)
                valid_accs.append(acc.item())
                valid_lens.append(batch_len)

            valid_loss = sum(valid_loss) / sum(valid_lens)
            valid_acc = sum(valid_accs) / sum(valid_lens)
            # The history lists are optional in the config: create them on
            # first use instead of raising KeyError.
            config.setdefault('accs', []).append(valid_acc)
            config.setdefault('loss', []).append(valid_loss)

            # update logs and save the model when it improved
            if valid_acc > best_acc:
                log(log_fw, f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f} -> best")
                # Epoch number is 1-based here, like every other log line.
                log(log_fw, f"Best model found at epoch {epoch + 1}, saving model")
                best_acc = valid_acc
                # only save best to prevent output memory exceed error
                torch.save(student_model.state_dict(), f"{save_path}/student_best{best_acc:.5f}.ckpt")
                stale = 0
            else:
                log(log_fw, f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")
                stale += 1
                if stale > patience:
                    log(log_fw, f"No improvment {patience} consecutive epochs, early stopping")
                    break
            animator.add(epoch + 1, (None, None, valid_loss, valid_acc))

    config['best_acc'] = best_acc
# Pick the devices, fix the random seed, then train the baseline student.
devices = d2l.try_all_gpus()
print(f'DEVICE: {devices}')
seed = 20220013
same_seeds(seed)  # set a random seed for reproducibility
config = {
    'lr': 5e-4,
    'weight_decay': 1e-5,
    'grad_norm_max': 10,  # max gradient norm used for clipping
    'n_epochs': 2,
    'patience': 300,  # If no improvement in 'patience' epochs, early stop
    'best_acc': 0.0,  # starting threshold; trainer() overwrites it with the best validation accuracy
    'save_dir': '/kaggle/working/outputs',
    'exp_name': 'simple_baseline'
}
trainer(train_loader, valid_loader, config, devices)
训练过程震荡比较大,训练到最后感觉提升空间不大。不知道用scheduler能否好一点。
# Build an unlabeled, unshuffled loader over the evaluation images.
path = os.path.join(dataset_dir, "evaluation")
files = sorted([os.path.join(path, x) for x in os.listdir(path) if x.endswith(".jpg")])
eval_set = FoodDataset(files, None, tfm=test_tfm)  # labels=None -> every item gets label -1
eval_loader = DataLoader(eval_set, batch_size=256, shuffle=False, num_workers=num_workers, pin_memory=True)
推断结果
# Reload the best checkpoint written by the trainer and predict the
# evaluation set.
student_model_best = get_student_model()
# The file name is rebuilt from the best accuracy stored in the config.
save_path = os.path.join(config['save_dir'], config['exp_name'], 'student_best'+f"{config['best_acc']:.5f}"+ ".ckpt")
student_model_best.load_state_dict(torch.load(save_path))
student_model_best.to(devices[0])
student_model_best.eval()
eval_preds = []
for imgs, _ in tqdm(eval_loader):
    with torch.no_grad():
        logits = student_model_best(imgs.to(devices[0]))
    # No squeeze(): for a batch holding a single image it would produce a
    # 0-d result that cannot be converted to a list.
    eval_preds += logits.argmax(dim=-1).cpu().tolist()
def pad4(i):
    """Return ``str(i)`` left-padded with ``"0"`` to at least four characters."""
    return str(i).rjust(4, "0")
# Write the submission file: a zero-padded running index as Id and the
# predicted class as Category.
ids = [pad4(i) for i in range(0,len(eval_set))]
categories = eval_preds
df = pd.DataFrame()
df['Id'] = ids
df['Category'] = categories
df.to_csv(f"/kaggle/working/submission.csv", index=False)
其中:绿色是感受野receptive field;蓝色是不同通道之间的联系
(a)普通卷积层fully connected convolution layer: 输入与输出通道全连接
# Regular Convolution, # of params = in_chs * out_chs * kernel_size^2
nn.Conv2d(in_chs, out_chs, kernel_size, stride, padding)
(b) Depthwise convolution layer(DW)中,每个feature map独享一个filter,然后通过pointwise convolution layer(PW) 将feature maps不同通道的数据整合起来
(c) Group convolution layer(GC): 将feature maps分组,每个组自己做普通卷积。如果group_size = input_feature_size,那么 GC变成Depthwise convolution layer。如果group_size = 1,那么GC 变成fully connected.
# Group Convolution, "groups" controls the connections between inputs and
# outputs. in_chs and out_chs must both be divisible by groups.
nn.Conv2d(in_chs, out_chs, kernel_size, stride, padding, groups=groups)
举例子,很容易理解。下面是Depthwise convolution layer
# Depthwise convolution: groups == in_channels, so each filter sees exactly
# one input channel.
ic = 64
net = nn.Sequential(
    nn.Conv2d(ic, ic, kernel_size=3, stride=1, padding=1, groups=ic)
)
summary(net, (64, 224, 224), device='cpu')  # 640 = 64 * (3*3 + 1)
Total params: 640
640的来源:64个filter,每个filter只有1个通道,权重有9个数,再加上1个bias,即64×(9+1)=640
下面是fully connected convolution layer:
# Regular convolution: groups == 1, so every filter sees all input channels.
ic = 64
net = nn.Sequential(
    nn.Conv2d(ic, ic, kernel_size=3, stride=1, padding=1, groups=1)
)
summary(net, (64, 224, 224), device='cpu')
# 36928 = 64 * (64 * 9 + 1)
Total params: 36,928
36928的来源:64个filter,每个filter有64个通道,权重有9*64个数,再加上1个bias
def dwpw_conv(ic, oc, kernel_size=3, stride=1, padding=1):
    """Build a depthwise separable convolution block.

    The block is a per-channel (``groups=ic``) convolution followed by a 1x1
    convolution from ``ic`` to ``oc`` channels; each convolution is followed
    by batch norm and ``LeakyReLU(0.01)``.

    Returns:
        An ``nn.Sequential`` of six modules.
    """
    stages = []
    # Depthwise convolution: one filter per input channel.
    stages.append(nn.Conv2d(ic, ic, kernel_size, stride=stride, padding=padding, groups=ic))
    stages.append(nn.BatchNorm2d(ic))
    stages.append(nn.LeakyReLU(0.01, inplace=True))
    # Pointwise convolution: mixes the channels with a 1x1 kernel.
    stages.append(nn.Conv2d(ic, oc, 1))
    stages.append(nn.BatchNorm2d(oc))
    stages.append(nn.LeakyReLU(0.01, inplace=True))
    return nn.Sequential(*stages)
# Print the layers and parameter counts of one 64 -> 64 depthwise separable block.
net = dwpw_conv(64, 64)
summary(net, (64, 32, 32), device='cpu')
模型结构:
---------------------------------------------------------------- Layer (type) Output Shape Param # ================================================================ Conv2d-1 [-1, 64, 32, 32] 640 BatchNorm2d-2 [-1, 64, 32, 32] 128 LeakyReLU-3 [-1, 64, 32, 32] 0 Conv2d-4 [-1, 64, 32, 32] 4,160 BatchNorm2d-5 [-1, 64, 32, 32] 128 LeakyReLU-6 [-1, 64, 32, 32] 0 ================================================================ Total params: 5,056 Trainable params: 5,056 Non-trainable params: 0 ---------------------------------------------------------------- Input size (MB): 0.25 Forward/backward pass size (MB): 0.75 Params size (MB): 0.02 Estimated Total Size (MB): 1.02
输入如果是[1, 64, 32, 32],那么输出是torch.Size([1, 64, 32, 32]),因为这里的步长是1
对比一下残差模块:
class Residual_Block(nn.Module):
    """Two 3x3 conv/BN layers with a skip connection.

    When the stride is not 1 or the channel count changes, the skip path goes
    through a 1x1 conv/BN projection (``conv3``); otherwise ``conv3`` is
    ``None`` and the input is added unchanged.
    """

    def __init__(self, ic, oc, stride=1):
        super().__init__()
        first_conv = nn.Conv2d(ic, oc, kernel_size=3, padding=1, stride=stride)
        self.conv1 = nn.Sequential(first_conv, nn.BatchNorm2d(oc), nn.ReLU(inplace=True))

        second_conv = nn.Conv2d(oc, oc, kernel_size=3, padding=1)
        self.conv2 = nn.Sequential(second_conv, nn.BatchNorm2d(oc))

        self.relu = nn.ReLU(inplace=True)

        # For resnet18 the ``stride != 1`` part of the test is not needed.
        needs_projection = stride != 1 or ic != oc
        self.conv3 = nn.Sequential(
            nn.Conv2d(ic, oc, kernel_size=1, stride=stride),
            nn.BatchNorm2d(oc),
        ) if needs_projection else None

    def forward(self, X):
        """Return ``relu(conv2(conv1(X)) + shortcut(X))``."""
        Y = self.conv2(self.conv1(X))
        shortcut = self.conv3(X) if self.conv3 else X
        Y += shortcut
        return self.relu(Y)
# Summarise one residual block on the same 64x32x32 input as the dwpw_conv
# summary, so that the two reports (and the printed shapes) are comparable.
net = nn.Sequential(
    Residual_Block(64, 64)
)
summary(net, (64, 32, 32), device='cpu')
---------------------------------------------------------------- Layer (type) Output Shape Param # ================================================================ Conv2d-1 [-1, 64, 32, 32] 36,928 BatchNorm2d-2 [-1, 64, 32, 32] 128 ReLU-3 [-1, 64, 32, 32] 0 Conv2d-4 [-1, 64, 32, 32] 36,928 BatchNorm2d-5 [-1, 64, 32, 32] 128 ReLU-6 [-1, 64, 32, 32] 0 Residual_Block-7 [-1, 64, 32, 32] 0 ================================================================ Total params: 74,112 Trainable params: 74,112 Non-trainable params: 0
同样是两层卷积层,每一层输出的形状也一样,但是模型参数约为15分之一(5,056 对 74,112)。这是因为Depthwise Separable Convolution用两个卷积层去替代一个普通卷积层,两者的参数量之比约为 1/N + 1/(k×k)(N为输出通道数,k为卷积核大小),在这里约为 1/64 + 1/9 ≈ 1/8,而残差块包含两个普通卷积层,于是Depthwise Separable Convolution的模型参数应该为残差块的1/16左右,与实际结果接近
使用残差网络的构建思想,下面先贴一个手写的resnet18作参考
class Residual_Block(nn.Module):
    """Two 3x3 conv/BN layers with a skip connection.

    When the stride is not 1 or the channel count changes, the skip path goes
    through a 1x1 conv/BN projection (``conv3``); otherwise ``conv3`` is
    ``None`` and the input is added unchanged.
    """

    def __init__(self, ic, oc, stride=1):
        super().__init__()
        first_conv = nn.Conv2d(ic, oc, kernel_size=3, padding=1, stride=stride)
        self.conv1 = nn.Sequential(first_conv, nn.BatchNorm2d(oc), nn.ReLU(inplace=True))

        second_conv = nn.Conv2d(oc, oc, kernel_size=3, padding=1)
        self.conv2 = nn.Sequential(second_conv, nn.BatchNorm2d(oc))

        self.relu = nn.ReLU(inplace=True)

        # For resnet18 the ``stride != 1`` part of the test is not needed.
        needs_projection = stride != 1 or ic != oc
        self.conv3 = nn.Sequential(
            nn.Conv2d(ic, oc, kernel_size=1, stride=stride),
            nn.BatchNorm2d(oc),
        ) if needs_projection else None

    def forward(self, X):
        """Return ``relu(conv2(conv1(X)) + shortcut(X))``."""
        Y = self.conv2(self.conv1(X))
        shortcut = self.conv3(X) if self.conv3 else X
        Y += shortcut
        return self.relu(Y)
class ResNet(nn.Module):
    """ResNet-style classifier: stem, four residual stages, pooled linear head.

    Args:
        block: Residual block class, called as ``block(ic, oc, stride)`` for
            the first block of a stage and ``block(oc, oc)`` for the rest.
        num_layers: Number of blocks in each of the four stages. The default
            is an immutable tuple rather than a shared mutable list.
        num_classes: Size of the output layer.
    """

    def __init__(self, block=Residual_Block, num_layers=(2, 2, 2, 2), num_classes=11):
        super().__init__()
        self.preconv = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )
        self.layer0 = self.make_residual(block, 64, 64, num_layers[0])
        self.layer1 = self.make_residual(block, 64, 128, num_layers[1], stride=2)
        self.layer2 = self.make_residual(block, 128, 256, num_layers[2], stride=2)
        self.layer3 = self.make_residual(block, 256, 512, num_layers[3], stride=2)
        self.postliner = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(512, num_classes)
        )

    def make_residual(self, block, ic, oc, num_layer, stride=1):
        """Stack ``num_layer`` blocks; only the first changes channels/stride."""
        layers = [block(ic, oc, stride)]
        layers.extend(block(oc, oc) for _ in range(1, num_layer))
        return nn.Sequential(*layers)

    def forward(self, x):
        # Shapes below are (channels, height, width) for a 224x224 input with
        # the default Residual_Block.
        out = self.preconv(x)      # [64, 56, 56]
        out = self.layer0(out)     # [64, 56, 56]
        out = self.layer1(out)     # [128, 28, 28]
        out = self.layer2(out)     # [256, 14, 14]
        out = self.layer3(out)     # [512, 7, 7]
        out = self.postliner(out)  # [num_classes]
        return out
设计的StudentNet也分三部分,第一、三部分和残差网络一样,第二部分也由四层构成,不过每层只有1个深度可分离卷积模块(2个卷积层)
def dwpw_conv(ic, oc, kernel_size=3, stride=1, padding=1):
    """Build a depthwise separable convolution block.

    The block is a per-channel (``groups=ic``) convolution followed by a 1x1
    convolution from ``ic`` to ``oc`` channels; each convolution is followed
    by batch norm and ``LeakyReLU(0.01)``.

    Returns:
        An ``nn.Sequential`` of six modules.
    """
    stages = []
    # Depthwise convolution: one filter per input channel.
    stages.append(nn.Conv2d(ic, ic, kernel_size, stride=stride, padding=padding, groups=ic))
    stages.append(nn.BatchNorm2d(ic))
    stages.append(nn.LeakyReLU(0.01, inplace=True))
    # Pointwise convolution: mixes the channels with a 1x1 kernel.
    stages.append(nn.Conv2d(ic, oc, 1))
    stages.append(nn.BatchNorm2d(oc))
    stages.append(nn.LeakyReLU(0.01, inplace=True))
    return nn.Sequential(*stages)
class StudentNet(nn.Module):
    """Student network built from depthwise separable convolution blocks.

    Layout: a 7x7 conv / BN / ReLU / max-pool stem, four ``dwpw_conv`` stages
    and a global-average-pooled linear classifier.
    """

    def __init__(self, num_classes=11):
        super().__init__()
        # Stem: 224 --> 56 (stride-2 conv followed by stride-2 max-pool).
        stem = [
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        ]
        self.preconv = nn.Sequential(*stem)

        self.layer1 = dwpw_conv(64, 64)              # 56 --> 56
        self.layer2 = dwpw_conv(64, 128, stride=2)   # 56 --> 28
        self.layer3 = dwpw_conv(128, 256, stride=2)  # 28 --> 14
        # The homework caps the model at 100k parameters, hence the narrow
        # 140-channel output feeding the linear layer.
        self.layer4 = dwpw_conv(256, 140, stride=2)

        # Global average pooling, so the input size may vary.
        head = [
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(1),
            nn.Linear(140, num_classes),
        ]
        self.postliner = nn.Sequential(*head)

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        out = self.preconv(x)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
        return self.postliner(out)
def get_student_model():
    """Build and return a freshly initialised ``StudentNet``.

    This function takes no arguments so that the student network can be
    obtained by calling it directly.
    """
    model = StudentNet()
    return model
卷积层相当于只有5层,再加上1个全连接层,共6层。第四个模块的输出是140维,这是为了让模型参数不超过10万个,这是作业要求
Total params: 99,891 Trainable params: 99,891 Non-trainable params: 0 ---------------------------------------------------------------- Input size (MB): 0.57 Forward/backward pass size (MB): 34.71 Params size (MB): 0.38 Estimated Total Size (MB): 35.66
如果使用残差网络,让num_layers = [1,1,1,1],即第二部分每层有一个模块,这样9个卷积层加1个全连接层总共10层,并且同样让第四个模块的输出是140维,那么模型参数个数约为177万。可见Depthwise Separable Convolution让模型参数数量变为约20分之一。
学习率固定为5e-4,没有使用scheduler,训练过程比较平缓,直观上感觉比较好训练
大模型教小模型,让小模型去对标预测大模型的输出,而非预测ground truth.
为什么管用?
助教代码提供了一个teacher network,模型结构为resnet18,num_classes=11,有11,182,155个参数,准确率大概是0.899
def get_teacher(dataset_dir, teacher_name, teacher_ckpt, num_classes=11):
    """Build the teacher network and load its checkpoint.

    Args:
        dataset_dir: Directory that contains the checkpoint file.
        teacher_name: Model name passed to ``torch.hub.load`` for the
            ``pytorch/vision:v0.10.0`` repository.
        teacher_ckpt: Checkpoint file name inside ``dataset_dir``.
        num_classes: Number of output classes of the teacher.

    Returns:
        The model with the checkpoint weights loaded (tensors mapped to CPU).
    """
    # Forward num_classes; it used to be ignored in favour of a literal 11.
    teacher_model = torch.hub.load('pytorch/vision:v0.10.0', teacher_name, pretrained=False, num_classes=num_classes)
    teacher_ckpt_path = os.path.join(dataset_dir, teacher_ckpt)
    state_dict = torch.load(teacher_ckpt_path, map_location='cpu')
    teacher_model.load_state_dict(state_dict)
    return teacher_model
关于KL损失:
使用loss_fn_kd作为损失函数,该方法同时使用软标签损失和硬标签损失,让student model的训练更平滑
CE = nn.CrossEntropyLoss()


def loss_fn_kd(student_logits, labels, teacher_logits, alpha=0.5, temperature=20.0):
    """Knowledge-distillation loss mixing soft-label KL and hard-label CE.

    Computes ``alpha * T**2 * KL(teacher || student) + (1 - alpha) * CE``,
    where both distributions are ``softmax(logits / T)`` over the last
    dimension and CE is the cross-entropy of ``student_logits`` against
    ``labels``.

    Args:
        student_logits: Student outputs, indexed ``[sample, class]``.
        labels: Ground-truth class indices.
        teacher_logits: Teacher outputs with the same shape as the student's.
        alpha: Weight of the KL term; ``1 - alpha`` weights the CE term.
        temperature: Softmax temperature ``T``.
    """
    # Work in log space: softmax(...).log() turns into -inf (and the product
    # into NaN) as soon as a probability underflows to zero.
    student_log_T = F.log_softmax(student_logits / temperature, dim=-1)
    teacher_log_T = F.log_softmax(teacher_logits / temperature, dim=-1)
    teacher_T = teacher_log_T.exp()
    # KL divergence: sum over classes within each row, then average the rows.
    kl_loss = (teacher_T * (teacher_log_T - student_log_T)).sum(1).mean()
    ce_loss = CE(student_logits, labels)  # Original Cross Entropy Loss
    return alpha * (temperature ** 2) * kl_loss + (1 - alpha) * ce_loss
(teacher_T*(teacher_T.log() - student_T.log())).sum(1).mean():(teacher_T*(teacher_T.log() - student_T.log()))结果,每一行内求和,然后再对各行求平均
增加了一个scheduler
def trainer_KD(train_loader, valid_loader, config, devices):
    """Train the student with knowledge distillation from a frozen teacher.

    The student is warm-started from a checkpoint, trained with
    ``loss_fn_kd`` against the teacher's logits, and the epoch with the best
    validation accuracy is saved.

    Args:
        train_loader: Iterable yielding ``(imgs, labels)`` training batches.
        valid_loader: Iterable yielding ``(imgs, labels)`` validation batches.
        config: Dict read for ``lr``, ``weight_decay``, ``best_acc``,
            ``n_epochs``, ``grad_norm_max``, ``patience``, ``T_0``,
            ``T_mult``, ``eta_min_ratio``, ``teacher_name``,
            ``teacher_ckpt``, ``save_dir`` and ``exp_name``. The optional key
            ``student_ckpt`` overrides the warm-start checkpoint path.
            ``config['best_acc']`` is overwritten at the end.
        devices: Sequence of devices; only ``devices[0]`` is used.

    Side effects:
        Writes ``log.txt`` and the best checkpoint below
        ``<save_dir>/<exp_name>``.
    """
    device = devices[0]

    student_model = get_student_model()
    student_ckpt = config.get('student_ckpt', '/kaggle/input/leehw13best3/student_best (3).ckpt')
    # map_location='cpu' lets a checkpoint saved on GPU load anywhere; the
    # model is moved to the target device right after.
    student_model.load_state_dict(torch.load(student_ckpt, map_location='cpu'))
    student_model.to(device)

    teacher_model = get_teacher(dataset_dir, config['teacher_name'], config['teacher_ckpt'])
    teacher_model.to(device)
    teacher_model.eval()

    loss_fn = loss_fn_kd
    optimizer = torch.optim.Adam(student_model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer,
        T_0=config['T_0'], T_mult=config['T_mult'],
        eta_min=config['lr'] / config['eta_min_ratio'])

    stale, best_acc = 0, config['best_acc']
    n_epochs, max_norm, patience = config['n_epochs'], config['grad_norm_max'], config['patience']

    legend = ['train loss', 'train acc', 'valid_loss', 'valid_acc']
    animator = d2l.Animator(xlabel='epoch', xlim=[0, n_epochs], legend=legend)

    save_path = os.path.join(config['save_dir'], config['exp_name'])  # create saving directory
    os.makedirs(save_path, exist_ok=True)

    # The context manager closes the log file even when training raises.
    with open(f"{save_path}/log.txt", 'w') as log_fw:
        log(log_fw, config)  # log your configs to the log file

        for epoch in range(n_epochs):
            # ---------- Training ----------
            student_model.train()
            train_loss, train_accs, train_lens = [], [], []

            for imgs, labels in train_loader:
                imgs, labels = imgs.to(device), labels.to(device)
                with torch.no_grad():  # the teacher is never trained
                    teacher_logits = teacher_model(imgs)
                logits = student_model(imgs)
                loss = loss_fn(logits, labels, teacher_logits)

                optimizer.zero_grad()
                loss.backward()
                # Clip the gradient norms for stable training.
                nn.utils.clip_grad_norm_(student_model.parameters(), max_norm=max_norm)
                optimizer.step()

                # Keep per-batch sums so that a smaller last batch is
                # weighted by its real size.
                acc = (logits.argmax(dim=-1) == labels).float().sum()
                train_batch_len = len(imgs)
                train_loss.append(loss.item() * train_batch_len)
                train_accs.append(acc.item())
                train_lens.append(train_batch_len)

            # The learning-rate schedule advances once per epoch.
            scheduler.step()

            train_loss = sum(train_loss) / sum(train_lens)
            train_acc = sum(train_accs) / sum(train_lens)
            log(log_fw, f"[ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")
            animator.add(epoch, (train_loss, train_acc, None, None))

            # ---------- Validation ----------
            student_model.eval()
            valid_loss, valid_accs, valid_lens = [], [], []

            for imgs, labels in valid_loader:
                imgs, labels = imgs.to(device), labels.to(device)
                with torch.no_grad():
                    logits = student_model(imgs)
                    teacher_logits = teacher_model(imgs)
                loss = loss_fn(logits, labels, teacher_logits)

                acc = (logits.argmax(dim=-1) == labels).float().sum()
                batch_len = len(imgs)
                valid_loss.append(loss.item() * batch_len)
                valid_accs.append(acc.item())
                valid_lens.append(batch_len)

            valid_loss = sum(valid_loss) / sum(valid_lens)
            valid_acc = sum(valid_accs) / sum(valid_lens)

            # update logs and save the model when it improved
            if valid_acc > best_acc:
                log(log_fw, f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f} -> best")
                # Epoch number is 1-based here, like every other log line.
                log(log_fw, f"Best model found at epoch {epoch + 1}, saving model")
                best_acc = valid_acc
                # only save best to prevent output memory exceed error
                torch.save(student_model.state_dict(), f"{save_path}/student_best{best_acc:.5f}.ckpt")
                stale = 0
            else:
                log(log_fw, f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")
                stale += 1
                if stale > patience:
                    log(log_fw, f"No improvment {patience} consecutive epochs, early stopping")
                    break
            animator.add(epoch + 1, (None, None, valid_loss, valid_acc))

    config['best_acc'] = best_acc
# Configure and launch the knowledge-distillation run.
devices = d2l.try_all_gpus()
print(f'DEVICE: {devices}')
config = {
    'lr': 5e-4,
    'weight_decay': 1e-5,
    'grad_norm_max': 10,  # max gradient norm used for clipping
    'n_epochs': 140,
    'patience': 300,  # If no improvement in 'patience' epochs, early stop
    'best_acc': 0.0,
    'T_0': 4,  # passed to CosineAnnealingWarmRestarts as T_0
    'T_mult': 2,  # passed to CosineAnnealingWarmRestarts as T_mult
    'eta_min_ratio':20,  # eta_min is computed as lr / eta_min_ratio
    'teacher_name': 'resnet18',
    'teacher_ckpt': "resnet18_teacher.ckpt",
    'save_dir': '/content/res',
    'exp_name': 'simple_baseline'
}
trainer_KD(train_loader, valid_loader, config, devices)
注意这里和后面使用的都是深度可分离卷积。训练过程比较顺利,算到得分0.8247分。scheduler是否有比没有好,还需观察
前面只是用teacher model的输出指导student model的学习,下面额外引入中间层的特征损失,使得student model和teacher model的中间层特征输出也尽量一致,思路如下图。该方法的损失函数由多个loss组成,每个loss有对应的权重,使用了动态的权重方案,前期让中间层特征的loss权重大,然后逐步减少,这样网络前期可以更好的学习中间层的特征,给后续的logit学习打好基础。注意,由于中间层没有被输出,该方法需要使用hook来获取中间层的输出。
参考:李宏毅2022机器学习HW13解析 - 知乎
设计student model的时候,我们让student model每一层的输出维度,与teacher model保持一致。
student model的preconv直接使用teacher model已经训练好的参数,不用再训练。其中第0层是卷积层,由于设置了bias=False,所以直接给weight赋值就行了。第1层是batchnorm层,需要给weight、bias以及滑动平均的均值和方差赋值。最后设置这些参数不需要更新。
student model的第1、2、3层的输出维度,与teacher model保持一致,取出这三层的输出作为student model的学习“科目”之一。HookTool用fea属性保存module的输出,get_feas_by_hook将hook分别注册到上述3层上。只要student model和teacher model进行了前向传播,就能通过返回的fea_hooks取出各层的输出。下面举个例子:
# Register hooks on both networks, run one forward pass through each, then
# print the shape of every captured feature map (teacher first).
fea_hooks_teacher = get_feas_by_hook(teacher_model)
fea_hooks_student = get_feas_by_hook(student_model)
imgs = torch.rand([1,3,224,224])
teacher_logits = teacher_model(imgs)
logits = student_model(imgs)
for hooks in (fea_hooks_teacher, fea_hooks_student):
    for hook in hooks:
        feature = hook.fea
        print(feature.shape)
torch.Size([1, 64, 56, 56]) torch.Size([1, 128, 28, 28]) torch.Size([1, 256, 14, 14]) torch.Size([1, 64, 56, 56]) torch.Size([1, 128, 28, 28]) torch.Size([1, 256, 14, 14])
它的输入是student model和teacher model的fea_hooks,通过fea_hooks每个元素的fea属性取出输出,用F.smooth_l1_loss计算损失,len(student) - i是权重,可见层数离输出层越近,权重越小
def use_pretrain(student_model, teacher_model):
    """Copy the teacher's stem (conv1 + bn1) into the student and freeze it.

    Values are copied into the student's own tensors. Assigning the teacher's
    Parameters and buffers directly would make both models share storage:
    the student's BatchNorm would then overwrite the teacher's running
    statistics while training, and freezing would also flip
    ``requires_grad`` on the teacher.

    Args:
        student_model: Model whose ``preconv[0]`` is a bias-free conv and
            whose ``preconv[1]`` is a batch-norm layer.
        teacher_model: Model exposing ``conv1`` and ``bn1``.
    """
    conv, bn = student_model.preconv[0], student_model.preconv[1]
    with torch.no_grad():
        conv.weight.copy_(teacher_model.conv1.weight)        # conv1
        bn.weight.copy_(teacher_model.bn1.weight)            # bn1
        bn.bias.copy_(teacher_model.bn1.bias)
        bn.running_mean.copy_(teacher_model.bn1.running_mean)
        bn.running_var.copy_(teacher_model.bn1.running_var)

    # Exclude the copied stem parameters from gradient updates.
    # NOTE(review): in train() mode the student's BatchNorm still updates its
    # own running statistics; confirm whether the stem should be kept in
    # eval() mode as well.
    for param in (conv.weight, bn.weight, bn.bias):
        param.requires_grad = False
class HookTool:
    """Forward-hook receiver that keeps the latest output of a module."""

    def __init__(self):
        # Most recently captured output; None until the hook has fired.
        self.fea = None

    def hook_fun(self, module, fea_in, fea_out):
        """Forward-hook callback: store ``fea_out``; other arguments are unused."""
        self.fea = fea_out
def get_feas_by_hook(model, names=('layer1', 'layer2', 'layer3')):
    """Attach a ``HookTool`` forward hook to each named sub-module.

    Args:
        model: Module whose ``named_modules()`` are searched.
        names: Names of the sub-modules to hook. The default is an immutable
            tuple rather than a shared mutable list.

    Returns:
        List of ``HookTool`` objects in ``named_modules()`` order; after a
        forward pass each ``.fea`` holds the output of its module.
    """
    fea_hooks = []
    for name, module in model.named_modules():
        if name not in names:
            continue
        cur_hook = HookTool()
        module.register_forward_hook(cur_hook.hook_fun)
        fea_hooks.append(cur_hook)
    return fea_hooks
def loss_fea_layers(student, teacher):
    """Weighted sum of smooth-L1 losses between matching captured features.

    Args:
        student: Sequence of objects whose ``.fea`` is a student feature map.
        teacher: Sequence of objects whose ``.fea`` is the matching teacher
            feature map.

    Returns:
        The summed loss; the pair at index ``i`` is weighted by
        ``len(student) - i``, so earlier entries count more. Returns the
        integer ``0`` when ``student`` is empty.
    """
    n_layers = len(student)
    total = 0
    for idx, student_hook in enumerate(student):
        weight = n_layers - idx
        total += weight * F.smooth_l1_loss(student_hook.fea, teacher[idx].fea)
    return total
def trainer_KD_Hook(train_loader, valid_loader, config, devices):
    """Distil the student using both teacher logits and intermediate features.

    The training loss is ``10 * p**2 * loss_fn_kd(..., alpha=1 - p**2) +
    loss_fea_layers(...)`` with ``p = (epoch + 1) / n_epochs``, so the logit
    term gains weight as training progresses. Validation uses ``loss_fn_kd``
    with its default arguments.

    Args:
        train_loader: Iterable yielding ``(imgs, labels)`` training batches.
        valid_loader: Iterable yielding ``(imgs, labels)`` validation batches.
        config: Dict read for ``lr``, ``weight_decay``, ``best_acc``,
            ``n_epochs``, ``grad_norm_max``, ``patience``, ``T_0``,
            ``T_mult``, ``eta_min_ratio``, ``teacher_name``,
            ``teacher_ckpt``, ``save_dir`` and ``exp_name``. The optional key
            ``student_ckpt`` overrides the warm-start checkpoint path.
            ``config['best_acc']`` is overwritten at the end.
        devices: Sequence of devices; only ``devices[0]`` is used.

    Side effects:
        Writes ``log.txt`` and the best checkpoint below
        ``<save_dir>/<exp_name>``.
    """
    device = devices[0]

    student_model = get_student_model()
    student_model.to(device)
    student_ckpt = config.get('student_ckpt', '/kaggle/input/leehw13besthook/student_best0.63008.ckpt')
    # map_location='cpu' lets a checkpoint saved on GPU load anywhere;
    # load_state_dict copies the values onto the model's device.
    student_model.load_state_dict(torch.load(student_ckpt, map_location='cpu'))

    teacher_model = get_teacher(dataset_dir, config['teacher_name'], config['teacher_ckpt'])
    teacher_model.to(device)
    teacher_model.eval()

    use_pretrain(student_model, teacher_model)
    # Every forward pass refreshes the features held by these hooks.
    fea_hooks_teacher = get_feas_by_hook(teacher_model)
    fea_hooks_student = get_feas_by_hook(student_model)

    optimizer = torch.optim.Adam(student_model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])
    # NOTE(review): the scheduler is built but never stepped (the step call
    # was commented out in the original), so the learning rate stays
    # constant. Confirm whether that is intended.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer,
        T_0=config['T_0'], T_mult=config['T_mult'],
        eta_min=config['lr'] / config['eta_min_ratio'])
    loss_fn = loss_fn_kd

    stale, best_acc = 0, config['best_acc']
    n_epochs, max_norm, patience = config['n_epochs'], config['grad_norm_max'], config['patience']

    legend = ['train loss', 'train acc', 'valid_loss', 'valid_acc']
    animator = d2l.Animator(xlabel='epoch', xlim=[0, n_epochs], legend=legend)

    save_path = os.path.join(config['save_dir'], config['exp_name'])  # create saving directory
    os.makedirs(save_path, exist_ok=True)

    # The context manager closes the log file even when training raises.
    with open(f"{save_path}/log.txt", 'w') as log_fw:
        log(log_fw, config)  # log your configs to the log file

        for epoch in range(n_epochs):
            # ---------- Training ----------
            student_model.train()
            train_loss, train_accs, train_lens = [], [], []
            # Training progress in (0, 1]; drives the loss weighting below.
            percent = (1 + epoch) / n_epochs

            for imgs, labels in train_loader:
                imgs, labels = imgs.to(device), labels.to(device)
                with torch.no_grad():  # the teacher is never trained
                    teacher_logits = teacher_model(imgs)
                logits = student_model(imgs)

                loss_logits = loss_fn(logits, labels, teacher_logits, alpha=1 - percent * percent)
                loss_fea = loss_fea_layers(fea_hooks_student, fea_hooks_teacher)
                loss = (10 * percent * percent) * loss_logits + loss_fea

                optimizer.zero_grad()
                loss.backward()
                # Clip the gradient norms for stable training.
                nn.utils.clip_grad_norm_(student_model.parameters(), max_norm=max_norm)
                optimizer.step()

                # Keep per-batch sums so that a smaller last batch is
                # weighted by its real size.
                acc = (logits.argmax(dim=-1) == labels).float().sum()
                train_batch_len = len(imgs)
                train_loss.append(loss.item() * train_batch_len)
                train_accs.append(acc.item())
                train_lens.append(train_batch_len)

            train_loss = sum(train_loss) / sum(train_lens)
            train_acc = sum(train_accs) / sum(train_lens)
            log(log_fw, f"[ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")
            animator.add(epoch, (train_loss, train_acc, None, None))

            # ---------- Validation ----------
            student_model.eval()
            valid_loss, valid_accs, valid_lens = [], [], []

            for imgs, labels in valid_loader:
                imgs, labels = imgs.to(device), labels.to(device)
                with torch.no_grad():
                    logits = student_model(imgs)
                    teacher_logits = teacher_model(imgs)
                loss = loss_fn(logits, labels, teacher_logits)

                acc = (logits.argmax(dim=-1) == labels).float().sum()
                batch_len = len(imgs)
                valid_loss.append(loss.item() * batch_len)
                valid_accs.append(acc.item())
                valid_lens.append(batch_len)

            valid_loss = sum(valid_loss) / sum(valid_lens)
            valid_acc = sum(valid_accs) / sum(valid_lens)

            # update logs and save the model when it improved
            if valid_acc > best_acc:
                log(log_fw, f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f} -> best")
                # Epoch number is 1-based here, like every other log line.
                log(log_fw, f"Best model found at epoch {epoch + 1}, saving model")
                best_acc = valid_acc
                # Fixed five-decimal file name, so code that rebuilds it with
                # ':.5f' from config['best_acc'] finds the file (str(round())
                # drops trailing zeros).
                # only save best to prevent output memory exceed error
                torch.save(student_model.state_dict(), f"{save_path}/student_best{best_acc:.5f}.ckpt")
                stale = 0
            else:
                log(log_fw, f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")
                stale += 1
                if stale > patience:
                    log(log_fw, f"No improvment {patience} consecutive epochs, early stopping")
                    break
            animator.add(epoch + 1, (None, None, valid_loss, valid_acc))

    config['best_acc'] = best_acc
# Configure and launch the feature-distillation run.
devices = d2l.try_all_gpus()
print(f'DEVICE: {devices}')
config = {
    'lr': 3e-3,
    'weight_decay': 1e-5,
    'grad_norm_max': 10,  # max gradient norm used for clipping
    'n_epochs': 240,
    'patience': 300,  # If no improvement in 'patience' epochs, early stop
    'best_acc': 0.0,
    'T_0': 4,  # passed to CosineAnnealingWarmRestarts as T_0
    'T_mult': 2,  # passed to CosineAnnealingWarmRestarts as T_mult
    'eta_min_ratio':20,  # eta_min is computed as lr / eta_min_ratio
    'teacher_name': 'resnet18',
    'teacher_ckpt': "resnet18_teacher.ckpt",
    'save_dir': '/kaggle/working/',
    'exp_name': 'simple_baseline'
}
trainer_KD_Hook(train_loader, valid_loader, config, devices)
由于训练过程比较漫长,所以训练过程中间间断了好几次
起始训练,准确度上升非常快
算了30个epoch,验证集准确度就达到了0.63
loss = (10 * percent * percent) * loss_logits + loss_fea
在loss_logits 和 loss_fea都不变的情况下,loss_logits的系数是逐渐增大的,于是loss也逐渐增大
学习率 | epoch= 1 | 2 | 3 | 4 | 5 |
3e-3 | 0.68045 | 0.61203 | 0.71203 | 0.70977 | 0.65865 |
1e-3 | 0.63158 | 0.62256 | 0.61278 | 0.65038 | 0.62857 |
3e-4 | 0.74511 | 0.73008 | 0.73308 | 0.71203 | 0.70000 |
1e-4 | 0.76165 | 0.78571 | 0.78045 | 0.78421 | 0.79398 |
3e-05 | 0.76917 | 0.77519 | 0.78722 | 0.77895 | 0.79173 |
算到了最后,发现训练误差先增大后减小,并且验证集准确度在前半段徘徊在低点,在后三分之一才真正往上走。仔细分析,我觉得还是和loss的计算方式有关系:前面loss_logits系数太小,模型主要学习的是loss_fea,但是loss_fea只涉及teacher model第1,2,3层的输出,与第四层和最后的线性层没关系,所以训练了半天,student model主要调整了第1,2,3层的参数,但是对预测输出的提升效果不大;在训练后期,loss_logits系数加大,这才把student model学习的重心放在减少预测误差上,验证集误差下降,并且loss_logits的下降带动了loss的下降。
import torch
from torch import nn
import torch.nn.utils.prune as prune
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
import os
import tqdm
from PIL import Image
import pandas as pd
import time
from torchsummary import summary
def get_teacher(dataset_dir, teacher_name, teacher_ckpt, num_classes=11):
    """Build the teacher network and load its saved weights.

    Args:
        dataset_dir: Directory that contains the checkpoint file.
        teacher_name: Name of the torchvision entry point passed to
            ``torch.hub.load`` (e.g. ``'resnet18'``).
        teacher_ckpt: File name of the checkpoint inside ``dataset_dir``.
        num_classes: Number of output classes of the model. It has to match
            the checkpoint, otherwise ``load_state_dict`` fails.

    Returns:
        The teacher model with the checkpoint weights loaded onto the CPU.
    """
    # Forward num_classes instead of hard-coding 11, so the argument is honoured.
    teacher_model = torch.hub.load('pytorch/vision:v0.10.0', teacher_name, pretrained=False, num_classes=num_classes)
    teacher_ckpt_path = os.path.join(dataset_dir, teacher_ckpt)
    # map_location='cpu' lets the checkpoint load on machines without a GPU.
    teacher_model.load_state_dict(torch.load(teacher_ckpt_path, map_location='cpu'))
    return teacher_model
# Arguments handed to get_teacher(): model entry point, checkpoint file name,
# and the dataset directory the checkpoint is read from.
teacher_name = 'resnet18'
teacher_ckpt = "resnet18_teacher.ckpt"
dataset_dir = '/kaggle/input/ml2022spring-hw13/food11-hw13'
按一定比例剪去卷积层的权重
def prune_convd(model, ratio):
    """Apply L1 unstructured pruning to the weight of every Conv2d in ``model``.

    Args:
        model: Network whose convolution layers are pruned in place.
        ratio: Pruning amount forwarded to ``prune.l1_unstructured``.
    """
    conv_layers = [layer for layer in model.modules() if isinstance(layer, torch.nn.Conv2d)]
    for conv in conv_layers:
        # Uses the pruning helper from torch.nn.utils.prune on the 'weight' tensor.
        prune.l1_unstructured(conv, name='weight', amount=ratio)
# Next step: generalize the code above to several ratios and measure the accuracy
# of each one on the validation set of food11-hw13.
首先看一下teacher函数的模型参数数量
# Baseline: print the parameter summary of the unpruned teacher.
model = get_teacher(dataset_dir, teacher_name, teacher_ckpt)
summary(model, (3,224,224))
Total params: 11,182,155 Trainable params: 11,182,155 Non-trainable params: 0 ---------------------------------------------------------------- Input size (MB): 0.57 Forward/backward pass size (MB): 62.79 Params size (MB): 42.66 Estimated Total Size (MB): 106.02 ----------------------------------------------------------------
下面剪去卷积层的权重的50%
# Reload the teacher, prune every Conv2d weight with amount 0.5, and print the
# summary again to compare the reported parameter count with the baseline.
model = get_teacher(dataset_dir, teacher_name, teacher_ckpt)
ratio = 0.5
prune_convd(model, ratio)
summary(model, (3,224,224))
Total params: 11,182,155 Trainable params: 11,182,155 Non-trainable params: 0 ---------------------------------------------------------------- Input size (MB): 0.57 Forward/backward pass size (MB): 62.79 Params size (MB): 42.66 Estimated Total Size (MB): 106.02 ----------------------------------------------------------------
可以发现,存储的参数数量没有变。这是因为,pytorch对weight进行剪枝后,原来的参数被改名为weight_orig,其内容和剪枝前模型的weight完全一样,只是多了一个名为weight_mask的buffer;torch.nn.utils.prune将weight_mask和weight_orig相乘,结果存在weight属性中。
如果删除weight_orig,会不会有变化?
def prune_convd_remove(model, ratio=None):
    """Make the pruning of every Conv2d weight in ``model`` permanent.

    ``prune.remove`` drops the pruning re-parametrization (``weight_orig`` and
    ``weight_mask``) and leaves the already-masked tensor as the plain
    ``weight`` parameter. It does not prune anything by itself.

    Args:
        model: Network whose Conv2d layers were pruned beforehand.
        ratio: Unused. Kept (now optional) so existing two-argument calls
            keep working.

    Raises:
        ValueError: Propagated from ``prune.remove`` when a Conv2d layer has
            no pruning applied to its ``weight``.
    """
    for module in model.modules():
        if isinstance(module, torch.nn.Conv2d):
            prune.remove(module, 'weight')
# Prune each Conv2d weight with amount 0.5, make the pruning permanent, and
# print the summary to see whether the reported parameter count changes.
model = get_teacher(dataset_dir, teacher_name, teacher_ckpt)
ratio = 0.5
prune_convd(model, ratio)
# The second argument of prune_convd_remove is `ratio`; pass it instead of the
# unrelated string 'weight' (the function ignores the value either way).
prune_convd_remove(model, ratio)
summary(model, (3, 224, 224))
没有任何变化
# Evaluation split and its loader; shuffle=False keeps the dataset's own order.
eval_set = FoodDataset_test(os.path.join(dataset_dir, "evaluation"), tfm=test_tfm)
eval_loader = DataLoader(eval_set, batch_size=512, shuffle=False, num_workers=num_workers, pin_memory=True)
# Prints the dataset object and its size divided by the batch size of 512.
print(eval_set, len(eval_set)/512)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Collects one elapsed-time measurement per pruning ratio in the sweep that follows.
times = []
def pad4(i):
    """Return ``str(i)`` left-padded with '0' characters to a width of four."""
    return str(i).rjust(4, "0")
# Sweep the pruning ratio from 0% to 95% in 5% steps. For each ratio: reload
# the teacher, prune it, time one pass over the evaluation loader, and write
# the predictions to a submission CSV.

# The sample ids do not depend on the ratio, so build them once.
ids = [pad4(i) for i in range(len(eval_set))]

for i in range(0, 100, 5):
    # get_teacher() already loads the checkpoint, so each iteration starts
    # from fresh, unpruned weights without loading the state dict twice.
    model = get_teacher(dataset_dir, teacher_name, teacher_ckpt)
    prune_ratio = i / 100
    prune_convd(model, prune_ratio)
    model.to(device)
    model.eval()

    eval_preds = []
    if device.type == "cuda":
        # Finish pending GPU work so it is not counted in the measurement.
        torch.cuda.synchronize()
    tik = time.time()
    with torch.no_grad():
        for imgs, _ in eval_loader:
            logits = model(imgs.to(device))
            # argmax over the last dim is already 1-D. Squeezing it would turn
            # a single-sample batch into a 0-d array that cannot be iterated.
            eval_preds.extend(logits.argmax(dim=-1).cpu().numpy())
    times.append(time.time() - tik)

    df = pd.DataFrame({'Id': ids, 'Category': eval_preds})
    df.to_csv(f"/kaggle/working/submissionteacher{prune_ratio}.csv", index=False)

print(times)
使用英伟达3080,num_workers=2,每一个例子只用12秒就算完了,所耗时间基本一样,几乎没有加速。我怀疑是因为运行时间太短了,就在kaggle上用num_workers=2进行计算,每一个例子需要234秒,也几乎没有加速。
The pruning technique used in the tutorial only adds masks to the modules. The amount of computation stays nearly the same, so the inference time is also similar.
剪枝比例 | 准确度 | 剪枝比例 | 准确度 |
0 | 0.9053 | 30% | 0.8675 |
5% | 0.8984 | 35% | 0.8316 |
10% | 0.8994 | 40% | 0.8087 |
15% | 0.9003 | 45% | 0.7091 |
20% | 0.8864 | 50% | 0.5398 |
25% | 0.8864 | 55% | 0.3047 |
综上所述,模型剪枝之后,模型并没有变得更小,也没有运行更快,剪了个寂寞。我感觉pytorch提供的剪枝方式属于weight pruning,那么上述结果就和老师上课时关于weight pruning讲的内容吻合( 参见:高效深度学习软硬件设计——神经网络压缩_iwill323的博客-CSDN博客)
首先进行分层抽样,将所有数据放进df这个变量中,通过train_split_id和val_split_id这两个变量可以切分出抽样结果
def _collect_jpgs(folder):
    """Return the sorted .jpg paths in ``folder`` and the label parsed from each name.

    The label is the integer before the first underscore of the file name.
    """
    paths = sorted(os.path.join(folder, fname) for fname in os.listdir(folder) if fname.endswith(".jpg"))
    class_ids = [int(path.split('/')[-1].split('_')[0]) for path in paths]
    return paths, class_ids


dataset_dir = '/kaggle/input/ml2022spring-hw13/food11-hw13'
valid_ratio = 0.1

train_path = os.path.join(dataset_dir, 'training')
val_path = os.path.join(dataset_dir, 'validation')
train_files, train_labels = _collect_jpgs(train_path)
val_files, val_labels = _collect_jpgs(val_path)

# Pool both folders, then draw a single stratified train/validation split.
files = [*train_files, *val_files]
labels = [*train_labels, *val_labels]
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=valid_ratio, random_state=0)
splits = stratified_split.split(files, labels)
# Both results are arrays of row indices into `files` / `labels`.
train_split_id, val_split_id = next(iter(splits))

df = pd.DataFrame({'files': files, 'labels': labels})
有两种统计方法,一种是通过collections包
import collections
# Select the rows of each side of the split by index, then count how many
# samples of every label each side holds.
df_val = df.iloc[val_split_id]
count_val = collections.Counter(df_val['labels'])
train_df = df.iloc[train_split_id]
count_train = collections.Counter(train_df['labels'])
print(count_val, '\n' ,count_train)
Counter({9: 200, 2: 200, 5: 178, 0: 136, 3: 131, 8: 120, 4: 117, 10: 94, 6: 59, 1: 57, 7: 38}) Counter({2: 1800, 9: 1800, 5: 1596, 0: 1220, 3: 1182, 8: 1082, 4: 1057, 10: 847, 6: 528, 1: 516, 7: 338})
查看各标签的数据量之比
# Validation-to-training sample ratio for every label, plus the extremes.
ratio = []
for i, n_train in count_train.items():
    val_over_train = count_val[i] / n_train
    print(i, val_over_train)
    ratio.append(val_over_train)
print(min(ratio), max(ratio))
10 0.11097992916174734 0 0.11147540983606558 1 0.11046511627906977 5 0.11152882205513784 8 0.11090573012939002 4 0.11069063386944182 6 0.11174242424242424 2 0.1111111111111111 7 0.11242603550295859 3 0.11082910321489002 9 0.1111111111111111 0.11046511627906977 0.11242603550295859
可以看到验证集和训练集每一标签的数据量之比都接近1/9
另一种方式是通过value_counts() 方法。value_counts() 方法返回一个序列 Series,该序列包含每个值的数量。也就是说,对于数据框中的任何列,value_counts () 方法会返回该列每个项的计数。
value_counts()是Series拥有的方法,一般在DataFrame中使用时,需要指定对哪一列或行使用。value_counts()只能对应Series,不能直接对整个DataFrame做操作
# Same tally as with collections.Counter, this time via Series.value_counts()
# on the 'labels' column of each side of the split.
df_val = df.iloc[val_split_id]
val_counts = df_val.labels.value_counts()
df_train = df.iloc[train_split_id]
train_counts = df_train.labels.value_counts()
print(val_counts, train_counts)
9 200 2 200 5 178 0 136 3 131 8 120 4 117 10 94 6 59 1 57 7 38 Name: labels, dtype: int64 2 1800 9 1800 5 1596 0 1220 3 1182 8 1082 4 1057 10 847 6 528 1 516 7 338 Name: labels, dtype: int64
查看各标签的数据量之比
# Validation-to-training sample ratio for every label.
# Iterate over the labels actually present in the index and look them up with
# .loc, rather than assuming the labels are exactly 0..n-1 and relying on
# plain [] indexing of an integer-indexed Series.
ratio = []
for i in sorted(train_counts.index):
    val_over_train = val_counts.loc[i] / train_counts.loc[i]
    ratio.append(val_over_train)
    print(i, val_over_train)
import time
# Timing template: measure the wall-clock time of one tensor allocation.
# torch.cuda.synchronize() raises on machines without CUDA, so it is guarded.
# The sync calls only matter when the timed work runs on the GPU.
if torch.cuda.is_available():
    torch.cuda.synchronize()
tik = time.time()
sample = torch.randn([2000, 1000])
if torch.cuda.is_available():
    torch.cuda.synchronize()
# `a` holds the elapsed seconds; the tensor gets its own name so the two
# values no longer share one variable.
a = time.time() - tik
print(a)
0.017406702041625977
# Walk the student network and print the qualified name of each sub-module.
for module_name, _ in student_model.named_modules():
    print(module_name)
preconv preconv.0 preconv.1 preconv.2 preconv.3 layer1 layer1.0 layer1.1 layer1.2 layer1.3 layer1.4 layer1.5 layer2 layer2.0 layer2.1 layer2.2 layer2.3 layer2.4 layer2.5 layer3 layer3.0 layer3.1 layer3.2 layer3.3 layer3.4 layer3.5 layer4 layer4.0 layer4.1 layer4.2 layer4.3 layer4.4 layer4.5 postliner postliner.0 postliner.1 postliner.2
student_model.layer4
Sequential( (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=256) (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): LeakyReLU(negative_slope=0.01, inplace=True) (3): Conv2d(256, 140, kernel_size=(1, 1), stride=(1, 1)) (4): BatchNorm2d(140, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): LeakyReLU(negative_slope=0.01, inplace=True) )
student_model.layer4[5]
LeakyReLU(negative_slope=0.01, inplace=True)
student_model.preconv[0]
Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
第一反应是使用round函数,后来发现这么做不行。
可以有三种方法:
# Three ways to keep two decimals of a float and turn it into text.
a = 0.2343
b = round(a, 2)  # round() result, converted with str() in the print below
c = "{:.2f}".format(a)  # str.format with a fixed two-decimal spec
d = f"{a:.2f}"  # f-string with the same format spec
print(str(b), c, d)
0.23 0.23 0.23
# Same comparison with a value whose second decimal is 0: str(round(a, 2))
# prints as 0.2, while both format-spec variants keep the trailing zero.
a = 0.2003
b = round(a, 2)
c = "{:.2f}".format(a)
d = f"{a:.2f}"
print(str(b), c, d)
0.2 0.20 0.20
使用round函数,最后一位如果是0,会被丢弃。可以使用另外两种方法。
# Print every directory under the Kaggle working dir with the files it holds.
for folder, _subdirs, files_here in os.walk('/kaggle/working/'):
    print(folder, files_here)