import numpy as np
import torch
import torchvision
torch.__version__ # PyTorch version
torch.version.cuda # Corresponding CUDA version
torch.backends.cudnn.version() # Corresponding cuDNN version
torch.cuda.get_device_name(0) # GPU type
conda update pytorch torchvision -c pytorch
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)
在命令行指定环境变量
os.environ[ CUDA_VISIBLE_DEVICES ] = 0,1
在代码中指定
os.environ[ CUDA_VISIBLE_DEVICES ] = 0,1
判断是否有CUDA支持
torch.cuda.is_available()
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = True
有时 Control-C 中止运行后 GPU 存储没有及时释放,需要手动清空。在 PyTorch 内部可以
torch.cuda.empty_cache()
或在命令行可以先使用 ps 找到程序的 PID,再使用 kill 结束该进程
ps aux | grep pythonkill -9 [pid]
或者直接重置没有被清空的 GPU
nvidia-smi --gpu-reset -i [gpu_id]
tensor.type() # Data type
tensor.size() # Shape of the tensor. It is a subclass of Python tuple
tensor.dim() # Number of dimensions.
# Set default tensor type. Float in PyTorch is much faster than double.
torch.set_default_tensor_type(torch.FloatTensor)
# Type convertions.
tensor = tensor.cuda()
tensor = tensor.cpu()
tensor = tensor.float()
tensor = tensor.long()
# torch.Tensor -> np.ndarray.
ndarray = tensor.cpu().numpy()
# np.ndarray -> torch.Tensor.
tensor = torch.from_numpy(ndarray).float()
tensor = torch.from_numpy(ndarray.copy()).float() # If ndarray has negative stride
value = tensor.item()
tensor = torch.reshape(tensor, shape)
tensor = tensor[torch.randperm(tensor.size(0))] # Shuffle the first dimension
# Assume tensor has shape N*D*H*W.tensor = tensor[:, :, :, torch.arange(tensor.size(3) - 1, -1, -1).long()]
# Operation | New/Shared memory | Still in computation graph |
tensor.clone() # | New | Yes |
tensor.detach() # | Shared | No |
tensor.detach.clone()() # | New | No |
tensor = torch.cat(list_of_tensors, dim=0)
tensor = torch.stack(list_of_tensors, dim=0)
N = tensor.size(0)
one_hot = torch.zeros(N, num_classes).long()
one_hot.scatter_(dim=1, index=torch.unsqueeze(tensor, dim=1), src=torch.ones(N, num_classes).long())
torch.nonzero(tensor) # Index of non-zero elements
torch.nonzero(tensor == 0) # Index of zero elements
torch.nonzero(tensor).size(0) # Number of non-zero elements
torch.nonzero(tensor == 0).size(0) # Number of zero elements
# Expand tensor of shape 64*512 to shape 64*512*7*7.
torch.reshape(tensor, (64, 512, 1, 1)).expand(64, 512, 7, 7)
# Matrix multiplication: (m*n) * (n*p) -> (m*p).
result = torch.mm(tensor1, tensor2)
# Batch matrix multiplication: (b*m*n) * (b*n*p) -> (b*m*p).
result = torch.bmm(tensor1, tensor2)
# Element-wise multiplication.
result = tensor1 * tensor2
# X1 is of shape m*d.
X1 = torch.unsqueeze(X1, dim=1).expand(m, n, d)
# X2 is of shape n*d.
X2 = torch.unsqueeze(X2, dim=0).expand(m, n, d)
# dist is of shape m*n, where dist[i][j] = sqrt(|X1[i, :] - X[j, :]|^2)
dist = torch.sqrt(torch.sum((X1 - X2) ** 2, dim=2))
conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=True)conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True)
gap = torch.nn.AdaptiveAvgPool2d(output_size=1)
X = torch.reshape(N, D, H * W) # Assume X has shape N*D*H*W
X = torch.bmm(X, torch.transpose(X, 1, 2)) / (H * W) # Bilinear pooling
assert X.size() == (N, D, D)
X = torch.reshape(X, (N, D * D))
X = torch.sign(X) * torch.sqrt(torch.abs(X) + 1e-5) # Signed-sqrt normalization
X = torch.nn.functional.normalize(X) # L2 normalization
class BN(torch.nn.Module)
def __init__(self):
...
self.register_buffer( running_mean , torch.zeros(num_features))
def forward(self, X):
...
self.running_mean += momentum * (current - self.running_mean)
num_parameters = sum(torch.numel(parameter) for parameter in model.parameters())
# Common practise for initialization.
for layer in model.modules():
if isinstance(layer, torch.nn.Conv2d):
torch.nn.init.kaiming_normal_(layer.weight, mode= fan_out ,
nonlinearity= relu )
if layer.bias is not None:
torch.nn.init.constant_(layer.bias, val=0.0)
elif isinstance(layer, torch.nn.BatchNorm2d):
torch.nn.init.constant_(layer.weight, val=1.0)
torch.nn.init.constant_(layer.bias, val=0.0)
elif isinstance(layer, torch.nn.Linear):
torch.nn.init.xavier_normal_(layer.weight)
if layer.bias is not None:
torch.nn.init.constant_(layer.bias, val=0.0)
# Initialization with given tensor.
layer.weight = torch.nn.Parameter(tensor)
model.load_state_dict(torch.load( model,pth ), strict=False)
model.load_state_dict(torch.load( model,pth , map_location= cpu ))
# VGG-16 relu5-3 feature.
model = torchvision.models.vgg16(pretrained=True).features[:-1]
# VGG-16 pool5 feature.
model = torchvision.models.vgg16(pretrained=True).features
# VGG-16 fc7 feature.
model = torchvision.models.vgg16(pretrained=True)
model.classifier = torch.nn.Sequential(*list(model.classifier.children())[:-3])
# ResNet GAP feature.
model = torchvision.models.resnet18(pretrained=True)
model = torch.nn.Sequential(collections.OrderedDict(
list(model.named_children())[:-1]))
with torch.no_grad():
model.eval()
conv_representation = model(image)
class FeatureExtractor(torch.nn.Module):
"""Helper class to extract several convolution features from the given
pre-trained model.
Attributes:
_model, torch.nn.Module.
_layers_to_extract, list or set
Example:
>>> model = torchvision.models.resnet152(pretrained=True)
>>> model = torch.nn.Sequential(collections.OrderedDict(
list(model.named_children())[:-1]))
>>> conv_representation = FeatureExtractor(
pretrained_model=model,
layers_to_extract={ layer1 , layer2 , layer3 , layer4 })(image)
"""
def __init__(self, pretrained_model, layers_to_extract):
torch.nn.Module.__init__(self)
self._model = pretrained_model
self._model.eval()
self._layers_to_extract = set(layers_to_extract)
def forward(self, x):
with torch.no_grad():
conv_representation = []
for name, layer in self._model.named_children():
x = layer(x)
if name in self._layers_to_extract:
conv_representation.append(x)
return conv_representation
model = torchvision.models.resnet18(pretrained=True)
for param in model.parameters():
param.requires_grad = False
model.fc = nn.Linear(512, 100) # Replace the last fc layer
optimizer = torch.optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9, weight_decay=1e-4)
model = torchvision.models.resnet18(pretrained=True)
finetuned_parameters = list(map(id, model.fc.parameters()))
conv_parameters = (p for p in model.parameters() if id(p) not in finetuned_parameters)
parameters = [{ params : conv_parameters, lr : 1e-3},
{ params : model.fc.parameters()}]
optimizer = torch.optim.SGD(parameters, lr=1e-2, momentum=0.9, weight_decay=1e-4)
train_transform = torchvision.transforms.Compose([
torchvision.transforms.RandomResizedCrop(size=224,
scale=(0.08, 1.0)),
torchvision.transforms.RandomHorizontalFlip(),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize(mean=(0.485, 0.456, 0.406),
std=(0.229, 0.224, 0.225)),
])
val_transform = torchvision.transforms.Compose([
torchvision.transforms.Resize(224),
torchvision.transforms.CenterCrop(224),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize(mean=(0.485, 0.456, 0.406),
std=(0.229, 0.224, 0.225)),
])
for t in epoch(80):
for images, labels in tqdm.tqdm(train_loader, desc= Epoch %3d % (t + 1)):
images, labels = images.cuda(), labels.cuda()
scores = model(images)
loss = loss_function(scores, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
for images, labels in train_loader:
images, labels = images.cuda(), labels.cuda()
N = labels.size(0)
# C is the number of classes.
smoothed_labels = torch.full(size=(N, C), fill_value=0.1 / (C - 1)).cuda()
smoothed_labels.scatter_(dim=1, index=torch.unsqueeze(labels, dim=1), value=0.9)
score = model(images)
log_prob = torch.nn.functional.log_softmax(score, dim=1)
loss = -torch.sum(log_prob * smoothed_labels) / N
optimizer.zero_grad()
loss.backward()
optimizer.step()
beta_distribution = torch.distributions.beta.Beta(alpha, alpha)
for images, labels in train_loader:
images, labels = images.cuda(), labels.cuda()
# Mixup images.
lambda_ = beta_distribution.sample([]).item()
index = torch.randperm(images.size(0)).cuda()
mixed_images = lambda_ * images + (1 - lambda_) * images[index, :]
# Mixup loss.
scores = model(mixed_images)
loss = (lambda_ * loss_function(scores, labels)
+ (1 - lambda_) * loss_function(scores, labels[index]))
optimizer.zero_grad()
loss.backward()
optimizer.step()
l1_regularization = torch.nn.L1Loss(reduction= sum )
loss = ... # Standard cross-entropy loss
for param in model.parameters():
loss += torch.sum(torch.abs(param))
loss.backward()
bias_list = (param for name, param in model.named_parameters() if name[-4:] == bias )
others_list = (param for name, param in model.named_parameters() if name[-4:] != bias )
parameters = [{ parameters : bias_list, weight_decay : 0},
{ parameters : others_list}]
optimizer = torch.optim.SGD(parameters, lr=1e-2, momentum=0.9, weight_decay=1e-4)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=20)
score = model(images)
prediction = torch.argmax(score, dim=1)
num_correct = torch.sum(prediction == labels).item()
accuruacy = num_correct / labels.size(0)
链接:https://github.com/szagoruyko/pytorchviz
有 Facebook 自己开发的 Visdom 和 Tensorboard 两个选择。
https://github.com/facebookresearch/visdom
https://github.com/lanpa/tensorboard
# Example using Visdom.
vis = visdom.Visdom(env= Learning curve , use_incoming_socket=False)
assert self._visdom.check_connection()
self._visdom.close()
options = collections.namedtuple( Options , [ loss , acc , lr ])(
loss={ xlabel : Epoch , ylabel : Loss , showlegend : True},
acc={ xlabel : Epoch , ylabel : Accuracy , showlegend : True},
lr={ xlabel : Epoch , ylabel : Learning rate , showlegend : True})
for t in epoch(80):
tran(...)
val(...)
vis.line(X=torch.Tensor([t + 1]), Y=torch.Tensor([train_loss]),
name= train , win= Loss , update= append , opts=options.loss)
vis.line(X=torch.Tensor([t + 1]), Y=torch.Tensor([val_loss]),
name= val , win= Loss , update= append , opts=options.loss)
vis.line(X=torch.Tensor([t + 1]), Y=torch.Tensor([train_acc]),
name= train , win= Accuracy , update= append , opts=options.acc)
vis.line(X=torch.Tensor([t + 1]), Y=torch.Tensor([val_acc]),
name= val , win= Accuracy , update= append , opts=options.acc)
vis.line(X=torch.Tensor([t + 1]), Y=torch.Tensor([lr]),
win= Learning rate , update= append , opts=options.lr)
# If there is one global learning rate (which is the common case).
lr = next(iter(optimizer.param_groups))[ lr ]
# If there are multiple learning rates for different layers.
all_lr = []
for param_group in optimizer.param_groups:
all_lr.append(param_group[ lr ])
# Reduce learning rate when validation accuarcy plateau.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode= max , patience=5, verbose=True)
for t in range(0, 80):
train(...); val(...)
scheduler.step(val_acc)
# Cosine annealing learning rate.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=80)
# Reduce learning rate by 10 at given epochs.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50, 70], gamma=0.1)
for t in range(0, 80):
scheduler.step()
train(...); val(...)
# Learning rate warmup by 10 epochs.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda t: t / 10)
for t in range(0, 10):
scheduler.step()
train(...); val(...)
# Save checkpoint.
is_best = current_acc > best_acc
best_acc = max(best_acc, current_acc)
checkpoint = {
best_acc : best_acc,
epoch : t + 1,
model : model.state_dict(),
optimizer : optimizer.state_dict(),
}
model_path = os.path.join( model , checkpoint.pth.tar )
torch.save(checkpoint, model_path)
if is_best:
shutil.copy( checkpoint.pth.tar , model_path)
# Load checkpoint.
if resume:
model_path = os.path.join( model , checkpoint.pth.tar )
assert os.path.isfile(model_path)
checkpoint = torch.load(model_path)
best_acc = checkpoint[ best_acc ]
start_epoch = checkpoint[ epoch ]
model.load_state_dict(checkpoint[ model ])
optimizer.load_state_dict(checkpoint[ optimizer ])
print( Load checkpoint at epoch %d. % start_epoch)
# data[ label ] and data[ prediction ] are groundtruth label and prediction
# for each image, respectively.
accuracy = np.mean(data[ label ] == data[ prediction ]) * 100
# Compute recision and recall for each class.
for c in range(len(num_classes)):
tp = np.dot((data[ label ] == c).astype(int),
(data[ prediction ] == c).astype(int))
tp_fp = np.sum(data[ prediction ] == c)
tp_fn = np.sum(data[ label ] == c)
precision = tp / tp_fp * 100
recall = tp / tp_fn * 100
def forward(self, x):
...
x = torch.nn.functional.dropout(x, p=0.5, training=self.training)
torch.no_grad()
model.train()
model.eval()
x = torch.nn.functional.relu(x, inplace=True)
with torch.autograd.profiler.profile(enabled=True, use_cuda=False) as profile:
...
print(profile)
或者在命令行运行
python -m torch.utils.bottleneck main.py