PC-DARTS 仍出自华为诺亚方舟,相比前作 PDARTS 更加优雅。额外引入的一组权重参数可以提高性能。
if not torch.cuda.is_available():
logging.info('no gpu device available')
sys.exit(1)
np.random.seed(args.seed)
torch.cuda.set_device(args.gpu)
cudnn.benchmark = True
torch.manual_seed(args.seed)
cudnn.enabled=True
torch.cuda.manual_seed(args.seed)
logging.info('gpu device = %d' % args.gpu)
logging.info("args = %s", args)
Network 和 quark0/darts 中一样仅创建一次。
criterion = nn.CrossEntropyLoss()
criterion = criterion.cuda()
model = Network(args.init_channels, CIFAR_CLASSES, args.layers, criterion)
model = model.cuda()
logging.info("param size = %fMB", utils.count_parameters_in_MB(model))
optimizer = torch.optim.SGD(
model.parameters(),
args.learning_rate,
momentum=args.momentum,
weight_decay=args.weight_decay)
在验证集上训练结构参数并测试。
train_transform, valid_transform = utils._data_transforms_cifar10(args)
if args.set=='cifar100':
train_data = dset.CIFAR100(root=args.data, train=True, download=True, transform=train_transform)
else:
train_data = dset.CIFAR10(root=args.data, train=True, download=True, transform=train_transform)
num_train = len(train_data)
indices = list(range(num_train))
split = int(np.floor(args.train_portion * num_train))
train_queue = torch.utils.data.DataLoader(
train_data, batch_size=args.batch_size,
sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
pin_memory=True, num_workers=2)
valid_queue = torch.utils.data.DataLoader(
train_data, batch_size=args.batch_size,
sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
pin_memory=True, num_workers=2)
scheduler 放置的位置不好。
Architect 封装了模型结构调整。
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
optimizer, float(args.epochs), eta_min=args.learning_rate_min)
architect = Architect(model, args)
for epoch in range(args.epochs):
scheduler.step()
lr = scheduler.get_lr()[0]
logging.info('epoch %d lr %e', epoch, lr)
genotype = model.genotype()
logging.info('genotype = %s', genotype)
#print(F.softmax(model.alphas_normal, dim=-1))
#print(F.softmax(model.alphas_reduce, dim=-1))
# training
train_acc, train_obj = train(train_queue, valid_queue, model, architect, criterion, optimizer, lr,epoch)
logging.info('train_acc %f', train_acc)
# validation
if args.epochs-epoch<=1:
valid_acc, valid_obj = infer(valid_queue, model, criterion)
logging.info('valid_acc %f', valid_acc)
utils.save(model, os.path.join(args.save, 'weights.pt'))
Architect 包含网络以及结构参数优化器,训练时负责对网络结构进行调整。
Architect 直接影响了多 GPU 调用。
def __init__(self, model, args):
# Keep the *network* optimizer's momentum/weight-decay so the unrolled
# (second-order) step can reproduce the SGD weight update analytically.
self.network_momentum = args.momentum
self.network_weight_decay = args.weight_decay
self.model = model
# A separate Adam optimizer that updates only the architecture
# parameters (the alphas and betas), not the network weights.
self.optimizer = torch.optim.Adam(self.model.arch_parameters(),
lr=args.arch_learning_rate, betas=(0.5, 0.999), weight_decay=args.arch_weight_decay)
二阶优化。
loss = self.model._loss(input, target)
theta = _concat(self.model.parameters()).data
try:
moment = _concat(network_optimizer.state[v]['momentum_buffer'] for v in self.model.parameters()).mul_(self.network_momentum)
except:
moment = torch.zeros_like(theta)
dtheta = _concat(torch.autograd.grad(loss, self.model.parameters())).data + self.network_weight_decay*theta
unrolled_model = self._construct_model_from_theta(theta.sub(eta, moment+dtheta))
return unrolled_model
self.optimizer.zero_grad()
if unrolled:
self._backward_step_unrolled(input_train, target_train, input_valid, target_valid, eta, network_optimizer)
else:
self._backward_step(input_valid, target_valid)
self.optimizer.step()
loss = self.model._loss(input_valid, target_valid)
loss.backward()
unrolled_model = self._compute_unrolled_model(input_train, target_train, eta, network_optimizer)
unrolled_loss = unrolled_model._loss(input_valid, target_valid)
unrolled_loss.backward()
dalpha = [v.grad for v in unrolled_model.arch_parameters()]
vector = [v.grad.data for v in unrolled_model.parameters()]
implicit_grads = self._hessian_vector_product(vector, input_train, target_train)
for g, ig in zip(dalpha, implicit_grads):
g.data.sub_(eta, ig.data)
for v, g in zip(self.model.arch_parameters(), dalpha):
if v.grad is None:
v.grad = Variable(g.data)
else:
v.grad.data.copy_(g.data)
model_new = self.model.new()
model_dict = self.model.state_dict()
params, offset = {}, 0
for k, v in self.model.named_parameters():
v_length = np.prod(v.size())
params[k] = theta[offset: offset+v_length].view(v.size())
offset += v_length
assert offset == len(theta)
model_dict.update(params)
model_new.load_state_dict(model_dict)
return model_new.cuda()
R = r / _concat(vector).norm()
for p, v in zip(self.model.parameters(), vector):
p.data.add_(R, v)
loss = self.model._loss(input, target)
grads_p = torch.autograd.grad(loss, self.model.arch_parameters())
for p, v in zip(self.model.parameters(), vector):
p.data.sub_(2*R, v)
loss = self.model._loss(input, target)
grads_n = torch.autograd.grad(loss, self.model.arch_parameters())
for p, v in zip(self.model.parameters(), vector):
p.data.add_(R, v)
return [(x-y).div_(2*R) for x, y in zip(grads_p, grads_n)]
与 quark0/darts 相同,而看不到 chenxin061/pdarts 的影子。
self.stem0 包含两组卷积,与 self.stem1 一起连续缩减特征图。
def __init__(self, C, num_classes, layers, criterion, steps=4, multiplier=4, stem_multiplier=3):
# Search-phase supernetwork.
#   C: initial channel count; layers: number of cells;
#   steps: intermediate nodes per cell;
#   multiplier: how many node outputs are concatenated into a cell's output;
#   stem_multiplier: stem output width is stem_multiplier*C channels.
super(Network, self).__init__()
self._C = C
self._num_classes = num_classes
self._layers = layers
self._criterion = criterion
self._steps = steps
self._multiplier = multiplier
C_curr = stem_multiplier*C
# stem0: two stride-2 convs (3 -> C_curr//2 -> C_curr), reducing the
# spatial size 4x before the cells.
self.stem0 = nn.Sequential(
nn.Conv2d(3, C_curr // 2, kernel_size=3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(C_curr // 2),
nn.ReLU(inplace=True),
nn.Conv2d(C_curr // 2, C_curr, 3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(C_curr),
)
# stem1: one more stride-2 ReLU-Conv-BN for a further 2x reduction.
self.stem1 = nn.Sequential(
nn.ReLU(inplace=True),
nn.Conv2d(C_curr, C_curr, 3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(C_curr),
)
构造单元。
# Build the cell stack; channels double at the two reduction cells
# (positions layers//3 and 2*layers//3).
C_prev_prev, C_prev, C_curr = C_curr, C_curr, C
self.cells = nn.ModuleList()
reduction_prev = True
for i in range(layers):
if i in [layers//3, 2*layers//3]:
C_curr *= 2
reduction = True
else:
reduction = False
cell = Cell(steps, multiplier, C_prev_prev, C_prev, C_curr, reduction, reduction_prev)
reduction_prev = reduction
self.cells += [cell]
# Each cell emits multiplier*C_curr channels (node outputs concatenated).
C_prev_prev, C_prev = C_prev, multiplier*C_curr
self.global_pooling = nn.AdaptiveAvgPool2d(1)
self.classifier = nn.Linear(C_prev, num_classes)
# Create the architecture parameters (alphas_* and betas_*).
self._initialize_alphas()
创建网络,将结构参数复制过去。
model_new = Network(self._C, self._num_classes, self._layers, self._criterion).cuda()
for x, y in zip(model_new.arch_parameters(), self.arch_parameters()):
x.data.copy_(y.data)
return model_new
weights 和 weights2 命名有点随意。
s0 = self.stem0(input)
s1 = self.stem1(s0)
for i, cell in enumerate(self.cells):
if cell.reduction:
weights = F.softmax(self.alphas_reduce, dim=-1)
n = 3
start = 2
weights2 = F.softmax(self.betas_reduce[0:2], dim=-1)
for i in range(self._steps-1):
end = start + n
tw2 = F.softmax(self.betas_reduce[start:end], dim=-1)
start = end
n += 1
weights2 = torch.cat([weights2,tw2],dim=0)
else:
weights = F.softmax(self.alphas_normal, dim=-1)
n = 3
start = 2
weights2 = F.softmax(self.betas_normal[0:2], dim=-1)
for i in range(self._steps-1):
end = start + n
tw2 = F.softmax(self.betas_normal[start:end], dim=-1)
start = end
n += 1
weights2 = torch.cat([weights2,tw2],dim=0)
s0, s1 = s1, cell(s0, s1, weights,weights2)
out = self.global_pooling(s1)
logits = self.classifier(out.view(out.size(0),-1))
return logits
logits = self(input)
return self._criterion(logits, target)
函数名称未变,但已经是两组参数了。
k = sum(1 for i in range(self._steps) for n in range(2+i))
num_ops = len(PRIMITIVES)
self.alphas_normal = Variable(1e-3*torch.randn(k, num_ops).cuda(), requires_grad=True)
self.alphas_reduce = Variable(1e-3*torch.randn(k, num_ops).cuda(), requires_grad=True)
self.betas_normal = Variable(1e-3*torch.randn(k).cuda(), requires_grad=True)
self.betas_reduce = Variable(1e-3*torch.randn(k).cuda(), requires_grad=True)
self._arch_parameters = [
self.alphas_normal,
self.alphas_reduce,
self.betas_normal,
self.betas_reduce,
]
return self._arch_parameters
在架构搜索完成后,边缘 $(i,j)$ 的连接由 $\left\{\alpha_{i,j}^o\right\}$ 和 $\beta_{i,j}$ 决定,二者各自归一化后相乘:
$$\frac{\exp\left\{\beta_{i,j}\right\}}{\sum_{i'<j}\exp\left\{\beta_{i',j}\right\}}\cdot\frac{\exp\left\{\alpha_{i,j}^o\right\}}{\sum_{o'\in\mathcal{O}}\exp\left\{\alpha_{i,j}^{o'}\right\}}$$
然后和 DARTS 中一样通过找到最大边权重来选择边缘。由于 $\beta_{i,j}$ 在整个训练过程中共享,学习到的网络架构不易受迭代间通道采样的影响,从而使架构搜索更加稳定。
论文中的解释有些牵强,因为通道采样其实是固定的。而修剪边缘时像 DARTS 那样直接使用 Softmax 不太合理,加一个可学习参数进行层内选择。
嵌套定义 _parse 函数。
对于每层,由归一化后的参数乘积排序,取最大的两个非空操作为边。每条边再确定最佳操作。
def _parse(weights,weights2):
# Decode a gene from the softmaxed op weights (`weights`: one row per
# edge, one column per op) and the softmaxed edge weights (`weights2`:
# one scalar per edge). For each intermediate node: scale every incoming
# edge's op row by its edge weight, keep the two edges whose strongest
# non-'none' op is largest, then pick that best op on each kept edge.
gene = []
n = 2
start = 0
for i in range(self._steps):
end = start + n
W = weights[start:end].copy()
W2 = weights2[start:end].copy()
for j in range(n):
# Combine alpha and beta: weight each edge's op row by its beta.
W[j,:] = W[j,:]*W2[j]
# Rank candidate input edges by their best non-'none' op weight.
edges = sorted(range(i + 2), key=lambda x: -max(W[x][k] for k in range(len(W[x])) if k != PRIMITIVES.index('none')))[:2]
#edges = sorted(range(i + 2), key=lambda x: -W2[x])[:2]
for j in edges:
# Pick the strongest op on this edge, excluding 'none'.
k_best = None
for k in range(len(W[j])):
if k != PRIMITIVES.index('none'):
if k_best is None or W[j][k] > W[j][k_best]:
k_best = k
gene.append((PRIMITIVES[k_best], j))
start = end
n += 1
return gene
start = 2
跳过两个输入节点。循环处理 3 个中间节点,统一将同层 $\beta_{i,j}$ 送 Softmax 归一化。
n = 3
start = 2
weightsr2 = F.softmax(self.betas_reduce[0:2], dim=-1)
weightsn2 = F.softmax(self.betas_normal[0:2], dim=-1)
for i in range(self._steps-1):
end = start + n
#print(self.betas_reduce[start:end])
tw2 = F.softmax(self.betas_reduce[start:end], dim=-1)
tn2 = F.softmax(self.betas_normal[start:end], dim=-1)
start = end
n += 1
weightsr2 = torch.cat([weightsr2,tw2],dim=0)
weightsn2 = torch.cat([weightsn2,tn2],dim=0)
分别解析出正常和约减单元的基因型。
gene_normal = _parse(F.softmax(self.alphas_normal, dim=-1).data.cpu().numpy(),weightsn2.data.cpu().numpy())
gene_reduce = _parse(F.softmax(self.alphas_reduce, dim=-1).data.cpu().numpy(),weightsr2.data.cpu().numpy())
concat = range(2+self._steps-self._multiplier, self._steps+2)
genotype = Genotype(
normal=gene_normal, normal_concat=concat,
reduce=gene_reduce, reduce_concat=concat
)
return genotype
Cell 的输出为各层输出的拼接,维度很高,所以在输入加了两个预处理。
def __init__(self, steps, multiplier, C_prev_prev, C_prev, C, reduction, reduction_prev):
# Search-phase cell: one MixedOp per edge, mixing every candidate op.
super(Cell, self).__init__()
self.reduction = reduction
# If the previous cell reduced, s0 presumably has a larger spatial size
# than s1, so preprocess0 must downsample it — TODO confirm against
# FactorizedReduce's definition.
if reduction_prev:
self.preprocess0 = FactorizedReduce(C_prev_prev, C, affine=False)
else:
self.preprocess0 = ReLUConvBN(C_prev_prev, C, 1, 1, 0, affine=False)
self.preprocess1 = ReLUConvBN(C_prev, C, 1, 1, 0, affine=False)
self._steps = steps
self._multiplier = multiplier
self._ops = nn.ModuleList()
self._bns = nn.ModuleList()
# Node i has 2+i incoming edges; in a reduction cell only the edges from
# the two cell inputs (j < 2) use stride 2.
for i in range(self._steps):
for j in range(2+i):
stride = 2 if reduction and j < 2 else 1
op = MixedOp(C, stride)
self._ops.append(op)
weights2 是新加入的 $\beta$:
$$\mathbf{x}_j^{\mathrm{PC}}=\sum_{i<j}\frac{\exp\left\{\beta_{i,j}\right\}}{\sum_{i'<j}\exp\left\{\beta_{i',j}\right\}}\cdot f_{i,j}\!\left(\mathbf{x}_i\right).$$
获取设备的方式略显复杂。
s0 = self.preprocess0(s0)
s1 = self.preprocess1(s1)
states = [s0, s1]
offset = 0
for i in range(self._steps):
s = sum(weights2[offset+j].to(self._ops[offset+j](h, weights[offset+j]).device)*self._ops[offset+j](h, weights[offset+j]) for j, h in enumerate(states))
#s = channel_shuffle(s,4)
offset += len(states)
states.append(s)
return torch.cat(states[-self._multiplier:], dim=1)
PRIMITIVES 中的每个操作处理一半通道。
池化为什么是2x2?
向 PRIMITIVES 中的池化操作后面加 BN。
def __init__(self, C, stride):
# Partial-channel mixed op: every candidate op is built for C//2
# channels; the forward pass feeds it only half of the input channels.
super(MixedOp, self).__init__()
self._ops = nn.ModuleList()
# Used in forward to downsample the untouched half when the ops stride.
self.mp = nn.MaxPool2d(2,2)
#self.bn=nn.BatchNorm2d(C//4, affine=False)
#self.conv1 = nn.Conv2d(C//4,C//4,kernel_size=1,stride=1,padding=0,bias=False)
for primitive in PRIMITIVES:
op = OPS[primitive](C//2, stride, False)
if 'pool' in primitive:
# Append a (non-affine) BatchNorm after pooling primitives.
op = nn.Sequential(op, nn.BatchNorm2d(C//2, affine=False))
self._ops.append(op)
按通道切分输入,x = [xtemp, xtemp2]。xtemp3 和 xtemp4 没有用到。混合操作处理前一半输入,得到 temp1。model_search.py 中为 1/4。
dim_2 = x.shape[1]
xtemp = x[ : , : dim_2//2, :, :]
xtemp2 = x[ : , dim_2//2:, :, :]
xtemp3 = x[:,dim_2// 4:dim_2// 2, :, :]
xtemp4 = x[:,dim_2// 2:, :, :]
temp1 = sum(w.to(xtemp.device) * op(xtemp) for w, op in zip(weights, self._ops))
if temp1.shape[2] == x.shape[2]:
#ans = torch.cat([temp1,self.bn(self.conv1(xtemp3))],dim=1)
#ans = torch.cat([ans,xtemp4],dim=1)
ans = torch.cat([temp1,xtemp2],dim=1)
#ans = torch.cat([ans,x[:, 2*dim_2// 4: , :, :]],dim=1)
else:
#ans = torch.cat([temp1,self.bn(self.conv1(self.mp(xtemp3)))],dim=1)
#ans = torch.cat([ans,self.mp(xtemp4)],dim=1)
ans = torch.cat([temp1,self.mp(xtemp2)], dim=1)
ans = channel_shuffle(ans,2)
return ans
#return sum(w.to(x.device) * op(x) for w, op in zip(weights, self._ops))
torch.transpose 返回 input 的转置版本:交换给定维度 dim0 和 dim1。由此产生的张量与输入张量共享底层存储,因此改变其中一个的内容将同时改变另一个。channel_shuffle 先将通道分组:
batchsize, num_channels, height, width = x.data.size()
channels_per_group = num_channels // groups
# reshape
x = x.view(batchsize, groups,
channels_per_group, height, width)
x = torch.transpose(x, 1, 2).contiguous()
# flatten
x = x.view(batchsize, -1, height, width)
return x
代码未区分搜索和训练所用的类,有很多重名的。以下为训练程序。
if not torch.cuda.is_available():
logging.info('No GPU device available')
sys.exit(1)
np.random.seed(args.seed)
cudnn.benchmark = True
torch.manual_seed(args.seed)
cudnn.enabled=True
torch.cuda.manual_seed(args.seed)
eval 函数用来执行一个字符串表达式,并返回表达式的值。
从 genotypes.py 中获取指定的结构。
NetworkImageNet 与搜索时的 Network 不同。各自的Cell
类定义也不同。
logging.info("args = %s", args)
logging.info("unparsed_args = %s", unparsed)
num_gpus = torch.cuda.device_count()
genotype = eval("genotypes.%s" % args.arch)
print('---------Genotype---------')
logging.info(genotype)
print('--------------------------')
model = Network(args.init_channels, CLASSES, args.layers, args.auxiliary, genotype)
if num_gpus > 1:
model = nn.DataParallel(model)
model = model.cuda()
else:
model = model.cuda()
logging.info("param size = %fMB", utils.count_parameters_in_MB(model))
criterion = nn.CrossEntropyLoss()
criterion = criterion.cuda()
criterion_smooth = CrossEntropyLabelSmooth(CLASSES, args.label_smooth)
criterion_smooth = criterion_smooth.cuda()
optimizer = torch.optim.SGD(
model.parameters(),
args.learning_rate,
momentum=args.momentum,
weight_decay=args.weight_decay
)
data_dir = os.path.join(args.tmp_data_dir, 'imagenet')
traindir = os.path.join(data_dir, 'train')
validdir = os.path.join(data_dir, 'val')
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
train_data = dset.ImageFolder(
traindir,
transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ColorJitter(
brightness=0.4,
contrast=0.4,
saturation=0.4,
hue=0.2),
transforms.ToTensor(),
normalize,
]))
valid_data = dset.ImageFolder(
validdir,
transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
]))
train_queue = torch.utils.data.DataLoader(
train_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=args.workers)
valid_queue = torch.utils.data.DataLoader(
valid_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=args.workers)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.decay_period, gamma=args.gamma)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, float(args.epochs))
支持两种学习率调整策略。adjust_lr 线性减少。
前5个 epoch 学习率预热。
best_acc_top1 = 0
best_acc_top5 = 0
for epoch in range(args.epochs):
if args.lr_scheduler == 'cosine':
scheduler.step()
current_lr = scheduler.get_lr()[0]
elif args.lr_scheduler == 'linear':
current_lr = adjust_lr(optimizer, epoch)
else:
print('Wrong lr type, exit')
sys.exit(1)
logging.info('Epoch: %d lr %e', epoch, current_lr)
if epoch < 5 and args.batch_size > 256:
for param_group in optimizer.param_groups:
param_group['lr'] = current_lr * (epoch + 1) / 5.0
logging.info('Warming-up Epoch: %d, LR: %e', epoch, current_lr * (epoch + 1) / 5.0)
if num_gpus > 1:
model.module.drop_path_prob = args.drop_path_prob * epoch / args.epochs
else:
model.drop_path_prob = args.drop_path_prob * epoch / args.epochs
epoch_start = time.time()
train_acc, train_obj = train(train_queue, model, criterion_smooth, optimizer)
logging.info('Train_acc: %f', train_acc)
测试并保存。
valid_acc_top1, valid_acc_top5, valid_obj = infer(valid_queue, model, criterion)
logging.info('Valid_acc_top1: %f', valid_acc_top1)
logging.info('Valid_acc_top5: %f', valid_acc_top5)
epoch_duration = time.time() - epoch_start
logging.info('Epoch time: %ds.', epoch_duration)
is_best = False
if valid_acc_top5 > best_acc_top5:
best_acc_top5 = valid_acc_top5
if valid_acc_top1 > best_acc_top1:
best_acc_top1 = valid_acc_top1
is_best = True
utils.save_checkpoint({
'epoch': epoch + 1,
'state_dict': model.state_dict(),
'best_acc_top1': best_acc_top1,
'optimizer' : optimizer.state_dict(),
}, is_best, args.save)
objs = utils.AvgrageMeter()
top1 = utils.AvgrageMeter()
top5 = utils.AvgrageMeter()
batch_time = utils.AvgrageMeter()
logits_aux 为辅助头的输出。
torch.nn.utils.clip_grad_norm_ 剪辑可迭代参数的梯度范数。在所有梯度上一起计算范数,就好像它们被连接成单个矢量一样。梯度是原地修改的。
model.train()
for step, (input, target) in enumerate(train_queue):
target = target.cuda(non_blocking=True)
input = input.cuda(non_blocking=True)
b_start = time.time()
optimizer.zero_grad()
logits, logits_aux = model(input)
loss = criterion(logits, target)
if args.auxiliary:
loss_aux = criterion(logits_aux, target)
loss += args.auxiliary_weight*loss_aux
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
optimizer.step()
batch_time.update(time.time() - b_start)
prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
n = input.size(0)
objs.update(loss.data.item(), n)
top1.update(prec1.data.item(), n)
top5.update(prec5.data.item(), n)
if step % args.report_freq == 0:
end_time = time.time()
if step == 0:
duration = 0
start_time = time.time()
else:
duration = end_time - start_time
start_time = time.time()
logging.info('TRAIN Step: %03d Objs: %e R1: %f R5: %f Duration: %ds BTime: %.3fs',
step, objs.avg, top1.avg, top5.avg, duration, batch_time.avg)
return top1.avg, objs.avg
self.stem0 为 Conv-BN-ReLU-Conv-BN,self.stem1 为 ReLU-Conv-BN。
def __init__(self, C, num_classes, layers, auxiliary, genotype):
# Evaluation network for ImageNet, built from a fixed `genotype`.
#   C: initial channels; layers: number of cells;
#   auxiliary: whether to attach an auxiliary classification head.
super(NetworkImageNet, self).__init__()
self._layers = layers
self._auxiliary = auxiliary
# stem0: Conv-BN-ReLU-Conv-BN with two stride-2 convs (3 -> C//2 -> C).
self.stem0 = nn.Sequential(
nn.Conv2d(3, C // 2, kernel_size=3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(C // 2),
nn.ReLU(inplace=True),
nn.Conv2d(C // 2, C, 3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(C),
)
# stem1: ReLU-Conv-BN with one more stride-2 conv.
self.stem1 = nn.Sequential(
nn.ReLU(inplace=True),
nn.Conv2d(C, C, 3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(C),
)
Cell 构建单元。
# Build the cell stack; channels double at the two reduction cells.
C_prev_prev, C_prev, C_curr = C, C, C
self.cells = nn.ModuleList()
reduction_prev = True
for i in range(layers):
if i in [layers // 3, 2 * layers // 3]:
C_curr *= 2
reduction = True
else:
reduction = False
cell = Cell(genotype, C_prev_prev, C_prev, C_curr, reduction, reduction_prev)
reduction_prev = reduction
self.cells += [cell]
C_prev_prev, C_prev = C_prev, cell.multiplier * C_curr
# Remember the width feeding the auxiliary head (after the 2nd reduction).
if i == 2 * layers // 3:
C_to_auxiliary = C_prev
AuxiliaryHeadImageNet 似乎只是增加了网络深度。
if auxiliary:
self.auxiliary_head = AuxiliaryHeadImageNet(C_to_auxiliary, num_classes)
# Fixed 7x7 average pooling before the linear classifier.
self.global_pooling = nn.AvgPool2d(7)
self.classifier = nn.Linear(C_prev, num_classes)
logits_aux = None
s0 = self.stem0(input)
s1 = self.stem1(s0)
for i, cell in enumerate(self.cells):
s0, s1 = s1, cell(s0, s1, self.drop_path_prob)
if i == 2 * self._layers // 3:
if self._auxiliary and self.training:
logits_aux = self.auxiliary_head(s1)
out = self.global_pooling(s1)
logits = self.classifier(out.view(out.size(0), -1))
return logits, logits_aux
_compile 解析出基因型。
def __init__(self, genotype, C_prev_prev, C_prev, C, reduction, reduction_prev):
# Evaluation cell: ops and connections are fixed by `genotype`.
super(Cell, self).__init__()
print(C_prev_prev, C_prev, C)
# If the previous cell reduced, preprocess0 downsamples s0 to match.
if reduction_prev:
self.preprocess0 = FactorizedReduce(C_prev_prev, C)
else:
self.preprocess0 = ReLUConvBN(C_prev_prev, C, 1, 1, 0)
self.preprocess1 = ReLUConvBN(C_prev, C, 1, 1, 0)
# Reduction and normal cells decode different halves of the genotype.
if reduction:
op_names, indices = zip(*genotype.reduce)
concat = genotype.reduce_concat
else:
op_names, indices = zip(*genotype.normal)
concat = genotype.normal_concat
# Instantiate the listed ops and record their input indices.
self._compile(C, op_names, indices, concat, reduction)
_compile 将操作加入 self._ops 列表中。每一步两个操作;当操作的输入为前两个节点(self.preprocess0 和 self.preprocess1 的输出)时,可能需要缩小特征图。
assert len(op_names) == len(indices)
self._steps = len(op_names) // 2
self._concat = concat
self.multiplier = len(concat)
self._ops = nn.ModuleList()
for name, index in zip(op_names, indices):
stride = 2 if reduction and index < 2 else 1
op = OPS[name](C, stride, True)
self._ops += [op]
self._indices = indices
每层两个操作。根据 self._indices 取出操作输入的索引。训练时对非 Identity 的操作应用 drop_path。
s0 = self.preprocess0(s0)
s1 = self.preprocess1(s1)
states = [s0, s1]
for i in range(self._steps):
h1 = states[self._indices[2*i]]
h2 = states[self._indices[2*i+1]]
op1 = self._ops[2*i]
op2 = self._ops[2*i+1]
h1 = op1(h1)
h2 = op2(h2)
if self.training and drop_prob > 0.:
if not isinstance(op1, Identity):
h1 = drop_path(h1, drop_prob)
if not isinstance(op2, Identity):
h2 = drop_path(h2, drop_prob)
s = h1 + h2
states += [s]
return torch.cat([states[i] for i in self._concat], dim=1)
def __init__(self, C, num_classes):
"""assuming input size 14x14"""
# Auxiliary classifier: 14x14 input -> 5x5 after the stride-2 avg-pool,
# then 1x1 and 2x2 convs produce a 768-channel feature for the linear head.
super(AuxiliaryHeadImageNet, self).__init__()
self.features = nn.Sequential(
nn.ReLU(inplace=True),
nn.AvgPool2d(5, stride=2, padding=0, count_include_pad=False),
nn.Conv2d(C, 128, 1, bias=False),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.Conv2d(128, 768, 2, bias=False),
# NOTE: This batchnorm was omitted in my earlier implementation due to a typo.
# Commenting it out for consistency with the experiments in the paper.
# nn.BatchNorm2d(768),
nn.ReLU(inplace=True)
)
self.classifier = nn.Linear(768, num_classes)
x = self.features(x)
x = self.classifier(x.view(x.size(0),-1))
return x
torch.Tensor.div_ 原地除法。
torch.Tensor.mul_ 原地乘法。
torch.Tensor.bernoulli_ 用 $\mathrm{Bernoulli}(p)$ 的独立样本填充 self 的每个位置;self 可以是整型 dtype。
if drop_prob > 0.:
keep_prob = 1.-drop_prob
mask = Variable(torch.cuda.FloatTensor(x.size(0), 1, 1, 1).bernoulli_(keep_prob))
x.div_(keep_prob)
x.mul_(mask)
return x