ViT(TransReID)模型各阶段形状

文章首发及后续更新:https://mwhls.top/3747.html,无图/无目录/格式错误/更多相关请至首发页查看。
新的更新内容请到mwhls.top查看。
欢迎提出任何疑问及批评,非常感谢!

之前看TransReID代码时的记录,他们代码写的很全,训练测试都有。
这段时间在这个基础上改了点代码,暂时没出现模型理解错的地方。
ViT外的改进没有记录。

有错误或疑问请留言,谢谢。

目录
1. 代码
2. 参数设置:
3. 模型构造
4. class build_transformer() – model/make_model.py
5. class TransReID() – model/backbones/vit_pytorch.py
6. class PatchEmbed() – model/backbones/vit_pytorch.py
7. class Block() – model/backbones/vit_pytorch.py
8. class Attention() – model/backbones/vit_pytorch.py
9. class Mlp() – model/backbones/vit_pytorch.py
10. 损失函数构造 – 略
11. make_loss() – loss/make_loss.py
12. class CenterLoss() – loss/center_loss.py
13. 优化器配置 – 略
14. make_optimizer() – solver/make_optimizer.py
15. 调度器配置 – 略
16. create_scheduler() – solver/scheduler_factory.py
17. 模型训练
18. do_train() – processor/processor.py

代码

  • TransReID-GitHub
  • 下面是TransReID里的ViT结构,不包括TransReID的改进。
  • 代码简写了。

参数设置:

img_size = [H, W, C] = [224, 224, 3] # 图片尺寸,代码中的img_size为[H, W]
patch_size = [16, 16] # patch尺寸
num_patches = 224/16 * 224/16 = 14*14 = 196 # patch数
embed_dim = 768 # 一个patch_embed的长度
batch_size = B # batch_size简写,我选的是16。
camera_num = 1 # 不启用TransReID的相机SIE
view_num = 3 # 启用view_num,但是表示模态数,三模态。

模型构造

class build_transformer() – model/make_model.py
def __init__(self, num_classes, camera_num, view_num, cfg, factory):
  self.base = TransReID()
  self.base.load_param()
  self.gap = nn.AdaptiveAvgPool2d(1)
  self.classifier = arcface/cosface/amsoftmax/circle/nn.Linear(in_planes=768, num_classes, bias=False) 
  self.bottleneck = BatchNorm1d(768)
def forward(self, x, label=None, cam_label= None, view_label=None):
  # x:[B, 3, 224, 224]
  global_feat = self.base(x, cam, view) # ->[B, 768]
  feat = self.bottleneck(global_feat) # ->[B, 768]
  if training:
    cls_score = self.classifier(feat(, label)) # ->[B, num_classes] classifier为nn.Linear时不用(, label)
    return cls_score, global_feat # [B, num_classes], [B, 768] 
  else: 
    return feat/global_feat 
class TransReID() – model/backbones/vit_pytorch.py
def __init__():
  self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) # [1, 1, 768]
  self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) # [1, 197, 768]
  self.patch_embed = PatchEmbed() # [B, 196, 768]
  self.sie_embed = nn.Parameter(torch.zeros(view, 1, embed_dim)) # [3, 1, 768]
  self.pos_drop = nn.Dropout()
  self.blocks = [Block() for i in range(depth)]
  self.norm = norm_layer(embed_dim) # [768]
  self.fc = nn.Linear(embed_dim, num_classes) # [768, 1000]
  trunc_normal_(cls_token与pos_embed)
def forward(self, x, cam_label=None, view_label=None):
  x = self.forward_features(x, cam_label, view_label)
  return x
def forward_features(self, x, camera_id, view_id):
  # x:[B, 3, 224, 224]
  x = self.patch_embed(x) # ->[B, 196, 768],由[B, 3, 224, 224]的img经过切patch及线性投影转变而来
  cls_tokens = self.cls_token.expand(x.shape[0], -1, -1) # ->[1*B, 1, 768], 重复B个cls_tokens
  x = torch.cat((cls_tokens, x), dim=1) # ->[B, 1+196, 768]
  x = x + self.pos_embed + self.sie_xishu * self.sie_embed[view_id] # ->[B, 197, 768],广播合并。
  x = self.pos_drop(x) # 应用dropout
  for blk in self.blocks: # 不使用TransReID的JPM的return分支。
    x = blk(x) # ->[B, 197, 768]
  x = self.norm(x)
  # [B, cls_token + patch_embed_1 + patch_embed_2 + ... + patch_embed_196, embed_dimension](B, 1+196, 768)
  # ->[B, cls_token, embed_dimension](B, 1, 768) -> [B, embed_dimension](B, 768)
  # 即只取出每个batch里的cls_token,我还以为每个patch都参与分类,原来就cls参与,因为forward()不好调试我还写了代码来确定我这样是不是对的(我错了,我只是还没debug到forward那边)。
  # 这个是ViT的原文:Similar to BERT’s [class] token, we prepend a learnable embedding to the sequence of embedded patches (z00 = xclass), whose state at the output of the Transformer encoder (z0L) serves as theimage representation y (Eq. 4).
  # 之前看有解说视频吐槽这个cls是没用的,然后TransReID的图强调了他们改了这个cls,我一直误以为这玩意真的没啥用,没想到最后是唯一用上的...
  return x[:, 0] # [B, 768]
class PatchEmbed() – model/backbones/vit_pytorch.py
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768)
  self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
def forward(self, x):
  B, C, H, W = x.shape # [B, 3, 224, 224]
  # [B, C, H, W] -> [B, embed_dim(C2), H/patch_size(H2), W/patch_size(W2)] -> [B, C2, H2W2] -> [B, H2W2, C2]
  # 实际上是[Batch_size, num_patches, embed_dim],即[B, 196, 768]
  x = self.proj(x).flatten(2).transpose(1, 2) 
  return x
class Block() – model/backbones/vit_pytorch.py
def __init__(dim=768):
  self.norm1 = norm_layer(dim)
  self.attn = Attention() # 输入[B, 197, 768],输出[B, 197, 768]
  self.drop_path = DropPath()
  self.norm2 = norm_layer(dim)
  self.mlp = Mlp() # 输入[B, 197, 768],输出[B, 197, 768]
def forward(self, x):
  # x:[B, 197, 768]
  x = x + self.drop_path(self.attn(self.norm1(x))) # 正则、自注意力、正则、残差
  x = x + self.drop_path(self.mlp(self.norm2(x))) # 正则、MLP、正则、残差
  return x # [B, 197, 768]
class Attention() – model/backbones/vit_pytorch.py
def __init__(self, dim=768, num_heads=12, ):
  self.qkv = nn.Linear(dim, dim * 3, bias=True)
  self.attn_drop = nn.Dropout(attn_drop)
  self.proj = nn.Linear(dim, dim)
  self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
  B, N, C = x.shape # [B, 197, 768]
  # x -> qkv: [B, 197, 768] -> [B, 197, 768*3] -> [B, 197, 3, 12, 64] -> [3, B, 12, 197, 64]
  # 即[q+k+v, Batch_size, num_heads, cls_tokens+patches_embed, head_channel],.
  # 我看Transformer论文里好像没提到一个head里面的数据叫什么,不过既然它是由一个channel按head平分的,那就叫head_channel吧。
  # 以及这里的channel应该指的是Transformer里的dimension,即d_model = 512
  qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C//self.num_heads).permute(2, 0, 3, 1, 4)
  q, k, v = qkv[0], qkv[1], qkv[2] # q,k,v均为[B, 12, 197, 64]
  attn = (q @ k.transpose(-2, -1)) * self.scale # [B, 12, 197, 64] @ [B, 12, 64, 197] -> [B, 12, 197, 197],QK矩阵乘法并放缩值域
  attn = attn.softmax(dim=-1) # 最后一维进行softmax
  attn = self.attn_drop(attn) # 应用dropout
  # [B, 12, 197, 197] * [B, 12, 197, 64] -> [B, 12, 197, 64] -> [B, 197, 12, 64] -> [B, 197, 768]
  x = (attn @ v).transpose(1, 2).reshape(B, N, C)
  x = self.proj(x) # ->[B, 197, 768] 对(q@k)@v来一次全连接层,并dropout。
  x = self.proj_drop(x) # 应用dropout
  return x # [B, 197, 768]
class Mlp() – model/backbones/vit_pytorch.py
def __init__(in_features=768, hidden_features=768*4, act_layer=nn.GELU, ):
  out_features = out_features or in_features
  hidden_features = hidden_features or in_features
  self.fc1 = nn.Linear(in_features, hidden_features)
  self.act = act_layer()
  self.fc2 = nn.Linear(hidden_features, out_features)
  self.drop = nn.Dropout()
def forward(self, x):
  # x:[B, 197, 768]
  x = self.fc1(x) # ->[B, 197, 768*4]
  x = self.act(x) # 应用GELU激活函数
  x = self.drop(x) # 应用dropout
  x = self.fc2(x) # ->[B, 197, 768]
  x = self.drop(x) # 应用dropout
  return x # [B, 197, 768]

损失函数构造 – 略

make_loss() – loss/make_loss.py
center_criterion = CenterLoss(num_classes=num_classes, feat_dim=2048, use_gpu=True)
triplet = TripletLoss()
return loss_func, center_criterion
class CenterLoss() – loss/center_loss.py
def __init__(self, num_classes=751, feat_dim=2048, use_gpu=True):
  self.centers = nn.Parameter(torch.randn(self.num_classes, self.feat_dim))
def forward(self, x, labels):
  # x:[B, feat_dim], feature matrix.
  # labels:[num_classes] with truth labels.

优化器配置 – 略

make_optimizer() – solver/make_optimizer.py
# SGD
optimizer = getattr(torch.optim, cfg.SOLVER.OPTIMIZER_NAME)(params, momentum=cfg.SOLVER.MOMENTUM)
optimizer_center = torch.optim.SGD(center_criterion.parameters(), lr=cfg.SOLVER.CENTER_LR)
return optimizer, optimizer_center

调度器配置 – 略

create_scheduler() – solver/scheduler_factory.py
lr_scheduler = CosineLRScheduler()
return lr_scheduler

模型训练

do_train() – processor/processor.py
scaler = amp.GradScaler()
for epoch in range(1, epochs + 1):
  model.train()
  scheduler.step(epoch)
  for n_iter, (img, vid, target_cam, target_view) in enumerate(train_loader):
    """
    # img:[B, 3, 224, 224], B张图片,3通道,H=224,W=224,经 数据集加载 里的train_transforms处理
    # vid:[B],B张图片对应的B个id
    # target_cam:[B],B张图片对应的B个cam_id
    # target_view:[B],B张图片对应的B个view_id
    """
    score, feat = model(img, vid, target_cam, target_view ) # 得到分类结果score:[B, num_classes],和特征feat:[B, 768]
    loss = loss_fn(score, feat, vid, target_cam) # 计算损失
    scaler.scale(loss).backward() # 梯度后向传播
    acc = (score.max(1)[1] == vid).float().mean() # 求accuracy(vid为真实标签)
  模型存储
  model.eval()
  for n_iter, (img, vid, camid, camids, target_view, _) in enumerate(val_loader):
    feat = model(img, camids, target_view)
    evaluator.update((feat, vid, camid))
  cmc, mAP, _, _, _, _, _ = evaluator.compute()

你可能感兴趣的:(Transformer,python,深度学习,Transformer)