SAM就是一类处理图像分割任务的通用模型。与以往只能处理某种特定类型图片的图像分割模型不同,SAM可以处理所有类型的图像。
在SAM出现前,基本上所有的图像分割模型都是专有模型。比如,在医学领域,有专门分割核磁图像的人工智能模型,也有专门分割CT影像的人工智能模型。但这些模型往往只在分割专有领域内的图像时,才具有良好性能,而在分割其他领域的图像时往往性能不佳。
承接前两篇文章,本文讲解下面带下划线的三个图像分割模型
1月 | 3月 | 4月 | 5月 | 6月 | 8月 | 10月 | 11月 | |
2020 | DETR | DDPM | DDIM VisionTransformer |
|||||
2021 | CLIP DALL·E |
SwinTransformer | MAE SwinTransformerV2 |
|||||
2022 | BLIP | DALL·E 2 | StableDiffusion BEiT-3 Midjourney V3 |
|||||
2023 | BLIP2 | VisualChatGPT GPT4 Midjourney V5 |
SAM(Segment Anything Model) | FastSAM (中科院版SAM) MobileSAM |
SAM(论文地址、代码地址)的目的是建立一个图像分割的基础模型,开发一个具有提示能力的模型
要解决的3个问题:
模型结构如下
利用MAE预训练的视觉Transformer (即ViT,如果忘了ViT长啥样,可回顾此文第4部分),最低限度适应高分辨率的输入,该encoder在prompt encoder之前,对每张图像只运行一次
输入(c,h,w)的图像,对图像进行缩放,按照长边缩放成1024,短边不够就pad,得到(c,1024,1024)的图像,经过image encoder,得到对图像16倍下采样的feature,大小为(256,64,64)
至于其代码实现主要实现以下几个类
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, Type
# 导入.common模块中的LayerNorm2d和MLPBlock
from .common import LayerNorm2d, MLPBlock
# 定义ImageEncoderViT类,这是一个基于Vision Transformer的图像编码器,该类从nn.Module继承
class ImageEncoderViT(nn.Module):
    """Vision-Transformer image encoder.

    The image is embedded patch-wise by ``PatchEmbed``, optionally offset by a
    learned absolute position embedding, passed through ``depth`` ``Block``
    layers and finally projected to ``out_chans`` channels by a small
    convolutional neck.
    """

    def __init__(
        self,
        img_size: int = 1024,
        patch_size: int = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        depth: int = 12,
        num_heads: int = 12,
        mlp_ratio: float = 4.0,
        out_chans: int = 256,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Module] = nn.LayerNorm,
        act_layer: Type[nn.Module] = nn.GELU,
        use_abs_pos: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        global_attn_indexes: Tuple[int, ...] = (),
    ) -> None:
        """
        Args:
            img_size (int): Input image size; stored on the module and used
                to size the position embedding.
            patch_size (int): Patch size (kernel and stride of the patch
                projection).
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Number of transformer blocks.
            num_heads (int): Number of attention heads in each block.
            mlp_ratio (float): Ratio of MLP hidden dim to embedding dim.
            out_chans (int): Number of channels produced by the neck.
            qkv_bias (bool): Passed to each block; adds a bias to q/k/v.
            norm_layer (nn.Module): Normalization layer class for the blocks.
            act_layer (nn.Module): Activation layer class for the blocks.
            use_abs_pos (bool): If True, create a learned absolute position
                embedding that is added after patch embedding.
            use_rel_pos (bool): Passed to each block.
            rel_pos_zero_init (bool): Passed to each block.
            window_size (int): Window size handed to the blocks that are not
                listed in ``global_attn_indexes``.
            global_attn_indexes (tuple): Indexes of blocks that are built
                with ``window_size=0`` instead.
        """
        super().__init__()
        self.img_size = img_size

        # Number of patches along each spatial axis.
        grid = img_size // patch_size

        # Patch projection: image -> (B, grid, grid, embed_dim) tokens.
        self.patch_embed = PatchEmbed(
            kernel_size=(patch_size, patch_size),
            stride=(patch_size, patch_size),
            in_chans=in_chans,
            embed_dim=embed_dim,
        )

        # Absolute position embedding, or None when disabled.
        self.pos_embed: Optional[nn.Parameter] = (
            nn.Parameter(torch.zeros(1, grid, grid, embed_dim)) if use_abs_pos else None
        )

        # Transformer trunk.
        self.blocks = nn.ModuleList(
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                norm_layer=norm_layer,
                act_layer=act_layer,
                use_rel_pos=use_rel_pos,
                rel_pos_zero_init=rel_pos_zero_init,
                window_size=0 if index in global_attn_indexes else window_size,
                input_size=(grid, grid),
            )
            for index in range(depth)
        )

        # Neck: 1x1 conv -> LayerNorm2d -> 3x3 conv -> LayerNorm2d.
        self.neck = nn.Sequential(
            nn.Conv2d(embed_dim, out_chans, kernel_size=1, bias=False),
            LayerNorm2d(out_chans),
            nn.Conv2d(out_chans, out_chans, kernel_size=3, padding=1, bias=False),
            LayerNorm2d(out_chans),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode ``x`` and return the output of the neck."""
        tokens = self.patch_embed(x)
        if self.pos_embed is not None:
            tokens = tokens + self.pos_embed

        for block in self.blocks:
            tokens = block(tokens)

        # Blocks work channels-last; the convolutional neck needs B C H W.
        return self.neck(tokens.permute(0, 3, 1, 2))
# 定义Block类,这是Transformer的基本组成模块,包括注意力机制和前馈神经网络。该类从nn.Module继承
class Block(nn.Module):
    """Transformer block: pre-norm attention and pre-norm MLP, each with a
    residual connection. Attention is applied per window when
    ``window_size > 0``."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Module] = nn.LayerNorm,
        act_layer: Type[nn.Module] = nn.GELU,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            mlp_ratio (float): Ratio of MLP hidden dim to ``dim``.
            qkv_bias (bool): Passed to the attention layer.
            norm_layer (nn.Module): Normalization layer class.
            act_layer (nn.Module): Activation layer class for the MLP.
            use_rel_pos (bool): Passed to the attention layer.
            rel_pos_zero_init (bool): Passed to the attention layer.
            window_size (int): Attention window size; 0 disables windowing.
            input_size (tuple(int, int) or None): Spatial size handed to the
                attention layer when windowing is disabled.
        """
        super().__init__()

        self.norm1 = norm_layer(dim)

        # With windowing, attention only ever sees window-sized inputs.
        attn_input_size = (window_size, window_size) if window_size != 0 else input_size
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            use_rel_pos=use_rel_pos,
            rel_pos_zero_init=rel_pos_zero_init,
            input_size=attn_input_size,
        )

        self.norm2 = norm_layer(dim)
        self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)

        self.window_size = window_size

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply attention and MLP sub-layers with residual connections."""
        residual = x
        x = self.norm1(x)

        windowed = self.window_size > 0
        if windowed:
            # Remember the unpadded size so padding can be stripped afterwards.
            height, width = x.shape[1], x.shape[2]
            x, padded_hw = window_partition(x, self.window_size)

        x = self.attn(x)

        if windowed:
            x = window_unpartition(x, self.window_size, padded_hw, (height, width))

        x = residual + x
        return x + self.mlp(self.norm2(x))
# 定义Attention类,这是一个多头注意力机制的块,支持相对位置嵌入,该类从nn.Module继承
class Attention(nn.Module):
    """Multi-head self-attention over a (B, H, W, C) token grid, with optional
    decomposed relative position embeddings."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool): If True, the q/k/v projection has a bias.
            use_rel_pos (bool): If True, create relative position parameters
                and add them to the attention map.
            rel_pos_zero_init (bool): Accepted for interface compatibility;
                not read by this block (the parameters are created as zeros).
            input_size (tuple(int, int) or None): Spatial size used to size
                the relative position parameters; required when
                ``use_rel_pos`` is True.
        """
        super().__init__()
        self.num_heads = num_heads
        per_head = dim // num_heads
        # Standard 1/sqrt(d) scaling of the dot product.
        self.scale = per_head**-0.5

        # One fused projection for q, k and v, plus the output projection.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.use_rel_pos = use_rel_pos
        if self.use_rel_pos:
            assert (
                input_size is not None
            ), "Input size must be provided if using relative positional encoding."
            # One embedding row per possible relative offset on each axis.
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, per_head))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, per_head))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the attended tokens, same shape as ``x``."""
        B, H, W, _ = x.shape
        n_tokens = H * W

        # (3, B, heads, tokens, per_head) -> three (B * heads, tokens, per_head).
        fused = self.qkv(x).reshape(B, n_tokens, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = fused.reshape(3, B * self.num_heads, n_tokens, -1).unbind(0)

        scores = (q * self.scale) @ k.transpose(-2, -1)
        if self.use_rel_pos:
            scores = add_decomposed_rel_pos(
                scores, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W)
            )
        weights = scores.softmax(dim=-1)

        # Merge the heads back into the channel axis.
        out = weights @ v
        out = out.view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
        return self.proj(out)
# 定义window_partition函数,用于将输入x分割为不重叠的窗口,并进行填充。
def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
    """
    Split a token grid into non-overlapping square windows, zero-padding the
    bottom/right edges when the grid is not a multiple of the window size.

    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.
    Returns:
        windows: windows after partition with
            [B * num_windows, window_size, window_size, C].
        (Hp, Wp): height and width after padding.
    """
    B, H, W, C = x.shape

    # Amount needed to reach the next multiple of window_size (0 if aligned).
    pad_h = -H % window_size
    pad_w = -W % window_size
    if pad_h or pad_w:
        # F.pad lists the last dimension first: C, then W, then H.
        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
    Hp, Wp = H + pad_h, W + pad_w

    rows, cols = Hp // window_size, Wp // window_size
    tiled = x.view(B, rows, window_size, cols, window_size, C)
    # Bring the two window-index axes together, then flatten them into batch.
    windows = tiled.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows, (Hp, Wp)
# 定义window_unpartition函数,用于将窗口合并为原始序列,并移除填充。
def window_unpartition(
    windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]
) -> torch.Tensor:
    """
    Reassemble windows into the token grid and strip any padding.

    Args:
        windows (tensor): input tokens with
            [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.
    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw
    H, W = hw

    # Recover the batch size from the number of windows per image.
    windows_per_image = Hp * Wp // window_size // window_size
    B = windows.shape[0] // windows_per_image

    grid = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
    # Interleave window rows/cols back into full image rows/cols.
    x = grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)

    if Hp > H or Wp > W:
        # Padding was added at the bottom/right; crop it away.
        x = x[:, :H, :W, :].contiguous()
    return x
# 定义get_rel_pos函数,根据查询和键的大小获取相对位置嵌入。
def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
    """
    Look up relative positional embeddings for every (query, key) position pair
    along one axis.

    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).
    Returns:
        Extracted positional embeddings according to relative positions.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)

    if rel_pos.shape[0] == max_rel_dist:
        table = rel_pos
    else:
        # Table length does not match: linearly resample it along L.
        resampled = F.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        table = resampled.reshape(-1, max_rel_dist).permute(1, 0)

    # When q and k differ in size, stretch the shorter axis's coordinates.
    q_scale = max(k_size / q_size, 1.0)
    k_scale = max(q_size / k_size, 1.0)
    q_coords = torch.arange(q_size)[:, None] * q_scale
    k_coords = torch.arange(k_size)[None, :] * k_scale
    # Shift so the smallest relative offset maps to index 0.
    offsets = (q_coords - k_coords) + (k_size - 1) * k_scale

    return table[offsets.long()]
# 定义add_decomposed_rel_pos函数,计算分解的相对位置嵌入
def add_decomposed_rel_pos(
    attn: torch.Tensor,
    q: torch.Tensor,
    rel_pos_h: torch.Tensor,
    rel_pos_w: torch.Tensor,
    q_size: Tuple[int, int],
    k_size: Tuple[int, int],
) -> torch.Tensor:
    """
    Add decomposed relative positional embeddings (from :paper:`mvitv2`) to an
    attention map.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py   # noqa B950

    Args:
        attn (Tensor): attention map.
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
    q_h, q_w = q_size
    k_h, k_w = k_size

    # Per-axis embedding tables indexed by (query position, key position).
    table_h = get_rel_pos(q_h, k_h, rel_pos_h)
    table_w = get_rel_pos(q_w, k_w, rel_pos_w)

    B, _, dim = q.shape
    q_grid = q.reshape(B, q_h, q_w, dim)

    # Project each query onto the embeddings along the height / width axis.
    bias_h = torch.einsum("bhwc,hkc->bhwk", q_grid, table_h)
    bias_w = torch.einsum("bhwc,wkc->bhwk", q_grid, table_w)

    # Broadcast the two 1-D biases over the 2-D key grid and add them.
    attn_grid = attn.view(B, q_h, q_w, k_h, k_w)
    attn_grid = attn_grid + bias_h[:, :, :, :, None] + bias_w[:, :, :, None, :]
    return attn_grid.view(B, q_h * q_w, k_h * k_w)
定义一个 PatchEmbed 类,用于将图像转换为补丁嵌入。它使用卷积层将输入图像转换为指定维度的补丁嵌入表示。在前向传播中,输入经过卷积层进行投影,并调换维度的顺序,以使得输出为批量-高度-宽度-通道的形状
# 定义PatchEmbed类,用于将图像转换为补丁嵌入。
class PatchEmbed(nn.Module):
    """
    Image to Patch Embedding: a single strided convolution followed by a
    permute to channels-last layout.
    """

    def __init__(
        self,
        kernel_size: Tuple[int, int] = (16, 16),
        stride: Tuple[int, int] = (16, 16),
        padding: Tuple[int, int] = (0, 0),
        in_chans: int = 3,
        embed_dim: int = 768,
    ) -> None:
        """
        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
            padding (Tuple): padding size of the projection layer.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
        """
        super().__init__()

        # The convolution both cuts the image into patches and embeds them.
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` and return it as B H W C."""
        # B C H W -> B H W C
        return self.proj(x).permute(0, 2, 3, 1)
分成2类:稀疏的(点/box/文本)、稠密的(mask)
其代码实现为
import numpy as np
import torch
from torch import nn
from typing import Any, Optional, Tuple, Type
from .common import LayerNorm2d
class PromptEncoder(nn.Module):
    """Encodes point, box and mask prompts into sparse and dense embeddings
    for SAM's mask decoder."""

    def __init__(
        self,
        embed_dim: int,
        image_embedding_size: Tuple[int, int],
        input_image_size: Tuple[int, int],
        mask_in_chans: int,
        activation: Type[nn.Module] = nn.GELU,
    ) -> None:
        """
        Args:
            embed_dim (int): The prompts' embedding dimension.
            image_embedding_size (tuple(int, int)): Spatial size of the image
                embedding, as (H, W).
            input_image_size (tuple(int, int)): Padded size of the image as
                input to the image encoder, as (H, W).
            mask_in_chans (int): Number of hidden channels used for encoding
                input masks.
            activation (nn.Module): Activation used when encoding input masks.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.input_image_size = input_image_size
        self.image_embedding_size = image_embedding_size
        # sin and cos halves are concatenated, hence embed_dim // 2 frequencies.
        self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)

        self.num_point_embeddings: int = 4  # pos/neg point + 2 box corners
        self.point_embeddings = nn.ModuleList(
            nn.Embedding(1, embed_dim) for _ in range(self.num_point_embeddings)
        )
        self.not_a_point_embed = nn.Embedding(1, embed_dim)

        emb_h, emb_w = image_embedding_size[0], image_embedding_size[1]
        # The two stride-2 convs below reduce the mask by 4x in each dimension.
        self.mask_input_size = (4 * emb_h, 4 * emb_w)
        self.mask_downscaling = nn.Sequential(
            nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),
            LayerNorm2d(mask_in_chans // 4),
            activation(),
            nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2),
            LayerNorm2d(mask_in_chans),
            activation(),
            nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
        )
        self.no_mask_embed = nn.Embedding(1, embed_dim)

    def get_dense_pe(self) -> torch.Tensor:
        """
        Return the positional encoding evaluated on a dense grid of the same
        size as the image embedding.

        Returns:
            torch.Tensor: Positional encoding with shape
                1x(embed_dim)x(embedding_h)x(embedding_w).
        """
        dense_pe = self.pe_layer(self.image_embedding_size)
        return dense_pe.unsqueeze(0)

    def _embed_points(
        self,
        points: torch.Tensor,
        labels: torch.Tensor,
        pad: bool,
    ) -> torch.Tensor:
        """Embed point prompts."""
        points = points + 0.5  # shift to the centre of the pixel
        if pad:
            # Append one placeholder point labelled -1 ("not a point").
            extra_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
            extra_label = -torch.ones((labels.shape[0], 1), device=labels.device)
            points = torch.cat([points, extra_point], dim=1)
            labels = torch.cat([labels, extra_label], dim=1)

        embedding = self.pe_layer.forward_with_coords(points, self.input_image_size)

        # Placeholder points lose their positional part entirely.
        is_placeholder = labels == -1
        embedding[is_placeholder] = 0.0
        embedding[is_placeholder] += self.not_a_point_embed.weight
        # Label 0 and label 1 each get their own learned offset.
        embedding[labels == 0] += self.point_embeddings[0].weight
        embedding[labels == 1] += self.point_embeddings[1].weight
        return embedding

    def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
        """Embed box prompts."""
        boxes = boxes + 0.5  # shift to the centre of the pixel
        # Each box becomes two corner points.
        corners = boxes.reshape(-1, 2, 2)
        embedding = self.pe_layer.forward_with_coords(corners, self.input_image_size)
        embedding[:, 0, :] += self.point_embeddings[2].weight
        embedding[:, 1, :] += self.point_embeddings[3].weight
        return embedding

    def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
        """Embed mask inputs."""
        return self.mask_downscaling(masks)

    def _get_batch_size(
        self,
        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
        boxes: Optional[torch.Tensor],
        masks: Optional[torch.Tensor],
    ) -> int:
        """
        Return the output batch size implied by the first prompt that is
        present (points, then boxes, then masks); 1 if none is given.
        """
        if points is not None:
            return points[0].shape[0]
        if boxes is not None:
            return boxes.shape[0]
        if masks is not None:
            return masks.shape[0]
        return 1

    def _get_device(self) -> torch.device:
        """Return the device the module's embeddings live on."""
        return self.point_embeddings[0].weight.device

    def forward(
        self,
        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
        boxes: Optional[torch.Tensor],
        masks: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Embed the given prompts and return sparse and dense embeddings.

        Args:
            points (tuple(torch.Tensor, torch.Tensor) or None): Point
                coordinates and labels to embed.
            boxes (torch.Tensor or None): Boxes to embed.
            masks (torch.Tensor or None): Masks to embed.

        Returns:
            torch.Tensor: Sparse embeddings for the points and boxes, with
                shape BxNx(embed_dim), where N depends on the number of input
                points and boxes.
            torch.Tensor: Dense embeddings for the masks, with shape
                Bx(embed_dim)x(embed_H)x(embed_W).
        """
        bs = self._get_batch_size(points, boxes, masks)

        # Start from a zero-length sequence so "no sparse prompt" still works.
        sparse_parts = [torch.empty((bs, 0, self.embed_dim), device=self._get_device())]
        if points is not None:
            coords, labels = points
            # A placeholder point is only added when there is no box prompt.
            sparse_parts.append(self._embed_points(coords, labels, pad=(boxes is None)))
        if boxes is not None:
            sparse_parts.append(self._embed_boxes(boxes))
        sparse_embeddings = torch.cat(sparse_parts, dim=1)

        if masks is None:
            # Broadcast the learned "no mask" vector over the embedding grid.
            emb_h, emb_w = self.image_embedding_size[0], self.image_embedding_size[1]
            dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
                bs, -1, emb_h, emb_w
            )
        else:
            dense_embeddings = self._embed_masks(masks)

        return sparse_embeddings, dense_embeddings
class PositionEmbeddingRandom(nn.Module):
    """
    Positional encoding using random spatial frequencies.
    """

    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
        """
        Args:
            num_pos_feats (int): Number of random frequencies; the encoding
                has twice as many channels (sin and cos).
            scale (float or None): Multiplier for the random frequency matrix.
                ``None`` or a non-positive value falls back to 1.0.
        """
        super().__init__()
        if scale is None or scale <= 0.0:
            scale = 1.0
        # Registered as a buffer: it is not a Parameter, but follows the module
        # across devices and is part of the state dict.
        self.register_buffer(
            "positional_encoding_gaussian_matrix",
            scale * torch.randn((2, num_pos_feats)),
        )

    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
        """Positionally encode points that are normalized to [0,1]."""
        # Assumes coords lie in the [0, 1]^2 square with shape d_1 x ... x d_n x 2.
        coords = 2 * coords - 1
        coords = coords @ self.positional_encoding_gaussian_matrix
        coords = 2 * np.pi * coords
        # Output shape is d_1 x ... x d_n x C.
        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
        """Generate the positional encoding for a grid of the given (h, w) size."""
        h, w = size
        device: Any = self.positional_encoding_gaussian_matrix.device
        grid = torch.ones((h, w), device=device, dtype=torch.float32)
        # cumsum yields 1, 2, ...; subtracting 0.5 gives cell centres.
        y_embed = grid.cumsum(dim=0) - 0.5
        x_embed = grid.cumsum(dim=1) - 0.5
        y_embed = y_embed / h
        x_embed = x_embed / w

        pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
        return pe.permute(2, 0, 1)  # C x H x W

    def forward_with_coords(
        self, coords_input: torch.Tensor, image_size: Tuple[int, int]
    ) -> torch.Tensor:
        """
        Positionally encode points that are not normalized to [0,1].

        Args:
            coords_input (torch.Tensor): Coordinates indexed as
                ``[:, :, 0]`` (divided by ``image_size[1]``) and
                ``[:, :, 1]`` (divided by ``image_size[0]``). Not modified.
            image_size (tuple(int, int)): Normalization size.

        Returns:
            torch.Tensor: Encoded points, B x N x C.
        """
        # Work on a copy so the caller's tensor is left untouched.
        coords = coords_input.clone()
        if not coords.is_floating_point():
            # In-place assignment keeps the destination dtype, so dividing
            # integer coordinates would silently truncate them. Promote first.
            coords = coords.to(torch.float)
        coords[:, :, 0] = coords[:, :, 0] / image_size[1]
        coords[:, :, 1] = coords[:, :, 1] / image_size[0]
        return self._pe_encoding(coords.to(torch.float))  # B x N x C
mask decoder模块:在prompt embeddings中插入一个可学习的token,用于decoder的输出
对于下图的左侧部分,依次进行如下4个步骤
重复上述步骤2次,再将attn通过残差进行连接,最终输出masks和iou scores,这段的代码实现为
import torch
from torch import Tensor, nn
import math
from typing import Tuple, Type
from .common import MLPBlock
class TwoWayTransformer(nn.Module):
    """Transformer decoder that attends to an input image using queries whose
    positional embedding is supplied."""

    def __init__(
        self,
        depth: int,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
    ) -> None:
        """
        Args:
            depth (int): Number of layers in the transformer.
            embedding_dim (int): Channel dimension of the input embeddings.
            num_heads (int): Number of heads for multi-head attention. Must
                divide embedding_dim.
            mlp_dim (int): Channel dimension internal to the MLP block.
            activation (nn.Module): Activation to use in the MLP block.
            attention_downsample_rate (int): Passed as ``downsample_rate`` to
                the cross-attention layers and the final attention layer.
        """
        super().__init__()
        self.depth = depth
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.mlp_dim = mlp_dim

        # Only the first layer skips the positional encoding in self-attention.
        self.layers = nn.ModuleList(
            TwoWayAttentionBlock(
                embedding_dim=embedding_dim,
                num_heads=num_heads,
                mlp_dim=mlp_dim,
                activation=activation,
                attention_downsample_rate=attention_downsample_rate,
                skip_first_layer_pe=(layer_index == 0),
            )
            for layer_index in range(depth)
        )

        self.final_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm_final_attn = nn.LayerNorm(embedding_dim)

    def forward(
        self,
        image_embedding: Tensor,
        image_pe: Tensor,
        point_embedding: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """
        Args:
            image_embedding (torch.Tensor): Image to attend to. Should be
                shape B x embedding_dim x h x w for any h and w.
            image_pe (torch.Tensor): Positional encoding to add to the image.
                Must have the same shape as image_embedding.
            point_embedding (torch.Tensor): Embedding to add to the query
                points. Must have shape B x N_points x embedding_dim for any
                N_points.

        Returns:
            torch.Tensor: The processed point_embedding.
            torch.Tensor: The processed image_embedding.
        """
        # The unpacking also rejects inputs that are not 4-D.
        bs, c, h, w = image_embedding.shape

        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
        keys = image_embedding.flatten(2).permute(0, 2, 1)
        image_pe = image_pe.flatten(2).permute(0, 2, 1)

        # The point embedding serves as both initial query and query PE.
        queries = point_embedding

        for block in self.layers:
            queries, keys = block(
                queries=queries,
                keys=keys,
                query_pe=point_embedding,
                key_pe=image_pe,
            )

        # Final attention from the points to the image, with residual + norm.
        attn_out = self.final_attn_token_to_image(
            q=queries + point_embedding,
            k=keys + image_pe,
            v=keys,
        )
        queries = self.norm_final_attn(queries + attn_out)

        return queries, keys
class TwoWayAttentionBlock(nn.Module):
    """
    A transformer block with four layers:
    (1) self-attention of sparse inputs,
    (2) cross attention of sparse inputs to dense inputs,
    (3) MLP block on sparse inputs,
    (4) cross attention of dense inputs to sparse inputs.
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int = 2048,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
        skip_first_layer_pe: bool = False,
    ) -> None:
        """
        Args:
            embedding_dim (int): Channel dimension of the embeddings.
            num_heads (int): Number of heads in the attention layers.
            mlp_dim (int): Hidden dimension of the MLP block.
            activation (nn.Module): Activation of the MLP block.
            attention_downsample_rate (int): ``downsample_rate`` of the two
                cross-attention layers.
            skip_first_layer_pe (bool): If True, self-attention runs without
                the positional encoding and without a residual connection.
        """
        super().__init__()
        self.self_attn = Attention(embedding_dim, num_heads)
        self.norm1 = nn.LayerNorm(embedding_dim)

        self.cross_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm2 = nn.LayerNorm(embedding_dim)

        self.mlp = MLPBlock(embedding_dim, mlp_dim, activation)
        self.norm3 = nn.LayerNorm(embedding_dim)

        self.norm4 = nn.LayerNorm(embedding_dim)
        self.cross_attn_image_to_token = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )

        self.skip_first_layer_pe = skip_first_layer_pe

    def forward(
        self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
    ) -> Tuple[Tensor, Tensor]:
        """Run the four sub-layers and return the updated (queries, keys)."""
        # (1) Self-attention on the sparse tokens.
        if self.skip_first_layer_pe:
            # The attention output replaces the queries here (no residual).
            queries = self.self_attn(q=queries, k=queries, v=queries)
        else:
            with_pe = queries + query_pe
            queries = queries + self.self_attn(q=with_pe, k=with_pe, v=queries)
        queries = self.norm1(queries)

        # (2) Cross-attention: tokens attend to the image embedding.
        token_to_image = self.cross_attn_token_to_image(
            q=queries + query_pe,
            k=keys + key_pe,
            v=keys,
        )
        queries = self.norm2(queries + token_to_image)

        # (3) MLP on the tokens.
        queries = self.norm3(queries + self.mlp(queries))

        # (4) Cross-attention: the image embedding attends to the tokens.
        image_to_token = self.cross_attn_image_to_token(
            q=keys + key_pe,
            k=queries + query_pe,
            v=queries,
        )
        keys = self.norm4(keys + image_to_token)

        return queries, keys
class Attention(nn.Module):
    """
    An attention layer that allows for downscaling the size of the embedding
    after projection to queries, keys, and values.
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        downsample_rate: int = 1,
    ) -> None:
        """
        Args:
            embedding_dim (int): Channel dimension of inputs and output.
            num_heads (int): Number of attention heads; must divide
                ``embedding_dim // downsample_rate``.
            downsample_rate (int): Factor by which the internal channel
                dimension is reduced.
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.internal_dim = embedding_dim // downsample_rate
        self.num_heads = num_heads
        assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim."

        # Project into the (possibly smaller) internal width and back out.
        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)

    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
        """Split the channel axis into heads: B x N x C -> B x heads x N x C/heads."""
        batch, tokens, channels = x.shape
        per_head = channels // num_heads
        return x.reshape(batch, tokens, num_heads, per_head).transpose(1, 2)

    def _recombine_heads(self, x: Tensor) -> Tensor:
        """Inverse of ``_separate_heads``: B x heads x N x C/heads -> B x N x C."""
        batch, heads, tokens, per_head = x.shape
        return x.transpose(1, 2).reshape(batch, tokens, heads * per_head)

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
        """Return scaled dot-product attention of ``q`` over ``k``/``v``."""
        # Input projections, then split into heads.
        q_heads = self._separate_heads(self.q_proj(q), self.num_heads)
        k_heads = self._separate_heads(self.k_proj(k), self.num_heads)
        v_heads = self._separate_heads(self.v_proj(v), self.num_heads)

        # Attention weights: B x N_heads x N_tokens x N_tokens
        per_head = q_heads.shape[3]
        scores = q_heads @ k_heads.permute(0, 1, 3, 2)
        scores = scores / math.sqrt(per_head)
        weights = torch.softmax(scores, dim=-1)

        # Weighted sum of values, merged back and projected out.
        merged = self._recombine_heads(weights @ v_heads)
        return self.out_proj(merged)
对于下图的右侧部分
其中,有几个问题值得提一下
其代码实现为 (定义一个MaskDecoder类,用于预测给定图像和提示嵌入的掩码,其使用Transformer架构。同时,也定义了一个MLP类,即多层感知器网络)
import torch
from torch import nn
from torch.nn import functional as F
from typing import List, Tuple, Type
from .common import LayerNorm2d
# 定义MaskDecoder类,继承自nn.Module
class MaskDecoder(nn.Module):
    """Predicts masks, and a quality score per mask, from an image embedding
    and prompt embeddings using a transformer."""

    def __init__(
        self,
        *,
        transformer_dim: int,
        transformer: nn.Module,
        num_multimask_outputs: int = 3,
        activation: Type[nn.Module] = nn.GELU,
        iou_head_depth: int = 3,
        iou_head_hidden_dim: int = 256,
    ) -> None:
        """
        Args:
            transformer_dim (int): Channel dimension of the transformer.
            transformer (nn.Module): Transformer used to predict masks.
            num_multimask_outputs (int): Number of masks to predict when
                disambiguating masks.
            activation (nn.Module): Activation used when upscaling masks.
            iou_head_depth (int): Depth of the MLP that predicts mask quality.
            iou_head_hidden_dim (int): Hidden dimension of that MLP.
        """
        super().__init__()
        self.transformer_dim = transformer_dim
        self.transformer = transformer

        self.num_multimask_outputs = num_multimask_outputs

        # One IoU token plus one token per mask (multi-mask outputs + 1).
        self.iou_token = nn.Embedding(1, transformer_dim)
        self.num_mask_tokens = num_multimask_outputs + 1
        self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)

        # Two stride-2 transposed convolutions: 4x spatial upscaling.
        self.output_upscaling = nn.Sequential(
            nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2),
            LayerNorm2d(transformer_dim // 4),
            activation(),
            nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2),
            activation(),
        )

        # One hypernetwork MLP per mask token.
        self.output_hypernetworks_mlps = nn.ModuleList(
            [
                MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3)
                for _ in range(self.num_mask_tokens)
            ]
        )

        self.iou_prediction_head = MLP(
            transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth
        )

    def forward(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
        multimask_output: bool,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Predict masks given image and prompt embeddings.

        Args:
            image_embeddings (torch.Tensor): Embeddings from the image encoder.
            image_pe (torch.Tensor): Positional encoding of the image
                embeddings.
            sparse_prompt_embeddings (torch.Tensor): Embeddings of the points
                and boxes.
            dense_prompt_embeddings (torch.Tensor): Embeddings of the mask
                inputs.
            multimask_output (bool): Whether to return multiple masks or a
                single mask.

        Returns:
            torch.Tensor: Predicted masks.
            torch.Tensor: Predictions of mask quality.
        """
        masks, iou_pred = self.predict_masks(
            image_embeddings=image_embeddings,
            image_pe=image_pe,
            sparse_prompt_embeddings=sparse_prompt_embeddings,
            dense_prompt_embeddings=dense_prompt_embeddings,
        )

        # Index 0 is the single-mask output; indexes 1.. are the multi-mask set.
        keep = slice(1, None) if multimask_output else slice(0, 1)
        return masks[:, keep, :, :], iou_pred[:, keep]

    def predict_masks(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Predict masks. See ``forward`` for details on the arguments."""
        # Prepend the learned output tokens to the sparse prompt tokens.
        batch = sparse_prompt_embeddings.size(0)
        learned = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
        learned = learned.unsqueeze(0).expand(batch, -1, -1)
        tokens = torch.cat((learned, sparse_prompt_embeddings), dim=1)

        # Repeat the image data along batch so there is one copy per prompt.
        src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
        src = src + dense_prompt_embeddings
        pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
        b, c, h, w = src.shape

        # Run the transformer.
        hs, src = self.transformer(src, pos_src, tokens)
        iou_token_out = hs[:, 0, :]
        mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :]

        # Upscale the image embedding; each mask token then produces the
        # weights of a per-pixel dot product over the upscaled channels.
        src = src.transpose(1, 2).view(b, c, h, w)
        upscaled_embedding = self.output_upscaling(src)
        hyper_in = torch.stack(
            [
                hyper_mlp(mask_tokens_out[:, index, :])
                for index, hyper_mlp in enumerate(self.output_hypernetworks_mlps)
            ],
            dim=1,
        )
        b, c, h, w = upscaled_embedding.shape
        masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)

        # Mask quality predictions come from the IoU token.
        iou_pred = self.iou_prediction_head(iou_token_out)

        return masks, iou_pred
# MLP类,继承自nn.Module
class MLP(nn.Module):
    """Multi-layer perceptron with ReLU between layers and an optional sigmoid
    on the output."""

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        num_layers: int,
        sigmoid_output: bool = False,
    ) -> None:
        """
        Args:
            input_dim (int): Input feature dimension.
            hidden_dim (int): Width of every hidden layer.
            output_dim (int): Output feature dimension.
            num_layers (int): Total number of linear layers.
            sigmoid_output (bool): If True, apply a sigmoid to the output.
        """
        super().__init__()
        self.num_layers = num_layers

        # Layer widths: input -> hidden x (num_layers - 1) -> output.
        hidden_dims = [hidden_dim] * (num_layers - 1)
        in_dims = [input_dim] + hidden_dims
        out_dims = hidden_dims + [output_dim]
        self.layers = nn.ModuleList(
            nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(in_dims, out_dims)
        )
        self.sigmoid_output = sigmoid_output

    def forward(self, x):
        """Apply the layers in order; ReLU follows every layer but the last."""
        last = self.num_layers - 1
        for index, layer in enumerate(self.layers):
            x = layer(x)
            if index < last:
                x = F.relu(x)
        return F.sigmoid(x) if self.sigmoid_output else x
在分别实现了上述三个结构后,在实际分割时便可以直接调用了
import torch
from torch import nn
from torch.nn import functional as F
from typing import Any, Dict, List, Tuple
from .image_encoder import ImageEncoderViT
from .mask_decoder import MaskDecoder
from .prompt_encoder import PromptEncoder
class Sam(nn.Module):
    """SAM predicts object masks from an image and input prompts."""

    # Mask logits above this value count as foreground.
    mask_threshold: float = 0.0
    image_format: str = "RGB"

    def __init__(
        self,
        image_encoder: ImageEncoderViT,
        prompt_encoder: PromptEncoder,
        mask_decoder: MaskDecoder,
        pixel_mean: List[float] = [123.675, 116.28, 103.53],
        pixel_std: List[float] = [58.395, 57.12, 57.375],
    ) -> None:
        """
        Args:
            image_encoder (ImageEncoderViT): Backbone that encodes the image
                into image embeddings.
            prompt_encoder (PromptEncoder): Encodes the various kinds of input
                prompts.
            mask_decoder (MaskDecoder): Predicts masks from the image
                embeddings and encoded prompts.
            pixel_mean (list(float)): Mean values for normalizing pixels in
                the input image.
            pixel_std (list(float)): Std values for normalizing pixels in the
                input image.
        """
        super().__init__()
        self.image_encoder = image_encoder
        self.prompt_encoder = prompt_encoder
        self.mask_decoder = mask_decoder
        # Shaped (C, 1, 1) so they broadcast over H and W. The third argument
        # is ``persistent=False``: the buffers are left out of the state dict.
        self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)

    @property
    def device(self) -> Any:
        """Device of the module, taken from the ``pixel_mean`` buffer."""
        return self.pixel_mean.device

    @torch.no_grad()
    def forward(
        self,
        batched_input: List[Dict[str, Any]],
        multimask_output: bool,
    ) -> List[Dict[str, torch.Tensor]]:
        """
        Predict masks end-to-end from the provided images and prompts.
        If prompts are not known in advance, using SamPredictor is recommended
        over calling the model directly.

        Args:
            batched_input (list(dict)): A list over input images, each a
                dictionary with the following keys. A prompt key can be
                excluded if it is not present.
                'image': The image as a torch tensor in 3xHxW format, already
                    transformed for input to the model.
                'original_size': (tuple(int, int)) The original size of the
                    image before transformation, as (H, W).
                'point_coords': (torch.Tensor) Batched point prompts for this
                    image, with shape BxNx2. Already transformed to the input
                    frame of the model.
                'point_labels': (torch.Tensor) Batched labels for point
                    prompts, with shape BxN.
                'boxes': (torch.Tensor) Batched box inputs, with shape Bx4.
                    Already transformed to the input frame of the model.
                'mask_inputs': (torch.Tensor) Batched mask inputs to the
                    model, in the form Bx1xHxW.
            multimask_output (bool): Whether the model should predict multiple
                disambiguating masks, or return a single mask.

        Returns:
            (list(dict)): A list over input images, where each element is a
                dictionary with the following keys.
                'masks': (torch.Tensor) Batched binary mask predictions, with
                    shape BxCxHxW, where B is the number of input prompts, C
                    is determined by multimask_output, and (H, W) is the
                    original size of the image.
                'iou_predictions': (torch.Tensor) The model's predictions of
                    mask quality, in shape BxC.
                'low_res_logits': (torch.Tensor) Low resolution logits with
                    shape BxCxHxW, where H=W=256. Can be passed as mask input
                    to subsequent iterations of prediction.
        """
        # All images go through the encoder as one batch.
        normalized = [self.preprocess(record["image"]) for record in batched_input]
        image_embeddings = self.image_encoder(torch.stack(normalized, dim=0))

        # Image positional encoding handed to the mask decoder.
        dense_pe_source = self.prompt_encoder

        outputs = []
        for record, embedding in zip(batched_input, image_embeddings):
            points = None
            if "point_coords" in record:
                points = (record["point_coords"], record["point_labels"])

            sparse_embeddings, dense_embeddings = self.prompt_encoder(
                points=points,
                boxes=record.get("boxes"),
                masks=record.get("mask_inputs"),
            )
            low_res_masks, iou_predictions = self.mask_decoder(
                image_embeddings=embedding.unsqueeze(0),
                image_pe=dense_pe_source.get_dense_pe(),
                sparse_prompt_embeddings=sparse_embeddings,
                dense_prompt_embeddings=dense_embeddings,
                multimask_output=multimask_output,
            )
            upscaled = self.postprocess_masks(
                low_res_masks,
                input_size=record["image"].shape[-2:],
                original_size=record["original_size"],
            )
            outputs.append(
                {
                    "masks": upscaled > self.mask_threshold,
                    "iou_predictions": iou_predictions,
                    "low_res_logits": low_res_masks,
                }
            )
        return outputs

    def postprocess_masks(
        self,
        masks: torch.Tensor,
        input_size: Tuple[int, ...],
        original_size: Tuple[int, ...],
    ) -> torch.Tensor:
        """
        Remove padding and upscale masks to the original image size.

        Args:
            masks (torch.Tensor): Batched masks from the mask decoder, in
                BxCxHxW format.
            input_size (tuple(int, int)): The size of the image input to the
                model, in (H, W) format. Used to remove padding.
            original_size (tuple(int, int)): The original size of the image
                before resizing for input to the model, in (H, W) format.

        Returns:
            (torch.Tensor): Batched masks in BxCxHxW format, where (H, W) is
                given by original_size.
        """
        side = self.image_encoder.img_size
        # 1) Upscale to the padded square the encoder saw.
        at_model_res = F.interpolate(
            masks,
            (side, side),
            mode="bilinear",
            align_corners=False,
        )
        # 2) Crop away the bottom/right padding.
        unpadded = at_model_res[..., : input_size[0], : input_size[1]]
        # 3) Resize to the original image size.
        return F.interpolate(unpadded, original_size, mode="bilinear", align_corners=False)

    def preprocess(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize pixel values and pad to a square input."""
        # Normalize colours.
        x = (x - self.pixel_mean) / self.pixel_std

        # Pad on the right and bottom up to img_size x img_size.
        side = self.image_encoder.img_size
        height, width = x.shape[-2:]
        return F.pad(x, (0, side - width, 0, side - height))
训练时模拟交互分割的过程,从目标mask中随机选取前景点或者box,点是从gt mask选取,box增加长边10%的噪声,最大20像素
在第一次prompt预测mask之后,后续是从预测mask和gt mask有差异的区域采样点
同时,将预测的mask(用unthresholded mask logits代替二值化的mask,即不做阈值过滤,阈值默认为0)作为prompt参与下一轮迭代
训练过程中,发现用8个采样点比较合适(对比16个,没有明显增益),为了鼓励模型从mask中获益,其中2个迭代不用新采样的点,总共11个迭代,一个是初始化的prompt输入,然后是8个上述迭代,再加2个不重新采样点的迭代(这样可以refine mask)。由于mask decoder比较轻,所以可以进行更多次的迭代
最终在1100万数据集上,生成了11亿高质量的mask
数据情况