CoOp is essentially CLIP with a slightly modified text encoder.
CLIP code: https://github.com/OpenAI/CLIP
CoOp code: https://github.com/KaiyangZhou/CoOp
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)  # torch.Size([3, 77]) [batch_size, n_ctx]
# tokenize only maps each word to a token id and pads the remaining positions with 0.
# The context length is 77, but at most 75 word tokens fit,
# because the startoftext and endoftext tokens are added automatically.

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)  # torch.Size([3, 512]): each prompt is encoded into a 512-dim feature

    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937 0.00421068 0.00299572]]
The relevant part of CLIP is its encode_text function:

def encode_text(self, text):
    x = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx] -> [batch_size, n_ctx, d_model]; here d_model == 512
    x = x + self.positional_embedding.type(self.dtype)
    x = x.permute(1, 0, 2)  # NLD -> LND
    x = self.transformer(x)
    x = x.permute(1, 0, 2)  # LND -> NLD
    x = self.ln_final(x).type(self.dtype)

    # x.shape = [batch_size, n_ctx, transformer.width]
    # take features from the eot embedding (eot_token is the highest number in each sequence)
    x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection

    return x
CLIP's encode_text involves ① token_embedding and ② positional_embedding.
① and ② are added together and fed into the transformer; positional_embedding is itself a parameter learned during CLIP's training.
CoOp's encode_text replaces the output of ① token_embedding with prompts, and prompts is learnable.
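A quick shape check of what ① produces (a minimal sketch using the ViT-B/32 model loaded as above; the prompt string is arbitrary):

import torch
import clip

device = "cuda" if torch.cuda.is_available() else "cpu"
model, _ = clip.load("ViT-B/32", device=device)

text = clip.tokenize(["a photo of a dog"]).to(device)     # [1, 77]
with torch.no_grad():
    emb = model.token_embedding(text).type(model.dtype)   # [1, 77, 512]
print(emb.shape)
# CoOp's prompts tensor plays exactly this role: it has the same [n_cls, 77, 512] shape,
# but its middle n_ctx positions are learnable context vectors instead of fixed word embeddings.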
Here is CoOp's TextEncoder:

class TextEncoder(nn.Module):
    def __init__(self, clip_model):
        super().__init__()
        self.transformer = clip_model.transformer
        self.positional_embedding = clip_model.positional_embedding
        self.ln_final = clip_model.ln_final
        self.text_projection = clip_model.text_projection
        self.dtype = clip_model.dtype

    def forward(self, prompts, tokenized_prompts):
        x = prompts + self.positional_embedding.type(self.dtype)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x).type(self.dtype)

        # x.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        '''
        If the input to the text encoder were the single sentence ["this is hair"]:
        print("text", text)  # tensor([[49406, 589, 533, 2225, 49407, 0, 0, 0, 0, ...]])
        print("text.argmax(dim=-1)", text.argmax(dim=-1))  # tensor([4])
        print("torch.arange(x.shape[0])", torch.arange(x.shape[0]))  # tensor([0])
        print("x", x.shape, x)  # torch.Size([1, 77, 512])
        print("x[torch.arange(x.shape[0]), text.argmax(dim=-1)]", x[torch.arange(x.shape[0]), text.argmax(dim=-1)].shape)  # torch.Size([1, 512]), i.e. the slice at index 4 of the [1, 77, 512] tensor
        '''
        # ! tokenized_prompts is only needed here to locate the eot_token (id 49407, the largest in each sequence),
        # whose output embedding is then projected to give the text feature.
        x = x[torch.arange(x.shape[0]), tokenized_prompts.argmax(dim=-1)] @ self.text_projection

        return x
Unlike CLIP's hand-crafted prompts (e.g. "a photo of a {label}"), CoOp defines a learnable PromptLearner that produces prompts and tokenized_prompts. tokenized_prompts is obtained by the following code:
nn.init.normal_(ctx_vectors, std=0.02)   # initialize the learnable context vectors
prompt_prefix = " ".join(["X"] * n_ctx)  # placeholder prefix, e.g. "X X ... X"

classnames = [name.replace("_", " ") for name in classnames]
name_lens = [len(_tokenizer.encode(name)) for name in classnames]  # number of tokens in each class name
prompts = [prompt_prefix + " " + name + "." for name in classnames]

tokenized_prompts = torch.cat([clip.tokenize(p) for p in prompts])
with torch.no_grad():
    embedding = clip_model.token_embedding(tokenized_prompts).type(dtype)

# These token vectors will be saved when in save_model(),
# but they should be ignored in load_model() as we want to use
# those computed using the current class names
self.register_buffer("token_prefix", embedding[:, :1, :])  # SOS
self.register_buffer("token_suffix", embedding[:, 1 + n_ctx :, :])  # CLS, EOS
In the forward pass, prompts is assembled around ctx, which is an nn.Parameter and therefore learnable. The suffix here contains CLS and EOS, i.e. the class name and the end-of-text token, so the suffix carries the class information (a condensed forward() is sketched after the snippet below).
# class_token_position == "end" class_token放在句末的情况
self.ctx = nn.Parameter(ctx_vectors) # to be optimized
ctx = self.ctx # ctx是context的缩写
prompts = torch.cat(
[
prefix, # (n_cls, 1, dim) # 相当于startoftext的embedding
ctx, # (n_cls, n_ctx, dim)
suffix, # (n_cls, *, dim) # 相当于endoftext 的embedding
],
dim=1,
)
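In the CoOp repo, self.ctx is a single [n_ctx, dim] tensor shared by all classes, so PromptLearner.forward() first expands it to one copy per class before the concatenation above. A condensed sketch of that forward (keeping only the class_token_position == "end" branch):

def forward(self):
    ctx = self.ctx                                          # [n_ctx, dim], shared across classes
    if ctx.dim() == 2:
        ctx = ctx.unsqueeze(0).expand(self.n_cls, -1, -1)   # -> [n_cls, n_ctx, dim]

    prefix = self.token_prefix                              # [n_cls, 1, dim]  SOS
    suffix = self.token_suffix                              # [n_cls, *, dim]  class tokens + EOS (+ padding)

    # class_token_position == "end"
    prompts = torch.cat([prefix, ctx, suffix], dim=1)       # [n_cls, 77, dim]
    return prompts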
class CustomCLIP(nn.Module):
    def __init__(self, cfg, classnames, clip_model):
        super().__init__()
        self.prompt_learner = PromptLearner(cfg, classnames, clip_model)
        self.tokenized_prompts = self.prompt_learner.tokenized_prompts
        self.image_encoder = clip_model.visual
        self.text_encoder = TextEncoder(clip_model)
        self.logit_scale = clip_model.logit_scale
        self.dtype = clip_model.dtype

    # forward pass used when training CoOp
    def forward(self, image):
        image_features = self.image_encoder(image.type(self.dtype))

        prompts = self.prompt_learner()
        tokenized_prompts = self.tokenized_prompts
        text_features = self.text_encoder(prompts, tokenized_prompts)

        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        logit_scale = self.logit_scale.exp()
        logits = logit_scale * image_features @ text_features.t()

        return logits
CoOp is trained on the (image, label) pairs in each batch.
Forward pass: the image goes through image_encoder to get image_features; prompts and tokenized_prompts go through the TextEncoder to get text_features; the similarity between the two gives the logits.
The logits and the labels are fed into a cross-entropy loss.
Backward pass: the loss is back-propagated to optimize the learnable nn.Parameter, i.e. the ctx inside prompts; the original CLIP weights, including positional_embedding, are kept frozen in CoOp.
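A minimal sketch of one CoOp training step, assuming model is the CustomCLIP built above, loader yields (image, label) batches, and device is set as earlier; the freezing loop mirrors the CoOp repo, while the optimizer settings here (plain SGD, lr=0.002) are only illustrative:

import torch
import torch.nn.functional as F

# freeze everything that comes from CLIP; only the prompt learner's ctx stays trainable
for name, param in model.named_parameters():
    if "prompt_learner" not in name:
        param.requires_grad_(False)

optimizer = torch.optim.SGD(model.prompt_learner.parameters(), lr=0.002)

for image, label in loader:
    image, label = image.to(device), label.to(device)
    logits = model(image)                   # [batch_size, n_cls]
    loss = F.cross_entropy(logits, label)   # compare with the ground-truth class indices
    optimizer.zero_grad()
    loss.backward()                         # gradients reach only prompt_learner.ctx
    optimizer.step()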
Note: please point out anything I have misunderstood.