Using CLIP

Environment Setup

# 1. Install PyTorch in your environment first
# 2. Install the dependencies (ftfy, regex, tqdm)
pip install ftfy regex tqdm
# 3. Install CLIP itself
pip install git+https://github.com/openai/CLIP.git

# On an internal network, install through a mirror: pip install git+https://github.91chi.fun/https://github.com/openai/CLIP.git
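
A quick sanity check that the install worked (a minimal sketch):

import clip
print(clip.available_models())  # should print the list of model names shown below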

API

# 1. List the available models
clip.available_models()
['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']

# 2. Load the model and the matching image preprocessor
model, preprocess = clip.load("ViT-B/32")

# 3. preprocess converts a PIL Image into a tensor [3, 224, 224]; unsqueeze(0) then adds a batch dimension, giving [batch_size, 3, 224, 224], which is what the model expects
image = preprocess(Image.open("CLIP.png")).unsqueeze(0)

# 4. Tokenize a batch of sentences [batch_size] into a tensor [batch_size, context_length]
# 	Each sentence is wrapped with BOS (49406) and EOS (49407) tokens, then zero-padded to context_length (default 77)
# 	(If a sentence exceeds context_length - 2 tokens, pass truncate=True; the result is still BOS + content + EOS, i.e. the EOS token is kept rather than cut off)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) # [3, 77]
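
# For example, the wrapping and padding from step 4 can be inspected directly:
toks = clip.tokenize(["a dog"])
print(toks.shape)         # torch.Size([1, 77])
print(toks[0, 0].item())  # 49406 (BOS); the last non-zero token is 49407 (EOS), the rest is zero padding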

# 5. Encode a batch of images into image features
image_features = model.encode_image(image)
   
# 6. Encode a batch of texts into text features
text_features = model.encode_text(text)

# 7. Compute similarity logits between each image and each text
#    (cosine similarities scaled by the learned logit_scale, so not raw 0~1 values)
logits_per_image, logits_per_text = model(image, text)
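
The logits returned by model(image, text) can be reproduced from the two encoders. This sketch follows the forward computation in the openai/CLIP source: L2-normalize both feature sets, then scale the cosine similarities by the learned temperature model.logit_scale:

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # L2-normalize so the dot product below is a cosine similarity
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    logit_scale = model.logit_scale.exp()  # learned inverse temperature, ~100 after training
    logits_per_image = logit_scale * image_features @ text_features.t()  # [n_images, n_texts]
    logits_per_text = logits_per_image.t()                               # [n_texts, n_images]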

Basic Usage

Example 1

import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937  0.00421068 0.00299572]]

Example 2

import os
import clip
import torch
from torchvision.datasets import CIFAR100

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Download the dataset
cifar100 = CIFAR100(root=os.path.expanduser("~/.cache"), download=True, train=False)

# Prepare the inputs
image, class_id = cifar100[3637]
image_input = preprocess(image).unsqueeze(0).to(device)
text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in cifar100.classes]).to(device)

# Calculate features
with torch.no_grad():
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text_inputs)

# Pick the top 5 most similar labels for the image
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
# The factor of 100 is CLIP's logit scale: it acts as an inverse temperature that sharpens
# the softmax (it does change the probabilities, it is not just a cosmetic percentage)
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
values, indices = similarity[0].topk(5)

# Print the result
print("\nTop predictions:\n")
for value, index in zip(values, indices):
    print(f"{cifar100.classes[index]:>16s}: {100 * value.item():.2f}%")

Example 3

import os
import clip
import torch

import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from tqdm import tqdm

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Load the dataset
root = os.path.expanduser("~/.cache")
train = CIFAR100(root, download=True, train=True, transform=preprocess)
test = CIFAR100(root, download=True, train=False, transform=preprocess)


def get_features(dataset):
    all_features = []
    all_labels = []
    
    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size=100)):
            features = model.encode_image(images.to(device))

            all_features.append(features)
            all_labels.append(labels)

    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

# Calculate the image features
train_features, train_labels = get_features(train)
test_features, test_labels = get_features(test)

# Perform logistic regression
classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features)
accuracy = np.mean((test_labels == predictions).astype(float)) * 100.
print(f"Accuracy = {accuracy:.3f}")

(Important) Freezing or Updating CLIP's Parameters

# Our model only uses CLIP's visual encoder, so it is enough to check whether the visual
# encoder's parameters changed. The loops at the bottom print True when a parameter is
# unchanged and False when it was updated:
# With neither Position 1 nor Position 2 enabled, everything prints False, i.e. all parameters were updated.
# With only Position 1 enabled, CLIP prints True and Linear prints False, i.e. only the Linear layer was updated.
# With only Position 2 enabled, CLIP prints False and Linear prints True, i.e. only CLIP's parameters were updated.


import os
import clip
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torch.nn import functional as F
import torch


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.model, self.preprocess = clip.load('ViT-B/32', 'cpu')
        self.linear = nn.Linear(512, 10)

        # Position 2: freeze the linear head (its parameters receive no gradients)
        for param in self.linear.parameters():
            param.requires_grad = False

    def forward(self, x):
        features = self.model.encode_image(x)

        # Position 1: detach cuts the graph here so no gradient flows back into CLIP
        # features = features.detach()

        return self.linear(features)


net = Net()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

root = os.path.expanduser("~/.cache")
train = CIFAR10(root, download=True, train=True, transform=net.preprocess)
train = next(iter(DataLoader(train, batch_size=8)))

# Snapshot the initial parameters so we can tell afterwards which ones were updated
storeParam = {}
for name, param in net.model.visual.named_parameters():
    storeParam[name] = param.detach().clone()
for name, param in net.linear.named_parameters():
    storeParam[name] = param.detach().clone()

for i in range(10):
    out = net(train[0])
    loss = F.cross_entropy(out, train[1])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(loss.item())

for name, param in net.model.visual.named_parameters():
    print(f"{name} {torch.equal(param, storeParam[name])}")
for name, param in net.linear.named_parameters():
    print(f"{name} {torch.equal(param, storeParam[name])}")
