安装环境
pip install ftfy regex tqdm
pip install git+https://github.com/openai/CLIP.git
CLIP 常用 API 一览
# Quick API reference for the openai/clip package.
# NOTE(review): assumes `clip`, `torch`, and `PIL.Image` are already imported
# and `device` is defined (e.g. device = "cuda" if torch.cuda.is_available() else "cpu").

# List the names of the available pretrained CLIP models.
clip.available_models()
# Expected output:
# ['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']

# Load a model and its matching image-preprocessing pipeline.
model, preprocess = clip.load("ViT-B/32")

# Preprocess one image; unsqueeze(0) adds the batch dimension.
image = preprocess(Image.open("CLIP.png")).unsqueeze(0)
# Tokenize a list of prompts into a batch of token-id tensors.
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

# Encode each modality into the shared embedding space.
image_features = model.encode_image(image)
text_features = model.encode_text(text)

# Full forward pass: similarity logits in both directions
# (image->texts and text->images).
logits_per_image, logits_per_text = model(image, text)
简单使用
示例1
"""Example 1: zero-shot prediction — score one image against three captions."""
import torch
import clip
from PIL import Image

# Run on GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Prepare a batch of one image and the candidate text prompts.
image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

# Inference only: disable gradient tracking to save memory/compute.
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # Similarity logits between the image and every prompt.
    logits_per_image, logits_per_text = model(image, text)
    # Softmax over the prompts -> probability per caption.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)
示例2
"""Example 2: zero-shot CIFAR-100 classification of a single image."""
import os
import clip
import torch
from torchvision.datasets import CIFAR100

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Download the test split; no transform here — we preprocess manually below.
cifar100 = CIFAR100(root=os.path.expanduser("~/.cache"), download=True, train=False)

# Pick an arbitrary sample and build the image / text inputs.
image, class_id = cifar100[3637]
image_input = preprocess(image).unsqueeze(0).to(device)
# One prompt per class: "a photo of a {class name}".
text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in cifar100.classes]).to(device)

# Encode both modalities without tracking gradients.
with torch.no_grad():
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text_inputs)

# L2-normalize so the dot product below is a cosine similarity.
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)

# Scaled cosine similarity, softmaxed over the class prompts; keep the top 5.
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
values, indices = similarity[0].topk(5)

print("\nTop predictions:\n")
for value, index in zip(values, indices):
    print(f"{cifar100.classes[index]:>16s}: {100 * value.item():.2f}%")
示例3
"""Example 3: linear probe — logistic regression on frozen CLIP features."""
import os
import clip
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# CIFAR-100 train/test splits, transformed with CLIP's own preprocessing.
root = os.path.expanduser("~/.cache")
train = CIFAR100(root, download=True, train=True, transform=preprocess)
test = CIFAR100(root, download=True, train=False, transform=preprocess)


def get_features(dataset):
    """Encode every image in `dataset` with CLIP.

    Returns a tuple (features, labels) as numpy arrays.
    """
    all_features = []
    all_labels = []
    with torch.no_grad():  # pure feature extraction — no gradients needed
        for images, labels in tqdm(DataLoader(dataset, batch_size=100)):
            features = model.encode_image(images.to(device))
            all_features.append(features)
            all_labels.append(labels)
    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()


train_features, train_labels = get_features(train)
test_features, test_labels = get_features(test)

# Fit a logistic-regression probe on the frozen features.
# C=0.316 matches the hyperparameter used in the CLIP repo's example.
classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate. Note: the `np.float` alias was removed in NumPy >= 1.24,
# so cast with the builtin `float` instead.
predictions = classifier.predict(test_features)
accuracy = np.mean((test_labels == predictions).astype(float)) * 100.
print(f"Accuracy = {accuracy:.3f}")
(重要)固定或更新CLIP参数
"""(Important) Freezing vs. updating CLIP parameters.

Wraps CLIP's image encoder with a linear classification head, freezes the
head, runs a few SGD steps, then prints whether each parameter group actually
changed (True = unchanged, False = updated).
"""
import os
import clip
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torch.nn import functional as F
import torch


class Net(nn.Module):
    """CLIP image encoder followed by a 10-way linear classifier."""

    def __init__(self):
        super(Net, self).__init__()
        # CLIP ViT-B/32 on CPU; `preprocess` is the matching image transform.
        self.model, self.preprocess = clip.load('ViT-B/32', 'cpu')
        # 512-dim image features -> 10 CIFAR-10 classes.
        self.linear = nn.Linear(512, 10)
        # Freeze the linear head so only the CLIP backbone receives updates.
        for param in self.linear.parameters():
            param.requires_grad = False

    def forward(self, x):
        features = self.model.encode_image(x)
        return self.linear(features)


net = Net()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

# A single mini-batch of 8 CIFAR-10 images is enough for this demonstration.
root = os.path.expanduser("~/.cache")
train = CIFAR10(root, download=True, train=True, transform=net.preprocess)
train = next(iter(DataLoader(train, batch_size=8)))

# Snapshot the initial parameters so we can detect changes after training.
storeParam = {}
for name, param in net.model.visual.named_parameters():
    storeParam[name] = param.detach().clone()
for name, param in net.linear.named_parameters():
    storeParam[name] = param.detach().clone()

# Train for a few SGD steps on the single batch.
for i in range(10):
    out = net(train[0])
    loss = F.cross_entropy(out, train[1])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(loss.item())

# Expected: CLIP visual parameters print False (updated),
# frozen linear-head parameters print True (unchanged).
for name, param in net.model.visual.named_parameters():
    print(f"{name} {torch.equal(param, storeParam[name])}")
for name, param in net.linear.named_parameters():
    print(f"{name} {torch.equal(param, storeParam[name])}")