The original code does not output scores. Instead, it compares each tag's score against that tag's threshold (different tags use different thresholds), treats anything above the threshold as a hit, and outputs only the matching tags. Getting the scores themselves therefore requires modifying the code.
Thresholds: https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list_threshold.txt
The default thresholds are mostly concentrated at 0.65, with two other high-frequency values, 0.8 and 0.9. The sketch below reproduces this tally.
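If you want to verify the distribution yourself, here is a minimal sketch. It assumes you run it from the repository root and that the threshold file contains one float per line, one per tag, matching the repository layout:

import collections

# Tally the per-tag default thresholds, most common value first.
with open('ram/data/ram_tag_list_threshold.txt') as fp:
    thresholds = [line.strip() for line in fp if line.strip()]

for value, count in collections.Counter(thresholds).most_common():
    print(value, count)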

1. Modify the threshold file
Find the categories whose scores you need and set their thresholds in the threshold file to 0.0 (back up the file first); only then will those categories appear in the output. A sketch that automates this follows the file list below.
Natively supported Chinese tags: https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list_chinese.txt
Natively supported English tags: https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list.txt
Corresponding thresholds: https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list_threshold.txt
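Editing the file by hand works for a handful of tags; for many tags the following sketch automates it. It assumes both files are read from the repository root and that line i of the threshold file belongs to line i of the tag list (which is how the model indexes them); TAGS_TO_SCORE is a hypothetical example list:

import shutil

TAGS_TO_SCORE = {'dog', 'cat'}  # hypothetical: the tags whose scores you want

tag_file = 'ram/data/ram_tag_list.txt'
thr_file = 'ram/data/ram_tag_list_threshold.txt'

shutil.copy(thr_file, thr_file + '.bak')  # back up the original thresholds first

with open(tag_file) as fp:
    tags = [line.strip() for line in fp]
with open(thr_file) as fp:
    thresholds = [line.strip() for line in fp]

# A threshold of 0.0 means the tag always counts as a hit,
# so its score is always included in the output.
new_thresholds = ['0.0' if tag in TAGS_TO_SCORE else thr
                  for tag, thr in zip(tags, thresholds)]

with open(thr_file, 'w') as fp:
    fp.write('\n'.join(new_thresholds) + '\n')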
2. Modify ram.py
The main change is to the generate_tag function, so that it also returns the per-tag scores (the added lines are marked with '# added' comments below).
https://github.com/xinyu1205/recognize-anything/blob/main/ram/models/ram.py#L294
def generate_tag(self,
                 image,
                 threshold=0.68,
                 tag_input=None,
                 ):
    label_embed = torch.nn.functional.relu(self.wordvec_proj(self.label_embed))

    image_embeds = self.image_proj(self.visual_encoder(image))
    image_atts = torch.ones(image_embeds.size()[:-1],
                            dtype=torch.long).to(image.device)

    # recognize image tags using the image-tag recognition decoder
    image_cls_embeds = image_embeds[:, 0, :]
    image_spatial_embeds = image_embeds[:, 1:, :]

    bs = image_spatial_embeds.shape[0]
    label_embed = label_embed.unsqueeze(0).repeat(bs, 1, 1)
    tagging_embed = self.tagging_head(
        encoder_embeds=label_embed,
        encoder_hidden_states=image_embeds,
        encoder_attention_mask=image_atts,
        return_dict=False,
        mode='tagging',
    )

    logits = self.fc(tagging_embed[0]).squeeze(-1)

    targets = torch.where(
        torch.sigmoid(logits) > self.class_threshold.to(image.device),
        torch.tensor(1.0).to(image.device),
        torch.zeros(self.num_class).to(image.device))

    tag = targets.cpu().numpy()
    tag[:, self.delete_tag_index] = 0
    tag_output = []
    tag_output_chinese = []
    scores_output = []                             # added: per-image score strings
    scores = torch.sigmoid(logits).cpu().numpy()   # added: sigmoid probabilities for all tags
    for b in range(bs):
        index = np.argwhere(tag[b] == 1)
        token = self.tag_list[index].squeeze(axis=1)
        tag_output.append(' | '.join(token))
        token_chinese = self.tag_list_chinese[index].squeeze(axis=1)
        tag_output_chinese.append(' | '.join(token_chinese))
        score = scores[b][index].squeeze(axis=1)   # added: scores of the hit tags only
        score = [format(_, '.2f') for _ in score]  # added: format to two decimals
        scores_output.append(' | '.join(score))    # added

    return tag_output, tag_output_chinese, scores_output  # added: scores_output
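Note that, mirroring the tag strings, the scores come back as one ' | '-joined string per image. If you need them as numbers downstream, a small parsing sketch (the values are illustrative, not real model output):

# Suppose generate_tag returned these joined strings for the first image:
tags0 = 'dog | grass'      # tags[0]
scores0 = '0.97 | 0.81'    # scores[0]

# Rebuild a {tag: score} mapping.
tag_scores = dict(zip(tags0.split(' | '),
                      map(float, scores0.split(' | '))))
print(tag_scores)  # {'dog': 0.97, 'grass': 0.81}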
3. Modify inference.py
The main change is to the inference_ram function, so that it also returns the scores.
https://github.com/xinyu1205/recognize-anything/blob/main/ram/inference.py#L33
def inference_ram(image, model):
    with torch.no_grad():
        tags, tags_chinese, scores = model.generate_tag(image)

    return tags[0], tags_chinese[0], scores[0]
4. Modify inference_ram.py
Edit the main block to print the scores.
https://github.com/xinyu1205/recognize-anything/blob/main/inference_ram.py
if __name__ == "__main__":
    args = parser.parse_args()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    transform = get_transform(image_size=args.image_size)

    ####### load model
    model = ram(pretrained=args.pretrained,
                image_size=args.image_size,
                vit='swin_l')
    model.eval()

    model = model.to(device)
    image = transform(Image.open(args.image)).unsqueeze(0).to(device)

    res = inference(image, model)
    print("Image Tags: ", res[0])
    print("Image Tags (Chinese): ", res[1])
    print("Scores: ", res[2])
5. Analyze the scores
In general it is easiest to export the results to a file and analyze them directly in Excel (if you prefer to stay in Python, a pandas sketch follows the script below).
Starting from inference_ram.py, write your own script; here it is named ram_inference_tigerZ.py.
# -*- coding: utf-8 -*-
"""Tag images with RAM.

@Author: zhangjianhu
@Create Date: 2023.09.20
@Last Edit Date: 2023.11.07
@Description:
    Tag images with RAM. The input is a text file listing image paths; the
    output is the tagging result, one line per file, in the format:
        filename\tEnglish tags\tChinese tags\tscores
    Columns are separated by \t; within a column, individual tags (and
    scores) are separated by ' | '.
"""
import argparse
import numpy as np
import random
import os
from tqdm import tqdm
import torch
from PIL import Image
from ram.models import ram
from ram import inference_ram as inference
from ram import get_transform
def main():
    with open(args.image, 'r') as fp:
        lines = fp.readlines()

    results = []
    files_failed = []
    print('processing.....')
    for line in tqdm(lines):
        try:
            result = [line.strip()]
            if args.dir_prefix:
                line = os.path.join(args.dir_prefix, line.strip())
            image = transform(Image.open(line)).unsqueeze(0).to(device)
            res = inference(image, model)  # res[0] English tags, res[1] Chinese tags, res[2] the scores added above
            result.append(res[0])
            result.append(res[1])
            result.append(res[2])
            # print("Image Tags: ", res[0], type(res[0]))
            # print("Image Tags (Chinese): ", res[1], type(res[1]))
        except Exception as e:
            result.append('-1')  # mark failed files with -1
            files_failed.append(line)
            # print(line, e)
        results.append('\t'.join(result))

    print('saving result.......')
    if args.des:
        file_result = args.des
    else:
        file_result = os.path.splitext(args.image)[0] + '__result' + os.path.splitext(args.image)[1]
    with open(file_result, 'w') as fp:
        fp.write('\n'.join(results))
    print('all done.')
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='RAM inference for tagging and captioning')
    parser.add_argument('--image',
                        metavar='DIR',
                        help='path to a text file listing image paths',
                        default='results/files_example.txt')
    parser.add_argument('--des',
                        metavar='DIR',
                        help='path to save the result',
                        default='')
    parser.add_argument('--dir-prefix',
                        metavar='DIR',
                        help='path prefix prepended to each image path',
                        default='')
    parser.add_argument('--pretrained',
                        metavar='DIR',
                        help='path to pretrained model',
                        default='pretrained/ram_swin_large_14m.pth')
    parser.add_argument('--image-size',
                        default=384,
                        type=int,
                        metavar='N',
                        help='input image size (default: 384)')
    args = parser.parse_args()

    print('preparing model........')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transform = get_transform(image_size=args.image_size)
    model = ram(pretrained=args.pretrained, image_size=args.image_size, vit='swin_l')
    model.eval()
    model = model.to(device)
    main()  # the original listing never called main(); without this the script only loads the model
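If you would rather stay in Python than switch to Excel, here is a pandas sketch for the exported file. It assumes the four-column format produced above (filename, English tags, Chinese tags, scores) and uses the default output path as an example:

import pandas as pd

# Load the tab-separated result file written by ram_inference_tigerZ.py.
df = pd.read_csv('results/files_example__result.txt', sep='\t',
                 names=['file', 'tags_en', 'tags_zh', 'scores'])

# Explode each row into one (file, tag, score) row for easy filtering;
# failed files (marked '-1') have no scores column and are dropped.
rows = []
for _, r in df.dropna(subset=['scores']).iterrows():
    for tag, score in zip(str(r['tags_en']).split(' | '),
                          str(r['scores']).split(' | ')):
        rows.append({'file': r['file'], 'tag': tag, 'score': float(score)})
long_df = pd.DataFrame(rows)

# Per-tag score statistics across the whole dataset.
print(long_df.groupby('tag')['score'].describe())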