Suppose you have already designed a neural network model MyModel; it can be wrapped as MyNet in model_my.py:
class MyModel(nn.Module):
    def __init__(self, variable1, variable2, ...):
        super(MyModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64, affine=affine_par)  # affine_par is a module-level flag controlling learnable BN parameters
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=True)
        ......
    def forward(self, input1, input2, input3):
        ......
        return output1, output2, output3, ...

def MyNet([needed_variables]):
    model = MyModel(variable1, variable2, ...)
    return model
In train.py or test.py, call this function to build the network:
from xxx import MyNet
......
model = MyNet([needed_variables])
......
def get_arguments():
    parser = argparse.ArgumentParser(description="myNet")  # create the parser; description is the help text
    # Register the arguments. Taking the learning rate as an example: learning-rate is the argument name,
    # type=float sets the data type, default sets the default value, and help is the explanatory text.
    parser.add_argument("--learning-rate", type=float, default=0.001, help="Base learning rate for training.")
    ......
    parser.add_argument("--dataset", type=str, default='davis16', help="duts, coco, or davis16.")
    # GPU settings. Note: a bare bool is a classic argparse pitfall ("--cuda False" would otherwise
    # parse as the truthy string "False"), so convert the string explicitly.
    parser.add_argument("--cuda", type=lambda s: str(s).lower() != 'false', default=True, help="Run on CPU or GPU")
    parser.add_argument("--gpus", type=str, default="1", help="choose gpu device.")  # use GPU 1 (note: numbering starts from 0)
    return parser.parse_args()
For example, to run mytrain.py, cd into its directory and type in the terminal: python mytrain.py --learning-rate 0.002 --dataset duts --gpus 3. This sets those arguments by hand; every other argument keeps its default value.
def configure_dataset_model(args):
    if args.dataset == 'davis16':
        args.batch_size = 5  # number of images fed to the network per step
        args.maxEpoches = 15  # maxIterations = maxEpoches * len(train_aug) / batch_size_per_gpu
        args.data_dir = 'mypath/dataset/DAVIS16'  # dataset path
        args.data_list = 'mypath/dataset/DAVIS16/train_seqs.txt'
        # args.data_list = 'mypath/dataset/DAVIS16/test_seqs.txt'  # prepared list of training/test video sequences;
        # each line of the txt file is one sequence name, see the example below
        args.input_size = '473,473'  # resize all input images to one size (optional)
        ......
        args.restore_from = './pretrained/deep_labv3/deeplab_davis_12_0.pth'  # pretrained model to load, change as needed; the training stage uses a DeepLabv3 model here
        # args.restore_from = './snapshots/davis_iteration/mynet_555.pth'  # for testing, load our own trained model instead
        args.snapshot_dir = './snapshots/davis_iteration'  # where to save training snapshots (not needed at test time)
        args.save_dir = './result/test/'  # where to save the output images at test time
    elif args.dataset == 'duts':
        ......
        # every dataset is configured the same way as above
    else:
        raise ValueError("dataset error")  # fail loudly when the dataset name is unknown
# test_seqs.txt
blackswan
bmx-trees
breakdance
camel
car-roundabout
car-shadow
cows
dance-twirl
dog
drift-chicane
drift-straight
goat
horsejump-high
kite-surf
libby
motocross-jump
paragliding-launch
parkour
scooter-black
soapbox
With this in place, all model parameters can be initialized in main():
def main():
    args = get_arguments()
    print("=====> Configure dataset and model")
    configure_dataset_model(args)
    print(args)
    # set the GPU(s) used for training
    print("=====> Set GPU for training")
    if args.cuda:
        print("====> Use gpu id: '{}'".format(args.gpus))
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
        if not torch.cuda.is_available():
            raise Exception("No GPU found or Wrong gpu id, please run without --cuda")  # if no GPU is available, pass '--cuda False' on the command line instead
    # Fix the random seed so that every run of this file produces the same output,
    # rather than a fresh random result each time.
    print("=====> Random Seed: ", args.random_seed)
    torch.manual_seed(args.random_seed)
    if args.cuda:
        torch.cuda.manual_seed(args.random_seed)
    ......
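If the data pipeline also draws from Python's random module or NumPy (as the __getitem__ below does with np.random), those generators need seeding too. A minimal sketch of a fuller seeding helper; the cudnn flags are optional and trade speed for determinism:

import random
import numpy as np
import torch

def set_seed(seed):
    random.seed(seed)                 # Python's built-in RNG
    np.random.seed(seed)              # NumPy RNG, used by the Dataset below
    torch.manual_seed(seed)           # CPU RNG
    torch.cuda.manual_seed_all(seed)  # all GPU RNGs
    # optional, for bit-exact repeatability at some speed cost:
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False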
For modularity, create a separate pre_dataset.py file as the data-preprocessing module:
# pre_dataset.py
import os
import glob
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from imageio import imread  # any image reader with the same semantics works

class PreData(Dataset):
    def __init__(self, data_path, data_list, input_size=(473, 473)):
        self.fwflow_list = []
        self.bwflow_list = []
        self.img_list = []
        self.label_list = []
        with open(data_list) as f:
            seqs = f.readlines()
        seqs = [seq.strip() for seq in seqs]
        print(seqs)
        # Taking DAVIS16 as an example:
        for i in seqs:
            self.img_list += sorted(glob.glob(os.path.join(data_path, "JPEGImages/480p", i, "*.jpg")))[:-1]
            self.label_list += sorted(glob.glob(os.path.join(data_path, "Annotations/480p", i, "*.png")))[:-1]
            self.fwflow_list += sorted(glob.glob(os.path.join(data_path, "davis_flow", i, "*.png")))
            self.bwflow_list += sorted(glob.glob(os.path.join(data_path, "davis_bwflow", i, "*.png")))
        self.dataset_len = len(self.img_list)  # used below when sampling the second frame
        self.H, self.W = input_size  # target size the inputs are resized to

    def __len__(self):
        return len(self.img_list)

    def __getitem__(self, item):  # the network input here is two frames of a video sequence plus their optical flow; adapt this function to your needs
        frame = [item]
        scope = 10  # maximum random frame offset
        other = np.random.randint(-scope, scope)
        while item + other >= self.dataset_len or item + other < 0 or other == 0:
            other = np.random.randint(-scope, scope)
        name1 = self.img_list[item]
        name2 = self.img_list[item + other]
        # resample until both frames belong to the same video sequence
        while name1.split('/')[-2] != name2.split('/')[-2]:
            other = np.random.randint(-scope, scope)
            while item + other >= self.dataset_len or item + other < 0 or other == 0:
                other = np.random.randint(-scope, scope)
            name2 = self.img_list[item + other]
        frame.append(item + other)  # the current frame plus a randomly chosen frame of the same sequence form one input pair
        videos, labels, fwflows, bwflows = [], [], [], []
        for i in frame:
            video = imread(self.img_list[i])
            fw = imread(self.fwflow_list[i])
            bw = imread(self.bwflow_list[i])
            label = imread(self.label_list[i])
            if len(label.shape) == 3:
                label = label[:, :, 0]
            label = label[:, :, np.newaxis]
            videos.append(img_normalize(video.astype(np.float32) / 255.))
            labels.append(label.astype(np.float32) / 255.)
            fwflows.append(img_normalize(fw.astype(np.float32) / 255.))
            bwflows.append(img_normalize(bw.astype(np.float32) / 255.))
        H, W = labels[0].shape[0], labels[0].shape[1]  # original resolution, returned so the output can be resized back later
        return {'video': F.interpolate(torch.from_numpy(np.stack(videos, 0)).permute(0, 3, 1, 2), (self.H, self.W), mode='bilinear', align_corners=True),
                'fwflow': F.interpolate(torch.from_numpy(np.stack(fwflows, 0)).permute(0, 3, 1, 2), (self.H, self.W), mode='bilinear', align_corners=True),
                'bwflow': F.interpolate(torch.from_numpy(np.stack(bwflows, 0)).permute(0, 3, 1, 2), (self.H, self.W), mode='bilinear', align_corners=True),
                "label": torch.from_numpy(np.stack([labels[0]], 0)).permute(0, 3, 1, 2),
                "H": H, "W": W, 'name': self.img_list[item].split("/")[-2] + "/" + self.img_list[item].split("/")[-1]}  # return whatever the training loop needs
# Image normalization with the ImageNet mean/std (grayscale inputs are replicated to three channels); used in __getitem__ above
def img_normalize(image):
    if len(image.shape) == 2:
        channel = (image[:, :, np.newaxis] - 0.485) / 0.229
        image = np.concatenate([channel, channel, channel], axis=2)
    else:
        image = (image - np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape((1, 1, 3))) \
                / np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape((1, 1, 3))
    return image
Now the dataset can be loaded inside the main() of the training or test file:
def main():
    ......
    if args.dataset == 'davis16':
        h, w = map(int, args.input_size.split(','))
        input_size = (h, w)
        # taking the test set as an example:
        db_test = PreData(data_path=args.data_dir, data_list=args.data_list, input_size=input_size)
        testloader = data.DataLoader(db_test, batch_size=1, shuffle=False, num_workers=0)
    elif args.dataset == 'duts':
        ......
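Before training or testing in earnest, it can be worth pulling one batch from the loader to verify tensor shapes; a small check using the keys returned by PreData.__getitem__:

for batch in testloader:
    print(batch['video'].shape, batch['fwflow'].shape, batch['bwflow'].shape, batch['label'].shape)
    break  # one batch is enough for a shape check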
There are many ways to schedule the learning rate; the following is just one example:
def adjust_learning_rate(optimizer, decay_count, decay_rate=.9):
    for param_group in optimizer.param_groups:
        param_group['lr'] = max(1e-5, 5e-4 * pow(decay_rate, decay_count))  # exponential decay with a 1e-5 floor
        print(param_group['lr'])
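Alternatively, torch.optim.lr_scheduler ships ready-made schedules. A self-contained sketch with StepLR; the stand-in model and the step_size/gamma values are placeholders for illustration:

from torch import nn, optim
from torch.optim import lr_scheduler

model = nn.Linear(4, 2)  # stand-in model, just for demonstration
optimizer = optim.SGD(model.parameters(), lr=5e-4)
scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)  # lr *= 0.9 every 5 epochs
for epoch in range(15):
    # ... one training epoch here ...
    scheduler.step()  # advance the schedule once per epoch
    print(epoch, scheduler.get_last_lr())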
For example, if binary cross-entropy (BCE) loss is needed, define a function like this:
bce = nn.BCELoss(reduction='mean')  # note: do not give the function below the same name, or the call would recurse forever

def bce_loss(pred, target):
    loss = 0
    bce_out = bce(pred, target)
    loss += bce_out  # accumulate here if the network has several outputs
    return loss
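One caveat: nn.BCELoss expects probabilities in [0, 1], so the network output must already have gone through a sigmoid. If the model returns raw logits, nn.BCEWithLogitsLoss (which applies the sigmoid internally and is numerically more stable) is the safer choice; a hedged alternative:

bce_with_logits = nn.BCEWithLogitsLoss(reduction='mean')

def bce_logits_loss(pred_logits, target):  # hypothetical variant for logit outputs
    return bce_with_logits(pred_logits, target)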
Wrap whatever losses you need into functions like this; later code simply calls them.
def main():
    ......  # (the model-parameter initialization described above)
    param_group = [{'params': get_lr_params(model), 'lr': 1 * args.learning_rate},
                   {'params': get_last_lr_params(model), 'lr': 10 * args.learning_rate}]  # different learning rates for specific layers
    optimizer = optim.SGD(param_group, lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay)  # SGD: stochastic gradient descent; swap in another optimizer if you prefer
    optimizer.zero_grad()  # zero every parameter's gradient, i.e. initialize
    ......
where:
def get_lr_params(model):
    """
    Returns all network parameters except those of the final classification layer.
    """
    b = []
    if torch.cuda.device_count() == 1:  # single-GPU training: presumably because memory is limited, only one layer is trained here
        b.append(model.encoder.layer3)
    else:  # multi-GPU training (the model is wrapped in DataParallel, hence .module)
        b.append(model.module.encoder.conv1)
        b.append(model.module.encoder.bn1)
        b.append(model.module.encoder.layer1)
        b.append(model.module.encoder.layer2)
        b.append(model.module.encoder.layer3)
        b.append(model.module.encoder.main_classifier)
    for i in range(len(b)):
        for j in b[i].modules():
            for k in j.parameters():
                if k.requires_grad:
                    yield k
# Alternatively, append the parameters directly, in which case the body becomes:
    b = []
    if torch.cuda.device_count() == 1:
        b.append(model.encoder.layer3.parameters())
    else:
        b.append(model.module.encoder.conv1.parameters())
        b.append(model.module.encoder.bn1.parameters())
        b.append(model.module.encoder.layer1.parameters())
        b.append(model.module.encoder.layer2.parameters())
        b.append(model.module.encoder.layer3.parameters())
        b.append(model.module.encoder.main_classifier.parameters())
    for j in range(len(b)):
        for i in b[j]:
            yield i
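If you know the parameter names, an equivalent and arguably simpler version filters model.named_parameters() directly; the 'encoder.' prefix below mirrors the naming used in this tutorial and is an assumption about your model (under DataParallel the names start with 'module.encoder.' instead):

def get_lr_params(model):
    # yield every trainable parameter whose name starts with 'encoder.'
    for name, param in model.named_parameters():
        if param.requires_grad and name.startswith('encoder.'):
            yield param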
def get_last_lr_params(model):
    """
    Returns all parameters of the final classification layer.
    """
    ......  # same pattern as above
def main():
    ......  # (model-parameter initialization + optimizer + dataset loading)
    print("=====> Building network")
    saved_state_dict = torch.load(args.restore_from)  # load the DeepLabv3 weights
    model = MyNet([needed_variables])  # instantiate our network
    new_params = model.state_dict().copy()  # copy of our network's state dict
    for i in saved_state_dict["model"]:
        i_parts = i.split('.')  # handle the multi-GPU case: drop the leading 'module.' from each key
        print('i_parts: ', '.'.join(i_parts[1:-1]))
        new_params['encoder' + '.' + '.'.join(i_parts[1:])] = saved_state_dict["model"][i]
    print("=====> Loading init weights")
    model.load_state_dict(new_params)
    if args.cuda:
        if torch.cuda.device_count() > 1:
            print("torch.cuda.device_count()=", torch.cuda.device_count())
            model = torch.nn.DataParallel(model).cuda()
            print("more than 1 gpu")
        else:
            print("single GPU for training")
            model = model.cuda()
    model.train()  # put the model in training mode
    cudnn.benchmark = True
    if not os.path.exists(args.snapshot_dir):
        os.makedirs(args.snapshot_dir)  # create the folder that stores the snapshots
    print('=====> Computing network parameters')
    total_parameters = netParams(model)
    print('Total network parameters: ' + str(total_parameters))
    # log file
    logFileLoc = os.path.join(args.snapshot_dir, args.logFile)
    if os.path.isfile(logFileLoc):
        logger = open(logFileLoc, 'a')
    else:
        logger = open(logFileLoc, 'w')
        logger.write("Parameters: %s" % (str(total_parameters)))
        logger.write("\n%s\t\t%s" % ('iter', 'Loss(train)\n'))
    logger.flush()
# Helper used above to count the network parameters
def netParams(model):
    '''
    Compute the total number of network parameters.
    Args:
        model: the model
    Returns: total number of network parameters
    '''
    total_parameters = 0
    for parameter in model.parameters():
        i = len(parameter.size())
        p = 1
        for j in range(i):
            p *= parameter.size(j)
        total_parameters += p
    return total_parameters
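For reference, Tensor.numel() gives the same count in a single line:

def netParams(model):
    return sum(p.numel() for p in model.parameters())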
def main():
    start = timeit.default_timer()  # start time
    ......  # (the network-building part above)
    print("=====> Begin to train")
    train_len = len(trainloader)
    print(" iteration numbers of per epoch: ", train_len)
    print(" epoch num: ", args.maxEpoches)
    print(" max iteration: ", args.maxEpoches * train_len)
    for epoch in range(1, int(args.maxEpoches) + 1):
        running_loss = 0.0
        ite_num_per = 0
        iter_num = 0
        # datasampler.set_epoch(epoch)  # only needed when using a DistributedSampler
        model.train()
        i = 0
        if epoch > 15:
            adjust_learning_rate(optimizer, epoch - 15)  # learning-rate decay after epoch 15
        for data in trainloader:
            ite_num_per = ite_num_per + 1
            i += 1
            iter_num = iter_num + 1
            img, fw_flow, bw_flow, label = data['video'].cuda(), \
                                           data['fwflow'].cuda(), \
                                           data['bwflow'].cuda(), \
                                           data['label'].cuda()
            B, Seq, C, H, W = img.size()
            spatial_out, temporal_out = model(img, torch.cat((fw_flow, bw_flow), 2))  # network outputs; write this according to your own model
            spatial_loss = bce_loss(spatial_out, label.view(B * Seq, 1, H, W))
            temporal_loss = bce_loss(temporal_out, label.view(B * Seq, 1, H, W))  # assuming the bce_loss function defined earlier
            loss = spatial_loss + temporal_loss
            running_loss += loss.item()  # accumulated loss
            loss.backward()  # backpropagate to compute the current gradients
            optimizer.step()  # update the network parameters from the gradients
            optimizer.zero_grad()  # clear the old gradients
            print("[epoch: {}/{}, iter: {}/{}, iter: {}] train loss: {:.5f}".format(epoch, args.maxEpoches, i, train_len, iter_num, running_loss / ite_num_per))
            logger.write("Epoch[{}]({}/{}): Loss: {:.10f} lr: {:.5f}\n".format(epoch, i, train_len, loss.item(), optimizer.param_groups[0]['lr']))  # write the log file
            logger.flush()  # flush the buffer
        print("=====> saving model")
        torch.save({'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()},
                   os.path.join(args.snapshot_dir, "epoch_{}_loss_{:.5f}.pth".format(epoch, running_loss / ite_num_per)))  # save the current model
    end = timeit.default_timer()  # end time
    print(float(end - start) / 3600, 'h')  # total training time
    logger.write("total training time: {:.2f} h\n".format(float(end - start) / 3600))
    logger.close()
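Since each snapshot stores both the model and the optimizer state (see the torch.save call above), training can be resumed by loading them back; a minimal sketch, with a placeholder checkpoint path:

checkpoint = torch.load('./snapshots/davis_iteration/epoch_10_loss_0.01234.pth')  # hypothetical snapshot
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])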
The test stage is comparatively simple and needs no extra helper functions, so the test main() can be written directly:
def main():
    ......  # (model-parameter initialization + dataset loading)
    # load the trained network weights
    print("=====> Loading network")
    model = MyNet([needed_variables]).cuda()
    for param in model.parameters():
        param.requires_grad = False
    saved_state_dict = torch.load(args.restore_from)["state_dict"]  # the training snapshot stores the weights under 'state_dict'
    model_dict = model.state_dict()
    pretrained_dict = {k[7:]: v for k, v in saved_state_dict.items() if k[7:] in model_dict}  # k[7:] strips the 'module.' prefix left by DataParallel
    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)
    model.eval()
    start = timeit.default_timer()  # start time
    num = 0  # processed-image counter
    for data in testloader:
        img, fw_flow, bw_flow = data['video'].cuda(), data['fwflow'].cuda(), data['bwflow'].cuda()
        H, W = int(data["H"]), int(data["W"])  # original resolution (batch_size is 1); keep these as plain ints for F.interpolate
        flow = torch.cat((fw_flow, bw_flow), 2)
        with torch.no_grad():
            out, _ = model(img, flow)
        # post-process the model output; write this according to your own needs
        out = F.interpolate(out[0], (H, W), mode='bilinear', align_corners=True)  # resize back to the original resolution
        out = out[0, 0].cpu().numpy()
        out = (out - np.min(out) + 1e-12) / (np.max(out) - np.min(out) + 1e-12) * 255.  # min-max normalize to [0, 255]
        out = out.astype(np.uint8)
        save_folder = args.save_dir + "davis16/" + data['name'][0].split("/")[-2]
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)  # create the folder for the test results
        imwrite(save_folder + "/" + data['name'][0].split("/")[-1], out)  # save the output image to the folder
        print('save: ' + data['name'][0])
        num += 1
    end = timeit.default_timer()  # end time
    total_time = end - start
    print('total_time:' + str(total_time) + ', fps:' + str(num / total_time))  # fps = frames per second
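One caveat on the timing above: CUDA kernels run asynchronously, so the clock may stop before the GPU has actually finished. For a more faithful fps figure, synchronize before reading the end time:

if args.cuda:
    torch.cuda.synchronize()  # wait for all pending GPU work
end = timeit.default_timer()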
This tutorial is based on the code released with the following papers:
[1] Wang W, Lu X, Shen J, et al. Zero-shot video object segmentation via attentive graph neural networks[C]//Proceedings of the IEEE/CVF International Conference on Computer Vision. 2019: 9236-9245.
Code: https://github.com/carrierlxk/AGNN
[2] Ren S, Liu W, Liu Y, et al. Reciprocal transformations for unsupervised video object segmentation[C]//Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2021: 15455-15464.
Code: https://github.com/OliverRensu/RTNet