Low-level API training means driving training and validation with an explicit Python for loop. Because the low-level API does not support data-sink training, the code must be rewritten with the high-level API before data-sink mode can be used to improve performance. (In this example, throughput rose from 770 imgs/sec in low-level API mode to 840 imgs/sec after converting to sink mode.)
With low-level API training, the user controls exactly which data is fed into the network, as in the loop below.
...
for epoch in range(epoch_max):
    print("\nEpoch:", epoch + 1, "/", epoch_max)
    t_start = time.time()
    train_loss = 0
    train_correct = 0
    train_total = 0
    for _, (data, gt_classes, _) in enumerate(ds_train):
        model.set_train()
        # Feed the image data and labels into the network
        loss, output = model(data, gt_classes)
        train_loss += loss
        correct = correct_num(output, gt_classes)
        correct = correct.asnumpy()
        train_correct += correct.sum()
        cb_params.cur_step_num += 1
        ckpoint_cb.step_end(run_context)
...
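The high-level equivalent replaces this loop with Model.train in sink mode. A minimal sketch follows; net, loss_fn, and opt are assumptions standing in for the example's network, loss, and optimizer, while ckpoint_cb, ds_train, and steps_per_epoch_train are taken from the surrounding code:
from mindspore.train import Model
from mindspore.train.callback import LossMonitor

# net, loss_fn, opt, ckpoint_cb, ds_train, and steps_per_epoch_train are
# assumed to be defined as in the surrounding example.
model = Model(net, loss_fn=loss_fn, optimizer=opt, metrics={'acc'})
model.train(epoch_max, ds_train, callbacks=[ckpoint_cb, LossMonitor()],
            dataset_sink_mode=True, sink_size=steps_per_epoch_train)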
After the low-level API code is converted to sink mode, the entire dataset is fed into the network and you can no longer choose which columns are passed in, so the dataset's output columns must match the network's inputs; any mismatch raises an error.
Case 1: the dataset outputs three columns (image data, label, and flag), while the network only accepts input_data and label.
class Dataset:
    def __init__(self, image_list, label_list):
        super(Dataset, self).__init__()
        self.imgs = image_list
        self.labels = label_list

    def __getitem__(self, index):
        img = Image.open(self.imgs[index]).convert('RGB')
        flag = self.labels[index]
        return img, self.labels[index], flag

    def __len__(self):
        return len(self.imgs)

def create_dataset(data_path, do_train, batch_size, repeat_num=1):
    ...
    dataset = Dataset(save_image_list, save_label_list)
    sampler = MySampler(dataset)
    print("sample")
    cifar_ds = ds.GeneratorDataset(dataset,
                                   column_names=["image", "label", "flag"],
                                   sampler=sampler, shuffle=True)
    ...
    steps_per_epoch = cifar_ds.get_dataset_size()
    cifar_ds = cifar_ds.repeat(repeat_num)
    return cifar_ds, steps_per_epoch
...
ds_train, steps_per_epoch_train = create_dataset(data_path, do_train=True, batch_size=train_batch_size, repeat_num=1)
...
model.train(epoch_max, ds_train, callbacks=[ckpoint_cb, LossMonitor(), fps],
            dataset_sink_mode=sink_mode, sink_size=steps_per_epoch_train)
This produces the following error:
[ERROR] ANALYZER(26833,python):2021-07-15-07:03:21.469.703 [mindspore/ccsrc/pipeline/jit/static_analysis/evaluator.cc:74] Eval] Function construct, The number of parameters of this function is 2, but the number of provided arguments is 3. NodeInfo: In file MindSpore_1P_definedata_high.py(35)
def construct(self, input_data, label):
^
Traceback (most recent call last):
  File "MindSpore_1P_definedata_high.py", line 344, in <module>
    VAL_BATCH_SIZE, REPEAT_SIZE, dataset_sink_mode)
  File "MindSpore_1P_definedata_high.py", line 279, in train_net
    dataset_sink_mode=sink_mode, sink_size=steps_per_epoch_train)
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/train/model.py", line 627, in train
    sink_size=sink_size)
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/train/model.py", line 413, in _train
    self._train_dataset_sink_process(epoch, train_dataset, list_callback, cb_params, sink_size)
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/train/model.py", line 475, in _train_dataset_sink_process
    outputs = self._train_network(*inputs)
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/nn/cell.py", line 341, in __call__
    out = self.compile_and_run(*inputs)
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/nn/cell.py", line 608, in compile_and_run
    self.compile(*inputs)
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/nn/cell.py", line 595, in compile
    _executor.compile(self, *inputs, phase=self.phase, auto_parallel_mode=self._auto_parallel_mode)
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/common/api.py", line 494, in compile
    result = self._executor.compile(obj, args_list, phase, use_vm)
TypeError: mindspore/ccsrc/pipeline/jit/static_analysis/evaluator.cc:74 Eval] Function construct, The number of parameters of this function is 2, but the number of provided arguments is 3. NodeInfo: In file MindSpore_1P_definedata_high.py(35)
def construct(self, input_data, label):
^
The function call stack (See file 'analyze_fail.dat' for details):
# 0 In file /home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/train/dataset_helper.py(75)
        return self.network(*outputs)
               ^
# 1 In file MindSpore_1P_definedata_high.py(56)
        loss = self.network(*inputs)
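A quick way to catch this mismatch before calling model.train is to compare the dataset's column names against the number of inputs construct expects; get_col_names() is a standard method on MindSpore datasets, and the variable names below come from the example above:
# Print the columns that sink mode will feed into construct, in order
print(ds_train.get_col_names())   # ['image', 'label', 'flag'] -> three columns
# construct(self, input_data, label) takes only two inputs, so the three
# sunk columns cannot be matched and compilation fails as shown above.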
Case 2: the dataset outputs only a single column (image data), while the network still requires input_data and label.
class Dataset:
    def __init__(self, image_list, label_list):
        super(Dataset, self).__init__()
        self.imgs = image_list
        self.labels = label_list

    def __getitem__(self, index):
        img = Image.open(self.imgs[index]).convert('RGB')
        return img

    def __len__(self):
        return len(self.imgs)

def create_dataset(data_path, do_train, batch_size, repeat_num=1):
    ...
    dataset = Dataset(save_image_list, save_label_list)
    sampler = MySampler(dataset)
    print("sample")
    cifar_ds = ds.GeneratorDataset(dataset, column_names=["image"], sampler=sampler, shuffle=True)
    ...
    steps_per_epoch = cifar_ds.get_dataset_size()
    cifar_ds = cifar_ds.repeat(repeat_num)
    return cifar_ds, steps_per_epoch
...
ds_train, steps_per_epoch_train = create_dataset(data_path, do_train=True, batch_size=train_batch_size, repeat_num=1)
...
model.train(epoch_max, ds_train, callbacks=[ckpoint_cb, LossMonitor(), fps],
            dataset_sink_mode=sink_mode, sink_size=steps_per_epoch_train)
This produces the following error:
Traceback (most recent call last):
  File "MindSpore_1P_definedata_high.py", line 344, in <module>
    VAL_BATCH_SIZE, REPEAT_SIZE, dataset_sink_mode)
  File "MindSpore_1P_definedata_high.py", line 279, in train_net
    dataset_sink_mode=sink_mode, sink_size=steps_per_epoch_train)
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/train/model.py", line 627, in train
    sink_size=sink_size)
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/train/model.py", line 413, in _train
    self._train_dataset_sink_process(epoch, train_dataset, list_callback, cb_params, sink_size)
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/train/model.py", line 466, in _train_dataset_sink_process
    dataset_helper=dataset_helper)
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/train/model.py", line 292, in _exec_preprocess
    dataset_helper = DatasetHelper(dataset, dataset_sink_mode, sink_size, epoch_num)
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/train/dataset_helper.py", line 218, in __init__
    self.iter = iterclass(dataset, sink_size, epoch_num)
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/train/dataset_helper.py", line 360, in __init__
    super().__init__(dataset, sink_size, epoch_num)
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/train/dataset_helper.py", line 268, in __init__
    create_data_info_queue=create_data_info_queue)
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/train/_utils.py", line 59, in _exec_datagraph
    batch_size = exec_dataset.get_batch_size()
  File "/home/ma-user/miniconda3/envs/MindSpore-python3.7-aarch64/lib/python3.7/site-packages/mindspore/dataset/engine/datasets.py", line 1567, in get_batch_size
    self._batch_size = runtime_getter[0].GetBatchSize()
RuntimeError: Thread ID 281473819629872 Unexpected error. input column name: label doesn't exist in the dataset columns.
Line of code : 366
File : /home/jenkins/agent-working-dir/workspace/Compile_Ascend_ARM_EulerOS/mindspore/mindspore/ccsrc/minddata/dataset/engine/datasetops/map_op/map_op.cc
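For comparison, here is a minimal sketch of a matching setup in which the dataset emits exactly the two columns that construct consumes. The class and variable names follow the example above, and MySampler is assumed from the original code:
import mindspore.dataset as ds
from PIL import Image

class Dataset:
    def __init__(self, image_list, label_list):
        super(Dataset, self).__init__()
        self.imgs = image_list
        self.labels = label_list

    def __getitem__(self, index):
        img = Image.open(self.imgs[index]).convert('RGB')
        # Return exactly the two columns the network consumes
        return img, self.labels[index]

    def __len__(self):
        return len(self.imgs)

dataset = Dataset(save_image_list, save_label_list)
sampler = MySampler(dataset)
cifar_ds = ds.GeneratorDataset(dataset, column_names=["image", "label"], sampler=sampler)
With two sunk columns and construct(self, input_data, label) taking two inputs, sink mode compiles and runs. Note that the sketch relies on the sampler for ordering rather than also passing shuffle.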
Some tasks, such as detection, train several losses at once; in that case you want to output every loss value at each step so that each loss's trend can be monitored.
By default MindSpore only supports outputting a single loss. To output several losses at once, you need to rewrite TrainOneStepCell (a built-in MindSpore cell); the main change is the gradient-computation part of construct. The code is shown below:
import mindspore.nn as nn
from mindspore import ParameterTuple
from mindspore.context import ParallelMode
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.ops import operations as P
# Note: module paths follow MindSpore 1.x; _get_mirror_mean was renamed
# _get_gradients_mean in later versions.
from mindspore.parallel._utils import _get_device_num, _get_mirror_mean, _get_parallel_mode

class TrainOneStepCell(nn.Cell):
    def __init__(self, network, optimizer, sens=1.0):
        super(TrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.grad_reducer = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)

    def construct(self, data, label):
        weights = self.weights
        # The wrapped network returns two losses instead of one
        loss1, loss2 = self.network(data, label)
        # Build one sensitivity tensor per loss output
        sens1 = P.Fill()(P.DType()(loss1), P.Shape()(loss1), self.sens)
        sens2 = P.Fill()(P.DType()(loss2), P.Shape()(loss2), self.sens)
        sens = (sens1, sens2)
        grads = self.grad(self.network, weights)(data, label, sens)
        if self.reducer_flag:
            # apply grad reducer on grads
            grads = self.grad_reducer(grads)
        # Apply the optimizer once and attach the update to both returned losses
        succ = self.optimizer(grads)
        return F.depend(loss1, succ), F.depend(loss2, succ)
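A minimal sketch of how this cell might be wired up follows. The two-loss wrapper NetWithTwoLosses, the backbone net, and the loss functions are assumptions for illustration, not part of the original example:
class NetWithTwoLosses(nn.Cell):
    """Wraps a backbone that produces two outputs and returns two losses."""
    def __init__(self, backbone, loss_fn1, loss_fn2):
        super(NetWithTwoLosses, self).__init__(auto_prefix=False)
        self.backbone = backbone
        self.loss_fn1 = loss_fn1
        self.loss_fn2 = loss_fn2

    def construct(self, data, label):
        out1, out2 = self.backbone(data)
        return self.loss_fn1(out1, label), self.loss_fn2(out2, label)

# net and ds_train are assumed to be the network and dataset from the example
loss_net = NetWithTwoLosses(net,
                            nn.SoftmaxCrossEntropyWithLogits(sparse=True),
                            nn.SoftmaxCrossEntropyWithLogits(sparse=True))
opt = nn.Momentum(loss_net.trainable_params(), learning_rate=0.01, momentum=0.9)
train_net = TrainOneStepCell(loss_net, opt)
train_net.set_train()
for data, label in ds_train.create_tuple_iterator():
    loss1, loss2 = train_net(data, label)   # both loss values are visible per step
Because construct returns both losses each step, the training loop (or a custom callback) can log the two trends separately.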