Source code: https://github.com/kujason/avod
Paper: https://arxiv.org/abs/1712.02294
This blog series records my notes from studying the AVOD code. The comments in the code are my own; I am new to Python, so corrections are welcome wherever I got something wrong. The whole series was copied straight out of PyCharm, so it may not be the easiest to read. This post covers two files: avod/experiments/run_training.py and avod/core/trainer.py.
#coding=utf-8
"""Detection model trainer.
This runs the DetectionModel trainer.
"""
import argparse
import os
import tensorflow as tf
import avod
import avod.builders.config_builder_util as config_builder
from avod.builders.dataset_builder import DatasetBuilder
from avod.core.models.avod_model import AvodModel
from avod.core.models.rpn_model import RpnModel
from avod.core import trainer
tf.logging.set_verbosity(tf.logging.ERROR)
def train(model_config, train_config, dataset_config):
    # A pile of setup!!!! This reads out the detailed contents of the config file
    dataset = DatasetBuilder.build_kitti_dataset(dataset_config,
                                                 use_defaults=False)

    train_val_test = 'train'

    # 'avod_model' here
    model_name = model_config.model_name

    with tf.Graph().as_default():
        if model_name == 'rpn_model':
            model = RpnModel(model_config,
                             train_val_test=train_val_test,
                             dataset=dataset)
        elif model_name == 'avod_model':
            # AvodModel is built from the model config, the 'train' mode
            # flag and the dataset, i.e. the avod_model settings
            model = AvodModel(model_config,
                              train_val_test=train_val_test,
                              dataset=dataset)
        else:
            raise ValueError('Invalid model_name')

        # Lives under avod/core; execution continues in trainer.train below
        trainer.train(model, train_config)
# This is where the program starts
def main(_):
    parser = argparse.ArgumentParser()

    # Defaults
    # Training settings
    # (a note on str.split(): it slices a string on the given separator, and
    #  if maxsplit is given, at most that many splits are made)
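    # For example (my own illustration; nothing below actually calls split()):
    #   'train/val/test'.split('/', 1)  ->  ['train', 'val/test']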
    default_pipeline_config_path = avod.root_dir() + \
        '/configs/avod_cars_example.config'
    default_data_split = 'train'
    default_device = '1'

    # These flags can be set from the terminal when you launch training; any
    # flag you leave unspecified falls back to the defaults above (see the
    # example invocation after this file)
    parser.add_argument('--pipeline_config',
                        type=str,
                        dest='pipeline_config_path',
                        default=default_pipeline_config_path,
                        help='Path to the pipeline config')

    parser.add_argument('--data_split',
                        type=str,
                        dest='data_split',
                        default=default_data_split,
                        help='Data split for training')

    parser.add_argument('--device',
                        type=str,
                        dest='device',
                        default=default_device,
                        help='CUDA device id')

    args = parser.parse_args()

    # Parse pipeline config
    # The default avod_cars_example.config does not train all that well;
    # use pyramid_cars_with_aug_example instead
    model_config, train_config, _, dataset_config = \
        config_builder.get_configs_from_pipeline_file(
            args.pipeline_config_path, is_training=True)

    # Overwrite data split ('train' or 'val')
    dataset_config.data_split = args.data_split

    # Set CUDA device id
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device

    train(model_config, train_config, dataset_config)


if __name__ == '__main__':
    tf.app.run()
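That is all of run_training.py. For reference, the AVOD README launches training from the repository root along these lines (the config path and device id depend on your setup):

python avod/experiments/run_training.py --pipeline_config=avod/configs/pyramid_cars_with_aug_example.config --device='0' --data_split='train'

The second file, avod/core/trainer.py, is where run_training.py hands control via trainer.train(model, train_config):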
#coding=utf-8
"""Detection model trainer.
This file provides a generic training method to train a
DetectionModel.
"""
import datetime
import os
import tensorflow as tf
import time
from avod.builders import optimizer_builder
from avod.core import trainer_utils
from avod.core import summary_utils
slim = tf.contrib.slim
def train(model, train_config):
    """Training function for detection models.
    Args:
        model: The detection model object.
        train_config: a train_*pb2 protobuf.
            training i.e. loading RPN weights onto AVOD model.
    """
    model = model
    train_config = train_config

    # Get model configurations
    model_config = model.model_config

    # Create a variable tensor to hold the global step
    global_step_tensor = tf.Variable(
        0, trainable=False, name='global_step')

    #############################
    # Get training configurations
    #############################
    # 120000 in the example config
    max_iterations = train_config.max_iterations
    # 10
    summary_interval = train_config.summary_interval
    # 1000
    checkpoint_interval = \
        train_config.checkpoint_interval
    # 10000
    max_checkpoints = train_config.max_checkpoints_to_keep

    # e.g. data/output/avod_cars_example
    paths_config = model_config.paths_config

    # Directory where the logs are written
    logdir = paths_config.logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    checkpoint_dir = paths_config.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    checkpoint_path = checkpoint_dir + '/' + \
        model_config.checkpoint_name

    global_summaries = set([])

    # The model should return a dictionary of predictions
    # This is the start of the prediction stage: model.build() goes into
    # avod_model.py, whose build() already takes the prediction outputs
    # of rpn_model as its input
    prediction_dict = model.build()

    # False! (I found these are set to False during training)
    summary_histograms = train_config.summary_histograms
    summary_img_images = train_config.summary_img_images
    summary_bev_images = train_config.summary_bev_images

    ##############################
    # Setup loss
    ##############################
    losses_dict, total_loss = model.loss(prediction_dict)

    # Optimizer (adam_optimizer in the example config)
    training_optimizer = optimizer_builder.build(
        train_config.optimizer,
        global_summaries,
        global_step_tensor)

    # Create the train op
    with tf.variable_scope('train_op'):
        train_op = slim.learning.create_train_op(
            total_loss,
            training_optimizer,
            clip_gradient_norm=1.0,
            global_step=global_step_tensor)

    # Save checkpoints regularly.
    saver = tf.train.Saver(max_to_keep=max_checkpoints,
                           pad_step_number=True)
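    # Note (my own annotation): with pad_step_number=True the checkpoint files
    # get the global step zero-padded to 8 digits (e.g. model-00001000), which
    # matches the '{:08d}' used in the progress print further down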
    # Add the result of the train_op to the summary
    # (create_train_op returns the total loss tensor, so this logs the loss)
    tf.summary.scalar("training_loss", train_op)

    # Add maximum memory usage summary op
    # This op can only be run on device with gpu
    # so it's skipped on travis
    is_travis = 'TRAVIS' in os.environ
    if not is_travis:
        # tf.summary.scalar('bytes_in_use',
        #                   tf.contrib.memory_stats.BytesInUse())
        tf.summary.scalar('max_bytes',
                          tf.contrib.memory_stats.MaxBytesInUse())

    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    summary_merged = summary_utils.summaries_to_keep(
        summaries,
        global_summaries,
        histograms=summary_histograms,
        input_imgs=summary_img_images,
        input_bevs=summary_bev_images)

    # True in the example config
    allow_gpu_mem_growth = train_config.allow_gpu_mem_growth
    if allow_gpu_mem_growth:
        # GPU memory config
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = allow_gpu_mem_growth
        sess = tf.Session(config=config)
    else:
        sess = tf.Session()

    # Create unique folder name using datetime for summary writer
    datetime_str = str(datetime.datetime.now())
    logdir = logdir + '/train'
    train_writer = tf.summary.FileWriter(logdir + '/' + datetime_str,
                                         sess.graph)

    # Create init op
    init = tf.global_variables_initializer()

    # Continue from last saved checkpoint
    # True! (overwrite_checkpoints is False, so this branch is taken and
    # training resumes from the latest checkpoint if one exists)
    if not train_config.overwrite_checkpoints:
        trainer_utils.load_checkpoints(checkpoint_dir,
                                       saver)
        if len(saver.last_checkpoints) > 0:
            checkpoint_to_restore = saver.last_checkpoints[-1]
            saver.restore(sess, checkpoint_to_restore)
        else:
            # Initialize the variables
            sess.run(init)
    else:
        # Initialize the variables
        sess.run(init)

    # Read the global step if restored
    global_step = tf.train.global_step(sess,
                                       global_step_tensor)
    print('Starting from step {} / {}'.format(
        global_step, max_iterations))
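    # Note (my own annotation): if a checkpoint was just restored, global_step
    # is nonzero here, so the loop below resumes mid-schedule instead of at 0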
    # Main Training Loop
    last_time = time.time()
    for step in range(global_step, max_iterations + 1):

        # Save checkpoint every checkpoint_interval (1000) steps
        if step % checkpoint_interval == 0:
            global_step = tf.train.global_step(sess,
                                               global_step_tensor)
            saver.save(sess,
                       save_path=checkpoint_path,
                       global_step=global_step)
            print('Step {} / {}, Checkpoint saved to {}-{:08d}'.format(
                step, max_iterations,
                checkpoint_path, global_step))

        # Create feed_dict for inferencing (the network inputs)
        feed_dict = model.create_feed_dict()

        # Write summaries and run the train op every summary_interval (10) steps
        if step % summary_interval == 0:
            current_time = time.time()
            time_elapsed = current_time - last_time
            last_time = current_time

            # This kicks off the entire prediction/forward pass
            train_op_loss, summary_out = sess.run(
                [train_op, summary_merged], feed_dict=feed_dict)

            print('Step {}, Total Loss {:0.3f}, Time Elapsed {:0.3f} s'.format(
                step, train_op_loss, time_elapsed))

            train_writer.add_summary(summary_out, step)
        else:
            # Run the train op only
            sess.run(train_op, feed_dict)

    # Close the summary writers
    train_writer.close()
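To see the training plumbing in isolation, here is a minimal toy sketch I wrote of the same pattern trainer.py uses: slim.learning.create_train_op returning the loss tensor, a Saver with padded step numbers, and the global step read back for saving/resuming. The loss and save path are made up; only the structure mirrors AVOD.

#coding=utf-8
# Minimal toy sketch (my own, not AVOD code) of the create_train_op /
# Saver / global_step pattern used in trainer.py above (TF 1.x)
import tensorflow as tf

slim = tf.contrib.slim

x = tf.Variable(5.0, name='x')
total_loss = tf.square(x)  # toy loss: x^2, minimized at x = 0

global_step_tensor = tf.Variable(0, trainable=False, name='global_step')
optimizer = tf.train.AdamOptimizer(learning_rate=0.1)

# Running train_op applies one optimizer step and evaluates to the loss
train_op = slim.learning.create_train_op(
    total_loss, optimizer, global_step=global_step_tensor)

# pad_step_number=True -> files like /tmp/toy_model-00000100
saver = tf.train.Saver(max_to_keep=3, pad_step_number=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(100):
        loss = sess.run(train_op)
    step = tf.train.global_step(sess, global_step_tensor)
    print('loss {:0.3f} at step {}'.format(loss, step))
    saver.save(sess, '/tmp/toy_model', global_step=step)  # resume point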