A personal summary of the ways to run a TensorFlow graph on multiple GPUs:
1. Use Google's tf.estimator framework;
session_config = tf.ConfigProto(device_count={'GPU': 4})  # up to 4 GPUs; duplicate 'GPU' dict keys would collapse to one entry
run_config = tf.estimator.RunConfig().replace(session_config=session_config)
estimator = tf.estimator.Estimator(
    model_fn=model_fn, model_dir=FLAGS.model_dir, config=run_config, params=params)
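Note that device_count only controls how many GPUs the session is allowed to see; to actually replicate the model across GPUs with tf.estimator, TF 1.8+ also offers MirroredStrategy. A minimal sketch, assuming TF >= 1.8 and the same model_fn/FLAGS/params as above:

# Sketch only, assuming TF >= 1.8: mirror the estimator across 4 GPUs
strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=4)
run_config = tf.estimator.RunConfig(train_distribute=strategy)
estimator = tf.estimator.Estimator(
    model_fn=model_fn, model_dir=FLAGS.model_dir, config=run_config, params=params)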
2. Multi-machine, multi-GPU distributed training (the Distribute approach), which parallelizes the model's computation across nodes;
Horovod GitHub homepage
Horovod example code
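For reference, a minimal Horovod-with-TF1 training sketch, following the pattern from the Horovod docs; build_loss is a placeholder for your own model, and you launch one process per GPU via horovodrun/mpirun:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()  # one process per GPU

# Pin each process to its own GPU
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())

loss = build_loss()  # placeholder: build your model's loss here
opt = tf.train.AdamOptimizer(0.001 * hvd.size())  # scale LR by worker count
opt = hvd.DistributedOptimizer(opt)  # all-reduce gradients across workers
train_op = opt.minimize(loss)

hooks = [hvd.BroadcastGlobalVariablesHook(0)]  # sync initial weights from rank 0
with tf.train.MonitoredTrainingSession(hooks=hooks, config=config) as sess:
    while not sess.should_stop():
        sess.run(train_op)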
3. Implement data parallelism yourself and run the computation on multiple GPUs (the approach this post covers).
Data parallelism means every GPU runs the same model, while each GPU processes a different batch of the data.
The parallel module: invokes the inference function on each GPU, passing it that GPU's batch of features.
import six
import tensorflow as tf


def _maybe_repeat(x, n):
    # A list is assumed to already hold one element per device;
    # anything else is replicated n times.
    if isinstance(x, list):
        assert len(x) == n
        return x
    else:
        return [x] * n


# Data-level parallelism
def data_parallelism(devices, fn, *args, **kwargs):
    num_worker = len(devices)
    devices = ["gpu:%d" % d for d in devices]
    # Replicate args and kwargs
    if args:
        new_args = [_maybe_repeat(arg, num_worker) for arg in args]
        # Transpose
        new_args = [list(x) for x in zip(*new_args)]
    else:
        new_args = [[] for _ in range(num_worker)]
    new_kwargs = [{} for _ in range(num_worker)]
    for k, v in six.iteritems(kwargs):
        vals = _maybe_repeat(v, num_worker)
        for i in range(num_worker):
            new_kwargs[i][k] = vals[i]
    fns = _maybe_repeat(fn, num_worker)
    # Now make the parallel call.
    outputs = []
    for i in range(num_worker):
        # Reuse variables on every replica after the first one
        with tf.variable_scope(tf.get_variable_scope(), reuse=(i != 0)):
            with tf.name_scope("parallel_%d" % i):
                with tf.device(devices[i]):
                    outputs.append(fns[i](*new_args[i], **new_kwargs[i]))  # invoke fn on this GPU
    return outputs
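A quick hypothetical usage, to show how arguments are dispatched: a list argument must carry one element per device (it is treated as already sharded), while a non-list argument is broadcast to every call:

# Hypothetical usage on two GPUs
def fn(x, scale=1.0):
    return x * scale

inputs = [tf.constant([1.0]), tf.constant([2.0])]  # one shard per GPU
outputs = data_parallelism([0, 1], fn, inputs, scale=0.5)
# fn(inputs[0], scale=0.5) ran on gpu:0, fn(inputs[1], scale=0.5) on gpu:1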
The final code for running VGG inference is as follows:
import numpy as np
import tensorflow as tf
import vgg16
import data_process.util as util
import utils
import time
import pandas as pd
import os
import parallel

batch_size = 12
vgg_output_dim = 4096
map1 = pd.DataFrame(columns=['img_path', 'index'])
device_list = [0, 1, 2, 3]  # use GPUs 0, 1, 2, 3
device_num = len(device_list)
batch_size = batch_size * device_num  # total batch size across all GPUs


def batch_generator():
    """Build the batch iterator; not essential, adapt it to your own data pipeline."""
    i = 0
    current_batch = 0
    file_index = -1
    img_path_list = util.list_all_files("../20200409/")  # just lists all image paths
    total_count = len(img_path_list)
    nb_batches = len(img_path_list) // batch_size
    print(nb_batches)
    while True:
        X = np.zeros((batch_size, 224, 224, 3), dtype=np.float32)
        count_tmp = 0
        while count_tmp < batch_size:
            file_index += 1
            if file_index >= total_count:
                break
            path = img_path_list[file_index]
            if path.split('.')[-1] == 'csv':
                continue
            img = utils.load_image(path)  # just reads the image from its path
            file_name = path.split('/')[-1]
            try:
                img.reshape((224, 224, 3))  # raises if the image is not 224x224x3
                X[count_tmp] = img
                map1.loc[i] = [file_name, i]
                i = i + 1
                count_tmp += 1
            except Exception:
                continue
        if count_tmp > 0:
            current_batch = current_batch + 1
            yield X[0:count_tmp, :, :, :], current_batch
        if count_tmp == 0:
            break


def shard_features(feat, placeholders, predictions):
    """Shard the input batch so each GPU's placeholder is fed its own slice;
    adapt this to your own feature structure."""
    num_shards = len(placeholders)
    feed_dict = {}
    n = 0
    batch = feat.shape[0]  # size of the current batch
    shard_size = (batch + num_shards - 1) // num_shards  # ceiling division
    for i in range(num_shards):
        shard_feat = feat[i * shard_size:(i + 1) * shard_size]
        if shard_feat.shape[0] != 0:
            feed_dict[placeholders[i]] = shard_feat
            n = i + 1
        else:
            break
    if isinstance(predictions, (list, tuple)):
        predictions = predictions[:n]  # drop the ops for GPUs that got no data
    return predictions, feed_dict


def inference_fn(features):
    """Inference function: build a VGG16 model on the given features and
    return fc7, a 4096-dimensional tensor."""
    vgg = vgg16.Vgg16()
    with tf.name_scope("content_vgg"):
        vgg.build(features)
    return vgg.fc7


with tf.Graph().as_default():
    placeholders = []
    for i in range(device_num):
        placeholders.append(tf.placeholder(tf.float32, [None, 224, 224, 3]))
    predictions = parallel.data_parallelism(device_list, inference_fn, placeholders)
    with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
        time_start = time.time()
        batch = batch_generator()
        df2 = pd.DataFrame(columns=["vgg{}".format(i) for i in range(vgg_output_dim)])
        for l, i in batch:
            print("batch {nums}".format(nums=i))
            op, feed_dict = shard_features(l, placeholders, predictions)
            prob = sess.run(op, feed_dict=feed_dict)
            # prob is a list with one output array per GPU
            for device in range(len(prob)):  # the last batch may use fewer GPUs
                df1 = pd.DataFrame(prob[device], columns=["vgg{}".format(i) for i in range(vgg_output_dim)])
                df2 = pd.concat([df2, df1], axis=0, ignore_index=False)
                print(df2.shape)
        print(df2.shape)
        print(map1)
        map1 = map1.reset_index()
        df2 = df2.reset_index()
        df = pd.concat([map1, df2], axis=1, ignore_index=True)
        if not os.path.exists('./results'):
            os.mkdir('./results')
        df_path = os.path.join('./results', 'map_00.csv')
        df.to_csv(df_path, mode='a', header=False)
        print("final result: {result}".format(result=df))
        time_end = time.time()
        print('time cost', time_end - time_start, 's')
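As a sanity check on shard_features, this standalone NumPy snippet (hypothetical batch sizes) shows the ceiling-division split it performs:

import numpy as np

batch, num_shards = 10, 4
shard_size = (batch + num_shards - 1) // num_shards  # ceiling division -> 3
sizes = [np.zeros(batch)[i * shard_size:(i + 1) * shard_size].shape[0]
         for i in range(num_shards)]
print(sizes)  # [3, 3, 3, 1]; with batch == 6 the split is [2, 2, 2, 0],
              # so the fourth GPU gets no data and its op is dropped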
The explanation is all in the code; prose alone is feeble, so read the code and the comments yourself!