I have a frozen model and 4 GPUs. I want to run inference over as much data as possible, as fast as possible. Essentially I want data parallelism: the same model runs inference on 4 batches at the same time, one batch per GPU.
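In other words, each GPU should get its own slice of the data, something like this (the shapes are just an example):

import numpy as np

# one batch of 100 images, 160x160 RGB
big_batch = np.ones((100, 160, 160, 3), dtype=np.float32)

# four roughly equal chunks, one per GPU
chunks = np.array_split(big_batch, 4)  # each chunk has shape (25, 160, 160, 3)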
This is roughly what I want to do:
import tensorflow as tf

def return_ops():
    # load the frozen graph (model_path points at the frozen .pb file)
    with tf.Graph().as_default() as graph:
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(model_path, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')
        inputs = []
        outputs = []
        # grab the input/output tensors once per GPU, pinned with tf.device
        with graph.as_default() as g:
            for gpu in ['/gpu:0', '/gpu:1', '/gpu:2', '/gpu:3']:
                with tf.device(gpu):
                    image_tensor = g.get_tensor_by_name('input:0')
                    get_embeddings = g.get_tensor_by_name('embeddings:0')
                    inputs.append(image_tensor)
                    outputs.append(get_embeddings)
    return inputs, outputs, g
However, when I run
import numpy as np

# sample batch
x = np.ones((100, 160, 160, 3))

# get the ops (per-GPU input and output tensors plus the graph)
image_tensor_list, emb_list, graph = return_ops()

# construct the feed dict
feed_dict = {it: x for it in image_tensor_list}

# run the ops
with tf.Session(graph=graph, config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    inf = sess.run(emb_list, feed_dict=feed_dict)
everything ends up running on /gpu:0 when I check with nvidia-smi.
However, I can run
with tf.device("/gpu:1"):
    t = tf.range(1000)

with tf.Session() as sess:
    sess.run(t)
and there is activity on the second GPU...
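One way to see where the ops actually land, I think, is to turn on log_device_placement in the session config (a minimal check, reusing graph, emb_list and feed_dict from above):

# print the device each op is assigned to when the session runs
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
with tf.Session(graph=graph, config=config) as sess:
    sess.run(emb_list, feed_dict=feed_dict)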
How do I implement this data-parallel task correctly?
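For reference, one alternative I can sketch (untested; the name return_ops_per_gpu and the tower_ prefixes are just made up here) would be to import the frozen graph once per device, so that each GPU gets its own copy of the ops:

import tensorflow as tf

def return_ops_per_gpu(model_path, gpus=('/gpu:0', '/gpu:1', '/gpu:2', '/gpu:3')):
    with tf.Graph().as_default() as graph:
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(model_path, 'rb') as fid:
            od_graph_def.ParseFromString(fid.read())

        inputs, outputs = [], []
        for i, gpu in enumerate(gpus):
            # a separate import per device gives each GPU its own set of ops
            with tf.device(gpu):
                tf.import_graph_def(od_graph_def, name='tower_%d' % i)
            inputs.append(graph.get_tensor_by_name('tower_%d/input:0' % i))
            outputs.append(graph.get_tensor_by_name('tower_%d/embeddings:0' % i))
    return inputs, outputs, graph

I am not sure whether this is the idiomatic way to do it, though.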