我一直在使用 TensorFlow 和 OpenAI Gym 进行深度强化学习。我的问题是 GPU 利用率很低。搜索这个问题后,我了解到在训练小型网络(例如训练 MNIST)时,不应期望很高的 GPU 利用率。但我认为我的神经网络并没有那么小。该架构与 DeepMind 原始论文中给出的架构大致相同。我的网络架构总结如下:
卷积层 1 (filters=32, kernel_size=8x8, strides=4)
卷积层 2 (filters=64, kernel_size=4x4, strides=2)
卷积层 3 (filters=64, kernel_size=3x3, strides=1)
全连接层 (units=512)
输出层 (units=9)
我正在使用 Tesla P100 16GB GPU 进行训练。我的学习算法是简单的 DQN(同样来自 DeepMind 论文)。超参数均按论文中给出的设置。GPU 利用率仍远低于 10%(如 nvidia-smi 所示)。可能的问题是什么?
import tensorflow as tf
import numpy as np
import os, sys
import gym
from collections import deque
from time import sleep
import os
# Expose only the CUDA device with index 1 to this process.
# NOTE(review): CUDA device indices are zero-based, so on a single-GPU
# machine this presumably hides the only GPU and TensorFlow would silently
# fall back to the CPU — confirm the intended device index.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
def reset_graph(seed=142):
    """Reset the default TensorFlow graph and seed the random generators.

    Args:
        seed: Seed applied to the graph-level TensorFlow RNG and to NumPy's
            global RNG so that repeated runs draw the same random numbers.
    """
    tf.reset_default_graph()
    # `seed` used to be accepted but ignored; apply it to both RNG sources
    # this script draws from (TensorFlow initializers, np.random).
    tf.set_random_seed(seed)
    np.random.seed(seed)
def preprocess_observation(obs):
    """Crop, downsample and grey-scale one raw frame.

    Rows 34..209 are kept, every second row and column is taken, the last
    axis is averaged away, and the result is divided by 255.

    Args:
        obs: Array indexed as (row, column, channel).

    Returns:
        An (88, 80) array of the channel means scaled by 1/255.
    """
    cropped = obs[34:210:2, ::2]  # crop and downsize
    grey = cropped.mean(axis=2)
    return grey.reshape(88, 80) / 255.0
def combine_observations_multichannel(preprocessed_observations):
    """Stack a sequence of 2-D frames into one channels-last array.

    Args:
        preprocessed_observations: Sequence of equally shaped 2-D arrays.

    Returns:
        An array whose last axis indexes the input frames, i.e. the
        (frame, row, column) stack with its axes reordered to
        (row, column, frame).
    """
    frames_first = np.array(preprocessed_observations)
    return frames_first.transpose(1, 2, 0)
# Number of consecutive preprocessed frames stacked into one network input.
n_observations_per_state = 3
# Rolling window of the most recent preprocessed frames.
preprocessed_observations = deque([], maxlen=n_observations_per_state)

env = gym.make("Breakout-v0")
obs = env.reset()

# Shape of one network input: the stacked frames form the channel axis, so
# the channel count is tied to the frame-stack depth instead of repeating
# the literal.
input_height = 88
input_width = 80
input_channels = n_observations_per_state

# Convolutional stack: one entry per layer in each list.
conv_n_maps = [32, 64, 64]
conv_kernel_sizes = [(8,8), (4,4), (3,3)]
conv_strides = [4, 2, 1]
conv_paddings = ["SAME"] * 3
conv_activation = [tf.nn.relu] * 3
# With "SAME" padding each layer outputs ceil(size / stride):
# 88 -> 22 -> 11 -> 11 rows and 80 -> 20 -> 10 -> 10 columns,
# so conv3 has 64 maps of 11x10 each.
n_hidden_in = 64 * 11 * 10
n_hidden = 512
hidden_activation = tf.nn.relu
n_outputs = env.action_space.n  # Number of discrete actions available
initializer = tf.variance_scaling_initializer()
def q_network(X_state, name):
    """Build one Q-network under the variable scope `name`.

    The network is the configured convolutional stack, followed by a dense
    hidden layer and a linear dense output layer with `n_outputs` units.

    Args:
        X_state: Input tensor fed to the first convolutional layer.
        name: Variable scope in which every layer is created.

    Returns:
        A tuple ``(outputs, trainable_vars_by_name)``: the output tensor and
        a dict mapping each trainable variable's name, with the scope-name
        prefix stripped, to the variable itself.
    """
    layer_specs = zip(conv_n_maps, conv_kernel_sizes, conv_strides,
                      conv_paddings, conv_activation)
    net = X_state
    with tf.variable_scope(name) as scope:
        for n_maps, kernel_size, strides, padding, activation in layer_specs:
            net = tf.layers.conv2d(
                net,
                filters=n_maps,
                kernel_size=kernel_size,
                strides=strides,
                padding=padding,
                activation=activation,
                kernel_initializer=initializer)
        # Flatten the last conv output so the dense layers can consume it.
        flattened = tf.reshape(net, shape=[-1, n_hidden_in])
        hidden = tf.layers.dense(
            flattened, n_hidden,
            activation=hidden_activation,
            kernel_initializer=initializer)
        outputs = tf.layers.dense(
            hidden, n_outputs,
            kernel_initializer=initializer)

    scope_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope=scope.name)
    # Strip the scope prefix so two networks built under different scopes
    # expose the same keys.
    prefix_length = len(scope.name)
    trainable_vars_by_name = {}
    for var in scope_vars:
        trainable_vars_by_name[var.name[prefix_length:]] = var
    return outputs, trainable_vars_by_name
# Input placeholder shared by both networks: a batch of stacked-frame states.
X_state = tf.placeholder(
    tf.float32,
    shape=[None, input_height, input_width, input_channels])

# Two networks with the same architecture: the online one is trained, the
# target one is only ever overwritten by the copy op below.
online_q_values, online_vars = q_network(X_state, name="q_networks/online")
target_q_values, target_vars = q_network(X_state, name="q_networks/target")

# One assign per target variable, matched to the online variable by its
# scope-relative name; running the group copies online -> target.
copy_ops = [
    target_var.assign(online_vars[var_name])
    for var_name, target_var in target_vars.items()
]
copy_online_to_target = tf.group(*copy_ops)

learning_rate = 0.001
momentum = 0.95

# NOTE(review): the source lost its indentation; everything through
# `training_op` is assumed to live inside the "train" scope — confirm.
with tf.variable_scope("train"):
    X_action = tf.placeholder(tf.int32, shape=[None])
    y = tf.placeholder(tf.float32, shape=[None, 1])
    # The one-hot mask keeps only the Q-value of the action actually taken.
    q_value = tf.reduce_sum(
        online_q_values * tf.one_hot(X_action, n_outputs),
        axis=1, keep_dims=True)
    # Mean squared error between the target `y` and the predicted Q-value.
    loss = tf.reduce_mean((y - q_value) ** 2)

    global_step = tf.Variable(0, trainable=False, name='global_step')
    optimizer = tf.train.MomentumOptimizer(
        learning_rate, momentum, use_nesterov=True)
    training_op = optimizer.minimize(loss, global_step=global_step)

# Bounded replay buffer of (state, action, reward, next_state, continue)
# tuples; the oldest entries are dropped once the buffer is full.
replay_memory_size = 500000
replay_memory = deque([], maxlen=replay_memory_size)
def sample_memories(batch_size, memory=None):
    """Draw a random batch of distinct transitions from the replay memory.

    Args:
        batch_size: Number of transitions requested. If the memory holds
            fewer entries, every entry is returned (in random order).
        memory: Indexable sequence of 5-tuples
            ``(state, action, reward, next_state, continue)``. Defaults to
            the module-level ``replay_memory``.

    Returns:
        A tuple ``(states, actions, rewards, next_states, continues)`` of
        NumPy arrays; ``rewards`` and ``continues`` are reshaped to column
        vectors of shape ``(batch, 1)``.
    """
    import random

    if memory is None:
        memory = replay_memory
    n_stored = len(memory)
    # random.sample over a range draws distinct indices without shuffling
    # every index of the buffer on each call.
    # NOTE: this draws from Python's `random`, not `np.random`, so seeding
    # only NumPy no longer fixes the sampled batches.
    indices = random.sample(range(n_stored), min(batch_size, n_stored))

    columns = [[], [], [], [], []]  # state, action, reward, next_state, continue
    for idx in indices:
        # If `memory` is a deque, indexing far from either end is O(n).
        for column, value in zip(columns, memory[idx]):
            column.append(value)
    states, actions, rewards, next_states, continues = (
        np.array(column) for column in columns)
    return (states, actions, rewards.reshape(-1, 1),
            next_states, continues.reshape(-1, 1))
# Exploration schedule: epsilon falls linearly from eps_max to eps_min over
# eps_decay_steps training steps and then stays at eps_min.
eps_min = 0.1
eps_max = 1.0
eps_decay_steps = 2000000


def epsilon_greedy(q_values, step):
    """Choose an action with an epsilon-greedy policy.

    Args:
        q_values: Array of Q-values; the greedy choice is its argmax.
        step: Current training step, used to decay epsilon.

    Returns:
        A uniformly random action index with probability epsilon,
        otherwise the index of the largest Q-value.
    """
    decayed = eps_max - (eps_max-eps_min) * step/eps_decay_steps
    epsilon = max(eps_min, decayed)
    exploring = np.random.rand() < epsilon
    if not exploring:
        return np.argmax(q_values)  # optimal action
    return np.random.randint(n_outputs)  # random action
# --- Training schedule -----------------------------------------------------
n_steps = 4000000  # total number of training steps
training_start = 10000  # start training after 10,000 game iterations
training_interval = 4  # run a training step every 4 game iterations
save_steps = 1000  # save the model every 1,000 training steps
copy_steps = 10000  # copy online DQN to target DQN every 10,000 training steps
discount_rate = 0.99
skip_start = 5  # Skip the start of every game (it's just waiting time).
batch_size = 64
iteration = 0  # game iterations
checkpoint_dir = './saved_networks'
checkpoint_path = "./saved_networks/dqn_breakout.cpkt"
summary_path = "./summary/"
done = True  # env needs to be reset

# --- Summary plumbing ------------------------------------------------------
# Per-episode statistics are computed in Python, pushed into these variables
# through the placeholders below, and then read by the scalar summaries.
# They are marked non-trainable, like `episode_step`, because they are
# bookkeeping only and must not appear among the trainable variables.
svar_reward = tf.Variable(tf.zeros([1], dtype=tf.int32), trainable=False)  # Episode reward
svar_mmq = tf.Variable(tf.zeros([1]), dtype=tf.float32, trainable=False)  # Episode Mean-Max-Q
svar_loss = tf.Variable(tf.zeros([1], dtype=tf.float64), trainable=False)
all_svars = [svar_reward, svar_mmq, svar_loss]
tf.summary.scalar("Episode Reward", tf.squeeze(svar_reward))
tf.summary.scalar("Episode Mean-Max-Q", tf.squeeze(svar_mmq))
tf.summary.scalar("Episode MSE", tf.squeeze(svar_loss))

# Placeholders
svar_reward_p, svar_mmq_p = tf.placeholder(tf.int32, [1]), tf.placeholder(tf.float32, [1])
svar_loss_p = tf.placeholder(tf.float64, [1])
svars_placeholders = [svar_reward_p, svar_mmq_p, svar_loss_p]

# Assign operation: one assign per (variable, placeholder) pair.
summary_assign_op = [
    svar.assign(placeholder)
    for svar, placeholder in zip(all_svars, svars_placeholders)
]
writer = tf.summary.FileWriter(summary_path)
summary_op = tf.summary.merge_all()

# For keeping track of no. of episodes played.
episode_step = tf.Variable(tf.zeros([1], dtype=tf.int64), trainable=False)
inc_episode_count = episode_step.assign_add([1])

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# --- Running statistics updated by the training loop ------------------------
loss_val = np.inf  # `np.infty` was removed in NumPy 2.0; `np.inf` works everywhere
game_length = 0
total_max_q = 0
mean_max_q = 0.0
ep_reward = 0
ep_loss = 0.
# Saver.save does not create missing parent directories, so make sure the
# checkpoint directory exists before the first save is attempted.
os.makedirs(checkpoint_dir, exist_ok=True)

try:
    with tf.Session() as sess:
        # Resume from an existing checkpoint, otherwise initialise all
        # variables and start with the target network equal to the online one.
        if os.path.isfile(checkpoint_path + ".index"):
            saver.restore(sess, checkpoint_path)
            print("<--------------------- Graph restored! -------------------------->")
        else:
            print("<--------- No checkpoints found! Starting over.. ---------------->")
            init.run()
            copy_online_to_target.run()

        while True:
            # `step` counts training steps; `iteration` counts game steps.
            step = global_step.eval()
            if step >= n_steps:
                break
            iteration += 1
            print("\rIteration {}\tTraining step {}/{} ({:.1f})%\tLoss {:5f}\tMean Max-Q {:5f} ".format(
                iteration, step, n_steps, step * 100 / n_steps, loss_val, mean_max_q), end="")

            if done:  # game over, start again
                obs = env.reset()
                # Clear observations from the past episode
                preprocessed_observations.clear()
                for skip in range(skip_start):  # skip the start of each game
                    obs, reward, done, info = env.step(0)  # Do nothing
                    # Appending on every skipped step refills the frame
                    # window before the first state is built.
                    preprocessed_observations.append(preprocess_observation(obs))
                state = combine_observations_multichannel(preprocessed_observations)

            # Online DQN evaluates what to do
            q_values = online_q_values.eval(feed_dict={X_state: [state]})
            action = epsilon_greedy(q_values, step)

            # Online DQN plays
            obs, reward, done, info = env.step(action)
            ep_reward += reward
            preprocessed_observations.append(preprocess_observation(obs))
            next_state = combine_observations_multichannel(preprocessed_observations)

            # Let's memorize what happened; the last field is 0.0 when the
            # episode ended, which zeroes the bootstrap term of the target.
            replay_memory.append((state, action, reward, next_state, 1.0 - done))
            state = next_state

            # Compute statistics for tracking progress
            total_max_q += q_values.max()
            game_length += 1
            if done:
                mean_max_q = total_max_q / game_length
                # Write summary -- start
                if iteration >= training_start:
                    sess.run(summary_assign_op, feed_dict={
                        svar_reward_p: [ep_reward],
                        svar_mmq_p: [mean_max_q],
                        svar_loss_p: [ep_loss],
                    })
                    summaries_str = sess.run(summary_op)
                    writer.add_summary(summaries_str, sess.run(episode_step))
                # NOTE(review): the source lost its indentation; the episode
                # counter is assumed to advance for every finished episode,
                # not only for those that wrote a summary — confirm.
                sess.run(inc_episode_count)
                # Write summary -- end
                total_max_q = 0.0
                game_length = ep_reward = ep_loss = 0

            if iteration < training_start or iteration % training_interval != 0:
                continue  # only train after warmup period and at regular intervals

            # Sample memories and use the target DQN to produce the target Q-Value
            X_state_val, X_action_val, rewards, X_next_state_val, continues = (
                sample_memories(batch_size))
            next_q_values = target_q_values.eval(
                feed_dict={X_state: X_next_state_val})
            max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
            y_val = rewards + continues * discount_rate * max_next_q_values

            # Train the online DQN
            _, loss_val = sess.run([training_op, loss], feed_dict={
                X_state: X_state_val, X_action: X_action_val, y: y_val})
            ep_loss += loss_val

            # `step` was read before the training op ran, so both checks
            # below also fire on the very first training step (step 0).
            # Regularly copy the online DQN to the target DQN
            if step % copy_steps == 0:
                copy_online_to_target.run()

            # And save regularly
            if step % save_steps == 0:
                saver.save(sess, checkpoint_path)
finally:
    # Flush pending summary events and release the emulator even when the
    # loop is interrupted or raises.
    writer.close()
    env.close()