I am trying to implement my own Dueling DQN in TensorFlow 2, based on https://arxiv.org/pdf/1511.06581.pdf. I am training it on the Atlantis environment, but I cannot get good results (the mean reward per game keeps decreasing while the TD loss increases). I believe I got the logic right from the paper, but I cannot tell whether the problem comes from my implementation of the network or from the chosen parameters.
Edit: here is what tf.keras.utils.plot_model gives me: [model graph image]
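For reference, the aggregation the value and advantage heads are meant to implement is the mean-subtracted combination from the paper:

$$
Q(s, a) \;=\; V(s) \;+\; \Big( A(s, a) \;-\; \frac{1}{|\mathcal{A}|} \sum_{a'} A(s, a') \Big)
$$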
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (Input, Conv2D, Flatten, Dense,
                                     RepeatVector, Subtract, Add)
from tqdm import trange

# make_env, ReplayBuffer and play_and_record are my own helpers (not shown here).

class DQNAgent:
    def __init__(self, state_shape, n_actions, epsilon=0):
        # Shared convolutional torso
        self.state_input = Input(shape=state_shape, name='State')
        self.x = Conv2D(16, (3, 3), strides=2, activation='relu')(self.state_input)
        self.x = Conv2D(32, (3, 3), strides=2, activation='relu')(self.x)
        self.x = Conv2D(64, (3, 3), strides=2, activation='relu')(self.x)
        self.x = Flatten()(self.x)
        self.x = Dense(256, activation='relu')(self.x)

        # Value stream: a single scalar V(s), tiled to n_actions so it can be added per action
        self.head_v = Dense(256, activation='relu')(self.x)
        self.head_v = Dense(1, activation='linear', name="Value")(self.head_v)
        self.head_v = RepeatVector(n_actions)(self.head_v)
        self.head_v = Flatten()(self.head_v)

        # Advantage stream: A(s, a), centered by subtracting its mean over actions
        self.head_a = Dense(256, activation='relu')(self.x)
        self.head_a = Dense(n_actions, activation='linear', name='Activation')(self.head_a)
        self.m_head_a = RepeatVector(n_actions)(
            tf.keras.backend.mean(self.head_a, axis=1, keepdims=True))
        self.m_head_a = Flatten(name='meanActivation')(self.m_head_a)
        self.head_a = Subtract()([self.head_a, self.m_head_a])

        # Q(s, a) = V(s) + (A(s, a) - mean_a' A(s, a'))
        self.head_q = Add(name="Q-value")([self.head_v, self.head_a])

        self.network = tf.keras.Model(inputs=[self.state_input], outputs=[self.head_q])
        self.weights = self.network.trainable_variables
        self.epsilon = epsilon
        self.optimizer = tf.keras.optimizers.Adam(1e-3)

    def get_qvalues(self, state_t):
        return self.network(state_t)

    def train(self, exp_replay, batch_size=64):
        # target_network, n_actions and gamma are module-level globals defined below
        states, actions, rewards, next_states, is_done = exp_replay.sample(batch_size)
        is_not_done = 1 - is_done
        with tf.GradientTape() as t:
            current_qvalues = self.get_qvalues(states)
            current_action_qvalues = tf.reduce_sum(
                tf.one_hot(actions, n_actions) * current_qvalues, axis=-1)
            next_qvalues_target = target_network.get_qvalues(next_states)
            next_state_values_target = tf.reduce_max(next_qvalues_target, axis=-1)
            reference_qvalues = rewards + gamma * next_state_values_target * is_not_done
            td_loss = (current_action_qvalues - reference_qvalues) ** 2
            td_loss = tf.math.reduce_mean(td_loss)
        var_list = self.weights
        grads = t.gradient(td_loss, var_list)
        self.optimizer.apply_gradients(zip(grads, var_list))
        return td_loss

    def sample_actions(self, qvalues):
        # Epsilon-greedy: random action with probability epsilon, greedy action otherwise
        batch_size, n_actions = qvalues.shape
        random_actions = np.random.choice(n_actions, size=batch_size)
        best_actions = tf.math.argmax(qvalues, axis=-1)
        should_explore = np.random.choice([0, 1], batch_size, p=[1 - self.epsilon, self.epsilon])
        return np.where(should_explore, random_actions, best_actions)


def load_weights_into_target_network(agent, target_network):
    # Hard update: copy every trainable variable of the online network into the target network
    for t, e in zip(target_network.network.trainable_variables, agent.network.trainable_variables):
        t.assign(e)
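As a quick check that the dueling head outputs one Q-value per action, here is a minimal sketch; the (84, 84, 4) observation shape and the check_agent name are hypothetical, since make_env is not shown:

# Hypothetical shape check (84x84x4 frame stack assumed; the real shape comes from make_env)
check_agent = DQNAgent(state_shape=(84, 84, 4), n_actions=4)
dummy_states = np.zeros((2, 84, 84, 4), dtype=np.float32)
qvalues = check_agent.get_qvalues(dummy_states)
print(qvalues.shape)                        # expected: (2, 4), i.e. one Q-value per action
print(check_agent.sample_actions(qvalues))  # greedy actions, since epsilon defaults to 0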
env = make_env()  # Applies a frame buffer on the "AtlantisDeterministic-v4" env
env.reset()
n_actions = env.action_space.n
state_dim = env.observation_space.shape

agent = DQNAgent(state_dim, n_actions, epsilon=0.5)
target_network = DQNAgent(state_dim, n_actions)

exp_replay = ReplayBuffer(10**5)  # Random experience replay buffer
play_and_record(agent, env, exp_replay, n_steps=10000)  # Plays exactly n_steps and records each transition in the ReplayBuffer

gamma = 0.99

for i in trange(10**5):
    play_and_record(agent, env, exp_replay, 10)
    td_loss = agent.train(exp_replay, 64)
    # adjust agent parameters
    if i % 500 == 0:
        load_weights_into_target_network(agent, target_network)
        agent.epsilon = max(agent.epsilon * 0.99, 0.01)