
I am trying to implement my own Dueling DQN with TensorFlow 2, based on https://arxiv.org/pdf/1511.06581.pdf. I am training it on the Atlantis environment, but I cannot get good results (the mean reward per game keeps decreasing while the TD loss increases). Although I believe I followed the logic from the paper, I don't know whether the problem comes from my implementation of the network or from the chosen hyperparameters.
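For reference, the aggregation the network below is meant to reproduce is the mean-subtracted combination of the value and advantage streams described in the paper:

Q(s, a) = V(s) + ( A(s, a) - (1/|A|) * sum over a' of A(s, a') )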

Edit: using tf.keras.utils.plot_model gave me this model diagram (image not included here). The code is below.

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dense, RepeatVector, Subtract, Add
from tqdm import trange

class DQNAgent:
  def __init__(self, state_shape, n_actions, epsilon=0):
    # Convolutional torso shared by the value and advantage streams
    self.state_input = Input(shape=state_shape, name='State')
    self.x = Conv2D(16, (3, 3), strides=2, activation='relu')(self.state_input)
    self.x = Conv2D(32, (3, 3), strides=2, activation='relu')(self.x)
    self.x = Conv2D(64, (3, 3), strides=2, activation='relu')(self.x)
    self.x = Flatten()(self.x)
    self.x = Dense(256, activation='relu')(self.x)

    # Value stream: V(s), broadcast over n_actions with RepeatVector
    self.head_v = Dense(256, activation='relu')(self.x)
    self.head_v = Dense(1, activation='linear', name="Value")(self.head_v)
    self.head_v = RepeatVector(n_actions)(self.head_v)
    self.head_v = Flatten()(self.head_v)

    # Advantage stream: A(s, a)
    self.head_a = Dense(256, activation='relu')(self.x)
    self.head_a = Dense(n_actions, activation='linear', name='Activation')(self.head_a)

    # Mean advantage, broadcast back over n_actions
    self.m_head_a = RepeatVector(n_actions)(tf.keras.backend.mean(self.head_a, axis=1, keepdims=True))
    self.m_head_a = Flatten(name='meanActivation')(self.m_head_a)

    # A(s, a) - mean_a' A(s, a')
    self.head_a = Subtract()([self.head_a, self.m_head_a])

    # Q(s, a) = V(s) + (A(s, a) - mean_a' A(s, a'))
    self.head_q = Add(name="Q-value")([self.head_v, self.head_a])


    self.network = tf.keras.Model(inputs=[self.state_input], outputs=[self.head_q])
    self.weights = self.network.trainable_variables
    self.epsilon = epsilon
    self.optimizer = tf.keras.optimizers.Adam(1e-3)

  def get_qvalues(self, state_t):
    # Forward pass: Q-values for a batch of states
    return self.network(state_t)

  def train(self, exp_replay, batch_size=64):
    # Note: agent, target_network, gamma and n_actions refer to the globals defined below
    states, actions, rewards, next_states, is_done = exp_replay.sample(batch_size)
    is_not_done = 1 - is_done

    with tf.GradientTape() as t:
      # Q(s, a) of the actions actually taken
      current_qvalues = agent.get_qvalues(states)
      current_action_qvalues = tf.reduce_sum(tf.one_hot(actions, n_actions) * current_qvalues, axis=-1)
      # Bootstrapped target from the target network: r + gamma * max_a' Q_target(s', a')
      next_qvalues_target = target_network.get_qvalues(next_states)
      next_state_values_target = tf.reduce_max(next_qvalues_target, axis=-1)
      reference_qvalues = rewards + gamma * next_state_values_target * is_not_done
      # Mean squared TD error
      td_loss = (current_action_qvalues - reference_qvalues) ** 2
      td_loss = tf.math.reduce_mean(td_loss)

    var_list = agent.weights
    grads = t.gradient(td_loss, var_list)
    self.optimizer.apply_gradients(zip(grads, var_list))
    return td_loss


  def sample_actions(self, qvalues):
    # Epsilon-greedy: explore with probability epsilon, otherwise act greedily
    batch_size, n_actions = qvalues.shape
    random_actions = np.random.choice(n_actions, size=batch_size)
    best_actions = tf.math.argmax(qvalues, axis=-1)
    should_explore = np.random.choice([0, 1], batch_size, p=[1 - self.epsilon, self.epsilon])
    return np.where(should_explore, random_actions, best_actions)


def load_weights_into_target_network(agent, target_network):
  # Copy each trainable variable from the online network into the target network
  for t, e in zip(target_network.network.trainable_variables, agent.network.trainable_variables):
    t.assign(e)

env = make_env() # Applies a frame buffer to the "AtlantisDeterministic-v4" env
env.reset()
n_actions = env.action_space.n
state_dim = env.observation_space.shape

agent = DQNAgent(state_dim, n_actions, epsilon=0.5)    
target_network = DQNAgent(state_dim, n_actions)

exp_replay = ReplayBuffer(10**5) # Random experience replay buffer
play_and_record(agent, env, exp_replay, n_steps=10000) # Plays exactly n_steps and records each transition in the ReplayBuffer
gamma = 0.99

for i in trange(10**5):
  play_and_record(agent, env, exp_replay, 10)

  td_loss = agent.train(exp_replay, 64)

  # Periodically copy the online weights into the target network and decay epsilon
  if i % 500 == 0:
    load_weights_into_target_network(agent, target_network)
    agent.epsilon = max(agent.epsilon * 0.99, 0.01)
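For completeness, here is a minimal sketch of how the mean reward per game mentioned above could be measured with a greedy policy. This evaluate helper is not part of the original post; it assumes the pre-0.26 gym reset()/step() API and the same make_env wrapper:

def evaluate(env, agent, n_games=3):
  # Hypothetical helper: play a few greedy games and return the mean total reward
  game_rewards = []
  for _ in range(n_games):
    s = env.reset()
    total_reward = 0.0
    done = False
    while not done:
      qvalues = agent.get_qvalues(np.array([s], dtype=np.float32))
      action = int(tf.math.argmax(qvalues, axis=-1)[0])  # greedy action
      s, r, done, _ = env.step(action)
      total_reward += r
    game_rewards.append(total_reward)
  return np.mean(game_rewards)

print(evaluate(make_env(), agent, n_games=3))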

1 Answer


The problem came from the target network not being updated correctly; it was a basic programming error. With the suggested replay buffer modification it now works very well. Thanks for the help.
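As an illustration only (the exact fix is not shown in the answer), one common way to keep the target network in sync in Keras is to copy all of the online network's weights in a single call. The sketch below reuses the agent and target_network objects from the question:

def sync_target_network(agent, target_network):
  # Copy every weight (including any non-trainable ones) from the online
  # network into the target network in one call
  target_network.network.set_weights(agent.network.get_weights())

# Called at the same point as in the question's training loop:
# if i % 500 == 0:
#   sync_target_network(agent, target_network)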

Answered 2020-03-03T09:58:23.070