1

这段代码本应使用 TF-Agents 库在 CartPole 环境中训练一个 DQN(Deep Q-Network)智能体,但智能体似乎没有被正确训练。我想借助 Driver 模块写出一个最小示例。

TF-Agents 库自带的示例我是可以正常运行的。

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.agents.dqn import dqn_agent
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.drivers import dynamic_step_driver
from tf_agents.metrics import tf_metrics
from tf_agents.eval import metric_utils

# Switch TensorFlow to its v2 behavior before any environment, network or
# agent object below is constructed.
tf.compat.v1.enable_v2_behavior()


# Hyperparameters for the whole script.

# Environment and training schedule.
env_name = 'CartPole-v0'
num_iterations = 20000

# Data collection: steps gathered per training iteration, and the number of
# steps gathered up front before training starts.
collect_steps_per_iteration, initial_steps = 1, 1000

# Replay buffer size and the size of each sampled training batch.
replay_buffer_capacity, batch_size = 100000, 64

# Optimizer step size and the fc_layer_params handed to the Q-network.
learning_rate = 1e-3
fc_layer_params = (50,)


# Load the environment twice (one copy for training, one for evaluation) and
# wrap each Python environment in a TFPyEnvironment.
py_train_env, py_eval_env = (suite_gym.load(env_name) for _ in range(2))
tf_train_env, tf_eval_env = (
    tf_py_environment.TFPyEnvironment(py_env)
    for py_env in (py_train_env, py_eval_env)
)


# Build the Q-network from the training environment's observation and action
# specs, then the DQN agent that owns it.
q_net = q_network.QNetwork(
    tf_train_env.observation_spec(),
    tf_train_env.action_spec(),
    fc_layer_params=fc_layer_params,
)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
tf_agent = dqn_agent.DqnAgent(
    tf_train_env.time_step_spec(),
    tf_train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
)
# The agent must be initialized before its policies are used or it is trained.
tf_agent.initialize()


# Replay buffer, policies and drivers.

# The buffer stores items matching the agent's collect_data_spec. Its
# batch_size is the environment's batch size (tf_train_env.batch_size), which
# is distinct from the training `batch_size` used when sampling.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    batch_size=tf_train_env.batch_size,
    max_length=replay_buffer_capacity)

# random_policy fills the buffer before training; collect_policy gathers data
# during training; eval_policy is the one scored in the evaluation step.
random_policy = random_tf_policy.RandomTFPolicy(tf_train_env.time_step_spec(), tf_train_env.action_spec())
collect_policy = tf_agent.collect_policy
eval_policy = tf_agent.policy

# NOTE: `observers` and `num_steps` are passed by keyword on purpose. Newer
# TF-Agents releases declare DynamicStepDriver(env, policy, observers=None,
# transformers=None, num_steps=1), so a 4th positional argument would be bound
# to `transformers` and the driver would silently run with num_steps=1.
# Keywords are correct for both the old and the new signature.
init_driver = dynamic_step_driver.DynamicStepDriver(
    tf_train_env,
    random_policy,
    observers=[replay_buffer.add_batch],
    num_steps=initial_steps)
collect_driver = dynamic_step_driver.DynamicStepDriver(
    tf_train_env,
    collect_policy,
    observers=[replay_buffer.add_batch],
    num_steps=collect_steps_per_iteration)


# Fill the replay buffer by running the initial (random-policy) driver once,
# then expose the buffer as a dataset that yields batches of `batch_size`
# samples, each spanning 2 steps.
init_driver.run()
ds = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2,
)
ds = ds.prefetch(3)
iterator = iter(ds)


# Train the agent: each iteration collects with the collect driver, samples
# one batch from the replay buffer iterator and performs one training call.
total_steps = num_iterations * collect_steps_per_iteration
print('Train Agent(global steps=' + str(total_steps) + '):')
tf_train_env.reset()
for i in range(num_iterations):
    collect_driver.run()
    experience, _ = next(iterator)
    train_loss = tf_agent.train(experience)

    # Evaluate only when the collected-step count is a multiple of 100.
    steps_so_far = (i + 1) * collect_steps_per_iteration
    if steps_so_far % 100 != 0:
        continue

    # A fresh metric is created for every evaluation and computed over
    # 5 episodes of the evaluation environment.
    metric = [tf_metrics.AverageReturnMetric()]
    result = metric_utils.eager_compute(metric, tf_eval_env, eval_policy, num_episodes=5)
    average_return = result['AverageReturn'].numpy()
    print('step = {0}: loss = {1}: AR = {2}'.format(steps_so_far, train_loss.loss, average_return))

代码可以运行,但训练结束后智能体仍然不会玩这个游戏。此外,我原本预期平均回报会随着训练逐步上升,实际上它却始终保持不变。

4

1 回答 1

0

:) 试试看。

# Replay buffer, policies and driver

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(...

# Alias for the buffer's add_batch method, so it can be listed as an observer.
replay_buffer_observer = replay_buffer.add_batch

from tf_agents.metrics import tf_metrics
# Metric objects that the driver call below appends to its observer list,
# alongside replay_buffer_observer.
train_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric(),
]

dynamic_step_driver.DynamicStepDriver(...observers=[ replay_buffer_observer ]+ train_metrics

于 2020-05-24T13:06:27.963 回答