This code is supposed to train a DQN (Deep Q-Networks) agent on the CartPole environment with the TF-Agents library, but the agent does not seem to train properly. I am trying to write a minimal example that uses the Driver module.
I am also able to run the examples that ship with the TF-Agents library.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.agents.dqn import dqn_agent
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.drivers import dynamic_step_driver
from tf_agents.metrics import tf_metrics
from tf_agents.eval import metric_utils
tf.compat.v1.enable_v2_behavior()
# parameters
env_name = 'CartPole-v0'
num_iterations = 20000
collect_steps_per_iteration = 1
initial_steps = 1000
replay_buffer_capacity = 100000
batch_size = 64
learning_rate = 0.001
fc_layer_params = (50, )
# load environments
py_train_env = suite_gym.load(env_name)
py_eval_env = suite_gym.load(env_name)
tf_train_env = tf_py_environment.TFPyEnvironment(py_train_env)
tf_eval_env = tf_py_environment.TFPyEnvironment(py_eval_env)
# create agent
q_net = q_network.QNetwork(tf_train_env.observation_spec(), tf_train_env.action_spec(), fc_layer_params=fc_layer_params)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
tf_agent = dqn_agent.DqnAgent(tf_train_env.time_step_spec(), tf_train_env.action_spec(), q_network=q_net,
                              optimizer=optimizer)
tf_agent.initialize()
# replay buffer, policies and drivers
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(data_spec=tf_agent.collect_data_spec,
                                                               batch_size=tf_train_env.batch_size,
                                                               max_length=replay_buffer_capacity)
random_policy = random_tf_policy.RandomTFPolicy(tf_train_env.time_step_spec(), tf_train_env.action_spec())
collect_policy = tf_agent.collect_policy
eval_policy = tf_agent.policy
init_driver = dynamic_step_driver.DynamicStepDriver(tf_train_env, random_policy, [replay_buffer.add_batch],
                                                    num_steps=initial_steps)
collect_driver = dynamic_step_driver.DynamicStepDriver(tf_train_env, collect_policy, [replay_buffer.add_batch],
                                                       num_steps=collect_steps_per_iteration)
# collect initial data
init_driver.run()
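# sample mini-batches of 2 adjacent time steps, as expected by the default one-step DQN update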
ds = replay_buffer.as_dataset(num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3)
iterator = iter(ds)
# train agent
print('Train Agent (global steps=' + str(num_iterations*collect_steps_per_iteration) + '):')
tf_train_env.reset()
for i in range(num_iterations):
    collect_driver.run()
    experience, _ = next(iterator)
    train_loss = tf_agent.train(experience)
    # evaluate every 100 steps
    if ((i+1) * collect_steps_per_iteration) % 100 == 0:
        metric = [tf_metrics.AverageReturnMetric()]
        result = metric_utils.eager_compute(metric, tf_eval_env, eval_policy, num_episodes=5)
        print('step = {0}: loss = {1}: AR = {2}'.format((i+1) * collect_steps_per_iteration, train_loss.loss,
                                                        result['AverageReturn'].numpy()))
The code runs, but the agent still cannot play the game after training. I also expected the average return to increase over time, but it stays constant.
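For reference, this is a minimal sketch of how I check the greedy policy's average return directly, independent of metric_utils (it reuses the tf_eval_env and eval_policy defined above; the helper compute_avg_return is my own, not part of TF-Agents):

def compute_avg_return(environment, policy, num_episodes=5):
    # roll out the policy and average the undiscounted episode returns
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    return total_return / num_episodes

avg_return = compute_avg_return(tf_eval_env, eval_policy, num_episodes=5)
print('greedy policy average return:', float(avg_return))

For CartPole-v0 I would expect a trained agent to approach an average return of 200 (the episode length cap), whereas a random policy stays at roughly 20; the values I see remain near the random level.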