在 DDPG 中使用离线经验数据时出错。自定义环境的维度(动作空间和状态空间)似乎与 RLlib 的 DDPG Trainer 所期望的维度不一致。
环境:Ubuntu,Ray 0.7(当前最新版本),DDPG 示例,离线数据集。离线数据集是用样本批次构建器(sample batch builder)生成的。
使用这份经验数据训练 DQN 时可以正常运行。但将环境的动作空间改为连续空间 (Box(,1)) 之后,DDPG 无法运行。
from ray.tune.registry import register_env
# Training batch size; used as the "train_batch_size" entry of the trainer config below.
TRAIN_BATCH_SIZE = 512
class mmt_ctns_offline_logs(gym.Env):
    """Custom environment declaring the spaces for the offline-log experiment.

    Only the action and observation spaces are defined in this snippet; no
    ``step``/``reset`` logic is shown here.
    """

    def __init__(self):
        # One-dimensional continuous action space, values range 0 to 50 max.
        # The shape has to be the 1-tuple ``(1,)``; ``(,1)`` is not valid Python.
        self.action_space = Box(0, 50, shape=(1,), dtype=np.float32)
        # 58 columns in the state space, each bounded to [-100000, 100000].
        self.observation_space = Box(
            -100000, 100000, shape=(58,), dtype=np.float32
        )
        # NOTE(review): the reported feed error (batch of shape (512,) vs. an
        # action tensor of shape (?, 1)) suggests the logged offline actions
        # are scalars rather than length-1 arrays -- confirm against the
        # dataset and reshape the logged actions if so.
def _make_mmt_env(env_config):
    """Environment creator: return a new ``mmt_ctns_offline_logs``; the config argument is unused."""
    return mmt_ctns_offline_logs()


# Register the custom environment under the name referenced by the trainer config.
register_env("mmt_env_ctnaction", _make_mmt_env)
# Trainer configuration overrides. Some of these entries just restate the
# defaults; they are spelled out explicitly for clarity within the team.
config_dict = {
    # Environment and batch sizes.
    "env": "mmt_env_ctnaction",
    "evaluation_num_episodes": 50,
    "num_workers": 11,
    "sample_batch_size": 512,
    "train_batch_size": TRAIN_BATCH_SIZE,
    # Offline experience input and output locations (placeholders).
    "input": "<experience_replay_folder>/",
    "output": "<any_folder>",
    "gamma": 0.99,
    "horizon": None,
    # Optimizer selection.
    "optimizer_class": "SyncReplayOptimizer",
    "optimizer": {"prioritized_replay": True},
    # Actor / critic network layout.
    "actor_hiddens": [128, 64],
    "actor_hidden_activation": "relu",
    "critic_hiddens": [64, 64],
    "critic_hidden_activation": "relu",
    "n_step": 1,
    "target_network_update_freq": 500,
    # Empty list: no input evaluation methods are requested.
    "input_evaluation": [],
    "ignore_worker_failures": True,
    "log_level": "DEBUG",
    # Replay buffer settings.
    "buffer_size": 50000,
    "prioritized_replay": True,
    "prioritized_replay_alpha": 0.6,
    "prioritized_replay_beta": 0.4,
    "prioritized_replay_eps": 1e-6,
    "compress_observations": False,
    # Optimization settings.
    "lr": 1e-3,
    "actor_loss_coeff": 0.1,
    "critic_loss_coeff": 1.0,
    "use_huber": False,
    "huber_threshold": 1.0,
    "l2_reg": 1e-6,
    # NOTE(review): a boolean is passed here; this option presumably expects
    # a numeric clip value or None -- confirm against the DDPG config docs.
    "grad_norm_clipping": True,
    "learning_starts": 1500,
}

# Start from a copy of the DDPG defaults (a DQN run would start from
# dqn.DEFAULT_CONFIG instead) and overlay every override from config_dict.
config = ddpg.DEFAULT_CONFIG.copy()
config.update(config_dict)
config_ddpg = config
# Bare expression with no effect in a script; presumably left over from
# displaying the merged config in an interactive session.
config_ddpg
# Specification of the single DDPG experiment: which trainer to run, on which
# registered environment, with which config, and where results are written.
_ddpg_offline_experiment = {
    'run': 'DDPG',
    'env': 'mmt_env_ctnaction',
    'config': config_ddpg,
    'local_dir': "/oxygen/narasimham/ray/tmp/mmt/mmt_user_27_DDPG/",
}

# Launch the experiment under its name.
run_experiments(
    {'NM_testing_DDPG_offpolicy_noIS': _ddpg_offline_experiment}
)
预期结果:DDPG 能够正常完成训练迭代。
实际结果:报错,错误信息如下:
ray.exceptions.RayTaskError: ray_DDPGTrainer:train() (pid=89635, host=ip-10-114-53-179)
File "/home/ubuntu/anaconda3/envs/tf_p36n/lib/python3.6/site-packages/ray/rllib/utils/tf_run_builder.py", line 49, in get
self.feed_dict, os.environ.get("TF_TIMELINE_DIR"))
File "/home/ubuntu/anaconda3/envs/tf_p36n/lib/python3.6/site-packages/ray/rllib/utils/tf_run_builder.py", line 91, in run_timeline
fetches = sess.run(ops, feed_dict=feed_dict)
File "/home/ubuntu/anaconda3/envs/tf_p36n/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 877, in run
run_metadata_ptr)
File "/home/ubuntu/anaconda3/envs/tf_p36n/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1076, in _run
str(subfeed_t.get_shape())))
ValueError: Cannot feed value of shape (512,) for Tensor 'default_policy/action:0', which has shape '(?, 1)'
During handling of the above exception, another exception occurred: