I am getting a mismatch between the shape of my environment's observations and the shape the model expects in my reinforcement learning project.
I have been closely following the AWS SageMaker RL examples, specifically the cartpole example, but with a custom environment that I built myself. What I am struggling to understand is how to change my environment so that it works with the prebuilt Ray RLEstimator.
Here is the code for the environment:
from enum import Enum
import math
import gym
from gym import error, spaces, utils, wrappers
from gym.utils import seeding
from gym.envs.registration import register
from gym.spaces import Discrete, Box
import numpy as np
# from float_space import FloatSpace
def sigmoid_price_fun(x, maxcust, gamma):
    return maxcust / (1 + math.exp(gamma * max(0, x)))


class Actions(Enum):
    DECREASE_PRICE = 0
    INCREASE_PRICE = 1
    HOLD = 2


PRICE_ADJUSTMENT = {
    Actions.DECREASE_PRICE: -0.25,
    Actions.INCREASE_PRICE: 0.25,
    Actions.HOLD: 0
}
class ArrivalSim(gym.Env):
    """ Simple environment for price optimising RL learner. """

    def __init__(self, price):
        """
        Parameters
        ----------
        price : float
            The initial price to use.
        """
        super().__init__()
        self.price = price
        self.revenue = 0
        self.action_space = Discrete(3)  # [0, 1, 2] - decrease, increase or hold
        self.observation_space = Box(np.array(0.0), np.array(1000))
        # self.observation_space = FloatSpace(price)

    def step(self, action):
        """ Enacts the specified action in the environment.

        Returns the new price, reward, whether we're finished and an empty dict
        for compatibility with Gym's interface. """
        self._take_action(Actions(action))
        next_state = self.price
        # next_state = self.observation_space.sample()
        reward = self._get_reward()
        done = False
        if next_state < 0 or reward == 0:
            done = True
        print(next_state, reward, done, {})
        return np.array(next_state), reward, done, {}

    def reset(self):
        """ Resets the environment, selecting a random initial price. Returns the price. """
        # self.observation_space.value = np.random.rand()
        # return self.observation_space.sample()
        self.price = np.random.rand()
        return self.price

    def _take_action(self, action):
        # self.observation_space.value += PRICE_ADJUSTMENT[action]
        self.price += PRICE_ADJUSTMENT[action]

    def _get_reward(self):
        # price = self.observation_space.value
        # Expected demand falls off with price; revenue is a Poisson draw of
        # that demand multiplied by the price.
        self.revenue = max(np.random.poisson(sigmoid_price_fun(self.price, 50, 0.5)) * self.price, 0)
        return self.revenue

    # def render(self, mode='human'):
    #     super().render(mode)
def testEnv():
    register(
        id='ArrivalSim-v0',
        entry_point='env:ArrivalSim',
        kwargs={'price': 40}
    )
    env = gym.make('ArrivalSim-v0')
    env.reset()
    for _ in range(20):
        test = env.action_space.sample()
        print(test)
        print(env.observation_space)
        env.step(test)  # take a random action
    env.close()


if __name__ == '__main__':
    testEnv()
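To show what I mean by the shape mismatch, here is a small diagnostic sketch (separate from the training job, and assuming the ArrivalSim class above is importable) that compares the declared observation space with what reset() actually returns:

import numpy as np

env = ArrivalSim(price=40.0)
print(env.observation_space)          # the Box built from 0-d arrays above
print(env.observation_space.shape)    # I believe this comes out as (), i.e. 0-dimensional
obs = env.reset()
print(type(obs), np.shape(obs))       # reset() returns a plain float, so shape () as well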
Here is the training script:
import json
import os

import gym
import ray
from ray.tune import run_experiments
from ray.tune.registry import register_env
from gym.envs.registration import register

from sagemaker_rl.ray_launcher import SageMakerRayLauncher


def create_environment(env_config):
    import gym
    # from gym.spaces import Space
    from gym.envs.registration import register

    # This import must happen inside the method so that worker processes import this code
    register(
        id='ArrivalSim-v0',
        entry_point='env:ArrivalSim',
        kwargs={'price': 40}
    )
    return gym.make('ArrivalSim-v0')


class MyLauncher(SageMakerRayLauncher):

    def register_env_creator(self):
        register_env("ArrivalSim-v0", create_environment)

    def get_experiment_config(self):
        return {
            "training": {
                "env": "ArrivalSim-v0",
                "run": "PPO",
                "stop": {
                    "episode_reward_mean": 5000,
                },
                "config": {
                    "gamma": 0.995,
                    "kl_coeff": 1.0,
                    "num_sgd_iter": 10,
                    "lr": 0.0001,
                    "sgd_minibatch_size": 32768,
                    "train_batch_size": 320000,
                    "monitor": False,  # Record videos.
                    "model": {
                        "free_log_std": False
                    },
                    "use_gae": False,
                    "num_workers": (self.num_cpus - 1),
                    "num_gpus": self.num_gpus,
                    "batch_mode": "complete_episodes"
                }
            }
        }


if __name__ == "__main__":
    MyLauncher().train_main()
Here is the code I run in Jupyter:
metric_definitions = RLEstimator.default_metric_definitions(RLToolkit.RAY)

environment = env = {
    'SAGEMAKER_REQUIREMENTS': 'requirements.txt',  # path relative to `source_dir` below.
}

estimator = RLEstimator(entry_point="train.py",
                        source_dir='.',
                        toolkit=RLToolkit.RAY,
                        toolkit_version='0.6.5',
                        framework=RLFramework.TENSORFLOW,
                        dependencies=["sagemaker_rl"],
                        # image_name='price-response-ray-cpu',
                        role=role,
                        # train_instance_type="ml.c5.2xlarge",
                        train_instance_type='local',
                        train_instance_count=1,
                        # output_path=s3_output_path,
                        # base_job_name=job_name_prefix,
                        metric_definitions=metric_definitions
                        # hyperparameters={
                        #     Attention scientists! You can override any Ray algorithm parameter here:
                        #     "rl.training.config.horizon": 5000,
                        #     "rl.training.config.num_sgd_iter": 10,
                        # }
                        )

estimator.fit(wait=True)

job_name = estimator.latest_training_job.job_name
print("Training job: %s" % job_name)
The error message I have been receiving is the following:
algo-1-dxwxx_1 | == Status ==
algo-1-dxwxx_1 | Using FIFO scheduling algorithm.
algo-1-dxwxx_1 | Resources requested: 0/3 CPUs, 0/0 GPUs
algo-1-dxwxx_1 | Memory usage on this node: 1.1/4.1 GB
algo-1-dxwxx_1 |
algo-1-dxwxx_1 | == Status ==
algo-1-dxwxx_1 | Using FIFO scheduling algorithm.
algo-1-dxwxx_1 | Resources requested: 2/3 CPUs, 0/0 GPUs
algo-1-dxwxx_1 | Memory usage on this node: 1.4/4.1 GB
algo-1-dxwxx_1 | Result logdir: /opt/ml/output/intermediate/training
algo-1-dxwxx_1 | Number of trials: 1 ({'RUNNING': 1})
algo-1-dxwxx_1 | RUNNING trials:
algo-1-dxwxx_1 | - PPO_ArrivalSim-v0_0: RUNNING
algo-1-dxwxx_1 |
algo-1-dxwxx_1 | (pid=72) 2019-08-30 09:35:13,030 WARNING ppo.py:172 -- FYI: By default, the value function will not share layers with the policy model ('vf_share_layers': False).
algo-1-dxwxx_1 | 2019-08-30 09:35:13,063 ERROR trial_runner.py:460 -- Error processing event.
algo-1-dxwxx_1 | Traceback (most recent call last):
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/tune/trial_runner.py", line 409, in _process_trial
algo-1-dxwxx_1 | result = self.trial_executor.fetch_result(trial)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/tune/ray_trial_executor.py", line 314, in fetch_result
algo-1-dxwxx_1 | result = ray.get(trial_future[0])
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/worker.py", line 2316, in get
algo-1-dxwxx_1 | raise value
algo-1-dxwxx_1 | ray.exceptions.RayTaskError: ray_worker (pid=72, host=b9b15d495b68)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/models/model.py", line 83, in __init__
algo-1-dxwxx_1 | restored, num_outputs, options)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/models/model.py", line 135, in _build_layers_v2
algo-1-dxwxx_1 | raise NotImplementedError
algo-1-dxwxx_1 | NotImplementedError
algo-1-dxwxx_1 |
algo-1-dxwxx_1 | During handling of the above exception, another exception occurred:
algo-1-dxwxx_1 |
algo-1-dxwxx_1 | ray_worker (pid=72, host=b9b15d495b68)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/agents/agent.py", line 276, in __init__
algo-1-dxwxx_1 | Trainable.__init__(self, config, logger_creator)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/tune/trainable.py", line 88, in __init__
algo-1-dxwxx_1 | self._setup(copy.deepcopy(self.config))
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/agents/agent.py", line 373, in _setup
algo-1-dxwxx_1 | self._init()
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/agents/ppo/ppo.py", line 77, in _init
algo-1-dxwxx_1 | self.env_creator, self._policy_graph)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/agents/agent.py", line 506, in make_local_evaluator
algo-1-dxwxx_1 | extra_config or {}))
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/agents/agent.py", line 714, in _make_evaluator
algo-1-dxwxx_1 | async_remote_worker_envs=config["async_remote_worker_envs"])
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/evaluation/policy_evaluator.py", line 288, in __init__
algo-1-dxwxx_1 | self._build_policy_map(policy_dict, policy_config)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/evaluation/policy_evaluator.py", line 661, in _build_policy_map
algo-1-dxwxx_1 | policy_map[name] = cls(obs_space, act_space, merged_conf)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/agents/ppo/ppo_policy_graph.py", line 176, in __init__
algo-1-dxwxx_1 | seq_lens=existing_seq_lens)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/models/catalog.py", line 215, in get_model
algo-1-dxwxx_1 | seq_lens)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/models/catalog.py", line 255, in _get_model
algo-1-dxwxx_1 | num_outputs, options)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/models/model.py", line 86, in __init__
algo-1-dxwxx_1 | input_dict["obs"], num_outputs, options)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/models/fcnet.py", line 37, in _build_layers
algo-1-dxwxx_1 | scope=label)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 182, in func_with_args
algo-1-dxwxx_1 | return func(*args, **current_args)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1854, in fully_connected
algo-1-dxwxx_1 | outputs = layer.apply(inputs)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 817, in apply
algo-1-dxwxx_1 | return self.__call__(inputs, *args, **kwargs)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/layers/base.py", line 374, in __call__
algo-1-dxwxx_1 | outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 730, in __call__
algo-1-dxwxx_1 | self._assert_input_compatibility(inputs)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 1493, in _assert_input_compatibility
algo-1-dxwxx_1 | str(x.shape.as_list()))
algo-1-dxwxx_1 | ValueError: Input 0 of layer default/fc1 is incompatible with the layer: : expected min_ndim=2, found ndim=1. Full shape received: [None]
algo-1-dxwxx_1 |
algo-1-dxwxx_1 | 2019-08-30 09:35:13,064 INFO ray_trial_executor.py:178 -- Destroying actor for trial PPO_ArrivalSim-v0_0. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
algo-1-dxwxx_1 | 2019-08-30 09:35:13,076 INFO trial_runner.py:497 -- Attempting to recover trial state from last checkpoint.
algo-1-dxwxx_1 | (pid=72) 2019-08-30 09:35:13,041 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
I am not sure how to change either the input that the environment feeds to the model or the setup of the model itself, and the documentation seems quite obscure on this point. My hunch is that the problem lies with the observation and action spaces.
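If that hunch is right, my best guess (unverified) is that the observation needs to be a length-1 vector with a matching Box of shape (1,), roughly like the sketch below that wraps the environment above, but I am not sure this is the correct way to adapt it for RLlib's PPO:

import numpy as np
from gym.spaces import Box

class ArrivalSimVec(ArrivalSim):
    """Unverified guess: expose the scalar price as a shape-(1,) observation."""

    def __init__(self, price):
        super().__init__(price)
        # declare a 1-D space instead of the 0-d Box used above
        self.observation_space = Box(low=0.0, high=1000.0, shape=(1,), dtype=np.float32)

    def reset(self):
        # wrap the scalar returned by the original reset() in a length-1 vector
        return np.array([super().reset()], dtype=np.float32)

    def step(self, action):
        obs, reward, done, info = super().step(action)
        # the original step() returns a 0-d array; reshape it to (1,)
        return np.asarray(obs, dtype=np.float32).reshape(1), reward, done, info

Would something like this be what RLlib expects, or does the model configuration need to change instead?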
Here is the original AWS example project I have been referencing: https://github.com/awslabs/amazon-sagemaker-examples/tree/master/reinforcement_learning/rl_roboschool_ray