我目前正在进入 tensorflow 并且刚刚开始掌握图形的概念。现在我尝试使用梯度下降(亚当优化器)来实现一个 NN 来解决购物车环境。我首先随机初始化我的权重,然后在训练期间采取随机行动(考虑现有权重)。在测试时,我总是以最大概率采取行动。然而,我总是得到一个徘徊在 10 左右的分数,方差在 0.8 左右。总是。它根本没有以显着的方式发生变化,使它看起来在每一步都采取纯粹的随机动作,根本没有学习任何东西。正如我所说,权重似乎永远不会正确更新。我需要在哪里以及如何做到这一点?
这是我的代码:
import tensorflow as tf
import numpy as np
from gym.envs.classic_control import CartPoleEnv
env = CartPoleEnv()
learning_rate = 10**(-3)
gamma = 0.9999
n_train_trials = 10**3
n_test_trials = 10**2
n_actions = env.action_space.n
n_obs = env.observation_space.high.__len__()
goal_steps = 200
should_render = False
print_per_episode = 100
state_holder = tf.placeholder(dtype=tf.float32, shape=(None, n_obs), name='symbolic_state')
actions_one_hot_holder = tf.placeholder(dtype=tf.float32, shape=(None, n_actions),
name='symbolic_actions_one_hot_holder')
discounted_rewards_holder = tf.placeholder(dtype=tf.float32, shape=None, name='symbolic_reward')
# initialize neurons list dynamically
def get_neurons_list():
i = n_obs
n_neurons_list = [i]
while i < (n_obs * n_actions) // (n_actions // 2):
i *= 2
n_neurons_list.append(i)
while i // 2 > n_actions:
i = i // 2
n_neurons_list.append(i)
n_neurons_list.append(n_actions)
# print(n_neurons_list)
return n_neurons_list
with tf.name_scope('nonlinear_policy'):
# create list of layers with sizes
n_neurons_list = get_neurons_list()
network = None
for i in range((len(n_neurons_list) - 1)):
theta = tf.Variable(tf.random_normal([n_neurons_list[i], n_neurons_list[i+1]]))
bias = tf.Variable(tf.random_normal([n_neurons_list[i+1]]))
if network is None:
network = tf.matmul(state_holder, theta) + bias
else:
network = tf.matmul(network, theta) + bias
if i < len(n_neurons_list) - 1:
network = tf.nn.relu(network)
action_probabilities = tf.nn.softmax(network)
testing_action_choice = tf.argmax(action_probabilities, dimension=1, name='testing_action_choice')
with tf.name_scope('loss'):
actually_chosen_probability = action_probabilities * actions_one_hot_holder
L_theta = -1 * (tf.reduce_sum(tf.log(actually_chosen_probability)) * tf.reduce_sum(discounted_rewards_holder))
with tf.name_scope('train'):
# We define the optimizer to use the ADAM optimizer, and ask it to minimize our loss
gd_opt = tf.train.AdamOptimizer(learning_rate).minimize(L_theta)
sess = tf.Session() # FOR NOW everything is symbolic, this object has to be called to compute each value of Q
# Start
sess.run(tf.global_variables_initializer())
observation = env.reset()
batch_rewards = []
states = []
action_one_hots = []
episode_rewards = []
episode_rewards_list = []
episode_steps_list = []
step = 0
episode_no = 0
while episode_no <= n_train_trials:
if should_render: env.render()
step += 1
action_probability_values = sess.run(action_probabilities,
feed_dict={state_holder: [observation]})
# Choose the action using the action probabilities output by the policy implemented in tensorflow.
action = np.random.choice(np.arange(n_actions), p=action_probability_values.ravel())
# Calculating the one-hot action array for use by tensorflow
action_arr = np.zeros(n_actions)
action_arr[action] = 1.
action_one_hots.append(action_arr)
# Record states
states.append(observation)
observation, reward, done, info = env.step(action)
# We don't want to go above 200 steps
if step >= goal_steps:
done = True
batch_rewards.append(reward)
episode_rewards.append(reward)
# If the episode is done, and it contained at least one step, do the gradient updates
if len(batch_rewards) > 0 and done:
# First calculate the discounted rewards for each step
batch_reward_length = len(batch_rewards)
discounted_batch_rewards = batch_rewards.copy()
for i in range(batch_reward_length):
discounted_batch_rewards[i] *= (gamma ** (batch_reward_length - i - 1))
# Next run the gradient descent step
# Note that each of action_one_hots, states, discounted_batch_rewards has the first dimension as the length
# of the current trajectory
gradients = sess.run(gd_opt, feed_dict={actions_one_hot_holder: action_one_hots, state_holder: states,
discounted_rewards_holder: discounted_batch_rewards})
action_one_hots = []
states = []
batch_rewards = []
if done:
# Done with episode. Reset stuff.
episode_no += 1
episode_rewards_list.append(np.sum(episode_rewards))
episode_steps_list.append(step)
episode_rewards = []
step = 0
observation = env.reset()
if episode_no % print_per_episode == 0:
print("Episode {}: Average steps in last {} episodes".format(episode_no, print_per_episode),
np.mean(episode_steps_list[(episode_no - print_per_episode):episode_no]), '+-',
np.std(episode_steps_list[(episode_no - print_per_episode):episode_no])
)
observation = env.reset()
episode_rewards_list = []
episode_rewards = []
episode_steps_list = []
step = 0
episode_no = 0
print("Testing")
while episode_no <= n_test_trials:
env.render()
step += 1
# For testing, we choose the action using an argmax.
test_action, = sess.run([testing_action_choice],
feed_dict={state_holder: [observation]})
observation, reward, done, info = env.step(test_action[0])
if step >= 200:
done = True
episode_rewards.append(reward)
if done:
episode_no += 1
episode_rewards_list.append(np.sum(episode_rewards))
episode_steps_list.append(step)
episode_rewards = []
step = 0
observation = env.reset()
if episode_no % print_per_episode == 0:
print("Episode {}: Average steps in last {} episodes".format(episode_no, print_per_episode),
np.mean(episode_steps_list[(episode_no - print_per_episode):episode_no]), '+-',
np.std(episode_steps_list[(episode_no - print_per_episode):episode_no])
)