I am currently trying to learn the concepts of reinforcement learning, so I attempted to implement the SARSA algorithm for the CartPole example using TensorFlow. I compared my algorithm against one that uses a linear approximation of the q-value function and found the two to be very similar. Unfortunately, my implementation seems to be wrong or inefficient, because its learning success is quite limited. Can anyone tell me whether I am doing something wrong, and if so, what it is? My implementation code is:
```
import tensorflow as tf
import numpy as np
import matplotlib.pylab as plt
import random
import gym

# define a neural network which returns two action-dependent q-values given a state
neural_net = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation='relu', input_shape=[4]),
    tf.keras.layers.Dense(2)
])

# return the neural network's q-value for a specific action
def q_value(state, action):
    return neural_net(tf.convert_to_tensor([state]))[0, action]

# act either randomly or choose the action which maximizes the q-value
def policy(state, epsilon):
    values = neural_net(tf.convert_to_tensor([state]))
    if np.random.rand() < epsilon:
        return random.choice([0, 1])
    else:
        return np.argmax(values)

# initialize gym environment
env = gym.make('CartPole-v0')

# hyperparameters
discount = 0.99
optimizer = tf.keras.optimizers.Adam()
episodes = 1000
epsilon = 0.30

# collect reward for each episode
rewards = []

for episode in range(episodes):
    # start trajectory for episode
    state = env.reset()
    # record rewards during episode
    sum_returns = 0
    # decrease random action after the first 100 episodes
    if episode == 100:
        epsilon = 0.10
    # SARSA update loop (the target uses the next action chosen by the current policy)
    while True:
        action = policy(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        next_action = policy(next_state, epsilon)
        sum_returns += 1
        if done:
            with tf.GradientTape() as tape:
                tape.watch(neural_net.trainable_variables)
                q_hat = q_value(state, action)
                y = reward
                loss = tf.square(y - q_hat)
            gradients = tape.gradient(loss, neural_net.trainable_variables)
            optimizer.apply_gradients(zip(gradients, neural_net.trainable_variables))
            break
        else:
            with tf.GradientTape() as tape:
                tape.watch(neural_net.trainable_variables)
                q_hat = q_value(state, action)
                y = reward + discount * q_value(next_state, next_action)
                loss = tf.square(y - q_hat)
            gradients = tape.gradient(loss, neural_net.trainable_variables)
            optimizer.apply_gradients(zip(gradients, neural_net.trainable_variables))
            state = next_state
    rewards.append(sum_returns)

# plot learning over time
plt.plot([episode for episode in range(episodes)], rewards)
plt.show()
```
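
For context, the linear q-value approximation I compared against follows this general pattern (a minimal sketch for illustration, not the exact code I used; the learning rate `alpha`, the episode count, and the zero-initialized weight matrix are assumptions):

```
import numpy as np
import gym

env = gym.make('CartPole-v0')
alpha = 0.01            # learning rate (assumed value)
discount = 0.99
epsilon = 0.10
weights = np.zeros((2, 4))   # one linear weight vector per action: q(s, a) = weights[a] . s

def q_linear(state, action):
    return weights[action] @ state

def policy_linear(state):
    # epsilon-greedy over the two linear q-values
    if np.random.rand() < epsilon:
        return np.random.randint(2)
    return int(np.argmax([q_linear(state, a) for a in (0, 1)]))

for episode in range(200):
    state = env.reset()
    action = policy_linear(state)
    done = False
    while not done:
        next_state, reward, done, _ = env.step(action)
        next_action = policy_linear(next_state)
        # SARSA TD target: reward, plus the discounted q-value of the next
        # state-action pair when the episode is not over
        target = reward if done else reward + discount * q_linear(next_state, next_action)
        td_error = target - q_linear(state, action)
        # the gradient of a linear q-value w.r.t. its weights is just the state vector
        weights[action] += alpha * td_error * state
        state, action = next_state, next_action
```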