I have been trying to implement actor-critic for the CartPole env in Gym. I have already successfully implemented policy gradient for the same environment, where only the actor part is updated. But the actor-critic algorithm does not seem to work: the average reward is not improving.
I have tried tuning the network parameters many times to improve the loss, but nothing seems to help. My code is below.
Code for the actor and critic networks:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense

class Actor(Model):
    def __init__(self, output_shape):
        super(Actor, self).__init__()
        self.layer1 = Dense(32, activation='relu', name='layer1')
        self.layer2 = Dense(16, activation='relu', name='layer2')
        self.layer3 = Dense(16, activation='relu', name='layer3')
        # softmax over the action space
        self.layer4 = Dense(output_shape, activation='softmax', name='layer4')

    def call(self, input_data):
        input_data = tf.convert_to_tensor(input_data)
        # add a batch dimension: (state_dim,) -> (1, state_dim)
        input_data = tf.reshape(input_data, shape=(1, input_data.shape[0]))
        output = self.layer1(input_data)
        output = self.layer2(output)
        output = self.layer3(output)
        output = self.layer4(output)
        return output
class Critic(Model):
    def __init__(self):
        super(Critic, self).__init__()
        self.layer1 = Dense(16, activation='relu', name='layer1')
        self.layer2 = Dense(16, activation='relu', name='layer2')
        self.layer3 = Dense(16, activation='relu', name='layer3')
        # single scalar value estimate
        self.layer4 = Dense(1, activation='relu', name='layer4')

    def call(self, state):
        input_data = tf.convert_to_tensor(state)
        # add a batch dimension: (state_dim,) -> (1, state_dim)
        input_data = tf.reshape(input_data, shape=(1, input_data.shape[0]))
        output = self.layer1(input_data)
        output = self.layer2(output)
        output = self.layer3(output)
        output = self.layer4(output)
        return output
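For context, this is roughly how I build and call the two networks. This is a minimal sketch rather than my exact code; the CartPole-v0 id, the variable names, and the classic Gym API (env.reset() returning only the observation) are assumptions here:

import gym
import numpy as np

env = gym.make('CartPole-v0')          # assumed CartPole version
num_actions = env.action_space.n       # 2 for CartPole
actor = Actor(num_actions)             # softmax over the 2 actions
critic = Critic()                      # single state-value estimate

state = env.reset()                    # observation of shape (4,) with the classic Gym API
action_probs = actor(state)            # tensor of shape (1, 2), each row sums to 1
state_value = critic(state)            # tensor of shape (1, 1)
action = np.random.choice(num_actions, p=np.squeeze(action_probs.numpy()))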
I am updating the networks as follows:
def loss_function_policy(prob, action, q_value):
    # log-probability of the action that was actually taken
    selected_probs = tf.math.log(tf.reduce_sum(prob * tf.one_hot(action, num_actions), keepdims=True))
    # policy-gradient loss weighted by the critic's value estimate
    cost = -tf.reduce_sum(q_value * selected_probs)
    return cost

def update_policy(policy, Q, state, action):
    opt = tf.keras.optimizers.Adam(learning_rate=0.0001,
                                   beta_1=0.8,
                                   beta_2=0.999,
                                   epsilon=1e-05,
                                   amsgrad=True)
    step_loss = []
    with tf.GradientTape() as tape:
        prob = policy(state)
        q_value = Q(state)
        loss = loss_function_policy(prob, action, q_value)
        step_loss.append(loss)
    gradients = tape.gradient(loss, policy.trainable_variables)
    opt.apply_gradients(zip(gradients, policy.trainable_variables))
    step_policy_loss.append(np.sum(step_loss))
def update_Q(Q, td_error, state, action):  # td_error = reward + gamma*Q(s') - Q(s)
    # Optimizer
    opt = tf.keras.optimizers.Adam(learning_rate=0.0003,
                                   beta_1=0.9,
                                   beta_2=0.999,
                                   epsilon=1e-07,
                                   amsgrad=True)
    step_loss = []
    with tf.GradientTape() as tape:
        q_value = Q(state)
        # push Q(s) in the direction of the TD error (td_error is treated as a constant here)
        loss = -tf.reduce_sum(q_value * td_error)
        step_loss.append(loss)
    gradients = tape.gradient(loss, Q.trainable_variables)
    opt.apply_gradients(zip(gradients, Q.trainable_variables))
    step_q_loss.append(np.sum(step_loss))
The rest is the same standard RL code; a simplified sketch of the per-step loop I mean is below. I have also attached the whole code anyway. Please take a look and help me figure out where I am going wrong.
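This is not the exact attached code, just a minimal sketch of the loop under some assumptions: gamma, the episode count, and the classic four-value env.step() API are placeholders, and step_policy_loss / step_q_loss are the global lists the update functions append to.

gamma = 0.99                               # assumed discount factor
step_policy_loss, step_q_loss = [], []

for episode in range(1000):
    state = env.reset()                    # classic Gym API: reset() returns only the observation
    done = False
    episode_reward = 0
    while not done:
        # sample an action from the current policy
        probs = actor(state)
        action = np.random.choice(num_actions, p=np.squeeze(probs.numpy()))
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward

        # TD error: reward + gamma*Q(s') - Q(s), with Q(s') treated as 0 at episode end
        q_next = 0.0 if done else critic(next_state)
        td_error = reward + gamma * q_next - critic(state)

        # update the critic on the TD error, then the actor
        update_Q(critic, td_error, state, action)
        update_policy(actor, critic, state, action)

        state = next_state
    print(episode, episode_reward)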