I am implementing a SARSA reinforcement learning function, which selects actions following the same current policy whose Q-values it updates (on-policy).
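The update I am trying to implement is the standard tabular SARSA rule (written out here for reference):

Q(s, a) <- Q(s, a) + eta * (reward + gamma * Q(s', a') - Q(s, a))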
Running my implementation raises the following errors:
TypeError: only size-1 arrays can be converted to Python scalars
    q[s, a] = q[s, a] + eta * (reward + gamma * q[s_, a_] - q[s, a])
ValueError: setting an array element with a sequence.
I assume the problem is in these lines:
q = np.zeros((env.n_states, env.n_actions))
and
q[s, a] = q[s, a] + eta * (reward + gamma * q[s_, a_] - q[s, a])
s, a = s_, a_
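Testing in isolation, I can reproduce the same ValueError whenever the right-hand side of that assignment is an array instead of a scalar, which makes me suspect the np.linspace reassignment of eta. A minimal standalone repro (not my actual code):

import numpy as np

q = np.zeros((5, 4))           # Q-table with the same layout as above
eta = np.linspace(0.5, 0, 10)  # eta is now a whole array, not a scalar

# eta * (...) broadcasts to an array of 10 values, and assigning that
# array to the single cell q[0, 0] fails:
try:
    q[0, 0] = q[0, 0] + eta * (1.0 + 0.9 * q[1, 1] - q[0, 0])
except ValueError as e:
    print(e)  # setting an array element with a sequence.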
Here is the whole method:
def sarsa(env, max_episodes, eta, gamma, epsilon, seed=None):
    # environment, max number of episodes, initial learning rate, discount factor, exploration factor, seed
    random_state = np.random.RandomState(seed)
    eta = np.linspace(eta, 0, max_episodes)
    epsilon = np.linspace(epsilon, 0, max_episodes)
    q = np.zeros((env.n_states, env.n_actions))
    rewards = np.zeros(max_episodes)
    for i in range(max_episodes):
        print('starting game', i)
        observation = env.reset()
        s = observation
        rand = np.random.random()
        a = maxAction(q, s)
        done = False
        epRewards = 0
        while not done:
            observation_, reward, done = env.step(a)
            s_ = observation_
            rand = np.random.random()
            a_ = maxAction(q, s)
            epRewards += reward
            q[s, a] = q[s, a] + eta * (reward + gamma * q[s_, a_] - q[s, a])
            s, a = s_, a_
        epsilon -= 2 / max_episodes
        rewards[i] = epRewards
    policy = q.argmax(axis=1)
    value = q.max(axis=1)
    return policy, value
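If it helps, below is the variant I am experimenting with. It keeps the linear decay schedules but indexes them per episode, so the update only ever uses the scalars etas[i] and epsilons[i]. This is a sketch under my own assumptions: I replaced maxAction with a local epsilon-greedy helper (choose_action), I select a_ from s_ rather than s (which I believe was also a typo above), and I assume env.reset() returns a state index and env.step() returns (state, reward, done) as in my code.

import numpy as np

def sarsa_scalar(env, max_episodes, eta, gamma, epsilon, seed=None):
    # Same signature as above; etas/epsilons decay linearly across episodes,
    # but episode i only ever touches the scalars etas[i] / epsilons[i].
    random_state = np.random.RandomState(seed)
    etas = np.linspace(eta, 0, max_episodes)
    epsilons = np.linspace(epsilon, 0, max_episodes)
    q = np.zeros((env.n_states, env.n_actions))
    rewards = np.zeros(max_episodes)

    def choose_action(s, eps):
        # epsilon-greedy: explore with probability eps, otherwise act greedily
        if random_state.rand() < eps:
            return random_state.randint(env.n_actions)
        return int(np.argmax(q[s]))

    for i in range(max_episodes):
        s = env.reset()
        a = choose_action(s, epsilons[i])
        done = False
        while not done:
            s_, reward, done = env.step(a)
            a_ = choose_action(s_, epsilons[i])  # next action chosen from s_
            # etas[i] is a scalar, so the right-hand side stays a scalar
            q[s, a] += etas[i] * (reward + gamma * q[s_, a_] - q[s, a])
            s, a = s_, a_
            rewards[i] += reward
    policy = q.argmax(axis=1)
    value = q.max(axis=1)
    return policy, value

With this version the assignment target q[s, a] and the right-hand side are both scalars, so neither the TypeError nor the ValueError appears.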