---------------------------------------- UPDATE ----------------------------------------
I've kept trying to improve the code by changing the parameters and the way the learning rule performs its updates. I think the performance is a little better now, but it is still much worse than expected. I'd like to keep trying to solve this with Q-learning, since this post gives an example of the CartPole problem solved with Q-learning: https://medium.com/@tuzzer/cart-pole-balancing-with-q-learning-b54c6068d947
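For reference, this is what I understand the standard tabular Q-learning update to look like. It is only a minimal sketch with made-up names (q_table, q_update, alpha, gamma), not my actual code:

import numpy as np
from collections import defaultdict

# Sketch of the standard tabular Q-learning update (names here are made up).
q_table = defaultdict(lambda: np.zeros(2))   # CartPole has 2 actions

def q_update(state, action, reward, next_state, alpha, gamma):
    # Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))
    best_next = np.max(q_table[next_state])
    q_table[state][action] += alpha * (reward + gamma * best_next - q_table[state][action])

# One update for a made-up transition between two discretized states.
q_update(state=(0, 0, 3, 1), action=1, reward=1.0, next_state=(0, 0, 3, 2), alpha=0.5, gamma=0.9)

I'm not sure my learningRule below matches this, since it is never told which action was actually taken.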
My new code:
import gym
import numpy as np
import math
statesAndQ = {}
bucketNumbers = [1,1,6,3]
env = gym.make('CartPole-v0')
env.reset()
bucketBounds = list(zip(env.observation_space.low, env.observation_space.high))
bucketBounds[1]=[-0.5,0.5]
bucketBounds[3] = [-math.radians(50), math.radians(50)]
def learningRule(statesAndQ, reward, discountFactor, observationTuple, tempStates, i_episode):
    if len(tempStates)>2:
        #print(tempStates)
        maximumIndex = np.argmax(statesAndQ[tuple(tempStates[-2])[0]])
    else:
        maximumIndex = np.argmax(statesAndQ[tuple(tempStates[-2])[0]])
    best_q = np.amax(statesAndQ[observationTuple])
    if i_episode==0:
        statesAndQ[observationTuple][maximumIndex] += get_learning_rate(i_episode)*(reward + discountFactor*best_q - statesAndQ[observationTuple][maximumIndex])
    else:
        statesAndQ[observationTuple][maximumIndex] += get_learning_rate(i_episode-1)*(reward + discountFactor*best_q - statesAndQ[observationTuple][maximumIndex])
    maximum = statesAndQ[observationTuple][maximumIndex]
    i = len(tempStates)-1
    while i>0:
        statesAndQ[tempStates[i][0]][np.argmax(statesAndQ[tempStates[i][0]])] += get_learning_rate(i_episode)*(maximum - statesAndQ[tempStates[i][0]][np.argmax(statesAndQ[tempStates[i][0]])])
        maximum = statesAndQ[tempStates[i][0]][np.argmax(statesAndQ[tempStates[i][0]])]
        i=i-1
    return statesAndQ
def makeBuckets(state):
    bucket_indice = []
    for i in range(len(state)):
        if state[i] <= bucketBounds[i][0]:
            bucket_index = 0
        elif state[i] >= bucketBounds[i][1]:
            bucket_index = bucketNumbers[i] - 1
        else:
            bound_width = bucketBounds[i][1] - bucketBounds[i][0]
            offset = (bucketNumbers[i]-1)*bucketBounds[i][0]/bound_width
            scaling = (bucketNumbers[i]-1)/bound_width
            bucket_index = int(round(scaling*state[i] - offset))
        bucket_indice.append(bucket_index)
    return tuple(bucket_indice)
def get_explore_rate(t):
    #return max(0.01, min(1, 1.0 - math.log10((t+1)/15)))
    if t>250:
        return 0
    else:
        return max(min(1, 1.0 - math.log10((t+1)/25)), 0.01)
def get_learning_rate(t):
    return max(0.1, min(0.7, 1.0 - math.log10((t+1)/30)))
for i_episode in range(1000):
    print(get_explore_rate(i_episode))
    print("learning")
    print(get_learning_rate(i_episode))
    observation = env.reset()
    tempStates = []
    length = 0
    for i in range(250):
        env.render()
        random = np.random.normal(0,1)
        observationTuple = makeBuckets(observation)
        tempStates.append([observationTuple])
        if not observationTuple in statesAndQ:
            statesAndQ[observationTuple] = [0,0]
        if random>get_explore_rate(i_episode):
            action = np.argmax(statesAndQ[observationTuple])
            observation, reward, done, info = env.step(action)
            observationTuple = makeBuckets(observation)
            tempStates.append([observationTuple])
            length+=1
            if not observationTuple in statesAndQ:
                statesAndQ[observationTuple] = [0,0]
            #print(tempStates)
            statesAndQ = learningRule(statesAndQ, reward, 0.9, observationTuple, tempStates, i_episode)
            length = 0
            tempStates = []
        else:
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            observationTuple = makeBuckets(observation)
            tempStates.append([observationTuple])
            if not observationTuple in statesAndQ:
                statesAndQ[observationTuple] = [0,0]
            statesAndQ = learningRule(statesAndQ, reward, 0.9, observationTuple, tempStates, i_episode)
            length = 0
            tempStates = []
        if done:
            print("Episode finished after {} timesteps".format(i+1))
            print(i_episode)
            break
print("rollouts finished")
---------------------------------------- ORIGINAL POST ----------------------------------------
I'm a beginner in reinforcement learning and have only just started studying it. I'm trying to implement Q-learning to solve the CartPole problem from OpenAI Gym. I don't seem to be getting good results, and my program doesn't appear to improve its play over time. How can I go about improving it?
Code:
import gym
import numpy as np
import math
statesAndQ = {}
bucketNumbers = [1,1,6,3]
env = gym.make('CartPole-v0')
env.reset()
bucketBounds = list(zip(env.observation_space.low, env.observation_space.high))
bucketBounds[1]=[-0.5,0.5]
bucketBounds[3] = [-math.radians(50), math.radians(50)]
def learningRule(statesAndQ, reward, discountFactor, observationTuple, tempStates, i_episode):
    maximum = np.argmax(statesAndQ[observationTuple])
    statesAndQ[observationTuple][maximum] += get_learning_rate(i_episode)*(reward)
    maximum = statesAndQ[observationTuple][maximum]
    i = len(tempStates) - 2
    while i>0:
        statesAndQ[tempStates[i][0]][tempStates[i][1]] += get_learning_rate(i_episode)*(discountFactor*maximum)
        maximum = statesAndQ[tempStates[i][0]][tempStates[i][1]]
        i=i-1
    return statesAndQ
def makeBuckets(state):
    bucket_indice = []
    for i in range(len(state)):
        if state[i] <= bucketBounds[i][0]:
            bucket_index = 0
        elif state[i] >= bucketBounds[i][1]:
            bucket_index = bucketNumbers[i] - 1
        else:
            bound_width = bucketBounds[i][1] - bucketBounds[i][0]
            offset = (bucketNumbers[i]-1)*bucketBounds[i][0]/bound_width
            scaling = (bucketNumbers[i]-1)/bound_width
            bucket_index = int(round(scaling*state[i] - offset))
        bucket_indice.append(bucket_index)
    return tuple(bucket_indice)
def get_explore_rate(t):
    return max(0.01, min(1, 1.0 - math.log10((t+1)/25)))
def get_learning_rate(t):
    return max(0.1, min(0.5, 1.0 - math.log10((t+1)/25)))
for i_episode in range(1000):
    observation = env.reset()
    tempStates = []
    prevState = makeBuckets(observation)
    for i in range(200):
        env.render()
        random = np.random.normal(0,1)
        observationTuple = makeBuckets(observation)
        if not observationTuple in statesAndQ:
            statesAndQ[observationTuple] = [0,0]
        if random>get_explore_rate(i_episode):
            action = np.argmax(statesAndQ[observationTuple])
            observation, reward, done, info = env.step(action)
            observationTuple = makeBuckets(observation)
            tempStates.append([observationTuple, action])
            if not observationTuple in statesAndQ:
                statesAndQ[observationTuple] = [0,0]
            statesAndQ = learningRule(statesAndQ, reward, 0.9, observationTuple, tempStates, i_episode)
        else:
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            observationTuple = makeBuckets(observation)
            tempStates.append([observationTuple, action])
            if not observationTuple in statesAndQ:
                statesAndQ[observationTuple] = [0,0]
            statesAndQ = learningRule(statesAndQ, reward, 0.9, observationTuple, tempStates, i_episode)
        if done:
            print("Episode finished after {} timesteps".format(i+1))
            print(i_episode)
            break
print("rollouts finished")