-------------------------------------------------- UPDATE --------------------------------------------------

I have kept trying to improve the code by changing the parameters and the way the learning rule updates. I think the performance is a bit better now, but it is still much worse than expected. I would like to keep trying to solve it with Q-learning, since this article gives an example of the cart-pole problem being solved with Q-learning: https://medium.com/@tuzzer/cart-pole-balancing-with-q-learning-b54c6068d947
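For reference, this is my understanding of the standard one-step tabular Q-learning update I am trying to reproduce (a rough sketch of my own, not code taken from the article; the helper name q_update is just illustrative, Q is a dict from a discretized state tuple to a list of action values, and alpha and gamma are the learning rate and discount factor):

import numpy as np

def q_update(Q, state, action, reward, next_state, alpha, gamma):
    # one-step tabular Q-learning:
    # Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))
    best_next = np.max(Q[next_state])  # value of the greedy action in the next state
    Q[state][action] += alpha * (reward + gamma * best_next - Q[state][action])
    return Q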

My new code:

import gym
import numpy as np
import math

# Q-table: maps a discretized state tuple to a list of Q-values, one per action
statesAndQ = {}

# number of buckets per observation dimension
# (cart position and cart velocity are each collapsed into a single bucket)
bucketNumbers = [1,1,6,3]

env = gym.make('CartPole-v0')
env.reset()

bucketBounds = list(zip(env.observation_space.low, env.observation_space.high))
bucketBounds[1]=[-0.5,0.5]
bucketBounds[3] = [-math.radians(50), math.radians(50)]

# update the Q-value for the current state, then propagate the updated value
# back through the states recorded in tempStates
def learningRule(statesAndQ, reward, discountFactor, observationTuple, tempStates, i_episode):
    if len(tempStates)>2:
        #print(tempStates)
        maximumIndex = np.argmax(statesAndQ[tuple(tempStates[-2])[0]])
    else:
        maximumIndex = np.argmax(statesAndQ[tuple(tempStates[-2])[0]])
    best_q = np.amax(statesAndQ[observationTuple])
    if i_episode==0:
        statesAndQ[observationTuple][maximumIndex] += get_learning_rate(i_episode)*(reward + discountFactor*best_q - statesAndQ[observationTuple][maximumIndex])
    else:
        statesAndQ[observationTuple][maximumIndex] += get_learning_rate(i_episode-1)*(reward + discountFactor*best_q - statesAndQ[observationTuple][maximumIndex])

    maximum = statesAndQ[observationTuple][maximumIndex]

    i = len(tempStates)-1

    while i>0:
        statesAndQ[tempStates[i][0]][np.argmax(statesAndQ[tempStates[i][0]])]  += get_learning_rate(i_episode)*(maximum - statesAndQ[tempStates[i][0]][np.argmax(statesAndQ[tempStates[i][0]])])
        maximum = statesAndQ[tempStates[i][0]][np.argmax(statesAndQ[tempStates[i][0]])]
        i=i-1

    return statesAndQ

# discretize a continuous observation into a tuple of bucket indices
def makeBuckets(state):
    bucket_indice = []
    for i in range(len(state)):
        if state[i] <= bucketBounds[i][0]:
            bucket_index = 0
        elif state[i] >= bucketBounds[i][1]:
            bucket_index = bucketNumbers[i] - 1
        else:
            bound_width = bucketBounds[i][1] - bucketBounds[i][0]
            offset = (bucketNumbers[i]-1)*bucketBounds[i][0]/bound_width
            scaling = (bucketNumbers[i]-1)/bound_width
            bucket_index = int(round(scaling*state[i] - offset))
        bucket_indice.append(bucket_index)
    return tuple(bucket_indice)

# exploration rate schedule: decays with the episode number, drops to 0 once t exceeds 250
def get_explore_rate(t):
    #return max(0.01, min(1, 1.0 - math.log10((t+1)/15)))
    if t>250:
        return 0
    else:
        return max(min(1, 1.0 - math.log10((t+1)/25)), 0.01)


# learning rate schedule: decays logarithmically with the episode number, clipped to [0.1, 0.7]
def get_learning_rate(t):
    return max(0.1, min(0.7, 1.0 - math.log10((t+1)/30)))

for i_episode in range(1000):
    print(get_explore_rate(i_episode))
    print("learning")
    print(get_learning_rate(i_episode))
    observation = env.reset()
    tempStates = []
    length = 0

    for i in range(250):
        env.render()
        random = np.random.normal(0,1)

        observationTuple = makeBuckets(observation)
        tempStates.append([observationTuple])


        if not observationTuple in statesAndQ:
            statesAndQ[observationTuple] = [0,0]

        if random>get_explore_rate(i_episode):
            action = np.argmax(statesAndQ[observationTuple])
            observation, reward, done, info = env.step(action)
            observationTuple = makeBuckets(observation)
            tempStates.append([observationTuple])

            length+=1

            if not observationTuple in statesAndQ:
                statesAndQ[observationTuple] = [0,0]

            #print(tempStates)

            statesAndQ = learningRule(statesAndQ, reward, 0.9, observationTuple, tempStates, i_episode)
            length = 0
            tempStates = []

        else:
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            observationTuple = makeBuckets(observation)
            tempStates.append([observationTuple])

            if not observationTuple in statesAndQ:
                statesAndQ[observationTuple] = [0,0]

            statesAndQ = learningRule(statesAndQ, reward, 0.9, observationTuple, tempStates, i_episode)
            length = 0
            tempStates = []

        if done:
            print("Episode finished after {} timesteps".format(i+1))
            print(i_episode)
            break

print("rollouts finished")

-------------------------------------------------- ORIGINAL POST --------------------------------------------------

I am a beginner in reinforcement learning and have just started learning it. I am trying to implement Q-learning to solve the CartPole problem from the OpenAI Gym. I don't seem to be getting good results, and my program does not seem to improve its play. How can I work on improving it?

Code:

import gym
import numpy as np
import math

statesAndQ = {}

bucketNumbers = [1,1,6,3] 

env = gym.make('CartPole-v0')
env.reset()

bucketBounds = list(zip(env.observation_space.low, env.observation_space.high))
bucketBounds[1]=[-0.5,0.5]
bucketBounds[3] = [-math.radians(50), math.radians(50)]

def learningRule(statesAndQ, reward, discountFactor, observationTuple, tempStates, i_episode):
    maximum = np.argmax(statesAndQ[observationTuple])
    statesAndQ[observationTuple][maximum] += get_learning_rate(i_episode)*(reward)
    maximum = statesAndQ[observationTuple][maximum]

    i = len(tempStates) - 2

    while i>0:
        statesAndQ[tempStates[i][0]][tempStates[i][1]]  += get_learning_rate(i_episode)*(discountFactor*maximum)
        maximum = statesAndQ[tempStates[i][0]][tempStates[i][1]]
        i=i-1

    return statesAndQ

def makeBuckets(state):
    bucket_indice = []
    for i in range(len(state)):
        if state[i] <= bucketBounds[i][0]:
            bucket_index = 0
        elif state[i] >= bucketBounds[i][1]:
            bucket_index = bucketNumbers[i] - 1
        else:
            bound_width = bucketBounds[i][1] - bucketBounds[i][0]
            offset = (bucketNumbers[i]-1)*bucketBounds[i][0]/bound_width
            scaling = (bucketNumbers[i]-1)/bound_width
            bucket_index = int(round(scaling*state[i] - offset))
        bucket_indice.append(bucket_index)
    return tuple(bucket_indice)

def get_explore_rate(t):
    return max(0.01, min(1, 1.0 - math.log10((t+1)/25)))

def get_learning_rate(t):
    return max(0.1, min(0.5, 1.0 - math.log10((t+1)/25)))

for i_episode in range(1000):
    observation = env.reset()
    tempStates = []

    prevState = makeBuckets(observation)

    for i in range(200):
        env.render()
        random = np.random.normal(0,1)

        observationTuple = makeBuckets(observation)

        if not observationTuple in statesAndQ:
            statesAndQ[observationTuple] = [0,0]

        if random>get_explore_rate(i_episode):
            action = np.argmax(statesAndQ[observationTuple])
            observation, reward, done, info = env.step(action)
            observationTuple = makeBuckets(observation)

            tempStates.append([observationTuple, action])

            if not observationTuple in statesAndQ:
                statesAndQ[observationTuple] = [0,0]

            statesAndQ = learningRule(statesAndQ, reward, 0.9, observationTuple, tempStates, i_episode)

        else:
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            observationTuple = makeBuckets(observation)
            tempStates.append([observationTuple, action])
            if not observationTuple in statesAndQ:
                statesAndQ[observationTuple] = [0,0]

            statesAndQ = learningRule(statesAndQ, reward, 0.9, observationTuple, tempStates, i_episode)

        if done:
            print("Episode finished after {} timesteps".format(i+1))
            print(i_episode)
            break

print("rollouts finished")