
I am completely new to reinforcement learning, and this is my first hands-on program. I am trying to train a bipedal walker in an OpenAI Gym environment using a policy gradient algorithm.

However, the reward never changes, whether it is episode 0 or episode 1000, and I can't figure out what is going wrong. Can anyone help me?

TIA。

Here is the code:

import torch
import torch.nn as nn
import numpy
from torch.autograd import Variable
import gym

def train(model, name):
    model.train()
    env = gym.make(name)
    env.reset()
    num_episodes = 20000
    max_steps = 10000
    optim = torch.optim.Adam(model.parameters(), lr=1)
    numsteps = []
    rewards = []
    avg_numsteps = []

    min_reward = -1000

    for episode in range(num_episodes):
        state = env.reset()
        probs = []
        rewards = []

        for steps in range(max_steps):
            action, log_prob = model.action(state)
            env.render()
            state, reward, finished, _ = env.step(action.squeeze(0).detach().numpy())
            env.render()
            probs.append(log_prob)
            rewards.append(reward)
            if finished:
                break

        if finished:
            Rewards = []
            for i in range(len(rewards)):
                G = 0
                p = 0
                for reward in rewards[i:]:
                    G = G + 0.9 * p * reward
                    p = p + 1
                Rewards.append(G)

            Rewards = torch.tensor(Rewards)

            discounted_reward = (Rewards - Rewards.mean()) / (Rewards.std() + 1e-9)

            gradients = []
            for log_prob, G in zip(log_prob, discounted_reward):
                gradients.append(-log_prob * G)

            optim.zero_grad()
            policy_gradient = Variable(torch.stack(gradients).sum(), requires_grad=True)
            policy_gradient.backward()
            optim.step()
            numsteps.append(steps)
            avg_numsteps.append(numpy.mean(numsteps[-10:]))
            rewards.append(numpy.sum(rewards))

            print("episode: {}, total reward: {}, average_reward: {}, length: {}\n".format(episode,
                                                                                           numpy.sum(rewards),
                                                                                           numpy.round(numpy.mean(
                                                                                               rewards[-10:]),
                                                                                                       decimals=3),
                                                                                           steps))

        if numpy.sum(rewards) > min_reward:
            torch.save(model.state_dict(), '/home/atharva/policyNet.pth')
            min_reward = numpy.sum(rewards)


def test(model, name):
    env = gym.make(name)
    model.eval()
    state = env.reset()
    with torch.no_grad():
        while True:
            action, log_prob = model(state)
            state, reward, finished, _ = env.step(action.squeeze(0).numpy())
            env.render()
            if finished:
                break


class PolicyNet(nn.Module):
    def __init__(self, inputs, actions, hidden_size):
        super(PolicyNet, self).__init__()
        self.num_actions = actions
        self.layer1 = nn.Linear(inputs, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, 2*hidden_size)
        self.layer4 = nn.Linear(2*hidden_size, hidden_size)
        self.layer5 = nn.Linear(hidden_size, actions)

    def forward(self, x):
        x = self.layer1(x)
        x = nn.functional.relu(x)
        x = self.layer2(x)
        x = nn.functional.relu(x)
        x = self.layer3(x)
        x = nn.functional.relu(x)
        x = self.layer4(x)
        x = nn.functional.relu(x)
        x = self.layer5(x)
        return x

    def action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        actions = self.forward(Variable(state))
        #prob = numpy.random.choice(self.num_actions, p=numpy.squeeze(actions.detach().numpy()))
        log_prob = torch.log(actions.squeeze(0))
        return actions, log_prob

It is called like this:

from REINFORCE import PolicyNet, train

model = PolicyNet(24, 4, 256)
train(model, 'BipedalWalker-v3')

1 Answer


Your learning rate is too high! A high learning rate can cause your model to never converge (in fact, the loss may diverge). A learning rate that is too low makes the training process unnecessarily long. You have to find a trade-off for yourself.
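To get a feel for how destructive lr=1 is: Adam's bias-corrected first update moves each parameter by roughly lr times the sign of its gradient, so with lr=1 your freshly initialized weights jump by about 1.0 on the very first optimizer step. Here is a tiny self-contained check (just a throwaway linear layer, nothing from your code):

import torch
import torch.nn as nn

# Adam's first step is about lr * sign(grad) (the bias correction cancels the
# moment estimates), so with lr=1 every weight moves by roughly 1.0 regardless
# of how small its gradient is.
torch.manual_seed(0)
layer = nn.Linear(4, 4)
before = layer.weight.detach().clone()

optim = torch.optim.Adam(layer.parameters(), lr=1)
loss = layer(torch.randn(8, 4)).pow(2).mean()
loss.backward()
optim.step()

print((layer.weight.detach() - before).abs().mean())  # prints roughly 1.0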

Tuning the learning rate can have a huge impact on your model's performance. I would suggest you spend some time reading this (well-written) blog post: https://machinelearningmastery.com/learning-rate-for-deep-learning-neural-networks/

To start with, try learning rates in the range [0.01, 0.00001]. For example:

optim = torch.optim.Adam(model.parameters(), lr=0.001)
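If you want to compare a few candidates more systematically, here is a rough sketch (it assumes you change the question's train() to accept the learning rate as an argument instead of hard-coding it in the optimizer):

from REINFORCE import PolicyNet, train

# Hypothetical sweep: assumes train(model, name, lr) forwards lr into
# torch.optim.Adam(model.parameters(), lr=lr).
for lr in [1e-2, 1e-3, 1e-4, 1e-5]:
    model = PolicyNet(24, 4, 256)  # fresh weights for each candidate
    train(model, 'BipedalWalker-v3', lr=lr)
    # keep whichever value gives the best average-reward curve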
Answered 2020-07-25 at 11:02