python - Cartpole-v0 的 PyTorch PPO 实现陷入局部最优

Question

我已经为 Cartpole-VO 环境实施了 PPO。但是，它不会在游戏的某些迭代中收敛。有时它会陷入局部最优。我已经使用 TD-0 优势实现了算法，即

A(s_t) = R(t+1) + \gamma V(S_{t+1}) - V(S_t)

这是我的代码：

def running_average(x, n):
    N = n
    kernel = np.ones(N)
    conv_len = x.shape[0]-N
    y = np.zeros(conv_len)
    for i in range(conv_len):
        y[i] = kernel @ x[i:i+N] # matrix multiplication operator: np.mul
        y[i] /= N
    return y



class ActorNetwork(nn.Module):
    def __init__(self, state_dim, n_actions, learning_rate=0.0003, epsilon_clipping=0.3, update_epochs=10):
        super().__init__()
        self.n_actions = n_actions
        self.model = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, n_actions),
            nn.Softmax(dim=-1)
        ).float()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        self.epsilon_clipping = epsilon_clipping
        self.update_epochs = update_epochs

    def forward(self, X):
        return self.model(X)

    
    def predict(self, state):
        if state.ndim < 2:
            action_probs = self.model(torch.FloatTensor(state).unsqueeze(0).float())
        else: 
            action_probs = self.model(torch.FloatTensor(state))

        return action_probs.squeeze(0).data.numpy()

   
    def update(self, states, actions, deltas, old_prob):
  
        batch_size = len(states)
        state_batch = torch.Tensor(states)
        action_batch = torch.Tensor(actions)
        delta_batch = torch.Tensor(deltas)
        old_prob_batch = torch.Tensor(old_prob)
        for k in range(self.update_epochs):
            pred_batch = self.model(state_batch)

            prob_batch = pred_batch.gather(dim=1, index=action_batch.long().view(-1, 1)).squeeze()

            ratio = torch.exp(torch.log(prob_batch) - torch.log(old_prob_batch))

            clipped = torch.clamp(ratio, 1 - self.epsilon_clipping, 1 + self.epsilon_clipping) * delta_batch
            loss_r = -torch.min(ratio*delta_batch, clipped)
            loss = torch.mean(loss_r)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()




class CriticNetwork(nn.Module):
    def __init__(self, state_dim, learning_rate=0.001):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ).float()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)


    def forward(self, X):
        return self.model(X)

    def predict(self, state):
        if state.ndim < 2:
            values = self.model(torch.FloatTensor(state).unsqueeze(0).float())
        else:
            values = self.model(torch.FloatTensor(state))

        return values.data.numpy()

  
    def update(self, states, targets):
        
        state_batch = torch.Tensor(states)
        target_batch = torch.Tensor(targets)
        pred_batch = self.model(state_batch)
        loss = torch.nn.functional.mse_loss(pred_batch, target_batch.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    


def train_ppo_agent(env, episode_length, max_episodes, gamma, visualize_step, learning_rate_actor=0.0003, learning_rate_critic=0.001, epsilon_clipping=0.2, actor_update_epochs=10):

  
    model_actor = ActorNetwork(env.observation_space.shape[0], env.action_space.n, learning_rate=learning_rate_actor,
                               epsilon_clipping=epsilon_clipping, update_epochs=actor_update_epochs)
    model_critic = CriticNetwork(env.observation_space.shape[0], learning_rate=learning_rate_critic)



    EPISODE_LENGTH = episode_length
    MAX_EPISODES = max_episodes
    GAMMA = gamma
    VISUALIZE_STEP = max(1, visualize_step)
    score = []


    for episode in range(MAX_EPISODES):
        curr_state = env.reset()
        done = False
        all_episode_t = []
        score_episode = 0
        for t in range(EPISODE_LENGTH):
            act_prob = model_actor.predict(curr_state)
            action = np.random.choice(np.array(list(range(env.action_space.n))), p=act_prob)
            value = model_critic.predict(curr_state)
            prev_state = curr_state
            curr_state, reward, done, info = env.step(action)
            score_episode += reward
            e_t = {'state': prev_state, 'action':action, 'action_prob':act_prob[action],'reward': reward, 'value': value}
            all_episode_t.append(e_t)
            if done:
                break
        score.append(score_episode)

        episode_values = [all_episode_t[t]['value'] for t in range(len(all_episode_t))]
        next_state_estimates = [episode_values[i].item() for i in range(1, len(episode_values))]
        next_state_estimates.append(0)
        boostrap_estimate = []
        for t in range(len(all_episode_t)):
            G = all_episode_t[t]['reward'] + GAMMA * next_state_estimates[t]
            boostrap_estimate.append(G)

        episode_target = np.array(boostrap_estimate)
        episode_values = np.array(episode_values)
        # compute the advantage for each state in the episode: R_{t+1} + \gamma * V(S_{t+1}) - V_{t}
        adv_batch = episode_target-episode_values
       
        state_batch = np.array([all_episode_t[t]['state'] for t in range(len(all_episode_t))])
        action_batch = np.array([all_episode_t[t]['action'] for t in range(len(all_episode_t))])
        old_actor_prob = np.array([all_episode_t[t]['action_prob'] for t in range(len(all_episode_t))])
       
        model_actor.update(state_batch, action_batch, adv_batch, old_actor_prob)
       
        model_critic.update(state_batch, episode_target)

        # print the status after every VISUALIZE_STEP episodes
        if episode % VISUALIZE_STEP == 0 and episode > 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(episode, np.mean(score[-VISUALIZE_STEP:-1])))
            # domain knowledge applied to stop training: if the average score across last 100 episodes is greater than 195, game is solved
            if np.mean(score[-100:-1]) > 195:
                break


    # Training plot: Episodic reward over Training Episodes
    score = np.array(score)
    avg_score = running_average(score, visualize_step)
    plt.figure(figsize=(15, 7))
    plt.ylabel("Episodic Reward", fontsize=12)
    plt.xlabel("Training Episodes", fontsize=12)
    plt.plot(score, color='gray', linewidth=1)
    plt.plot(avg_score, color='blue', linewidth=3)
    plt.scatter(np.arange(score.shape[0]), score, color='green', linewidth=0.3)
    plt.savefig("temp/cartpole_ppo_training_plot.pdf")

    # return the trained models
    return model_actor, model_critic

def main():
    env = gym.make('CartPole-v0')
    episode_length = 300
    n_episodes = 5000
    gamma = 0.99
    vis_steps = 100
    learning_rate_actor = 0.0003
    actor_update_epochs = 10
    epsilon_clipping = 0.2
    learning_rate_critic = 0.001
   
    # train the PPO agent
    model_actor, model_critic = train_ppo_agent(env, episode_length, n_episodes, gamma, vis_steps,
                                               learning_rate_actor=learning_rate_actor,
                                               learning_rate_critic=learning_rate_critic,
                                               epsilon_clipping=epsilon_clipping,
                                               actor_update_epochs=actor_update_epochs)

考虑到 Cartpole 环境的性质，我是否遗漏了什么，或者如果一个人使用简单的 TD-0 优势来实现 PPO，这种行为是预期的吗？

score 0 · Accepted Answer

如果删除行中的“-”（负标记）：

loss_r = -torch.min(ratio*delta_batch, clipped)

然后分数将随着时间的推移开始稳步增加。在此修复之前，您的负损失会随着时间的推移而增加。这不是神经网络的损失应该如何工作。由于梯度下降可以最大限度地减少损失。所以你想要一个可以被优化器最小化的正损失。

希望我的回答有点清楚，对不起，我不能更详细地介绍。

我的跑步可以在附图中看到：

python - Cartpole-v0 的 PyTorch PPO 实现陷入局部最优

1 回答 1

Related

Reference