I am implementing a simple REINFORCE (policy gradient) algorithm for OpenAI's FrozenLake-v0 environment. However, it does not seem to learn anything.

I used the same neural architecture for OpenAI's CartPole-v0, trained it with REINFORCE (policy gradient), and it worked well. So what am I doing wrong in the FrozenLake-v0 environment?
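
For context, the main difference I can see between the two environments is the observation space: CartPole-v0 returns a 4-dimensional float vector (a Box space), while FrozenLake-v0 returns a single integer state id from a Discrete(16) space, which is why I wrote the one-hot helper OH in the code below. A quick check of the two spaces (a minimal sketch, assuming the old gym API that the rest of my code uses):

import gym

# Compare the two observation spaces (old gym API, as used in the code below)
print(gym.make('CartPole-v0').observation_space)    # Box space with shape (4,) -> a float vector
print(gym.make('FrozenLake-v0').observation_space)  # Discrete(16) -> a single integer state id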

My code is as follows:

import gym
from gym.envs.registration import register
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt


# helper function for conversion of a state into an input to a neural network
def OH(x, n):
    '''
    :param x: state id
    :param n: n_states
    :return:  1-hot encoded numpy array of shape (n,)
    '''
    one_hot = np.zeros((n,))
    one_hot[x] = 1
    return one_hot



def running_mean(x, n):
    N=n
    kernel = np.ones(N)
    conv_len = x.shape[0]-N
    y = np.zeros(conv_len)
    for i in range(conv_len):
        y[i] = kernel @ x[i:i+N]
        y[i] /= N
    return y


# architecture of the Policy Network
class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, n_actions):
        super().__init__()
        self.n_actions = n_actions
        self.model = nn.Sequential(
            nn.Linear(state_dim, 256),
            nn.ReLU(),
            nn.Linear(256, n_actions),
            nn.Softmax(dim=0)
        ).float()

    def forward(self, X):
        return self.model(X)


def train_reinforce_agent(env, episode_length = 100, max_episodes = 50000, gamma = 0.99, visualize_step = 50, learning_rate=0.003):

    # define the parametric model for the Policy: this is an instantiation of the PolicyNetwork class
    model = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
    # define the optimizer for updating the weights of the Policy Network
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)


    # hyperparameters of the reinforce agent
    EPISODE_LENGTH = episode_length
    MAX_EPISODES = max_episodes
    GAMMA = gamma
    VISUALIZE_STEP = max(1, visualize_step)
    score = []



    for episode in range(MAX_EPISODES):
        # reset the environment
        curr_state = env.reset()
        done = False
        transitions = []

        # rollout an entire episode from the Policy Network
        for t in range(EPISODE_LENGTH):
            act_prob = model(torch.from_numpy(curr_state).float())
            action = np.random.choice(np.array(list(range(env.action_space.n))), p=act_prob.data.numpy())
            prev_state = curr_state
            curr_state, _, done, info = env.step(action)
            transitions.append((prev_state, action, t+1))

            if done:
                break
        score.append(len(transitions))
        reward_batch = torch.Tensor([r for (s, a, r) in transitions]).flip(dims=(0,))


        # compute the return for every state-action pair from the rewards at every time-step
        batch_Gvals = []
        for i in range(len(transitions)):
            new_Gval = 0
            power = 0
            for j in range(i, len(transitions)):
                new_Gval = new_Gval + ((GAMMA ** power) * reward_batch[j]).numpy()
                power += 1
            batch_Gvals.append(new_Gval)

        # normalize the returns for the batch
        expected_returns_batch = torch.FloatTensor(batch_Gvals)
        expected_returns_batch /= expected_returns_batch.max()

        # batch the states, actions, prob after the episode
        state_batch = torch.Tensor([s for (s, a, r) in transitions])
        action_batch = torch.Tensor([a for (s, a, r) in transitions])
        pred_batch = model(state_batch)
        prob_batch = pred_batch.gather(dim=1, index=action_batch.long().view(-1, 1)).squeeze()


        # compute the loss for one episode
        loss = -torch.sum(torch.log(prob_batch) * expected_returns_batch)

        # back-propagate the loss
        optimizer.zero_grad()
        loss.backward()
        # update the parameters of the Policy Network
        optimizer.step()

        # print the status after every VISUALIZE_STEP episodes
        if episode % VISUALIZE_STEP == 0 and episode > 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(episode, np.mean(score[-VISUALIZE_STEP:-1])))


    # Training plot: Episodic reward over Training Episodes
    score = np.array(score)
    avg_score = running_mean(score, visualize_step)
    plt.figure(figsize=(15, 7))
    plt.ylabel("Episode Duration", fontsize=12)
    plt.xlabel("Training Episodes", fontsize=12)
    plt.plot(score, color='gray', linewidth=1)
    plt.plot(avg_score, color='blue', linewidth=3)
    plt.scatter(np.arange(score.shape[0]), score, color='green', linewidth=0.3)
    plt.show()
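
For reference, this is how I intended the OH helper above to be used to feed FrozenLake's integer state into the policy network (a minimal sketch outside the training loop; n_states and state_vec are just names used for this example):

# Sketch: FrozenLake-v0 returns an integer state id from a Discrete(16) space,
# so it is one-hot encoded before being passed to the policy network.
env = gym.make('FrozenLake-v0')
n_states = env.observation_space.n                      # 16 for the default 4x4 map
model = PolicyNetwork(n_states, env.action_space.n)     # state_dim = 16, n_actions = 4

state = env.reset()                                     # an integer in [0, n_states)
state_vec = OH(state, n_states)                         # numpy array of shape (n_states,)
act_prob = model(torch.from_numpy(state_vec).float())   # action probabilities, shape (n_actions,)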