I am implementing REINFORCE for CartPole-v0, but training is very unstable. I did not implement early stopping and instead let training run for a fixed (large) number of episodes. After a few thousand episodes, the training reward appears to drop again. Is this caused by something like overfitting, making early stopping essential, or is my implementation incorrect?
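
For context, the update I am trying to implement is the standard per-episode REINFORCE update. Below is a rough sketch of it (not my actual code, which follows further down; the names are illustrative, and it standardises the returns with mean/std, which is the common variant):

import torch

def reinforce_loss(log_probs, rewards, gamma=0.99):
    # log_probs: list of log pi(a_t | s_t) tensors collected during the episode
    # rewards:   list of scalar rewards r_{t+1}
    returns, G = [], 0.0
    for r in reversed(rewards):
        G = r + gamma * G          # G_t = r_{t+1} + gamma * G_{t+1}
        returns.insert(0, G)
    returns = torch.tensor(returns)
    # common stabilisation: standardise the returns within the episode
    returns = (returns - returns.mean()) / (returns.std() + 1e-6)
    # REINFORCE loss: -sum_t log pi(a_t | s_t) * G_t
    return -(torch.stack(log_probs) * returns).sum()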

Here is my code:

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import os


def running_average(x, n):
    # trailing moving average of x over a window of n samples
    N = n
    kernel = np.ones(N)
    conv_len = x.shape[0]-N
    y = np.zeros(conv_len)
    for i in range(conv_len):
        y[i] = kernel @ x[i:i+N]  # '@' is the dot product here (np.matmul)
        y[i] /= N
    return y


class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, n_actions):
        super().__init__()
        self.n_actions = n_actions
        self.model = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, n_actions),
            nn.Softmax(dim=1)
        ).float()

    def forward(self, X):
        return self.model(X)


def train_reinforce_agent(env, episode_length, max_episodes, gamma, visualize_step, learning_rate=0.003):

    
    model = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    EPISODE_LENGTH = episode_length
    MAX_EPISODES = max_episodes
    GAMMA = gamma
    VISUALIZE_STEP = max(1, visualize_step)
    score = []


    for episode in range(MAX_EPISODES):
        curr_state = env.reset()
        done = False
        all_episode_t = []
        score_episode = 0
        for t in range(EPISODE_LENGTH):
            # sample an action from the current policy
            act_prob = model(torch.from_numpy(curr_state).unsqueeze(0).float())
            action = np.random.choice(np.array(list(range(env.action_space.n))), p=act_prob.squeeze(0).data.numpy())
            prev_state = curr_state
            curr_state, reward, done, info = env.step(action)
            score_episode += reward
            e_t = {'state': prev_state, 'action':action, 'reward': reward, 'returns':0}
            all_episode_t.append(e_t)
            if done:
                break
        score.append(score_episode)

        # compute the discounted return G_t for every step, iterating backwards through the episode
        G = 0
        max_G = 0
        for t in range(len(all_episode_t)-1, -1, -1):
            G = GAMMA*G + all_episode_t[t]['reward']
            all_episode_t[t]['returns'] = G
            if G > max_G:
                max_G = G

        episode_returns = np.array([all_episode_t[t]['returns'] for t in range(len(all_episode_t))])
        # normalize the returns: subtract the episode mean and scale by the largest return in the episode
        for t in range(len(all_episode_t)):
            all_episode_t[t]['returns'] = (all_episode_t[t]['returns'] - np.mean(episode_returns))/(max_G + 10**(-6))



        episode_returns = torch.FloatTensor(episode_returns)
        state_batch = torch.Tensor(np.array([all_episode_t[t]['state'] for t in range(len(all_episode_t))]))
        action_batch = torch.Tensor(np.array([all_episode_t[t]['action'] for t in range(len(all_episode_t))]))

        
        pred_batch = model(state_batch)
        # probability of the action that was actually taken at every step
        prob_batch = pred_batch.gather(dim=1, index=action_batch.long().view(-1, 1)).squeeze()
        # REINFORCE loss: -sum_t log pi(a_t | s_t) * G_t
        loss_tensor = torch.log(prob_batch) * episode_returns
        loss = -torch.sum(loss_tensor)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if episode % VISUALIZE_STEP == 0 and episode > 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(episode, np.mean(score[-VISUALIZE_STEP:-1])))
            # # EARLY-STOPPING: if the average score across last 100 episodes is greater than 195, game is solved
            # if np.mean(score[-100:-1]) > 195:
            #     break


    # Training plot
    score = np.array(score)
    avg_score = running_average(score, visualize_step)
    plt.figure(figsize=(15, 7))
    plt.ylabel("Episodic Reward", fontsize=12)
    plt.xlabel("Training Episodes", fontsize=12)
    plt.plot(score, color='gray', linewidth=1)
    plt.plot(avg_score, color='blue', linewidth=3)
    plt.scatter(np.arange(score.shape[0]), score, color='green', linewidth=0.3)
    plt.savefig("cartpole_reinforce_training_plot.pdf")

   


def main():
    env = gym.make('CartPole-v0')
    episode_length = 300
    n_episodes = 5000
    gamma = 0.99
    vis_steps = 100
    learning_rate = 0.003
    train_reinforce_agent(env, episode_length, n_episodes, gamma, vis_steps, learning_rate=learning_rate)
    

if __name__ == "__main__":
    main()
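
For reference, the early-stopping check I left commented out would look roughly like this (a sketch; `is_solved` is just an illustrative helper, using the usual CartPole-v0 criterion of an average reward of 195 over the last 100 episodes):

import numpy as np

def is_solved(score, window=100, threshold=195.0):
    # CartPole-v0 counts as solved once the mean reward over the last
    # 100 consecutive episodes reaches 195
    return len(score) >= window and np.mean(score[-window:]) >= threshold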
