
Here is my code.

The network's input is the state (4-dimensional) and its output is the Q-values (2-dimensional, one per action).
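
Roughly, the network looks like this (the hidden width of 64 is just a placeholder, not my exact value):

    import torch
    import torch.nn as nn

    # Q-network sketch: 4-dimensional state in, one Q-value per action out (2 actions).
    # The hidden width of 64 is a placeholder.
    policy_net = nn.Sequential(
        nn.Linear(4, 64),
        nn.ReLU(),
        nn.Linear(64, 2),
    )
    # Target network with the same architecture, started from the same weights.
    target_policy_net = nn.Sequential(
        nn.Linear(4, 64),
        nn.ReLU(),
        nn.Linear(64, 2),
    )
    target_policy_net.load_state_dict(policy_net.state_dict())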

I use a deque for experience replay.
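
The buffer is essentially a deque plus a sampling helper, something along these lines (capacity and batch size here are placeholders; the column-wise return is what the unpacking in the loop below expects):

    import random
    from collections import deque

    class ReplayQueue:
        # Deque-backed replay buffer sketch; capacity and batch size are placeholder values.
        def __init__(self, capacity=100_000, batch_size=32):
            self.buffer = deque(maxlen=capacity)
            self.batch_size = batch_size

        def append(self, transition):
            # transition = (obs, action, reward, done, next_obs), as appended in the training loop
            self.buffer.append(transition)

        def sample_mini_batch(self):
            # sample uniformly and transpose into (states, actions, rewards, dones, next_states)
            batch = random.sample(self.buffer, self.batch_size)
            return list(zip(*batch))

    queue = ReplayQueue()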

Training loop:
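
The loop below uses a number of names defined earlier in my script; here is a placeholder setup so the snippet is self-contained (every concrete value is illustrative, not my real setting):

    import gym
    import numpy as np
    import torch.nn.functional as functional
    import torch.optim as optim
    from collections import deque

    # Placeholder hyperparameters; the real values are not shown in this post.
    env = gym.make("CartPole-v1")   # any env with a 4-d state and 2 discrete actions
    EPOCHS = 2000
    EPSILON, EPSILON_DECAY, MIN_EPSILON = 1.0, 0.995, 0.05
    DISCOUNT_FACTOR = 0.99
    BATCH_SIZE = 32
    PRETRAINING_LENGTH = 1000
    TARGET_NETWORK_UPDATE_FREQUENCY = 500
    EARLY_STOPPING_THRESHOLD = 195
    step_counter = 0
    episode_reward_record = deque(maxlen=100)
    optimizer = optim.Adam(policy_net.parameters(), lr=1e-3)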

    for i in range(EPOCHS):
        episode_reward = 0
        done = False
        obs = env.reset()
        while not done:
                 
            # epsilon-greedy action selection
            if random.random() < EPSILON:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    q_values = policy_net(torch.tensor(obs, dtype=torch.float32))
                    action = int(torch.argmax(q_values))

            next_obs, reward, done, info = env.step(action)

            episode_reward += reward

            queue.append((obs, action, reward, done, next_obs))


            if step_counter >= PRETRAINING_LENGTH:
                
                experience = queue.sample_mini_batch()
                
               
                # unpack the sampled minibatch into its components
                states, actions, rewards, dones, next_states = experience
            
            
                rewards_t = torch.FloatTensor(np.array(rewards).reshape(BATCH_SIZE, 1))
                actions_t = torch.LongTensor(np.array(actions).reshape(BATCH_SIZE, 1))
                dones_t = torch.FloatTensor(np.array(dones).reshape(BATCH_SIZE, 1))

                # Q(s, a) of the policy network for the actions actually taken
                estimate = policy_net(torch.FloatTensor(np.array(states))).gather(1, actions_t)

                with torch.no_grad():
                    q_next = target_policy_net(torch.Tensor(np.array(next_states)))

                # TD target: r + DISCOUNT_FACTOR * max_a Q_target(s', a); (1 - dones_t) zeroes the bootstrap term for terminal transitions
                y_vector = rewards_t + DISCOUNT_FACTOR * q_next.max(1)[0].view(BATCH_SIZE, 1) * (1 - dones_t)

                optimizer.zero_grad()
                loss = functional.smooth_l1_loss(estimate, y_vector)
                loss.backward()
                optimizer.step()

            if step_counter % TARGET_NETWORK_UPDATE_FREQUENCY == 0:
                target_policy_net.load_state_dict(policy_net.state_dict()) # here we update the target policy network to match the policy network
            step_counter += 1

        obs = next_obs

        EPSILON = EPSILON * EPSILON_DECAY
        if EPSILON < MIN_EPSILON:
            EPSILON = MIN_EPSILON

        episode_reward_record.append(episode_reward)

        if i % 100 == 0 and i > 0:
            last_100_avg = sum(episode_reward_record) / 100
            print("LAST 100 EPISODE AVERAGE REWARD: " + str(last_100_avg))
            print("EPSILON: " +  str(EPSILON))
            if last_100_avg > EARLY_STOPPING_THRESHOLD:
                break

   

I use a simple network to approximate the Q-function, together with experience replay. The trouble I'm running into is that episode_reward keeps decreasing as training progresses.

