Here is my code.
The network input is the state (4-d) and the output is the Q-values (2-d).
I use a deque-based buffer for experience replay.
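For context, here is a minimal sketch of the pieces the training loop relies on (policy_net, target_policy_net, the optimizer, and the replay queue with its sample_mini_batch method). The layer sizes, capacity, and learning rate below are placeholders rather than my exact settings:

    import random
    from collections import deque

    import numpy as np
    import torch
    import torch.nn as nn

    BATCH_SIZE = 32

    class QNetwork(nn.Module):
        """Maps a 4-d state to 2 Q-values (one per action)."""
        def __init__(self, state_dim=4, n_actions=2, hidden=64):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(state_dim, hidden), nn.ReLU(),
                nn.Linear(hidden, n_actions),
            )

        def forward(self, x):
            return self.net(x)

    class ReplayQueue:
        """Deque-backed experience replay with uniform minibatch sampling."""
        def __init__(self, capacity=50_000):
            self.buffer = deque(maxlen=capacity)

        def append(self, transition):
            # transition = (obs, action, reward, done, next_obs)
            self.buffer.append(transition)

        def sample_mini_batch(self, batch_size=BATCH_SIZE):
            batch = random.sample(self.buffer, batch_size)
            # Transpose the list of transitions into five parallel tuples so the
            # training loop can unpack states, actions, rewards, dones, next_states.
            return list(zip(*batch))

    policy_net = QNetwork()
    target_policy_net = QNetwork()
    target_policy_net.load_state_dict(policy_net.state_dict())
    optimizer = torch.optim.Adam(policy_net.parameters(), lr=1e-3)
    queue = ReplayQueue()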
Training loop:
for i in range(EPOCHS):
    episode_reward = 0
    done = False
    obs = env.reset()
    while not done:
        # Epsilon-greedy action selection
        if random.random() < EPSILON:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                q_values = policy_net(torch.tensor(obs, dtype=torch.float32))
            action = int(q_values.argmax().item())
        next_obs, reward, done, info = env.step(action)
        episode_reward += reward
        queue.append((obs, action, reward, done, next_obs))

        if step_counter >= PRETRAINING_LENGTH:
            # Sample a minibatch of transitions from the replay buffer
            experience = queue.sample_mini_batch()
            states, actions, rewards, dones, next_states = experience
            rewards_t = torch.FloatTensor(np.array(rewards).reshape(BATCH_SIZE, 1))
            actions_t = torch.LongTensor(np.array(actions).reshape(BATCH_SIZE, 1))
            dones_t = torch.FloatTensor(np.array(dones).reshape(BATCH_SIZE, 1))

            # Q(s, a) for the actions actually taken
            estimate = policy_net(torch.FloatTensor(np.array(states))).gather(1, actions_t)
            with torch.no_grad():
                # Bootstrapped target from the frozen target network
                q_next = target_policy_net(torch.FloatTensor(np.array(next_states)))
            y_vector = rewards_t + DISCOUNT_FACTOR * q_next.max(1)[0].view(BATCH_SIZE, 1) * (1 - dones_t)

            optimizer.zero_grad()
            loss = functional.smooth_l1_loss(estimate, y_vector)
            loss.backward()
            optimizer.step()

        if step_counter % TARGET_NETWORK_UPDATE_FREQUENCY == 0:
            # Sync the target network with the online policy network
            target_policy_net.load_state_dict(policy_net.state_dict())

        step_counter += 1
        obs = next_obs

    # Decay exploration once per episode
    EPSILON = EPSILON * EPSILON_DECAY
    if EPSILON < MIN_EPSILON:
        EPSILON = MIN_EPSILON

    episode_reward_record.append(episode_reward)
    if i % 100 == 0 and i > 0:
        # Average over the most recent 100 episodes only
        last_100_avg = sum(list(episode_reward_record)[-100:]) / 100
        print("LAST 100 EPISODE AVERAGE REWARD: " + str(last_100_avg))
        print("EPSILON: " + str(EPSILON))
        if last_100_avg > EARLY_STOPPING_THRESHOLD:
            break
I use a simple network to approximate the Q function, together with experience replay. The trouble I'm running into is that episode_reward keeps decreasing as training progresses.