Hi, I've been trying to train a DQN agent to solve the discrete Lunar Lander problem, but it just doesn't learn: even after 1000 episodes the average reward hovers around -130. I'm already using a target network. Can anyone point out what I'm doing wrong? My code is below. (Also, the code doesn't run on the GPU in Colab or Kaggle, so I have to wait a long time to see the effect of any change.) Please help.
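Regarding the GPU part: this is just a quick sanity-check sketch (assuming TF 2.x), separate from the training code, to see whether TensorFlow detects a GPU at all:

import tensorflow as tf
# An empty list means everything falls back to the CPU, which would explain the slow training.
print("GPUs visible to TensorFlow:", tf.config.list_physical_devices("GPU"))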
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from collections import deque
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import random
import time
env = gym.make("LunarLander-v2")
EPISODES = 3000
DISCOUNT = 0.99
LEARNING_RATE = 0.001
REPLAY_MEMORY_SIZE = 100_000
epsilon = 1
EPSILON_DECAY = 0.996
MIN_MEMORY_SIZE = 50_000
BATCH_SIZE = 128
UPDATE_TARGET_EVERY = 10
SHOW_EVERY = 50
MIN_EPSILON = 0.001
SAVE_EVERY = 100
CHCKPNT = 100
AGGREGATE_STATS_EVERY = 50
MIN_REWARD = -200
MODEL_NAME = "LunarModule"
# MEMORY_FRACTION = 0.2
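# DQN agent: one online network for action selection and training, plus a target network
# (periodically synced from the online network) that provides the bootstrap Q-targets.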
class Lander:
    def __init__(self):
        self.model = self.create_model()          # online network
        self.target_model = self.create_model()   # target network, synced periodically
        self.target_model.set_weights(self.model.get_weights())
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
        self.target_update_counter = 0
    def create_model(self):
        model = Sequential()
        model.add(Dense(128, activation="relu", input_shape=(8,)))
        model.add(Dense(128, activation="relu"))
        model.add(Dense(env.action_space.n, activation="linear"))
        model.compile(loss=tf.keras.losses.Huber(), optimizer=Adam(learning_rate=LEARNING_RATE), metrics=["accuracy"])
        return model
    def update_replay_memory(self, transition):
        self.replay_memory.append(transition)
    def train(self, terminal_state):
        if len(self.replay_memory) < MIN_MEMORY_SIZE:
            return
        minibatch = random.sample(self.replay_memory, BATCH_SIZE)
        current_states = np.array([transition[0] for transition in minibatch])
        current_qs_list = self.model.predict(current_states)
        new_current_states = np.array([transition[3] for transition in minibatch])
        new_qs_list = self.target_model.predict(new_current_states)
        X = []
        y = []
        for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
            # Q-learning target: reward + DISCOUNT * max_a' Q_target(s', a'), or just reward at terminal states
            if not done:
                max_future_q = np.max(new_qs_list[index])
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward
            current_qs = current_qs_list[index]
            current_qs[action] = new_q
            X.append(current_state)
            y.append(current_qs)
        self.model.fit(np.array(X), np.array(y), batch_size=BATCH_SIZE, verbose=0, shuffle=False)
        # Update target network counter every episode
        if terminal_state:
            self.target_update_counter += 1
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0
    # Queries main network for Q values given current observation space (environment state)
    def get_qs(self, state):
        return self.model.predict(np.array(state).reshape(-1, *state.shape))[0]
    def save_model(self, ep):
        self.model.save(f'./saved_model/agent_{ep}')
    def save_checkpoint(self, ckpt):
        self.model.save_weights(f"./checkpoints/my_checkpoint_{ckpt}")
agent = Lander()
ep_rewards = [-200]
for episode in tqdm(range(1, EPISODES + 1)):
    episode_reward = 0
    current_state = env.reset()
    done = False
    step = 0
    while not done:
        if episode % SHOW_EVERY == 0:
            render = True
        else:
            render = False
        if np.random.random() > epsilon:
            action = np.argmax(agent.get_qs(current_state))
        else:
            action = np.random.randint(0, env.action_space.n)
        new_state, reward, done, _ = env.step(action)
        # if render:
        #     env.render()
        episode_reward += reward
        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done)
        current_state = new_state
        step += 1
    ep_rewards.append(episode_reward)
    if not episode % AGGREGATE_STATS_EVERY or episode == 1:
        average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:]) / len(ep_rewards[-AGGREGATE_STATS_EVERY:])
        min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
        max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
        print(f"episode:{episode}, average reward:{average_reward}, min reward:{min_reward}, max reward:{max_reward}")
        # Save model, but only when min reward is greater or equal a set value
        if min_reward >= MIN_REWARD:
            agent.model.save(f'/content/drive/MyDrive/Models{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')
    if epsilon > MIN_EPSILON:
        epsilon *= EPSILON_DECAY
        epsilon = max(MIN_EPSILON, epsilon)