So I'm building my own first simple DQN neural network, but I'm really struggling with the output shape of my network.
I have an input with 139 features, so input_shape=(None,139), and a batch size of 64. The last layer has 4 outputs because my environment has 4 possible actions (0, 1, 2, 3).
But I get this error:
ValueError: Error when checking target: expected dense_4 to have shape (None, 1) but got array with shape (1, 4)
This is driving me crazy. What am I doing wrong?
def create_model(self):
    model = Sequential()
    model.add(Dense(128, input_shape=(None, 139), activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(4, activation='softmax'))

    # Model compile settings:
    opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

    # Compile model
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=opt,
        metrics=['accuracy']
    )
    print(model.summary())
    return model
Model summary:
Model: "sequential_7"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_23 (Dense) (None, None, 128) 17920
_________________________________________________________________
dropout_19 (Dropout) (None, None, 128) 0
_________________________________________________________________
dense_24 (Dense) (None, None, 128) 16512
_________________________________________________________________
dropout_20 (Dropout) (None, None, 128) 0
_________________________________________________________________
dense_25 (Dense) (None, None, 128) 16512
_________________________________________________________________
dropout_21 (Dropout) (None, None, 128) 0
_________________________________________________________________
dense_26 (Dense) (None, None, 4) 516
=================================================================
Total params: 51,460
Trainable params: 51,460
Non-trainable params: 0
_________________________________________________________________
None
Model: "sequential_8"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_27 (Dense) (None, None, 128) 17920
_________________________________________________________________
dropout_22 (Dropout) (None, None, 128) 0
_________________________________________________________________
dense_28 (Dense) (None, None, 128) 16512
_________________________________________________________________
dropout_23 (Dropout) (None, None, 128) 0
_________________________________________________________________
dense_29 (Dense) (None, None, 128) 16512
_________________________________________________________________
dropout_24 (Dropout) (None, None, 128) 0
_________________________________________________________________
dense_30 (Dense) (None, None, 4) 516
=================================================================
Total params: 51,460
Trainable params: 51,460
Non-trainable params: 0
_________________________________________________________________
None
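Both summaries show that every layer's output is 3D (the last Dense is (None, None, 4)), which I assume comes from input_shape=(None, 139). Below is a minimal sketch to check just the shapes, assuming tf.keras; the layer sizes, the random state and the batch of 64 are only stand-ins that mirror my setup:

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Same input_shape and output head as my model, nothing else.
model = Sequential()
model.add(Dense(128, input_shape=(None, 139), activation='relu'))
model.add(Dense(4, activation='softmax'))
print(model.output_shape)          # (None, None, 4) - an extra "time" dimension

# One scaled state the way I store it in replay memory: shape (1, 139)
state = np.random.rand(1, 139).astype('float32')
batch = np.array([state] * 64)     # what train() builds from the minibatch: (64, 1, 139)
print(model.predict(batch).shape)  # (64, 1, 4)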
Update: posting the full code below - the DQN class with the model, train, etc.
class DQNAgent:
    def __init__(self):
        # Main model - gets trained every step
        self.model = self.create_model()

        # Target model - this is what we .predict against every step
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
        # self.tensorboard = ModifiedTensorBoard(log_dir=f"logs/{MODEL_NAME}-{int(time.time())}")
        self.target_update_counter = 0

    def create_model(self):
        model = Sequential()
        model.add(Dense(128, input_shape=(None, 139), activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(4, activation='softmax'))

        # Model compile settings:
        opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

        # Compile model
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=opt,
            metrics=['accuracy']
        )
        print(model.summary())
        return model

    def update_replay_memory(self, transition):
        self.replay_memory.append(transition)

    def train(self, terminal_state):
        global export
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        current_states = np.array([transition[0] for transition in minibatch])
        current_qs_list = self.model.predict(current_states)

        new_states = np.array([transition[3] for transition in minibatch])
        future_qs_list = self.target_model.predict(new_states)

        X = []
        y = []

        # Now we need to enumerate our batches
        for index, (current_state, action, reward, new_state, done) in enumerate(minibatch):
            # If not a terminal state, get new q from future states, otherwise set it to 0
            # almost like with Q Learning, but we use just part of equation here
            if not done:
                max_future_q = np.max(future_qs_list[index])
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward

            # Update Q value for given state
            qs = current_qs_list[index]
            print(qs, qs.shape)
            qs[0, action] = new_q

            # And append to our training data
            X.append(current_state)
            y.append(qs)

        # Fit on all samples as one batch, log only on terminal state
        self.model.fit(np.array(X), np.array(y), batch_size=MINIBATCH_SIZE, verbose=0,
                       shuffle=False, callbacks=[tensorboard] if terminal_state else None)

        # Update target network counter every episode
        if done:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

    # Queries main network for Q values given current observation space (environment state)
    def get_qs(self, state):
        return self.model.predict(scaler.transform(np.array(state).reshape(-1, *state.shape)))[0]
agent = DQNAgent()
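Before the training loop below, a quick numpy-only check of what get_qs() feeds into predict(), assuming current_state comes out of preprocess() as a (1, 139) array (which is what the reshape(1, current_state.shape[1]) calls further down suggest) and ignoring the scaler:

import numpy as np

state = np.random.rand(1, 139)                        # assumed shape of current_state
reshaped = np.array(state).reshape(-1, *state.shape)
print(reshaped.shape)                                 # (1, 1, 139) - 3D; if the scaler keeps this
                                                      # shape, predict() returns (1, 1, 4) and
                                                      # get_qs(...)[0] is (1, 4), not (4,)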
for i in range(EPOCHS):
    print("EPOCH #", i, " starting, of ", EPOCHS, "epochs")
    if i == EPOCHS - 1:  # the last epoch, use test data set
        current_state, xdata = preprocess(test_filename)
    else:
        current_state, xdata = preprocess(dataframe)
    win_loss = 0
    step = 1

    # Iterate over episodes
    for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit="episodes"):
        # Update tensorboard step every episode
        # agent.tensorboard.step = episode

        # Restarting episode - reset episode reward and step number
        episode_reward = 0

        # Reset flag and start iterating until episode ends
        done = False
        while not done:
            done = are_we_done(current_state)

            # This part stays mostly the same, the change is to query a model for Q values
            if np.random.random() > epsilon:
                # Get action from Q table
                action = np.argmax(agent.get_qs(current_state))
                # print("Q-value action")
                action = action_check(current_state, action, orders)
            else:
                # Get random action
                # print("Random action")
                action = np.random.randint(0, 4)
                action = action_check(current_state, action, orders)

            (
                new_state,
                terminal_state,
                win_loss,
                close,
                total_win_loss,
                step,
                orders,
            ) = to_market(current_state, action, step, win_loss)

            reward = get_reward(win_loss, prev_win_loss)
            episode_reward += reward

            # Every step we update replay memory and train main network
            scaled_current = (scaler.transform(current_state)).reshape(
                1, current_state.shape[1]
            )
            scaled_new_state = (scaler.transform(new_state)).reshape(
                1, new_state.shape[1]
            )
            agent.update_replay_memory(
                (scaled_current, action, reward, scaled_new_state, done)
            )
            agent.train(done)

            # step += 1
            current_state = new_state
            prev_win_loss = win_loss

            if (
                current_state.flatten()[3] == 23 and current_state.flatten()[4] >= 57
            ):  # Close for trades between 23.57 and 00.15 due to swaps and crazy market
                market_close = True
                while market_close:
                    if (
                        current_state.flatten()[3] == 0
                        and current_state.flatten()[4] >= 15
                    ):
                        market_close = False
                    else:
                        sleep(10)
                        market_close = False

        # Append episode reward to a list and log stats (every given number of episodes)
        ep_rewards.append(episode_reward)
        with writer.as_default():
            tf.summary.scalar("Reward", episode_reward, step=episode)
        average_reward = sum(ep_rewards) / len(ep_rewards)
        min_reward = min(ep_rewards)
        max_reward = max(ep_rewards)
        agent.tensorboard.update_stats(
            reward_avg=average_reward,
            reward_min=min_reward,
            reward_max=max_reward,
            epsilon=epsilon,
        )

        # Save model, but only when min reward is greater or equal a set value
        if total_win_loss >= MIN_WIN_LOSS:
            agent.model.save(f"models/{MODEL_NAME}__{SYMBOL}__{int(time.time())}.model")

        # Decay epsilon
        if epsilon > MIN_EPSILON:
            epsilon *= EPSILON_DECAY
            epsilon = max(MIN_EPSILON, epsilon)
    print("EPOCH #", i, " done, of ", EPOCHS, "epochs")
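For completeness, this is the shape check that mirrors what train() ends up fitting on (a sketch only; the state, action and Q values are stand-ins, and the (1, 4) for qs matches the shape reported in the error message):

import numpy as np

# One transition the way the loop stores it:
# (scaled_current, action, reward, scaled_new_state, done)
scaled_state = np.random.rand(1, 139)
minibatch = [(scaled_state, 2, 1.0, scaled_state, False)] * 64

current_states = np.array([t[0] for t in minibatch])
print(current_states.shape)   # (64, 1, 139) - 3D, not (64, 139)

qs = np.zeros((1, 4))         # one row of current_qs_list
qs[0, 2] = 0.5                # qs[0, action] = new_q
y = np.array([qs] * 64)
print(y.shape)                # (64, 1, 4) - this is the target passed to model.fit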