So I'm building my first simple DQN neural network, but I'm really struggling with my network's output shape.

My input has 139 features, so I set input_shape=(None,139), with a batch size of 64. The last layer has 4 outputs because my environment has 4 possible actions (0, 1, 2, 3).
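
For context, these are roughly the shapes I think I should be working with (the array names below are only placeholders for illustration, not my real variables):

import numpy as np

state = np.random.rand(139).astype(np.float32)            # one observation: 139 features
state_batch = np.random.rand(64, 139).astype(np.float32)  # a batch of 64 observations
q_values = np.random.rand(64, 4).astype(np.float32)       # 4 Q-values per observation, one per action (0-3)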

But I get this error:

ValueError: Error when checking target: expected dense_4 to have shape (None, 1) but got array with shape (1, 4)

This is driving me crazy. What am I doing wrong?

def create_model(self):
    model = Sequential()
    model.add(Dense(128, input_shape=(None,139), activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(4, activation='softmax'))

    #Model compile settings:
    opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

    # Compile model
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=opt,
        metrics=['accuracy']
    )
    print(model.summary())
    return model
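
From what I understand of the Keras docs, sparse_categorical_crossentropy expects integer class labels as targets (one value per sample), which I guess is where the (None, 1) in the error comes from. Roughly (dummy labels, just to show the difference):

import numpy as np

labels_sparse = np.array([0, 2, 1, 3])       # what sparse_categorical_crossentropy expects: integer class ids
labels_onehot = np.array([[1, 0, 0, 0],      # what categorical_crossentropy would expect instead
                          [0, 0, 1, 0],
                          [0, 1, 0, 0],
                          [0, 0, 0, 1]])

But I don't see how that fits with wanting 4 Q-values per state.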

Model summary (printed twice because create_model() is called for both the main model and the target model):

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_23 (Dense)             (None, None, 128)         17920     
_________________________________________________________________
dropout_19 (Dropout)         (None, None, 128)         0         
_________________________________________________________________
dense_24 (Dense)             (None, None, 128)         16512     
_________________________________________________________________
dropout_20 (Dropout)         (None, None, 128)         0         
_________________________________________________________________
dense_25 (Dense)             (None, None, 128)         16512     
_________________________________________________________________
dropout_21 (Dropout)         (None, None, 128)         0         
_________________________________________________________________
dense_26 (Dense)             (None, None, 4)           516       
=================================================================
Total params: 51,460
Trainable params: 51,460
Non-trainable params: 0
_________________________________________________________________
None
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_27 (Dense)             (None, None, 128)         17920     
_________________________________________________________________
dropout_22 (Dropout)         (None, None, 128)         0         
_________________________________________________________________
dense_28 (Dense)             (None, None, 128)         16512     
_________________________________________________________________
dropout_23 (Dropout)         (None, None, 128)         0         
_________________________________________________________________
dense_29 (Dense)             (None, None, 128)         16512     
_________________________________________________________________
dropout_24 (Dropout)         (None, None, 128)         0         
_________________________________________________________________
dense_30 (Dense)             (None, None, 4)           516       
=================================================================
Total params: 51,460
Trainable params: 51,460
Non-trainable params: 0
_________________________________________________________________
None
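
A cut-down sketch of the same stack (only the first and last layer, same input_shape, dummy data) shows where the extra dimension in the summary comes from:

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

m = Sequential()
m.add(Dense(128, input_shape=(None, 139), activation='relu'))
m.add(Dense(4, activation='softmax'))

dummy = np.random.rand(1, 1, 139).astype(np.float32)  # (batch, ?, features)
print(m.predict(dummy).shape)  # should print (1, 1, 4), matching the (None, None, 4) rows above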

Update: posting the code below - the DQN class with the model, train method, etc.

class DQNAgent:
    def __init__(self):

        # main model, gets trained every step
        self.model = self.create_model()

        # target model, this is what we .predict against every step
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
        # self.tensorboard = ModifiedTensorBoard(log_dir=f"logs/{MODEL_NAME}-{int(time.time())}")
        self.target_update_counter = 0

    def create_model(self):
        model = Sequential()
        model.add(Dense(128, input_shape=(None, 139), activation='relu'))
        model.add(Dropout(0.2))

        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.2))

        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.2))

        model.add(Dense(4, activation='softmax'))

        # Model compile settings:
        opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

        # Compile model
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=opt,
            metrics=['accuracy']
        )
        print(model.summary())
        return model

    def update_replay_memory(self, transition):
        self.replay_memory.append(transition)

    def train(self, terminal_state):
        global export
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        current_states = np.array([transition[0] for transition in minibatch])

        current_qs_list = self.model.predict(current_states)

        new_states = np.array([transition[3] for transition in minibatch])

        future_qs_list = self.target_model.predict(new_states)

        X = []
        y = []

        # Now we need to enumerate our batches
        for index, (current_state, action, reward, new_state, done) in enumerate(minibatch):

            # If not a terminal state, get new q from future states, otherwise set it to 0
            # almost like with Q Learning, but we use just part of equation here
            if not done:
                max_future_q = np.max(future_qs_list[index])
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward

            # Update Q value for given state
            qs = current_qs_list[index]
            print(qs, qs.shape)
            qs[0, action] = new_q

            # And append to our training data
            X.append(current_state)
            y.append(qs)

        # Fit on all samples as one batch, log only on terminal state
        self.model.fit(
            np.array(X),
            np.array(y),
            batch_size=MINIBATCH_SIZE,
            verbose=0,
            shuffle=False,
            callbacks=[tensorboard] if terminal_state else None,
        )

        # Update target network counter every episode
        if done:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

    # Queries main network for Q values given current observation space (environment state)
    def get_qs(self, state):
        return self.model.predict(scaler.transform(np.array(state).reshape(-1, *state.shape)))[0]


agent = DQNAgent()
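
Since the error complains about the target, here is (as far as I can work out) what ends up going into model.fit: each scaled state is reshaped to (1, 139) in the loop below, and each qs row from predict is indexed as qs[0, action], i.e. shape (1, 4). So with my minibatch of 64, the stacked arrays should look like this (dummy values, just to show the shapes):

import numpy as np

X = [np.random.rand(1, 139).astype(np.float32) for _ in range(64)]  # scaled current states, each (1, 139)
y = [np.random.rand(1, 4).astype(np.float32) for _ in range(64)]    # updated qs rows, each (1, 4)

print(np.array(X).shape)  # (64, 1, 139)
print(np.array(y).shape)  # (64, 1, 4)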

-

for i in range(EPOCHS):
    print("EPOCH #", i, " starting, of ", EPOCHS, "epochs")
    if i == EPOCHS - 1:  # the last epoch, use test data set
        current_state, xdata = preprocess(test_filename)
    else:
        current_state, xdata = preprocess(dataframe)
    win_loss = 0
    step = 1
    # Iterate over episodes
    for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit="episodes"):

        # Update tensorboard step every episode
        # agent.tensorboard.step = episode

        # Restarting episode - reset episode reward and step number
        episode_reward = 0

        # Reset flag and start iterating until episode ends
        done = False

        while not done:
            done = are_we_done(current_state)

            # This part stays mostly the same, the change is to query a model for Q values
            if np.random.random() > epsilon:
                # Get action from Q table
                action = np.argmax(agent.get_qs(current_state))
                # print("Q-value action")
                action = action_check(current_state, action, orders)
            else:
                # Get random action
                # print("Random action")
                action = np.random.randint(0, 4)
                action = action_check(current_state, action, orders)

            (
                new_state,
                terminal_state,
                win_loss,
                close,
                total_win_loss,
                step,
                orders,
            ) = to_market(current_state, action, step, win_loss)

            reward = get_reward(win_loss, prev_win_loss)
            episode_reward += reward

            # Every step we update replay memory and train main network
            scaled_current = (scaler.transform(current_state)).reshape(
                1, current_state.shape[1]
            )
            scaled_new_state = (scaler.transform(new_state)).reshape(
                1, new_state.shape[1]
            )
            agent.update_replay_memory(
                (scaled_current, action, reward, scaled_new_state, done)
            )
            agent.train(done)

            # step += 1

            current_state = new_state

            prev_win_loss = win_loss

            if (
                current_state.flatten()[3] == 23 and current_state.flatten()[4] >= 57
            ):  # Close for trades between 23.57 and 00.15 due to swaps and crazy market
                market_close = True
                while market_close:
                    if (
                        current_state.flatten()[3] == 0
                        and current_state.flatten()[4] >= 15
                    ):
                        market_close = False
                    else:
                        sleep(10)

            market_close = False

        # Append episode reward to a list and log stats (every given number of episodes)
        ep_rewards.append(episode_reward)

        with writer.as_default():
            tf.summary.scalar("Reward", episode_reward, step=episode)

        average_reward = sum(ep_rewards) / len(ep_rewards)
        min_reward = min(ep_rewards)
        max_reward = max(ep_rewards)
        agent.tensorboard.update_stats(
            reward_avg=average_reward,
            reward_min=min_reward,
            reward_max=max_reward,
            epsilon=epsilon,
        )

        # Save model, but only when min reward is greater or equal a set value
        if total_win_loss >= MIN_WIN_LOSS:
            agent.model.save(f"models/{MODEL_NAME}__{SYMBOL}__{int(time.time())}.model")

        # Decay epsilon
        if epsilon > MIN_EPSILON:
            epsilon *= EPSILON_DECAY
            epsilon = max(MIN_EPSILON, epsilon)
    print("EPOCH #", i, " done, of ", EPOCHS, "epochs")