0

对于学校作业,我必须使用深度 Q 网络解决 201 件物品的有界背包问题,准确地说是来自“或健身房”的“背包-v2”。第一步是创建一个神经网络,在 Deep Q Network 算法中使用两次作为所谓的 Q 网络和目标 Q 网络。

Q 网络必须将我的问题的当前状态作为其输入,即背包信息,如下所示:array([[item weights], [item values], [item limits]])

array([[ 39,  40,  13,  34,  11,  28,  40,  37,  61,  53,  13,  32,   9,
     31,  64,  74,  21,  12,  40,  59,  61,  78,   3,  42,  45,  64,
     61,  48,  34,  12,  18,  17,  70,  59,  66,  16,  40,  80,  62,
     54,   2,  53,  56,   9,  91,  56,  66,  47,  65,  40,  49,  40,
     20,  23,  94,   2,  28,  42,  92,  98,  28,  52,   7,  18,  68,
     32,  23,  64,  32,  22,  48,  76,  38,  71,  56,  16,  72,  72,
     53,  17,  28,  11,  74,  57,  76,  32,  27,  80,  79,  50,  44,
     97,  62,  38,  80,  28,   5,  15,  76,  59,  80,  89,  48,  73,
     27,  86,  24,  49,  43,   1,  84,  56,  20,  57,  42,  61,  82,
     44,  29,  61,   4,  72,  82,  41,  25,  88,  48,  43,  83,  14,
     94,  73,  37,   8,  52,  78,  44,  64,  23,  47,  91,   8,  71,
     79,  23,  87,  96,  83,   5,  13,   8,  83,  78,  42,  69,  62,
     10,  61,  58,  29,  19,  11,  38,  12,  74,  62,  68,  60,  89,
     53,  94,  78,  70,  66,  24,  99,  92,  60,  56,  94,   6,  77,
     95,  76,  17,  96,  10,  20,  64,  28,  58,   1,  37,  98,   8,
     34,   3,  57,  15,  40, 200],
   [ 22,   8,  48,  85,  40,  11,  99,  89,  50,  81,  28,  46,  42,
     31,  22,  80,  78,  58,  19,  94,  43,   3,   8,  35,  72,  67,
     24,  12,  82,  71,  84,  59,  93,  82,  37,  28,  44,  37,  87,
     35,  69,  77,  74,  71,  67,   3,   3,  89,  78,  43,  60,  79,
     37,  75,  32,  32,  30,  67,  48,  81,  35,  98,  59,  11,  50,
      8,  92,  98,  65,  59,  31,  42,  43,  59,  81,  32,  15,  82,
     95,  83,  71,  12,  25,  90,  25,  21,  68,  68,   4,  14,  91,
     69,   4,  94,  65,  54,  87,  93,  54,  94,  95,  34,  44,  62,
     44,  99,   5,  36,  29,  14,  93,   5,  44,  87,  27,  55,  14,
      6,  32,  61,  30,   3,  21,  34,  32,  18,  31,  65,  90,  29,
     49,   6,  73,  58,   4,  70,  10,  71,  99,  60,   3,  32,  91,
     40,  53,  70,  37,  92,  66,  67,  76,  51,  32,  85,  72,  98,
     98,  92,  58,  10,  30,  92,  53,  87,  59,  13,  73,  95,  64,
     15,  32,  65,  35,  15,  90,  83,  23,  92,  39,  28,  74,  55,
      4,  49,  14,  22,  24,  40,  48,  49,  36,  81,  98,  86,  84,
     32,  24,  56,  96,  40,   0],
   [  3,   8,   9,   9,   1,   8,   2,   6,   3,   4,   6,   1,   7,
      6,   6,   8,   5,   2,   9,   3,   8,   8,   9,   6,   6,   5,
      8,   1,   6,   2,   7,   7,   4,   6,   5,   6,   1,   4,   9,
      3,   3,   6,   6,   8,   3,   2,   4,   9,   2,   2,   4,   2,
      2,   2,   3,   7,   9,   8,   5,   8,   5,   6,   9,   6,   2,
      3,   1,   5,   1,   9,   6,   8,   4,   4,   1,   4,   9,   2,
      8,   4,   9,   4,   9,   3,   1,   5,   3,   9,   5,   6,   2,
      7,   4,   1,   2,   7,   9,   8,   9,   8,   9,   6,   9,   5,
      6,   1,   8,   3,   5,   8,   1,   7,   6,   5,   4,   3,   4,
      7,   1,   4,   2,   1,   2,   7,   8,   1,   2,   9,   1,   5,
      3,   1,   7,   7,   7,   3,   2,   6,   7,   7,   6,   2,   2,
      1,   9,   9,   7,   8,   1,   5,   4,   2,   2,   3,   3,   2,
      2,   4,   2,   9,   5,   2,   3,   7,   5,   6,   2,   2,   7,
      9,   2,   3,   1,   2,   7,   9,   5,   2,   7,   1,   9,   9,
      8,   6,   6,   6,   4,   2,   6,   8,   6,   8,   7,   5,   5,
      3,   6,   3,   2,   8,   0]])

每次我迈出一步时,数组的第三部分都会更新,所以当我“选择一个项目”时,限制会减少 1。

输入这个数组后,神经网络必须吐出预期的 Q 值以执行某个动作(因此选择一个项目),此后,必须发生许多其他事情,但现在超出了范围。

问题在于我的输入和输出的尺寸规格,我想在给定状态数组的情况下进行预测,并得到 201 个值的输出,即我的 Q 值。

这是我当前的网络(以及一些测试其输出的代码):

env = or_gym.make("Knapsack-v2")
env.mask = False
state_space = env.reset() #array of the choosable items --> item- weights, values, limits
action_space = env.action_space.n #max 200 weight

def NN_model():
    learning_rate = 0.001
    init = tf.keras.initializers.HeUniform()
    model = keras.Sequential()
    model.add(keras.layers.Dense(48, input_shape = (201,), activation='relu', kernel_initializer=init))
    model.add(keras.layers.Dense(96, activation='relu', kernel_initializer=init))
    model.add(keras.layers.Dense(201, activation='linear', kernel_regularizer=init))
    model.compile(loss=tf.keras.losses.Huber(), optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), metrics=['accuracy'])
    return model

model = NN_model()

all_actions = model.predict(state_space, verbose=0)
max_index = np.where(all_actions==np.amax(all_actions))
listOfCordinates = list(zip(max_index[0], max_index[1]))
for cord in listOfCordinates:
    cord
action=cord[0]

我尝试了几种输入和输出形状,但没有一个能满足我的需求。

预期输出示例:

[0, 1, 2, ........., 198, 199, 200]
4

0 回答 0