I'm trying to create an RL agent that discards cards from a five-card hand in the optimal way.
Following some guides, I created this environment. The agent chooses between 31 actions, because if you have 5 cards and must always discard at least one, there are C(5,1) + C(5,2) + C(5,3) + C(5,4) + C(5,5) = 5 + 10 + 10 + 5 + 1 = 31 distinct discard possibilities.
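As a quick sanity check on that count (my own addition, not from the guides), itertools agrees:

import itertools
# sum the number of ways to discard 1, 2, 3, 4, or 5 cards out of 5
n_actions = sum(len(list(itertools.combinations(range(5), k))) for k in range(1, 6))
print(n_actions)  # 31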
import itertools
import numpy as np
from gym import Env
from gym.spaces import Discrete, Box
from treys import Deck, Evaluator

class PokerEnv(Env):
    def __init__(self):
        # 31 actions: C(5,1)+C(5,2)+C(5,3)+C(5,4)+C(5,5) = 5+10+10+5+1
        self.action_space = Discrete(31)
        # the observation is the 5 raw treys card ints (32-bit bitmasks)
        self.observation_space = Box(low=0, high=2**32, shape=(5,), dtype=np.int64)
        self.evaluator = Evaluator()
        self.deck = Deck()
        options = [0, 1, 2, 3, 4]
        self.discard_2 = list(itertools.combinations(options, 2))
        self.discard_3 = list(itertools.combinations(options, 3))
        self.discard_4 = list(itertools.combinations(options, 4))
        self.hand = self.deck.draw(5)

    def step(self, action):
        # 0-4: discard one card, 5-14: discard two, 15-24: discard three,
        # 25-29: discard four, 30: discard all five
        if action < 5:
            del self.hand[action]
            self.hand = self.hand + [self.deck.draw(1)]  # in my treys version, draw(1) returns a single card int
        elif action < 15:
            discard = sorted(self.discard_2[action - 5], reverse=True)
            for i in discard:  # delete the highest index first so the lower indices stay valid
                del self.hand[i]
            self.hand = self.hand + self.deck.draw(2)
        elif action < 25:
            discard = sorted(self.discard_3[action - 15], reverse=True)
            for i in discard:
                del self.hand[i]
            self.hand = self.hand + self.deck.draw(3)
        elif action < 30:
            discard = sorted(self.discard_4[action - 25], reverse=True)
            for i in discard:
                del self.hand[i]
            self.hand = self.hand + self.deck.draw(4)
        else:  # action == 30: throw the whole hand away
            self.hand = self.deck.draw(5)
        # treys scores run from 1 (royal flush) down to 7462 (worst hand),
        # i.e. lower is better, so invert the score to reward better hands
        reward = 7462 - self.evaluator.evaluate([], self.hand)
        info = {}
        return np.array(self.hand), reward, True, info  # one discard decision per episode

    def reset(self):
        self.deck = Deck()
        self.hand = self.deck.draw(5)
        return np.array(self.hand)

env = PokerEnv()
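To make sure the index mapping does what I think, I ran one step by hand and printed the cards with treys' Card.print_pretty_cards (this snippet is just my own sanity check):

from treys import Card
obs = env.reset()
Card.print_pretty_cards(list(obs))     # starting hand
obs, reward, done, info = env.step(7)  # action 7 -> discard_2[2], i.e. cards at positions 0 and 3
Card.print_pretty_cards(list(obs))     # hand after the redraw
print(reward, done)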
and built the model like this:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten

states = env.observation_space.shape  # (5,)
actions = env.action_space.n          # 31

def build_model(states, actions):
    model = Sequential()
    # keras-rl stacks window_length observations, hence the leading 1 in the input shape
    model.add(Flatten(input_shape=(1,) + states))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

model = build_model(states, actions)
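The output layer should give one Q-value per action, which a quick check (my own addition) confirms:

print(model.output_shape)  # expect (None, 31): one Q-value per discard action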
and trained it like this:
from tensorflow.keras.optimizers import Adam
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=500000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   nb_actions=actions, nb_steps_warmup=10,
                   target_model_update=1e-2)
    return dqn

dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)
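After training I also ran keras-rl's built-in test loop to get an average episode reward (my own addition, not from the guides):

scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.mean(scores.history['episode_reward']))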
Then, to get the model to pick an action when given 5 cards, I tried:
deck = Deck()
hand = deck.draw(5)
hand = np.asarray(hand).reshape(1, 1, 5)  # (batch, window_length, 5) to match the network input
pf = dqn.model.predict(hand)
best_action = np.argmax(pf[0])
print(best_action)
But argmax always comes out as 26, no matter which hand I pass in? I expected the chosen action to vary over the whole 0-30 range (one of the 31 actions).
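To rule out a reshaping problem, I also printed the full Q-value vector for a few random hands (again just my own debugging snippet):

for _ in range(3):
    deck = Deck()
    hand = np.asarray(deck.draw(5)).reshape(1, 1, 5)
    print(dqn.model.predict(hand, verbose=0)[0])  # 31 Q-values; the argmax keeps coming out as 26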
I tried reading the documentation, but didn't find much that helps.