
I'm trying to create an RL agent that discards cards in the optimal way.

Following some guides, I created this environment. The agent chooses between 31 actions, because with 5 cards and the requirement to always discard at least one card, that is the number of distinct discard possibilities.
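As a quick sanity check of that count, summing the non-empty subsets of 5 card positions gives exactly 31:

from math import comb

# every non-empty subset of the 5 card positions:
# C(5,1) + C(5,2) + C(5,3) + C(5,4) + C(5,5) = 5 + 10 + 10 + 5 + 1
print(sum(comb(5, k) for k in range(1, 6)))   # 31

The environment looks like this: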

import itertools
import numpy as np
from gym import Env
from gym.spaces import Discrete, Box
from treys import Deck, Evaluator

class PokerEnv(Env):
    def __init__(self):
        # 31 actions: every non-empty subset of the 5 card positions
        self.action_space = Discrete(31)

        # treys hand rank: 1 (royal flush) to 7462 (worst high card)
        self.observation_space = Box(low=np.array([1]), high=np.array([7462]))

        self.evaluator = Evaluator()
        self.deck = Deck()

        # the 5 card positions in the hand
        options = [0, 1, 2, 3, 4]

        # pre-computed index combinations for multi-card discards
        self.discard_2 = list(itertools.combinations(options, 2))  # 10 pairs
        self.discard_3 = list(itertools.combinations(options, 3))  # 10 triples
        self.discard_4 = list(itertools.combinations(options, 4))  # 5 quadruples

        self.hand = self.deck.draw(5)
    def step(self, action):
        # Discrete(31) gives actions 0-30, so no offset is needed:
        #   0-4   discard one card, 5-14 discard two, 15-24 discard three,
        #   25-29 discard four, 30 discards the whole hand
        if action < 5:
            del self.hand[action]
            self.hand = self.hand + [self.deck.draw(1)]

        elif action < 15:
            # delete by index from the back so earlier indices stay valid
            discard = sorted(self.discard_2[action - 5], reverse=True)
            for i in discard:
                del self.hand[i]
            self.hand = self.hand + self.deck.draw(2)

        elif action < 25:
            discard = sorted(self.discard_3[action - 15], reverse=True)
            for i in discard:
                del self.hand[i]
            self.hand = self.hand + self.deck.draw(3)

        elif action < 30:
            discard = sorted(self.discard_4[action - 25], reverse=True)
            for i in discard:
                del self.hand[i]
            self.hand = self.hand + self.deck.draw(4)

        else:
            # action == 30: replace all five cards
            self.hand = self.deck.draw(5)

        # treys rank of the new hand: 1 = best, 7462 = worst
        reward = self.evaluator.evaluate([], self.hand)
        info = {}

        # a single discard decision per episode, so it is always done
        return self.hand, reward, True, info
        
    def reset(self):
        
        self.deck = Deck()
        self.hand = self.deck.draw(5)
        
        return self.hand
    
env = PokerEnv() 
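A minimal smoke test of the environment with a random action (this assumes the imports above and that each episode is a single discard decision):

obs = env.reset()
action = env.action_space.sample()          # random discard choice, 0-30
obs, reward, done, info = env.step(action)
print(obs, reward, done)                    # redrawn hand (treys card ints), hand rank, True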

and built the model like this

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten

states = env.observation_space.shape
actions = env.action_space.n

def build_model(states, actions):
    model = Sequential()
    # window_length=1 in the replay memory adds a leading time axis,
    # so the 5-card hand arrives as shape (1, 5)
    model.add(Flatten(input_shape=(1, 5)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    # one linear Q-value output per discard action
    model.add(Dense(actions, activation='linear'))
    return model

model = build_model(states, actions)
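Printing the summary is a quick way to confirm that the output layer really has one Q-value per action:

model.summary()
print(model.output_shape)   # (None, 31), one linear Q-value per discard action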

and trained it like this

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from tensorflow.keras.optimizers import Adam

def build_agent(model, actions):
    # Boltzmann exploration: actions are sampled in proportion to their Q-values
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=500000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, nb_actions=actions,
                   nb_steps_warmup=10, target_model_update=1e-2)
    return dqn

dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)
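After fit finishes, keras-rl's built-in test loop gives a rough sanity check of the learned policy; here the episode reward is just the treys rank of the final hand (lower means a stronger hand), so this is only a sketch:

scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.mean(scores.history['episode_reward']))   # average final hand rank over 100 episodes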

Then I try to have the model pick an action when given 5 cards

deck = Deck()
hand = deck.draw(5)

# shape (batch, window_length, cards) to match the network input
hand = np.asarray(hand).reshape(1, 1, 5)
pf = dqn.model.predict(hand)        # Q-values for all 31 actions
best_action = np.argmax(pf[0])      # index of the highest Q-value, 0-30
print(best_action)

But I only ever get the value 26? I was expecting to see actions across the whole 0-30 range.
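To check, I also looked at the full Q-value vector for several random hands rather than only the argmax (a sketch using the objects defined above); if all the rows come out nearly identical, the network has collapsed onto a single preferred action:

for _ in range(5):
    deck = Deck()
    hand = np.asarray(deck.draw(5)).reshape(1, 1, 5)
    q_values = dqn.model.predict(hand, verbose=0)[0]
    print(np.argmax(q_values), np.round(q_values, 2))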

I've tried reading the documentation but couldn't find much that helps.
