
The problem: I'm writing a Monte Carlo tree search algorithm to play chess in Python. I replaced the simulation phase with a custom evaluation function. My code looks correct to me, but for some reason it behaves strangely: it easily recognizes immediate wins, but it fails to recognize mate-in-2 and mate-in-3 positions. Any ideas?

What I've tried: I tried giving it more time to search, but it still fails to find the best move, even when it has a guaranteed win in two moves. However, I noticed that the results improve when I turn off the custom evaluation and fall back to classic Monte Carlo tree search simulation. (To turn off the custom evaluation, don't pass any argument to the Agent constructor.) But I really need it to work with the custom evaluation, because I'm experimenting with machine learning techniques for board evaluation.
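Concretely, the two modes only differ in how the Agent (defined in full below) is constructed:

agent = Agent(custom_evaluation)   # use the custom evaluation
agent = Agent()                    # fall back to classic random simulation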

I also tried printing out the search results to see which moves the algorithm considers good. It consistently ranks the best move in the mate-in-2 and mate-in-3 positions as the worst. The ranking is based on how many times each move was explored (which is how MCTS selects its best move).
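For reference, the ranking is just the visit counts of the root's children, inspected with a small helper along these lines (the helper itself is illustrative; the Node fields are the ones from the code below):

def print_ranking(root):
    # Children of the root, most explored first.
    for child in sorted(root.children, key=lambda c: c.n, reverse=True):
        print(child.action, child.n)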

My code: I've included the entire program, because all of it is relevant to the problem. To run it, you may need to install python-chess (pip install python-chess).

I've been struggling with this for over a week now, and it's getting frustrating. Any ideas?

import math
import random
import time

import chess
import chess.engine

class Node:

    def __init__(self, state, parent, action):
        """Initializes a node structure for a Monte-Carlo search tree."""
        self.state = state
        self.parent = parent
        self.action = action

        self.unexplored_actions = list(self.state.legal_moves)
        random.shuffle(self.unexplored_actions)
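        # Side to move at this node; w is counted from this colour's point of view.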
        self.colour = self.state.turn
        self.children = []
        
        self.w = 0 # number of wins
        self.n = 0 # number of simulations

class Agent:
    
    def __init__(self, custom_evaluation=None):
        """Initializes a Monte-Carlo tree search agent."""
        
        if custom_evaluation:
            self._evaluate = custom_evaluation
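        # Otherwise self._evaluate stays bound to the method below, which
        # falls back to classic random simulation.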

    def mcts(self, state, time_limit=float('inf'), node_limit=float('inf')):
        """Runs Monte-Carlo tree search and returns an evaluation."""

        nodes_searched = 0
        start_time = time.time()

        # Initialize the root node.
        root = Node(state, None, None)

        while (time.time() - start_time) < time_limit and nodes_searched < node_limit:
            
            # Select a leaf node.
            leaf = self._select(root)

            # Add a new child node to the tree.
            if leaf.unexplored_actions:
                child = self._expand(leaf)
            else:
                child = leaf

            # Evaluate the node.
            result = self._evaluate(child)

            # Backpropagate the results.
            self._backpropagate(child, result)

            nodes_searched += 1

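        # The final choice is the most-visited child of the root.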
        result = max(root.children, key=lambda node: node.n) 

        return result

    def _uct(self, node):
        """Returns the Upper Confidence Bound 1 of a node."""
        c = math.sqrt(2)

        # We want every WHITE node to choose the worst BLACK node and vice versa.
        # Scores for each node are relative to that colour.
        w = node.n - node.w

        n = node.n
        N = node.parent.n

        try:
            ucb = (w / n) + (c * math.sqrt(math.log(N) / n))
        except ZeroDivisionError:
            ucb = float('inf')

        return ucb

    def _select(self, node):
        """Returns a leaf node that either has unexplored actions or is a terminal node."""
        while (not node.unexplored_actions) and node.children:
            # Pick the child node with highest UCB.
            selection = max(node.children, key=self._uct)
            # Move to the next node.
            node = selection
        return node

    def _expand(self, node):
        """Adds one child node to the tree."""
        # Pick an unexplored action.
        action = node.unexplored_actions.pop()
        # Create a copy of the node state.
        state_copy = node.state.copy()
        # Carry out the action on the copy.
        state_copy.push(action)
        # Create a child node.
        child = Node(state_copy, node, action)
        # Add the child node to the list of children.
        node.children.append(child)
        # Return the child node.
        return child

    def _evaluate(self, node):
        """Returns an evaluation of a given node."""
        # If no custom evaluation function was passed into the object constructor, 
        # use classic simulation.
        return self._simulate(node)

    def _simulate(self, node):
        """Randomly plays out to the end and returns a static evaluation of the terminal state."""
        board = node.state.copy()
        while not board.is_game_over():
            # Pick a random action.
            move = random.choice(list(board.legal_moves))
            # Perform the action.
            board.push(move)
        return self._calculate_static_evaluation(board)

    def _backpropagate(self, node, result):
        """Updates a node's values and subsequent parent values."""
        # Update the node's values.
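        # expectation() maps the WDL to [0, 1]: win = 1, draw = 0.5, loss = 0.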
        node.w += result.pov(node.colour).expectation()
        node.n += 1
        # Back up values to parent nodes.
        while node.parent is not None:
            node.parent.w += result.pov(node.parent.colour).expectation()
            node.parent.n += 1
            node = node.parent

    def _calculate_static_evaluation(self, board):
        """Returns a static evaluation of a *terminal* board state."""
        result = board.result(claim_draw=True)

        if result == '1-0':
            wdl = chess.engine.Wdl(wins=1000, draws=0, losses=0)
        elif result == '0-1':
            wdl = chess.engine.Wdl(wins=0, draws=0, losses=1000)        
        else:
            wdl = chess.engine.Wdl(wins=0, draws=1000, losses=0)

        return chess.engine.PovWdl(wdl, chess.WHITE)


def custom_evaluation(node):
    """Returns a static evaluation of a board state."""

    board = node.state
    
    # Evaluate terminal states.
    if board.is_game_over(claim_draw=True):
        result = board.result(claim_draw=True)
        if result == '1-0':
            wdl = chess.engine.Wdl(wins=1000, draws=0, losses=0)
        elif result == '0-1':
            wdl = chess.engine.Wdl(wins=0, draws=0, losses=1000)        
        else:
            wdl = chess.engine.Wdl(wins=0, draws=1000, losses=0)

        return chess.engine.PovWdl(wdl, chess.WHITE)
    
    # Evaluate material.
    material_balance = 0
    material_balance += len(board.pieces(chess.PAWN, chess.WHITE)) * +100
    material_balance += len(board.pieces(chess.PAWN, chess.BLACK)) * -100
    material_balance += len(board.pieces(chess.ROOK, chess.WHITE)) * +500
    material_balance += len(board.pieces(chess.ROOK, chess.BLACK)) * -500
    material_balance += len(board.pieces(chess.KNIGHT, chess.WHITE)) * +300
    material_balance += len(board.pieces(chess.KNIGHT, chess.BLACK)) * -300
    material_balance += len(board.pieces(chess.BISHOP, chess.WHITE)) * +300
    material_balance += len(board.pieces(chess.BISHOP, chess.BLACK)) * -300
    material_balance += len(board.pieces(chess.QUEEN, chess.WHITE)) * +900
    material_balance += len(board.pieces(chess.QUEEN, chess.BLACK)) * -900

    # TODO: Evaluate mobility.
    mobility = 0

    # Aggregate values.
    centipawn_evaluation = material_balance + mobility

    # Convert evaluation from centipawns to wdl.
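    # The 'lichess' model converts a centipawn score into win/draw/loss chances.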
    wdl = chess.engine.Cp(centipawn_evaluation).wdl(model='lichess')
    static_evaluation = chess.engine.PovWdl(wdl, chess.WHITE)

    return static_evaluation


m1 = chess.Board('8/8/7k/8/8/8/5R2/6R1 w - - 0 1')
# WHITE can win in one move. Best move is f2-h2.

m2 = chess.Board('8/6k1/8/8/8/8/1K2R3/5R2 w - - 0 1')
# WHITE can win in two moves. Best move is e2-g2.

m3 = chess.Board('8/8/5k2/8/8/8/3R4/4R3 w - - 0 1')
# WHITE can win in three moves. Best move is d2-f2.

agent = Agent(custom_evaluation)

result = agent.mcts(m2, time_limit=30)
print(result.action)