蒙特卡洛树算法实现mini AlphaGo Reversi

本文代码参考蒙特卡洛树实现黑白棋

设计思想

采用蒙特卡洛树搜索算法,通过"选择-拓展-模拟-反向传播"四个步骤,获取下棋的近似最优解。

代码内容

from copy import deepcopy
import math
import random

class AIPlayer:
    """
    Monte-Carlo-tree-search (MCTS) AI player for Reversi.

    Each move runs `max_times` iterations of
    selection -> expansion -> simulation -> back-propagation,
    then plays the action of the root child with the best average reward.
    """

    def __init__(self, color):
        """
        Initialize the player.

        :param color: side to play, 'X' - black, 'O' - white
        """
        self.color = color       # side this AI plays
        self.max_times = 200     # number of MCTS iterations per move
        self.c = 1.2             # exploration constant c in the UCB formula

    def get_move(self, board):
        """
        Return the best move for the current board state.

        :param board: current board object
        :return: action, best move position, e.g. 'A1'
        """
        if self.color == 'X':
            player_name = '黑棋'
        else:
            player_name = '白棋'
        print("请等一会,对方 {}-{} 正在思考中...".format(player_name, self.color))

        # Deep-copy the board: the search mutates its working copies.
        board_state = deepcopy(board)
        root = Node(state=board_state, color=self.color)
        action = self.mcts(root)    # Monte Carlo tree search
        return action

    def mcts(self, root):
        """Run the MCTS loop and return the best action at the root."""
        for _ in range(self.max_times):
            leave_node = self.select_expand_node(root)  # selection + expansion
            reward = self.simulate(leave_node)          # simulation (rollout)
            self.back_propagate(leave_node, reward)     # back-propagation
        # Pick the best child once, after the last back-propagation, with
        # exploration constant 0 (pure exploitation).  The original computed
        # this inside every iteration; only the final value was ever used.
        best_child = self.select(root, 0)
        return best_child.action

    def select_expand_node(self, node):
        """Descend the tree and return the node to run the rollout from."""
        while not self.game_over(node.state):   # stop at terminal states
            if len(node.children) == 0:         # never expanded: expand now
                return self.expand(node)
            elif not node.full_expand():        # untried actions remain
                return self.expand(node)
            else:                               # fully expanded: descend by UCB
                node = self.select(node, self.c)
        return node

    def select(self, node, c):
        """
        Return the child of `node` with the highest UCB value.

        :param node: parent node whose children are compared
        :param c: exploration constant (0 means pure exploitation)
        """
        max_ucb = -float('inf')
        select_children = []
        for child in node.children:
            # An unvisited child has effectively infinite UCB: take it now.
            if child.visits == 0:
                select_children = [child]
                break
            ucb = child.reward / child.visits + c * math.sqrt(2.0 * math.log(node.visits) / float(child.visits))
            if ucb == max_ucb:
                select_children.append(child)
            if ucb > max_ucb:
                select_children = [child]
                max_ucb = ucb
        if len(select_children) == 0:
            # No children at all: back off to the parent.
            return node.parent
        return random.choice(select_children)   # break UCB ties at random

    def expand(self, node):
        """Expand `node` with one untried action; return the new child."""
        legal_actions = list(node.state.get_legal_actions(node.color))
        if len(legal_actions) == 0:     # no legal move: back off to the parent
            return node.parent
        expanded_actions = [child.action for child in node.children]
        # Choose uniformly among the *untried* actions.  The original
        # re-sampled `legal_actions` until it hit an untried one (same
        # distribution) but would spin forever if the `full_expand`
        # contract were ever violated; this raises instead of hanging.
        untried_actions = [a for a in legal_actions if a not in expanded_actions]
        action = random.choice(untried_actions)
        # Play the move on a copy of the state.
        new_state = deepcopy(node.state)
        new_state._move(action, node.color)
        # The child position is played by the opposite side.
        new_color = 'O' if node.color == 'X' else 'X'
        node.add_child(new_state, action=action, color=new_color)
        return node.children[-1]        # the freshly expanded child

    def simulate(self, node):
        """Random rollout from `node`'s state; return the resulting reward."""
        board = deepcopy(node.state)    # the rollout mutates the board
        color = node.color
        count = 0
        while not self.game_over(board):
            # BUGFIX: legal actions must come from the evolving rollout
            # `board`.  The original queried the frozen `node.state`, so
            # every rollout step sampled moves that were only legal in the
            # starting position while mutating `board` with them.
            legal_actions = list(board.get_legal_actions(color))
            if len(legal_actions) != 0:
                action = random.choice(legal_actions)
                board._move(action, color)
                count += 1
            # Switch sides; a side with no legal action simply passes.
            color = 'O' if color == 'X' else 'X'
            if count >= 50:     # cap the rollout length
                break
        winner, win_count = board.get_winner()  # winner and piece lead
        # Winner scores 10 plus 1 per piece of lead; a draw scores 0.
        # NOTE(review): winner encoding assumed 0 = 'X' wins, 1 = 'O' wins,
        # 2 = draw — confirm against board.get_winner().
        reward = 0
        if winner != 2:
            reward = 10 + win_count
        if (self.color == 'X' and winner == 1) or (self.color == 'O' and winner == 0):
            reward = -reward    # opponent won: negate
        return reward

    def back_propagate(self, node, reward):
        """Propagate `reward` and visit counts from `node` up to the root."""
        while node is not None:
            node.visits += 1
            # Nodes whose side-to-move matches this AI accumulate the
            # reward; opponent nodes accumulate its negation.
            if node.color == self.color:
                node.reward += reward
            else:
                node.reward -= reward
            node = node.parent
        return 0

    def game_over(self, state):
        """Return True when neither side has a legal move."""
        now_loc = list(state.get_legal_actions('X'))
        next_loc = list(state.get_legal_actions('O'))
        return len(now_loc) == 0 and len(next_loc) == 0

你可能感兴趣的:(python,算法)