Reading the Source Code: An AlphaZero Implementation of Gomoku

Project links

AlphaZero project repository
boardgame2 module implementation

Project Overview

This project reproduces the AlphaZero model from the DeepMind paper and applies it to the game of Gomoku (five in a row). It trains a game-playing AI with a combination of MCTS and a deep residual network, and improves entirely through self-play, without drawing on any human game knowledge.

Overview of the Approach

Initialize a policy-value network, use the network's output to guide the MCTS simulations, and then use the self-play data generated under MCTS guidance to train the network in turn.
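
In other words, the network and the tree search bootstrap each other. A rough outline of that loop (purely illustrative names; the actual code appears in the rest of this post):

for iteration in range(num_iterations):
    experience = []
    for episode in range(episodes_per_iteration):
        # self-play: every move is chosen by MCTS, and MCTS is guided by the
        # current network's policy prior and value estimate
        experience.append(play_one_game_with_mcts(network))
    # learning: fit the network so that its policy head matches the MCTS visit
    # counts and its value head predicts the eventual winner
    network.fit_on(experience)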

Walking Through the MCTS Simulations

Suppose the initial state is s0.
First simulation: s0 is not yet in policy, so the network is queried for its prior p0 (and a value estimate), and the simulation ends there.
Second simulation: s0 is now in policy, so its visit counts c and prior p are combined into the PUCT coefficient coef to pick an action a0, and the simulation descends into s1. s1 is not yet in policy, so p1 is computed and that branch ends; control returns to s0's simulation, where the count c and value estimate q for a0 are updated.
...
[Figure 1]
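
The recursion in the agent's search method below follows exactly these two cases. A condensed, schematic sketch (the names mirror the real class, but the helpers used here are placeholders rather than project code):

def search(board):                                        # one MCTS simulation, simplified
    if winner_of(board) is not None:                      # terminal node: return the game result
        return winner_of(board)
    if board not in policy:                               # unexpanded leaf: query the network once
        policy[board], v = net.predict(board)             # store the prior, return the value estimate
        return v                                          # the whole first simulation ends here
    a = argmax(q[board] + coef(board) * policy[board])    # PUCT: pick the most promising location
    v = next_player * search(next_board(board, a))        # descend one level, flipping perspective
    count[board][a] += 1                                  # backup: one more visit for (s, a)
    q[board][a] += (v - q[board][a]) / count[board][a]    # backup: running mean of the value
    return v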

Project Environment

The reinforcement-learning environment is implemented in the boardgame2 module, which must be installed before running this project:

pip install boardgame2
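
Before looking at the agent, it helps to see what an observation from this environment looks like. A quick check (my own snippet, not part of the project; importing boardgame2 registers the KInARow-v0 environment with gym, and the calls follow the classic gym API used throughout this post):

import gym
import numpy as np
import boardgame2  # importing the module registers 'KInARow-v0' with gym

env = gym.make('KInARow-v0', board_shape=15, target_length=5)
board, player = env.reset()             # an observation is a (board, player) pair
print(board.shape, player)              # expected: (15, 15) and 1, i.e. BLACK moves first
observation, winner, done, _ = env.step(np.array([7, 7]))  # place a stone near the centre
print(boardgame2.strfboard(observation[0]))                # text rendering of the board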

Annotated AlphaZero Agent Code

# Imports used by the code blocks throughout this post
import collections
import itertools
import logging
import math

import numpy as np
import pandas as pd
import gym
import tensorflow as tf
from tensorflow import keras
import boardgame2
from boardgame2 import BLACK


class AlphaZeroAgent:
    def __init__(self, env, batches=1, batch_size=4096,
                 kwargs={}, load=None, sim_count=800,
                 c_init=1.25, c_base=19652., prior_exploration_fraction=0.25):
        self.env = env
        self.board = np.zeros_like(env.board)
        self.batches = batches
        self.batch_size = batch_size

        self.net = self.build_network(**kwargs)
        self.reset_mcts()
        self.sim_count = sim_count  # number of MCTS simulations per move
        self.c_init = c_init  # PUCT coefficient
        self.c_base = c_base  # PUCT coefficient
        self.prior_exploration_fraction = prior_exploration_fraction  # fraction of noise mixed into the root prior

    def build_network(self, conv_filters, residual_filters, policy_filters,
                      learning_rate=0.001, regularizer=keras.regularizers.l2(1e-4)):
        # shared trunk
        inputs = keras.Input(shape=self.board.shape)
        x = keras.layers.Reshape(self.board.shape + (1,))(inputs)
        for conv_filter in conv_filters:
            z = keras.layers.Conv2D(conv_filter, 3, padding='same',
                                    kernel_regularizer=regularizer,
                                    bias_regularizer=regularizer)(x)
            y = keras.layers.BatchNormalization()(z)
            x = keras.layers.ReLU()(y)
        for residual_filter in residual_filters:
            x = residual(x, filters=residual_filter, regularizer=regularizer)
        intermediates = x

        # policy head (move probabilities)
        for policy_filter in policy_filters:
            z = keras.layers.Conv2D(policy_filter, 3, padding='same',
                                    kernel_regularizer=regularizer,
                                    bias_regularizer=regularizer)(x)
            y = keras.layers.BatchNormalization()(z)
            x = keras.layers.ReLU()(y)
        logits = keras.layers.Conv2D(1, 3, padding='same',
                                     kernel_regularizer=regularizer, bias_regularizer=regularizer)(x)
        flattens = keras.layers.Flatten()(logits)
        softmaxs = keras.layers.Softmax()(flattens)
        probs = keras.layers.Reshape(self.board.shape)(softmaxs)

        # value head
        z = keras.layers.Conv2D(1, 3, padding='same',
                                kernel_regularizer=regularizer,
                                bias_regularizer=regularizer)(intermediates)
        y = keras.layers.BatchNormalization()(z)
        x = keras.layers.ReLU()(y)
        flattens = keras.layers.Flatten()(x)
        vs = keras.layers.Dense(1, activation=keras.activations.tanh,
                                kernel_regularizer=regularizer,
                                bias_regularizer=regularizer)(flattens)

        model = keras.Model(inputs=inputs, outputs=[probs, vs])

        def categorical_crossentropy_2d(y_true, y_pred):
            labels = tf.reshape(y_true, [-1, self.board.size])
            preds = tf.reshape(y_pred, [-1, self.board.size])
            return keras.losses.categorical_crossentropy(labels, preds)

        loss = [categorical_crossentropy_2d, keras.losses.MSE]
        optimizer = keras.optimizers.Adam(learning_rate)
        model.compile(loss=loss, optimizer=optimizer)
        return model

    def reset_mcts(self):
        def zero_board_factory():  # factory for the defaultdicts below
            return np.zeros_like(self.board, dtype=float)

        # collections.defaultdict: reading a missing key inserts the factory's
        # return value (an all-zero board-shaped array) instead of raising KeyError
        self.q = collections.defaultdict(zero_board_factory)
        # Q-value estimates: board string -> board-shaped array
        self.count = collections.defaultdict(zero_board_factory)
        # visit counts: board string -> board-shaped array, used later as count[s][location]
        self.policy = {}  # policy priors: board string -> board-shaped array
        self.valid = {}  # valid locations: board string -> board-shaped array
        self.winner = {}  # winner: board string -> None or int

    def decide(self, observation, greedy=False, return_prob=False):
        # compute the move distribution
        board, player = observation
        canonical_board = player * board  # player == 1 leaves the board as is; player == -1 flips the stones
        s = boardgame2.strfboard(canonical_board)  # serialize the board to a string such as '+++o++++x+++...'
        while self.count[s].sum() < self.sim_count:  # run MCTS until s has been visited sim_count times in total
            self.search(canonical_board, prior_noise=True)
        prob = self.count[s] / self.count[s].sum()

        # sample a location from the visit-count distribution
        location_index = np.random.choice(prob.size, p=prob.reshape(-1))
        location = np.unravel_index(location_index, prob.shape)
        if return_prob:
            return location, prob
        return location

    def learn(self, dfs):
        df = pd.concat(dfs).reset_index(drop=True)
        for batch in range(self.batches):
            indices = np.random.choice(len(df), size=self.batch_size)
            players, boards, probs, winners = (np.stack(
                df.loc[indices, field]) for field in df.columns)
            canonical_boards = players[:, np.newaxis, np.newaxis] * boards
            vs = (players * winners)[:, np.newaxis]
            self.net.fit(canonical_boards, [probs, vs], verbose=0)  # one training pass on this batch
        self.reset_mcts()

    def search(self, board, prior_noise=False):  # one MCTS simulation; the recursion ends when (1) the game is decided or (2) an unexpanded leaf is reached
        s = boardgame2.strfboard(board)
        if s not in self.winner:
            self.winner[s] = self.env.get_winner((board, BLACK))  # determine the winner, if any
        if self.winner[s] is not None:  # terminal node: the winner is known
            return self.winner[s]  # always one of -1, 0, or 1

        if s not in self.policy:  # unexpanded leaf: no policy computed for this board yet
            pis, vs = self.net.predict(board[np.newaxis])
            # the network was built as keras.Model(inputs=inputs, outputs=[probs, vs])
            pi, v = pis[0], vs[0]  # pi has the board's shape; v has shape (1,)
            valid = self.env.get_valid((board, BLACK))
            masked_pi = pi * valid
            total_masked_pi = np.sum(masked_pi)
            if total_masked_pi <= 0:  # every valid action received zero probability; this can occasionally happen
                masked_pi = valid  # workaround
                total_masked_pi = np.sum(masked_pi)
            self.policy[s] = masked_pi / total_masked_pi
            self.valid[s] = valid
            return v

        # PUCT upper-bound calculation
        count_sum = self.count[s].sum()
        coef = (self.c_init + np.log1p((1 + count_sum) / self.c_base)) * \
               math.sqrt(count_sum) / (1. + self.count[s])
        if prior_noise:  # mix noise into the prior for extra exploration at the root
            alpha = 1. / self.valid[s].sum()
            noise = np.random.gamma(alpha, 1., board.shape)
            noise *= self.valid[s]
            noise /= noise.sum()
            prior = (1. - self.prior_exploration_fraction) * \
                    self.policy[s] + \
                    self.prior_exploration_fraction * noise
        else:
            prior = self.policy[s]
        ub = np.where(self.valid[s], self.q[s] + coef * prior, np.nan)  # upper bound per location; invalid locations become NaN
        # np.where(condition, x, y): take x where the condition holds, y otherwise
        location_index = np.nanargmax(ub)  # index of the maximum, ignoring NaNs; possible improvement: choose a location at random when all values are zero
        location = np.unravel_index(location_index, board.shape)
        (next_board, next_player), _, _, _ = self.env.next_step(
            (board, BLACK), np.array(location))
        next_canonical_board = next_player * next_board  # flip to the next player's perspective
        next_v = self.search(next_canonical_board)  # recurse; returns the leaf value, or -1/0/1 if the next position is terminal
        v = next_player * next_v  # convert back to the current player's perspective
        self.count[s][location] += 1  # backup after the recursion returns
        self.q[s][location] += (v - self.q[s][location]) / \
                               self.count[s][location]  # incremental running-mean update of Q
        return v  # propagate the value up to the parent simulation
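
The densest lines in search are the PUCT upper-bound calculation. Below is a standalone restatement of the same arithmetic with more explicit names (the helper function and its argument names are my own, not part of the project): for each candidate location it adds to Q(s, a) an exploration bonus that is proportional to the prior and shrinks as that location's visit count grows.

import numpy as np

def puct_upper_bound(q, count, prior, c_init=1.25, c_base=19652.):
    """q, count and prior are board-shaped arrays for a single state s."""
    n_s = count.sum()                                  # N(s): total visits of state s
    c_puct = c_init + np.log1p((1. + n_s) / c_base)    # exploration weight, grows slowly with N(s)
    u = c_puct * np.sqrt(n_s) * prior / (1. + count)   # exploration bonus U(s, a)
    return q + u                                       # Q(s, a) + U(s, a); the argmax over valid locations picks the move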

Code for the Residual Block

def residual(x, filters, kernel_sizes=3, strides=1, activations='relu',
            regularizer=keras.regularizers.l2(1e-4)):
    # a stack of Conv-BN-activation layers with a skip connection added before the last activation
    shortcut = x
    for i, filte in enumerate(filters):
        # scalar hyperparameters apply to every layer; lists are indexed per layer
        kernel_size = kernel_sizes if isinstance(kernel_sizes, int) \
                else kernel_sizes[i]
        stride = strides if isinstance(strides, int) else strides[i]
        activation = activations if isinstance(activations, str) \
                else activations[i]
        z = keras.layers.Conv2D(filte, kernel_size, strides=stride,
                padding='same', kernel_regularizer=regularizer,
                bias_regularizer=regularizer)(x)
        y = keras.layers.BatchNormalization()(z)
        if i == len(filters) - 1:
            y = keras.layers.Add()([shortcut, y])  # skip connection
        x = keras.layers.Activation(activation)(y)
    return x
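
As a quick sanity check, the residual helper can be exercised on its own. The snippet below (my own illustration, reusing the keras import from the first code block) builds a one-block model and confirms that the spatial size and channel count are preserved, which is what allows the shortcut addition:

inputs = keras.Input(shape=(15, 15, 1))
x = keras.layers.Conv2D(256, 3, padding='same')(inputs)  # bring the channels to 256 so they match the block's last filter count
x = residual(x, filters=[256, 256])                      # same setting as net_kwargs['residual_filters'] below
model = keras.Model(inputs=inputs, outputs=x)
print(model.output_shape)                                # expected: (None, 15, 15, 256)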

Self-Play Code

def self_play(env, agent, return_trajectory=False, verbose=False):
    if return_trajectory:
        trajectory = []
    observation = env.reset()
    for step in itertools.count():
        board, player = observation
        action, prob = agent.decide(observation, return_prob=True)
        if verbose:
            print(boardgame2.strfboard(board))
            logging.info('step {}: player {}, action {}'.format(step, player,
                    action))
        observation, winner, done, _ = env.step(action)
        if return_trajectory:
            trajectory.append((player, board, prob))
        if done:
            if verbose:
                print(boardgame2.strfboard(observation[0]))
                logging.info('winner: {}'.format(winner))
            break
    if return_trajectory:
        df_trajectory = pd.DataFrame(trajectory,
                columns=['player', 'board', 'prob'])
        df_trajectory['winner'] = winner
        return df_trajectory
    else:
        return winner
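
The DataFrame returned with return_trajectory=True is exactly what learn() consumes: one row per move. A quick inspection (assuming env and agent have already been constructed as in the training code below):

df = self_play(env, agent, return_trajectory=True)
print(df.columns.tolist())   # ['player', 'board', 'prob', 'winner']
print(len(df))               # number of moves played in this game
# Each row holds the player to move (1 or -1), the board before the move, the MCTS
# visit-count distribution over locations, and the final winner of the game (repeated
# on every row so that learn() can form the value target v = player * winner).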

Training Code

env = gym.make('KInARow-v0', board_shape=15, target_length=5)  # 15x15 board, five in a row (Gomoku)
"""
Small-scale parameters, meant for first solving a smaller problem (such as tic-tac-toe,
which would also use a smaller board_shape and target_length above).
"""
train_iterations = 100
train_episodes_per_iteration = 100
batches = 2
batch_size = 64
sim_count = 200
net_kwargs = {}
net_kwargs['conv_filters'] = [256,]
net_kwargs['residual_filters'] = [[256, 256],]
net_kwargs['policy_filters'] = [256,]
agent = AlphaZeroAgent(env=env, kwargs=net_kwargs, sim_count=sim_count,
        batches=batches, batch_size=batch_size)
for iteration in range(train_iterations):
    # self-play
    dfs_trajectory = []  # collected experience
    for episode in range(train_episodes_per_iteration):
        df_trajectory = self_play(env, agent,
                return_trajectory=True, verbose=False)
        logging.info('iteration {} episode {}: collected {} transitions'.format(
                iteration, episode, len(df_trajectory)))
        dfs_trajectory.append(df_trajectory)
    # learn from the collected experience
    agent.learn(dfs_trajectory)
    logging.info('iteration {}: learning finished'.format(iteration))
    self_play(env, agent, return_trajectory=True, verbose=True)  # play one verbose demo game with the updated agent
