AlphaZero project repository
Implementation based on the boardgame2 module
This project reproduces the AlphaZero model from DeepMind's paper and applies it to Gomoku (five in a row). A game-playing AI is trained with a combination of MCTS and a deep residual network; it improves entirely through self-play and needs no human game knowledge.
A policy-value network is initialized and used to guide the MCTS searches; the self-play data that MCTS produces is then used, in turn, to train the policy-value network.
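Schematically, the training loop at the end of this page does the following (a rough sketch; the names follow the full script below):

for iteration in range(train_iterations):
    # MCTS guided by the current network generates self-play games
    dfs = [self_play(env, agent, return_trajectory=True)
           for _ in range(train_episodes_per_iteration)]
    agent.learn(dfs)  # the self-play data in turn trains the policy-value network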
Suppose the initial state is s0.
First simulation: s0 is not yet in policy, so the network evaluates it to obtain p0 (and a value estimate) and the simulation ends.
Second simulation: s0 is now in policy. The PUCT coefficient coef is computed from s0's visit counts c (count) and prior p (policy) to pick an action a0, which leads to state s1; s1 is not in policy, so p1 is computed and the simulation ends; back at s0, the count value and the Q value are updated.
……
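For reference, the child-selection rule implemented in search() below is the PUCT upper bound from the AlphaZero paper. With N(s) the total visit count of position s, N(s, a) the visit count of move a, Q(s, a) the running mean value of that move, and P(s, a) the (possibly noise-mixed) network prior, the move maximizing

U(s, a) = Q(s, a) + [c_init + log((1 + N(s) + c_base) / c_base)] * P(s, a) * sqrt(N(s)) / (1 + N(s, a))

is played; this is exactly what the coef and ub variables compute.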
The reinforcement-learning environment is provided by the boardgame2 module; install it before running this project:
pip install boardgame2
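A quick way to check the installation (a minimal sketch; it assumes the 'KInARow-v0' registration used later on this page and the classic 4-tuple gym step API that the rest of the code relies on):

import gym
import boardgame2  # importing it registers the board-game environments

env = gym.make('KInARow-v0', board_shape=3, target_length=3)  # tic-tac-toe-sized board
board, player = env.reset()                    # an observation is a (board, player) pair
print(boardgame2.strfboard(board))             # text rendering of the board
(board, player), winner, done, _ = env.step((1, 1))  # play the centre square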
import collections
import itertools
import logging
import math

import numpy as np
import pandas as pd
import gym
import tensorflow as tf
from tensorflow import keras

import boardgame2
from boardgame2 import BLACK  # the first-player constant used throughout the code below


class AlphaZeroAgent:
def __init__(self, env, batches=1, batch_size=4096,
kwargs={}, load=None, sim_count=800,
c_init=1.25, c_base=19652., prior_exploration_fraction=0.25):
self.env = env
self.board = np.zeros_like(env.board)
self.batches = batches
self.batch_size = batch_size
self.net = self.build_network(**kwargs)
self.reset_mcts()
        self.sim_count = sim_count  # number of MCTS simulations per move
        self.c_init = c_init  # PUCT exploration constant
        self.c_base = c_base  # PUCT exploration constant
self.prior_exploration_fraction = prior_exploration_fraction
def build_network(self, conv_filters, residual_filters, policy_filters,
learning_rate=0.001, regularizer=keras.regularizers.l2(1e-4)):
        # shared trunk
inputs = keras.Input(shape=self.board.shape)
x = keras.layers.Reshape(self.board.shape + (1,))(inputs)
for conv_filter in conv_filters:
z = keras.layers.Conv2D(conv_filter, 3, padding='same',
kernel_regularizer=regularizer,
bias_regularizer=regularizer)(x)
y = keras.layers.BatchNormalization()(z)
x = keras.layers.ReLU()(y)
for residual_filter in residual_filters:
x = residual(x, filters=residual_filter, regularizer=regularizer)
intermediates = x
        # policy head
for policy_filter in policy_filters:
z = keras.layers.Conv2D(policy_filter, 3, padding='same',
kernel_regularizer=regularizer,
bias_regularizer=regularizer)(x)
y = keras.layers.BatchNormalization()(z)
x = keras.layers.ReLU()(y)
logits = keras.layers.Conv2D(1, 3, padding='same',
kernel_regularizer=regularizer, bias_regularizer=regularizer)(x)
flattens = keras.layers.Flatten()(logits)
softmaxs = keras.layers.Softmax()(flattens)
probs = keras.layers.Reshape(self.board.shape)(softmaxs)
        # value head
z = keras.layers.Conv2D(1, 3, padding='same',
kernel_regularizer=regularizer,
bias_regularizer=regularizer)(intermediates)
y = keras.layers.BatchNormalization()(z)
x = keras.layers.ReLU()(y)
flattens = keras.layers.Flatten()(x)
vs = keras.layers.Dense(1, activation=keras.activations.tanh,
kernel_regularizer=regularizer,
bias_regularizer=regularizer)(flattens)
model = keras.Model(inputs=inputs, outputs=[probs, vs])
def categorical_crossentropy_2d(y_true, y_pred):
labels = tf.reshape(y_true, [-1, self.board.size])
preds = tf.reshape(y_pred, [-1, self.board.size])
return keras.losses.categorical_crossentropy(labels, preds)
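        # The compiled two-headed loss below mirrors the AlphaZero objective
        # (z - v)^2 - pi^T log(p) + c * ||theta||^2: mean squared error for the
        # value head, cross-entropy for the policy head, plus the L2 weight
        # regularization attached to every layer above.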
loss = [categorical_crossentropy_2d, keras.losses.MSE]
optimizer = keras.optimizers.Adam(learning_rate)
model.compile(loss=loss, optimizer=optimizer)
return model
def reset_mcts(self):
        def zero_board_factory():  # factory used by the defaultdicts below
            return np.zeros_like(self.board, dtype=float)

        # collections.defaultdict calls the factory the first time a missing key
        # (a board string) is accessed, so q[s] and count[s] start as zero boards.
        self.q = collections.defaultdict(zero_board_factory)
        # Q-value estimates: board string -> board-shaped array
        self.count = collections.defaultdict(zero_board_factory)
        # visit counts: board string -> board-shaped array, indexed as count[s][location]
        self.policy = {}  # prior policy: board string -> board-shaped array
        self.valid = {}  # valid moves: board string -> board-shaped mask
        self.winner = {}  # winner: board string -> None or int
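        # Illustration of how these dictionaries are used later on (e.g. for the
        # centre of a 15x15 board):
        #     s = boardgame2.strfboard(canonical_board)  # dictionary key for a position
        #     self.count[s][(7, 7)] += 1                 # visit count of the move at (7, 7)
        #     self.q[s][(7, 7)]                          # running mean value of that move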
def decide(self, observation, greedy=False, return_prob=False):
        # run MCTS to obtain a move distribution
        board, player = observation
        canonical_board = player * board  # player == 1 keeps the board, player == -1 flips it: the mover is always BLACK
        s = boardgame2.strfboard(canonical_board)  # string key such as '+++o++++x+++...'
        while self.count[s].sum() < self.sim_count:  # simulate until s has been visited sim_count times in total
            self.search(canonical_board, prior_noise=True)
        prob = self.count[s] / self.count[s].sum()
        # sample a move from the visit-count distribution
location_index = np.random.choice(prob.size, p=prob.reshape(-1))
location = np.unravel_index(location_index, prob.shape)
if return_prob:
return location, prob
return location
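    # Note: the `greedy` flag of decide() is accepted but never used in this
    # listing. One possible way to honour it (an assumption, not in the original
    # code) would be to replace the sampling step with:
    #     location_index = np.argmax(prob) if greedy else \
    #         np.random.choice(prob.size, p=prob.reshape(-1))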
def learn(self, dfs):
df = pd.concat(dfs).reset_index(drop=True)
for batch in range(self.batches):
indices = np.random.choice(len(df), size=self.batch_size)
players, boards, probs, winners = (np.stack(
df.loc[indices, field]) for field in df.columns)
            canonical_boards = players[:, np.newaxis, np.newaxis] * boards  # boards seen from each mover's perspective
            vs = (players * winners)[:, np.newaxis]  # game outcome from each mover's perspective
            self.net.fit(canonical_boards, [probs, vs], verbose=0)  # one training pass on the sampled batch
self.reset_mcts()
    def search(self, board, prior_noise=False):  # one MCTS simulation; recursion ends when the game is decided or an unexpanded leaf is reached
s = boardgame2.strfboard(board)
        if s not in self.winner:
            self.winner[s] = self.env.get_winner((board, BLACK))  # winner of the position (None if the game is not over)
        if self.winner[s] is not None:  # terminal position
            return self.winner[s]  # always one of -1, 0, 1
        if s not in self.policy:  # unexpanded leaf: evaluate it with the network
            pis, vs = self.net.predict(board[np.newaxis])
            # the network outputs [probs, vs]; see build_network()
            pi, v = pis[0], vs[0]  # pi has the board's shape, v has shape (1,)
valid = self.env.get_valid((board, BLACK))
masked_pi = pi * valid
total_masked_pi = np.sum(masked_pi)
            if total_masked_pi <= 0:  # occasionally every valid move gets zero prior probability
                masked_pi = valid  # workaround: fall back to a uniform prior over the valid moves
total_masked_pi = np.sum(masked_pi)
self.policy[s] = masked_pi / total_masked_pi
self.valid[s] = valid
return v
        # PUCT upper confidence bound
count_sum = self.count[s].sum()
coef = (self.c_init + np.log1p((1 + count_sum) / self.c_base)) * \
math.sqrt(count_sum) / (1. + self.count[s])
        if prior_noise:  # Dirichlet noise on the root prior, for exploration
alpha = 1. / self.valid[s].sum()
noise = np.random.gamma(alpha, 1., board.shape)
noise *= self.valid[s]
noise /= noise.sum()
prior = (1. - self.prior_exploration_fraction) * \
self.policy[s] + \
self.prior_exploration_fraction * noise
else:
prior = self.policy[s]
        ub = np.where(self.valid[s], self.q[s] + coef * prior, np.nan)  # PUCT upper bound; invalid moves become NaN
        # np.where(cond, x, y) picks x where cond holds and y elsewhere
        location_index = np.nanargmax(ub)  # index of the largest non-NaN entry (possible improvement: break ties randomly)
location = np.unravel_index(location_index, board.shape)
(next_board, next_player), _, _, _ = self.env.next_step(
(board, BLACK), np.array(location))
        next_canonical_board = next_player * next_board  # switch to the next player's perspective
        next_v = self.search(next_canonical_board)  # recurse: value of the next position from the next player's perspective
        v = next_player * next_v  # convert back to the current player's perspective
        self.count[s][location] += 1  # back-propagation: update the visit count and the running-mean Q value
self.q[s][location] += (v - self.q[s][location]) / \
self.count[s][location]
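        # The two statements above implement the running-mean update
        #     N(s, a) <- N(s, a) + 1
        #     Q(s, a) <- Q(s, a) + (v - Q(s, a)) / N(s, a)
        # so Q(s, a) is the average of all values backed up through (s, a).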
def residual(x, filters, kernel_sizes=3, strides=1, activations='relu',
regularizer=keras.regularizers.l2(1e-4)):
shortcut = x
for i, filte in enumerate(filters):
kernel_size = kernel_sizes if isinstance(kernel_sizes, int) \
else kernel_sizes[i]
stride = strides if isinstance(strides, int) else strides[i]
activation = activations if isinstance(activations, str) \
else activations[i]
z = keras.layers.Conv2D(filte, kernel_size, strides=stride,
padding='same', kernel_regularizer=regularizer,
bias_regularizer=regularizer)(x)
y = keras.layers.BatchNormalization()(z)
if i == len(filters) - 1:
y = keras.layers.Add()([shortcut, y])
x = keras.layers.Activation(activation)(y)
return x
def self_play(env, agent, return_trajectory=False, verbose=False):
if return_trajectory:
trajectory = []
observation = env.reset()
for step in itertools.count():
board, player = observation
action, prob = agent.decide(observation, return_prob=True)
if verbose:
print(boardgame2.strfboard(board))
            logging.info('step {}: player {}, action {}'.format(
                step, player, action))
observation, winner, done, _ = env.step(action)
if return_trajectory:
trajectory.append((player, board, prob))
if done:
if verbose:
print(boardgame2.strfboard(observation[0]))
                logging.info('winner: {}'.format(winner))
break
if return_trajectory:
df_trajectory = pd.DataFrame(trajectory,
columns=['player', 'board', 'prob'])
df_trajectory['winner'] = winner
return df_trajectory
else:
return winner
env = gym.make('KInARow-v0', board_shape=15, target_length=5)
"""
小规模参数,用来初步求解比较小的问题(如井字棋)
"""
train_iterations = 100
train_episodes_per_iteration = 100
batches = 2
batch_size = 64
sim_count = 200
net_kwargs = {}
net_kwargs['conv_filters'] = [256,]
net_kwargs['residual_filters'] = [[256, 256],]
net_kwargs['policy_filters'] = [256,]
agent = AlphaZeroAgent(env=env, kwargs=net_kwargs, sim_count=sim_count,
batches=batches, batch_size=batch_size)
for iteration in range(train_iterations):
    # self-play
    dfs_trajectory = []  # collected self-play experience
for episode in range(train_episodes_per_iteration):
df_trajectory = self_play(env, agent,
return_trajectory=True, verbose=False)
        logging.info('iteration {} episode {}: collected {} transitions'.format(
            iteration, episode, len(df_trajectory)))
dfs_trajectory.append(df_trajectory)
    # learn from the collected experience
agent.learn(dfs_trajectory)
    logging.info('iteration {}: learning finished'.format(iteration))
self_play(env, agent, return_trajectory=True, verbose=True)
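# Two practical notes:
# - logging.info() output only shows up if the root logger is configured,
#   e.g. with logging.basicConfig(level=logging.INFO) near the top of the script.
# - The constructor's `load` argument is never used in this listing; to persist the
#   trained network between runs, the standard Keras weight API is one option
#   (a sketch; the file name is an arbitrary choice):
#       agent.net.save_weights('alphazero_gomoku.h5')
#       agent.net.load_weights('alphazero_gomoku.h5')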