import numpy as np
import pickle
BOARD_ROWS = 3
BOARD_COLS = 3
BOARD_SIZE = BOARD_ROWS * BOARD_COLS
State class: each state is described by a custom hash value; the key routines are get_all_states and next_state.
class State:
    def __init__(self):
        # the board is represented by an n * n array,
        # 1 represents a chessman of the player who moves first,
        # -1 represents a chessman of another player
        # 0 represents an empty position
        self.data = np.zeros((BOARD_ROWS, BOARD_COLS))  # all cells start empty
        self.winner = None  # the winner is not known yet
        self.hash_val = None  # the hash value is computed lazily, so it starts as None
        self.end = None  # whether the game has ended is also computed lazily
    # compute the hash value for one state, it's unique
    def hash(self):
        if self.hash_val is None:  # only compute the hash once and cache it
            self.hash_val = 0
            # flatten the board and fold every cell into one base-3 number
            for i in self.data.reshape(BOARD_ROWS * BOARD_COLS):
                if i == -1:  # map the cell values -1, 0, 1 onto the base-3 digits 2, 0, 1
                    i = 2
                self.hash_val = self.hash_val * 3 + i
        return int(self.hash_val)
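    # For example, a board whose only chessman is a 1 in the top-left corner is
    # read row by row as the base-3 digits 1, 0, 0, 0, 0, 0, 0, 0, 0, so its
    # hash value is 1 * 3 ** 8 = 6561.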
    # check whether a player has won the game, or whether it's a tie
    def is_end(self):
        if self.end is not None:  # the result was already computed, so return the cached value
            return self.end
        results = []  # holds the sums of every row, column and diagonal
        # check rows
        for i in range(0, BOARD_ROWS):
            results.append(np.sum(self.data[i, :]))
        # check columns
        for i in range(0, BOARD_COLS):
            results.append(np.sum(self.data[:, i]))
        # check diagonals
        results.append(0)
        for i in range(0, BOARD_ROWS):  # main diagonal
            results[-1] += self.data[i, i]  # the appended 0 accumulates the diagonal sum
        results.append(0)
        for i in range(0, BOARD_ROWS):  # anti-diagonal
            results[-1] += self.data[i, BOARD_ROWS - 1 - i]
        # a sum of 3 means the first player has three in a row,
        # a sum of -3 means the second player has three in a row
        for result in results:
            if result == 3:
                self.winner = 1
                self.end = True
                return self.end
            if result == -3:
                self.winner = -1
                self.end = True
                return self.end
        # whether it's a tie: neither player has won, so check if the board is full
        sum_values = np.sum(np.abs(self.data))  # number of occupied cells
        if sum_values == BOARD_ROWS * BOARD_COLS:  # the board is full, nobody can win anymore
            self.winner = 0  # winner 0 marks a tie
            self.end = True
            return self.end
        # game is still going on: none of the cases above applies
        self.end = False
        return self.end
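    # For example, a full board with five 1s and four -1s and no line of three
    # has np.sum(np.abs(self.data)) == 9 == BOARD_ROWS * BOARD_COLS, so it is
    # recognised as a tie (winner 0).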
    # @symbol: 1 or -1
    # put chessman symbol in position (i, j)
    # and return the resulting state
    def next_state(self, i, j, symbol):
        # create a new State object that holds a copy of the board with the move applied
        new_state = State()
        new_state.data = np.copy(self.data)
        new_state.data[i, j] = symbol
        return new_state
    # print the board
    def print(self):
        for i in range(0, BOARD_ROWS):
            print('-------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                if self.data[i, j] == 1:
                    token = '*'
                if self.data[i, j] == 0:
                    token = '0'
                if self.data[i, j] == -1:
                    token = 'x'
                out += token + ' | '
            print(out)
        print('-------------')
# fill the dictionary all_states, keyed by hash value, with (state, is_end) tuples
def get_all_states_impl(current_state, current_symbol, all_states):
    for i in range(0, BOARD_ROWS):
        for j in range(0, BOARD_COLS):  # try every empty cell of the current board
            if current_state.data[i][j] == 0:
                newState = current_state.next_state(i, j, current_symbol)  # state after this move
                newHash = newState.hash()
                if newHash not in all_states.keys():  # record each position only once
                    isEnd = newState.is_end()
                    all_states[newHash] = (newState, isEnd)  # the value is a (state, is_end) pair
                    # if the game is not over, the other player moves next
                    if not isEnd:
                        get_all_states_impl(newState, -current_symbol, all_states)  # recurse with the players swapped
# run once to enumerate all reachable states
def get_all_states():
    current_symbol = 1  # the symbol marks whose turn it is: 1 for player 1, -1 for player 2
    current_state = State()
    all_states = dict()
    all_states[current_state.hash()] = (current_state, current_state.is_end())  # value type: (state, is_end)
    get_all_states_impl(current_state, current_symbol, all_states)
    return all_states
# all possible board configurations
all_states = get_all_states()
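As a quick, optional check (added here only for illustration, it is not used by the rest of the script), the enumerated dictionary can be inspected directly:
# Illustrative only: count the enumerated positions and how many of them are terminal.
n_terminal = sum(1 for _, is_end in all_states.values() if is_end)
print('enumerated %d board states, %d of them terminal' % (len(all_states), n_terminal))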
Judger: supervises the two players taking turns. Its main methods are alternate (switch between the players) and play (run one game; the important call inside play is each player's act method).
class Judger:
    # @player1: the player who will move first, its chessman will be 1
    # @player2: another player with a chessman -1
    def __init__(self, player1, player2):
        self.p1 = player1  # first player
        self.p2 = player2  # second player
        self.current_player = None
        self.p1_symbol = 1  # symbol used by the first player
        self.p2_symbol = -1  # symbol used by the second player
        self.p1.set_symbol(self.p1_symbol)  # tell each player which symbol it plays
        self.p2.set_symbol(self.p2_symbol)
        self.current_state = State()  # an empty board
    def reset(self):
        self.p1.reset()
        self.p2.reset()
    # generator that lets the two players take turns
    def alternate(self):
        while True:
            yield self.p1
            yield self.p2
    # @print: if True, print the board after every move
    # run one game to the end and return the winner's symbol (0 for a tie)
    def play(self, print=False):
        alternator = self.alternate()  # generator that alternates between the two players
        self.reset()
        current_state = State()  # start from an empty board
        self.p1.set_state(current_state)  # both players record the starting state
        self.p2.set_state(current_state)
        while True:
            player = next(alternator)
            if print:
                current_state.print()
            [i, j, symbol] = player.act()
            next_state_hash = current_state.next_state(i, j, symbol).hash()
            current_state, is_end = all_states[next_state_hash]
            self.p1.set_state(current_state)
            self.p2.set_state(current_state)
            if is_end:
                if print:
                    current_state.print()
                return current_state.winner
AI player: estimations holds a value for every board state and is used to choose the next state; the greedy list marks exploratory moves, because random (non-greedy) moves do not take part in updating the state values. The main methods are set_symbol (initialise the value of every state for this player), backup (update the state values: if the next state has a higher value, the current state's value is raised as well, so the long-term outcome acts back on the present), and act (get the coordinates of the next move). A short usage sketch combining Player and Judger follows the class below.
class Player:
    # @step_size: the step size to update estimations
    # @epsilon: the probability to explore
    def __init__(self, step_size=0.1, epsilon=0.1):
        self.estimations = dict()  # maps state hash -> estimated value of that state
        self.step_size = step_size  # learning rate for the value updates
        self.epsilon = epsilon  # probability of choosing a random exploratory move
        self.states = []  # states visited during the current game
        self.greedy = []  # whether each visited state was reached by a greedy move
    def reset(self):
        self.states = []
        self.greedy = []
    def set_state(self, state):
        self.states.append(state)
        self.greedy.append(True)
    def set_symbol(self, symbol):  # set this player's symbol and initialise its state values
        self.symbol = symbol
        # initial values: a won terminal state is worth 1, a lost one 0,
        # a tie 0.5, and every non-terminal state also starts at 0.5
        for hash_val in all_states.keys():
            (state, is_end) = all_states[hash_val]
            if is_end:
                if state.winner == self.symbol:
                    self.estimations[hash_val] = 1.0
                elif state.winner == 0:
                    # we need to distinguish between a tie and a loss
                    self.estimations[hash_val] = 0.5
                else:
                    self.estimations[hash_val] = 0  # a lost game is worth nothing
            else:
                self.estimations[hash_val] = 0.5
    # update value estimation
    # back up the value of the state reached after each greedy move onto the state
    # before the move: the earlier state's value is shifted towards the later
    # state's value, so long-term outcomes propagate back to earlier positions
    def backup(self):
        # for debug
        # print('player trajectory')
        # for state in self.states:
        #     state.print()
        self.states = [state.hash() for state in self.states]
        # walk the trajectory backwards, one temporal-difference update per step
        for i in reversed(range(len(self.states) - 1)):
            state = self.states[i]
            # exploratory moves have greedy[i] == False, which zeroes the update
            td_error = self.greedy[i] * (self.estimations[self.states[i + 1]] - self.estimations[state])
            self.estimations[state] += self.step_size * td_error
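    # Worked example: with step_size = 0.1, if the current state is valued 0.5
    # and the greedy successor state is valued 1.0 (a sure win), then
    # td_error = 1.0 - 0.5 = 0.5 and the current value becomes 0.5 + 0.1 * 0.5 = 0.55.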
    # choose an action based on the current state; returns [i, j, symbol]
    def act(self):
        state = self.states[-1]  # the current (most recent) state
        next_states = []  # hash values of the states reachable in one move
        next_positions = []  # the corresponding board coordinates
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                if state.data[i, j] == 0:
                    next_positions.append([i, j])
                    next_states.append(state.next_state(i, j, self.symbol).hash())
        # with small probability, explore by playing a random empty cell
        if np.random.rand() < self.epsilon:
            action = next_positions[np.random.randint(len(next_positions))]
            action.append(self.symbol)
            # mark this move as non-greedy so it does not take part in the value updates
            self.greedy[-1] = False
            return action
        # otherwise act greedily: play the move that leads to the highest-valued state
        values = []
        for hash_val, pos in zip(next_states, next_positions):
            values.append((self.estimations[hash_val], pos))
        # shuffle before sorting so that ties between equally valued moves are broken randomly
        np.random.shuffle(values)
        values.sort(key=lambda x: x[0], reverse=True)
        action = values[0][1]
        action.append(self.symbol)
        return action
    def save_policy(self):
        with open('policy_%s.bin' % ('first' if self.symbol == 1 else 'second'), 'wb') as f:
            pickle.dump(self.estimations, f)
    def load_policy(self):
        with open('policy_%s.bin' % ('first' if self.symbol == 1 else 'second'), 'rb') as f:
            self.estimations = pickle.load(f)
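As a minimal sketch of how Player and Judger fit together (illustrative only; the demo_* names are introduced here, and train and compete below do the same thing at scale), a single self-play game looks like this:
# Illustrative sketch: one self-play game between two fresh, untrained players.
demo_p1 = Player(epsilon=0.1)
demo_p2 = Player(epsilon=0.1)
demo_judger = Judger(demo_p1, demo_p2)
demo_winner = demo_judger.play(print=False)  # returns 1, -1, or 0 for a tie
demo_p1.backup()  # propagate the game's outcome back through the visited states
demo_p2.backup()
demo_judger.reset()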
Human player:
# human interface
# input a number to put a chessman
# | q | w | e |
# | a | s | d |
# | z | x | c |
class HumanPlayer:
    def __init__(self, **kwargs):
        self.symbol = None
        self.keys = ['q', 'w', 'e', 'a', 's', 'd', 'z', 'x', 'c']
        self.state = None
        return
    def reset(self):
        return
    def set_state(self, state):
        self.state = state
    def set_symbol(self, symbol):
        self.symbol = symbol
        return
    def backup(self, _):
        return
    def act(self):
        self.state.print()
        key = input("Input your position:")
        data = self.keys.index(key)
        i = data // BOARD_COLS
        j = data % BOARD_COLS
        return (i, j, self.symbol)
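# For example, pressing 's' selects index 4 in self.keys, so i = 4 // 3 = 1 and
# j = 4 % 3 = 1, which is the centre square of the board.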
Training:
def train(epochs):  # play the given number of self-play games
    player1 = Player(epsilon=0.01)  # first player, with a small exploration rate
    player2 = Player(epsilon=0.01)  # second player
    judger = Judger(player1, player2)  # the judger runs the games and decides the winner
    player1_win = 0.0  # number of games won by player 1
    player2_win = 0.0  # number of games won by player 2
    for i in range(1, epochs + 1):
        winner = judger.play(print=False)
        if winner == 1:
            player1_win += 1
        if winner == -1:
            player2_win += 1
        # print both win rates; as training progresses almost every game ends in a tie
        print('Epoch %d, player 1 win %.02f, player 2 win %.02f' % (i, player1_win / i, player2_win / i))
        player1.backup()  # update both players' state values from this game
        player2.backup()
        judger.reset()  # reset the players once a game is over
    player1.save_policy()  # what training produces is the value of every state for each player
    player2.save_policy()
AI self-play evaluation:
def compete(turns):
    # epsilon = 0 disables exploration, so both players act purely greedily
    player1 = Player(epsilon=0)
    player2 = Player(epsilon=0)
    judger = Judger(player1, player2)
    player1.load_policy()
    player2.load_policy()
    player1_win = 0.0
    player2_win = 0.0
    for i in range(0, turns):
        winner = judger.play()
        if winner == 1:
            player1_win += 1
        if winner == -1:
            player2_win += 1
        judger.reset()
    print('%d turns, player 1 win %.02f, player 2 win %.02f' % (turns, player1_win / turns, player2_win / turns))
Human vs. AI play:
# The game is a zero sum game. If both players are playing with an optimal strategy, every game will end in a tie.
# So we test whether the AI can guarantee at least a tie if it goes second.
def play():
    while True:
        player1 = HumanPlayer()
        player2 = Player(epsilon=0)
        judger = Judger(player1, player2)
        player2.load_policy()
        winner = judger.play()
        if winner == player2.symbol:
            print("You lose!")
        elif winner == player1.symbol:
            print("You win!")
        else:
            print("It is a tie!")
Entry point:
if __name__ == '__main__':
    train(int(1e5))
    compete(int(1e3))
    play()