I won't go into the details of Q-learning here; Morvan (莫烦) covers it in the tutorials on his own website (which go well beyond Q-learning). In short, Q-learning builds a Q-table over state-action pairs (S, A) and keeps training it against a reward signal, until the agent learns to move toward the configured reward in a relatively efficient way and reliably collect it.
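For reference, the heart of tabular Q-learning is a single update rule: Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)). Here is a minimal sketch of that update; the names (q_update, alpha, gamma) are my own, chosen to match the parameters used later in this post. The code below uses (state, action) tuple keys for clarity, whereas the post itself packs state and action into one 10-character string key, but the update is the same:

def q_update(q_table, s, a, reward, s_next, next_actions, alpha=0.1, gamma=0.9):
    # Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))
    best_next = max((q_table.get((s_next, a2), 0) for a2 in next_actions), default=0)
    target = reward + gamma * best_next
    old = q_table.get((s, a), 0)
    q_table[(s, a)] = old + alpha * (target - old)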
The rules of tic-tac-toe hardly need explaining either: three in a row wins, and if the whole board fills up with neither side making three in a row, it's a draw.
I'm using Python; for AI work Python is just too convenient (the sheer number of libraries is ridiculous, and you can use them straight away).
import random
import numpy as np
dictkv = {
    "ooooooooo0": 0,
    "ooooooooo1": 0,
    "ooooooooo2": 0,
    "ooooooooo3": 0,
    "ooooooooo4": 0,
    "ooooooooo5": 0,
    "ooooooooo6": 0,
    "ooooooooo7": 0,
    "ooooooooo8": 0
}
# In dictkv, "dict" means dictionary and "kv" means key-value pair. This dict is the
# initial Q-table. I went back and forth between representing the board state as a 2D
# list or as a single string, and the string turned out to be easier to work with.
# Each key is a 10-character string: the first 9 characters are the board state and
# the 10th is where the next piece goes; each value is the corresponding Q-value.
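To make the key format concrete, here is a quick example of my own (this exact entry is hypothetical):

key = "xooooooco4"    # hypothetical Q-table entry
board = key[0:9]      # "xooooooco": x on square 0, c on square 7, the rest empty
move = int(key[9])    # 4: this Q-value scores playing the centre square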
EPSILON = 0.9
MAX = 100
ALPHA = 0.1
GAMMA = 0.9
# Hyperparameters for the Q-table update: EPSILON is the probability of acting
# greedily, ALPHA is the learning rate, GAMMA is the discount factor, and MAX
# is the number of training episodes.
def replace_char(old_string, char, index):
    new_string = old_string[:index] + char + old_string[index+1:]
    return new_string
# Replace the character at a given index of a string
# (used to update the board state when a piece is placed)
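For example, placing an 'x' in the centre of an empty board:

print(replace_char("ooooooooo", 'x', 4))   # -> "ooooxoooo"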
def winx(board):
    # x is the first player and the piece Q-learning is trained for;
    # returns 1 if x has won ('o' marks an empty square)
    flag = 0
    for i in range(3):           # the three columns
        if board[i] == board[i+3] == board[i+6] == 'x':
            flag = 1
    for i in range(0, 7, 3):     # the three rows
        if board[i] == board[i+1] == board[i+2] == 'x':
            flag = 1
    if board[0] == board[4] == board[8] == 'x':    # main diagonal
        flag = 1
    if board[2] == board[4] == board[6] == 'x':    # anti-diagonal
        flag = 1
    return flag
def winc(board):
    # c is the second player, which plays completely at random;
    # returns 1 if c has won ('o' marks an empty square)
    flag = 0
    for i in range(3):           # the three columns
        if board[i] == board[i+3] == board[i+6] == 'c':
            flag = 1
    for i in range(0, 7, 3):     # the three rows
        if board[i] == board[i+1] == board[i+2] == 'c':
            flag = 1
    if board[0] == board[4] == board[8] == 'c':    # main diagonal
        flag = 1
    if board[2] == board[4] == board[6] == 'c':    # anti-diagonal
        flag = 1
    return flag
def full(board):
    # Returns 1 once the board has no empty squares left
    # (if nobody has won by then, the game is a draw)
    flag = 1
    for i in range(len(board)):
        if board[i] == 'o':
            flag = 0
    return flag
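A few quick sanity checks of these helpers, on example boards of my own:

print(winx("xxxccoooo"))   # 1: x has completed the top row
print(winc("cccxxooxo"))   # 1: c has completed the top row
print(full("xcxxcccxx"))   # 1: board full, and neither winx nor winc fires (a draw)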
def choose_action(S, q_table):
    # Count the empty squares to work out whose turn it is: x moves first,
    # so it is x's turn whenever an odd number of squares is still empty.
    empty = 0
    for i in range(len(S)):
        if S[i] == 'o':
            empty += 1
    # Make sure all nine (state, action) keys for S exist in the Q-table
    # (creating them only when missing preserves previously learned values),
    # then collect the Q-values and keys for the nine candidate moves.
    if S + '0' not in q_table:
        for i in range(9):
            q_table[S + str(i)] = 0
    state_actions = [q_table[S + str(i)] for i in range(9)]
    q_list = [S + str(i) for i in range(9)]
    if empty % 2 == 1:
        # x's turn
        if (np.random.uniform() > EPSILON) or (empty == 9):
            # The board is empty, or we drew the 10% exploration case:
            # play a uniformly random legal move (randrange(0, 9) so that
            # square 8 can be drawn as well)
            while 1:
                action = random.randrange(0, 9)
                if S[action] != 'o':
                    continue
                else:
                    return str(action)
        else:
            # The other 90% on a non-empty board: act greedily, picking the
            # legal action with the largest Q-value recorded for this state
            while 1:
                big = max(state_actions)
                flag = state_actions.index(big)
                action_name = q_list[flag][9]
                if S[int(action_name)] == 'o':
                    return action_name
                else:
                    state_actions[flag] = -1  # occupied square: rule it out and retry
    else:
        # c's turn: the opponent always plays a uniformly random legal move
        while 1:
            action = random.randrange(0, 9)
            if S[action] != 'o':
                continue
            else:
                return str(action)
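A quick illustrative call of my own (the output varies with the random draws): on the empty board the empty == 9 branch fires, so x always opens on a random square:

A = choose_action("ooooooooo", dictkv)   # returns one of '0'..'8' at random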
def get_env_feedback(S, A):
    # Apply action A to state S and return the next state S_ plus the reward R.
    # As in choose_action, the number of empty squares tells us whose move it is.
    empty = 0
    for i in range(len(S)):
        if S[i] == 'o':
            empty += 1
    if empty % 2 == 1:
        # x's move: reward 1 for a win; 0 for a draw or an unfinished game
        S_ = replace_char(S, 'x', int(A))
        R = 1 if winx(S_) == 1 else 0
        return S_, R
    else:
        # c's move: same logic, except a win by c is rewarded with -1
        S_ = replace_char(S, 'c', int(A))
        R = -1 if winc(S_) == 1 else 0
        return S_, R
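To see the reward scheme in action, a small example of my own: x holds squares 0 and 1 and c holds squares 3 and 4, so five squares are empty and it is x's turn; playing square 2 completes the top row:

S_, R = get_env_feedback("xxoccoooo", "2")
print(S_, R)   # -> xxxccoooo 1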
def rl():
    for episode in range(MAX):
        S = "ooooooooo"   # every episode starts from an empty board
        final = False
        while not winx(S) and not winc(S) and not full(S) and not final:
            A = choose_action(S, dictkv)
            print("A = " + str(A))
            S_, R = get_env_feedback(S, A)
            print("S_ = " + S_ + " R = " + str(R))
            q_predict = dictkv[S + A]   # choose_action guarantees this key exists
            if not winx(S_) and not winc(S_) and not full(S_):
                # Non-terminal: bootstrap from the best Q-value recorded for S_
                # (0 if S_ has never been visited yet)
                next_qs = [dictkv[key] for key in dictkv if key[0:9] == S_]
                big = max(next_qs) if next_qs else 0
                q_target = R + GAMMA * big
            else:
                # Terminal state: the target is just the reward, and the episode ends
                q_target = R
                final = True
            dictkv[S + A] += ALPHA * (q_target - q_predict)
            S = S_
        print("dictkv = ")
        print(dictkv)
        print("\n")
    return dictkv
# The overall loop borrows its structure from Morvan's (莫烦) tutorial.
if __name__ == "__main__":
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)
# The final output is a Q-table full of board states seen during training. Most
# values are still 0 (I set the loop to at most 100 episodes). A positive Q-value
# marks a move on a winning path: for example, "ocoxoxoco4": 0.1 means that playing
# square 4 in the state "ocoxoxoco" has a Q-value of 0.1.
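As a small follow-up sketch of my own (not part of the original post), the learned table can be queried for the greedy move in any state it has seen; best_move below is an illustrative helper:

def best_move(q_table, S):
    # Among the legal moves recorded for state S, return the square (as a
    # string '0'-'8') with the highest Q-value, or None if S was never visited
    candidates = [(q, key[9]) for key, q in q_table.items()
                  if key[0:9] == S and S[int(key[9])] == 'o']
    if not candidates:
        return None
    return max(candidates)[1]

print(best_move(q_table, "ooooooooo"))   # greedy opening move for x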
The whole code framework basically follows Morvan's (莫烦) tutorial. Huge respect to him.