Implementing Q-learning with Tic-Tac-Toe

I won't go into the details of Q-learning here; 莫烦 (Morvan) has tutorials on his own website that cover Q-learning and plenty more besides.
In short, Q-learning builds a Q-table over state-action pairs (S, A) and keeps training it with a reward mechanism you define, until the agent learns to move toward that reward in a reasonably efficient way and reliably collect it.
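At its heart, tabular Q-learning is one update rule applied over and over. Written out in Python it looks roughly like this (a minimal sketch; S, A, R, S_, ALPHA and GAMMA match the names used in the code below, and each state-action pair is stored under the single string key S + A, as explained later):

q_predict = q_table[S + A]                                # current estimate of Q(S, A)
q_next_max = max(q_table[S_ + str(a)] for a in range(9))  # best known value of the next state (use 0 if S_ is terminal or unseen)
q_target = R + GAMMA * q_next_max                         # reward plus discounted future value
q_table[S + A] += ALPHA * (q_target - q_predict)          # nudge the estimate toward the target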

I won't go over the rules of Tic-Tac-Toe in detail either.

Three in a row wins; if the whole board fills up and neither side has three in a row, it is a draw.

The code is in Python; for this kind of AI work Python is extremely convenient (there are a ridiculous number of libraries you can just import and use).

import random
import numpy as np
import pandas as pd
import time

dictkv = {
     
    "ooooooooo0":0,
    "ooooooooo1":0,
    "ooooooooo2":0,
    "ooooooooo3":0,
    "ooooooooo4":0,
    "ooooooooo5":0,
    "ooooooooo6":0,
    "ooooooooo7":0,
    "ooooooooo8":0
}
# dictkv: "dict" for dictionary, "kv" for key-value pair. This dictionary is the
# initial Q-table. I went back and forth between representing the board as a 2D
# list and as a plain string; the string turned out to be easier to work with.
# In each key, the first 9 characters encode the board state and the 10th
# character is the cell to play next; the value is the Q-value of that move.
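# For example, an illustrative entry (not taken from an actual training run)
# could be
#     dictkv["xcooooooo4"] = 0.05
# meaning: with x on cell 0, c on cell 1 and every other cell empty, playing the
# next stone on cell 4 currently has an estimated Q-value of 0.05.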

EPSILON = 0.9
MAX = 100
ALPHA = 0.1
GAMMA = 0.9
# EPSILON: probability of acting greedily; MAX: number of training episodes;
# ALPHA: learning rate; GAMMA: discount factor used when updating the Q-table

def replace_char(old_string, char, index):
    old_string = str(old_string)
    new_string = old_string[:index] + char + old_string[index+1:]
    return new_string
# Replace the character at a given index of a string with the target character (used to update the board state when a stone is placed)
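# To make the win checks below easier to read, this is how the nine characters
# of the board string map onto the 3x3 grid:
#     0 1 2
#     3 4 5
#     6 7 8
# so columns are (i, i+3, i+6) for i in 0..2, rows are (i, i+1, i+2) for
# i in {0, 3, 6}, and the two diagonals are (0, 4, 8) and (2, 4, 6).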

def winx(str):
    # x is the first player and the piece Q-learning is trained to control; returns 1 if x has won. 'o' marks an empty cell
    flag = 0
    for i in range(3):
        if str[i] == str[i+3] and str[i+3] == str[i+6]:
            if str[i] == 'x':
                flag = 1

    for i in range(0,7,3):
        if str[i] == str[i+1] and str[i+1] == str[i+2]:
            if str[i] == 'x':
                flag = 1

    if str[0] == str[4] and str[4] == str[8]:
        if str[0] == 'x':
            flag = 1

    if str[2] == str[4] and str[4] == str[6]:
        if str[2] == 'x':
            flag = 1
    return flag
# win check for x, the first player and the piece being trained

def winc(str):
    # c places stones at random; returns 1 if c has won. 'o' marks an empty cell
    flag = 0
    for i in range(3):
        if str[i] == str[i+3] and str[i+3] == str[i+6]:
            if str[i] == 'c':
                flag = 1 

    for i in range(0,7,3):
        if str[i] == str[i+1] and str[i+1] == str[i+2]:
            if str[i] == 'c':
                flag = 1

    if str[0] == str[4] and str[4] == str[8]:
        if str[0] == 'c':
            flag = 1

    if str[2] == str[4] and str[4] == str[6]:
        if str[2] == 'c':
            flag = 1
    return flag
# c is the second player and plays completely at random

def full(str):
    flag = 1
    for i in range(len(str)):
        if str[i] == 'o':
            flag = 0
    return flag
# returns 1 if the board is full (no empty 'o' cells left), otherwise 0

def choose_action(S, q_table):
    sum = 0  # count the empty cells; whose turn it is can be read off the board itself
    state_actions = []
    q_list = []
    for i in range(len(S)):
        if S[i] == 'o':
            sum += 1
    jug = 0
    for key in list(q_table.keys()):
        if key[0:9] == S:
            jug = 1
            state_actions.append(q_table[key])
            q_list.append(key)
            # this board state has appeared before: reuse its stored entries
    if jug == 0:
        # unseen board state: add all nine actions to the Q-table before choosing
        for i in range(9):
            q_table[S + str(i)] = 0
            state_actions.append(0)
            q_list.append(S + str(i))

    if sum % 2 == 1:
        # odd number of empty cells: it is x's turn
        if (np.random.uniform() > EPSILON) or (sum == 9):
            # empty board, or the 10% exploration case: play a random legal move
            while 1:
                action = random.randrange(9)
                if S[action] != 'o':
                    continue
                else:
                    return str(action)
        else:
            # the other 90%: act greedily and pick the legal action with the
            # largest Q-value recorded for this state
            while 1:
                flag = 0
                big = state_actions[0]
                for j in range(len(state_actions)):
                    if state_actions[j] > big:
                        big = state_actions[j]
                        flag = j
                action_name = q_list[flag][9]
                if S[int(action_name)] == 'o':
                    return action_name
                else:
                    # that cell is already occupied: rule it out and look again
                    state_actions[flag] = -float('inf')
                    continue
    else:
        # even number of empty cells: it is c's turn, which always plays at random
        while 1:
            action = random.randrange(9)
            if S[action] != 'o':
                continue
            else:
                return str(action)

def get_env_feedback(S, A):
    sum = 0
    for i in range(len(S)):
        if S[i] == 'o':
            sum+=1 
    # apply action A to board S and return the next state S_ together with the reward R
    if sum%2 == 1:
        S = replace_char(S,'x',int(A))
        if winx(S) == 1:
            S_ = S
            R = 1
            return S_, R
        if full(S) == 1:
            S_ = S
            R = 0
            return S_, R
        else:
            S_ = S
            R = 0
            return S_, R
            # x wins: reward 1; draw: 0; game still in progress: 0
    else:
        S = replace_char(S,'c',int(A))
        if winc(S) == 1:
            S_ = S
            R = -1
            return S_, R
        if full(S) == 1:
            S_ = S
            R = 0
            return S_, R
        else:
            S_ = S
            R = 0
            return S_, R
            # same for c, except a win by c gives reward -1
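# A quick worked example of get_env_feedback: on the empty board there are nine
# 'o' cells (an odd number), so it is x's turn, and
#     get_env_feedback("ooooooooo", "4")
# returns ("ooooxoooo", 0): x is placed on the centre cell, nobody has won yet
# and the board is not full, so the reward is 0.
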
def rl():
    for i in range(MAX):
        S = "ooooooooo"  # every episode starts from an empty board
        final = False
        while not winx(S) and not winc(S) and not full(S) and not final:
            A = choose_action(S, dictkv)
            print("A = " + str(A))
            S_, R = get_env_feedback(S, A)
            print("S_ = " + S_ + " R = " + str(R))
            q_predict = dictkv[S + A]  # current estimate of Q(S, A)
            if not winx(S_) and not winc(S_) and not full(S_):
                # game not over: bootstrap from the best known Q-value of S_
                big = 0
                for key in dictkv.keys():
                    if key[0:9] == S_ and dictkv[key] > big:
                        big = dictkv[key]
                q_target = R + GAMMA * big
            else:
                # terminal state: the target is just the reward itself
                q_target = R
                final = True
            dictkv[S + A] += ALPHA * (q_target - q_predict)
            S = S_
            print("dictkv = ")
            print(dictkv)
            print("\n")
    return dictkv
    # the overall loop follows the structure of 莫烦 (Morvan)'s tutorial

if __name__ == "__main__":
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)
    # The Q-table printed at the end is a pile of board states that came up during
    # play. Most values are still 0 (I set the loop to at most 100 episodes).
    # A positive Q-value marks a winning situation; for example "ocoxoxoco4": 0.1
    # means that playing at position 4 of that board string has a Q-value of 0.1.
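
If you want to poke at the learned table after training, a small helper along these lines works (a rough sketch; best_action is just an illustrative name, not part of the tutorial code above):

def best_action(q_table, S):
    # among the entries for this board state whose target cell is still empty,
    # return the action digit with the largest Q-value (None if the state is unseen)
    candidates = [k for k in q_table if k[0:9] == S and S[int(k[9])] == 'o']
    if not candidates:
        return None
    return max(candidates, key=lambda k: q_table[k])[9]

For example, best_action(q_table, "ooooooooo") returns the opening move that the trained x currently rates highest.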

The overall code framework is basically adapted from 莫烦 (Morvan)'s tutorial. Morvan is awesome.
