[Reinforcement Learning] Q-Learning: Principles and Code Implementation

        Work has been crazy lately, so I haven't had time to update the blog, and my role has shifted from R&D to product; I hadn't written code in almost half a year and was worried about getting rusty, so I gave myself two days to write something. I used to do machine learning and deep learning, but I've always been a bit skeptical of the approach that relies on feeding a model data covering every scenario: data is expensive and hard to collect exhaustively, building a business on it is difficult and really requires a company of considerable scale, and treating the collect-data / train / deploy routine as "AI self-evolution" feels like a stretch. Anyone claiming a single AI model can solve general problems without pinning down the scenario is overselling it; let's wait for the academics to come up with genuinely new theory that benefits the industry.

        I've been wondering whether there is an algorithmic direction that depends less on data, or at least needs much less of it; that would give a product better cost control and more staying power. Reinforcement learning seemed like a candidate, so I spent a few days picking up the basics and started by reimplementing the simplest algorithm, Q-Learning.

 

The environment is a 5x5 grid. The walls around the edge cannot be crossed, stepping on a bomb kills the agent immediately, and reaching the diamond clears the level.
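For orientation, the layout built by __init_rc in the environment code below (P = player start, B = bomb, D = diamond; column/row coordinates taken straight from that method) looks like this:

P . . . .
. B . B .
. . . . .
. B . B .
. . . . D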

 

I'll fill in the theory and tidy up the formatting in a follow-up post; for now, here is the code from the last two days, starting with what the result looks like:

At the very start of training, the agent explores randomly:

After a few dozen episodes, it heads almost straight for the goal:

The code:

The environment, Env (why not PyCharm plus PyQt 5, which I used in an earlier post, for building the environment? Honestly, I've forgotten it all and I'm short on time, so I picked the simplest option, tkinter):

import tkinter as tk
from PIL import ImageTk
from PIL import Image
import time

class Env:
    def __init__(self):
        self.grid_size = 100
        self.win = tk.Tk()
        self.pic_player, self.pic_diamond, self.pic_boom1, self.pic_boom2, self.pic_boom3, self.pic_boom4 = self.__load_img()
        self.__init_win()
        self.canvas = self.__init_rc()
        self.texts = self.__produce_text()
        self.canvas.pack()
        # self._init_test_case()
        # self.win.mainloop()

    def __init_win(self):
        self.win.title('Grid World')
        # self.win.geometry("500x300")

    def __init_rc(self):
        canvas = tk.Canvas(self.win, width=500, height=720, bg='white')
        for h in range(5):
            for v in range(5):
                canvas.create_rectangle(self.grid_size * v, self.grid_size * h, self.grid_size * (v + 1), self.grid_size * (h + 1))
        trans_pixel = int(self.grid_size / 2)
        self.player = canvas.create_image(trans_pixel + self.grid_size * 0, trans_pixel + self.grid_size * 0, image=self.pic_player)
        self.diamond = canvas.create_image(trans_pixel + self.grid_size * 4, trans_pixel + self.grid_size * 4, image=self.pic_diamond)
        self.boom1 = canvas.create_image(trans_pixel + self.grid_size * 1, trans_pixel + self.grid_size * 1, image=self.pic_boom1)
        self.boom2 = canvas.create_image(trans_pixel + self.grid_size * 3, trans_pixel + self.grid_size * 1, image=self.pic_boom2)
        self.boom3 = canvas.create_image(trans_pixel + self.grid_size * 1, trans_pixel + self.grid_size * 3, image=self.pic_boom3)
        self.boom4 = canvas.create_image(trans_pixel + self.grid_size * 3, trans_pixel + self.grid_size * 3, image=self.pic_boom4)

        return canvas

    def __load_img(self):
        pic_resize = int(self.grid_size / 2)
        player = ImageTk.PhotoImage(Image.open("player.png").resize((pic_resize, pic_resize)))
        diamond = ImageTk.PhotoImage(Image.open("diamond.png").resize((pic_resize, pic_resize)))
        boom1 = ImageTk.PhotoImage(Image.open('boom.png').resize((pic_resize, pic_resize)))
        boom2 = ImageTk.PhotoImage(Image.open('boom.png').resize((pic_resize, pic_resize)))
        boom3 = ImageTk.PhotoImage(Image.open('boom.png').resize((pic_resize, pic_resize)))
        boom4 = ImageTk.PhotoImage(Image.open('boom.png').resize((pic_resize, pic_resize)))
        return player, diamond, boom1, boom2, boom3, boom4

    def __produce_text(self):
        texts = []
        x = self.grid_size / 2
        y = self.grid_size / 6
        for h in range(5):
            for v in range(5):
                up = self.canvas.create_text(x + h * self.grid_size, y + v * self.grid_size, text=0)
                down = self.canvas.create_text(x + h * self.grid_size, self.grid_size - y + v * self.grid_size, text=0)
                left = self.canvas.create_text(y + h * self.grid_size, x + v * self.grid_size, text=0)
                right = self.canvas.create_text(self.grid_size - y + h * self.grid_size, x + v * self.grid_size, text=0)
                texts.append({"up": up, "down": down, "left": left, "right": right})
        return texts

    def _win_d_update(self):
        self.win.update()
        time.sleep(0.1)


class GridWorld(Env):
    def __init__(self):
        super().__init__()
        self._win_d_update()

    def player_move(self, x, y):
        # x moves the player horizontally (positive = right), y vertically (positive = down)
        self.canvas.move(self.player, x * self.grid_size, y * self.grid_size)
        self._win_d_update()

    def reset(self):
        # Reset the player to the starting cell
        x, y = self.canvas.coords(self.player)
        self.canvas.move(self.player, -x + self.grid_size/2, -y + self.grid_size/2)
        self._win_d_update()
        return self.get_state(self.player)

    def get_state(self, who):
        x, y = self.canvas.coords(who)
        state = [int(x/self.grid_size), int(y/self.grid_size)]
        return state

    def update_val(self, num, arrow, val):
        pos = num[0] * 5 + num[1]
        x, y = self.canvas.coords(self.texts[pos][arrow])
        self.canvas.delete(self.texts[pos][arrow])
        self.texts[pos][arrow] = self.canvas.create_text(x, y, text=val)
        # self._win_d_update()

    def exec_calc(self, action):
        # Execute one action and compute its outcome
        feedback = 'alive'  # 'alive', 'stop', 'dead': moved normally, hit a wall, stepped on a bomb
        next_state = []
        next_h, next_v, reward = 0.0, 0.0, 0.0
        h, v = self.get_state(self.player)
        if action == 0:     # up
            next_h = h
            next_v = v - 1
            # self.player_move(0, -1)
        elif action == 1:   # down
            next_h = h
            next_v = v + 1
            # self.player_move(0, 1)
        elif action == 2:   # left
            next_h = h - 1
            next_v = v
            # self.player_move(-1, 0)
        elif action == 3:   # right
            next_h = h + 1
            next_v = v
            # self.player_move(1, 0)
        else:
            print('programmer bug ...')
        next_state = [next_h, next_v]
        boom1, boom2, boom3, boom4 = self.get_state(self.boom1), self.get_state(self.boom2), self.get_state(
            self.boom3), self.get_state(self.boom4)
        diamond = self.get_state(self.diamond)
        if next_h < 0 or next_v < 0 or next_h > 4 or next_v > 4:  # out of bounds
            reward = -1
            feedback = 'stop'
        elif next_state == boom1 or next_state == boom2 or next_state == boom3 or next_state == boom4:    # bomb cell
            reward = -100
            feedback = 'dead'
        elif next_state == diamond:   # goal item reached
            reward = 500
        else:
            reward = 0
        return feedback, next_state, reward

    def update_view(self, state, action, next_state, q_val):
        action_list = ['up', 'down', 'left', 'right']
        self.player_move(next_state[0]-state[0], next_state[1]-state[1])
        self.update_val(state, action_list[action], round(q_val, 2))

    def attach(self):
        # Returns True once the player has reached the goal, otherwise False
        return str(self.get_state(self.player)) == str(self.get_state(self.diamond))
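
Before bringing in the agent, here is a quick smoke test of the environment API. This is just an illustrative sketch, not part of the training script; it assumes env.py plus player.png, diamond.png and boom.png sit in the working directory, as the Env constructor requires:

import env

world = env.GridWorld()                            # opens the tkinter window and draws the grid
state = world.reset()                              # player back to the top-left cell -> [0, 0]
feedback, next_state, reward = world.exec_calc(3)  # try action 3 (right)
if feedback != 'stop':
    world.update_view(state, 3, next_state, 0.0)   # move the sprite and display a Q value of 0.0
print(feedback, next_state, reward)                # expected: alive [1, 0] 0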

The agent (Agent) code:

import numpy as np
import env



class Agent:
    def __init__(self):
        self.actions = [0, 1, 2, 3]  # up down left right
        self.q_table = dict()
        self.__init_q_table()
        self.epsilon = 0.1
        self.learning_rate = 0.1
        self.gamma = 0.8
        # print(self.q_table)

    def __init_q_table(self):
        for v in range(5):
            for h in range(5):
                self.q_table[str([h, v])] = [0.0, 0.0, 0.0, 0.0]

    def get_action(self, state):
        # Pick the next action for this state, skipping actions whose Q value has gone negative (walls / bombs)
        action_list = self.q_table[str(state)]
        pass_action_index = []
        for index, val in enumerate(action_list):
            if val >= 0:
                pass_action_index.append(index)
        # epsilon-greedy action selection
        if np.random.rand() <= self.epsilon:
            # explore: pick a random allowed action
            return np.random.choice(pass_action_index)
        else:
            # exploit: pick the action with the largest Q value
            max_val = action_list[pass_action_index[0]]
            max_list = []
            for i in pass_action_index:
                # if several actions tie for the maximum, pick one of them at random
                if max_val < action_list[i]:
                    max_list.clear()
                    max_val = action_list[i]
                    max_list.append(i)
                elif max_val == action_list[i]:
                    max_list.append(i)
            return np.random.choice(max_list)

    def update_q_table(self, feedback, state, action, reward, next_state):
        # Q(s,a) <- Q(s,a) + lr * [ reward + gamma * max_a' Q(s',a') - Q(s,a) ]
        q_s_a = self.q_table[str(state)][action]  # current Q value for (state, action)
        if feedback == 'stop':
            q_ns_a = 0  # hitting a wall leaves the player in place, so the next-state value is treated as 0
        else :
            q_ns_a = np.max(self.q_table[str(next_state)])
        # Bellman update (rearranged form of the formula above)
        # self.q_table[str(state)][action] = q_s_a + self.learning_rate * (
        #     reward + self.gamma * q_ns_a - q_s_a
        # )
        self.q_table[str(state)][action] = (1 - self.learning_rate) * q_s_a + self.learning_rate * (reward + self.gamma * q_ns_a)
        # print(self.q_table)
        return self.q_table[str(state)][action]

if __name__ == '__main__':
    np.random.seed(0)
    env = env.GridWorld()
    agent = Agent()
    for ep in range(2000):
        if ep < 100:  # explore more during the first 100 episodes
            agent.epsilon = 0.2
        else:
            agent.epsilon = 0.1
        state = env.reset()
        print('Episode {} starting ...'.format(ep + 1))
        while not env.attach():
            action = agent.get_action(state)    # choose an action
            # print(action)
            feedback, next_state, reward = env.exec_calc(action)     # step the environment
            q_val = agent.update_q_table(feedback, state, action, reward, next_state)   # update the Q table
            if feedback == 'stop':
                env.update_view(state, action, state, q_val)
                continue
            elif feedback == 'dead':
                env.update_view(state, action, next_state, q_val)
                break
            else:
                env.update_view(state, action, next_state, q_val)
            state = next_state   # move on to the next state
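
To make the update rule concrete, here is the same calculation done by hand outside the classes; the numbers are made up for illustration, but lr and gamma match the Agent defaults:

# One Q-Learning update in isolation, with the Agent defaults lr=0.1, gamma=0.8.
lr, gamma = 0.1, 0.8
q_s_a = 0.0        # current Q(s, a), e.g. the cell next to the diamond, action 'right'
reward = 500       # reward exec_calc returns for reaching the diamond
q_ns_max = 0.0     # max over Q(s', a'); still 0 early in training
new_q = (1 - lr) * q_s_a + lr * (reward + gamma * q_ns_max)
print(new_q)       # 50.0 -- repeated episodes propagate this value back toward the start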

I'll cover the finer points of the code and the underlying theory in a follow-up post when I have time!
