Reinforcement Learning: Q-Learning Study Notes

The Q-Learning Algorithm

[Figure 1: the Q-Learning algorithm]
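
Q-Learning maintains a table of action values Q(s, a). After taking action a in state s, receiving reward r and landing in state s', the corresponding table entry is nudged toward a bootstrapped target:

    Q(s, a) ← Q(s, a) + α * [ r + γ * max_a' Q(s', a') - Q(s, a) ]

When s' is terminal the max term is dropped and the target is just r. In the code below, α is ALPHA (0.1), γ is GAMMA (0.9), and the bracketed term is exactly q_target - q_predict.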

Example 1

Reach the terminal in as few steps as possible: the explorer 'o' starts at the left end of a one-dimensional track '-----T' and learns to walk right to the treasure 'T'.

import numpy as np
import pandas as pd
import time

np.random.seed(2)  # fix the random seed so results are reproducible


N_STATES = 6                    # length of the 1-D world
ACTIONS = ['left', 'right']     # available actions
EPSILON = 0.9                   # greedy policy: probability of picking the best action
ALPHA = 0.1                     # learning rate
GAMMA = 0.9                     # reward discount factor
MAX_EPISODES = 13               # maximum number of episodes
FRESH_TIME = 0.3                # refresh interval (seconds) for one move


def build_q_table(n_states, actions):
    # build the Q-table, initialised to all zeros
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),     # initial Q-values
        columns=actions,    # column names are the actions; the row index (states) keeps the default
    )
    return table


def choose_action(state, q_table):
    # epsilon-greedy action selection
    state_actions = q_table.iloc[state, :]
    if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):
        # act randomly at the start (all-zero row) and 10% of the time, for exploration
        action_name = np.random.choice(ACTIONS)
    else:
        # 90% of the time pick the action with the highest Q-value
        action_name = state_actions.idxmax()
    return action_name


def get_env_feedback(S, A):
    # environment feedback: take action A in state S, return the next state and the reward
    if A == 'right':    # move right
        if S == N_STATES - 2:   # next step reaches the terminal
            S_ = 'terminal'
            R = 1   # reward for reaching the treasure
        else:
            S_ = S + 1
            R = 0
    else:   # move left
        R = 0
        if S == 0:
            S_ = S  # already at the left wall, stay put
        else:
            S_ = S - 1
    return S_, R


def update_env(S, episode, step_counter):
    # render the environment on the console
    env_list = ['-']*(N_STATES-1) + ['T']   # '-----T': our 1-D world
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r                                ', end='')
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)


def rl():
    # main training loop
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0   # start position
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:

            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)  # take the action and get the reward
            q_predict = q_table.loc[S, A]
            if S_ != 'terminal':
                q_target = R + GAMMA * q_table.iloc[S_, :].max()   # next state is not terminal
            else:
                q_target = R     # next state is terminal
                is_terminated = True    # end this episode

            q_table.loc[S, A] += ALPHA * (q_target - q_predict)  # update the Q-table
            S = S_  # move to the next state

            update_env(S, episode, step_counter+1)
            step_counter += 1
    return q_table


if __name__ == "__main__":
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)
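
Once training finishes, the learned table can be read back greedily. A minimal sketch, assuming it is appended inside the __main__ block above (the step cap and the path list are purely illustrative and not part of the original script):

    # walk greedily from the start state using the trained q_table
    S = 0
    path = []
    for _ in range(N_STATES * 2):               # step cap in case the table is under-trained
        A = q_table.iloc[S, :].idxmax()         # action with the highest Q-value in this state
        path.append(A)
        if A == 'right' and S == N_STATES - 2:  # stepping right from here reaches the treasure 'T'
            break
        S = S + 1 if A == 'right' else max(S - 1, 0)
    print(path)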

Example 2

Explore and learn a route that reaches the yellow ball.
[Figure 2: the maze environment]

The program has three parts: the environment, the decision-making brain, and the interaction/update loop.
No learning happens in the environment part; it is simply given.
Below is the update part.
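
The update loop only relies on a handful of calls from the two imported modules. The stub below is an illustrative stand-in to make that assumed interface explicit; it is not the real maze_env / RL_brain code, which is listed afterwards:

class EnvLike:
    """Placeholder sketch of the environment interface used by update()."""
    n_actions = 4                  # number of discrete actions

    def reset(self):               # start a new episode, return the first observation
        return [0, 0]

    def step(self, action):        # apply an action, return (next observation, reward, done)
        return [0, 0], 0, True

    def render(self):              # redraw the window (real environment only)
        pass

    def destroy(self):             # close the window when training ends
        pass

# The brain side only needs: choose_action(observation) -> action, and learn(s, a, r, s_).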


from maze_env import Maze              # the environment
from RL_brain import QLearningTable    # the decision-making brain


def update():
    for episode in range(100):
        # initial observation at the start of an episode
        observation = env.reset()

        while True:
            # refresh the environment
            env.render()

            # choose an action based on the current observation
            action = RL.choose_action(str(observation))

            # take the action and get the next observation, the reward and the done flag
            observation_, reward, done = env.step(action)

            # learn from this transition
            RL.learn(str(observation), action, reward, str(observation_))

            # move on to the next observation
            observation = observation_

            # stop the episode when the agent reaches the goal or falls into a hell
            if done:
                break
    print('game over')
    env.destroy()

if __name__ == "__main__":
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))

    env.after(100, update)
    env.mainloop()

Decision-making part (RL_brain.py)

import numpy as np
import pandas as pd


class QLearningTable:
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions              # list of available actions
        self.lr = learning_rate             # learning rate (alpha)
        self.gamma = reward_decay           # reward discount factor
        self.epsilon = e_greedy             # probability of acting greedily
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def choose_action(self, observation):
        self.check_state_exist(observation)
        # epsilon-greedy action selection
        if np.random.uniform() < self.epsilon:
            # pick the best action
            state_action = self.q_table.loc[observation, :]
            # several actions may share the maximum value; choose among them at random
            action = np.random.choice(state_action[state_action == np.max(state_action)].index)
        else:
            # pick a random action
            action = np.random.choice(self.actions)
        return action

    def learn(self, s, a, r, s_):
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()  # next state is not terminal
        else:
            q_target = r  # next state is terminal
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)  # update the Q-table

    def check_state_exist(self, state):
        if state not in self.q_table.index:
            # append the new state to the Q-table as an all-zero row
            # (DataFrame.append was removed in pandas 2.0, so pd.concat is used here)
            self.q_table = pd.concat([
                self.q_table,
                pd.Series(
                    [0]*len(self.actions),
                    index=self.q_table.columns,
                    name=state,
                ).to_frame().T,
            ])

Environment part (maze_env.py)

"""
Reinforcement learning maze example.
Red rectangle:          explorer.
Black rectangles:       hells       [reward = -1].
Yellow circle:          paradise    [reward = +1].
All other states:       ground      [reward = 0].
This script is the environment part of this example. The RL is in RL_brain.py.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""


import numpy as np
import time
import sys
if sys.version_info.major == 2:
    import Tkinter as tk
else:
    import tkinter as tk


UNIT = 40   # pixels
MAZE_H = 4  # grid height
MAZE_W = 4  # grid width


class Maze(tk.Tk, object):
    def __init__(self):
        super(Maze, self).__init__()
        self.action_space = ['u', 'd', 'l', 'r']
        self.n_actions = len(self.action_space)
        self.title('maze')
        self.geometry('{0}x{1}'.format(MAZE_W * UNIT, MAZE_H * UNIT))
        self._build_maze()

    def _build_maze(self):
        self.canvas = tk.Canvas(self, bg='white',
                                height=MAZE_H * UNIT,
                                width=MAZE_W * UNIT)

        # create grids
        for c in range(0, MAZE_W * UNIT, UNIT):
            x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
            self.canvas.create_line(x0, y0, x1, y1)
        for r in range(0, MAZE_H * UNIT, UNIT):
            x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
            self.canvas.create_line(x0, y0, x1, y1)

        # create origin
        origin = np.array([20, 20])

        # hell
        hell1_center = origin + np.array([UNIT * 2, UNIT])
        self.hell1 = self.canvas.create_rectangle(
            hell1_center[0] - 15, hell1_center[1] - 15,
            hell1_center[0] + 15, hell1_center[1] + 15,
            fill='black')
        # hell
        hell2_center = origin + np.array([UNIT, UNIT * 2])
        self.hell2 = self.canvas.create_rectangle(
            hell2_center[0] - 15, hell2_center[1] - 15,
            hell2_center[0] + 15, hell2_center[1] + 15,
            fill='black')

        # create oval
        oval_center = origin + UNIT * 2
        self.oval = self.canvas.create_oval(
            oval_center[0] - 15, oval_center[1] - 15,
            oval_center[0] + 15, oval_center[1] + 15,
            fill='yellow')

        # create red rect
        self.rect = self.canvas.create_rectangle(
            origin[0] - 15, origin[1] - 15,
            origin[0] + 15, origin[1] + 15,
            fill='red')

        # pack all
        self.canvas.pack()

    def reset(self):
        self.update()
        time.sleep(0.5)
        self.canvas.delete(self.rect)
        origin = np.array([20, 20])
        self.rect = self.canvas.create_rectangle(
            origin[0] - 15, origin[1] - 15,
            origin[0] + 15, origin[1] + 15,
            fill='red')
        # return observation
        return self.canvas.coords(self.rect)

    def step(self, action):
        s = self.canvas.coords(self.rect)
        base_action = np.array([0, 0])
        if action == 0:   # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:   # down
            if s[1] < (MAZE_H - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:   # right
            if s[0] < (MAZE_W - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:   # left
            if s[0] > UNIT:
                base_action[0] -= UNIT

        self.canvas.move(self.rect, base_action[0], base_action[1])  # move agent

        s_ = self.canvas.coords(self.rect)  # next state

        # reward function
        if s_ == self.canvas.coords(self.oval):
            reward = 1
            done = True
            s_ = 'terminal'
        elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
            reward = -1
            done = True
            s_ = 'terminal'
        else:
            reward = 0
            done = False

        return s_, reward, done

    def render(self):
        time.sleep(0.1)
        self.update()


def update():
    for t in range(10):
        s = env.reset()
        while True:
            env.render()
            a = 1
            s, r, done = env.step(a)
            if done:
                break

if __name__ == '__main__':
    env = Maze()
    env.after(100, update)
    env.mainloop()
