Robot Path Planning Based on the Q-Learning Algorithm

I have recently been studying the Q-learning algorithm from reinforcement learning. Since much of the code available online is too complex for beginners, I wrote relatively simple implementations in both MATLAB and Python to make the algorithm easier to understand and learn.

Python code:

'''
1. Environment: start with a simple 5x5 grid. The start is the bottom-right cell (state 24)
   and the goal is the opposite corner (state 0). Two cells are obstacles; the rest are open
   ground. Grid coordinates run from (0,0) to (4,4).
2. Actions: up, down, left, right, each with a step size of one cell.
3. Policy: the Q-Learning algorithm.
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

'''Variable initialization'''
################################################################################################
gamma = 0.8             # discount factor
learning_rate = 0.1     # learning rate
actions = 4             # four possible actions
epsilon = 0.9           # epsilon-greedy: act greedily with probability 0.9, explore with probability 0.1
x_matrix = [ 0 , 0 , -1 , 1 ]   # x offset for up, down, left, right
y_matrix = [ 1 , -1 , 0 , 0 ]   # y offset for up, down, left, right
ac_matrix = [ -5 , 5 , -1 , 1 ] # state-index offset for up, down, left, right
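# States are numbered row-major on the 5x5 grid: state 0 is the top-left corner and state 24
# the bottom-right, so moving up/down shifts the index by -5/+5 and left/right by -1/+1,
# which is exactly what ac_matrix encodes.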

'''Build the environment matrix: a flattened 5x5 grid of zeros, with obstacles marked as 1'''
################################################################################################
maze =np.zeros(25)
maze[7]=1
maze[16]=1
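# obstacles sit at state 7 (row 1, column 2) and state 16 (row 3, column 1), counting from 0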
#print(maze)
'''These three tables follow the same indexing convention'''
ac_rol = [0,1,2,3]   # action column labels for the Q table
# define the Q table: 25 states x 4 actions, initialized to zero
q_table = pd.DataFrame(np.zeros(100).reshape(25,4),index=np.arange(25),columns=ac_rol)
##print(q_table)
r_step = []   # number of steps taken in each episode
'''Set the Q-table boundary conditions: moves that would leave the grid get -inf'''
for q_t_r in range(5):
    q_table.loc[q_t_r, 0] = float('-inf')          # top row: cannot move up
for q_t_r in range(5):
    q_table.loc[5*q_t_r, 2] = float('-inf')        # left column: cannot move left
for q_t_r in range(5):
    q_table.loc[5*q_t_r + 4, 3] = float('-inf')    # right column: cannot move right
for q_t_r in range(5):
    q_table.loc[20 + q_t_r, 1] = float('-inf')     # bottom row: cannot move down
#print(q_table.loc[2,3])
#q_table.iloc[2,3] = 2   # example of changing a single value in the table
##print(q_table)
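# The -inf entries can never win the greedy max below, and the exploration branch filters
# them out explicitly, so the agent never steps outside the grid.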
trace = []   # trajectory of the robot
'''Main training loop'''
###############################################################################################
for episode in range(1000):
    # run 1000 training episodes
    x = 4
    y = 0
    initial_s = 24    # start state: bottom-right corner
    terminal_s = 0    # goal state: opposite corner
    s = initial_s
    step = 1
    while s != terminal_s:
        if np.random.uniform() < epsilon:  # uniform random number in [0, 1)
            # choose the best (greedy) action
            state_action = q_table.loc[s, :]  # the row of Q values for the current state s
            '''Example output:
               0    0.0
               1    0.0
               2    0.0
               3    0.0
               Name: 0, dtype: float64
            '''
            # some actions may share the maximum value; randomly choose one of them
            p_action = np.random.choice(state_action[state_action == np.max(state_action)].index)
           
        else:
            # exploration: choose uniformly among the actions that stay on the grid
            a = q_table.loc[s, :]
            p_action = np.random.choice(a[a != float('-inf')].index)

        ##print(p_action)

        s_ = s + ac_matrix[p_action]   # next state index
        x_ = x + x_matrix[p_action]
        y_ = y + y_matrix[p_action]
        trace.append(s_)
        step = step + 1

        if maze[s_] == 1:   # stepping onto an obstacle is penalized more heavily
            reward = -10
        else:
            reward = -1

        adc = q_table.loc[s_, :]   # Q values of the next state s_
        max_q = np.max(adc)
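        # Q-learning update: Q(s,a) <- Q(s,a) + lr * (reward + gamma * max_a' Q(s',a') - Q(s,a))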
        q_new = q_table.loc[s, p_action] + learning_rate*(reward + gamma*max_q - q_table.loc[s, p_action])
        q_table.loc[s, p_action] = q_new
        s=s_
        ##print(s)
        ##print(q_table)
    r_step.append(step)
plt.plot(r_step, linewidth=1)   # steps per episode; this should generally decrease as the policy improves
#plt.show()
plt.savefig("22.png")
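
After training, it is easy to read the learned greedy path back out of q_table. The short sketch below is not part of the original script (the helper name greedy_path is mine); it repeatedly takes the highest-valued action from the start state (24) until it reaches the goal (0), reusing the q_table and ac_matrix defined above:

def greedy_path(q_table, ac_matrix, start=24, goal=0, max_steps=50):
    # follow the greedy (argmax) action from each state; -inf entries are never the maximum
    path = [start]
    s = start
    for _ in range(max_steps):      # guard against loops if training is incomplete
        if s == goal:
            break
        best_action = int(q_table.loc[s, :].idxmax())
        s = s + ac_matrix[best_action]
        path.append(s)
    return path

print(greedy_path(q_table, ac_matrix))   # prints the visited state sequence from 24 down to 0

Because every step costs at least -1 and obstacles cost -10, the learned Q values favor short routes, so the printed path should be close to a shortest route across the grid that stays off the obstacle cells.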

MATLAB code:

Main script:

clc

num_of_iteration = 1000;
N = 5;                          % grid is N x N
r = 20;                         % obstacle density in percent
gamma = 0.8;
learning_rate = 0.1;
actions = 4;                    % North, East, West, South (N, E, W, S)
exploration_rate = 0.4;
action_matrix = [-5 1 -1 5];    % state-index offset for N, E, W, S
maze = create_random_maze(N, r);
q_matrix = construct_q_matrix(N, actions);
i = 1;                          % trace index (MATLAB arrays are 1-based)
reward = 0;
trace_index = [];

for episode = 1 : num_of_iteration
    initial_state = N*N;
    goal_state = 1;
    current_state = initial_state;

    while current_state ~= goal_state
        if rand < exploration_rate
            % explore: pick a random legal action (skip -inf entries)
            eliminate_inf = find(q_matrix(current_state, :) > -inf);
            prefered_action = eliminate_inf(randi(numel(eliminate_inf)));
        else
            % exploit: pick the action with the highest Q value
            [~, prefered_action] = max( q_matrix(current_state, :) );
        end

        % take the action and receive the immediate reward
        maze_exp_index = current_state + action_matrix(prefered_action);
        trace_index(i) = maze_exp_index;
        i = i + 1;

        if maze(maze_exp_index) == 1
            reward = -4;
        else
            reward = -1;
        end

        % Q-learning update (equivalent to Q + lr*(r + gamma*max Q' - Q))
        q_matrix(current_state, prefered_action) = (1-learning_rate) * q_matrix(current_state, prefered_action) + ...
                                                   learning_rate * (reward + gamma * max(q_matrix(maze_exp_index, :)));

        % observe new state
        current_state = maze_exp_index;
    end
end

maze
q_matrix

Q-table construction function:

function [ q_matrix ] = construct_q_matrix( N, actions )
% Build the N*N-by-actions Q table and mark illegal moves with -inf.
% Action columns: 1 = North, 2 = East, 3 = West, 4 = South.

q_matrix = zeros(N*N, actions);

q_matrix(1:N, 1) = -inf;                 % top row: cannot move north
q_matrix(N*(N-1)+1 : N*N, 4) = -inf;     % bottom row: cannot move south

for i = 1 : N
    q_matrix(i*N, 2) = -inf;             % rightmost column: cannot move east
    q_matrix((i-1)*N + 1, 3) = -inf;     % leftmost column: cannot move west
end

end

Environment (maze) matrix function:

function [ maze ] = create_random_maze(N, r)
% Return an N-by-N maze where each cell is an obstacle (1) with probability r percent.

maze_temp = zeros(N, N);

for i = 1 : N
    for j = 1 : N
        if rand < 0.01 * r
            maze_temp(i, j) = 1;
        end
    end
end

maze = maze_temp;

end

The MATLAB version mirrors the Python code, so the logic is essentially the same and is not explained in further detail. Note that the MATLAB maze is generated randomly, so the obstacle layout differs from run to run.
