I have recently been studying the Q-learning algorithm in reinforcement learning. Since most of the code available online is too complex for beginners, I wrote relatively simple implementations in both MATLAB and Python to make the algorithm easier to understand and learn.
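Both programs below implement the same tabular Q-learning update, Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)). As a reference, here is a minimal sketch of that update in isolation; the function name q_update and the dict-of-lists table layout are illustrative, not part of the code below.

def q_update(q_table, s, a, reward, s_next, alpha=0.1, gamma=0.8):
    # q_table: dict mapping state -> list of action values (illustrative layout)
    best_next = max(q_table[s_next])          # max_a' Q(s', a')
    td_target = reward + gamma * best_next    # one-step bootstrap target
    q_table[s][a] += alpha * (td_target - q_table[s][a])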
Python code:
'''
1. Environment: a simple 5x5 grid. The agent starts at state 24 (one corner)
   and must reach state 0 (the opposite corner); two cells are obstacles and
   the rest are open ground. Coordinates run from (0,0) to (4,4).
2. Actions: up/down/left/right, one cell per step.
3. Policy: the Q-learning algorithm.
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

'''Variable initialization'''
################################################################################
gamma = 0.8            # discount factor
learning_rate = 0.1    # learning rate
actions = 4            # four possible actions
epsilon = 0.9          # probability of the greedy action (exploration rate = 0.1)
x_matrix = [0, 0, -1, 1]    # row offsets for up/down/left/right
y_matrix = [1, -1, 0, 0]    # column offsets
ac_matrix = [-5, 5, -1, 1]  # state-index offsets: adding one of these to s moves the agent

'''Build the environment: a zero matrix with two obstacle cells'''
################################################################################
maze = np.zeros(25)
maze[7] = 1
maze[16] = 1

'''These three tables share the same indexing convention'''
ac_rol = [0, 1, 2, 3]
# define the Q-table: 25 states x 4 actions
q_table = pd.DataFrame(np.zeros(100).reshape(25, 4),
                       index=np.arange(25), columns=ac_rol)
r_step = []

'''Boundary conditions of the Q-table: forbid moves that leave the grid'''
q_t_r = 0
while q_t_r < 5:
    q_table.loc[q_t_r, 0] = float('-inf')          # top row: no "up"
    q_t_r += 1
q_t_r = 0
while q_t_r < 5:
    q_table.loc[5 * q_t_r, 2] = float('-inf')      # left column: no "left"
    q_t_r += 1
q_t_r = 0
while q_t_r < 5:
    q_table.loc[5 * q_t_r + 4, 3] = float('-inf')  # right column: no "right"
    q_t_r += 1
q_t_r = 0
while q_t_r < 5:
    q_table.loc[20 + q_t_r, 1] = float('-inf')     # bottom row: no "down"
    q_t_r += 1

trace = []  # the robot's trajectory

'''Main training loop'''
################################################################################
for episode in range(1000):  # run 1000 training episodes
    x = 4
    y = 0
    initial_s = 24
    terminal_s = 0
    s = initial_s
    step = 1
    while s != terminal_s:
        if np.random.uniform() < epsilon:
            # exploit: choose the best action
            state_action = q_table.loc[s, :]
            # some actions may share the maximum value; pick one of them at random
            p_action = np.random.choice(
                state_action[state_action == np.max(state_action)].index)
        else:
            # explore: choose a random legal action (-inf marks illegal moves)
            a = q_table.loc[s, :]
            p_action = np.random.choice(a[a != float('-inf')].index)
        s_ = s + ac_matrix[p_action]
        x_ = x + x_matrix[p_action]   # (x, y) coordinates are tracked but not otherwise used
        y_ = y + y_matrix[p_action]
        trace.append(s_)
        step = step + 1
        if maze[s_] == 1:
            reward = -10   # penalty for stepping on an obstacle
        else:
            reward = -1    # cost of an ordinary step
        max_q = np.max(q_table.loc[s_, :])
        # Q-learning update: Q(s,a) += lr * (r + gamma * max_a' Q(s',a') - Q(s,a))
        q_new = q_table.loc[s, p_action] + learning_rate * (
            reward + gamma * max_q - q_table.loc[s, p_action])
        q_table.loc[s, p_action] = q_new
        s = s_
    r_step.append(step)

plt.plot(r_step, linewidth=1)
# plt.show()
plt.savefig("22.png")
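A note on the encoding used above: the 5x5 grid is flattened row by row, so state s corresponds to (row, col) = divmod(s, 5), and the up/down/left/right moves become the index offsets -5/+5/-1/+1 in ac_matrix. A small illustrative check (the helper to_state is mine, not part of the code above):

N = 5
def to_state(row, col):          # row-major flattening of the grid
    return N * row + col
assert to_state(2, 3) == 13
assert to_state(2, 3) - N == to_state(1, 3)   # "up" subtracts N
assert to_state(2, 3) + 1 == to_state(2, 4)   # "right" adds 1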
MATLAB code:

Main program:
clc
num_of_iteration = 1000;
N = 5;
r = 20;                            % obstacle density in percent
gamma = 0.8;                       % discount factor
learning_rate = 0.1;
actions = 4;                       % four actions
exploration_rate = 0.4;            % probability of a random action (note: the Python epsilon above is the probability of the greedy action)
action_matrix = [-5 1 -1 5];       % column-major index steps: -5 = left, +1 = down, -1 = up, +5 = right
maze = create_random_maze(N, r);
q_matrix = construct_q_matrix(N, actions);
i = 0;
reward = 0;
trace_index = [];
for episode = 1 : num_of_iteration
    initial_state = N*N;           % start in one corner ...
    goal_state = 1;                % ... goal in the opposite corner
    current_state = initial_state;
    while current_state ~= goal_state
        if rand < exploration_rate
            % explore: pick a random legal action (-inf marks illegal moves)
            eliminate_inf = find(q_matrix(current_state, :) > -inf);
            prefered_action = eliminate_inf(randi(numel(eliminate_inf)));
        else
            % exploit: pick the greedy action
            [~, prefered_action] = max(q_matrix(current_state, :));
        end
        % receive immediate reward
        maze_exp_index = current_state + action_matrix(prefered_action);
        i = i + 1;                 % increment first: MATLAB indices start at 1
        trace_index(i) = maze_exp_index;
        if maze(maze_exp_index) == 1
            reward = -4;           % penalty for stepping on an obstacle
        else
            reward = -1;           % cost of an ordinary step
        end
        % Q-learning update in its weighted form
        q_matrix(current_state, prefered_action) = (1-learning_rate) * q_matrix(current_state, prefered_action) + ...
            learning_rate * (reward + gamma * max(q_matrix(maze_exp_index, :)));
        % observe new state
        current_state = maze_exp_index;
    end
end
maze                               % display the final maze and the learned Q-table
q_matrix
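One detail worth noting: the MATLAB version writes the update in the weighted form (1-alpha)*Q + alpha*(r + gamma*max Q'), while the Python version uses the incremental form Q + alpha*(r + gamma*max Q' - Q). The two are algebraically identical; a quick numeric check in Python with arbitrary values:

alpha, gamma = 0.1, 0.8
q, reward, max_next = 2.0, -1.0, 3.0
incremental = q + alpha * (reward + gamma * max_next - q)
weighted = (1 - alpha) * q + alpha * (reward + gamma * max_next)
assert abs(incremental - weighted) < 1e-12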
Q-table construction function:
function [ q_matrix ] = construct_q_matrix( N, actions )
    q_matrix = zeros(N*N, actions);
    q_matrix(N*N, 4) = -inf;                % bottom-right corner (also covered by the line below)
    q_matrix(1:N, 1) = -inf;                % first grid column: the -N step (left) leaves the grid
    q_matrix(N*(N-1)+1 : N*N, 4) = -inf;    % last grid column: the +N step (right) leaves the grid
    for i = 1 : N
        q_matrix(i*N, 2) = -inf;            % bottom row: the +1 step (down) leaves the grid
        q_matrix(i*N + 1, 3) = -inf;        % top row: the -1 step (up) leaves the grid; i = N writes one row past N*N
    end
    q_matrix = q_matrix(1:N*N, :);          % trim the extra row created by the loop above
end
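The -inf entries act as an action mask: a move off the grid can never win the argmax, and the exploration branch filters them out explicitly. The same boundary pattern for an N x N grid can be written more transparently; here is an equivalent sketch in Python (the function name masked_q_table is mine):

import numpy as np

def masked_q_table(N, actions=4):
    # action order as in the Python code above: 0 = up, 1 = down, 2 = left, 3 = right
    q = np.zeros((N * N, actions))
    for s in range(N * N):
        row, col = divmod(s, N)
        if row == 0:
            q[s, 0] = -np.inf   # top row: "up" leaves the grid
        if row == N - 1:
            q[s, 1] = -np.inf   # bottom row: "down" leaves the grid
        if col == 0:
            q[s, 2] = -np.inf   # left column: "left" leaves the grid
        if col == N - 1:
            q[s, 3] = -np.inf   # right column: "right" leaves the grid
    return q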
Maze (environment) construction function:
function [ maze ] = create_random_maze(N, r)
    maze_temp = zeros(N, N);
    for i = 1 : N
        for j = 1 : N
            if rand < 0.01 * r          % each cell becomes an obstacle with probability r%
                maze_temp(i, j) = 1;
            end
        end
    end
    maze = maze_temp;
end
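With r = 20 each cell becomes an obstacle independently with probability 0.2, so an obstacle can land on the start or goal cell; since stepping on an obstacle only costs a reward of -4 rather than blocking movement, training still converges. An equivalent NumPy one-liner (illustrative):

import numpy as np
maze = (np.random.rand(5, 5) < 0.2).astype(int)   # 1 marks an obstacle, about 20% of cells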
The MATLAB version mirrors the Python one, so the logic is largely the same and I will not explain it further.