In Q-Learning, the objective is to reach the goal state and collect the highest reward; once the goal state is reached, the total reward no longer changes, which is why the goal state is also called an absorbing state.
A Q-Learning agent does not know the environment as a whole; it only knows which actions can be chosen in its current state.
We therefore usually build an immediate reward matrix R, whose entry gives the reward for moving from state s to the next state s'.
From the immediate reward matrix R we compute the Q matrix, which guides the agent's actions.
The Q matrix is the agent's brain.
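As a minimal sketch (the 6-state size and the value conventions are taken from the example program later in this post), R and Q are simply two matrices of the same shape, with Q starting out as all zeros:

int    R[6][6];        // immediate rewards: -1 marks an impossible move, values >= 0 are rewards
double Q[6][6] = {};   // the agent's "brain", learned from R; all zeros before training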
Its core update rule is:

Q(s, a) = R(s, a) + γ · max_{a'} Q(s', a')

where s' is the next state reached by taking action a in state s, and γ is the discount factor.
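For example, with γ = 0.24 and the reward matrix R used in the program below, starting from an all-zero Q matrix, two consecutive updates work out as:

Q(1, 5) = R(1, 5) + 0.24 · max_{a'} Q(5, a') = 100 + 0.24 · 0 = 100
Q(3, 1) = R(3, 1) + 0.24 · max_{a'} Q(1, a') = 0 + 0.24 · 100 = 24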
The core of Q-Learning treats one episode as one training cycle: from an initial state until the terminal (absorbing) state is reached.
After one episode has been learned, the agent moves on to the next episode.
The outer loop of Q-Learning therefore iterates over episodes, while the inner loop iterates over the steps within an episode.
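A sketch of that loop structure is shown below. It is not the training routine used later in this post (which uses a simplified, step-count based loop), and N_STATES, GOAL, GAMMA and pick_random_action are illustrative names chosen here, not identifiers from the original program:

#include <cstdlib>
#include <ctime>

// Sketch of the canonical Q-learning loop: the outer loop runs episodes,
// the inner loop runs the steps of one episode until the absorbing goal
// state is reached. An "action" from state s is simply the index of the
// next state, as in the room example below.
const int    N_STATES = 6;
const int    GOAL     = 5;      // absorbing goal state
const double GAMMA    = 0.24;   // discount factor

int    R[N_STATES][N_STATES];       // immediate rewards, filled in by the caller
double Q[N_STATES][N_STATES] = {};  // learned action values, initially all zero

double max_q(int s)                 // max over a' of Q[s][a']
{
    double m = Q[s][0];
    for (int a = 1; a < N_STATES; ++a)
        if (Q[s][a] > m) m = Q[s][a];
    return m;
}

int pick_random_action(int s)       // any action a with R[s][a] >= 0
{                                   // (assumes every state has at least one allowed action)
    int a;
    do { a = std::rand() % N_STATES; } while (R[s][a] < 0);
    return a;
}

void q_learning(int n_episodes)
{
    std::srand((unsigned)std::time(NULL));
    for (int e = 0; e < n_episodes; ++e)             // outer loop: one episode
    {
        int s = std::rand() % N_STATES;              // random initial state
        while (s != GOAL)                            // inner loop: steps of the episode
        {
            int a = pick_random_action(s);
            Q[s][a] = R[s][a] + GAMMA * max_q(a);    // the update rule above
            s = a;                                   // taking action a moves the agent to state a
        }
    }
}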
Now suppose we want to solve the following problem: use Q-learning to guide a robot out of a room whose layout is encoded by the reward matrix R in the program below:
#include <iostream>
#include <vector>
#include <cstdlib>
#include <ctime>
using namespace std;
#define gamma 0.24   // discount factor used in the update rule
// Immediate reward matrix R: -1 marks an impossible move, non-negative
// entries are immediate rewards.
int R[6][6] = {
    {-1,  -1,  -1, -1,  0,  -1},
    {-1,   1,   1,  0, -1, 100},
    {-1,  -1, 100,  0, -1,  -1},
    {-1,   0,   0, -1,  0,  -1},
    { 0,   1,  -1,  0, -1, 100},
    {-1, 100, 100, -1,  0, 100}
};
// Q matrix, all zeros before training.
double Q_value[6][6] = {
    {0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0}
};
std::vector<std::vector<double> > Q;
// Copy the initial Q_value array into the vector-of-vectors Q.
void make_Q(std::vector<std::vector<double> >& Q)
{
    for (int i = 0; i < 6; ++i)
    {
        std::vector<double> tmp_v;
        for (int j = 0; j < 6; ++j)
        {
            tmp_v.push_back(Q_value[i][j]);
        }
        Q.push_back(tmp_v);
    }
}
// Largest Q value reachable from "state": max over a' of Q[state][a'].
double max_Q(int state)
{
    double max_Q = Q[state][0];
    for (int i = 1; i < 6; ++i)
    {
        if (Q[state][i] > max_Q) { max_Q = Q[state][i]; }
    }
    return max_Q;
}
// The update rule: Q(s, a) = R(s, a) + gamma * max over a' of Q(s', a'),
// where taking action "action" moves the agent into the state with that index.
double transfer_rule(int state, int action)
{
    return R[state][action] + gamma * max_Q(action);
}
//double train(int step_num,double& Q[6][6])
void train(int step_num, std::vector<std::vector<double> >& Q)
{
    srand((unsigned)time(NULL));                    // seed the RNG once, not on every step
    while (step_num)
    {
        int start_state = step_num % 5;             // always in 0..4, so never the goal state 5
        if (start_state != 5)
        {
            for (int i = 0; i < 6; ++i)
            {
                if (R[start_state][i] < 0) { continue; }   // skip impossible moves
                int r = rand() % 5;
                if (r % 2 == 0) { continue; }              // randomly skip some candidate moves
                Q[start_state][i] = transfer_rule(start_state, i);
                start_state = i;                           // the chosen move leads to the next state
                break;
            }
        }
        step_num--;
    }
}
// Unused helper left over from an earlier experiment with a raw 2-D pointer.
void train_0(int step_num, double **Q)
{
    Q[0][0] = 1.0;
}
// Print a 6x6 C-style array, one row per line.
template <typename DType>
void print_2D_array(DType arr[6][6])
{
    for (int i = 0; i < 6; ++i)
    {
        for (int j = 0; j < 6; ++j)
        {
            cout << arr[i][j] << ", ";
        }
        cout << ' ' << endl;
    }
    cout << ' ' << endl;
}

// Print the Q matrix stored as nested vectors, one row per line.
void print_2D_vec(std::vector<std::vector<double> > Q)
{
    for (int i = 0; i < 6; ++i)
    {
        for (int j = 0; j < 6; ++j)
        {
            cout << Q[i][j] << ", ";
        }
        cout << ' ' << endl;
    }
    cout << ' ' << endl;
}

int main(int argc, char const *argv[])
{
    /*std::vector<std::vector<int> > R;
    for (int i = 0; i < 6; ++i)
    {
        std::vector<int> tmp_v;
        for (int j = 0; j < 6; ++j)
        {
            tmp_v.push_back(reward[i][j]);
        }
        R.push_back(tmp_v);
    }*/
    //print_2D_vec(Q);
    make_Q(Q);                 // copy the initial (all-zero) Q values into the global Q
    print_2D_vec(Q);
    cout << "------------------------- after trained ---------------" << endl;
    train(25, Q);              // run 25 training steps
    print_2D_vec(Q);
    return 0;
}
Compile and run:
root@master:/Homework/c_and_point# g++ Q-learning.cpp -o ql
root@master:/Homework/c_and_point# ./ql
0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
------------------------- after trained ---------------
0, 0, 0, 0, 0, 0,
0, 1.24, 0, 0, 0, 0,
0, 0, 100.24, 0, 0, 0,
0, 0.24, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
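Once Q has been trained, the learned behaviour is read off greedily: in each state, take the action with the largest Q value until the goal state 5 is reached. A small helper along these lines (greedy_path is an illustrative name, not part of the program above, and it assumes Q has been trained well enough for the greedy walk to actually reach the goal) could be appended to the program:

// Follow the greedy policy encoded in the trained Q matrix, printing the
// sequence of states visited on the way to the goal state 5.
void greedy_path(const std::vector<std::vector<double> >& Q, int state)
{
    cout << state;
    int steps = 0;
    while (state != 5 && steps < 20)     // cap the walk in case Q is still incomplete
    {
        int best = 0;
        for (int a = 1; a < 6; ++a)
        {
            if (Q[state][a] > Q[state][best]) { best = a; }
        }
        state = best;
        cout << " -> " << state;
        ++steps;
    }
    cout << endl;
}

Calling, for example, greedy_path(Q, 2); after train(25, Q); would print the sequence of states the robot visits when it starts from room 2 and always follows the largest Q value.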