强化学习 Q-learning 中 Q 矩阵的基本更新公式如下:
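$$Q_{new}(state, action) = (1-\alpha)\,Q(state, action) + \alpha\left(R(state, action) + \gamma \max_{action_{next}} Q(state_{next}, action_{next})\right)$$
其中 alpha 为学习率,gamma 为折扣因子,max 表示对下一状态 state_next 的所有可选动作 action_next 取 Q 的最大值。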
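以 10×10 的 R 矩阵为例,代码实现如下(代码中取 alpha = 1,此时公式简化为 Q(state,action) = R(state,action) + gamma * max Q(state_next,action_next)):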
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Tabular Q-learning on a small, hard-coded reward matrix.
#
# To load the reward matrix from an Excel sheet instead of hard-coding it:
#   reward = np.array(pd.read_excel("Qdata.xls"))

# Reward matrix R[state, action], defined by the scenario.
#   -1  -> this (state, action) pair is skipped by the update loop below
#    0  -> pair is updated, no immediate reward
#  100  -> pair is updated with the large reward (presumably the goal)
# The action index is also used as the index of the next state, so the
# matrix has to be square.
reward = np.array([
    [-1, 0, -1, -1, 0, -1, -1, -1, -1, -1],
    [0, -1, -1, -1, -1, 0, -1, -1, -1, -1],
    [-1, -1, -1, -1, -1, -1, 0, -1, -1, 100],
    [-1, -1, -1, -1, -1, -1, 0, -1, -1, -1],
    [0, -1, -1, -1, -1, 0, -1, -1, -1, -1],
    [0, 0, -1, -1, 0, -1, 0, 0, -1, -1],
    [-1, -1, 0, 0, -1, 0, -1, 0, -1, -1],
    [-1, -1, -1, -1, -1, 0, 0, -1, 0, -1],
    [-1, -1, -1, -1, -1, -1, -1, 0, -1, 100],
    [-1, -1, 0, -1, -1, -1, -1, -1, 0, 100],
])

# Number of states (rows) and actions (columns).
rows, cols = reward.shape
# Q table, initialised to zero, one row per state.
Q_matrix = np.zeros((rows, rows))
print(rows, cols)

steps = 0
gamma = 0.8  # discount factor applied to the best Q value of the next state
alpha = 1.0  # learning rate; 1.0 overwrites the old estimate completely

while steps < 500:
    steps += 1
    # Each step starts from a uniformly random state.
    start_state = np.random.randint(0, rows)
    for action in range(cols):
        # Pairs marked -1 in the reward matrix are never updated.
        if reward[start_state, action] == -1:
            continue
        # Taking `action` moves to the state with the same index; use the
        # best Q value currently known for that state.
        max_next_q = Q_matrix[action].max()
        # Q <- (1 - alpha) * Q + alpha * (R + gamma * max Q(next state)).
        # The table is updated in place, so later actions of the same step
        # already see the values written by earlier ones.
        Q_matrix[start_state, action] = (
            (1 - alpha) * Q_matrix[start_state, action]
            + alpha * (reward[start_state, action] + gamma * max_next_q)
        )

# With gamma = 0.8 no Q value can exceed 100 / (1 - 0.8) = 500, so dividing
# by 5 shows the learned table on a 0-100 scale.
print(np.round(Q_matrix / 5))
原理参考网址:
https://blog.csdn.net/wei1033701020/article/details/54311113