从Q-learning到DQN的理解

先理解Q-Learning:https://www.jianshu.com/p/29db50000e3f

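Q-Learning中的Q表是根据Reward按贝尔曼方程更新的:$Q(s, a) = R(s, a) + \gamma \max_{a'} Q(s', a')$,其中 $s'$ 是在状态 $s$ 执行动作 $a$ 后到达的状态(本例中动作的编号就是下一个状态的编号),$\gamma$ 是衰减值。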

Python代码实现

import numpy as np
import random

# Reward matrix: r[s, a] is the reward for moving from state s to state a.
#    -1 -> the move is not allowed (no passage between the two states)
#     0 -> allowed move
#   100 -> allowed move that arrives at state 5
r = np.array([
    [-1, -1, -1, -1, 0, -1],
    [-1, -1, -1, 0, -1, 100],
    [-1, -1, -1, 0, -1, -1],
    [-1, 0, 0, -1, 0, -1],
    [0, -1, -1, 0, -1, 100],
    [-1, 0, -1, -1, 0, 100],
])

# Q-table with the same layout as r, initialised to zero.
q = np.zeros(r.shape, dtype=np.float32)

# Discount factor applied to the best value reachable from the next state.
gamma = 0.8

# Train the Q-table for 1000 iterations. Each iteration samples one random
# state and one random allowed move, then rewrites a single table entry.
for step in range(1, 1001):
    # randint is inclusive on both ends, so this draws from states 0..5.
    state = random.randint(0, 5)

    # Draws of state 5 still count as an iteration but update nothing,
    # so row 5 of the Q-table stays at zero.
    if state == 5:
        continue

    # The index of an allowed move is also the state it leads to.
    next_state_list = [i for i in range(6) if r[state, i] != -1]
    next_state = random.choice(next_state_list)

    # Bellman update: immediate reward plus the discounted best value
    # currently known for the state we land in.
    qval = r[state, next_state] + gamma * q[next_state].max()
    print(state, next_state, qval)
    q[state, next_state] = qval

# Show the trained Q-table.
print(q)

# Follow the Q-table greedily from 10 randomly chosen start states.
for i in range(10):
    print("第{}次验证".format(i + 1))
    state = random.randint(0, 5)
    print('机器人处于{}'.format(state))
    count = 0

    # State 5 ends the walk.
    while state != 5:
        # Give up once more than 20 moves have been made without reaching 5.
        if count > 20:
            print('fail')
            break

        # Only moves the reward matrix allows are candidates (-1 marks a move
        # that is not possible). Impassable entries keep their initial
        # Q-value of 0, so without this filter they could tie with a real
        # move whose value is also 0 and send the robot through a wall.
        allowed = [action for action in range(6) if r[state, action] != -1]
        if not allowed:
            # No way out of this state: report it instead of crashing.
            print('fail')
            break

        # Highest Q-value among the allowed moves.
        q_max = max(q[state, action] for action in allowed)

        # Break ties between equally good moves at random.
        q_max_action = [action for action in allowed if q[state, action] == q_max]
        next_state = random.choice(q_max_action)

        print("the robot goes to " + str(next_state) + '.')
        state = next_state
        count += 1

你可能感兴趣的:(强化学习)