A Python Implementation of the Q-learning Algorithm, with Complete Input/Output Test Data

Q-learning is a reinforcement learning algorithm for problems driven by an action-reward mechanism: the agent learns from rewards how valuable each action is in each state. Below is a simple Python implementation of Q-learning that solves a small maze problem.
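The update that the code applies at every step is the standard Q-learning rule: Q(s, a) ← (1 − α) · Q(s, a) + α · (r + γ · max_a' Q(s', a')), where α is the learning rate, γ the discount factor, r the immediate reward, and s' the state reached after taking action a in state s.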

import numpy as np

# Build an example maze, with numbers encoding the cell types
# 0 = open cell, 1 = obstacle, 9 = goal cell
maze = np.array([
    [0, 0, 0, 1],
    [1, 1, 0, 1],
    [0, 0, 0, 0],
    [1, 0, 9, 1]
])

# Initialize the Q-table with zeros: one row per maze cell, one column per action
q_table = np.zeros((maze.shape[0] * maze.shape[1], 4))

# Hyperparameters
alpha = 0.1    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.1  # exploration rate for the epsilon-greedy policy

# Actions: up, down, left, right
actions = ['up', 'down', 'left', 'right']

# Return the actions that keep the agent on the grid and off obstacle cells
# (the goal cell, marked 9, is also a legal destination)
def get_possible_actions(state):
    possible_actions = []
    if (state[0] > 0) and (maze[state[0] - 1, state[1]] != 1):
        possible_actions.append('up')
    if (state[0] < maze.shape[0] - 1) and (maze[state[0] + 1, state[1]] != 1):
        possible_actions.append('down')
    if (state[1] > 0) and (maze[state[0], state[1] - 1] != 1):
        possible_actions.append('left')
    if (state[1] < maze.shape[1] - 1) and (maze[state[0], state[1] + 1] != 1):
        possible_actions.append('right')
    return possible_actions

# Choose an action with an epsilon-greedy policy, restricted to legal moves
def choose_action(state):
    possible_actions = get_possible_actions(state)
    if np.random.uniform(0, 1) < epsilon:
        # explore: pick a random legal action
        return np.random.choice(possible_actions)
    # exploit: pick the legal action with the highest Q-value
    state_idx = np.ravel_multi_index(state, maze.shape)
    q_values = q_table[state_idx][[actions.index(action) for action in possible_actions]]
    return possible_actions[np.argmax(q_values)]

# Apply the Q-learning update rule to one (state, action) entry of the Q-table
def update_q_table(state, action, reward, new_state):
    state_idx = np.ravel_multi_index(state, maze.shape)
    new_state_idx = np.ravel_multi_index(new_state, maze.shape)
    action_idx = actions.index(action)
    max_future_q = np.max(q_table[new_state_idx])
    current_q = q_table[state_idx][action_idx]
    new_q = (1 - alpha) * current_q + alpha * (reward + gamma * max_future_q)
    q_table[state_idx][action_idx] = new_q

# Main Q-learning training loop
for episode in range(1000):
    # start every episode at the first open cell of the maze (the top-left corner)
    state = np.where(maze == 0)
    state = (state[0][0], state[1][0])
    done = False
    while not done:
        action = choose_action(state)
        if action == 'up':
            new_state = (state[0] - 1, state[1])
        elif action == 'down':
            new_state = (state[0] + 1, state[1])
        elif action == 'left':
            new_state = (state[0], state[1] - 1)
        else:
            new_state = (state[0], state[1] + 1)
        
        # +10 for reaching the goal (episode ends), -1 for every other step
        if maze[new_state] == 9:
            reward = 10
            done = True
        else:
            reward = -1
        
        update_q_table(state, action, reward, new_state)
        state = new_state

# Print the trained Q-table
print("Trained Q-table:")
print(q_table)

This code implements a simple Q-learning algorithm for a simplified maze problem. The maze in this example is a small matrix in which 0 marks an open cell, 1 an obstacle, and 9 the goal. The core of the algorithm is to choose an action for the current state with an epsilon-greedy policy and then update the Q-table with the observed reward. In real problems, the definition of states and actions and the way the environment is modeled may differ.
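The title promises input/output test data; one simple way to check the trained output is to follow the greedy (highest-Q) policy stored in the Q-table from the start cell and verify that it reaches the goal. The sketch below does that. It assumes the maze, actions, q_table, and get_possible_actions defined above are already in scope; the helper name greedy_path is an illustrative addition, not part of the original code.

# A minimal sketch: follow the greedy policy encoded in the trained Q-table.
# Assumes maze, actions, q_table and get_possible_actions from the code above;
# greedy_path is an illustrative helper, not part of the original listing.
def greedy_path(start, max_steps=50):
    state = start
    path = [state]
    for _ in range(max_steps):
        if maze[state] == 9:  # stop once the goal cell has been reached
            break
        possible = get_possible_actions(state)
        state_idx = np.ravel_multi_index(state, maze.shape)
        q_values = [q_table[state_idx][actions.index(a)] for a in possible]
        best = possible[int(np.argmax(q_values))]
        if best == 'up':
            state = (state[0] - 1, state[1])
        elif best == 'down':
            state = (state[0] + 1, state[1])
        elif best == 'left':
            state = (state[0], state[1] - 1)
        else:
            state = (state[0], state[1] + 1)
        path.append(state)
    return path

print("Greedy path from (0, 0):", greedy_path((0, 0)))

With the 4x4 maze above, a successful training run should produce a path that ends at the goal cell (3, 2), for example (0, 0) → (0, 1) → (0, 2) → (1, 2) → (2, 2) → (3, 2).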
