Hindsight Experience Replay (HER) is a reinforcement learning technique designed to address the sparse-reward problem, and it is particularly suited to goal-oriented tasks (for example, robot control or object grasping). Its core idea is that even when an episode fails at the original task, the agent can still extract useful learning signal by reconstructing the goal and the reward in hindsight.
Goal-oriented tasks: these tasks have an explicit goal, such as grasping an object or reaching a specific position. Reward is usually given only when the goal is achieved, so the success signal can be very sparse.
Hindsight: even if the task fails, we can pretend the agent accomplished a different, "imagined" goal. For example, in a robotic grasping task, even when the robot misses the object, we can relabel its goal as the position it actually reached and assign a reward with respect to that goal (a small code sketch of this relabeling follows below).
Experience replay: conventional experience replay stores the agent's interactions with the environment for later training. HER builds on this by relabeling the goal in stored transitions and generating new experience samples, which improves sample efficiency.
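To make the relabeling idea concrete, here is a minimal sketch of how a single failed transition can be turned into a "successful" one. The tuple layout, the compute_reward function, and the eps threshold are illustrative assumptions, not part of any particular library.

import numpy as np

def compute_reward(achieved_goal, desired_goal, eps=0.05):
    # Assumed sparse, goal-conditioned reward: 0 on success, -1 otherwise
    return 0.0 if np.linalg.norm(np.asarray(achieved_goal) - np.asarray(desired_goal)) < eps else -1.0

def relabel_transition(state, action, next_state, original_goal):
    # Hindsight: pretend the state actually reached was the goal all along
    new_goal = np.asarray(next_state).copy()
    new_reward = compute_reward(next_state, new_goal)  # now evaluates as a success (reward 0)
    return (state, action, new_reward, next_state, new_goal)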
Collect experience: the agent interacts with the environment and records a transition at each time step: (state, action, reward, next state, done).
Goal relabeling: after an episode finishes (whether or not it succeeded), the agent can apply hindsight to the failed experience, i.e., replace the episode's original goal with a goal it actually achieved, such as the final state it reached (see the episode-level sketch after these steps).
Experience replay: the transitions relabeled with the "imagined" goal are added to the replay buffer; the agent later samples from this buffer to train and update its policy.
Repeat: the agent repeats this process, continually improving sample efficiency through hindsight relabeling and experience replay, and eventually learns a more stable policy.
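As a compact illustration of how these steps fit together for one episode, the sketch below reuses the hypothetical compute_reward helper from the earlier sketch and the "final" relabeling strategy, which takes the last achieved state of the episode as the new goal. The her_augment_episode name and the plain-list replay_buffer are assumptions for illustration.

def her_augment_episode(episode, replay_buffer):
    # episode is a list of (state, action, reward, next_state, goal) tuples
    final_achieved = episode[-1][3]  # the state actually reached at the end of the episode
    for state, action, reward, next_state, goal in episode:
        # 1) store the original transition
        replay_buffer.append((state, action, reward, next_state, goal))
        # 2) store a hindsight copy, relabeled and re-rewarded against the achieved goal
        new_reward = compute_reward(next_state, final_achieved)
        replay_buffer.append((state, action, new_reward, next_state, final_achieved))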
In summary, Hindsight Experience Replay (HER) improves the efficiency of a reinforcement learning agent by replaying "failed" experience and extracting a useful learning signal from it. By relabeling the goals of failed episodes, the agent gets many more learning opportunities in sparse-reward tasks, which accelerates learning. This makes HER a powerful technique for goal-oriented tasks, especially those with diverse goals and sparse rewards.
import gym
from gym import spaces
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
# Simple environment definition
class SimpleEnv(gym.Env):
    def __init__(self):
        super(SimpleEnv, self).__init__()
        self.observation_space = spaces.Box(low=np.array([-10, -10]), high=np.array([10, 10]), dtype=np.float32)
        self.action_space = spaces.Discrete(2)  # 0: left, 1: right
        self.state = np.array([0.0, 0.0])  # [position, velocity]

    def reset(self):
        self.state = np.array([0.0, 0.0])
        return self.state

    def step(self, action):
        x, v = self.state
        v += 0.1 if action == 1 else -0.1  # accelerate right or left
        x += v  # update the position
        done = abs(x) > 5.0  # episode ends once the position passes the threshold
        reward = 1.0 if not done else -1.0  # +1 per step in bounds, -1 on termination
        self.state = np.array([x, v])
        return self.state, reward, done, {}

    def render(self):
        print(f"Position: {self.state[0]}, Velocity: {self.state[1]}")
# Q-network model
class QNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)
# HER agent class
class HER:
    def __init__(self, env, replay_buffer_size=10000, batch_size=64, gamma=0.99, tau=0.1, learning_rate=1e-3):
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.replay_buffer = deque(maxlen=replay_buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        # Q-network and target Q-network
        self.q_network = QNetwork(self.state_dim, self.action_dim).float()
        self.target_q_network = QNetwork(self.state_dim, self.action_dim).float()
        self.target_q_network.load_state_dict(self.q_network.state_dict())
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)

    def select_action(self, state, epsilon=0.1):
        if random.random() < epsilon:
            return self.env.action_space.sample()  # random exploratory action
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        q_values = self.q_network(state_tensor)
        return torch.argmax(q_values, dim=1).item()  # greedy action with the highest Q-value

    def store_transition(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def sample_batch(self):
        batch = random.sample(self.replay_buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            torch.FloatTensor(np.array(states)),
            torch.LongTensor(actions),
            torch.FloatTensor(rewards),
            torch.FloatTensor(np.array(next_states)),
            torch.FloatTensor(dones)
        )
    def update(self):
        states, actions, rewards, next_states, dones = self.sample_batch()
        with torch.no_grad():
            next_q_values = self.target_q_network(next_states)
            next_q_value = next_q_values.max(dim=1)[0]
            target_q_value = rewards + self.gamma * next_q_value * (1 - dones)
        q_values = self.q_network(states)
        current_q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        loss = nn.MSELoss()(current_q_value, target_q_value)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # Soft update of the target Q-network
        for target_param, param in zip(self.target_q_network.parameters(), self.q_network.parameters()):
            target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)

    def her_replay(self, n_sample=4):
        # Generate HER transitions from stored experience
        for _ in range(n_sample):
            idx = random.randint(0, len(self.replay_buffer) - 1)
            state, action, reward, next_state, done = self.replay_buffer[idx]
            # Simplified relabeling: treat the second state element (the velocity) as the goal
            goal = next_state[1]
            her_state = np.copy(state)
            her_state[1] = goal  # replace the velocity with the "goal" velocity
            # Under the relabeled goal this transition counts as a success, so reward it accordingly
            her_reward = 1.0
            # Store the relabeled HER transition
            self.store_transition(her_state, action, her_reward, next_state, done)
# Training function
def train_her(env, agent, episodes=1000, epsilon=0.1):
    for episode in range(episodes):
        state = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = agent.select_action(state, epsilon)
            next_state, reward, done, _ = env.step(action)
            agent.store_transition(state, action, reward, next_state, done)
            total_reward += reward
            state = next_state
            # Update the Q-network
            if len(agent.replay_buffer) >= agent.batch_size:
                agent.update()
            # HER replay
            if len(agent.replay_buffer) >= agent.batch_size:
                agent.her_replay()
        print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}")

# Create and train the agent
env = SimpleEnv()  # use the custom simple environment
agent = HER(env)  # create the HER agent
train_her(env, agent)  # train the agent