Deep Q-Network (DQN) is an important reinforcement learning algorithm proposed by Google DeepMind in 2013. DQN combines Q-learning with deep learning: it uses a neural network to approximate the Q-value function, which addresses the scalability problems that traditional tabular Q-learning runs into in high-dimensional state spaces.
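At its core, DQN trains the network to match a bootstrapped temporal-difference (TD) target. For a transition (s, a, r, s'), the target is

y = r + γ · max_{a'} Q_target(s', a')    (y = r if the episode terminated at s')

where Q_target is a separate target network whose weights are periodically copied from the online Q-network, and the training loss is the mean squared error between Q(s, a) and y. The update_network() method in the code below computes exactly this loss.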
Below is a simplified Python implementation of the DQN algorithm using PyTorch:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import gym

# Define the Q-network: a small MLP that maps a state to one Q-value per action
class QNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(QNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        return self.fc(x)
# Define the DQN agent
class DQNAgent:
    def __init__(self, state_dim, action_dim, lr=0.001, gamma=0.99, epsilon=1.0,
                 epsilon_decay=0.995, min_epsilon=0.01, buffer_size=10000, batch_size=64):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        self.batch_size = batch_size
        self.q_network = QNetwork(state_dim, action_dim)
        self.target_network = QNetwork(state_dim, action_dim)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.replay_buffer = deque(maxlen=buffer_size)
        self.criterion = nn.MSELoss()

    def choose_action(self, state):
        # Epsilon-greedy: explore with probability epsilon, otherwise act greedily
        if random.random() < self.epsilon:
            return random.randint(0, self.action_dim - 1)
        else:
            state = torch.tensor(np.array([state]), dtype=torch.float32)
            with torch.no_grad():
                q_values = self.q_network(state)
            return torch.argmax(q_values).item()

    def store_experience(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def update_network(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        batch = random.sample(self.replay_buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.tensor(np.array(states), dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)
        # Q(s, a) for the actions actually taken
        q_values = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        # Bootstrapped TD target from the target network; no gradients flow through it
        with torch.no_grad():
            next_q_values = self.target_network(next_states).max(1)[0]
            target_q_values = rewards + self.gamma * next_q_values * (1 - dones)
        loss = self.criterion(q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        self.target_network.load_state_dict(self.q_network.state_dict())

    def decay_epsilon(self):
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)
# Create the environment (this example assumes the classic pre-0.26 gym API,
# where reset() returns only the observation and step() returns 4 values)
env = gym.make('CartPole-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Initialize the agent
agent = DQNAgent(state_dim, action_dim)

# Training loop
episodes = 1000
for episode in range(episodes):
    state = env.reset()
    total_reward = 0
    done = False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.store_experience(state, action, reward, next_state, done)
        agent.update_network()
        state = next_state
        total_reward += reward
    # Sync the target network and anneal exploration once per episode
    agent.update_target_network()
    agent.decay_epsilon()
    print(f'Episode {episode + 1}/{episodes}, Total Reward: {total_reward}, Epsilon: {agent.epsilon:.3f}')
# Evaluation: act greedily (no exploration) and render one episode
agent.epsilon = 0.0
state = env.reset()
done = False
while not done:
    action = agent.choose_action(state)
    state, _, done, _ = env.step(action)
    env.render()
env.close()
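Note: the loops above assume the pre-0.26 gym API. If you run this with gym >= 0.26 or with gymnasium (the maintained successor), reset() returns an (observation, info) pair and step() returns five values. A minimal sketch of the adapted episode loop, assuming gymnasium is installed, would look like this:

# Sketch: adapting the episode loop to the gymnasium API (assumes gymnasium is installed)
import gymnasium as gym

env = gym.make('CartPole-v1')
state, info = env.reset()
done = False
while not done:
    action = agent.choose_action(state)
    next_state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated  # episode ends on either condition
    agent.store_experience(state, action, reward, next_state, done)
    agent.update_network()
    state = next_state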