DQN Implementation in Python

1 PyTorch

import gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
from collections import deque

class DQNAgent:
    def __init__(self, state_size, action_size, memory_size=100000, batch_size=32, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, lr=0.001):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.lr = lr
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.q_network = self.build_model().to(self.device)
        self.target_network = self.build_model().to(self.device)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)

    def build_model(self):
        model = nn.Sequential(
            nn.Linear(self.state_size, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, self.action_size)
        )
        return model

    def update_target_network(self):
        self.target_network.load_state_dict(self.q_network.state_dict())

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def choose_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)
        state = torch.from_numpy(state).float().to(self.device)
        with torch.no_grad():
            q_values = self.q_network(state)
        return q_values.argmax().item()

    def learn(self):
        if len(self.memory) < self.batch_size:
            return
        minibatch = random.sample(self.memory, self.batch_size)
        states = torch.from_numpy(np.vstack([x[0] for x in minibatch])).float().to(self.device)
        actions = torch.from_numpy(np.array([x[1] for x in minibatch])).long().to(self.device)
        rewards = torch.from_numpy(np.array([x[2] for x in minibatch])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([x[3] for x in minibatch])).float().to(self.device)
        dones = torch.from_numpy(np.array([x[4] for x in minibatch]).astype(np.uint8)).float().to(self.device)
        # Q-values of the actions actually taken, Q(s_i, a_i)
        q_values = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        # TD target from the target network: r + gamma * (1 - done) * max_a Q_target(s', a)
        next_q_values = self.target_network(next_states).max(1)[0]
        expected_q_values = rewards + (1 - dones) * self.gamma * next_q_values
        loss = F.mse_loss(q_values, expected_q_values.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_epsilon(self):
        self.epsilon = max(self.epsilon * self.epsilon_decay, 0.01)

    def save_model(self, filename):
        torch.save(self.q_network.state_dict(), filename)

    def load_model(self, filename):
        self.q_network.load_state_dict(torch.load(filename))

def train(env, agent, episodes, max_steps):
    scores = deque(maxlen=100)
    for i_episode in range(1, episodes + 1):
        state = env.reset()  # classic Gym API (< 0.26): reset() returns only the observation
        score = 0
        for t in range(max_steps):
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            score += reward
            agent.learn()
            if done:
                break
        scores.append(score)
        agent.update_epsilon()
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores)), end="")
        if len(scores) == 100 and np.mean(scores) >= 200:
            print('\nEnvironment solved in {} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores)))
            agent.save_model('dqn.pth')
            break
        if i_episode % 20 == 0:
            agent.update_target_network()

    return scores

env = gym.make('CartPole-v1')
agent = DQNAgent(state_size=env.observation_space.shape[0], action_size=env.action_space.n)
scores = train(env, agent, episodes=1000, max_steps=1000)

In the code above, the train() function trains the agent using the given DQNAgent object and an OpenAI Gym environment (the code assumes the classic Gym API, where reset() returns only the observation and step() returns four values). After training, the function returns a deque holding the scores of the most recent 100 episodes. During training, the target network is synchronized with the Q-network every fixed number of episodes, and the exploration rate ε decays at the specified rate after each episode.
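
For reference, the TD target that learn() regresses the Q-network toward is the standard DQN target computed from the target network:

$$ y_i = r_i + \gamma \,(1 - d_i)\, \max_{a'} Q_{\text{target}}(s'_i, a') $$

where $d_i$ is the done flag of transition $i$. In the code, expected_q_values is exactly this quantity, and the loss is the mean squared error between $Q(s_i, a_i)$ and $y_i$.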

This trains an agent in the CartPole-v1 environment for up to 1000 episodes, with at most 1000 steps per episode. If the average score over the last 100 episodes reaches 200, training stops early and the current model weights are saved to a file named dqn.pth.
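
As a minimal sketch of how the saved weights could be reused (assuming the same classic Gym API and network layout as above; the variable names are illustrative), one can load dqn.pth into a fresh agent and run a single greedy evaluation episode:

# Minimal evaluation sketch (assumes the classic Gym API used above)
eval_agent = DQNAgent(state_size=env.observation_space.shape[0], action_size=env.action_space.n)
eval_agent.load_model('dqn.pth')
eval_agent.epsilon = 0.0  # act greedily, no exploration

state = env.reset()
total_reward, done = 0.0, False
while not done:
    action = eval_agent.choose_action(state)
    state, reward, done, _ = env.step(action)
    total_reward += reward
print('Evaluation reward: {:.1f}'.format(total_reward))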

2 TensorFlow

import tensorflow as tf
import numpy as np
import random
from collections import deque

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # bounded replay buffer (capacity is an arbitrary choice)
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(tf.keras.layers.Dense(24, activation='relu'))
        model.add(tf.keras.layers.Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.model.predict(next_state, verbose=0)[0])
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


Example run

import gym

# Define the environment and agent
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

# Train the agent
batch_size = 32
num_episodes = 1000
for e in range(num_episodes):
    state = env.reset()  # classic Gym API: reset() returns only the observation
    state = np.reshape(state, [1, state_size])
    score = 0
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        score += reward
    print("episode: {}/{}, score: {}".format(e, num_episodes, score))
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)
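
The save and load methods defined on the agent are not used in the loop above. As a small usage sketch (the file name is illustrative, and a TF2-era Keras is assumed), the weights could be written out once training finishes and later restored into a freshly constructed agent:

# Save the trained weights (file name is illustrative)
agent.save('dqn_cartpole.weights.h5')

# Later, restore them into a new agent and act mostly greedily
new_agent = DQNAgent(state_size, action_size)
new_agent.load('dqn_cartpole.weights.h5')
new_agent.epsilon = new_agent.epsilon_min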
