DQN Pytorch示例

智能体是一个字母o,它位于许多_之间,要达到的目的是确保o的两侧始终都有_(即o不会从两端掉出去)。这需要让o能够向左右两边移动,而且移动速度要略快于无动作时的自然移动速度,看起来就像下面那样。这是一种很简单的情形。

pytorch版本:1.11.0+cu113

代码

因为每次初始化都一样,会产生很多重复的数据,故先定义一个可哈希(hashable)的数据结构,便于存放在集合中去重。

class Data:
    """Hashable record of one transition ``(state, action, reward, next_state)``.

    The three sequence fields are frozen into tuples so that instances can be
    stored in a ``set``; identical transitions therefore collapse into one
    entry. The accessor properties hand back fresh lists, so mutating a
    returned list never affects the stored record.

    Args:
        s: State before the action.
        a: Action encoding (one entry per available action).
        r: Reward obtained for the transition.
        s_: State after the action.
    """

    def __init__(self, s: 'list[float]|tuple[float]', a: 'list[float]|tuple[float]', r: float, s_: 'list[float]|tuple[float]') -> None:
        self.tuple_ = (tuple(s), tuple(a), r, tuple(s_))

    @property
    def state(self):
        """State before the action, as a new list."""
        return list(self.tuple_[0])

    @property
    def action(self):
        """Action encoding, as a new list."""
        return list(self.tuple_[1])

    @property
    def reward(self):
        """Reward stored for this transition."""
        return self.tuple_[2]

    @property
    def next_state(self):
        """State after the action, as a new list."""
        return list(self.tuple_[3])

    def __eq__(self, __o: object) -> bool:
        # Returning NotImplemented lets Python fall back to its default,
        # which makes ``Data(...) == <other type>`` evaluate to False.
        if not isinstance(__o, Data):
            return NotImplemented
        return self.tuple_ == __o.tuple_

    def __ne__(self, __o: object) -> bool:
        # A Data instance is never equal to an object of another type, so
        # ``!=`` must not answer False here; NotImplemented makes Python
        # fall back to an identity comparison, which yields True.
        if not isinstance(__o, Data):
            return NotImplemented
        return self.tuple_ != __o.tuple_

    def __hash__(self) -> int:
        # Must agree with __eq__: both are derived from the same tuple.
        return hash(self.tuple_)

打印进度条,console里面会显示不同颜色的空格

import math
def print_bar(epoch, epochs, step, batch_size, etc=None, bar_size=50):
    """Print a one-line progress bar, overwriting the previous one.

    Args:
        epoch: Current epoch number shown in the ``Epoch a/b`` label.
        epochs: Total epoch count shown in the label.
        step: Progress numerator; when non-zero the cursor is first moved up
            one line so the bar is redrawn in place.
        batch_size: Progress denominator.
        etc: Optional extra object; its ``str()`` is appended after the bar.
        bar_size: Width of the bar in character cells.
    """
    filled = math.ceil(bar_size * step / batch_size)
    pieces = []
    if step:
        # ANSI "cursor up" so this line replaces the one printed before.
        pieces.append("\033[A")
    pieces.append(f"Epoch {epoch}/{epochs}")
    # Filled cells are spaces drawn with a white background.
    pieces.append(f" |\033[1;30;47m{' ' * filled}\033[0m{' ' * (bar_size - filled)}| ")
    if etc is not None:
        pieces.append(str(etc))
    # Trailing spaces blank out leftovers from a longer previous line.
    print("".join(pieces) + "    ")

实现能够copy参数的容器

from collections import OrderedDict, abc

from torch import Tensor, nn
from torch import optim as optim


class Model(nn.Module):
    """Feed-forward stack of ``(layer, activation)`` pairs.

    Besides the usual forward pass, the class can blend its weights with those
    of another ``Model`` (soft update) and can produce an independent copy of
    itself.

    Args:
        layers: Sequence of ``(module, activation)`` pairs; ``activation`` is a
            callable applied to the module's output, or ``None`` for no
            activation.
        device: Optional device the layers are moved to.

    Note:
        ``load_state_dict`` deliberately shadows ``nn.Module.load_state_dict``
        with a different signature (it takes a ``Model``, not a state dict).
    """

    def __init__(self, layers: 'list[tuple[nn.Module, abc.Callable | None]]', device=None):
        super().__init__()
        self.layers = [layer for layer, _ in layers]
        # ModuleList registers the layers so parameters()/to() can see them.
        self.module_list = nn.ModuleList(self.layers)
        if device is not None:
            self.module_list.to(device)
        self.activations = [activation for _, activation in layers]
        self.deep = len(self.layers)

    def forward(self, x: Tensor) -> Tensor:
        """Apply every layer, followed by its activation when one is set."""
        out = x
        for layer, activation in zip(self.module_list, self.activations):
            out = layer(out)
            # Compare with None: truthiness of a callable is not a reliable
            # "is there an activation" test (e.g. an empty nn.Sequential is
            # falsy).
            if activation is not None:
                out = activation(out)
        return out

    def load_state_dict(self, model: 'Model', rate: float = .5):
        """Move this model's weights towards those of ``model``.

        Args:
            model: Source model with the same layer structure.
            rate: Blend factor. ``rate >= 1`` copies the source weights
                verbatim; otherwise every floating-point entry becomes
                ``own * (1 - rate) + source * rate``.
        """
        for own, other in zip(self.layers, model.layers):
            source = other.state_dict()
            if rate >= 1.:
                own.load_state_dict(source)
                continue
            blended = OrderedDict()
            for key, value in own.state_dict().items():
                incoming = source[key].to(value.device)
                if value.is_floating_point():
                    blended[key] = value * (1 - rate) + incoming * rate
                else:
                    # Integer buffers (e.g. step counters) cannot be blended
                    # meaningfully; take the source value as-is.
                    blended[key] = incoming
            own.load_state_dict(blended)

    def copy(self) -> 'Model':
        """Return a new ``Model`` with independent, identically valued weights.

        Each layer is deep-copied. Reusing the layer objects would make the
        "copy" share parameters with the original, so updating one network
        would silently update the other. Activations are shared as-is.
        """
        from copy import deepcopy

        return Model([
            (deepcopy(layer), activation)
            for layer, activation in zip(self.layers, self.activations)
        ])

训练主要部分

# -*- coding: utf-8 -*-
from datetime import datetime
import random, time, torch, os

from torch import cuda, device, nn, optim, Tensor
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid that makes
# CUDA kernel launches synchronous (slower) — confirm it is still wanted here.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# Device on which simulation and training tensors are created: CUDA when
# available, otherwise the CPU.
GPU = device("cuda" if cuda.is_available() else "cpu")

def draw(length, location, new_line: bool = False, item=None):
    """Print the platform as a row of ``_`` with the agent ``o`` on it.

    Args:
        length: Number of cells of the platform.
        location: Agent position; truncated to an int cell index.
        new_line: When False (default) the cursor is first moved up one line
            so the previous frame is overwritten; when True a new line is
            started.
        item: Extra object whose ``str()`` is printed after the platform.
    """
    cell = int(location)
    # Clip the platform itself to `length` cells *before* adding the escape
    # sequence; slicing afterwards would count the three escape characters
    # and cut off the right-hand end of the platform.
    bar = ("_" * cell + "o" + "_" * (length - cell - 1))[:length]
    prefix = "" if new_line else "\033[A"
    print(prefix + bar + f" {str(item)} ")


def env(length, location, speed, force=0) -> 'tuple[int,int,bool]':
    """Advance the environment by one step.

    The agent is pushed away from the centre of the platform: the drift is
    -1 left of the centre, +1 right of it, and a random value in {-1, 0, 1}
    exactly at the centre.

    Args:
        length: Platform length.
        location: Current position.
        speed: Current speed.
        force: Speed change contributed by the chosen action.

    Returns:
        ``(new_location, new_speed, living)``. ``new_location`` is the current
        position plus the *current* speed; ``living`` is evaluated on the
        current (pre-move) position.
    """
    midpoint = length / 2
    if location > midpoint:
        drift = 1
    elif location < midpoint:
        drift = -1
    else:
        drift = random.randint(-1, 1)
    off_platform = location < 0 or location >= length
    return location + speed, speed + drift + force, not off_platform


def init_data(width: int) -> 'tuple[int,int]':
    """Return the starting ``(location, speed)``: platform centre, at rest."""
    start_location = int(width / 2)
    start_speed = 0
    return start_location, start_speed


def make_state(location, width, speed, speed_scale) -> 'list[float]':
    """Build the network input from the raw position and speed.

    Returns:
        ``[position, velocity]`` where position is the offset from the
        platform centre scaled so the two ends map to -1 and +1, and velocity
        is ``speed`` divided by ``speed_scale``.
    """
    offset_from_centre = location - width / 2
    scaled_position = offset_from_centre / width * 2
    scaled_speed = speed / speed_scale
    return [scaled_position, scaled_speed]


def simulate(model: Model, batch_size: int, width: int, speed_scale: int, action_list: 'list', epsilon: float, reward_range: float = .7):
    """Run the environment for ``batch_size`` steps and collect transitions.

    Actions are chosen epsilon-greedily: with probability ``epsilon`` a random
    action is taken, otherwise the action with the highest model output.
    Whenever the agent leaves the platform the episode is restarted from the
    initial state.

    Args:
        model: Network mapping a state to one value per action.
        batch_size: Number of environment steps to simulate.
        width: Platform width.
        speed_scale: Divisor used to normalise the speed in the state.
        action_list: Force applied by each action, indexed by action number.
        epsilon: Exploration probability.
        reward_range: Upper bound (as a fraction of ``width``) of the central
            band that earns a reward of 1; the lower bound is
            ``1 - reward_range``. Must be greater than 0.5.

    Returns:
        ``(cache, score)`` where ``cache`` is a set of ``Data`` transitions
        and ``score`` is in (0, 1]; it is 1 only if the agent never fell off.

    Raises:
        ValueError: If ``reward_range <= 0.5`` (the reward band would be
            empty).
    """
    if reward_range <= .5:
        raise ValueError('The arg reward_range cannot leq to .5')
    action_count = len(action_list)
    location, speed = init_data(width)
    cache = set()
    live_time = 0
    score = 0
    # Score reached when the agent survives every step: 1 + 2 + ... + n.
    max_score = max(1, batch_size*(batch_size+1)/2)
    for _ in range(batch_size):
        state = make_state(location, width, speed, speed_scale)
        if random.random() <= epsilon:  # explore
            action_index = random.randint(0, action_count-1)
        else:  # exploit
            with torch.no_grad():  # pure inference, no graph needed
                q_values = model(torch.tensor(data=state, dtype=torch.float32, device=GPU))
            # Convert to a plain int so the stored transition stays hashable
            # by value and duplicates collapse in the set.
            action_index = int(torch.argmax(q_values).item())
        # Look the force up exactly once; indexing action_list with the force
        # itself would select the wrong entry.
        force = action_list[action_index]
        location_, speed_, living = env(width, location, speed, force)  # compute s_{t+1}
        # Reward is given for being in the central band, not merely for
        # staying on the platform.
        reward = 1. if (1-reward_range) < location/width < reward_range else 0.
        cache.add(Data(
            state,
            [float(action_index == i) for i in range(action_count)],  # one-hot action
            reward,
            make_state(location_, width, speed_, speed_scale),
        ))  # fill the replay pool
        live_time += 1
        score += live_time
        if living:
            location, speed = location_, speed_  # s = s_{t+1}
        else:
            # Restart the episode. The survival counter restarts too,
            # otherwise the score would always equal max_score.
            live_time = 0
            location, speed = init_data(width)
    return cache, score/max_score


def train(width: int, speed_scale: int, action_list: 'tuple|list', model: Model, optimizer: optim.Optimizer,  loss_func: nn.modules.loss._Loss, epochs: int, batch_size: int, gamma: float = .1, epsilon: float = .1, soft_update_rate=.1, target_accuracy=.99) -> 'Model':
    """Train a DQN with a replay pool and a softly updated target network.

    Args:
        width: Platform width passed to the simulation.
        speed_scale: Speed normaliser passed to the simulation.
        action_list: Force applied by each action.
        model: Network to train. ``optimizer`` must have been built over
            ``model.parameters()``, because ``model`` itself is used as the
            policy network that receives the gradients.
        optimizer: Optimizer stepping ``model``'s parameters.
        loss_func: Loss between predicted and target action values.
        epochs: Maximum number of iterations.
        batch_size: Simulation steps per iteration and replay batch size.
        gamma: Weight of the bootstrapped next-state value; the immediate
            reward is weighted by ``1 - gamma``.
        epsilon: Exploration probability during simulation.
        soft_update_rate: Blend factor used to move the target network
            towards the policy network after every step.
        target_accuracy: Training stops once the simulation score reaches
            this value.

    Returns:
        The target network.
    """
    # The optimizer is bound to `model`, so `model` must be the network the
    # loss is back-propagated through; the copy serves as the target network.
    policy_net = model
    target_net = model.copy()
    policy_net.train(mode=True)
    target_net.train(mode=False)  # the target network is never trained directly
    cache = set()
    for epoch in range(epochs):
        cache_, accuracy = simulate(model=target_net, batch_size=batch_size, speed_scale=speed_scale, width=width, action_list=action_list, epsilon=epsilon)
        cache |= cache_
        real_batch_size = min(batch_size, len(cache))
        # random.sample needs a sequence; sets are rejected on Python >= 3.11.
        batch = random.sample(list(cache), real_batch_size)
        # Convert the sampled transitions to tensors.
        states = torch.tensor([data.state for data in batch], dtype=torch.float32, device=GPU)
        states = states + torch.randn_like(states)*.02  # small input noise
        actions = torch.tensor([data.action for data in batch], dtype=torch.float32, device=GPU)
        rewards = torch.tensor([[data.reward] for data in batch], dtype=torch.float32, device=GPU)
        state_nexts = torch.tensor([data.next_state for data in batch], dtype=torch.float32, device=GPU)
        with torch.no_grad():
            # Best achievable value in the next state, shape (batch, 1).
            next_best = target_net(state_nexts).max(dim=-1, keepdim=True).values
        # gamma controls the share of the bootstrapped value versus the
        # immediate reward. Multiplying by the one-hot `actions` keeps only
        # the entry of the action actually taken, matching y_hat below.
        y = (next_best * gamma + rewards * (1 - gamma)) * actions
        y_hat = policy_net(states) * actions  # unselected actions become 0
        loss = loss_func(y_hat, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        target_net.load_state_dict(policy_net, soft_update_rate)
        print_bar(epoch, epochs, epoch, epochs, ('{:.10f}'.format(loss.item()), '{:.10f}'.format(accuracy),))
        if accuracy >= target_accuracy:  # score reached the goal: training done
            break
    return target_net


if __name__ == "__main__":
    # Script entry point: train the network, then replay 200 steps of the
    # greedy policy as a console animation.
    WIDTH = 30  # platform width
    SPEED_SCALE = 8  # speed normaliser (speed sensitivity)
    ACTIONS = (-2, 0, 2, )  # force applied by each action
    EPOCHS = 10000  # maximum number of iterations
    BATCH_SIZE = 64
    layers = [
        (torch.nn.Linear(2, 8), torch.sigmoid),
        (torch.nn.Linear(8, 3), torch.sigmoid),
    ]  # network: 2 state inputs -> 3 action values
    model = Model(layers=layers, device=GPU)
    opt = optim.NAdam(model.parameters(), lr=.07)
    loss_func = nn.MSELoss()
    model = train(
        width=WIDTH, speed_scale=SPEED_SCALE, action_list=ACTIONS, model=model, optimizer=opt, loss_func=loss_func, epochs=EPOCHS, batch_size=BATCH_SIZE,
        gamma=.3, epsilon=.2, soft_update_rate=.3, target_accuracy=.93
    )
    model.to("cpu")
    print("\n\n")
    location, speed = init_data(WIDTH)
    for step in range(200):  # play an animation showing the trained model
        state = torch.tensor(make_state(location, WIDTH, speed, SPEED_SCALE))
        with torch.no_grad():
            q_values = model(state)
        # Map the best action index to its force exactly once.
        force = ACTIONS[int(torch.argmax(q_values))]
        location_, speed_, alive = env(WIDTH, location, speed, force)
        draw(WIDTH, location, not step, (force, q_values.tolist(), location, speed))
        if alive:
            location, speed = location_, speed_
        else:
            # Fell off the platform: restart from the initial state.
            location, speed = init_data(WIDTH)
        time.sleep(.1)

你可能感兴趣的:(Python,深度学习,pytorch,python,强化学习,dqn)