智能体是一个字母o
,它卡在许多_
之间,而要达到的目的是确保o
两侧都有_
,这需要让o
能够向左右两边移动,而且速度略快于无动作时的自然移动速度,看起来就像下面那样。这是一种很简单的情形。
pytorch版本:1.11.0+cu113
因为每次初始化都一样,会出很多相同的数据,故先定义个数据结构,它是可hash的,便于存放在集合中。
class Data:
    """Hashable record of one transition ``(state, action, reward, next_state)``.

    The four fields are frozen into a single tuple so that equal transitions
    compare equal and hash identically, which lets duplicates collapse when
    instances are stored in a ``set``.

    Args:
        s: State before the action.
        a: Action encoding (one entry per available action).
        r: Reward received.
        s_: State after the action.
    """

    def __init__(self, s: 'list[float]|tuple[float]', a: 'list[float]|tuple[float]', r: float, s_: 'list[float]|tuple[float]') -> None:
        # Sequences are converted to tuples so the whole record is hashable.
        self.tuple_ = (tuple(s), tuple(a), r, tuple(s_))

    @property
    def state(self):
        """Return the state as a new list."""
        return list(self.tuple_[0])

    @property
    def action(self):
        """Return the action encoding as a new list."""
        return list(self.tuple_[1])

    @property
    def reward(self):
        """Return the reward."""
        return self.tuple_[2]

    @property
    def next_state(self):
        """Return the next state as a new list."""
        return list(self.tuple_[3])

    def __eq__(self, other: object) -> bool:
        # Returning NotImplemented lets Python fall back to its default,
        # which makes ``data == <non-Data>`` evaluate to False.
        if not isinstance(other, Data):
            return NotImplemented
        return self.tuple_ == other.tuple_

    def __ne__(self, other: object) -> bool:
        # A non-Data operand is always "not equal"; the default fallback
        # triggered by NotImplemented yields True here.
        if not isinstance(other, Data):
            return NotImplemented
        return self.tuple_ != other.tuple_

    def __hash__(self) -> int:
        return hash(self.tuple_)

    def __repr__(self) -> str:
        return f"{type(self).__name__}{self.tuple_!r}"
打印进度条,console里面会显示不同颜色的空格
import math
def print_bar(epoch, epochs, step, batch_size, etc=None, bar_size=50):
    """Print a one-line progress bar made of ANSI-styled spaces.

    Args:
        epoch: Current epoch number shown in the ``Epoch x/y`` label.
        epochs: Total number of epochs shown in the label.
        step: Progress so far; a non-zero value makes the line overwrite the
            previous one by first emitting the cursor-up escape sequence.
        batch_size: Value of ``step`` that corresponds to a full bar.
        etc: Optional extra object appended (via ``str``) after the bar.
        bar_size: Width of the bar in characters.
    """
    filled = math.ceil(bar_size * step / batch_size)
    # Only move the cursor up when there is an earlier bar line to overwrite.
    cursor_up = "\033[A" if step else ""
    label = f"Epoch {epoch}/{epochs}"
    bar = f" |\033[1;30;47m{' ' * filled}\033[0m{' ' * (bar_size - filled)}| "
    tail = "" if etc is None else str(etc)
    print(cursor_up + label + bar + tail + " ")
实现能够copy参数的容器
from collections import OrderedDict, abc
from torch import Tensor, nn
from torch import optim as optim
class Model(nn.Module):
    """Feed-forward stack of ``(layer, activation)`` pairs with parameter copying.

    Args:
        layers: Sequence of ``(module, activation)`` pairs. ``activation`` is a
            callable applied to the module's output, or ``None`` to skip it.
        device: Forwarded to ``nn.ModuleList.to``; ``None`` leaves the modules
            where they are.
    """

    def __init__(self, layers: 'list[tuple[nn.Module,abc.callable|function]]', device=None):
        super(Model, self).__init__()
        self.layers = [layer for layer, _ in layers]
        # Registering the layers in a ModuleList exposes their parameters
        # through ``self.parameters()``.
        self.module_list = nn.ModuleList(self.layers).to(device)
        self.activations = [f for _, f in layers]
        self.deep = len(layers)

    def forward(self, x: Tensor) -> Tensor:
        """Apply every layer, then its activation when one is set."""
        out = x
        for layer, activation in zip(self.module_list, self.activations):
            out = layer(out)
            if activation is not None:
                out = activation(out)
        return out

    def load_state_dict(self, model: 'Model', rate: float = .5):
        """Blend another model's parameters into this one, layer by layer.

        NOTE: this intentionally replaces ``nn.Module.load_state_dict`` with a
        different signature (it takes a ``Model``, not a state dict).

        Args:
            model: Source model with the same layer structure.
            rate: Weight of the source parameters. ``rate >= 1`` copies them
                verbatim; otherwise the result is
                ``local * (1 - rate) + foreign * rate``.
        """
        for i in range(self.deep):
            foreign = model.layers[i].state_dict()
            if rate >= 1.:
                self.layers[i].load_state_dict(foreign)
                continue
            local = self.layers[i].state_dict()
            mix = OrderedDict()
            for key, value in local.items():
                # Move the source tensor next to the local one so models on
                # different devices can still be blended.
                mix[key] = value * (1 - rate) + foreign[key].to(value.device) * rate
            self.layers[i].load_state_dict(mix)

    def copy(self) -> 'Model':
        """Return a new ``Model`` with the same values but independent parameters.

        Each layer is deep-copied: reusing the layer objects would make the
        "copy" share its parameters with this model, so updating one would
        silently update the other. Activations are reused as they are.
        """
        from copy import deepcopy  # local import: the method name shadows nothing at module level

        pairs = [
            (deepcopy(layer), activation)
            for layer, activation in zip(self.layers, self.activations)
        ]
        return Model(pairs)
训练主要部分
# -*- coding: utf-8 -*-
from datetime import datetime
import random, time, torch, os
from torch import cuda, device, nn, optim, Tensor
# Set CUDA_LAUNCH_BLOCKING for this process (presumably to make CUDA kernel
# launches synchronous so errors surface at the offending call — TODO confirm).
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# Device used for tensors in this script: CUDA when available, otherwise CPU.
GPU = device("cuda" if cuda.is_available() else "cpu")
def draw(length, location, new_line: bool = False, item=None):
    """Print the track as underscores with an ``o`` at the agent's position.

    Args:
        length: Number of cells in the track.
        location: Agent position; truncated to ``int`` to pick the cell.
        new_line: When False, the cursor-up escape sequence is emitted first
            so this line overwrites the previously printed one.
        item: Extra object appended after the track via ``str``.
    """
    column = int(location)
    # Clip the visible track BEFORE adding the escape sequence; slicing the
    # prefixed string would count the escape characters and drop track cells.
    track = ("_" * column + "o" + "_" * (length - column - 1))[:length]
    prefix = "" if new_line else "\033[A"
    print(prefix + track + f" {str(item)} ")
def env(length, location, speed, force=0) -> 'tuple[int,int,bool]':
    """Advance the environment by one step.

    The speed receives a drift of -1 when the agent is left of the centre,
    +1 when right of it, and a random value in {-1, 0, 1} otherwise, plus the
    applied ``force``. The position advances by the *current* speed.

    Args:
        length: Track length.
        location: Current position.
        speed: Current speed.
        force: Extra speed change applied by the agent.

    Returns:
        ``(next_location, next_speed, living)`` where ``living`` is False when
        the *current* ``location`` lies outside ``[0, length)``.
    """
    centre = length / 2
    # +1 right of centre, -1 left of centre, 0 when neither comparison holds.
    drift = (location > centre) - (location < centre)
    if drift == 0:
        drift = random.randint(-1, 1)
    next_speed = speed + drift + force
    off_track = location < 0 or location >= length
    return location + speed, next_speed, not off_track
def init_data(width: int) -> 'tuple[int,int]':
    """Return the starting ``(location, speed)``: the middle of the track, at rest."""
    start_location = int(width / 2)
    start_speed = 0
    return start_location, start_speed
def make_state(location, width, speed, speed_scale) -> 'list[float]':
    """Build the two-element state vector fed to the network.

    Returns:
        ``[position, velocity]`` where ``position`` is the offset from the
        track centre scaled so the track edges map to -1 and +1, and
        ``velocity`` is ``speed`` divided by ``speed_scale``.
    """
    offset_from_centre = location - width / 2
    position = offset_from_centre / width * 2
    velocity = speed / speed_scale
    return [position, velocity]
def simulate(model: Model, batch_size: int, width: int, speed_scale: int, action_list: 'list', epsilon: float, reward_range: float = .7):
    """Run the environment for ``batch_size`` steps and collect transitions.

    Args:
        model: Network used to pick the greedy action.
        batch_size: Number of environment steps to simulate.
        width: Track width.
        speed_scale: Divisor used to normalise the speed in the state vector.
        action_list: Available forces; the network output index selects one.
        epsilon: Probability of taking a random action instead of the greedy one.
        reward_range: Upper bound (as a fraction of ``width``) of the central
            band that earns a reward; the lower bound is ``1 - reward_range``.

    Returns:
        ``(cache, score)``: a set of ``Data`` transitions and the achieved
        survival score divided by the best possible score.

    Raises:
        ValueError: If ``reward_range`` is not greater than 0.5.
    """
    if reward_range <= .5:
        raise ValueError('The arg reward_range cannot leq to .5')
    action_count = len(action_list)
    location, speed = init_data(width)
    cache = set()
    live_time = 0
    score = 0
    # Highest possible score: the agent never falls off during the batch.
    max_score = max(1, batch_size*(batch_size+1)/2)
    for _ in range(batch_size):
        state = make_state(location, width, speed, speed_scale)
        if random.random() <= epsilon:  # explore
            action_index = random.randint(0, action_count-1)
        else:  # exploit
            with torch.no_grad():  # action selection needs no gradients
                q_values = model(torch.tensor(data=state, dtype=torch.float32, device=GPU))
            # Convert to a plain int so the one-hot below holds Python
            # values; tensors would hash by identity and defeat the
            # de-duplication done by the set.
            action_index = int(torch.argmax(q_values).item())
        # ``action_list[action_index]`` already IS the force; indexing the
        # list a second time with it would select the wrong action.
        force = action_list[action_index]
        location_, speed_, living = env(width, location, speed, force)  # compute s(t+1)
        # Reward only while inside the central band, not merely for surviving.
        reward = 1. if (1-reward_range) < location/width < reward_range else 0.
        data = Data(
            state,
            [float(action_index == i) for i in range(action_count)],  # one-hot action
            reward,
            make_state(location_, width, speed_, speed_scale),
        )
        cache.add(data)  # fill the experience pool
        location, speed = location_, speed_  # s = s(t+1)
        live_time += 1
        score += live_time
        if not living:
            location, speed = init_data(width)
            # Restart the survival counter; without this the score always
            # equals ``max_score`` regardless of how often the agent dies.
            live_time = 0
    return cache, score/max_score
def train(width: int, speed_scale: int, action_list: 'tuple|list', model: Model, optimizer: optim.Optimizer, loss_func: nn.modules.loss._Loss, epochs: int, batch_size: int, gamma: float = .1, epsilon: float = .1, soft_update_rate=.1, target_accuracy=.99) -> 'Model':
    """Train ``model`` with a policy network and a softly-updated target network.

    Args:
        width: Track width passed to the simulation.
        speed_scale: Speed normalisation passed to the simulation.
        action_list: Available forces.
        model: Network to train. It is used as the policy network, so
            ``optimizer`` must have been built over ``model.parameters()``.
        optimizer: Optimizer stepping the policy network's parameters.
        loss_func: Loss between predicted and target action values.
        epochs: Maximum number of training iterations.
        batch_size: Simulation steps per epoch and maximum sample size.
        gamma: Weight of the bootstrapped next-state value; the immediate
            reward is weighted by ``1 - gamma``.
        epsilon: Exploration probability used during simulation.
        soft_update_rate: Blend rate used when moving the target network
            towards the policy network.
        target_accuracy: Training stops once the simulation score reaches it.

    Returns:
        The target network.
    """
    from copy import deepcopy

    # The optimizer holds references to ``model``'s parameters, so ``model``
    # must be the network that receives gradients. The target network is an
    # independent deep copy that only changes through soft updates.
    policy_net = model
    target_net = deepcopy(model)
    policy_net.train(mode=True)
    target_net.train(mode=False)  # the target network is never trained directly
    cache = set()
    for epoch in range(epochs):
        cache_, accuracy = simulate(model=target_net, batch_size=batch_size, speed_scale=speed_scale, width=width, action_list=action_list, epsilon=epsilon)
        cache |= cache_
        real_batch_size = min(batch_size, len(cache))
        # random.sample needs a sequence; sets are rejected on Python >= 3.11.
        batch = random.sample(list(cache), real_batch_size)
        # Convert the sampled transitions to tensors.
        states = torch.tensor([data.state for data in batch], dtype=torch.float32, device=GPU)
        states = states + torch.randn_like(states)*.02  # add a little noise
        actions = torch.tensor([data.action for data in batch], dtype=torch.float32, device=GPU)
        rewards = torch.tensor([[data.reward] for data in batch], dtype=torch.float32, device=GPU)
        state_nexts = torch.tensor([data.next_state for data in batch], dtype=torch.float32, device=GPU)
        with torch.no_grad():
            # Best action value of the next state according to the target net.
            best_next = target_net(state_nexts).max(dim=-1, keepdim=True).values
        # ``gamma`` controls the share of the bootstrapped value versus the
        # immediate reward. Masking with the one-hot ``actions`` keeps only the
        # taken action, so untaken actions contribute no loss.
        y = (best_next * gamma + rewards * (1 - gamma)) * actions
        y_hat = policy_net(states) * actions
        loss = loss_func(y_hat, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        target_net.load_state_dict(policy_net, soft_update_rate)
        print_bar(epoch, epochs, epoch, epochs, ('{:.10f}'.format(loss.item()), '{:.10f}'.format(accuracy),))
        if accuracy >= target_accuracy:  # score reached the goal: training is done
            break
    return target_net
if __name__ == "__main__":
    WIDTH = 30  # platform width
    SPEED_SCALE = 8  # speed normalisation factor
    ACTIONS = (-2, 0, 2, )  # available forces
    EPOCHS = 10000  # maximum number of training iterations
    BATCH_SIZE = 64
    layers = [
        (torch.nn.Linear(2, 8), torch.sigmoid),
        (torch.nn.Linear(8, 3), torch.sigmoid),
    ]  # network definition
    model = Model(layers=layers, device=GPU)
    opt = optim.NAdam(model.parameters(), lr=.07)
    loss_func = nn.MSELoss()
    model = train(
        width=WIDTH, speed_scale=SPEED_SCALE, action_list=ACTIONS, model=model, optimizer=opt, loss_func=loss_func, epochs=EPOCHS, batch_size=BATCH_SIZE,
        gamma=.3, epsilon=.2, soft_update_rate=.3, target_accuracy=.93
    )
    model.to("cpu")
    print("\n\n")
    location, speed = init_data(WIDTH)
    for step in range(200):  # play an animation showing the trained model
        state = torch.tensor(make_state(location, WIDTH, speed, SPEED_SCALE))
        with torch.no_grad():
            action = model(state)  # one value per entry of ACTIONS
        # The arg-max index selects the force; ``a`` is the force itself and
        # must not be used to index ACTIONS again.
        a = ACTIONS[int(torch.argmax(action))]
        location_, speed_, living = env(WIDTH, location, speed, a)
        draw(WIDTH, location, not step, (a, action.tolist(), location, speed))
        if not living:
            location, speed = init_data(WIDTH)
        else:
            location, speed = location_, speed_
        time.sleep(.1)