import math
import numpy as np
import torch
from collections import defaultdict

class Qlearning():
    def __init__(self, action_dim, cfg):
        self.action_dim = action_dim
        self.lr = cfg.lr
        self.gamma = cfg.gamma
        self.sample_count = 0
        self.epsilon = 0
        self.epsilon_start = cfg.epsilon_start
        self.epsilon_end = cfg.epsilon_end
        self.epsilon_decay = cfg.epsilon_decay
        # The Q table is a nested mapping state -> action -> Q value:
        # 'state': array([index is the action, value is the Q value]), initialized to zeros
        self.Q_table = defaultdict(lambda: np.zeros(action_dim))

    def choose_action(self, state):
        self.sample_count += 1
        # epsilon decreases over time; here we use exponential decay
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
        if self.epsilon < np.random.uniform(0, 1):
            action = np.argmax(self.Q_table[str(state)])  # exploit: greedy action
        else:
            action = np.random.choice(self.action_dim)    # explore: random action
        return action

    def update(self, state, action, reward, next_state, done):
        Q_predict = self.Q_table[str(state)][action]
        if done:  # terminal state: the target is just the reward
            Q_target = reward
        else:
            Q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)])
        self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict)

    def save(self, path):
        # serialization / deserialization (dill can pickle the lambda-based defaultdict)
        import dill
        torch.save(
            obj=self.Q_table,
            f=path + 'Qlearning_model.pkl',
            pickle_module=dill
        )

    def load(self, path):
        import dill
        self.Q_table = torch.load(f=path + 'Qlearning_model.pkl', pickle_module=dill)
        print('Model loaded successfully')
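To illustrate how this tabular agent is driven, here is a minimal interaction-loop sketch. The hyperparameters and the CliffWalking-v0 environment are assumptions for illustration, not part of the original code; any environment with discrete observations and the old gym step API works the same way.

import gym
from types import SimpleNamespace

# hypothetical hyperparameters, mirroring the cfg fields Qlearning reads
cfg = SimpleNamespace(lr=0.1, gamma=0.9, epsilon_start=0.95,
                      epsilon_end=0.01, epsilon_decay=300)
env = gym.make('CliffWalking-v0')          # assumed env; any discrete-observation env works
agent = Qlearning(env.action_space.n, cfg)

for ep in range(10):                       # a few episodes just to exercise the API
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.update(state, action, reward, next_state, done)
        state = next_state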
Use Q-learning to train the inverted pendulum (CartPole). Since the observations of this environment are continuous, they first need to be discretized.
import math
import numpy as np
import torch
from collections import defaultdict

class Qlearning():
    def __init__(self, action_dim, cfg):
        self.action_dim = action_dim
        self.lr = cfg.lr
        self.gamma = cfg.gamma
        self.sample_count = 0
        self.epsilon = 0
        self.epsilon_start = cfg.epsilon_start
        self.epsilon_end = cfg.epsilon_end
        self.epsilon_decay = cfg.epsilon_decay
        # The Q table is a nested mapping state -> action -> Q value:
        # 'state': array([index is the action, value is the Q value]), initialized to zeros
        self.Q_table = defaultdict(lambda: np.zeros(action_dim))
        self.num_digitized = 6  # number of bins per observation variable

    def choose_action(self, state):
        state = self.digitizer_state(state)  # discretize the continuous observation
        self.sample_count += 1
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
        if self.epsilon < np.random.uniform(0, 1):
            action = np.argmax(self.Q_table[str(state)])
        else:
            action = np.random.choice(self.action_dim)
        return action

    def update(self, state, action, reward, next_state, done):
        state = self.digitizer_state(state)
        next_state = self.digitizer_state(next_state)
        Q_predict = self.Q_table[str(state)][action]
        if done:
            Q_target = reward
        else:
            Q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)])
        self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict)

    # To train CartPole-v1 with Q-learning, the state has to be discretized
    """
    The CartPole state is stored in the observation, which has 4 variables: cart position,
    cart velocity, pole angle and pole angular velocity. All of them are continuous and
    must be discretized:
        cart position:         -2.4 ~ 2.4
        cart velocity:         -inf ~ inf
        pole angle:            -0.5 ~ 0.5 (radians)
        pole angular velocity: -inf ~ inf
    Each variable is discretized into 6 bins, so with 4 variables there are
    6^4 = 1296 states; there are only 2 actions (left and right),
    so the Q table has size 1296 x 2.
    """
    def bins(self, clip_min, clip_max, num):
        # Convert a continuous range into bin edges:
        # num bins need num+1 boundary points; drop the two outer edges for np.digitize
        return np.linspace(clip_min, clip_max, num + 1)[1:-1]

    def digitizer_state(self, observation):
        cart_pos, cart_v, pole_angle, pole_v = observation
        digitized = [
            np.digitize(cart_pos, bins=self.bins(-2.4, 2.4, self.num_digitized)),
            np.digitize(cart_v, bins=self.bins(-3.0, 3.0, self.num_digitized)),
            np.digitize(pole_angle, bins=self.bins(-0.5, 0.5, self.num_digitized)),
            np.digitize(pole_v, bins=self.bins(-2.0, 2.0, self.num_digitized))
        ]
        # encode the 4 bin indices into a single integer in base num_digitized
        return sum([x * (self.num_digitized ** i) for i, x in enumerate(digitized)])

    def save(self, path):
        # serialization / deserialization (dill can pickle the lambda-based defaultdict)
        import dill
        torch.save(
            obj=self.Q_table,
            f=path + 'Qlearning_model.pkl',
            pickle_module=dill
        )

    def load(self, path):
        import dill
        self.Q_table = torch.load(f=path + 'Qlearning_model.pkl', pickle_module=dill)
        print('Model loaded successfully')
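To see what the discretization produces, here is a minimal standalone sketch of the same binning and base-6 encoding, using a made-up observation rather than one taken from the environment:

import numpy as np

num_bins = 6
def bins(clip_min, clip_max, num):
    return np.linspace(clip_min, clip_max, num + 1)[1:-1]

# a made-up CartPole observation: (cart_pos, cart_v, pole_angle, pole_v)
obs = (0.1, -0.5, 0.02, 1.3)
digitized = [
    np.digitize(obs[0], bins=bins(-2.4, 2.4, num_bins)),
    np.digitize(obs[1], bins=bins(-3.0, 3.0, num_bins)),
    np.digitize(obs[2], bins=bins(-0.5, 0.5, num_bins)),
    np.digitize(obs[3], bins=bins(-2.0, 2.0, num_bins)),
]
state = sum(x * (num_bins ** i) for i, x in enumerate(digitized))
print(digitized, state)  # four bin indices in 0..5 and one state index in 0..1295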
import gym
import torch
from Q_learning_discrete import Qlearning

class Config():
    def __init__(self):
        self.lr = 0.001
        self.epsilon_start = 0.99
        self.epsilon_end = 0.01
        self.epsilon_decay = 500
        self.seed = 10
        self.trains_eps = 1000   # number of training episodes
        self.test_eps = 100      # number of test episodes
        self.algo_name = 'Q-learning'
        self.env_name = 'CartPole-v1'
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.gamma = 0.9
        self.save = True

def make_env_agent(cfg, seed=1):
    env = gym.make(cfg.env_name)
    env.seed(seed=seed)
    action_dim = env.action_space.n
    agent = Qlearning(action_dim, cfg)
    return env, agent

def train(cfg, agent, env):
    print('Training started')
    rewards = []
    for i_ep in range(cfg.trains_eps):
        ep_reward = 0
        state = env.reset()
        while True:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.update(state, action, reward, next_state, done)
            state = next_state
            ep_reward += reward
            if done:
                break
        rewards.append(ep_reward)
        print('Episode: {}/{}, reward: {:.1f}'.format(i_ep + 1, cfg.trains_eps, ep_reward))
    env.close()
    print('Training finished')
    return rewards

def test(cfg, env, agent):
    print('Testing started')
    print(f'Algorithm: {cfg.algo_name}, environment: {cfg.env_name}, device: {cfg.device}')
    rewards = []
    for i_ep in range(cfg.test_eps):
        state = env.reset()
        ep_reward = 0
        while True:
            env.render()
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            state = next_state
            if done:
                break
        rewards.append(ep_reward)
        print('Episode: {}/{}, reward: {:.1f}'.format(i_ep + 1, cfg.test_eps, ep_reward))
    print('Testing finished')
    return rewards

if __name__ == '__main__':
    cfg = Config()
    env, agent = make_env_agent(cfg)
    rewards = train(cfg, agent, env)
    agent.save(path='./model')
    # test the trained agent
    env, agent = make_env_agent(cfg, seed=10)
    agent.load('./model')
    rewards = test(cfg, env, agent)
    env.close()
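Both train and test return the list of per-episode rewards. A minimal sketch of smoothing and plotting that curve; matplotlib and the plot_rewards helper are assumptions added for illustration, they are not part of the original scripts:

import matplotlib.pyplot as plt

def plot_rewards(rewards, window=10):
    # simple moving average to smooth the noisy per-episode rewards
    smoothed = [sum(rewards[max(0, i - window + 1):i + 1]) /
                len(rewards[max(0, i - window + 1):i + 1]) for i in range(len(rewards))]
    plt.plot(rewards, alpha=0.3, label='raw')
    plt.plot(smoothed, label=f'moving average ({window})')
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.legend()
    plt.show()

# plot_rewards(rewards)  # e.g. with the rewards returned by train()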
import numpy as np
from collections import defaultdict
import torch
import dill

class sarsa():
    def __init__(self, action_dim, cfg):
        self.action_dim = action_dim
        self.lr = cfg.lr
        self.gamma = cfg.gamma
        self.Q_table = defaultdict(lambda: np.zeros(action_dim))
        self.epsilon = cfg.epsilon

    def choose_action(self, state):
        best_action = np.argmax(self.Q_table[state])
        # start from a uniform probability for every action
        action_pros = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
        # give the greedy action the extra probability mass
        action_pros[best_action] += (1.0 - self.epsilon)
        # sample an action from this epsilon-greedy distribution
        action = np.random.choice(np.arange(len(action_pros)), p=action_pros)
        return action

    def update(self, state, action, reward, next_state, next_action, done):
        # Q value of the action taken in the current state
        Q_predict = self.Q_table[state][action]
        # if the episode is over, this is the last state and the target is just the reward
        if done:
            Q_target = reward
        else:
            # otherwise bootstrap from the Q value of the next state-action pair
            Q_target = reward + self.gamma * self.Q_table[next_state][next_action]
        # TD update of the Q value
        self.Q_table[state][action] += self.lr * (Q_target - Q_predict)

    def save(self, path):
        torch.save(obj=self.Q_table, f=path + 'sarsa_mode.pkl', pickle_module=dill)

    def load(self, path):
        self.Q_table = torch.load(f=path + 'sarsa_mode.pkl', pickle_module=dill)
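SARSA is on-policy, so the next action has to be selected before the update can be applied. A minimal interaction-loop sketch; the hyperparameters and the CliffWalking-v0 environment are assumptions for illustration, not part of the original code:

import gym
from types import SimpleNamespace

cfg = SimpleNamespace(lr=0.1, gamma=0.9, epsilon=0.1)  # hypothetical hyperparameters
env = gym.make('CliffWalking-v0')                      # assumed env; any discrete-observation env works
agent = sarsa(env.action_space.n, cfg)

for ep in range(10):
    state = env.reset()
    action = agent.choose_action(state)
    done = False
    while not done:
        next_state, reward, done, _ = env.step(action)
        next_action = agent.choose_action(next_state)  # on-policy: pick the next action first
        agent.update(state, action, reward, next_state, next_action, done)
        state, action = next_state, next_action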