RL: Q-learning and Sarsa Code

Q-learning
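
The agent below keeps a tabular Q function in a dictionary and updates it off-policy with the standard rule Q(s, a) ← Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a)), which is exactly what update() implements; exploration is epsilon-greedy with an exponentially decaying epsilon.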

import math
import numpy as np
import torch
from collections import defaultdict

class Qlearning():
    def __init__(self,action_dim,cfg):
        self.action_dim=action_dim
        self.lr=cfg.lr
        self.gamma=cfg.gamma
        self.sample_count=0
        self.epsilon=0
        self.epsilon_start=cfg.epsilon_start
        self.epsilon_end=cfg.epsilon_end
        self.epsilon_decay=cfg.epsilon_decay
        # Q-table stored as a mapping: state -> array indexed by action, holding Q(s, a); all entries start at 0
        self.Q_table=defaultdict(lambda :np.zeros(action_dim))

    def choose_action(self,state):
        self.sample_count+=1
        # epsilon decays over time; here an exponential decay schedule is used
        self.epsilon=self.epsilon_end+(self.epsilon_start-self.epsilon_end)*math.exp(-1.*self.sample_count/self.epsilon_decay)
        if self.epsilon<np.random.uniform(0,1):
            action=np.argmax(self.Q_table[str(state)])
        else:
            action=np.random.choice(self.action_dim)

        return action


    def update(self,state,action,reward,next_state,done):
        Q_predict=self.Q_table[str(state)][action]
        if done:
            Q_target=reward
        else:
            Q_target=reward+self.gamma*np.max(self.Q_table[str(next_state)])
        self.Q_table[str(state)][action]+=self.lr*(Q_target-Q_predict)


    def save(self,path):
        # use dill as the pickle module so the defaultdict's lambda default factory can be serialized
        import dill
        torch.save(
            obj=self.Q_table,
            f=path+'Qlearning_model.pkl',
            pickle_module=dill
        )

    def load(self,path):
        import dill
        self.Q_table=torch.load(f=path+'Qlearning_model.pkl',pickle_module=dill)
        print('Model loaded successfully')
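
A minimal sketch of how this agent can be used on a discrete-state environment, assuming the pre-0.26 gym API that the rest of this post relies on; the CliffWalking-v0 environment and the Config values are only illustrative assumptions, not part of the original code:

import gym

class Config:
    # hypothetical hyperparameters, chosen only for illustration
    lr = 0.1
    gamma = 0.9
    epsilon_start = 0.95
    epsilon_end = 0.01
    epsilon_decay = 300

env = gym.make('CliffWalking-v0')              # any discrete-observation env works
agent = Qlearning(env.action_space.n, Config())
for i_ep in range(300):
    state = env.reset()                        # old gym API: reset() returns the observation
    while True:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.update(state, action, reward, next_state, done)
        state = next_state
        if done:
            break
env.close()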


Using Q-learning to train CartPole (the inverted pendulum). Since the observations in this environment are continuous, they have to be discretized first.

import math
import numpy as np
import torch
from collections import defaultdict

class Qlearning():
    def __init__(self,action_dim,cfg):
        self.action_dim=action_dim
        self.lr=cfg.lr
        self.gamma=cfg.gamma
        self.sample_count=0
        self.epsilon=0
        self.epsilon_start=cfg.epsilon_start
        self.epsilon_end=cfg.epsilon_end
        self.epsilon_decay=cfg.epsilon_decay
        # Q-table stored as a mapping: state -> array indexed by action, holding Q(s, a); all entries start at 0
        self.Q_table=defaultdict(lambda :np.zeros(action_dim))
        self.num_digitied=6

    def choose_action(self,state):
        state=self.digitizer_state(state)
        self.sample_count+=1
        self.epsilon=self.epsilon_end+(self.epsilon_start-self.epsilon_end)*math.exp(-1.*self.sample_count/self.epsilon_decay)
        if self.epsilon<np.random.uniform(0,1):
            action=np.argmax(self.Q_table[str(state)])
        else:
            action=np.random.choice(self.action_dim)
        return action


    def update(self,state,action,reward,next_state,done):
        state=self.digitizer_state(state)
        next_state=self.digitizer_state(next_state)
        Q_predict=self.Q_table[str(state)][action]
        if done:
            Q_target=reward
        else:
            Q_target=reward+self.gamma*np.max(self.Q_table[str(next_state)])
        self.Q_table[str(state)][action]+=self.lr*(Q_target-Q_predict)

    # To train on CartPole-v1 with tabular Q-learning, the continuous state has to be discretized
    """
        The CartPole observation has 4 variables: cart position, cart velocity, pole angle and pole angular velocity.
        All of them are continuous, so they are discretized here:
        cart position:          -2.4 ~ 2.4
        cart velocity:          -inf ~ inf
        pole angle:             -0.5 ~ 0.5 (radians)
        pole angular velocity:  -inf ~ inf
        Each variable is discretized into 6 bins.
        With 6 bins and 4 variables there are 6^4 = 1296 states, and only 2 actions (left / right),
        so the Q table has size 1296 * 2.
    """
    def bins(self,clip_min,clip_max,num):
        # map a continuous value to a discrete bin index
        # num bins need num+1 edges; dropping the two outermost leaves the internal boundaries, so np.digitize returns an index in 0..num-1
        return np.linspace(clip_min,clip_max,num+1)[1:-1]

    def digitizer_state(self,observation):
        cart_pos,cart_v,pole_angle,pole_v=observation
        digitized=[
            np.digitize(cart_pos,bins=self.bins(-2.4,2.4,self.num_digitied)),
            np.digitize(cart_v,bins=self.bins(-3.0,3.0,self.num_digitied)),
            np.digitize(pole_angle,bins=self.bins(-0.5,0.5,self.num_digitied)),
            np.digitize(pole_v,bins=self.bins(-2.0,2.0,self.num_digitied))
        ]
        return sum([x*(self.num_digitied**i) for i,x in enumerate(digitized)])

    def save(self,path):
        # use dill as the pickle module so the defaultdict's lambda default factory can be serialized
        import dill
        torch.save(
            obj=self.Q_table,
            f=path+'Qlearning_model.pkl',
            pickle_module=dill
        )

    def load(self,path):
        import dill
        self.Q_table=torch.load(f=path+'Qlearning_model.pkl',pickle_module=dill)
        print('Model loaded successfully')
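
As a sanity check of the discretization, here is a hypothetical observation pushed through digitizer_state; the numbers are only illustrative, and cfg can be the Config instance from the training script below:

agent = Qlearning(action_dim=2, cfg=cfg)
obs = [0.03, 0.2, -0.04, 0.8]        # cart position, cart velocity, pole angle, pole angular velocity
# each variable falls into one of 6 bins (index 0-5), here [3, 3, 2, 4];
# the four indices are packed into a single integer in [0, 1295]:
# 3*6**0 + 3*6**1 + 2*6**2 + 4*6**3 = 3 + 18 + 72 + 864 = 957
print(agent.digitizer_state(obs))    # -> 957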


import gym
import torch
from Q_learning_discrete import Qlearning


class Config():
    def __init__(self):
        self.lr=0.001
        self.epsilon_start=0.99
        self.epsilon_end=0.01
        self.epsilon_decay=500
        self.seed=10
        self.trains_eps=1000
        self.test_eps=100
        self.algo_name='Q-learning'
        self.env_name='CartPole-v1'
        self.device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.gamma=0.9
        self.save=True


def make_env_agent(cfg,seed=1):
    env=gym.make(cfg.env_name)
    env.seed(seed=seed)
    action_dim=env.action_space.n
    agent=Qlearning(action_dim,cfg)
    return env,agent


def train(cfg,agent,env):
    print('train start')
    rewards=[]
    for i_ep in range(cfg.trains_eps):
        ep_reward=0
        state=env.reset()
        while True:
            action=agent.choose_action(state)
            next_state,reward,done,_=env.step(action)
            agent.update(state,action,reward,next_state,done)
            state=next_state
            ep_reward+=reward
            if done:
                break
        rewards.append(ep_reward)
        print('Episode: {}/{}, Reward: {:.1f}'.format(i_ep+1,cfg.trains_eps,ep_reward))
    # close the environment once after all episodes, not inside the loop
    env.close()
    print('Finished training')
    return rewards



def test(cfg,env,agent):
    print('Start testing')
    print(f'Algorithm: {cfg.algo_name}, Environment: {cfg.env_name}, Device: {cfg.device}')
    rewards=[]

    for i_ep in range(cfg.test_eps):
        state=env.reset()
        ep_reward=0
        while True:
            env.render()
            action=agent.choose_action(state)
            next_state,reward,done,_=env.step(action)
            ep_reward+=reward
            state=next_state
            if done:
                break
        rewards.append(ep_reward)
        print('Episode: {}/{}, Reward: {:.1f}'.format(i_ep+1,cfg.test_eps,ep_reward))
    print('Finished testing')
    return rewards


if __name__=='__main__':
    cfg=Config()
    env,agent=make_env_agent(cfg)
    rewards=train(cfg,agent,env)
    agent.save(path='./model')
    
    
    # test
    env,agent=make_env_agent(cfg,seed=10)
    agent.load('./model')
    rewards=test(cfg,env,agent)
    env.close()
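
One caveat about the test run above: choose_action is epsilon-greedy and the reloaded agent starts with sample_count = 0, so the early test episodes still explore with a fairly large epsilon. A common fix, not in the original code, is to add a purely greedy helper to the Qlearning class and call it from test() instead of choose_action:

    def predict_action(self, state):
        # hypothetical greedy helper for evaluation: always take the best known action
        state = self.digitizer_state(state)
        return np.argmax(self.Q_table[str(state)])

Then use action = agent.predict_action(state) inside test().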




Sarsa
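
Sarsa is the on-policy counterpart: the only difference from the Q-learning agent above is the update target, which uses the Q value of the action actually taken in the next state, Q(s, a) ← Q(s, a) + lr * (r + gamma * Q(s', a') - Q(s, a)), instead of the max over next actions. Here epsilon is also kept fixed rather than decayed.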

import numpy as np
from collections import defaultdict
import torch
import dill


class sarsa():
    def __init__(self, action_dim, cfg):
        self.action_dim = action_dim
        self.lr = cfg.lr
        self.gamma = cfg.gamma
        self.Q_table = defaultdict(lambda: np.zeros(action_dim))
        self.epsilon = cfg.epsilon

    def choose_action(self, state):
        best_action = np.argmax(self.Q_table[state])
        # start with a uniform probability of epsilon / action_dim for every action
        action_pros = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
        # give the greedy action the remaining (1 - epsilon) probability mass
        action_pros[best_action] += (1.0 - self.epsilon)
        # sample an action from this epsilon-greedy distribution
        action = np.random.choice(np.arange(len(action_pros)), p=action_pros)
        return action

    def update(self, state, action, reward, next_state, next_action, done):
        # Q value of the action taken in the current state
        Q_predict = self.Q_table[state][action]
        # if the episode ended, this is a terminal transition and the target is just the reward
        if done:
            Q_target = reward
        else:
            # otherwise bootstrap from the Q value of the action actually chosen in the next state (on-policy)
            Q_target = reward + self.gamma * self.Q_table[next_state][next_action]
        # TD update towards the target
        self.Q_table[state][action] += self.lr * (Q_target - Q_predict)

    def save(self, path):
        # dill lets the defaultdict (with its lambda default factory) be pickled
        torch.save(obj=self.Q_table, f=path + 'sarsa_model.pkl', pickle_module=dill)

    def load(self, path):
        self.Q_table = torch.load(f=path + 'sarsa_model.pkl', pickle_module=dill)
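
Since Sarsa is on-policy, the interaction loop has to pick the next action before updating. A minimal training sketch under the same assumptions as before (old gym API; CliffWalking-v0 and the config values are illustrative only):

import gym

class SarsaConfig:
    # hypothetical hyperparameters for illustration
    lr = 0.1
    gamma = 0.9
    epsilon = 0.1

env = gym.make('CliffWalking-v0')
agent = sarsa(env.action_space.n, SarsaConfig())
for i_ep in range(500):
    state = env.reset()
    action = agent.choose_action(state)                  # pick the first action before the loop
    while True:
        next_state, reward, done, _ = env.step(action)
        next_action = agent.choose_action(next_state)    # the action that will actually be taken next
        agent.update(state, action, reward, next_state, next_action, done)
        state, action = next_state, next_action
        if done:
            break
env.close()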
