Study Notes (xjy)

1. Installing Gym

conda create -n gym python=3.8
pip install gym
# full installation (mujoco / all extras)
pip install --upgrade gym[mujoco]
pip install --upgrade gym[all]
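
A quick way to verify the install (a minimal sketch; the versioned id 'InvertedDoublePendulum-v4' is an assumption, use whichever mujoco environment id your gym version registers):

import gym
# print the installed gym version and build one mujoco environment
print(gym.__version__)
env = gym.make('InvertedDoublePendulum-v4')
obs, info = env.reset()   # new-style reset returns (observation, info)
print(obs.shape)
env.close()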

2. Getting Started with Gym

2.1 Without a termination condition

import gym
env = gym.make('InvertedDoublePendulum',render_mode='human')
env.reset()
for _ in range(1000):
    # render the environment
    env.render()
    # sample a random action from the action space
    action = env.action_space.sample()
    ## state: observation of the environment; reward: immediate reward;
    ## terminated: whether the episode ended and the environment needs a reset
    ## (e.g. the game is over after this time step);
    ## truncated: whether the episode was cut short by a time limit; info: diagnostic information
    state, reward, terminated, truncated, info = env.step(action)
env.close()
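
The later model code only uses part of the state, so it is worth printing this environment's spaces: InvertedDoublePendulum observes an 11-dimensional state and takes a single continuous action in [-1, 1]. A minimal check:

import gym
env = gym.make('InvertedDoublePendulum')
print(env.observation_space)      # Box(..., (11,), ...) -- 11-dimensional observation
print(env.action_space)           # Box(-1.0, 1.0, (1,), ...) -- one continuous action
print(env.action_space.sample())  # a random action from that space
env.close()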

[Figure 1: the first random rollout]

2.2 With a termination condition

import gym
env = gym.make('InvertedDoublePendulum',render_mode='human')
for i_episode in range(20):
    observation, info = env.reset()
    for t in range(100):
        env.render()
        print(observation)
        # sample a random action from the action space
        action = env.action_space.sample()
        ## observation: observation of the environment; reward: immediate reward;
        ## terminated: whether the episode ended and the environment needs a reset
        ## (e.g. the game is over after this time step);
        ## truncated: whether the episode was cut short by a time limit; info: diagnostic information
        observation, reward, terminated, truncated, info = env.step(action)
        if terminated:
            print("Episode finished after {} timesteps".format(t+1))
            break
env.close()

[Figure 2: the cart only stays within a limited range]

The complete code above can be found in the original blog post.

3. Writing the Model (PyTorch)

3.1 Installing the PyTorch Library

conda activate gym
conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
import gym
import logging
import numpy                     # used below for reward normalization
import matplotlib.pyplot as plt  # used below for plotting rewards
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions.bernoulli import Bernoulli
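
A quick check that PyTorch is installed and can see the GPU (a minimal sketch):

import torch
print(torch.__version__)           # installed PyTorch version
print(torch.cuda.is_available())   # True if the cudatoolkit=11.3 build can reach a GPU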

3.2 Neural Network Models

3.2.1 Fully Connected Layers (Multilayer Perceptron)

class PGN(nn.Module):
    def __init__(self):
        super(PGN, self).__init__()
        self.linear1 = nn.Linear(8, 24)
        self.linear2 = nn.Linear(24, 36)
        self.linear3 = nn.Linear(36, 1)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = torch.sigmoid(self.linear3(x))
        return x
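
A quick smoke test of the network (a minimal sketch): one random 8-dimensional state in, one probability out, which is later used as the parameter of a Bernoulli distribution over two discrete actions.

import torch
pgn = PGN()
state = torch.randn(8)    # fake 8-dimensional state
prob = pgn(state)         # 1-element tensor in (0, 1) thanks to the sigmoid
print(prob)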

3.2.2 LSTM

# Define LSTM Neural Networks
class LstmRNN(nn.Module):
    """
        Parameters:
        - input_size: feature size
        - hidden_size: number of hidden units
        - output_size: number of output
        - num_layers: layers of LSTM to stack
    """

    def __init__(self, input_size, hidden_size=1, output_size=1, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)  # utilize the LSTM model in torch.nn
        self.linear1 = nn.Linear(hidden_size, output_size)  # fully connected output layer

    def forward(self, _x):
        _x = _x.reshape(1,1,-1)
        x, _ = self.lstm(_x)  # _x is input, size (seq_len, batch, input_size)
        s, b, h = x.shape  # x is output, size (seq_len, batch, hidden_size)
        x = x.view(s * b, h)
        x = self.linear1(x)
        x = x.view(s, b, -1)
        x = x.reshape(-1)
        return x
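
The LSTM variant can be smoke-tested the same way (a minimal sketch; input_size=11 matches the full InvertedDoublePendulum observation, as in the commented-out lines of CartAgent below). Unlike PGN, the output is not passed through a sigmoid, so it is not directly a valid Bernoulli probability without an extra activation.

import torch
lstm = LstmRNN(input_size=11)
state = torch.randn(11)   # one 11-dimensional observation
out = lstm(state)         # reshaped internally to (seq_len=1, batch=1, 11)
print(out)                # a 1-element tensor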

3.2.3 Connecting the Model to Gym

class CartAgent(object):
    def __init__(self, learning_rate, gamma):
        self.pgn = PGN()
        # self.lstm = LstmRNN(input_size=11)
        self.gamma = gamma
        self.optimizer = torch.optim.RMSprop(self.pgn.parameters(), lr=learning_rate)
        # self.optimizer = torch.optim.RMSprop(self.lstm.parameters(), lr=learning_rate)
        self.state_pool = []
        self.action_pool = []
        self.reward_pool = []
        self.steps = 0

    def memorize(self, state, action, reward):
        # save to memory for mini-batch gradient descent
        self.state_pool.append(state)
        self.action_pool.append(action)
        self.reward_pool.append(reward)
        self.steps += 1

    def _adjust_reward(self):
        # compute discounted returns backwards in time (a reward of 0 marks the end of an episode)
        running_add = 0
        for i in reversed(range(self.steps)):
            if self.reward_pool[i] == 0:
                running_add = 0
            else:
                running_add = running_add * self.gamma + self.reward_pool[i]
                self.reward_pool[i] = running_add
        # normalize reward
        reward_mean = numpy.mean(self.reward_pool)
        reward_std = numpy.std(self.reward_pool)
        for i in range(self.steps):
            self.reward_pool[i] = (self.reward_pool[i] - reward_mean) / reward_std

    def learn(self):
        self._adjust_reward()
        # policy gradient
        self.optimizer.zero_grad()
        for i in range(self.steps):
            # all steps in multi games
            state = self.state_pool[i]
            action = [torch.FloatTensor([self.action_pool[i]])]
            reward = self.reward_pool[i]
            probs = self.act(state)
            m = Bernoulli(probs)
            # REINFORCE loss: negative log-probability weighted by the return
            loss = -m.log_prob(action[0]) * reward
            loss.backward()
        self.optimizer.step()

    # forward pass: map a state to an action probability
    def act(self, state):
        return self.pgn(state)
        # return self.lstm(state)

    def reward_drawing(self):
        reward_pool = numpy.array(self.reward_pool)
        x = range(reward_pool.size)
        plt.plot(x,reward_pool)
        plt.show()
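
The core of learn() is the REINFORCE update: _adjust_reward first turns the stored per-step rewards into discounted returns, restarting the accumulation at every terminal step (which was stored with reward 0), then normalizes them; learn() then weights each action's negative log-probability by its normalized return. A toy trace of the backward pass (a minimal sketch with two short episodes):

gamma = 0.99
returns = [1, 1, 0, 1, 0]          # two episodes; 0 marks each terminal step
running_add = 0
for i in reversed(range(len(returns))):
    if returns[i] == 0:
        running_add = 0            # episode boundary: restart the accumulation
    else:
        running_add = running_add * gamma + returns[i]
        returns[i] = running_add
print(returns)                     # [1.99, 1.0, 0, 1.0, 0]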

Running the Training Loop

if __name__=='__main__':
    logger = logging.getLogger(__name__)
    # hyperparameters
    BATCH_SIZE = 5          # episodes between parameter updates
    LEARNING_RATE = 0.01
    GAMMA = 0.99            # discount factor
    NUM_EPISODES = 50

    env = gym.make('InvertedDoublePendulum',render_mode='human')
    cart_agent = CartAgent(learning_rate=LEARNING_RATE, gamma=GAMMA)

    for i_episode in range(NUM_EPISODES):
        next_state,_ = env.reset()
        env.render()
        t=0
        ## changed from the original code, which was not a loop
        while True:
            t=t+1
            state = torch.from_numpy(next_state[:-3]).float()  # keep the first 8 observation dims to match the PGN input
            probs = cart_agent.act(state)
            m = Bernoulli(probs)
            action = m.sample()
            action = [action.data.numpy().astype(int).item()]  # wrap the sampled 0/1 as a one-element action
            next_state, reward, terminated, truncated, info = env.step(action)
            env.render()
            # the terminal step's reward is set to 0 (used as an episode marker in _adjust_reward)
            if terminated:
                reward = 0
            cart_agent.memorize(state, action, reward)
            if terminated:
                print('Episode {}: durations {}'.format(i_episode, t))
                logger.info('Episode {}: durations {}'.format(i_episode, t))
                break
        # update the parameters every BATCH_SIZE episodes
        if i_episode > 0 and i_episode % BATCH_SIZE == 0:
            cart_agent.learn()

    cart_agent.reward_drawing()

The complete code above can be found in the original blog post.
