conda create -n gym python=3.8
y
pip install gym
# full installation (MuJoCo environments, then all optional extras)
pip install --upgrade "gym[mujoco]"
pip install --upgrade "gym[all]"
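A quick sanity check, my own addition (it assumes the gym >= 0.26 registry, which stores environment specs in a dict): print the installed version and confirm the MuJoCo double pendulum got registered.

# Sanity check (my addition, assuming the gym >= 0.26 registry API).
import gym

print(gym.__version__)
print([env_id for env_id in gym.envs.registry if 'InvertedDoublePendulum' in env_id])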
import gym

env = gym.make('InvertedDoublePendulum', render_mode='human')
env.reset()
for _ in range(1000):
    # render the environment
    env.render()
    # sample a random action from the action space
    action = env.action_space.sample()
    # state: observation of the environment; reward: immediate reward;
    # terminated: the episode has ended and the environment must be reset
    #             (e.g. the pendulum has fallen past the threshold);
    # truncated: the episode was cut off (e.g. a time limit was reached);
    # info: auxiliary diagnostic information
    state, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        env.reset()
env.close()
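Before writing a policy it helps to look at what the environment actually exposes. The short check below is my own addition: the 11-dimensional observation is why the training code later slices off the last 3 entries to feed an 8-input network, and the action space is a single continuous torque in [-1, 1].

# Inspect the spaces of InvertedDoublePendulum (my addition).
import gym

env = gym.make('InvertedDoublePendulum')
print(env.observation_space)   # expect an 11-dimensional Box
print(env.action_space)        # expect Box(-1.0, 1.0, (1,)) - one torque value
env.close()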
import gym

env = gym.make('InvertedDoublePendulum', render_mode='human')
for i_episode in range(20):
    # reset() returns (observation, info) in the new gym API
    observation, info = env.reset()
    for t in range(100):
        env.render()
        print(observation)
        # sample a random action from the action space
        action = env.action_space.sample()
        # step() returns (observation, reward, terminated, truncated, info), as above
        observation, reward, terminated, truncated, info = env.step(action)
        if terminated or truncated:
            print("Episode finished after {} timesteps".format(t + 1))
            break
env.close()
The complete version of the above can be found in the blog post.
conda activate gym
conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
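Before moving on, it is worth confirming that this PyTorch build actually sees the GPU; a one-off check of my own:

# Quick check (my addition) that the CUDA 11.3 build of PyTorch can see a GPU.
import torch

print(torch.__version__)
print(torch.cuda.is_available())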
import gym
import logging

import numpy
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions.bernoulli import Bernoulli
3.2.1 Fully connected network (multi-layer perceptron)
class PGN(nn.Module):
    """Small MLP policy: 8-dim state -> probability of a Bernoulli action."""
    def __init__(self):
        super(PGN, self).__init__()
        self.linear1 = nn.Linear(8, 24)
        self.linear2 = nn.Linear(24, 36)
        self.linear3 = nn.Linear(36, 1)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = torch.sigmoid(self.linear3(x))
        return x
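A one-line smoke test of my own to confirm the shapes: PGN maps an 8-dimensional state to a single probability in (0, 1), which is exactly what the Bernoulli policy below expects.

# Smoke test (my addition): a dummy 8-dim state goes in, one probability comes out.
policy = PGN()
dummy_state = torch.randn(8)
print(policy(dummy_state))   # tensor of shape (1,) with a value in (0, 1)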
3.2.2 LSTM
# Define LSTM Neural Network
class LstmRNN(nn.Module):
    """
    Parameters:
    - input_size: feature size
    - hidden_size: number of hidden units
    - output_size: number of outputs
    - num_layers: layers of LSTM to stack
    """
    def __init__(self, input_size, hidden_size=1, output_size=1, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)  # LSTM module from torch.nn
        self.linear1 = nn.Linear(hidden_size, output_size)        # fully connected output layer

    def forward(self, _x):
        _x = _x.reshape(1, 1, -1)   # treat one observation as (seq_len=1, batch=1, input_size)
        x, _ = self.lstm(_x)        # x has size (seq_len, batch, hidden_size)
        s, b, h = x.shape
        x = x.view(s * b, h)
        x = self.linear1(x)
        x = x.view(s, b, -1)
        x = x.reshape(-1)
        return x
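The LSTM variant is sized for the full 11-dimensional observation and is only used if the commented-out lines in the agent below are swapped in. A quick shape check, again my own addition (note its output is unbounded, since there is no sigmoid):

# Smoke test (my addition): one 11-dim observation in, a single unbounded value out.
lstm_policy = LstmRNN(input_size=11)
dummy_obs = torch.randn(11)
print(lstm_policy(dummy_obs).shape)   # torch.Size([1])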
3.2.3 Connecting the code to gym
class CartAgent(object):
    def __init__(self, learning_rate, gamma):
        self.pgn = PGN()
        # self.lstm = LstmRNN(input_size=11)
        self.gamma = gamma
        self.optimizer = torch.optim.RMSprop(self.pgn.parameters(), lr=learning_rate)
        # self.optimizer = torch.optim.RMSprop(self.lstm.parameters(), lr=learning_rate)
        self.state_pool = []
        self.action_pool = []
        self.reward_pool = []
        self.steps = 0

    def memorize(self, state, action, reward):
        # save to memory for mini-batch gradient descent
        self.state_pool.append(state)
        self.action_pool.append(action)
        self.reward_pool.append(reward)
        self.steps += 1

    def _adjust_reward(self):
        # discounted return, computed backwards; a reward of 0 marks an episode boundary
        running_add = 0
        for i in reversed(range(self.steps)):
            if self.reward_pool[i] == 0:
                running_add = 0
            else:
                running_add = running_add * self.gamma + self.reward_pool[i]
            self.reward_pool[i] = running_add
        # normalize the returns
        reward_mean = numpy.mean(self.reward_pool)
        reward_std = numpy.std(self.reward_pool)
        for i in range(self.steps):
            self.reward_pool[i] = (self.reward_pool[i] - reward_mean) / reward_std

    def learn(self):
        self._adjust_reward()
        # policy gradient: accumulate the gradient over all stored steps, then update once
        self.optimizer.zero_grad()
        for i in range(self.steps):
            # all steps across the batch of episodes
            state = self.state_pool[i]
            action = torch.FloatTensor(self.action_pool[i])
            reward = self.reward_pool[i]
            probs = self.act(state)
            m = Bernoulli(probs)
            # REINFORCE loss: negative log-probability weighted by the adjusted return
            loss = -m.log_prob(action) * reward
            loss.backward()
        self.optimizer.step()

    # forward pass
    def act(self, state):
        return self.pgn(state)
        # return self.lstm(state)

    def reward_drawing(self):
        reward_pool = numpy.array(self.reward_pool)
        x = range(reward_pool.size)
        plt.plot(x, reward_pool)
        plt.show()
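To make _adjust_reward concrete, here is a tiny worked example of my own: two short episodes stored in the pool, each ending with the reward-0 marker, get turned into discounted returns and then normalized.

# Toy illustration of _adjust_reward (my addition): two episodes of rewards,
# each terminated by the reward-0 marker used in the training loop below.
agent = CartAgent(learning_rate=0.01, gamma=0.99)
for r in [1, 1, 0, 1, 1, 1, 0]:
    agent.memorize(state=None, action=None, reward=r)
agent._adjust_reward()
print(agent.reward_pool)   # discounted returns, normalized to zero mean / unit std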
Running the training
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # hyperparameters
    BATCH_SIZE = 5        # number of episodes between policy updates
    LEARNING_RATE = 0.01
    GAMMA = 0.99
    NUM_EPISODES = 50

    env = gym.make('InvertedDoublePendulum', render_mode='human')
    cart_agent = CartAgent(learning_rate=LEARNING_RATE, gamma=GAMMA)

    for i_episode in range(NUM_EPISODES):
        next_state, _ = env.reset()
        env.render()
        t = 0
        ## changed this part: the original code was not a loop
        while True:
            t = t + 1
            # drop the last 3 observation entries to match the 8-input network
            state = torch.from_numpy(next_state[:-3]).float()
            probs = cart_agent.act(state)
            m = Bernoulli(probs)
            action = m.sample()
            action = [action.data.numpy().astype(int).item()]
            next_state, reward, terminated, truncated, info = env.step(action)
            env.render()
            # the final step's reward is set to 0 so _adjust_reward can spot episode ends
            if terminated or truncated:
                reward = 0
            cart_agent.memorize(state, action, reward)
            if terminated or truncated:
                print('Episode {}: durations {}'.format(i_episode, t))
                logger.info('Episode {}: durations {}'.format(i_episode, t))
                break
        # update parameters every BATCH_SIZE episodes
        if i_episode > 0 and i_episode % BATCH_SIZE == 0:
            cart_agent.learn()

    cart_agent.reward_drawing()
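After training, a greedy rollout (take the more likely of the two Bernoulli actions instead of sampling) gives a quick qualitative check. This is a sketch of my own, meant to be appended to the end of the main block above where cart_agent and env are still in scope:

    # Greedy evaluation rollout (my addition): use the trained PGN policy,
    # picking action 1 when p > 0.5 and action 0 otherwise.
    obs, _ = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        state = torch.from_numpy(obs[:-3]).float()
        prob = cart_agent.act(state)
        action = [int(prob.item() > 0.5)]
        obs, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        done = terminated or truncated
    print('greedy rollout return:', total_reward)
    env.close()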
The complete version of the above can be found in the blog post.