强化学习--多维动作状态空间的设计

目录

  • 一、离散动作
  • 二、连续动作
    • 1、例子1
    • 2、知乎给出的示例
    • 2、github里面的代码

免责声明:以下代码部分来自网络,部分来自ChatGPT,部分来自个人的理解。如有其他观点,欢迎讨论!

一、离散动作

注意:本文均以PPO算法为例。

# time: 2023/11/22 21:04
# author: YanJP


import torch
import torch
import torch.nn as nn
from torch.distributions import Categorical

class MultiDimensionalActor(nn.Module):
    def __init__(self, input_dim, output_dims):
        super(MultiDimensionalActor, self).__init__()

        # Define a shared feature extraction network
        self.feature_extractor = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )

        # Define individual output layers for each action dimension
        self.output_layers = nn.ModuleList([
            nn.Linear(64, num_actions) for num_actions in output_dims
        ])

    def forward(self, state):
        # Feature extraction
        features = self.feature_extractor(state)

        # Generate Categorical objects for each action dimension
        categorical_objects = [Categorical(logits=output_layer(features)) for output_layer in self.output_layers]

        return categorical_objects

# 定义主函数
def main():
    # 定义输入状态维度和每个动作维度的动作数
    input_dim = 10
    output_dims = [5, 8]  # 两个动作维度,分别有 3 和 4 个可能的动作

    # 创建 MultiDimensionalActor 实例
    actor_network = MultiDimensionalActor(input_dim, output_dims)

    # 生成输入状态(这里使用随机数据作为示例)
    state = torch.randn(1, input_dim)

    # 调用 actor 网络
    categorical_objects = actor_network(state)

    # 输出每个动作维度的采样动作和对应的对数概率
    for i, categorical in enumerate(categorical_objects):
        sampled_action = categorical.sample()
        log_prob = categorical.log_prob(sampled_action)
        print(f"Sampled action for dimension {i+1}: {sampled_action.item()}, Log probability: {log_prob.item()}")

if __name__ == "__main__":
    main()

#Sampled action for dimension 1: 1, Log probability: -1.4930928945541382
#Sampled action for dimension 2: 3, Log probability: -2.1875085830688477

注意代码中categorical函数的两个不同传入参数的区别:参考链接
简单来说,logits是计算softmax的,probs直接就是已知概率的时候传进去就行。

二、连续动作

参考链接:github、知乎
为什么取对数概率?参考回答
强化学习--多维动作状态空间的设计_第1张图片

1、例子1

先看如下的代码:

# time: 2023/11/21 21:33
# author: YanJP
#这是对应多维连续变量的例子:
# 参考链接:https://github.com/XinJingHao/PPO-Continuous-Pytorch/blob/main/utils.py
# https://www.zhihu.com/question/417161289
import torch.nn as nn
import torch
class Policy(nn.Module):
    def __init__(self, in_dim, n_hidden_1, n_hidden_2, num_outputs):
        super(Policy, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(in_dim, n_hidden_1),
            nn.ReLU(True),
            nn.Linear(n_hidden_1, n_hidden_2),
            nn.ReLU(True),
            nn.Linear(n_hidden_2, num_outputs)
        )

class Normal(nn.Module):
    def __init__(self, num_outputs):
        super().__init__()
        self.stds = nn.Parameter(torch.zeros(num_outputs))  #创建一个可学习的参数 
    def forward(self, x):
        dist = torch.distributions.Normal(loc=x, scale=self.stds.exp())
        action = dist.sample((every_dimention_output,))  #这里我觉得是最重要的,不填sample的参数的话,默认每个分布只采样一个值!!!!!!!!
        return action

if __name__ == '__main__':
    policy = Policy(4,20,20,5)
    normal = Normal(5) #设置5个维度
    every_dimention_output=10  #每个维度10个输出
    observation = torch.Tensor(4)
    action = normal.forward(policy.layer( observation))
    print("action: ",action)
  • self.stds.exp(),表示求指数,因为正态分布的标准差都是正数。
  • action = dist.sample((every_dimention_output,))这里最重要!!!

2、知乎给出的示例


class Agent(nn.Module):
    def __init__(self, envs):
        super(Agent, self).__init__()
        self.actor_mean = nn.Sequential(
            layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, np.prod(envs.single_action_space.shape)), std=0.01),
        )
        self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.single_action_space.shape)))

    def get_action_and_value(self, x, action=None):
        action_mean = self.actor_mean(x)
        action_logstd = self.actor_logstd.expand_as(action_mean)
        action_std = torch.exp(action_logstd)
        probs = Normal(action_mean, action_std)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), self.critic(x)

这里的np.prod(envs.single_action_space.shape),表示每个维度的动作数相乘,然后初始化这么多个actor网络的标准差和均值,最后action里面的sample就是采样这么多个数据。(感觉还是拉成了一维计算)

2、github里面的代码

github

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Beta,Normal

class GaussianActor_musigma(nn.Module):
	def __init__(self, state_dim, action_dim, net_width):
		super(GaussianActor_musigma, self).__init__()

		self.l1 = nn.Linear(state_dim, net_width)
		self.l2 = nn.Linear(net_width, net_width)
		self.mu_head = nn.Linear(net_width, action_dim)
		self.sigma_head = nn.Linear(net_width, action_dim)

	def forward(self, state):
		a = torch.tanh(self.l1(state))
		a = torch.tanh(self.l2(a))
		mu = torch.sigmoid(self.mu_head(a))
		sigma = F.softplus( self.sigma_head(a) )
		return mu,sigma

	def get_dist(self, state):
		mu,sigma = self.forward(state)
		dist = Normal(mu,sigma)
		return dist

	def deterministic_act(self, state):
		mu, sigma = self.forward(state)
		return mu

上述代码主要是通过设置mu_head 和sigma_head的个数,来实现多维动作。

class GaussianActor_mu(nn.Module):
	def __init__(self, state_dim, action_dim, net_width, log_std=0):
		super(GaussianActor_mu, self).__init__()

		self.l1 = nn.Linear(state_dim, net_width)
		self.l2 = nn.Linear(net_width, net_width)
		self.mu_head = nn.Linear(net_width, action_dim)
		self.mu_head.weight.data.mul_(0.1)
		self.mu_head.bias.data.mul_(0.0)

		self.action_log_std = nn.Parameter(torch.ones(1, action_dim) * log_std)

	def forward(self, state):
		a = torch.relu(self.l1(state))
		a = torch.relu(self.l2(a))
		mu = torch.sigmoid(self.mu_head(a))
		return mu

	def get_dist(self,state):
		mu = self.forward(state)
		action_log_std = self.action_log_std.expand_as(mu)
		action_std = torch.exp(action_log_std)

		dist = Normal(mu, action_std)
		return dist

	def deterministic_act(self, state):
		return self.forward(state)
class Critic(nn.Module):
	def __init__(self, state_dim,net_width):
		super(Critic, self).__init__()

		self.C1 = nn.Linear(state_dim, net_width)
		self.C2 = nn.Linear(net_width, net_width)
		self.C3 = nn.Linear(net_width, 1)

	def forward(self, state):
		v = torch.tanh(self.C1(state))
		v = torch.tanh(self.C2(v))
		v = self.C3(v)
		return v

上述代码只定义了mu的个数与维度数一样,std作为可学习的参数之一。

你可能感兴趣的:(机器学习,深度学习,强化学习,人工智能,python,算法,机器学习)