A few days ago I wrote a neural network implemented as a linked structure of node objects. I never actually ran it, and since it does not use vector (array) data structures its efficiency is bound to be poor; and since it was never run it certainly contains mistakes, so treat it as pseudocode in the spirit of Introduction to Algorithms (a usage sketch follows the code).
import random
import numpy as np
import pandas as pd
import copy
def tanh(x):
    return np.tanh(x)

def tanh_derivative(x):
    return 1.0 - x * x

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

def relu(x):
    return max(x, 0)

def relu_derivative(x):
    if x <= 0:
        return 0
    else:
        return 1
class ActivationFunc:
    def __init__(self):
        self.tdict = dict()
        self.tdict['tanh'] = np.tanh
        self.tdict['sigmoid'] = lambda x: 1 / (1 + np.exp(-x))
        self.tdict['relu'] = relu
        self.tdict['softmax'] = np.exp
        self.ddict = dict()
        self.ddict['tanh'] = tanh_derivative
        self.ddict['sigmoid'] = sigmoid_derivative
        self.ddict['relu'] = relu_derivative
        self.ddict['softmax'] = np.exp
    def getActivation(self, activation):
        if activation in self.tdict:
            return self.tdict[activation]
        else:
            return lambda x: x
    def getDActivation(self, activation):
        if activation in self.ddict:
            return self.ddict[activation]
        else:
            # ones_like works for both scalars and arrays (the node inputs here are scalars)
            return lambda x: np.ones_like(x)
class Node:
    def __init__(self, tid, belongNet, activation='relu', mtype='middle'):
        self.id = tid
        self.mtop = 0
        self.forwardlist = []   # outgoing edges: (next node id, weight, accumulated gradient)
        self.backlist = []      # ids of the nodes feeding into this one
        self.outputd = 0
        self.inputd = 0
        self.moutput = 0
        self.minput = 0
        self.AF = ActivationFunc()
        self.Activation = self.AF.getActivation(activation)
        self.Dactivation = self.AF.getDActivation(activation)
        self.mtype = mtype
        self.Net = belongNet
    def fappend(self, nodeid):
        self.forwardlist.append((nodeid, random.random() - 0.5, 0))
    def bappend(self, nodeid):
        self.backlist.append(nodeid)
    def cleargrad(self):
        self.outputd = 0
        self.inputd = 0
        for i in range(len(self.forwardlist)):
            # keep node id and weight, zero the accumulated gradient
            self.forwardlist[i] = self.forwardlist[i][0:2] + (0,)
    def cleari(self):
        self.minput = 0
        self.outputd = 0
        self.mtop = 0
    def forward(self):
        self.moutput = self.Activation(self.minput)
        for (nxtnode, w, _) in self.forwardlist:
            self.Net.nodepool[nxtnode].minput += w * self.moutput
    def backward(self):
        self.inputd = self.outputd * self.Dactivation(self.moutput)
        for nid in self.backlist:
            bnode = self.Net.nodepool[nid]
            # bnode.forwardlist[bnode.mtop] is the edge leading to this node
            # (downstream nodes are visited in the same order they were created)
            fid, _w, _grad = bnode.forwardlist[bnode.mtop]
            bnode.outputd += self.inputd * _w
            bnode.forwardlist[bnode.mtop] = (fid, _w, _grad + bnode.moutput * self.inputd)
            bnode.mtop += 1
    def step(self):
        for i in range(len(self.forwardlist)):
            (nid, _w, _grad) = self.forwardlist[i]
            self.forwardlist[i] = (nid, _w - _grad, 0)
    def activate(self):
        self.moutput = self.Activation(self.minput)
        return self.moutput
class LNet:
    def __init__(self, inputsize):
        self.nodelen = 0
        self.nodepool = []
        self.layerlist = []
        self.activationlist = []
        self.layernum = 1
        layer0 = []
        for i in range(inputsize):
            # input nodes use the identity activation (any name not in the dicts)
            layer0.append(self.addNode('linear', 'input'))
        self.layerlist.append(layer0)
    def addNode(self, actstr, stype):
        self.nodepool.append(Node(self.nodelen, self, activation=actstr, mtype=stype))
        self.nodelen += 1
        return self.nodelen - 1
    def addLayer(self, num, wbias=True, dense=True, actstr='relu'):
        if dense:
            nlayer = []
            for i in range(num):
                nid = self.addNode(actstr, 'mid')
                for nodeid in self.layerlist[-1]:
                    self.nodepool[nodeid].fappend(nid)
                    self.nodepool[nid].bappend(nodeid)
                nlayer.append(nid)
            self.layerlist.append(nlayer)
    def forward(self, tinput):
        for mlayer in self.layerlist:
            for nid in mlayer:
                self.nodepool[nid].cleari()
        for tid, nid in enumerate(self.layerlist[0]):
            self.nodepool[nid].minput = tinput[tid]
        for mlayer in self.layerlist:
            for nid in mlayer:
                self.nodepool[nid].forward()
    def backward(self, loss):
        for tid, nid in enumerate(self.layerlist[-1]):
            self.nodepool[nid].outputd = loss[tid]
        for mlayer in reversed(self.layerlist):
            for nid in mlayer:
                self.nodepool[nid].backward()
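For reference, this is roughly how I would drive the class end to end. It is as untested as the class itself; the layer sizes, the loss, and the learning-rate factor are just placeholders for illustration.

# Hypothetical usage sketch for LNet (untested); assumes the code above as written.
net = LNet(2)                          # two input nodes
net.addLayer(8, actstr='relu')         # hidden layer
net.addLayer(1, actstr='sigmoid')      # single output node

x, target = [0.5, -0.3], 1.0
net.forward(x)
out_id = net.layerlist[-1][0]
pred = net.nodepool[out_id].moutput
net.backward([0.01 * (pred - target)])   # fold a learning rate into the error signal
for node in net.nodepool:                # apply and reset the accumulated gradients
    node.step()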
Later I applied an earlier network I had written (not the one above) to training the quadcopter's Takeoff, Hover, and Landing tasks, and it failed again. Thinking it over, the cause may be that the network's initial parameters are too small, but spinning up the virtual machine and connecting to the simulator is so tedious that I did not keep trying. In Hover, if the start goes well the craft can keep hovering, but overall it was a failure. Based on my years of experience, the most likely culprit is that the initial values from self.para.append(np.random.rand(densesize, tsize) - 0.5) are too small; it could also be a math error somewhere, but because connecting to the simulator is such a hassle (I had already run it many times) I never confirmed whether this was really the problem.
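If the too-small initialization really is the problem, one thing worth trying (an assumption I never verified on the simulator) is scaling the random weights by the fan-in, e.g. He-style initialization for the relu layers, instead of small uniform values; he_init below is my own helper, not part of the code in this post.

# Hypothetical He-style initializer for the weights created in add().
import numpy as np

def he_init(densesize, tsize):
    # zero-mean Gaussian with std sqrt(2 / fan_in)
    return np.random.randn(densesize, tsize) * np.sqrt(2.0 / tsize)

# e.g. inside NNetwork.add():  self.para.append(he_init(densesize, tsize))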
Actor-Critic
import numpy as np
import pandas as pd
import copy
def tanh(x):
    return np.tanh(x)

def tanh_derivative(x):
    return 1.0 - x * x

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

def relu(x):
    return np.maximum(x, 0)
    #t = copy.copy(x)
    #for i in range(len(t)):
    #    if t[i] < 0:
    #        t[i] = 0
    #return t

def relu_derivative(x):
    t = copy.copy(x)
    for i in range(len(t)):
        if t[i] <= (1e-12):
            t[i] = 0
        else:
            t[i] = 1
    return t
class ActivationFunc:
    def __init__(self):
        self.tdict = dict()
        self.tdict['tanh'] = np.tanh
        self.tdict['sigmoid'] = lambda x: 1 / (1 + np.exp(-np.copy(x).clip(-20, 20)))
        self.tdict['relu'] = relu
        self.tdict['softmax'] = np.exp
        self.ddict = dict()
        self.ddict['tanh'] = tanh_derivative
        self.ddict['sigmoid'] = sigmoid_derivative
        self.ddict['relu'] = relu_derivative
        self.ddict['softmax'] = lambda x: x
    def getActivation(self, activation):
        if activation in self.tdict:
            return self.tdict[activation]
        else:
            return lambda x: x
    def getDActivation(self, activation):
        if activation in self.ddict:
            return self.ddict[activation]
        else:
            return lambda x: np.ones(x.shape)

#print(ActivationFunc().getActivation('logistic')(1.0))
#print(logistic_derivative(1.0))
class NNetwork:
    def __init__(self, inputsize, lr=0.001, withbias=True, optimizer='adam', allzero=False):
        self.para = []          # one weight matrix per layer
        self.layerout = []      # per-layer outputs from the last forward pass
        self.grad = []          # accumulated gradients, one matrix per layer
        self.backout = []
        self.allzero = allzero
        self.activationclass = ActivationFunc()
        self.inputsize = inputsize
        self.outputsize = inputsize
        self.lastsize = inputsize
        self.lr = lr
        self.layerlen = 0
        self.activation = []
        self.deactivation = []
        self.wbias = withbias
        self.outputfunc = 'softmax'
        self.maxnum = 0.001
        self.bstep = 0
        self.belta1 = 0.7
        self.belta2 = 0.7
        self.alphat = 1.0
        self.Eg = None
        self.m = None
        self.moutput = None
        self.tmpgrad = None
        self.test = 0
        if optimizer == 'adam':
            #print('optimized with adam')
            self.stepfunc = self.adamstep
        else:
            #print('optimized with std')
            self.stepfunc = self.stdstep
        #self.activation = ActivationFunc().getActivation(mactivation)
    def add(self, densesize, actstr):
        tsize = self.lastsize
        if self.wbias:
            tsize += 1
        if not self.allzero:
            self.para.append(np.random.rand(densesize, tsize) * 2 - 1)
        else:
            self.para.append(np.zeros((densesize, tsize)) * 2 - 1)
        self.grad.append(np.zeros((densesize, tsize)))
        self.lastsize = densesize
        self.activation.append(self.activationclass.getActivation(actstr))
        self.deactivation.append(self.activationclass.getDActivation(actstr))
        self.layerlen += 1
        self.outputfunc = actstr
        self.outputsize = densesize
    def forward(self, input):
        self.layerout = []
        if self.wbias:
            self.layerout.append(np.append(np.array(input), 1))
        else:
            self.layerout.append(np.array(input))
        for i in range(self.layerlen):
            #print(self.layerout[-1].shape, self.para[i].shape)
            if self.wbias and i != self.layerlen - 1:
                self.layerout.append(np.append((self.activation[i](np.dot(self.para[i], self.layerout[-1].T).clip(-5000, 5000))).clip(-5000, 5000), 1))
            else:
                self.layerout.append((self.activation[i](np.dot(self.para[i], self.layerout[-1].T).clip(-5000, 5000))).clip(-5000, 5000))
        self.moutput = np.copy(self.layerout[-1])
        return self.moutput
    def backward(self, y, y_label=0):
        self.maxnum = 0.001
        self.bstep += 1
        tsumy = sum(y)
        if self.outputfunc == 'softmax':
            # gradient of cross-entropy w.r.t. the pre-softmax outputs
            y[y_label] -= tsumy
            y /= max(1e-4, tsumy)
        #self.maxnum = max(self.maxnum, max(y))
        self.backout = []
        self.backout.append(np.matrix(y).T)
        for i in range(self.layerlen, 0, -1):
            #print(self.backout[-1].shape, np.matrix(self.layerout[i - 1]).shape)
            self.grad[i - 1] += np.dot(self.backout[-1], np.matrix(self.layerout[i - 1])).clip(-5000, 5000)
            self.grad[i - 1] = self.grad[i - 1].clip(-5000, 5000)
            self.maxnum = max(np.abs(self.grad[i - 1]).max().max(), self.maxnum)
            if i > 1:
                if self.wbias:
                    self.backout.append(np.multiply(self.deactivation[i - 2](self.layerout[i - 1]), np.dot(self.backout[-1].T, self.para[i - 1])).T[:-1, :].clip(-5000, 5000))
                else:
                    self.backout.append(np.multiply(self.deactivation[i - 2](self.layerout[i - 1]), np.dot(self.backout[-1].T, self.para[i - 1])).T.clip(-5000, 5000))
            else:
                self.backout.append(np.dot(self.backout[-1].T, self.para[i - 1]).clip(-5000, 5000))
    def backwardr(self, y):
        # backprop from an externally supplied gradient on the raw output
        self.maxnum = 0.001
        self.bstep += 1
        self.backout = []
        self.backout.append(np.matrix(self.deactivation[-1](self.moutput) * y).T.clip(-5000, 5000))
        for i in range(self.layerlen, 0, -1):
            self.grad[i - 1] += np.dot(self.backout[-1], np.matrix(self.layerout[i - 1])).clip(-5000, 5000)
            self.maxnum = max(np.abs(self.grad[i - 1]).max().max(), self.maxnum)
            if i > 1:
                if self.wbias:
                    self.backout.append(np.multiply(self.deactivation[i - 2](self.layerout[i - 1]), np.dot(self.backout[-1].T, self.para[i - 1])).T[:-1, :].clip(-5000, 5000))
                else:
                    self.backout.append(np.multiply(self.deactivation[i - 2](self.layerout[i - 1]), np.dot(self.backout[-1].T, self.para[i - 1])).T.clip(-5000, 5000))
            else:
                self.backout.append(np.dot(self.backout[-1].T, self.para[i - 1]).clip(-5000, 5000))
                if np.abs(self.backout[-1]).max() < 1e-6 and self.test < 1:
                    print(self.layerout[i - 1], self.para[i - 1], self.backout[-2].T)
                    self.test += 1
    def zero_grad(self):
        for obj in self.grad:
            obj.fill(0)
        self.maxnum = 0.001
        self.bstep = 0
    def step(self, maxinum=None):
        self.stepfunc(maxinum)
    def stdstep(self, maxinum=None):
        if maxinum is None:
            maxinum = self.maxnum
        for obj1, obj2 in zip(self.para, self.grad):
            obj1 -= (self.lr * obj2 * self.bstep)
        self.zero_grad()
    def adamstep(self, maxinum=None):
        if maxinum is None:
            maxinum = self.maxnum
        self.belta2 = min(0.9, self.belta2 * 1.01)
        self.belta1 = min(0.9, self.belta1 * 1.01)
        if self.Eg is not None:
            self.Eg = (1 - self.belta2) * maxinum + self.belta2 * self.Eg
            for i in range(len(self.m)):
                # update the first-moment estimate in place
                self.m[i] = (1 - self.belta1) * self.grad[i] + self.belta1 * self.m[i]
        else:
            self.Eg = maxinum
            # copy, otherwise zero_grad() would wipe m as well
            self.m = [np.copy(obj) for obj in self.grad]
        te = self.Eg / (1 - np.power(self.belta2, self.alphat))
        tm = [obj / (1 - np.power(self.belta1, self.alphat)) for obj in self.m]
        for obj1, obj2 in zip(self.para, self.m):
            obj1 -= (self.lr * obj2 / max(te, 0.01) * self.bstep)
        self.zero_grad()
    def predict(self, input):
        y = self.forward(input)
        y /= np.sum(y)
        return y
class Qsa:
    def __init__(self, nn1, nn2, lr=0.1):
        self.nn1 = nn1
        self.nn2 = nn2
        self.inputsize = nn1.outputsize + nn2.outputsize
        self.w = 2 * np.random.rand(self.inputsize + 1) - 1
        self.grad = np.zeros(self.inputsize + 1)
        self.moutput = None
        self.maxnum = 0
        self.lr = lr
        self.bstep = 0
        self.test = 0
    def forward(self, input1, input2):
        self.moutput = np.dot(np.append(np.append(self.nn1.forward(input1), self.nn2.forward(input2)), 1), self.w).clip(-5000, 5000)
        return self.moutput
    def backward(self, delta):
        self.grad[:self.nn1.outputsize] += (delta * self.nn1.moutput).clip(-5000, 5000)
        self.grad[-self.nn2.outputsize - 1: -1] += (delta * self.nn2.moutput).clip(-5000, 5000)
        self.grad[-1] += delta
        self.nn1.backwardr((delta * self.w[:self.nn1.outputsize]).clip(-5000, 5000))
        self.nn2.backwardr((delta * self.w[-self.nn2.outputsize - 1: -1]).clip(-5000, 5000))
        self.maxnum = np.abs(self.grad).max()
        self.maxnum = max(self.maxnum, self.nn1.maxnum)
        self.maxnum = max(self.maxnum, self.nn2.maxnum)
        self.bstep += 1
    def step(self):
        self.w -= (self.lr * self.grad) * self.bstep
        self.grad.fill(0)
        self.bstep = 0   # reset the accumulation counter along with the gradient
        self.nn1.stdstep(self.maxnum)
        self.nn2.stdstep(self.maxnum)
        #print(self.maxnum)
    def getAgrad(self):
        # gradient of Q with respect to the action-network input (drop the bias term)
        tmp = np.copy(self.nn2.backout[-1])
        ret = tmp.reshape(self.nn2.inputsize + 1)[:self.nn2.inputsize]
        return ret
class MActor:
    def __init__(self, ssize, asize, actionlow, actionhigh, moptimizer='adam', mlr=0.001):
        self.net = NNetwork(ssize, optimizer=moptimizer, lr=mlr)
        self.net.add(16, 'relu')
        self.net.add(32, 'relu')
        self.net.add(asize, 'sigmoid')
        self.actionlow = actionlow
        self.actionhigh = actionhigh
        self.actionrange = self.actionhigh - self.actionlow
    def forward(self, input1):
        return self.actionlow + self.actionrange * self.net.forward(input1)
    def backward(self, delta1):
        self.net.backwardr(delta1 * self.actionrange)
    def step(self):
        #print('learnt from experience')
        self.net.step()

class MCritic:
    def __init__(self, ssize, asize):
        self.Snet = NNetwork(ssize, optimizer='std', lr=0.1, allzero=False)
        self.Snet.add(32, 'relu')
        self.Anet = NNetwork(asize, optimizer='std', lr=0.1, allzero=False)
        self.Anet.add(32, 'relu')
        self.net = Qsa(self.Snet, self.Anet)
    def forward(self, minput1, minput2):
        return self.net.forward(minput1, minput2)
    def backward(self, merror):
        self.net.backward(merror)
    def getAgrad(self):
        return self.net.getAgrad()
    def step(self):
        self.net.step()
#2*x + y - 3
if __name__ == "__main__":
    model = NNetwork(2, withbias=True, lr=0.001, optimizer='adam')
    model.add(16, 'relu')
    model.add(8, 'relu')
    model.add(2, 'softmax')
    data = pd.read_csv('data.csv').astype('float64').sample(frac=1)
    datalen = len(data)
    data_train = data.iloc[:int(datalen * 0.9), :]
    data_test = data.iloc[int(datalen * 0.9):, :]
    X_train = data_train.iloc[:, :2]
    y_train = data_train.iloc[:, 2].astype('int')
    X_test = data_test.iloc[:, :2]
    y_test = data_test.iloc[:, 2].astype('int')
    len_train = len(X_train)
    #print(X_train.dtype)
    for i in range(400000):
        tid = i % len_train
        #print(X_train.iloc[tid])
        output = model.forward(X_train.iloc[tid])
        model.backward(output, y_train.iloc[tid])
        if tid == len_train - 1:
            model.step()
    pres = []
    for ind, val in X_test.iterrows():
        pres.append(np.argmax(model.predict(val)))
    res1 = np.array(pres)
    res2 = np.array(y_test)
    print(res1)
    print(res2)
    '''
    X = [[0,0],[0,1],[1,0],[1,1]]
    y = [0, 1, 1, 0]
    for i in range(200000):
        tid = i % 4
        #model.zero_grad()
        output = model.forward(X[tid])
        model.backward(output, y[tid])
        if tid == 3:
            model.step()
    print(model.predict([1,1]))
    print(model.predict([0,1]))
    print(model.predict([0,0]))
    print(model.predict([1,0]))
    '''
Below is the Agent. The soft update from the paper is not implemented either; I just update the networks directly. Because my understanding of the algorithm is shaky and I never got any results, I cannot say whether these two pieces of code contain one, two, three, or four mistakes, but my guess is that the networks' initial weights w are too small. I also did not try an Adam-optimizer version for this application, which I suspect might work a bit better. Debugging later, I found the hidden layers overflowing numerically; even after adding clip the results were still not great.
import numpy as np
import os
import pandas as pd
from quad_controller_rl.agents.base_agent import BaseAgent
from quad_controller_rl.agents.ounoise import OUNoise
from quad_controller_rl.agents.replay_buffer import ReplayBuffer
from quad_controller_rl import util
#import quad_controller_rl.agents.qpisa as QPISA
from quad_controller_rl.agents.qpisa import MActor,MCritic
class DIYAgent(BaseAgent):
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        ...
        # Task (environment) information
        self.task = task  # should contain observation_space and action_space
        self.state_size = np.prod(self.task.observation_space.shape)
        self.state_range = self.task.observation_space.high - self.task.observation_space.low
        self.action_size = np.prod(self.task.action_space.shape)
        self.action_range = self.task.action_space.high - self.task.action_space.low
        # Actor (Policy) Model
        self.action_low = self.task.action_space.low[:3]
        self.action_high = self.task.action_space.high[:3]
        self.actor_local = MActor(self.state_size, 3, self.action_low, self.action_high)
        # Critic (Value) Model
        self.critic_local = MCritic(self.state_size, 3)
        # Initialize target model parameters with local model parameters
        #self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        #self.actor_target.model.set_weights(self.actor_local.model.get_weights())
        # Noise process
        self.noise = OUNoise(self.action_size, mu=np.array([0, 0, 0, 0, 0, 0]))
        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)
        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
        self.last_state = None
        self.last_action = None
        self.stats_filename = os.path.join(
            util.get_param('out'), "stats_{}.csv".format(util.get_timestamp()))  # path to CSV file
        self.stats_columns = ['episode', 'total_reward']  # specify columns to save
        self.episode_num = 1
        self.total_reward = 0
        print("Saving stats {} to {}".format(self.stats_columns, self.stats_filename))  # [debug]
        ...
    def preprocess_state(self, state):
        """Reduce state vector to relevant dimensions."""
        return state[0:3]  # position only
    def postprocess_action(self, action):
        """Return complete action vector."""
        complete_action = np.zeros(self.task.action_space.shape)  # shape: (6,)
        #print(complete_action.shape, action.shape)
        complete_action[0:3] = action[0:3]  # linear force only
        return complete_action
    def write_stats(self, stats):
        """Write single episode stats to CSV file."""
        df_stats = pd.DataFrame([stats], columns=self.stats_columns)  # single-row dataframe
        df_stats.to_csv(self.stats_filename, mode='a', index=False, header=not os.path.isfile(self.stats_filename))
    def step(self, state, reward, done):
        ...
        # Choose an action
        #print('first step')
        action = self.postprocess_action(self.act(state))
        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state, done)
        ...
        # Learn, if enough samples are available in memory
        if len(self.memory) >= self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)
        self.last_state = state
        self.last_action = action
        self.total_reward += reward
        if done:
            # Write episode stats
            self.write_stats([self.episode_num, self.total_reward])
            self.episode_num += 1
            self.total_reward = 0
        return action
    ...
    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        actions = np.zeros(6)
        actions[:3] = self.actor_local.forward(states)
        #print(type(tmp),tmp.shape)
        return actions + self.noise.sample()  # add some noise for exploration
    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action[:3] for e in experiences if e is not None]).astype(np.float32).reshape(-1, 3)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])
        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = []
        for mstate in next_states:
            actions_next.append(self.actor_local.forward(mstate))
        Q_targets_next = []
        for mstate, maction in zip(list(next_states), actions_next):
            Q_targets_next.append(self.critic_local.forward(mstate, maction))
        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards.reshape(len(Q_targets_next)) + self.gamma * np.multiply(np.array(Q_targets_next), (1 - dones.reshape(len(Q_targets_next))))
        #print(len(states), len(actions), len(Q_targets))
        for i in range(len(states)):
            mstate = states[i]
            maction = actions[i]
            mvalue = Q_targets[i]
            tvalue = self.critic_local.forward(mstate, maction)
            self.critic_local.backward(tvalue - mvalue)
            Agrad = self.critic_local.getAgrad()
            taction = self.actor_local.forward(mstate)
            self.actor_local.backward(-taction.clip(-25, 25) * Agrad)
            #print(tvalue,mvalue)
            print(Agrad)
        self.critic_local.step()
        self.actor_local.step()
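For completeness, the soft update from the DDPG paper that I skipped would look roughly like this. It is only a sketch: it assumes each network keeps its weight matrices in a para list the way NNetwork does, and the target copies themselves are never created in the code above (MCritic would additionally need Snet, Anet and the Qsa.w vector handled the same way).

# Hypothetical soft update: target <- tau * local + (1 - tau) * target.
def soft_update(local_net, target_net, tau=0.01):
    for w_local, w_target in zip(local_net.para, target_net.para):
        w_target *= (1.0 - tau)   # update in place so the target keeps its own arrays
        w_target += tau * w_local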
That said, with the DDPG implementation from the official Udacity course all three tasks succeeded, although the landing is a bit fast rather than the promised gentle landing. The reward expression is probably not quite right: the penalty on velocity is not large enough, but making it too large easily turns the behavior into hovering.
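To make the velocity-penalty idea concrete, this is the kind of reward shaping I have in mind; the field layout, target height, and coefficients are made up for illustration and are not the task's actual reward.

# Hypothetical landing reward: penalize height above the pad and vertical speed.
def landing_reward(position, velocity, target_z=0.0, w_dist=1.0, w_speed=0.5):
    dist = abs(position[2] - target_z)           # height above the landing target
    speed_penalty = w_speed * abs(velocity[2])   # larger -> gentler landing, but risks hovering
    return -w_dist * dist - speed_penalty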