A neural network with an Adam-style optimizer

     I wrote my own function that does something like Adam optimization. I have long wanted to use my hand-written neural network in the quadcopter training project, but I was afraid training would be too slow, so I wrote a function to speed up training. It is not the standard algorithm and differs from standard Adam in a few places. In that project, the Actor-Critic implementation with TensorFlow apparently has to go through some backend parameters, so it is more convenient to use my own network, where the parameters can be accessed directly.

     Normally the optimizer is implemented as a separate class: nn.forward() computes each layer's output, loss.backward() computes the derivative of the loss with respect to every parameter, the optimizer is handed references to the network's parameters in its constructor (so it can read both the parameters and their gradients), and optimizer.step() moves the parameters some distance along a gradient-like direction based on its history and its update rule (this is the training workflow in PyTorch). My implementation is simpler and does not have a separate optimizer class.
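For reference, a minimal PyTorch-style loop of the kind described above might look like the sketch below (the tiny network, data and hyperparameters are made up for illustration):

import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(2, 16), nn.ReLU(), nn.Linear(16, 2))
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)  # receives references to net's parameters
criterion = nn.CrossEntropyLoss()

x = torch.randn(8, 2)
y = torch.randint(0, 2, (8,))

optimizer.zero_grad()        # clear accumulated gradients
loss = criterion(net(x), y)  # forward pass and loss
loss.backward()              # d(loss)/d(parameter) for every parameter
optimizer.step()             # move the parameters using the gradients and optimizer state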

import numpy as np
import pandas as pd
import copy
def tanh(x):
    return np.tanh(x)
def tanh_derivative(x):
    # derivative expressed in terms of the activation's output (x = tanh(z))
    return 1.0 - x * x
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
def sigmoid_derivative(x):
    # derivative expressed in terms of the activation's output (x = sigmoid(z))
    return x * (1 - x)
def relu(x):
	return np.maximum(x, 0)
	#t = copy.copy(x)
	#for i in range(len(t)):
	#	if t[i] < 0:
	#		t[i] = 0
	#return t

def relu_derivative(x):
	# elementwise: 1 where the activation is numerically positive, else 0
	return (x > 1e-12).astype(np.float64)


class ActivationFunc:
	def __init__(self):
		self.tdict = dict()
		self.tdict['tanh'] = np.tanh
		self.tdict['sigmoid'] = lambda x: 1 / (1 + np.exp(-x))
		self.tdict['relu'] = relu
		self.tdict['softmax'] = np.exp
		self.ddict = dict()
		self.ddict['tanh'] = tanh_derivative
		self.ddict['sigmoid'] = sigmoid_derivative
		self.ddict['relu'] = relu_derivative
		self.ddict['softmax'] = np.exp

	def getActivation(self, activation):
		if activation in self.tdict:
			return self.tdict[activation]
		else:
			return lambda x: x

	def getDActivation(self, activation):
		if activation in self.ddict:
			return self.ddict[activation]
		else:
			return lambda x: np.ones(x.shape)
	

#print(ActivationFunc().getActivation('logistic')(1.0))
#print(logistic_derivative(1.0))
class NNetwork:
	def __init__(self, inputsize, lr = 0.001, withbias = True, optimizer = 'adam') :
		self.para = []
		self.layerout = []
		self.grad = []
		self.backout = []
		
		self.activationclass = ActivationFunc()
		self.inputsize = inputsize
		self.lastsize = inputsize
		self.lr = lr
		self.layerlen = 0
		self.activation = []
		self.deactivation = []
		self.wbias = withbias
		self.outputfunc = 'softmax'
		self.maxnum = 0.001
		self.bstep = 0

		self.belta1 = 0.7
		self.belta2 = 0.7
		self.alphat = 1.0
		self.Eg = None
		self.m = None

		if optimizer == 'adam':
			print('optimized with adam')
			self.stepfunc = self.adamstep
		else:
			print('optimized with std')
			self.stepfunc = self.stdstep
		#self.activation = ActivationFunc().getActivation(mactivation)
		
	
	def add(self, densesize, actstr):
		tsize = self.lastsize
		if self.wbias:
			tsize += 1

		self.para.append(np.random.rand(densesize, tsize) - 0.5)
		self.grad.append(np.zeros((densesize, tsize)))
		
		self.lastsize = densesize
		self.activation.append(self.activationclass.getActivation(actstr))
		self.deactivation.append(self.activationclass.getDActivation(actstr))
		self.layerlen += 1
		self.outputfunc = actstr

	def forward(self, input):
		self.layerout = []
		if self.wbias:
			self.layerout.append(np.append(np.array(input), 1))
		else:
			self.layerout.append(np.array(input))
		for i in range(self.layerlen):
			#print(self.layerout[-1].shape, self.para[i].shape)
			if self.wbias and i != self.layerlen - 1:
				self.layerout.append(np.append(self.activation[i](np.dot(self.para[i], self.layerout[-1].T)), 1))
			else:
				self.layerout.append(self.activation[i](np.dot(self.para[i], self.layerout[-1].T)))
		return self.layerout[-1]

		
	def backward(self, y, y_label):
		self.maxnum = 0.001
		self.bstep += 1
		tsumy = sum(y)
		if self.outputfunc == 'softmax':
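			# layerout[-1] holds exp(z); subtracting the total from the true class turns y into
			# sum(exp(z)) * (softmax(z) - onehot(y_label)), i.e. the cross-entropy gradient
			# w.r.t. z up to a positive scale factor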
			y[y_label] -= tsumy
		
		#self.maxnum = max(self.maxnum, max(y))
		self.backout = []
		self.backout.append(np.matrix(y).T)

		for i in range(self.layerlen, 0, -1):
			#print(self.backout[-1].shape, np.matrix(self.layerout[i - 1]).shape)
			self.grad[i - 1] += np.dot(self.backout[-1], np.matrix(self.layerout[i - 1]))
			self.maxnum = max(np.abs(self.grad[i - 1]).max().max(), self.maxnum)
			if i > 1:
				if self.wbias:
					self.backout.append(np.multiply(self.deactivation[i - 2](self.layerout[i - 1]), np.dot(self.backout[-1].T, self.para[i - 1])).T[:-1,:])
				else:
					self.backout.append(np.multiply(self.deactivation[i - 2](self.layerout[i - 1]), np.dot(self.backout[-1].T, self.para[i - 1])).T)
			else:
				self.backout.append(np.dot(self.backout[-1].T, self.para[i - 1]))

	def zero_grad(self):
		for obj in self.grad:
			obj.fill(0)
		self.maxnum = 0.001
		self.bstep = 0

	def step(self):
		self.stepfunc()

	def stdstep(self):
		for obj1, obj2 in zip(self.para, self.grad):
			obj1 -= self.lr * obj2 / max(self.maxnum, 0.001) * self.bstep
		self.zero_grad()
	
	def adamstep(self):
		self.belta2 = min(0.9, self.belta2 * 1.01)
		self.belta1 = min(0.9, self.belta1 * 1.01)
		if self.Eg is not None:
			self.Eg = (1 - self.belta2) * self.maxnum + self.belta2 * self.Eg
			# update the first-moment estimate in place; a bare "obj1 = ..." inside zip()
			# would only rebind the loop variable and leave self.m unchanged
			for idx, obj2 in enumerate(self.grad):
				self.m[idx] = (1 - self.belta1) * obj2 + self.belta1 * self.m[idx]
		else:
			self.Eg = self.maxnum
			# copy the gradients; aliasing self.grad would let zero_grad() wipe self.m as well
			self.m = [obj.copy() for obj in self.grad]
		#if abs(self.Eg - self.maxnum) > 0.01*self.maxnum:
		#	print(self.Eg, self.maxnum)
		te = self.Eg / (1 - np.power(self.belta2, self.alphat))
		# bias-corrected first moment (computed here, although the update below uses the raw self.m)
		tm = [obj / (1 - np.power(self.belta1, self.alphat)) for obj in self.m]
		for obj1, obj2 in zip(self.para, self.m):
			obj1 -= self.lr * obj2 / max(te, 0.001) * self.bstep
		self.zero_grad()

	def predict(self, input):
		y = self.forward(input)
		y /= np.sum(y)
		return y



#2*x + y - 3
if __name__ == "__main__":
	model = NNetwork(2, withbias = True, lr = 0.001, optimizer = 'adam')
	model.add(16, 'relu')
	model.add(8, 'relu')
	model.add(2, 'softmax')
	
	data = pd.read_csv('data.csv', header=None).astype('float64').sample(frac=1)  # data.csv has no header row
	datalen = len(data)
	data_train = data.iloc[:int(datalen*0.9),:]
	data_test = data.iloc[int(datalen*0.9):,:]
	X_train = data_train.iloc[:,:2]
	y_train = data_train.iloc[:,2].astype('int')
	X_test = data_test.iloc[:,:2]
	y_test = data_test.iloc[:,2].astype('int')
	len_train = len(X_train)
	#print(X_train.dtype)
	for i in range(200000):
		tid = i % len_train
		#print(X_train.iloc[tid])
		output = model.forward(X_train.iloc[tid])
		model.backward(output, y_train.iloc[tid])
		if tid == len_train - 1:
			model.step()
	pres = []
	for ind, val in X_test.iterrows():	
		pres.append(np.argmax(model.predict(val)))
	res1 = np.array(pres)
	res2 = np.array(y_test)
	print(res1)
	print(res2)
	'''
	X = [[0,0],[0,1],[1,0],[1,1]]
	y = [0, 1, 1, 0]
	
	for i in range(200000):
		tid = i % 4
		#model.zero_grad()
		output = model.forward(X[tid])
		model.backward(output, y[tid])
		if tid == 3: 
			model.step()


	print(model.predict([1,1]))
	print(model.predict([0,1]))
	print(model.predict([0,0]))
	print(model.predict([1,0]))
	'''
data.csv:
0.78051,-0.063669,1
0.28774,0.29139,1
0.40714,0.17878,1
0.2923,0.4217,1
0.50922,0.35256,1
0.27785,0.10802,1
0.27527,0.33223,1
0.43999,0.31245,1
0.33557,0.42984,1
0.23448,0.24986,1
0.0084492,0.13658,1
0.12419,0.33595,1
0.25644,0.42624,1
0.4591,0.40426,1
0.44547,0.45117,1
0.42218,0.20118,1
0.49563,0.21445,1
0.30848,0.24306,1
0.39707,0.44438,1
0.32945,0.39217,1
0.40739,0.40271,1
0.3106,0.50702,1
0.49638,0.45384,1
0.10073,0.32053,1
0.69907,0.37307,1
0.29767,0.69648,1
0.15099,0.57341,1
0.16427,0.27759,1
0.33259,0.055964,1
0.53741,0.28637,1
0.19503,0.36879,1
0.40278,0.035148,1
0.21296,0.55169,1
0.48447,0.56991,1
0.25476,0.34596,1
0.21726,0.28641,1
0.67078,0.46538,1
0.3815,0.4622,1
0.53838,0.32774,1
0.4849,0.26071,1
0.37095,0.38809,1
0.54527,0.63911,1
0.32149,0.12007,1
0.42216,0.61666,1
0.10194,0.060408,1
0.15254,0.2168,1
0.45558,0.43769,1
0.28488,0.52142,1
0.27633,0.21264,1
0.39748,0.31902,1
0.5533,1,0
0.44274,0.59205,0
0.85176,0.6612,0
0.60436,0.86605,0
0.68243,0.48301,0
1,0.76815,0
0.72989,0.8107,0
0.67377,0.77975,0
0.78761,0.58177,0
0.71442,0.7668,0
0.49379,0.54226,0
0.78974,0.74233,0
0.67905,0.60921,0
0.6642,0.72519,0
0.79396,0.56789,0
0.70758,0.76022,0
0.59421,0.61857,0
0.49364,0.56224,0
0.77707,0.35025,0
0.79785,0.76921,0
0.70876,0.96764,0
0.69176,0.60865,0
0.66408,0.92075,0
0.65973,0.66666,0
0.64574,0.56845,0
0.89639,0.7085,0
0.85476,0.63167,0
0.62091,0.80424,0
0.79057,0.56108,0
0.58935,0.71582,0
0.56846,0.7406,0
0.65912,0.71548,0
0.70938,0.74041,0
0.59154,0.62927,0
0.45829,0.4641,0
0.79982,0.74847,0
0.60974,0.54757,0
0.68127,0.86985,0
0.76694,0.64736,0
0.69048,0.83058,0
0.68122,0.96541,0
0.73229,0.64245,0
0.76145,0.60138,0
0.58985,0.86955,0
0.73145,0.74516,0
0.77029,0.7014,0
0.73156,0.71782,0
0.44556,0.57991,0
0.85275,0.85987,0
0.51912,0.62359,0

Actor-Critic also seems to require running networks in parallel, which would need some structural changes here, so I will implement it when I have time. Judging from the code, the essence of Actor-Critic is: take the derivative of Q(s,a) with respect to each action a, multiply it by the output of the policy network π(s) = a, then backpropagate to update the parameters of the π(s) = a network; the Q(s,a) (critic) network is updated at the same time, in the usual way.
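A hedged sketch of that actor update, written with PyTorch for brevity (the network sizes and the batch of states are invented; this is my reading of the idea above, not code from this project):

import torch
import torch.nn as nn

state_dim, action_dim = 4, 2
actor = nn.Sequential(nn.Linear(state_dim, 32), nn.ReLU(), nn.Linear(32, action_dim), nn.Tanh())
critic = nn.Sequential(nn.Linear(state_dim + action_dim, 32), nn.ReLU(), nn.Linear(32, 1))
actor_opt = torch.optim.Adam(actor.parameters(), lr=1e-3)

states = torch.randn(16, state_dim)
# Maximizing Q(s, pi(s)) is minimizing -Q; backward() applies the chain rule
# dQ/d(theta_pi) = dQ/da * da/d(theta_pi), which is exactly the product described above.
actor_loss = -critic(torch.cat([states, actor(states)], dim=1)).mean()
actor_opt.zero_grad()
actor_loss.backward()
actor_opt.step()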

Momentum helps the optimizer escape local optima mainly because, on naturally occurring loss surfaces, the pits tend to be small and shallow; if you deliberately dug two large, deep pits, momentum would certainly fail. Momentum also slows down convergence.
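For reference, the classic momentum update I have in mind looks like the sketch below (a generic illustration assuming params, grads and velocity are lists of NumPy arrays; it is not the exact scheme used in adamstep above):

import numpy as np

def momentum_step(params, grads, velocity, lr=0.01, beta=0.9):
    # v <- beta * v - lr * g ;  theta <- theta + v, all updated in place
    for p, g, v in zip(params, grads, velocity):
        v *= beta
        v -= lr * g
        p += v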

I guess a standard neural-network node would be implemented roughly like this, where the Python lists would presumably be replaced by something like Vector3; forwardlist stores the outgoing connections:

class Node:
	def __init__(self, tid):
		self.mtop = 0            # progress counter used while computing derivatives in backward
		self.forwardlist = []    # outgoing connections, conceptually Vector3[(id, w, delta)]
		self.backwardlist = []   # predecessor node ids, conceptually Vector[int]
		self.outputd = 0
		self.inputd = 0
		self.id = tid

Layer = Vector(Node)  # each layer is just a contiguous array of Nodes

Each layer would then just be a list of Nodes, stored in a data structure similar to a C++ vector.

self.forwardlist holds triples: for each successor node, its id, the connection weight w, and the derivative delta. self.backwardlist holds the ids of the nodes that feed into this node; only the ids are needed. self.mtop records, while derivatives are being computed in backward, how many of this node's connections the previous layer has already processed. I did not implement this because I do not know enough about a Vector-like data structure in Python (a list does not seem efficient enough, since it can hold values of different types; a container restricted to one type, like C++'s vector, would be faster). Each time a later layer runs backward, the previous layer's self.mtop has to be reset to 0.

     This design is rather like the adjacency-list representation in graph theory. The earlier code is more like an adjacency matrix, which should be very efficient for fully connected layers (for a complete graph the adjacency matrix is the more efficient representation), but it only works for fully connected layers; convolutional networks, or any network whose connectivity deviates even slightly from full connection, would need the adjacency-list-style implementation.
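As a rough illustration of that adjacency-list idea (my own sketch, not part of the code above), a forward pass over such Nodes could look like this, assuming each (id, w, delta) triple's id indexes directly into the next layer:

def forward_layer(layer, next_layer, activation):
    # accumulate each node's weighted output into its successors, then apply the activation
    for node in next_layer:
        node.inputd = 0.0
    for node in layer:
        for succ_id, w, _delta in node.forwardlist:
            next_layer[succ_id].inputd += w * node.outputd
    for node in next_layer:
        node.outputd = activation(node.inputd)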

     Later I also used this network to train the fully connected head on features extracted for the Dogs vs. Cats competition, and the results were decent. The models used to extract the bottleneck features were VGG19, InceptionResNetV2 and Xception (with fine-tuned weights).
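For reference, the bottleneck features themselves would have been produced with something along these lines (my reconstruction using the Keras applications API; 'train_images.npy' and the exact preprocessing and fine-tuning are assumptions, not shown in this post):

import numpy as np
from tensorflow.keras.applications import VGG19
from tensorflow.keras.applications.vgg19 import preprocess_input

base = VGG19(weights='imagenet', include_top=False, pooling='avg')
images = np.load('train_images.npy')  # hypothetical array of RGB training images
features = base.predict(preprocess_input(images))
np.save('bottleneck_features_train_vgg19.npy', features)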

import numpy as np
import pandas as pd
import copy
def tanh(x):
    return np.tanh(x)
def tanh_derivative(x):
    # derivative expressed in terms of the activation's output (x = tanh(z))
    return 1.0 - x * x
def sigmoid(x):
    # clip the input to avoid overflow in exp
    return 1 / (1 + np.exp(-x.copy().clip(-20,20)))
def sigmoid_derivative(x):
    # derivative expressed in terms of the activation's output (x = sigmoid(z))
    return x * (1 - x)
def relu(x):
	return np.maximum(x, 0)
	#t = copy.copy(x)
	#for i in range(len(t)):
	#	if t[i] < 0:
	#		t[i] = 0
	#return t

def relu_derivative(x):
	# elementwise: 1 where the activation is numerically positive, else 0
	return (x > 1e-12).astype(np.float64)


class ActivationFunc:
	def __init__(self):
		self.tdict = dict()
		self.tdict['tanh'] = np.tanh
		self.tdict['sigmoid'] = sigmoid
		self.tdict['relu'] = relu
		self.tdict['softmax'] = np.exp
		self.ddict = dict()
		self.ddict['tanh'] = tanh_derivative
		self.ddict['sigmoid'] = sigmoid_derivative
		self.ddict['relu'] = relu_derivative
		self.ddict['softmax'] = np.exp

	def getActivation(self, activation):
		if activation in self.tdict:
			return self.tdict[activation]
		else:
			return lambda x: x

	def getDActivation(self, activation):
		if activation in self.ddict:
			return self.ddict[activation]
		else:
			return lambda x: np.ones(x.shape)
	

#print(ActivationFunc().getActivation('logistic')(1.0))
#print(logistic_derivative(1.0))
class NNetwork:
	def __init__(self, inputsize, lr = 0.001, withbias = True, optimizer = 'adam') :
		self.para = []
		self.layerout = []
		self.grad = []
		self.backout = []
		
		self.activationclass = ActivationFunc()
		self.inputsize = inputsize
		self.lastsize = inputsize
		self.lr = lr
		self.layerlen = 0
		self.activation = []
		self.deactivation = []
		self.wbias = withbias
		self.outputfunc = 'softmax'
		self.maxnum = 0.001
		self.maxpara = 1
		self.bstep = 0

		self.belta1 = 0.7
		self.belta2 = 0.7
		self.alphat = 1.0
		self.Eg = None
		self.m = None

		if optimizer == 'adam':
			print('optimized with adam')
			self.stepfunc = self.adamstep
		else:
			print('optimized with std')
			self.stepfunc = self.stdstep
		#self.activation = ActivationFunc().getActivation(mactivation)
		
	
	def add(self, densesize, actstr):
		tsize = self.lastsize
		if self.wbias:
			tsize += 1

		self.para.append((np.random.rand(densesize, tsize) - 0.5) * 2 * np.sqrt(6 / (self.inputsize + 2)) ) #randn * np.power(2 / (self.inputsize + 2), 0.25)
		self.grad.append(np.zeros((densesize, tsize)))
		
		self.lastsize = densesize
		self.activation.append(self.activationclass.getActivation(actstr))
		self.deactivation.append(self.activationclass.getDActivation(actstr))
		self.layerlen += 1
		self.outputfunc = actstr

	def forward(self, input):
		self.layerout = []
		if self.wbias:
			self.layerout.append(np.append(np.array(input), 1))
		else:
			self.layerout.append(np.array(input))
		for i in range(self.layerlen):
			#print(self.layerout[-1].shape, self.para[i].shape)
			if self.wbias and i != self.layerlen - 1:
				self.layerout.append(np.append(self.activation[i](np.dot(self.para[i], self.layerout[-1].T)), 1))
			else:
				self.layerout.append(self.activation[i](np.dot(self.para[i], self.layerout[-1].T)))
		return self.layerout[-1]

		
	def backward(self, y, y_label):
		self.maxnum = 0.001
		self.bstep += 1
		
		if self.outputfunc == 'softmax':
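			# layerout[-1] holds exp(z); subtracting the total from the true class and dividing by
			# the total turns y into softmax(z) - onehot(y_label), the standard cross-entropy
			# gradient w.r.t. the pre-activation z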
			tsumy = sum(y)
			y[y_label] -= tsumy
			y /= max(tsumy, 1e-4)
		if self.outputfunc == 'sigmoid':
			if y_label == 1:
				#print(y)
				y -= 1
				

		#self.maxnum = max(self.maxnum, max(y))
		self.backout = []
		self.backout.append(np.matrix(y).T)

		for i in range(self.layerlen, 0, -1):
			#print(self.backout[-1].shape, np.matrix(self.layerout[i - 1]).shape)
			self.grad[i - 1] += np.dot(self.backout[-1], np.matrix(self.layerout[i - 1]))
			self.maxnum = max(np.abs(self.grad[i - 1]).max().max(), self.maxnum)
			if i > 1:
				if self.wbias:
					self.backout.append(np.multiply(self.deactivation[i - 2](self.layerout[i - 1]), np.dot(self.backout[-1].T, self.para[i - 1])).T[:-1,:])
				else:
					self.backout.append(np.multiply(self.deactivation[i - 2](self.layerout[i - 1]), np.dot(self.backout[-1].T, self.para[i - 1])).T)
			else:
				self.backout.append(np.dot(self.backout[-1].T, self.para[i - 1]))

	def zero_grad(self):
		for obj in self.grad:
			obj.fill(0)
		self.maxnum = 0.001
		self.bstep = 0

	def step(self):
		self.stepfunc()

	def stdstep(self):
		tmaxpara = 0
		for obj1, obj2 in zip(self.para, self.grad):
			obj1 -= self.lr * obj2  * self.bstep 
			#/ max(self.maxnum, 1e-4) * self.maxpara
			tmaxpara = max(tmaxpara, np.abs(obj1).max().max())
		self.maxpara = tmaxpara
		self.zero_grad()
	
	def adamstep(self):
		self.belta2 = min(0.9, self.belta2 * 1.01)
		self.belta1 = min(0.9, self.belta1 * 1.01)
		if self.Eg is not None:
			self.Eg = (1 - self.belta2) * self.maxnum + self.belta2 * self.Eg
			# update the first-moment estimate in place; a bare "obj1 = ..." inside zip()
			# would only rebind the loop variable and leave self.m unchanged
			for idx, obj2 in enumerate(self.grad):
				self.m[idx] = (1 - self.belta1) * obj2 + self.belta1 * self.m[idx]
		else:
			self.Eg = self.maxnum
			# copy the gradients; aliasing self.grad would let zero_grad() wipe self.m as well
			self.m = [obj.copy() for obj in self.grad]
		#if abs(self.Eg - self.maxnum) > 0.01*self.maxnum:
		#	print(self.Eg, self.maxnum)
		te = self.Eg / (1 - np.power(self.belta2, self.alphat))
		# bias-corrected first moment (computed here, although the update below uses the raw self.m)
		tm = [obj / (1 - np.power(self.belta1, self.alphat)) for obj in self.m]
		for obj1, obj2 in zip(self.para, self.m):
			obj1 -= self.lr * obj2 / max(te, 1e-6) * self.bstep
		self.zero_grad()

	def predict(self, input):
		y = self.forward(input)
		if self.outputfunc == 'softmax':
			y /= np.sum(y)
		return y


#2*x + y - 3

if __name__ == "__main__":

#	model = NNetwork(2, withbias = True, lr = 0.001, optimizer = 'std')
#	model.add(16, 'relu')
#	model.add(8, 'relu')
#	model.add(8, 'relu')
#	model.add(8, 'relu')
#	model.add(2, 'softmax')
	
	train_labels = np.load("y_train.npy")
	validation_labels = np.load("y_val.npy")
	
	
	for i,name in enumerate(['vgg19', 'Xception','InceptionResNetV2']):
		if i == 0:
			train_data = np.load('bottleneck_features_train_' + name  + '.npy') / 255
			validation_data = np.load('bottleneck_features_validation_' + name  + '.npy') / 255
		else:
			train_data = np.append(train_data,np.load('bottleneck_features_train_' + name + '.npy'), axis = 1)
			validation_data = np.append(validation_data, np.load('bottleneck_features_validation_' + name + '.npy'),axis = 1)

	tinputsize = train_data[0].shape[0]
	model = NNetwork(tinputsize, lr = 0.001, optimizer = 'std')
	model.add(256, 'relu')
	model.add(1, 'sigmoid')
	#print(model.outputfunc)
	epochs = 4
	
	maxcnt = 0
	finalModel = None
	for e in range(epochs):
		for i in range(len(train_data)):
			output = model.forward(train_data[i])
			ty = output.copy()
			model.backward(output, train_labels[i])
			#if train_labels[i] == 1 and not flag:
			#	flag = True
			#	print(train_data[i], ty, train_labels[i])
			#	#print(model.backout)
			if i % 20 == 10:
				model.step()
		msum = 0
		for i in range(len(validation_data)):
			tmp = int(model.predict(validation_data[i]) > 0.5)
			if tmp == validation_labels[i]:
				msum += 1
		if msum >= maxcnt:
			maxcnt = msum
			finalModel = copy.deepcopy(model)  # deep copy, so later training does not overwrite the saved parameters
		print('epoch {} Accuracy {}%'.format(e+1, msum / len(validation_data) * 100))


	namelist = ['vgg19', 'Xception', 'InceptionResNetV2']
	bottleneck_features_test = None
	for i,name in enumerate(namelist):
		if i == 0:
			bottleneck_features_test = np.load('bottleneck_features_test' + name + '.npy') 
		else:
			bottleneck_features_test = np.append(bottleneck_features_test, np.load('bottleneck_features_test' + name + '.npy'),  axis = 1)
	tmp = []
	for val in bottleneck_features_test:
		x = finalModel.predict(val)
		tmp.append(x)
	predictions = np.array(tmp)
	for i in range(len(predictions)):
		if predictions[i] < 0.005:
			predictions[i] = 0.005
		if predictions[i] > 0.995:
			predictions[i] = 0.995
	result_csv = pd.read_csv("sample_submission.csv")
	result_csv['label'] = predictions
	test_result_name = "catvsdog.csv"
	result_csv.to_csv(test_result_name, index=False)
	#print(tmp[:10])
	
#	res1 = np.array(pres)
#	res2 = np.array(y_test)
#	print(res1)
#	print(res2)
	'''
	X = [[0,0],[0,1],[1,0],[1,1]]
	y = [0, 1, 1, 0]
	
	for i in range(200000):
		tid = i % 4
		#model.zero_grad()
		output = model.forward(X[tid])
		model.backward(output, y[tid])
		if tid == 3: 
			model.step()


	print(model.predict([1,1]))
	print(model.predict([0,1]))
	print(model.predict([0,0]))
	print(model.predict([1,0]))
	'''

     The best result is shown in the screenshot below:

     Later I realized that a convolutional neural network could also be implemented with NumPy slicing, storing each layer's output as a 2-D array, but testing it is somewhat difficult; I will write it when I have time.
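A rough sketch of what I have in mind (my own illustration, untested against this network): a single-channel valid convolution written with NumPy slicing, where each shifted slice of the input is weighted by one kernel entry:

import numpy as np

def conv2d(x, kernel):
    # x: 2-D input, kernel: 2-D filter; 'valid' cross-correlation via slicing
    kh, kw = kernel.shape
    oh, ow = x.shape[0] - kh + 1, x.shape[1] - kw + 1
    out = np.zeros((oh, ow))
    for i in range(kh):
        for j in range(kw):
            out += kernel[i, j] * x[i:i + oh, j:j + ow]
    return out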

     
