Main reference: 《深度学习入门:基于python的理论和实现》 (Deep Learning from Scratch).
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def relu(x):
    return np.maximum(0, x)

def softmax(a):
    exp_a = np.exp(a)
    sum_exp_a = np.sum(exp_a)
    y = exp_a / sum_exp_a
    return y

def better_softmax(a):
    # subtract the max for numerical stability (avoids overflow in exp)
    c = np.max(a)
    exp_a = np.exp(a - c)
    sum_exp_a = np.sum(exp_a)
    y = exp_a / sum_exp_a
    return y

def mean_squared_error(y, t):
    return 0.5 * np.sum((y - t) ** 2)

def cross_entropy_error(y, t):
    # small constant avoids log(0)
    return -np.sum(t * np.log(y + 1e-7))
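A quick sanity check (a minimal sketch; the values below are made up) shows why better_softmax matters: plain softmax overflows on large logits while the shifted version stays well-defined, and cross_entropy_error is small when the correct class gets most of the probability.

a = np.array([1010.0, 1000.0, 990.0])
print(softmax(a))          # -> nan (np.exp overflows)
print(better_softmax(a))   # -> valid probabilities, dominated by the first entry

y = better_softmax(np.array([0.3, 2.9, 4.0]))
t = np.array([0, 0, 1])
print(cross_entropy_error(y, t))   # small loss: the correct class has the highest score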
A more concrete walkthrough of the cross-entropy computation: `label` holds the integer class indices and `logit` is the raw model output (the logits). First, `label` has to be converted to one-hot form.
def np_onehot(nc, label):
    # broadcast (1, nc) class indices against (batch, 1) labels -> (batch, nc) 0/1 matrix
    tmp = np.arange(nc)
    tmp = tmp[None, :]
    true_label = label[:, None]
    ans = tmp == true_label
    return ans.astype(int)

label = np_onehot(4, label)

def np_softmax(arr):
    assert len(arr.shape) == 2
    arr = arr - np.max(arr, axis=1, keepdims=True)  # numerical stability
    arr_exp = np.exp(arr)
    arr_sum = np.sum(arr_exp, axis=1)
    arr_sum = arr_sum[:, None]
    return arr_exp / arr_sum

soft_logit = np_softmax(logit)
log_logit = np.log(soft_logit + 1e-7)  # avoid log(0)
res = -log_logit * label
loss = np.mean(np.sum(res, axis=1))
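Putting the helpers together on a small made-up batch (the logits and class indices below are invented purely for illustration):

demo_logit = np.array([[2.0, 1.0, 0.1, -1.0],
                       [0.5, 0.2, 3.0,  0.0]])   # (batch=2, num_classes=4)
demo_label = np_onehot(4, np.array([0, 2]))      # ground-truth classes 0 and 2

demo_soft = np_softmax(demo_logit)
demo_loss = np.mean(np.sum(-np.log(demo_soft + 1e-7) * demo_label, axis=1))
print(demo_loss)   # roughly 0.31: the correct classes already get most of the probability mass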
class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        for key in params.keys():
            params[key] -= self.lr * grads[key]
class Momentum:
    # v <- momentum * v - lr * grad
    # w <- w + v
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None

    def update(self, params, grads):
        if self.v is None:
            self.v = {}
            for key, val in params.items():
                self.v[key] = np.zeros_like(val)
        for key in params.keys():
            self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]
            params[key] += self.v[key]
class Nesterov:
    # "look-ahead" momentum: apply the velocity step first, then correct with the gradient
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None

    def update(self, params, grads):
        if self.v is None:
            self.v = {}
            for key, val in params.items():
                self.v[key] = np.zeros_like(val)
        for key in params.keys():
            self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]
            params[key] += self.momentum * self.momentum * self.v[key]
            params[key] -= (1 + self.momentum) * self.lr * grads[key]
class AdaGrad:
    # per-parameter learning-rate decay: divide by the accumulated squared gradient
    def __init__(self, lr=0.01):
        self.lr = lr
        self.h = None

    def update(self, params, grads):
        if self.h is None:
            self.h = {}
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)
        for key in params.keys():
            self.h[key] += grads[key] * grads[key]
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)
class RMSprop:
    # like AdaGrad, but the squared-gradient history decays exponentially
    def __init__(self, lr=0.01, decay_rate=0.99):
        self.lr = lr
        self.decay_rate = decay_rate
        self.h = None

    def update(self, params, grads):
        if self.h is None:
            self.h = {}
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)
        for key in params.keys():
            self.h[key] *= self.decay_rate
            self.h[key] += (1 - self.decay_rate) * grads[key] * grads[key]
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)
class Adam:
    # momentum (first moment m) combined with RMSprop-style scaling (second moment v),
    # with the bias correction folded into lr_t
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.m = None
        self.v = None

    def update(self, params, grads):
        if self.m is None:
            self.m, self.v = {}, {}
            for key, val in params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)
        self.iter += 1
        lr_t = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter)
        for key in params.keys():
            self.m[key] = (1 - self.beta1) * grads[key] + self.beta1 * self.m[key]
            self.v[key] = (1 - self.beta2) * (grads[key]**2) + self.beta2 * self.v[key]
            params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)
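All of the optimizers above share the same update(params, grads) interface, so they are interchangeable in a training loop. A minimal sketch on a toy quadratic (the function, learning rate, and starting point are made up for illustration):

# minimize f(x, y) = x^2/20 + y^2; any optimizer above can be dropped in
params = {'x': np.array(-7.0), 'y': np.array(2.0)}
optimizer = Adam(lr=0.3)   # could equally be SGD(), Momentum(), AdaGrad(), RMSprop(), ...

for i in range(100):
    grads = {'x': params['x'] / 10.0, 'y': 2.0 * params['y']}   # analytic gradient of f
    optimizer.update(params, grads)

print(params)   # both coordinates should end up close to 0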
class MulLayer:
    def __init__(self):
        self.x = None
        self.y = None

    def forward(self, x, y):
        self.x = x
        self.y = y
        out = x * y
        return out

    def backward(self, dout):
        dx = dout * self.y
        dy = dout * self.x
        return dx, dy
class AddLayer:
    def __init__(self):
        pass

    def forward(self, x, y):
        out = x + y
        return out

    def backward(self, dout):
        dx = dout * 1
        dy = dout * 1
        return dx, dy
class ReluLayer:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = x <= 0
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx
class SigmoidLayer:
    def __init__(self):
        self.y = None

    def forward(self, x):
        self.y = 1 / (1 + np.exp(-x))
        return self.y

    def backward(self, dout):
        out = dout * self.y * (1 - self.y)
        return out
class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x):
        self.x = x
        out = np.dot(self.x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return dx
class SoftmaxWithLoss:
    def __init__(self):
        self.y = None
        self.t = None

    def softmax(self, x):
        if x.ndim == 2:
            x = x.T
            x = x - np.max(x, axis=0)
            y = np.exp(x) / np.sum(np.exp(x), axis=0)
            return y.T
        x = x - np.max(x)
        return np.exp(x) / np.sum(np.exp(x))

    def cross_entropy(self, y, t):
        # y is the softmax output, t is one-hot: loss = -sum(t * log y), averaged over the batch
        log_logit = np.log(y + 1e-7)
        res = -t * log_logit
        loss = np.mean(np.sum(res, axis=1))
        return loss

    def forward(self, x, t):
        self.t = t
        self.y = self.softmax(x)
        self.error = self.cross_entropy(self.y, self.t)
        return self.error

    def backward(self, dout=1):
        # softmax + cross-entropy backward simplifies to (y - t) / batch_size
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size
        return dx
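One way to convince yourself the backward pass is right is a numerical gradient check on random data (a minimal sketch; the numerical_grad helper below is written only for this check and is not part of the layer):

def numerical_grad(f, x, eps=1e-4):
    # central-difference gradient of the scalar function f with respect to array x
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        orig = x[idx]
        x[idx] = orig + eps; fp = f()
        x[idx] = orig - eps; fm = f()
        grad[idx] = (fp - fm) / (2 * eps)
        x[idx] = orig
        it.iternext()
    return grad

np.random.seed(0)
x = np.random.randn(3, 5)
t = np_onehot(5, np.array([1, 3, 0]))
layer = SoftmaxWithLoss()
layer.forward(x, t)
analytic = layer.backward(1)
numeric = numerical_grad(lambda: SoftmaxWithLoss().forward(x, t), x)
print(np.max(np.abs(analytic - numeric)))   # should be tiny (order 1e-8)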
class BatchNormalization:
    def __init__(self, gamma, beta, momentum=0.9, running_mean=None, running_var=None):
        self.gamma = gamma
        self.beta = beta
        self.momentum = momentum
        self.running_mean = running_mean
        self.running_var = running_var

    def forward(self, x, train_flag=True):
        self.input_shape = x.shape
        batch_size = x.shape[0]
        x = x.reshape(batch_size, -1)
        if self.running_mean is None:
            self.running_mean = np.zeros(x.shape[-1])
            self.running_var = np.zeros(x.shape[-1])
        if train_flag:
            mu = np.mean(x, axis=0)
            var = np.mean((x - mu) ** 2, axis=0)
            std = np.sqrt(var + 1e-7)
            # cache the intermediates needed for backward
            self.batch_size = batch_size
            self.xc = x - mu
            self.std = std
            self.xn = self.xc / std
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * mu
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * var
            out = self.gamma * self.xn + self.beta
        else:
            mu = self.running_mean
            std = np.sqrt(self.running_var + 1e-7)
            out = self.gamma * (x - mu) / std + self.beta
        out = out.reshape(*self.input_shape)
        return out

    def backward(self, dout):
        batch_size = dout.shape[0]
        dout = dout.reshape(batch_size, -1)
        dbeta = dout.sum(axis=0)
        dgamma = np.sum(dout * self.xn, axis=0)
        dxn = dout * self.gamma
        dxc = dxn / self.std
        dstd = -np.sum((dxn * self.xc) / (self.std ** 2), axis=0)
        dvar = 0.5 * dstd / self.std
        dxc += (2.0 / self.batch_size) * self.xc * dvar
        dmu = np.sum(dxc, axis=0)
        dx = dxc - dmu / self.batch_size
        self.dgamma = dgamma
        self.dbeta = dbeta
        dx = dx.reshape(*self.input_shape)
        return dx
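A quick check (a minimal sketch on random data) is that, in training mode with gamma=1 and beta=0, each output feature has roughly zero mean and unit variance across the batch:

np.random.seed(0)
x = np.random.randn(32, 10) * 3.0 + 5.0      # features with non-zero mean and non-unit variance
bn = BatchNormalization(gamma=np.ones(10), beta=np.zeros(10))
out = bn.forward(x, train_flag=True)
print(out.mean(axis=0))   # all entries close to 0
print(out.std(axis=0))    # all entries close to 1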
# weight decay (L2 regularization): this snippet is meant to live inside a network class
# that stores its weights in self.params and its coefficient in self.weight_decay_lambda;
# the returned penalty is added to the data loss.
def weight_decay(self):
    weight_decay = 0
    for idx in range(1, self.hidden_layer_num + 2):
        W = self.params['W' + str(idx)]
        weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W ** 2)
    return weight_decay
class Dropout:
    def __init__(self, dropout_ratio=0.5):
        self.dropout_ratio = dropout_ratio
        self.mask = None

    def forward(self, x, train_flag=True):
        if train_flag:
            # drop each unit with probability dropout_ratio
            self.mask = np.random.rand(*x.shape) > self.dropout_ratio
            return x * self.mask
        else:
            # at test time, scale by the keep probability instead of dropping
            return x * (1.0 - self.dropout_ratio)
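A quick look at what forward does (a minimal sketch on made-up values): in training mode roughly half of the activations are zeroed, while in test mode every activation is scaled by the keep probability.

np.random.seed(0)
x = np.ones((2, 8))
drop = Dropout(dropout_ratio=0.5)
print(drop.forward(x, train_flag=True))    # random pattern of 0s and 1s
print(drop.forward(x, train_flag=False))   # every entry scaled to 0.5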
class TwoLayerNet:
    def __init__(self, input_size, hidden_size, output_size, weights_init_std=0.01):
        self.w1 = weights_init_std * np.random.randn(input_size, hidden_size)
        self.b1 = np.zeros(hidden_size)
        self.w2 = weights_init_std * np.random.randn(hidden_size, output_size)
        self.b2 = np.zeros(output_size)

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def softmax(self, x):
        x = x - np.max(x, axis=1, keepdims=True)   # numerical stability
        exp_logits = np.exp(x)
        sum_logits = np.sum(exp_logits, axis=1, keepdims=True)
        return exp_logits / sum_logits

    def predict(self, x):
        a1 = np.dot(x, self.w1) + self.b1
        z1 = self.sigmoid(a1)
        a2 = np.dot(z1, self.w2) + self.b2
        y = self.softmax(a2)   # output layer uses softmax, not sigmoid
        return y
    def cross_entropy_error(self, pred, target):
        def onehot(target, num_classes):
            classes = np.arange(num_classes)
            classes = classes[None, :]
            target = target[:, None]
            ans = classes == target
            return ans.astype(int)
        num_classes = pred.shape[-1]
        one_hot_label = onehot(target, num_classes)
        log_logit = np.log(pred + 1e-7)
        res = -log_logit * one_hot_label
        res = np.mean(np.sum(res, axis=1))
        return res

    def loss(self, x, target):
        y_hat = self.predict(x)
        return self.cross_entropy_error(y_hat, target)

    def accuracy(self, x, target):
        y_hat = self.predict(x)
        pred = np.argmax(y_hat, axis=1)
        accuracy = np.sum(pred == target) / float(x.shape[0])
        return accuracy
    def gradient(self, x, target):
        # target is assumed to already be one-hot encoded
        batch_size = x.shape[0]
        # forward pass, keeping the intermediates needed for backprop
        a1 = np.dot(x, self.w1) + self.b1
        z1 = self.sigmoid(a1)
        a2 = np.dot(z1, self.w2) + self.b2
        y = self.softmax(a2)
        # backward pass
        dy = (y - target) / batch_size
        grad_w2 = np.dot(z1.T, dy)
        grad_b2 = np.sum(dy, axis=0)
        grad_z1 = np.dot(dy, self.w2.T)
        grad_a1 = grad_z1 * z1 * (1 - z1)
        grad_w1 = np.dot(x.T, grad_a1)
        grad_b1 = np.sum(grad_a1, axis=0)
        return {'w1': grad_w1, 'b1': grad_b1, 'w2': grad_w2, 'b2': grad_b2}
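A minimal training-loop sketch on synthetic data (everything below, including the data and hyperparameters, is made up for illustration); it ties the network's gradient method to the SGD optimizer defined earlier:

np.random.seed(0)
x_train = np.random.randn(100, 20)             # 100 samples, 20 features
t_idx = np.random.randint(0, 3, size=100)      # 3 classes
t_train = np_onehot(3, t_idx)                  # one-hot targets for gradient()

net = TwoLayerNet(input_size=20, hidden_size=16, output_size=3)
optimizer = SGD(lr=0.5)

for step in range(200):
    grads = net.gradient(x_train, t_train)
    params = {'w1': net.w1, 'b1': net.b1, 'w2': net.w2, 'b2': net.b2}
    optimizer.update(params, grads)            # updates the weight arrays in place

print(net.loss(x_train, t_idx), net.accuracy(x_train, t_idx))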
def im2col(input_data, filter_h, filter_w, stride=1, pad=0):
    N, C, H, W = input_data.shape
    out_h = 1 + int((H + 2 * pad - filter_h) / stride)
    out_w = 1 + int((W + 2 * pad - filter_w) / stride)
    # pad only the spatial dimensions; N and C are left untouched
    img = np.pad(input_data, [(0, 0), (0, 0), (pad, pad), (pad, pad)], 'constant')
    col = np.zeros((N, C, filter_h, filter_w, out_h, out_w))
    for y in range(filter_h):
        y_max = y + stride * out_h
        for x in range(filter_w):
            x_max = x + stride * out_w
            col[:, :, y, x, :, :] = img[:, :, y:y_max:stride, x:x_max:stride]
    col = col.transpose(0, 4, 5, 1, 2, 3).reshape(N * out_h * out_w, -1)
    return col
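The payoff of im2col is that convolution becomes a single matrix multiply. A quick shape check on a made-up input:

x = np.random.randn(2, 3, 7, 7)            # N=2, C=3, 7x7 images
col = im2col(x, filter_h=5, filter_w=5, stride=1, pad=0)
print(col.shape)                           # (2*3*3, 3*5*5) = (18, 75)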
def col2im(col, input_shape, filter_h, filter_w, stride=1, pad=0):
    # col has shape (N*out_h*out_w, C*filter_h*filter_w); scatter it back into image shape
    N, C, H, W = input_shape
    out_h = 1 + int((H + 2 * pad - filter_h) / stride)
    out_w = 1 + int((W + 2 * pad - filter_w) / stride)
    col = col.reshape(N, out_h, out_w, C, filter_h, filter_w).transpose(0, 3, 4, 5, 1, 2)
    img = np.zeros((N, C, H + 2 * pad + stride - 1, W + 2 * pad + stride - 1))
    for y in range(filter_h):
        y_max = y + stride * out_h
        for x in range(filter_w):
            x_max = x + stride * out_w
            img[:, :, y:y_max:stride, x:x_max:stride] += col[:, :, y, x, :, :]
    return img[:, :, pad:H + pad, pad:W + pad]
class Convolution:
    def __init__(self, w, b, stride=1, pad=0):
        # w has shape (out_c, in_c, fh, fw), b has shape (out_c,)
        self.w = w
        self.b = b
        self.stride = stride
        self.pad = pad

    def forward(self, x):
        oc, C, fh, fw = self.w.shape
        N, C, H, W = x.shape
        out_h = 1 + int((H + 2 * self.pad - fh) / self.stride)
        out_w = 1 + int((W + 2 * self.pad - fw) / self.stride)
        col = im2col(x, fh, fw, self.stride, self.pad)   # (N*out_h*out_w, C*fh*fw)
        col_w = self.w.reshape(oc, -1).T                 # (C*fh*fw, oc)
        out = np.dot(col, col_w) + self.b                # convolution as one matrix multiply
        out = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)
        self.x = x
        self.col = col
        self.col_w = col_w
        return out

    def backward(self, dout):
        oc, C, fh, fw = self.w.shape
        dout = dout.transpose(0, 2, 3, 1).reshape(-1, oc)
        self.db = np.sum(dout, axis=0)
        self.dw = np.dot(self.col.T, dout)               # (C*fh*fw, oc)
        self.dw = self.dw.transpose(1, 0).reshape(oc, C, fh, fw)
        dcol = np.dot(dout, self.col_w.T)                # (N*out_h*out_w, C*fh*fw)
        dx = col2im(dcol, self.x.shape, fh, fw, self.stride, self.pad)
        return dx
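A quick forward/backward shape check with made-up tensors:

x = np.random.randn(2, 3, 7, 7)
w = np.random.randn(8, 3, 3, 3)                # 8 output channels, 3x3 kernels
b = np.zeros(8)
conv = Convolution(w, b, stride=1, pad=1)
out = conv.forward(x)
print(out.shape)                               # (2, 8, 7, 7) with pad=1 and stride=1
dx = conv.backward(np.ones_like(out))
print(dx.shape, conv.dw.shape, conv.db.shape)  # (2, 3, 7, 7) (8, 3, 3, 3) (8,)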
class Pooling:
    def __init__(self, pool_h, pool_w, stride=1, pad=0):
        self.pool_h = pool_h
        self.pool_w = pool_w
        self.stride = stride
        self.pad = pad

    def forward(self, x):
        N, C, H, W = x.shape
        out_h = int(1 + (H - self.pool_h) / self.stride)
        out_w = int(1 + (W - self.pool_w) / self.stride)
        # unroll each pooling window into a row, same idea as im2col (pad is assumed 0 here)
        col = np.zeros((N, C, self.pool_h, self.pool_w, out_h, out_w))
        for ph in range(self.pool_h):
            ph_max = ph + self.stride * out_h
            for pw in range(self.pool_w):
                pw_max = pw + self.stride * out_w
                col[:, :, ph, pw, :, :] = x[:, :, ph:ph_max:self.stride, pw:pw_max:self.stride]
        col = col.transpose(0, 4, 5, 1, 2, 3).reshape(-1, self.pool_h * self.pool_w)
        self.x = x
        self.mask = np.argmax(col, axis=1)   # remember which position in each window was the max
        out = np.max(col, axis=1)
        out = out.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2)
        return out

    def backward(self, dout):
        # dout has shape (N, C, out_h, out_w)
        dout = dout.transpose(0, 2, 3, 1)
        pool_size = self.pool_h * self.pool_w
        dmax = np.zeros((dout.size, pool_size))
        # route the gradient only to the positions that produced the max
        dmax[np.arange(self.mask.size), self.mask.flatten()] = dout.flatten()
        dmax = dmax.reshape(dout.shape + (pool_size,))
        dcol = dmax.reshape(dmax.shape[0] * dmax.shape[1] * dmax.shape[2], -1)
        dx = col2im(dcol, self.x.shape, self.pool_h, self.pool_w, self.stride, self.pad)
        return dx
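A small worked example with a made-up 4x4 input makes the max-pooling behaviour visible:

x = np.arange(16, dtype=float).reshape(1, 1, 4, 4)
pool = Pooling(pool_h=2, pool_w=2, stride=2)
out = pool.forward(x)
print(out.squeeze())   # [[ 5.  7.]
                       #  [13. 15.]]  -> the max of each 2x2 window
dx = pool.backward(np.ones_like(out))
print(dx.squeeze())    # ones routed back only to the max positions, zeros elsewhere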
def calculate_iou(pred, target):
    # boxes are [x1, y1, x2, y2]
    x1 = max(pred[0], target[0])
    y1 = max(pred[1], target[1])
    x2 = min(pred[2], target[2])
    y2 = min(pred[3], target[3])
    inter = max(0, x2 - x1) * max(0, y2 - y1)
    area1 = (pred[2] - pred[0]) * (pred[3] - pred[1])
    area2 = (target[2] - target[0]) * (target[3] - target[1])
    union = area1 + area2 - inter
    iou = inter / union
    return iou
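For instance, two unit-area-4 squares offset by half a box (made-up coordinates) give IoU = 1/7:

print(calculate_iou([0, 0, 2, 2], [1, 1, 3, 3]))   # intersection 1, union 7 -> ~0.143
print(calculate_iou([0, 0, 1, 1], [2, 2, 3, 3]))   # disjoint boxes -> 0.0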
def bboxes_iou(boxes1, boxes2):
    # boxes are rows of [x1, y1, x2, y2]; compute the pairwise IoU matrix
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
    n1 = boxes1.shape[0]
    n2 = boxes2.shape[0]
    iou = np.zeros((n1, n2))
    for i in range(n1):
        cur = boxes1[i]
        # intersection of box i with every box in boxes2 (element-wise max/min)
        x1 = np.maximum(cur[0], boxes2[:, 0])
        y1 = np.maximum(cur[1], boxes2[:, 1])
        x2 = np.minimum(cur[2], boxes2[:, 2])
        y2 = np.minimum(cur[3], boxes2[:, 3])
        inter = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
        union = area1[i] + area2 - inter
        iou[i] = inter / union
    return iou
def computePR(pred, target, iou_threshold):
    # a target box counts as detected (TP) if some predicted box overlaps it by at least iou_threshold
    TP = 0
    for target_box in target:
        max_iou = 0
        for p_bbox in pred:
            iou = calculate_iou(p_bbox, target_box)
            if iou > max_iou:
                max_iou = iou
        if max_iou >= iou_threshold:
            TP += 1
    FP = len(pred) - TP
    FN = len(target) - TP
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    return precision, recall
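A small made-up example: two ground-truth boxes, three predictions, one of which matches nothing.

target = [[0, 0, 10, 10], [20, 20, 30, 30]]
pred = [[1, 1, 10, 10],      # good match for the first box
        [21, 19, 30, 31],    # good match for the second box
        [50, 50, 60, 60]]    # spurious detection
precision, recall = computePR(pred, target, iou_threshold=0.5)
print(precision, recall)     # 2/3 precision, 2/2 recall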