理论部分:2022Cs231n笔记-神经网络和反向传播_iwill323的博客-CSDN博客
目录
导包和处理数据
仿射层
网络层代码测试
ReLu层
仿射+ReLu
Loss layers: Softmax and SVM
Two-layer network
SGD
Solver
使用solver训练模型
可视化权重
调试模型
需要注意的函数
矩阵和向量相加
矩阵和向量相乘
np.random.randint
setdefault
pop
# As usual, a bit of setup
from __future__ import print_function
import time
import numpy as np
import matplotlib.pyplot as plt
from cs231n.data_utils import get_CIFAR10_data
from cs231n.gradient_check import eval_numerical_gradient, eval_numerical_gradient_array
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default plotting style used by every figure in this notebook.
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
# IPython magics: reload edited modules automatically before running a cell.
%load_ext autoreload
%autoreload 2
def rel_error(x, y):
    """Return the largest element-wise relative error between x and y.

    The relative error of one element is |x - y| / (|x| + |y|).
    """
    abs_diff = np.abs(x - y)
    # Floor the denominator at 1e-8 so a pair of zeros does not divide by zero.
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)
# Load the (preprocessed) CIFAR10 data.
# NOTE: the dataset location is hard-coded inside get_CIFAR10_data()
# (the cifar10_dir variable); edit it there if the data lives elsewhere.
data = get_CIFAR10_data()
# Show the name and array shape of every split that was returned.
for name, array in data.items():
    print(('%s: ' % name, array.shape))
这里把get_CIFAR10_data贴出来,做了数据集加载、划分、减均值、调整通道位置这几件事。
但是没有做重复加载数据的排查
def get_CIFAR10_data(
    num_training=49000, num_validation=1000, num_test=1000, subtract_mean=True
):
    """
    Load the CIFAR-10 dataset from disk and preprocess it for the classifiers.

    Steps: load the raw data, split it into train / validation / test
    subsets, optionally subtract the mean training image, and move the
    channel axis in front of the spatial axes.

    Inputs:
    - num_training: Number of leading training images kept for training.
    - num_validation: Number of images taken right after the training
      range and used for validation.
    - num_test: Number of leading test images kept for testing.
    - subtract_mean: If True, subtract the mean training image from all
      three splits.

    Returns a dictionary with keys "X_train", "y_train", "X_val", "y_val",
    "X_test" and "y_test".
    """
    # Local import: this block cannot rely on a file-level `import os`.
    import os

    # Load the raw CIFAR-10 data.
    # Build the path from components rather than a backslash-separated
    # literal: "\d" and "\C" are invalid escape sequences in a normal string
    # and the backslash form only resolves on Windows.
    cifar10_dir = os.path.join("cs231n", "datasets", "CIFAR10")
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # Subsample the data. The validation split is carved out of the training
    # set first, before X_train itself is truncated.
    val_idx = list(range(num_training, num_training + num_validation))
    X_val = X_train[val_idx]
    y_val = y_train[val_idx]

    train_idx = list(range(num_training))
    X_train = X_train[train_idx]
    y_train = y_train[train_idx]

    test_idx = list(range(num_test))
    X_test = X_test[test_idx]
    y_test = y_test[test_idx]

    # Normalize the data: subtract the mean image. The mean is computed on
    # the training split only and reused for validation and test.
    # NOTE(review): the in-place subtraction presumably relies on
    # load_CIFAR10 returning float arrays -- confirm.
    if subtract_mean:
        mean_image = np.mean(X_train, axis=0)
        X_train -= mean_image
        X_val -= mean_image
        X_test -= mean_image

    # Transpose so that channels come first (axis 3 moves to axis 1);
    # .copy() materialises the new layout.
    X_train = X_train.transpose(0, 3, 1, 2).copy()
    X_val = X_val.transpose(0, 3, 1, 2).copy()
    X_test = X_test.transpose(0, 3, 1, 2).copy()

    # Package data into a dictionary
    return {
        "X_train": X_train,
        "y_train": y_train,
        "X_val": X_val,
        "y_val": y_val,
        "X_test": X_test,
        "y_test": y_test,
    }
def affine_forward(x, w, b):
    """
    Forward pass of an affine (fully connected) layer.

    Every example in the minibatch is flattened to a vector of length
    D = d_1 * ... * d_k and mapped to an output vector of length M.

    Inputs:
    - x: A numpy array containing input data, of shape (N, d_1, ..., d_k)
    - w: A numpy array of weights, of shape (D, M)
    - b: A numpy array of biases, of shape (M,)

    Returns a tuple of:
    - out: output, of shape (N, M)
    - cache: (x, w, b)
    """
    num_examples = x.shape[0]
    flat_x = x.reshape(num_examples, -1)  # (N, D)
    # The matrix product is (N, M); b has shape (M,) and is broadcast over
    # the N rows by the addition (the dot product itself never touches b).
    product = np.dot(flat_x, w)
    out = product + b
    return out, (x, w, b)
def affine_backward(dout, cache):
    """
    Backward pass of an affine (fully connected) layer.

    Inputs:
    - dout: Upstream derivative, of shape (N, M)
    - cache: Tuple of:
      - x: Input data, of shape (N, d_1, ... d_k)
      - w: Weights, of shape (D, M)
      - b: Biases, of shape (M,)

    Returns a tuple of:
    - dx: Gradient with respect to x, of shape (N, d1, ..., d_k)
    - dw: Gradient with respect to w, of shape (D, M)
    - db: Gradient with respect to b, of shape (M,)
    """
    x, w, _ = cache
    flat_x = x.reshape(x.shape[0], -1)  # (N, D)

    # (N, M) x (M, D) -> (N, D), then restored to the input's original shape.
    dx = np.dot(dout, w.T).reshape(x.shape)
    # (D, N) x (N, M) -> (D, M)
    dw = np.dot(flat_x.T, dout)
    # b is added to every row of the output, so its gradient is the sum of
    # dout over the batch axis. No averaging happens here: this layer simply
    # passes on whatever scaling dout already carries.
    db = np.sum(dout, axis=0)
    return dx, dw, db
正向
# Test the affine_forward function
# Inputs are deterministic linspace ramps so the result can be compared with
# the precomputed reference values in correct_out.
num_inputs = 2
input_shape = (4, 5, 6)
output_dim = 3
# Total number of elements needed to fill x and w.
input_size = num_inputs * np.prod(input_shape)
weight_size = output_dim * np.prod(input_shape)
x = np.linspace(-0.1, 0.5, num=input_size).reshape(num_inputs, *input_shape)
w = np.linspace(-0.2, 0.3, num=weight_size).reshape(np.prod(input_shape), output_dim)
b = np.linspace(-0.3, 0.1, num=output_dim)
# Only the output is checked here; the cache is discarded.
out, _ = affine_forward(x, w, b)
correct_out = np.array([[ 1.49834967, 1.70660132, 1.91485297],
[ 3.25553199, 3.5141327, 3.77273342]])
# Compare your output with ours. The error should be around e-9 or less.
print('Testing affine_forward function:')
print('difference: ', rel_error(out, correct_out))
反向:梯度检验
# Test the affine_backward function
# Fixed seed so the random inputs (and the reported errors) are reproducible.
np.random.seed(231)
x = np.random.randn(10, 2, 3)
w = np.random.randn(6, 5)
b = np.random.randn(5)
dout = np.random.randn(10, 5)
# Numerical gradients: each lambda exposes one argument of affine_forward
# while the other two are taken from the enclosing scope.
dx_num = eval_numerical_gradient_array(lambda x: affine_forward(x, w, b)[0], x, dout)
dw_num = eval_numerical_gradient_array(lambda w: affine_forward(x, w, b)[0], w, dout)
db_num = eval_numerical_gradient_array(lambda b: affine_forward(x, w, b)[0], b, dout)
# Analytic gradients from the backward pass.
_, cache = affine_forward(x, w, b)
dx, dw, db = affine_backward(dout, cache)
# The error should be around e-10 or less
print('Testing affine_backward function:')
print('dx error: ', rel_error(dx_num, dx))
print('dw error: ', rel_error(dw_num, dw))
print('db error: ', rel_error(db_num, db))
这里把梯度检查的代码贴出来
def eval_numerical_gradient_array(f, x, df, h=1e-5):
    """
    Numerically estimate the gradient of an array-valued function.

    Inputs:
    - f: Function that accepts a numpy array and returns a numpy array.
    - x: Array at which to evaluate the gradient. It is perturbed in place
      one element at a time and restored after each perturbation.
    - df: Upstream gradient, multiplied element-wise with the change in f.
    - h: Step size of the centered difference.

    Returns an array shaped like x holding the estimated gradient.
    """
    grad = np.zeros_like(x)
    walker = np.nditer(x, flags=["multi_index"], op_flags=["readwrite"])
    for _ in walker:
        idx = walker.multi_index
        saved = x[idx]

        # Evaluate f on both sides of the current element.
        x[idx] = saved + h
        f_plus = f(x).copy()
        x[idx] = saved - h
        f_minus = f(x).copy()
        x[idx] = saved  # put the original value back

        # Centered difference, contracted with the upstream gradient.
        grad[idx] = np.sum((f_plus - f_minus) * df) / (2 * h)
    return grad
def relu_forward(x):
    """
    Forward pass of a rectified linear unit (ReLU) layer.

    Input:
    - x: Inputs, of any shape

    Returns a tuple of:
    - out: Output, of the same shape as x
    - cache: x
    """
    # Element-wise max(0, x); the input itself is kept as the cache.
    return np.maximum(0, x), x
def relu_backward(dout, cache):
    """
    Backward pass of a rectified linear unit (ReLU) layer.

    Input:
    - dout: Upstream derivatives, of any shape
    - cache: Input x, of same shape as dout

    Returns:
    - dx: Gradient with respect to x
    """
    x = cache
    # The gradient passes through only where the forward input was strictly
    # positive. Build a new array from a boolean mask: assigning `dx = x` and
    # editing it in place would overwrite the cached forward input (and any
    # other array sharing its memory) with the gradient.
    return dout * (x > 0)
def affine_relu_forward(x, w, b):
    """
    Convenience layer that performs an affine transform followed by a ReLU.

    Inputs:
    - x: Input to the affine layer
    - w, b: Weights for the affine layer

    Returns a tuple of:
    - out: Output from the ReLU
    - cache: Object to give to the backward pass
    """
    pre_activation, affine_cache = affine_forward(x, w, b)
    activated, activation_cache = relu_forward(pre_activation)
    # Both caches are needed by the backward pass, affine cache first.
    return activated, (affine_cache, activation_cache)
def affine_relu_backward(dout, cache):
    """
    Backward pass for the affine-relu convenience layer.

    Inputs:
    - dout: Upstream derivative of the ReLU output
    - cache: (fc_cache, relu_cache) as returned by affine_relu_forward

    Returns a tuple (dx, dw, db) of gradients for the affine layer's
    input, weights and biases.
    """
    affine_cache, activation_cache = cache
    # Undo the layers in reverse order: ReLU first, then the affine map.
    d_pre_activation = relu_backward(dout, activation_cache)
    return affine_backward(d_pre_activation, affine_cache)
def svm_loss(x, y):
    """
    Computes the loss and gradient for multiclass SVM classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C

    Returns a tuple of:
    - loss: Scalar giving the loss
    - dx: Gradient of the loss with respect to x
    """
    n = x.shape[0]
    rows = np.arange(n)

    # Hinge margins against the true-class score, with a margin of 1.
    true_scores = x[rows, y].reshape(-1, 1)
    hinge = np.maximum(0, x - true_scores + 1)
    hinge[rows, y] = 0  # the true class contributes no loss
    loss = np.sum(hinge) / n

    # Reuse the margin array as the gradient: +1 for every violating class,
    # and minus the number of violations for the true class.
    hinge[hinge > 0] = 1
    violations = np.sum(hinge, axis=1)
    hinge[rows, y] -= violations
    dx = hinge / n
    return loss, dx
def softmax_loss(x, y):
    """
    Computes the loss and gradient for softmax classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C

    Returns a tuple of:
    - loss: Scalar giving the loss
    - dx: Gradient of the loss with respect to x
    """
    num_train = x.shape[0]
    rows = np.arange(num_train)

    # Shift each row so its maximum is 0; exp() can then not overflow.
    shifted = x - np.max(x, axis=1, keepdims=True)
    # Work with log-probabilities directly. Taking log() of a probability
    # that underflowed to 0 would give -inf and an infinite loss.
    log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    log_probs = shifted - log_norm

    loss = -np.sum(log_probs[rows, y]) / num_train

    # Gradient: softmax probabilities minus the one-hot labels, averaged
    # over the batch.
    dx = np.exp(log_probs)
    dx[rows, y] -= 1
    dx /= num_train
    return (loss, dx)
import numpy as np
class TwoLayerNet(object):
    """
    A two-layer fully-connected neural network with ReLU nonlinearity and
    softmax loss that uses a modular layer design. We assume an input dimension
    of D, a hidden dimension of H, and perform classification over C classes.

    The architecture is affine - relu - affine - softmax.

    Note that this class does not implement gradient descent; instead, it
    will interact with a separate Solver object that is responsible for running
    optimization.

    The learnable parameters are stored in the dictionary self.params under
    the keys 'W1', 'b1', 'W2' and 'b2'.
    """

    def __init__(
        self,
        input_dim=3 * 32 * 32,
        hidden_dim=100,
        num_classes=10,
        weight_scale=1e-3,
        reg=0.0,
    ):
        """
        Initialize a new network.

        Inputs:
        - input_dim: An integer giving the size of the input
        - hidden_dim: An integer giving the size of the hidden layer
        - num_classes: An integer giving the number of classes to classify
        - weight_scale: Scalar giving the standard deviation for random
          initialization of the weights.
        - reg: Scalar giving L2 regularization strength.
        """
        self.reg = reg

        # Weights are drawn from a zero-mean Gaussian with standard deviation
        # weight_scale (first layer first, then second layer); biases start
        # at zero.
        first_weights = np.random.normal(
            loc=0.0, scale=weight_scale, size=(input_dim, hidden_dim)
        )
        second_weights = np.random.normal(
            loc=0.0, scale=weight_scale, size=(hidden_dim, num_classes)
        )
        self.params = {
            'W1': first_weights,
            'b1': np.zeros(hidden_dim),
            'W2': second_weights,
            'b2': np.zeros(num_classes),
        }

    def loss(self, X, y=None):
        """
        Compute loss and gradient for a minibatch of data.

        Inputs:
        - X: Array of input data of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

        Returns:
        If y is None, then run a test-time forward pass of the model and return:
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.

        If y is not None, then run a training-time forward and backward pass and
        return a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with the same keys as self.params, mapping parameter
          names to gradients of the loss with respect to those parameters.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']

        # Forward pass: affine - relu - affine.
        hidden, hidden_cache = affine_relu_forward(X, W1, b1)
        scores, output_cache = affine_forward(hidden, W2, b2)

        # If y is None then we are in test mode so just return scores
        if y is None:
            return scores

        # Softmax data loss plus L2 regularization on both weight matrices.
        # The 0.5 factor makes the regularization gradient exactly reg * W.
        data_loss, d_scores = softmax_loss(scores, y)
        reg_loss = (
            0.5 * self.reg * (W1 * W1).sum() + 0.5 * self.reg * (W2 * W2).sum()
        )
        loss = data_loss + reg_loss

        # Backward pass, last layer first.
        grads = {}
        d_hidden, grads['W2'], grads['b2'] = affine_backward(d_scores, output_cache)
        _, grads['W1'], grads['b1'] = affine_relu_backward(d_hidden, hidden_cache)

        # Regularization applies to the weights only, not to the biases.
        grads['W1'] += self.reg * W1
        grads['W2'] += self.reg * W2
        return loss, grads
def sgd(w, dw, config=None):
    """
    Performs vanilla stochastic gradient descent.

    Inputs:
    - w: Array of current parameter values; updated in place.
    - dw: Array of the same shape as w giving the gradient.
    - config: Dictionary of hyperparameters, or None for an empty one.

    config format:
    - learning_rate: Scalar learning rate (filled in as 1e-2 when missing).

    Returns a tuple (w, config): the updated array (the same object that was
    passed in) and the config dictionary.
    """
    config = {} if config is None else config
    config.setdefault("learning_rate", 1e-2)

    step = config["learning_rate"] * dw
    w -= step  # in-place update: the caller's array is modified
    return w, config
from __future__ import print_function, division
from future import standard_library
from cs231n import optim
standard_library.install_aliases()
import os
import pickle as pickle
class Solver(object):
    """
    A Solver encapsulates all the logic necessary for training classification
    models. The Solver performs stochastic gradient descent using different
    update rules defined in optim.py.

    The solver accepts both training and validation data and labels so it can
    periodically check classification accuracy on both training and validation
    data to watch out for overfitting.

    To train a model, construct a Solver instance, passing the model, the
    dataset and the various options (learning rate, batch size, etc.) to the
    constructor, then call train(). Afterwards model.params holds the
    parameters that performed best on the validation set, and the instance
    variables loss_history, train_acc_history and val_acc_history hold the
    values recorded during training.

    Example usage might look something like this:

    data = {
      'X_train': # training data
      'y_train': # training labels
      'X_val': # validation data
      'y_val': # validation labels
    }
    model = MyAwesomeModel(hidden_size=100, reg=10)
    solver = Solver(model, data,
                    update_rule='sgd',
                    optim_config={
                      'learning_rate': 1e-4,
                    },
                    lr_decay=0.95,
                    num_epochs=5, batch_size=200,
                    print_every=100)
    solver.train()

    A Solver works on a model object that must conform to the following API:

    - model.params must be a dictionary mapping string parameter names to numpy
      arrays containing parameter values.

    - model.loss(X, y) must be a function that computes training-time loss and
      gradients, and test-time classification scores, with the following inputs
      and outputs:

      Inputs:
      - X: Array giving a minibatch of input data of shape (N, d_1, ..., d_k)
      - y: Array of labels, of shape (N,) giving labels for X where y[i] is the
        label for X[i].

      Returns:
      If y is None, run a test-time forward pass and return:
      - scores: Array of shape (N, C) giving classification scores for X where
        scores[i, c] gives the score of class c for X[i].

      If y is not None, run a training time forward and backward pass and
      return a tuple of:
      - loss: Scalar giving the loss
      - grads: Dictionary with the same keys as self.params mapping parameter
        names to gradients of the loss with respect to those parameters.
    """

    def __init__(self, model, data, **kwargs):
        """
        Construct a new Solver instance.

        Required arguments:
        - model: A model object conforming to the API described above
        - data: A dictionary of training and validation data containing:
          'X_train': Array, shape (N_train, d_1, ..., d_k) of training images
          'X_val': Array, shape (N_val, d_1, ..., d_k) of validation images
          'y_train': Array, shape (N_train,) of labels for training images
          'y_val': Array, shape (N_val,) of labels for validation images

        Optional arguments:
        - update_rule: A string giving the name of an update rule in optim.py.
          Default is 'sgd'.
        - optim_config: A dictionary containing hyperparameters that will be
          passed to the chosen update rule. Each update rule requires different
          hyperparameters (see optim.py) but all update rules require a
          'learning_rate' parameter so that should always be present.
        - lr_decay: A scalar for learning rate decay; after each epoch the
          learning rate is multiplied by this value.
        - batch_size: Size of minibatches used to compute loss and gradient
          during training.
        - num_epochs: The number of epochs to run for during training.
        - print_every: Integer; training losses will be printed every
          print_every iterations.
        - verbose: Boolean; if set to false then no output will be printed
          during training.
        - num_train_samples: Number of training samples used to check training
          accuracy; default is 1000; set to None to use entire training set.
        - num_val_samples: Number of validation samples to use to check val
          accuracy; default is None, which uses the entire validation set.
        - checkpoint_name: If not None, then save model checkpoints here every
          epoch.

        Raises:
        - ValueError: if an unknown keyword argument is passed, or if
          update_rule does not name an attribute of the optim module.
        """
        self.model = model
        self.X_train = data["X_train"]
        self.y_train = data["y_train"]
        self.X_val = data["X_val"]
        self.y_val = data["y_val"]

        # Unpack keyword arguments. pop() removes each recognised option, so
        # whatever is left afterwards is an unknown argument.
        self.update_rule = kwargs.pop("update_rule", "sgd")
        self.optim_config = kwargs.pop("optim_config", {})
        self.lr_decay = kwargs.pop("lr_decay", 1.0)
        self.batch_size = kwargs.pop("batch_size", 100)
        self.num_epochs = kwargs.pop("num_epochs", 10)
        self.num_train_samples = kwargs.pop("num_train_samples", 1000)
        self.num_val_samples = kwargs.pop("num_val_samples", None)

        self.checkpoint_name = kwargs.pop("checkpoint_name", None)
        self.print_every = kwargs.pop("print_every", 10)
        self.verbose = kwargs.pop("verbose", True)

        # Throw an error if there are extra keyword arguments
        if kwargs:
            extra = ", ".join('"%s"' % k for k in kwargs)
            raise ValueError("Unrecognized arguments %s" % extra)

        # Make sure the update rule exists in the optim module, then replace
        # the string name with the actual function.
        if not hasattr(optim, self.update_rule):
            raise ValueError('Invalid update_rule "%s"' % self.update_rule)
        self.update_rule = getattr(optim, self.update_rule)

        self._reset()

    def _reset(self):
        """
        Set up some book-keeping variables for optimization. Don't call this
        manually.
        """
        # Set up some variables for book-keeping
        self.epoch = 0
        self.best_val_acc = 0
        self.best_params = {}
        self.loss_history = []
        self.train_acc_history = []
        self.val_acc_history = []

        # Give every parameter its own copy of optim_config. All copies start
        # out identical, but an update rule may store per-parameter state in
        # its config, so the dictionaries must not be shared. The copy is
        # shallow: the values themselves are not duplicated.
        self.optim_configs = {}
        for p in self.model.params:
            self.optim_configs[p] = dict(self.optim_config)

    def _step(self):
        """
        Make a single gradient update. This is called by train() and should
        not be called manually.
        """
        # Make a minibatch of training data. np.random.choice samples with
        # replacement by default, so an example may appear twice in a batch.
        num_train = self.X_train.shape[0]
        batch_mask = np.random.choice(num_train, self.batch_size)
        X_batch = self.X_train[batch_mask]
        y_batch = self.y_train[batch_mask]

        # Compute loss and gradient
        loss, grads = self.model.loss(X_batch, y_batch)
        self.loss_history.append(loss)

        # Perform a parameter update: every parameter is passed through the
        # update rule together with its own config dictionary.
        for p, w in self.model.params.items():
            dw = grads[p]
            config = self.optim_configs[p]
            next_w, next_config = self.update_rule(w, dw, config)
            self.model.params[p] = next_w
            self.optim_configs[p] = next_config

    def _save_checkpoint(self):
        """
        Pickle the model and the training state to
        "<checkpoint_name>_epoch_<epoch>.pkl". Does nothing when
        checkpoint_name is None.
        """
        if self.checkpoint_name is None:
            return
        checkpoint = {
            "model": self.model,
            "update_rule": self.update_rule,
            "lr_decay": self.lr_decay,
            "optim_config": self.optim_config,
            "batch_size": self.batch_size,
            "num_train_samples": self.num_train_samples,
            "num_val_samples": self.num_val_samples,
            "epoch": self.epoch,
            "loss_history": self.loss_history,
            "train_acc_history": self.train_acc_history,
            "val_acc_history": self.val_acc_history,
        }
        filename = "%s_epoch_%d.pkl" % (self.checkpoint_name, self.epoch)
        if self.verbose:
            print('Saving checkpoint to "%s"' % filename)
        with open(filename, "wb") as f:
            pickle.dump(checkpoint, f)

    def check_accuracy(self, X, y, num_samples=None, batch_size=100):
        """
        Check accuracy of the model on the provided data.

        Inputs:
        - X: Array of data, of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,)
        - num_samples: If not None, subsample the data and only test the model
          on num_samples datapoints.
        - batch_size: Split X and y into batches of this size to avoid using
          too much memory.

        Returns:
        - acc: Scalar giving the fraction of instances that were correctly
          classified by the model.
        """
        # Maybe subsample the data
        N = X.shape[0]
        if num_samples is not None and N > num_samples:
            mask = np.random.choice(N, num_samples)
            N = num_samples
            X = X[mask]
            y = y[mask]

        # Compute predictions in batches; the last batch may be shorter.
        num_batches = N // batch_size
        if N % batch_size != 0:
            num_batches += 1
        y_pred = []
        for i in range(num_batches):
            start = i * batch_size
            end = (i + 1) * batch_size
            # Called without labels, model.loss returns the class scores.
            scores = self.model.loss(X[start:end])
            y_pred.append(np.argmax(scores, axis=1))
        y_pred = np.hstack(y_pred)
        acc = np.mean(y_pred == y)

        return acc

    def train(self):
        """
        Run optimization to train the model.
        """
        num_train = self.X_train.shape[0]
        iterations_per_epoch = max(num_train // self.batch_size, 1)
        num_iterations = self.num_epochs * iterations_per_epoch

        for t in range(num_iterations):
            self._step()

            # Maybe print training loss
            if self.verbose and t % self.print_every == 0:
                print(
                    "(Iteration %d / %d) loss: %f"
                    % (t + 1, num_iterations, self.loss_history[-1])
                )

            # At the end of every epoch, increment the epoch counter and decay
            # the learning rate.
            epoch_end = (t + 1) % iterations_per_epoch == 0
            if epoch_end:
                self.epoch += 1
                for k in self.optim_configs:
                    self.optim_configs[k]["learning_rate"] *= self.lr_decay

            # Check train and val accuracy on the first iteration, the last
            # iteration, and at the end of each epoch.
            first_it = t == 0
            last_it = t == num_iterations - 1
            if first_it or last_it or epoch_end:
                train_acc = self.check_accuracy(
                    self.X_train, self.y_train, num_samples=self.num_train_samples
                )
                val_acc = self.check_accuracy(
                    self.X_val, self.y_val, num_samples=self.num_val_samples
                )
                self.train_acc_history.append(train_acc)
                self.val_acc_history.append(val_acc)
                self._save_checkpoint()

                if self.verbose:
                    print(
                        "(Epoch %d / %d) train acc: %f; val_acc: %f"
                        % (self.epoch, self.num_epochs, train_acc, val_acc)
                    )

                # Keep track of the best model. Copies are stored because the
                # update rule may modify the live parameter arrays in place.
                if val_acc > self.best_val_acc:
                    self.best_val_acc = val_acc
                    self.best_params = {}
                    for k, v in self.model.params.items():
                        self.best_params[k] = v.copy()

        # At the end of training swap the best params into the model. If no
        # snapshot was ever taken (zero iterations, or validation accuracy
        # never rose above 0) keep the current parameters instead of
        # replacing them with an empty dictionary.
        if self.best_params:
            self.model.params = self.best_params
# Train a small two-layer network on the loaded data with plain SGD.
input_size = 32 * 32 * 3
hidden_size = 50
num_classes = 10
model = TwoLayerNet(input_size, hidden_size, num_classes)
solver = None
solver = Solver(model, data,
update_rule='sgd',
optim_config={
'learning_rate': 1e-3,
},
lr_decay=0.95,
num_epochs=5, batch_size=100,
print_every=500, verbose = True)
# Any option after `data` that is left out falls back to the Solver default.
solver.train()
(Iteration 1 / 2450) loss: 2.300230 (Epoch 0 / 5) train acc: 0.126000; val_acc: 0.127000 (Epoch 1 / 5) train acc: 0.438000; val_acc: 0.434000 (Iteration 501 / 2450) loss: 1.513676 (Epoch 2 / 5) train acc: 0.475000; val_acc: 0.464000 (Iteration 1001 / 2450) loss: 1.297377 (Epoch 3 / 5) train acc: 0.491000; val_acc: 0.471000 (Iteration 1501 / 2450) loss: 1.630036 (Epoch 4 / 5) train acc: 0.511000; val_acc: 0.477000 (Iteration 2001 / 2450) loss: 1.391281 (Epoch 5 / 5) train acc: 0.516000; val_acc: 0.479000
打印loss和accuracy
# Run this cell to visualize training loss and train / val accuracy
# Top panel: the loss values recorded in solver.loss_history.
plt.subplot(2, 1, 1)
plt.title('Training loss')
plt.plot(solver.loss_history, 'o')
plt.xlabel('Iteration')
# Bottom panel: the recorded training and validation accuracies.
plt.subplot(2, 1, 2)
plt.title('Accuracy')
plt.plot(solver.train_acc_history, '-o', label='train')
plt.plot(solver.val_acc_history, '-o', label='val')
# Dashed horizontal reference line at an accuracy of 0.5.
plt.plot([0.5] * len(solver.val_acc_history), 'k--')
plt.xlabel('Epoch')
plt.legend(loc='lower right')
plt.gcf().set_size_inches(15, 12)
plt.show()
from math import sqrt, ceil
import numpy as np
def visualize_grid(Xs, ubound=255.0, padding=1):
    """
    Reshape a 4D tensor of image data to a grid for easy visualization.

    The images are laid out row by row on a square grid of side
    ceil(sqrt(N)). Each image is rescaled on its own to [0, ubound]; unused
    cells and the padding between cells stay 0.

    Inputs:
    - Xs: Data of shape (N, H, W, C)
    - ubound: Output grid will have values scaled to the range [0, ubound]
    - padding: The number of blank pixels between elements of the grid

    Returns:
    - grid: Array of shape (grid_height, grid_width, C)
    """
    (N, H, W, C) = Xs.shape
    grid_size = int(ceil(sqrt(N)))
    # Padding only goes between cells. Clamp at 0 so that an empty input
    # gives an empty grid instead of a negative dimension.
    num_gaps = max(grid_size - 1, 0)
    grid_height = H * grid_size + padding * num_gaps
    grid_width = W * grid_size + padding * num_gaps
    grid = np.zeros((grid_height, grid_width, C))

    next_idx = 0
    y0, y1 = 0, H
    for _row in range(grid_size):
        x0, x1 = 0, W
        for _col in range(grid_size):
            if next_idx < N:
                img = Xs[next_idx]
                low, high = np.min(img), np.max(img)
                span = high - low
                # A constant image has no range to stretch; leave its cell at
                # 0 rather than dividing by zero and filling it with NaN.
                if span > 0:
                    grid[y0:y1, x0:x1] = ubound * (img - low) / span
                next_idx += 1
            x0 += W + padding
            x1 += W + padding
        y0 += H + padding
        y1 += H + padding
    return grid
# Visualize the weights of the network
def show_net_weights(net):
    """Display the first-layer weights of net as a grid of 32x32 RGB images.

    Each column of W1 is treated as one image stored channel-first
    (3, 32, 32).
    """
    first_layer = net.params['W1']
    # (3, 32, 32, H) -> (H, 32, 32, 3): one image per hidden unit, channels last.
    filters = first_layer.reshape(3, 32, 32, -1).transpose(3, 1, 2, 0)
    grid = visualize_grid(filters, padding=3)
    plt.imshow(grid.astype('uint8'))
    plt.gca().axis('off')
    plt.show()
# Show the first-layer weights of the current model.
show_net_weights(model)
Below, you should experiment with different values of the various hyperparameters, including hidden layer size, learning rate, number of training epochs, and regularization strength. You might also consider tuning the learning rate decay, but you should be able to get good performance using the default value.
下面的代码对超参数的搜索介于随机搜索和网格搜索之间
# Tune hyperparameters using the validation set. The best trained model is
# kept in best_model and its validation accuracy in best_val_accuracy.
best_model = None
best_val_accuracy = 0.0

input_dim = 32 * 32 * 3
num_classes = 10

# Candidate values the search draws from.
hidden_dims = [50, 60, 80]
lrs = [5e-5, 1e-4, 5e-4, 1e-3]
regs = [0.5, 0.6]
epos = [1, 2]
def random_chose_para(hidden_dims, lrs, regs, epos):
    """Pick one value uniformly at random from each candidate list.

    Inputs are the candidate lists for hidden size, learning rate,
    regularization strength and number of epochs.

    Returns a tuple (hidden_dim, lr, reg, epo).
    """
    def pick(options):
        # randint's upper bound is exclusive, so this is always a valid index.
        return options[np.random.randint(0, len(options))]

    # Drawn in this fixed order so a seeded run stays reproducible.
    hidden_dim = pick(hidden_dims)
    lr = pick(lrs)
    reg = pick(regs)
    epo = pick(epos)
    return hidden_dim, lr, reg, epo
# Train one model per randomly drawn hyperparameter combination and keep the
# one with the highest validation accuracy. Raise the range to try more
# combinations.
for ite in range(1):
    hidden_dim, lr, reg, epo = random_chose_para(hidden_dims, lrs, regs, epos)
    model = TwoLayerNet(input_dim, hidden_dim, num_classes, reg=reg)
    solver = Solver(model, data,
                    update_rule='sgd',
                    optim_config={
                        'learning_rate': lr,
                    },
                    lr_decay=0.95,
                    num_epochs=epo, batch_size=100,
                    print_every=500, verbose=True)
    solver.train()
    # Report the accuracy of the run that just finished. Printing
    # best_val_accuracy here would show the previous best (0.0 on the first
    # run) because it is only updated below.
    print('Validation accuracy: ', solver.best_val_acc)
    if solver.best_val_acc > best_val_accuracy:
        best_val_accuracy = solver.best_val_acc
        best_model = model
# Best validation accuracy over all runs.
print('Validation accuracy: ', best_val_accuracy)
>>a = np.array([[1,2,3],[4,5,6],[7,8,9]])
>>print(a)
[[1 2 3] [4 5 6] [7 8 9]]
>>b = np.array([1,2,3])
>>print(b.shape)
(3,)
>>print(a+b)
[[ 2 4 6] [ 5 7 9] [ 8 10 12]]
>>a = np.ones(2)
>>b = np.array([[1,2],[3,4]])
>>print(b.dot(a))
[3. 7.]
>>print(np.random.randint(5, size = 3))
[1 3 2]
>>print(np.random.randint(5, size = (2,2)))
[[1 2] [1 4]]
>>a = {}
>>a.setdefault("learning_rate", 1e-2)
>>print(a)
>>b = {}
>>b.pop("update_rule", "sgd")
>>b1 = b.pop("update_rule", "sgd")
>>print(b1)
sgd
>>c = {'update_rule':'dad'}
>>c1 = c.pop("update_rule", "sgd")
>>print(c)
>>print(c1)
{} dad