svm
Here the "SVM" is a linear classifier trained with the multiclass SVM (hinge) loss: a single weight matrix maps each image directly to one score per class, and training pushes the correct class's score above all the others by a margin.

Step 1: randomly initialize a weight matrix W of shape (10, 3073) (3072 pixel values plus a bias dimension) and use it to map the (3073, 49000) training matrix into a (10, 49000) score matrix, one row per class.

Step 2: compute the loss function and its gradient with respect to W. The loss is

L = \frac{1}{N}\sum_i L_i + \lambda R(W)

where the per-example hinge loss, with scores s = W x_i and margin \Delta = 1, is

L_i = \sum_{j \neq y_i} \max\big(0,\; s_j - s_{y_i} + \Delta\big).

It looks complicated, but differentiating only requires splitting into the cases j = y_i and j \neq y_i, which contribute with opposite signs:

\nabla_{W_{y_i}} L_i = -\Big(\sum_{j \neq y_i} \mathbb{1}\big(s_j - s_{y_i} + \Delta > 0\big)\Big)\, x_i^T

\nabla_{W_j} L_i = \mathbb{1}\big(s_j - s_{y_i} + \Delta > 0\big)\, x_i^T \qquad (j \neq y_i)

With R(W) = \sum W^2 the full gradient is \nabla_W L = \frac{1}{N}\sum_i \nabla_W L_i + 2\lambda W; the code below writes the regularization as 0.5\,\lambda \sum W^2, so its gradient term is simply \lambda W. (A tiny numeric example follows these steps.)

Step 3: update W -= learning_rate * dW until the loss converges. The optimizer is SGD (stochastic gradient descent): fix the number of iterations and, at each iteration, estimate the gradient on a randomly sampled mini-batch of training data.

Step 4: use the trained W to predict labels for the test data. Accuracy ends up around 40%; tuning the learning rate and the regularization strength improves it somewhat.
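To make the step-2 formulas concrete, here is a minimal NumPy sketch (toy numbers, nothing to do with CIFAR-10) that evaluates L_i and its gradient for a single example:

import numpy as np

# Toy setup: C = 3 classes, D = 4 features, one example x with label y = 0.
W = np.array([[ 0.2, -0.1,  0.0,  0.3],
              [ 0.1,  0.4, -0.2,  0.0],
              [-0.3,  0.2,  0.1,  0.1]])   # shape (C, D)
x = np.array([1.0, 2.0, -1.0, 0.5])        # shape (D,)
y = 0

scores = W.dot(x)                           # s_j = (W x)_j
margins = np.maximum(0.0, scores - scores[y] + 1.0)
margins[y] = 0.0                            # the j == y term is skipped
L_i = np.sum(margins)

# Gradient of L_i: every violated margin (margin > 0) adds x to row j
# and subtracts x from row y, matching the two formulas above.
dW = np.zeros_like(W)
for j in range(W.shape[0]):
    if j == y or margins[j] <= 0:
        continue
    dW[j] += x
    dW[y] -= x

print 'scores =', scores
print 'L_i = %f' % L_i
print 'dW =\n', dW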
linear_classifier
import numpy as np
from cs231n.classifiers.linear_svm import *
from cs231n.classifiers.softmax import *
class LinearClassifier:
    def __init__(self):
        self.W = None

    def train(self, X, y, learning_rate=1e-3, reg=1e-5, num_iters=100,
              batch_size=200, verbose=False):
        dim, num_train = X.shape
        num_classes = np.max(y) + 1  # assume y takes values 0...K-1 where K is number of classes
        if self.W is None:
            # lazily initialize W
            self.W = np.random.randn(num_classes, dim) * 0.001

        # Run stochastic gradient descent to optimize W
        loss_history = []
        for it in xrange(num_iters):
            # sample a mini-batch of columns (each column is one training example)
            sample_index = np.random.choice(num_train, batch_size, replace=False)
            X_batch = X[:, sample_index]
            y_batch = y[sample_index]
            # y_batch = np.random.choice(y,batch_size,replace=False)
            # X_batch = X[:,y_batch]

            # evaluate loss and gradient
            loss, grad = self.loss(X_batch, y_batch, reg)
            loss_history.append(loss)

            # update the weights using the gradient
            self.W += -learning_rate * grad

            if verbose and it % 100 == 0:
                print 'iteration %d / %d: loss %f' % (it, num_iters, loss)
        return loss_history

    def predict(self, X):
        # pick the class with the highest score for each column of X
        y_pred = np.argmax(np.dot(self.W, X), axis=0)
        return y_pred

    def loss(self, X_batch, y_batch, reg):
        """
        Compute the loss function and its derivative.
        Subclasses will override this.

        Inputs:
        - X_batch: D x N array of data; each column is a data point.
        - y_batch: 1-dimensional array of length N with labels 0...K-1, for K classes.
        - reg: (float) regularization strength.

        Returns: A tuple containing:
        - loss as a single float
        - gradient with respect to self.W; an array of the same shape as W
        """
        pass


class LinearSVM(LinearClassifier):
    """ A subclass that uses the Multiclass SVM loss function """
    def loss(self, X_batch, y_batch, reg):
        return svm_loss_vectorized(self.W, X_batch, y_batch, reg)


class Softmax(LinearClassifier):
    """ A subclass that uses the Softmax + Cross-entropy loss function """
    def loss(self, X_batch, y_batch, reg):
        return softmax_loss_vectorized(self.W, X_batch, y_batch, reg)
- linear_svm
import numpy as np
from random import shuffle
def svm_loss_naive(W, X, y, reg):
    """
    Structured SVM loss function, naive implementation (with loops).

    Inputs:
    - W: 10 x 3073 array of weights
    - X: 3073 x 49000 array of data. Data are D-dimensional columns
    - y: 1-dimensional array of length 49000 with labels 0...9
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss as single float
    - gradient with respect to weights W; an array of same shape as W
    """
    dW = np.zeros(W.shape)  # initialize the gradient as zero

    # compute the loss and the gradient
    num_classes = W.shape[0]  # 10
    num_train = X.shape[1]    # 49000
    loss = 0.0
    # implement max(0, s[j] - s[y_i] + 1), one example and one class at a time
    for i in xrange(num_train):
        scores = W.dot(X[:, i])  # class scores f(x_i; W) for example i
        correct_class_score = scores[y[i]]
        for j in xrange(num_classes):
            if j == y[i]:
                continue
            margin = scores[j] - correct_class_score + 1  # max(0, s_j - s_{y_i} + 1)
            if margin > 0:  # margins <= 0 contribute nothing to the loss or gradient
                loss += margin
                dW[y[i], :] += -X[:, i]
                dW[j, :] += X[:, i]

    # Right now the loss is a sum over all training examples, but we want it
    # to be an average instead, so we divide by num_train.
    loss /= num_train
    dW /= num_train

    # Add regularization to the loss and the gradient.
    loss += 0.5 * reg * np.sum(W * W)
    dW += reg * W
    return loss, dW
def svm_loss_vectorized(W, X, y, reg):
    """
    Structured SVM loss function, vectorized implementation.
    Inputs and outputs are the same as svm_loss_naive.
    """
    loss = 0.0
    dW = np.zeros(W.shape)  # initialize the gradient as zero
    num_train = X.shape[1]
    num_classes = W.shape[0]

    # compute the loss function
    scores = W.dot(X)                                # (C, N) class scores
    correct_score = scores[y, np.arange(num_train)]  # correct-class score for each column
    margins = scores - correct_score + 1.0           # s_j - s_{y_i} + 1 for every class/example
    margins[y, np.arange(num_train)] = 0.0           # the correct class never contributes
    margins[margins < 0] = 0.0                       # hinge: clamp negative margins to zero
    loss += np.sum(margins) / num_train
    loss += 0.5 * reg * np.sum(W * W)

    # compute the gradient:
    # every violated margin contributes +x_i to row j and -x_i to row y_i,
    # so turn margins into an indicator matrix and put -(number of violations)
    # in the correct-class entry of each column.
    margins[margins > 0] = 1.0
    row_sum = np.sum(margins, axis=0)
    margins[y, np.arange(num_train)] = -row_sum
    dW += np.dot(margins, X.T) / num_train + reg * W
    return loss, dW
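As a quick sanity check of svm_loss_vectorized (a sketch that reuses the cs231n import path already used in linear_classifier): with near-zero weights every margin is about 1, so the loss should land near C − 1.

import numpy as np
from cs231n.classifiers.linear_svm import svm_loss_vectorized

# Tiny fake problem: D = 5 features (including the bias), C = 3 classes, N = 4 columns.
np.random.seed(0)
W = np.random.randn(3, 5) * 0.0001   # near-zero weights -> scores near zero
X = np.random.randn(5, 4)
y = np.array([0, 2, 1, 2])

loss, dW = svm_loss_vectorized(W, X, y, reg=0.0)
print 'loss (expect about C - 1 = 2): %f' % loss
print 'dW shape:', dW.shape           # must match W.shape == (3, 5)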
- svm.py
import random
import numpy as np
from cs231n.data_utils import load_CIFAR10
import matplotlib.pyplot as plt
import time
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
num_training = 49000
num_validation = 1000
num_test = 1000
mask = range(num_training, num_training + num_validation)
X_val = X_train[mask]
y_val = y_train[mask]
mask = range(num_training)
X_train = X_train[mask]
y_train = y_train[mask]
mask = range(num_test)
X_test = X_test[mask]
y_test = y_test[mask]
X_train = np.reshape(X_train,(X_train.shape[0],-1))
X_val = np.reshape(X_val,(X_val.shape[0],-1))
X_test = np.reshape(X_test,(X_test.shape[0],-1))
print 'Training data shape: ', X_train.shape
print 'Validation data shape: ', X_val.shape
print 'Test data shape: ', X_test.shape
# compute the mean image (per-pixel average over the training set)
mean_image = np.mean(X_train,axis=0)
print mean_image[:10]
plt.figure(figsize=(4,4))
#plt.imshow(mean_image.reshape((32,32,3)).astype('uint8'))
X_train -= mean_image
X_val -= mean_image
X_test -= mean_image
print X_train.shape,X_train.shape[0]
print "np.ones((X_train.shape[0],1))'s shape is",np.ones((X_train.shape[0],1)).shape
X_train = np.hstack([X_train,np.ones((X_train.shape[0],1))]).T
X_val = np.hstack([X_val,np.ones((X_val.shape[0],1))]).T
X_test = np.hstack([X_test,np.ones((X_test.shape[0],1))]).T
print X_train.shape,X_val.shape,X_test.shape
# from cs231n.classifiers.linear_svm import svm_loss_naive
#
# W = np.random.randn(10,3073)*0.0001
# print W.shape
# loss,grad = svm_loss_naive(W,X_train,y_train,0.00001)
# print 'loss:%f' %(loss,)
#
# loss,grad = svm_loss_naive(W,X_train,y_train,0.0)
# from cs231n.gradient_check import grad_check_sparse
# f = lambda w:svm_loss_naive(W,X_train,y_train,0.0)[0]
# grad_numerical = grad_check_sparse(f,W,grad,10)
#
# tic = time.time()
# loss_native,grad_native = svm_loss_naive(W,X_train,y_train,0.00001)
# toc = time.time()
# print 'Naive loss:%e computed in %fs' %(loss_native,toc-tic)
#
# from cs231n.classifiers.linear_svm import svm_loss_vectorized
# tic = time.time()
# loss_vectorized,_ = svm_loss_vectorized(W,X_train,y_train,0.00001)
# toc = time.time()
#
# print 'Vectorized loss: %e computed in %fs' %(loss_vectorized,toc-tic)
# print 'difference: %f' %(loss_native - loss_vectorized)
#
# tic = time.time()
# _,grad_naive = svm_loss_naive(W,X_train,y_train,0.00001)
# toc = time.time()
# print 'Naive loss and gradient:computed in %fs' %(toc-tic)
#
# tic = time.time()
# _, grad_vectorized = svm_loss_vectorized(W, X_train, y_train, 0.00001)
# toc = time.time()
# print 'Vectorized loss and gradient: computed in %fs' % (toc - tic)
#
# difference = np.linalg.norm(grad_naive-grad_vectorized,ord='fro')
# print 'difference:%f' %difference
from cs231n.classifiers import LinearSVM
svm = LinearSVM()
tic = time.time()
loss_hist = svm.train(X_train, y_train, learning_rate=1e-7, reg=5e4,num_iters=1500, verbose=True)
toc = time.time()
print 'That took %fs' % (toc - tic)
# plt.plot(loss_hist)
# plt.xlabel('Iteration number')
# plt.ylabel('Loss value')
# plt.show()
y_train_pred = svm.predict(X_train)
print 'training accuracy: %f' %(np.mean(y_train == y_train_pred), )
y_val_pred = svm.predict(X_val)
print 'validation accuracy: %f' %(np.mean(y_val == y_val_pred), )
learning_rates = [1e-7,5e-5]
regularization_strengths = [5e4,1e5]
results = {}
best_val = -1
best_svm = None
iters = 2000
for lr in learning_rates:
    for rs in regularization_strengths:
        svm = LinearSVM()
        svm.train(X_train, y_train, learning_rate=lr, reg=rs, num_iters=iters)  # run SGD for `iters` iterations
        y_train_pred = svm.predict(X_train)
        acc_train = np.mean(y_train == y_train_pred)
        y_val_pred = svm.predict(X_val)
        acc_val = np.mean(y_val == y_val_pred)
        results[(lr, rs)] = (acc_train, acc_val)
        # keep the model with the best validation accuracy
        if best_val < acc_val:
            best_val = acc_val
            best_svm = svm
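The usual follow-up, sketched here under the assumption that the loop above has finished filling results, best_val and best_svm, is to print the accuracy grid and score the best model on the test set (step 4 above):

# Sketch only: report the grid search and evaluate the best model on the test set.
for lr, reg in sorted(results):
    train_accuracy, val_accuracy = results[(lr, reg)]
    print 'lr %e reg %e train accuracy: %f val accuracy: %f' % (lr, reg, train_accuracy, val_accuracy)
print 'best validation accuracy achieved during cross-validation: %f' % best_val

y_test_pred = best_svm.predict(X_test)
print 'linear SVM on raw pixels final test set accuracy: %f' % np.mean(y_test == y_test_pred)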