Digit Recognizer使用的是MNIST数据集。每个图片包含一个数字,图片大小为 $28\times28\times1$。尝试使用SVM以及CNN实现对手写体数字的识别。
1. 使用SVM识别手写体数字
使用sklearn可以较为方便地调用SVM分类器。由于图片包含28*28个像素点,不适合使用线性分类器,因此使用径向基核函数,参数的调优可以使用网格搜索(GridSearchCV,可以参考scikit-learn中超参数搜索之网格搜索(GridSearchCV))。使用交叉验证(cross_validate)检验一下模型的分类性能。
代码如下(在kaggle上最好的成绩是0.97457,由于后面代码进行了调整,其中的参数并不是效果最好的):
# -*- coding:utf-8 -*-
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# load the Kaggle train/test csv files
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# pixel columns are named pixel0..pixel783, hence the 'p.*' regex
trainx = train_df.filter(regex='p.*').to_numpy()
# ravel() flattens the (n, 1) label column into the 1-D vector sklearn
# expects, avoiding a DataConversionWarning at fit time
trainy = train_df.filter(regex='label').to_numpy().ravel()
# parameters = {'svc__gamma': ['scale', 0.001, 0.01, 0.1, 1], 'svc__C': [0.1, 1, 3, 5, 7, 9]}
# clf = Pipeline([('ss', StandardScaler()), ('svc', SVC())])
# gds = GridSearchCV(clf, parameters, refit=True, cv=5, verbose=1, n_jobs=-1)
# gds.fit(trainx, trainy)
# print('best parameters: {}'.format(gds.best_params_))
# print('best score: {}'.format(gds.best_score_))
# feed into svm classifier
# NOTE(review): max_iter=100 stops the solver long before convergence
# (ConvergenceWarning); use the default -1 for a fully converged model
svm_clf = SVC(C=3.0, kernel='rbf', gamma='scale', max_iter=100)
# cross_val = cross_validate(svm_clf, train_df.filter(regex='p.*'), train_df.filter(regex='label')
#                            , cv=2, n_jobs=-1, return_train_score=True)
# print(cross_val)
# predict the label of test set using svm classifier
svm_clf.fit(trainx, trainy)
predict_result = svm_clf.predict(test_df)
# Kaggle submissions need an ImageId column starting at 1 and a Label header
predict_result = pd.Series(predict_result, name='Label',
                           index=np.arange(1, len(predict_result) + 1))
predict_result.to_csv('result.csv', index_label='ImageId', header=True)
2. 使用cnn进行手写体识别
使用tensorflow实现一个cnn网络,包括两个卷积层,两个池化层以及最后的两个全连接层。维度变化为 $28\times28\times1 \xrightarrow{\text{conv 5*5}} 28\times28\times2 \xrightarrow{\text{maxpool 2*2}} 14\times14\times2 \xrightarrow{\text{conv 3*3}} 14\times14\times4 \xrightarrow{\text{maxpool 2*2}} 7\times7\times4 \xrightarrow{\text{reshape}} 1\times196 \xrightarrow{\text{dense}} 1\times64 \xrightarrow{\text{dense}} 1\times10$。一开始卷积层outchannel,以及网络层数设置的是比较大的,但是后来考虑到MNIST的手写体图片的信息相对较少,没有必要使用过多的网络层。
在训练过程的每一轮迭代中插入获取预测错误图片并显示的代码,发现有一些手写体数字的图片是倾斜的,部分达到90度左右,尝试对训练feed的数据随机的进行旋转操作,但是分类效果并没有提升,可能有旋转后引入错误的原因(比如6和9)。之后尝试使用阈值将图片像素置为0或255、尝试不同的激活函数等。发现加入阈值操作并没有带来效果的提升,激活函数使用relu可以取得不错的效果,使用softmax虽然可以取得较高的准确率但是收敛速度较慢,优化器使用Adagrad。为了保证结果的可重现性,设置随机数种子(seed)为0。在训练过程中,在训练集上的准确率可以超过0.99,但是在验证集上的准确率始终未能超过0.97,尝试使用并调整l2正则以及dropout,但未能有较大提高。在kaggle上,最高可以达到0.967左右。
代码如下:
# -*- coding:utf-8 -*-
from __future__ import print_function
import tensorflow as tf
from tensorflow.contrib.layers import l2_regularizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from copy import deepcopy
from os import path, listdir
from numpy.random import seed
# fix both RNG seeds so training runs are reproducible
seed(0)
tf.set_random_seed(0)
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# split train data into two parts: 70% train, 30% validation
trainx, testx, trainy, testy = train_test_split(train_df.filter(regex='p.*'),
                                                train_df.filter(regex='label'), test_size=0.3)
# convert every image from a flat 1*784 row to a 28*28 matrix; one vectorized
# reshape replaces the original per-image Python loops and deepcopy round-trips
trainx = trainx.to_numpy().reshape(-1, 28, 28)
testx = testx.to_numpy().reshape(-1, 28, 28)
test_df = test_df.to_numpy().reshape(-1, 28, 28)
# define the cnn model
def img_model(input_tf, is_training, regularizer):
    """Build the CNN: two conv+pool stages followed by two dense layers.

    Args:
        input_tf: float32 tensor of shape (batch, 28, 28, 1).
        is_training: Python bool; when True, dropout (driven by the global
            `keep_prob` placeholder) is applied to the hidden dense layer.
        regularizer: optional callable (e.g. an l2 regularizer) applied to
            the dense weights; penalties are stored in the 'losses' collection.

    Returns:
        A (batch, 10) tensor of raw, unnormalized class logits.
    """
    with tf.variable_scope('layer1-conv'):
        # 5*5 conv window, in channel is 1, out channel is 2
        conv1_w = tf.get_variable('weight', [5, 5, 1, 2],
                                  initializer=tf.truncated_normal_initializer(stddev=0.1))
        conv1_b = tf.get_variable('bias', [2], initializer=tf.constant_initializer(0.0))
        conv1 = tf.nn.conv2d(input_tf, conv1_w, strides=[1, 1, 1, 1], padding='SAME')
        relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_b))
    with tf.name_scope('layer2-pool'):
        # 2*2 max pool: 28*28*2 -> 14*14*2
        pool1 = tf.nn.max_pool(relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
    with tf.variable_scope('layer3-conv'):
        # 3*3 conv window, in channel is 2, out channel is 4
        # (scope keeps the original 'weights' name so old checkpoints restore)
        conv2_w = tf.get_variable('weights', [3, 3, 2, 4], initializer=tf.truncated_normal_initializer(stddev=0.1))
        conv2_b = tf.get_variable('bias', [4], initializer=tf.constant_initializer(0.0))
        conv2 = tf.nn.conv2d(pool1, conv2_w, strides=[1, 1, 1, 1], padding='SAME')
        relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_b))
    with tf.name_scope('layer4-pool'):
        # 2*2 max pool then flatten: 14*14*4 -> 7*7*4 -> 196
        pool2 = tf.nn.max_pool(relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
        pool2 = tf.reshape(pool2, [-1, 7 * 7 * 4])
    # hidden dense layer
    with tf.variable_scope('layer5-dense'):
        dense1_w = tf.get_variable('weight', [7 * 7 * 4, 64],
                                   initializer=tf.truncated_normal_initializer(stddev=0.1))
        if regularizer:
            tf.add_to_collection('losses', regularizer(dense1_w))
        dense1_b = tf.get_variable('bias', [64], initializer=tf.constant_initializer(0.1))
        dense1 = tf.nn.relu(tf.matmul(pool2, dense1_w) + dense1_b)
        if is_training:
            dense1 = tf.nn.dropout(dense1, keep_prob)
    # last dense layer, ten type classification
    with tf.variable_scope('layer6-dense'):
        dense2_w = tf.get_variable('weight', [64, 10],
                                   initializer=tf.truncated_normal_initializer(stddev=0.1))
        if regularizer:
            tf.add_to_collection('losses', regularizer(dense2_w))
        dense2_b = tf.get_variable('bias', [10], initializer=tf.constant_initializer(0.1))
        # BUG FIX: the output layer must stay linear — the original applied
        # ReLU (clamping negative logits to zero) and dropout to the logits,
        # which corrupts the class scores fed to softmax cross entropy.
        dense2 = tf.add(tf.matmul(dense1, dense2_w), dense2_b)
    return dense2
# define placeholders
X = tf.placeholder(tf.float32, shape=[None, 28, 28, 1], name='x')
Y = tf.placeholder(tf.int32, shape=[None], name='y')
keep_prob = tf.placeholder(tf.float32, name='keep_prob')
l2regularizer = l2_regularizer(0.001)
logits = img_model(X, True, l2regularizer)
logits = tf.multiply(logits, 1.0, name='logits_eval')
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=Y)
train_optimization = tf.train.AdagradOptimizer(learning_rate=0.01).minimize(loss)
correct_prediction = tf.equal(tf.cast(tf.argmax(logits, axis=1), tf.int32), Y)
accu = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
epoch_num = 5000
batch_size = 512
# batch data generator
def batch_gen(inputs, targets, batch_sz):
    """Yield shuffled (x, y) mini-batches of exactly `batch_sz` items.

    Only complete batches are produced; a trailing remainder smaller than
    `batch_sz` is dropped.
    """
    assert len(inputs) == len(targets), 'Error: inputs size is not correclated with the targets'
    inputs = np.asarray(inputs)
    targets = np.asarray(targets)
    indices = np.arange(len(inputs))
    np.random.shuffle(indices)
    # BUG FIX: the stop value was `len(inputs) - batch_sz`, which silently
    # dropped one *complete* batch whenever len(inputs) is an exact multiple
    # of batch_sz; `+ 1` keeps every full batch.
    for start_index in range(0, len(inputs) - batch_sz + 1, batch_sz):
        excerpt = indices[start_index:start_index + batch_sz]
        # fancy indexing replaces the original per-element list comprehensions
        yield inputs[excerpt], targets[excerpt]
# generate batches for prediction
def batch_gen_predict(inputs, batch_sz):
    """Yield consecutive slices of `inputs` of at most `batch_sz` items.

    Unlike batch_gen, the order is preserved and the final partial batch is
    kept, so predictions line up 1:1 with the test rows.
    """
    # BUG FIX: the slice upper bound used the *global* `batch_size` instead of
    # the `batch_sz` parameter; plain slicing already clamps at the end, so no
    # explicit min() is needed.
    for start_index in range(0, len(inputs), batch_sz):
        yield inputs[start_index:start_index + batch_sz]
# 对图像将10以上的置为255,10以下的置为0
# binarize an image: pixels above 10 become 255, pixels at or below 10 become 0
def binary_bw(in_np) -> np.ndarray:
    """Threshold a 2-D image in place to {0, 255} and return it.

    Pixels <= 10 are set to 0, all others to 255. Mutates `in_np`.
    """
    assert len(np.shape(in_np)) == 2, 'the input ndarray must in two dimension'
    # vectorized masking replaces the original O(m*n) per-pixel double loop;
    # compute the mask once so the first assignment can't affect the second
    dark = in_np <= 10
    in_np[dark] = 0
    in_np[~dark] = 255
    return in_np
if __name__ == '__main__':
    # saver persists/restores every variable in the default graph
    saver = tf.train.Saver()
    # train from scratch only when the ckpt/ directory holds no checkpoint
    if len(listdir('ckpt')) == 0:
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # # make the input image differ more dramatic by pixels
            # for i in range(np.shape(trainx)[0]):
            #     trainx[i] = binary_bw(trainx[i])
            # for i in range(np.shape(testx)[0]):
            #     testx[i] = binary_bw(testx[i])
            for epoch in range(epoch_num):
                print('epoch #{}'.format(epoch), end='')
                train_loss, train_acc, n_batch = 0, 0, 0
                for x_train, y_train in batch_gen(trainx, trainy, batch_size):
                    # add a trailing channel axis: (batch, 28, 28) -> (batch, 28, 28, 1)
                    x_train = np.expand_dims(x_train, axis=-1)
                    # flatten labels to the 1-D vector the Y placeholder expects
                    y_train = np.reshape(y_train, (-1))
                    # NOTE(review): keep_prob is fed 1.0 here, which makes the
                    # dropout inside img_model a no-op during training —
                    # presumably a value < 1.0 was intended; confirm.
                    _, err, acc = sess.run([train_optimization, loss, accu],
                                           feed_dict={X: x_train, Y: y_train, keep_prob: 1.0})
                    train_loss += err
                    train_acc += acc
                    n_batch += 1
                # report averages over the batches seen this epoch
                print(' train loss=%f, train accu=%f' % (np.sum(train_loss) * 1.0 / n_batch,
                                                         np.sum(train_acc) * 1.0 / n_batch))
                # validation
                val_loss, val_accu, n_batch = 0, 0, 0
                for x_val, y_val in batch_gen(testx, testy, batch_size):
                    x_val = np.expand_dims(x_val, axis=-1)
                    y_val = np.reshape(y_val, (-1))
                    err_val, accu_val = sess.run([loss, accu], feed_dict={X: x_val, Y: y_val, keep_prob: 1.0})
                    val_loss += err_val
                    val_accu += accu_val
                    n_batch += 1
                print(' validation loss=%f, accu=%f' % (np.sum(val_loss) * 1.0 / n_batch,
                                                        np.sum(val_accu) * 1.0 / n_batch))
            # save the trained model once all epochs are done
            saver.save(sess, path.join('ckpt', 'model.ckpt'))
    else:
        # a checkpoint exists: restore weights and predict the Kaggle test set
        with tf.Session() as sess:
            saver.restore(sess, path.join('ckpt', 'model.ckpt'))
            # for i in range(np.shape(test_df)[0]):
            #     test_df[i] = binary_bw(test_df[i])
            # add the channel axis expected by the X placeholder
            test_df = np.expand_dims(test_df, axis=-1)
            ans = []
            for pred_batch in batch_gen_predict(test_df, batch_size):
                # predict process ,keep prob should be set 1
                predict_res = sess.run([logits], feed_dict={X: pred_batch, keep_prob: 1.0})
                # sess.run returns a 1-element list, so argmax is taken on axis=2
                ans.extend(list(np.reshape(np.argmax(predict_res, axis=2), (-1))))
            # save predict result to csv file
            # Kaggle's ImageId column is 1-based, hence the shifted index
            result_df = pd.Series(ans, index=np.arange(start=1, stop=len(ans) + 1), name='Label')
            result_df = pd.concat([pd.DataFrame(), result_df], axis=1)
            result_df.to_csv('result.csv')