kaggle-手写字体识别

In[1]:

import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt

In[79]:

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D,  ZeroPadding2D, Input, BatchNormalization
from keras.layers import Activation
from keras.optimizers import Adam
from keras.models import Model
import tensorflow as tf

In[17]:

# Load the competition data as DataFrames. Judging by how the rows are sliced
# later in this file, each train row is presumably [label, pixel0..pixel783]
# and each test row is pixels only -- confirm against the actual CSV headers.
# NOTE(review): the paths are absolute and point at the filesystem root.
img_train = pd.read_csv('/train.csv')
img_test = pd.read_csv('/test.csv')

In[18]:

将原始数据转换为图片格式,以便输入CNN;图片张量的形状为 (m, n_H, n_W, n_C),即(样本数, 高, 宽, 通道数)。

# Convert the loaded tables to plain NumPy arrays (one row per image).
img_train = np.array(img_train)
img_test = np.array(img_test)

# Training rows: column 0 is the label, the remaining 784 columns are pixels.
# Reshape every row at once into the (m, n_H, n_W, n_C) layout used by the
# convolutional layers instead of looping over rows in Python. The float cast
# is required so the later in-place division by 255 is valid.
trainSet = img_train[:, 1:].reshape(img_train.shape[0], 28, 28, 1).astype(np.float64)
# Labels are class indices, so keep them as integers.
trainLabel = img_train[:, 0].astype(int)

# Test rows hold pixels only (no label column).
testSet = img_test.reshape(img_test.shape[0], 28, 28, 1).astype(np.float64)
# Placeholder only: nothing in this file fills or reads testLabel; it is kept
# so that any code relying on the name still finds it.
testLabel = np.zeros(img_test.shape[0])

In[19]:

# Normalize pixel values in place by dividing by 255
# (assumes 8-bit intensities in [0, 255] -- confirm against the data).
trainSet /= 255
testSet /= 255
# Cast labels to integers so they can be used as indices for one-hot encoding;
# a vectorized cast replaces the element-by-element Python conversion.
trainLabel = trainLabel.astype(int)
# NOTE: with the categorical_crossentropy loss used below, Keras expects
# one-hot labels of shape (num_samples, num_classes) -- here (m, 10), since the
# network's output layer has 10 units.

In[23]:

def one_to_hot(Y, C):
    """Return a one-hot encoding of the integer class labels in ``Y``.

    ``Y`` is flattened first. The result has shape ``(C, M)`` where ``M`` is
    the number of labels: column ``j`` is all zeros except for a 1.0 in row
    ``Y[j]``. Callers that need one row per sample must transpose the result.
    """
    class_ids = Y.reshape(-1)
    identity = np.eye(C)
    # The identity matrix is symmetric, so picking columns directly yields the
    # same values as picking rows and transposing afterwards.
    return identity[:, class_ids]

# one_to_hot returns (classes, samples); transpose so each row is one sample's
# 10-way one-hot vector.
trainLabel = one_to_hot(trainLabel, 10).T

In[183]:

具体的CNN设计:大概跑到 loss = 0.0078 的时候,用此时的模型做预测,最后的结果是0.9917,排名400+。可能需要ResNet等大型网络才能跑出更好的
结果吧,奈何电脑没有gpu,太慢了,就等买电脑再跑吧。
注意:keras模型的好处是可以随时暂停训练,通过验证集(validation_split)观察模型的验证结果,如果比较满意就可以停下来做预测,之后还可以接着
训练模型。因为只要不重新构建模型,之前训练得到的权重就不会消失,这点keras你值得拥有。
具体的模型结构是:conv-pool-conv-pool-conv-pool-fc(中间使用的激活函数全部是relu)

ps:有训练出准确度为1的大佬,能不能分享一波模型。。。
# Network input: one single-channel 28x28 image per sample.
input_shape = (28, 28, 1)
X_input = Input(input_shape)

# Three convolutional stages that differ only in filter count and layer names:
# zero-pad by 1 -> 3x3 conv (stride 1) -> ReLU -> 2x2 max-pool.
# (A BatchNormalization(axis=3) layer could optionally be inserted between the
# conv and the ReLU; each such layer would need its own unique name.)
conv_stages = (
    (16, 'conv0', 'max_pool0'),
    (32, 'conv1', 'max_pool1'),
    (64, 'conv2', 'max_pool2'),
)
X = X_input
for n_filters, conv_name, pool_name in conv_stages:
    X = ZeroPadding2D((1, 1))(X)
    X = Conv2D(n_filters, (3, 3), strides=(1, 1), name=conv_name)(X)
    X = Activation('relu')(X)
    X = MaxPooling2D((2, 2), name=pool_name)(X)

# Classifier head: flatten -> 256-unit ReLU layer -> dropout -> 10-way softmax.
X = Flatten()(X)
X = Dense(256, activation='relu')(X)
X = Dropout(0.5)(X)
X = Dense(10, activation='softmax')(X)

model = Model(inputs=X_input, outputs=X, name='Digit_Sign')

In[184]:

model.summary() # print an overview of the model's layers

In[139]:

# Adam optimizer with every hyperparameter spelled out explicitly.
# decay=0.00 leaves learning-rate decay switched off.
adam = Adam(
    lr=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    decay=0.00,
)
这里的decay(学习率衰减)参数可以尝试调整一下

In[176]:

# Configure training: cross-entropy over the one-hot labels, the Adam optimizer
# defined earlier in this file, and accuracy as the reported metric.
model.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Train for 3 epochs with mini-batches of 256, holding out 20% of the training
# data for validation.
model.fit(
    trainSet,
    trainLabel,
    epochs=3,
    batch_size=256,
    validation_split=0.2,
)

In[177]:

# Evaluate on the full training set (this includes the samples that fit() held
# out for validation). With the model compiled with metrics=['accuracy'], the
# result is presumably [loss, accuracy] -- confirm against the Keras version in use.
score = model.evaluate(trainSet, trainLabel, batch_size=256)

In[178]:

# Report the second entry of the evaluation result (presumably the accuracy,
# given metrics=['accuracy'] in compile).
print('The train score is %f' %score[1])

In[179]:

# Run the model on the test images. Since the final layer is a 10-unit softmax,
# each row of pred is presumably a vector of 10 class probabilities.
pred = model.predict(testSet, batch_size = 256)  # prediction results

In[180]:

# Pick, for every test sample, the index of the highest softmax score; that
# index is the predicted digit.
pred_X = pred.argmax(axis=1)

因为上面预测的结果是softmax函数输出的结果,不是我们想要的最终结果,用numpy的argmax函数得到最大的分数对应的index就可以了。

In[181]:

将结果保存到csv文件中,提交该csv文件就可以了。。。哈

# Build the submission table: ImageId is 1-based with exactly one entry per
# prediction. Deriving the range from len(pred_X) instead of hard-coding 28001
# keeps the two columns the same length for any test-set size.
result = DataFrame({'ImageId': np.arange(1, len(pred_X) + 1), 'Label': pred_X})
# Write without the DataFrame index so the file has only the two columns above.
result.to_csv("/predict.csv", index=False)

你可能感兴趣的:(数据挖掘)