TensorFlow(6)kaggle Digit Recognizer实战

下载、读取并展示数据

下载

  • 分别下载train.csv和test.csv文件
  • 链接地址

读取

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# Load the Kaggle Digit Recognizer training table from the working directory.
data = pd.read_csv("train.csv")
data.head(3)  # notebook-style peek at the first three rows

# Feature columns: every column after the first one.
dataset = data.iloc[:, 1:]
dataset.head()

# Label column: the first column of the table.
label = data.iloc[:, 0]
label.head()


# Convert the pandas objects to NumPy arrays for the plotting and
# TensorFlow code that follows.
# `as_matrix()` was deprecated in pandas 0.23 and removed in pandas 1.0;
# `.values` returns the same ndarray and exists on every pandas release.
trainset = dataset.values
labelset = label.values


# Pick 5 random rows and display each one as a 28x28 grayscale image.
# np.random.randint samples with replacement, so a row may appear twice.
nsample = 5
randidx = np.random.randint(trainset.shape[0], size=nsample)
for i in randidx:
    # One flat feature row becomes a 28-by-28 pixel grid.
    curr_img = trainset[i, :].reshape((28, 28))
    curr_label = labelset[i]
    # The same caption is used for the figure title and the console output.
    caption = str(i) + "th Training Data" + "Label is " + str(curr_label)
    plt.matshow(curr_img, cmap=plt.get_cmap('gray'))
    plt.title(caption)
    print(caption)
    plt.show()

图片展示

TensorFlow(6)kaggle Digit Recognizer实战_第1张图片

数据预处理

from sklearn import preprocessing
# Label preprocessing: convert the integer digit labels to one-hot rows.
ohe = preprocessing.OneHotEncoder()
# Fit on all ten digits explicitly so the encoding always has 10 columns,
# whatever digits happen to occur in the data.
ohe.fit(np.arange(10).reshape(-1, 1))
# The encoder needs a 2-D column vector. Using -1 lets NumPy infer the row
# count instead of hard-coding 42000, which failed for any other data size.
# NOTE: run this once per load; labelset is replaced by its one-hot form.
labelset = labelset.reshape(-1, 1)
labelset = ohe.transform(labelset).toarray()  # sparse result -> dense array


label.head()
labelset[0:4, :]

# Feature preprocessing: rescale to the [0, 1] range.
# MinMaxScaler works column by column, using each column's own min and max.
min_max_scaler = preprocessing.MinMaxScaler()
trainset = min_max_scaler.fit_transform(trainset)
trainset[1, :]

数据拟合

采用逻辑回归模型(多分类的 Softmax 回归)

# Inputs: rows of 784 features and rows of 10 one-hot label entries.
x = tf.placeholder("float", [None, 784])
y = tf.placeholder("float", [None, 10])  # None leaves the batch size open
# Trainable weights and bias, both initialised to zero.
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
# LOGISTIC REGRESSION MODEL
logits = tf.matmul(x, W) + b
actv = tf.nn.softmax(logits)  # class probabilities, used for prediction
# COST FUNCTION
# Mean cross-entropy computed directly from the logits. The hand-written
# form -sum(y * log(softmax(...))) yields NaN as soon as a probability
# underflows to 0 (0 * log(0)); the fused op is numerically stable.
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))
# OPTIMIZER
learning_rate = 0.01
optm = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# PREDICTION
# True where the most probable class equals the labelled class.
pred = tf.equal(tf.argmax(actv, 1), tf.argmax(y, 1))
# ACCURACY
accr = tf.reduce_mean(tf.cast(pred, "float"))
# INITIALIZER
init = tf.global_variables_initializer()

# Training schedule.
training_epochs = 50
batch_size      = 100
display_step    = 5
# SESSION
# The session is left open after training.
sess = tf.Session()
sess.run(init)
# MINI-BATCH LEARNING
# Number of full batches per epoch; rows beyond num_batch * batch_size are
# never visited. Batches are taken in stored order, without shuffling.
num_batch = trainset.shape[0] // batch_size
for epoch in range(training_epochs):
    avg_cost = 0.
    for i in range(num_batch):
        start = batch_size * i
        stop = start + batch_size
        batch_xs = trainset[start:stop, :]
        batch_ys = labelset[start:stop]
        feeds = {x: batch_xs, y: batch_ys}
        # One gradient step, then the cost on the same batch after the step.
        sess.run(optm, feed_dict=feeds)
        avg_cost += sess.run(cost, feed_dict=feeds) / num_batch
    # DISPLAY
    if epoch % display_step == 0:
        # Accuracy is measured on the last mini-batch of the epoch only,
        # not on the whole training set.
        feeds_train = {x: batch_xs, y: batch_ys}
        train_acc = sess.run(accr, feed_dict=feeds_train)
        print("Epoch: %03d/%03d cost: %.9f train_acc: %.3f "
              % (epoch, training_epochs, avg_cost, train_acc))
print("DONE")

将数据集与MNIST数据集合并

# Stack the Kaggle data on top of the MNIST train and test splits.
# NOTE(review): trainimg / trainlabel / testimg / testlabel are not defined in
# this snippet; presumably they are the MNIST images and one-hot labels
# (cf. mnist.train / mnist.test) and must be loaded first -- confirm.
trainbig = np.vstack((trainset, trainimg, testimg))
labelbig = np.vstack((labelset, trainlabel, testlabel))
trainbig.shape
# (107000, 784) -- shape reported by the original post
labelbig.shape
# (107000, 10) -- the post shows "(10700,10)", presumably a typo, since the
# row count should match trainbig; verify against the loaded MNIST arrays
np.save("trainbig.npy", trainbig)  # persist the merged arrays to disk
np.save("labelbig.npy", labelbig)

你可能感兴趣的:(TensorFlow(6)kaggle Digit Recognizer实战)