kaggle Digit Recognizer 数字识别

https://www.kaggle.com/c/digit-recognizer

首先看一下提供的训练文件train.csv

import pandas as pd

# Load the Kaggle training set into a DataFrame (hard-coded local path).
trainingFile = pd.read_csv('C:/Users/Administrator/Desktop/train.csv')

# Preview the first rows to inspect the column layout (label + pixel columns).
print(trainingFile.head())
'''
   label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
0      1       0       0       0       0       0       0       0       0   
1      0       0       0       0       0       0       0       0       0   
2      1       0       0       0       0       0       0       0       0   
3      4       0       0       0       0       0       0       0       0   
4      0       0       0       0       0       0       0       0       0   

   pixel8    ...     pixel774  pixel775  pixel776  pixel777  pixel778  \
0       0    ...            0         0         0         0         0   
1       0    ...            0         0         0         0         0   
2       0    ...            0         0         0         0         0   
3       0    ...            0         0         0         0         0   
4       0    ...            0         0         0         0         0   

   pixel779  pixel780  pixel781  pixel782  pixel783  
0         0         0         0         0         0  
1         0         0         0         0         0  
2         0         0         0         0         0  
3         0         0         0         0         0  
4         0         0         0         0         0  

[5 rows x 785 columns]
'''
# Report how many rows (samples) the training DataFrame contains.
print(len(trainingFile))
'''
42000
'''
根据比赛的数据说明可知:label 表示图片对应的数字,pixel0~pixel783 是每张图片的 784 个像素值,训练集共 42000 条数据。


首先尝试用kNN算法

点击打开kNN.py

先用前 41900 条数据作为训练集,最后 100 条数据作为测试集,看看正确率:

import numpy as np
import pandas as pd

import kNN


# 加载数据
def loadDataSet(csvPath='C:/Users/Administrator/Desktop/train.csv', numTrain=41900):
    """Load the labelled CSV and split it into a training and a hold-out part.

    The file is parsed once. Every pixel value greater than 0 is replaced
    by 1, so each image becomes a binary mask.

    Args:
        csvPath: Path (or readable file-like object) of a CSV that has a
            ``label`` column followed by the pixel columns.
        numTrain: Number of leading rows used for training; all remaining
            rows form the hold-out test set.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)`` of numpy arrays.
    """
    # Training part
    print('获取训练集...')

    dataFile = pd.read_csv(csvPath)
    labels = np.array(dataFile['label'])
    # drop(columns=...) replaces the positional axis argument, which pandas
    # 2.0 no longer accepts. np.array() copies, so the in-place binarisation
    # below never writes into the DataFrame.
    pixels = np.array(dataFile.drop(columns='label'))
    pixels[pixels > 0] = 1

    train_x = pixels[:numTrain]
    train_y = labels[:numTrain]

    # Hold-out part
    print('获取测试集...')

    test_x = pixels[numTrain:]
    test_y = labels[numTrain:]

    return train_x, train_y, test_x, test_y

# 手写数字测试
def testHandWritingClass():
    """Measure kNN accuracy (k=3) on the hold-out split and print it.

    Loads the data through ``loadDataSet()``, classifies every hold-out
    sample with ``kNN.kNNClassify`` and prints the share of predictions
    that match the true label as a percentage.
    """
    # Load data
    print('加载数据...')

    train_x, train_y, test_x, test_y = loadDataSet()

    # Training: nothing is fitted here; the training arrays are handed
    # directly to kNN.kNNClassify for every query below.
    print('训练中...')

    # Testing
    print('测试中...')

    numTestSamples = len(test_x)
    matchCount = sum(
        1
        for sample, label in zip(test_x, test_y)
        if kNN.kNNClassify(sample, train_x, train_y, 3) == label
    )

    # Guard against an empty hold-out set instead of dividing by zero.
    accuracy = float(matchCount) / numTestSamples if numTestSamples else 0.0

    # Report
    print('输出结果...')

    print('分类准确率为: %.2f%%' % (accuracy * 100))

# Run the hold-out accuracy experiment only when executed as a script.
if __name__ == '__main__':
    testHandWritingClass()
输出结果:
加载数据...
获取训练集...
获取测试集...
训练中...
测试中...
输出结果...
分类准确率为: 99.00%

正确率看起来还不错。接下来直接用整个 train.csv 作为训练集,预测 test.csv 中的每一张图片,并按照 sample_submission.csv 的格式把结果保存到 result.csv:
import numpy as np
import pandas as pd

import kNN


# 加载数据
def loadDataSet(trainPath='C:/Users/Administrator/Desktop/train.csv',
                testPath='C:/Users/Administrator/Desktop/test.csv'):
    """Load the full training set and the unlabelled test set.

    Every pixel value greater than 0 is replaced by 1 in both sets, so
    each image becomes a binary mask.

    Args:
        trainPath: Path (or readable file-like object) of the labelled CSV
            with a ``label`` column followed by the pixel columns.
        testPath: Path (or readable file-like object) of the CSV that
            holds only pixel columns.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)``. ``test_y`` is an
        empty list because the test file carries no labels.
    """
    # Training set
    print('获取训练集...')

    trainingFile = pd.read_csv(trainPath)
    # drop(columns=...) replaces the positional axis argument, which pandas
    # 2.0 no longer accepts. np.array() copies, so the in-place binarisation
    # never writes into the DataFrame.
    train_x = np.array(trainingFile.drop(columns='label'))
    train_x[train_x > 0] = 1
    train_y = np.array(trainingFile['label'])

    # Test set
    print('获取测试集...')

    testingFile = pd.read_csv(testPath)
    test_x = np.array(testingFile)
    test_x[test_x > 0] = 1
    test_y = []

    return train_x, train_y, test_x, test_y

# 手写数字测试
def testHandWritingClass():
    """Classify every test image with kNN (k=4) and write the submission CSV.

    Loads the data through ``loadDataSet()``, predicts each test row with
    ``kNN.kNNClassify``, prints progress every 100 rows and stores the
    ``ImageId``/``Label`` pairs (ids start at 1) in result.csv.
    """
    # Load data
    print('加载数据...')
    train_x, train_y, test_x, test_y = loadDataSet()

    # Training: nothing is fitted; the training arrays are passed to the
    # classifier on every call below.
    print('训练中...')

    # Testing
    print('测试中...')
    numTestSamples = len(test_x)
    result = []
    for index, sample in enumerate(test_x):
        label = kNN.kNNClassify(sample, train_x, train_y, 4)
        result.append([index + 1, label])
        if index % 100 == 0:
            print('进度:', index, '/', numTestSamples)

    # Write the submission file
    print('输出结果...')
    submission = pd.DataFrame(result, columns=['ImageId', 'Label'])
    submission.to_csv('C:/Users/Administrator/Desktop/result.csv', index=False)

# Generate the kNN submission file only when executed as a script.
if __name__ == '__main__':
    testHandWritingClass()
最后提交 result.csv,得分为 0.96543。

使用scikit-learn库的kNN
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier


# 加载数据
def loadDataSet(trainPath='C:/Users/Administrator/Desktop/train.csv',
                testPath='C:/Users/Administrator/Desktop/test.csv'):
    """Load the full training set and the unlabelled test set, binarised.

    Both pixel matrices are passed through scikit-learn's ``Binarizer``
    with its default settings.

    Args:
        trainPath: Path (or readable file-like object) of the labelled CSV
            with a ``label`` column followed by the pixel columns.
        testPath: Path (or readable file-like object) of the CSV that
            holds only pixel columns.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)``. ``test_y`` is an
        empty list because the test file carries no labels.
    """
    # Training set
    print('获取训练集...')

    trainingFile = pd.read_csv(trainPath)
    # drop(columns=...) replaces the positional axis argument, which pandas
    # 2.0 no longer accepts.
    train_x = np.array(trainingFile.drop(columns='label'))
    # Calling fit() alone leaves the data untouched; the binarised array is
    # the return value of fit_transform() and has to be kept.
    train_x = preprocessing.Binarizer().fit_transform(train_x)
    train_y = np.array(trainingFile['label'])

    # Test set
    print('获取测试集...')

    testingFile = pd.read_csv(testPath)
    test_x = preprocessing.Binarizer().fit_transform(np.array(testingFile))
    test_y = []

    return train_x, train_y, test_x, test_y

# 手写数字测试
def testHandWritingClass():
    """Fit scikit-learn's kNN classifier and write the submission CSV.

    Loads the data through ``loadDataSet()``, fits a default
    ``KNeighborsClassifier``, predicts all test rows at once and stores
    the ``ImageId``/``Label`` pairs (ids start at 1) in result.csv.
    """
    # Load data
    print('加载数据...')
    train_x, train_y, test_x, test_y = loadDataSet()

    # Training
    print('训练中...')
    classifier = KNeighborsClassifier()
    classifier.fit(train_x, train_y)

    # Testing
    print('测试中...')
    labels = classifier.predict(test_x)

    # Write the submission file
    print('输出结果...')
    rows = [(imageId, label) for imageId, label in enumerate(labels, start=1)]
    submission = pd.DataFrame(rows, columns=['ImageId', 'Label'])
    submission.to_csv('C:/Users/Administrator/Desktop/result.csv', index=False)

# Generate the scikit-learn submission file only when executed as a script.
if __name__ == '__main__':
    testHandWritingClass()
最后提交 result.csv,得分为 0.96800。

你可能感兴趣的:(Python,机器学习,kaggle)