Kaggle笔记:DigitRecognition 数字识别 kNN解法

import numpy as np
import pandas as pd
import operator
import csv


def classify(inX, trainSet, labels, k):
    """Predict the label of one sample by majority vote of its k nearest neighbours.

    Distances are Euclidean and computed in float64, so integer inputs
    (including unsigned pixel data such as uint8) cannot wrap around.

    Args:
        inX: The single sample to classify (one feature vector).
        trainSet: Training samples, one row per sample, with the same number
            of features as ``inX``.
        labels: Label of each training row; ``labels[i]`` belongs to
            ``trainSet[i]``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of training rows are clamped to that number.

    Returns:
        The most frequent label among the k nearest training rows. When
        several labels tie, the one that received its first vote from the
        closest neighbour wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``trainSet`` has no rows.
    """
    trainSet = np.asarray(trainSet)
    trainSetSize = trainSet.shape[0]
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    if trainSetSize == 0:
        raise ValueError("trainSet must contain at least one row")

    # Broadcasting replaces np.tile; dtype=float64 forces the subtraction to
    # run in floating point so unsigned inputs do not wrap on negative values.
    diff = np.subtract(trainSet, inX, dtype=np.float64)
    # Square in place to avoid allocating another full-size temporary.
    sqDistance = np.square(diff, out=diff).sum(axis=1)

    # The square root is skipped: it is monotonic, so ranking by squared
    # distance selects the same neighbours. A stable sort makes the order of
    # equally distant rows deterministic (lower row index first).
    nearest = np.argsort(sqDistance, kind="stable")[:min(k, trainSetSize)]

    classCount = {}
    for idx in nearest:
        voteLabel = labels[idx]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    # max() returns the first key with the highest count, and dicts keep
    # insertion order, so ties go to the label seen first (closest neighbour).
    return max(classCount, key=classCount.get)


def DigitRecognition(trainPath="train.csv", testPath="test.csv",
                     outputPath="submission.csv", k=3):
    """Classify every row of the test CSV with kNN and write a submission CSV.

    Args:
        trainPath: CSV file with the training data. The first column is used
            as the label and all remaining columns as features.
        testPath: CSV file with the samples to classify; every column is
            passed to the classifier as a feature.
        outputPath: Destination CSV. It gets two columns, ``ImageId``
            (1-based row number of the test sample) and ``Label`` (the
            predicted label).
        k: Number of nearest neighbours passed to ``classify``.
    """
    # Load the training data as a plain array and split labels from features
    # so the features line up column-for-column with the test rows.
    trainData = pd.read_csv(trainPath).values
    trainSetLabels = trainData[:, 0]
    trainSet = trainData[:, 1:]
    testSet = pd.read_csv(testPath).values

    # Classify the test rows one by one, reporting progress per row.
    result = []
    for i, sample in enumerate(testSet):
        print("开始判断第%d条数据"%i)
        result.append(classify(sample, trainSet, trainSetLabels, k))

    # ImageId is 1-based, matching the order of the rows in the test file.
    predictions = pd.DataFrame({
        "ImageId": list(range(1, len(result) + 1)),
        "Label": result,
    })
    predictions.to_csv(outputPath, index=False)

# Script entry point: run the whole read / classify / write pipeline.
# NOTE(review): this call is unguarded, so it also runs when the file is
# imported — consider an `if __name__ == "__main__":` guard.
DigitRecognition()
使用上述 kNN 解法(k=3)时,识别的准确率在 97% 左右。

你可能感兴趣的:(Kaggle)