https://www.kaggle.com/c/digit-recognizer
首先看一下提供的训练文件train.csv
import pandas as pd
# Load the competition's training CSV and show its first five rows
# (five is the default row count of DataFrame.head).
trainingFile = pd.read_csv(
    'C:/Users/Administrator/Desktop/train.csv',
)
print(trainingFile.head(n=5))
'''
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 \
0 1 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0
3 4 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0
pixel8 ... pixel774 pixel775 pixel776 pixel777 pixel778 \
0 0 ... 0 0 0 0 0
1 0 ... 0 0 0 0 0
2 0 ... 0 0 0 0 0
3 0 ... 0 0 0 0 0
4 0 ... 0 0 0 0 0
pixel779 pixel780 pixel781 pixel782 pixel783
0 0 0 0 0 0
1 0 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
4 0 0 0 0 0
[5 rows x 785 columns]
'''
# Number of rows (samples) in the training file.
print(trainingFile.shape[0])
'''
42000
'''
根据比赛页面的数据说明可知：label 表示该图片对应的数字，pixel0～pixel783 是图片的 784 个像素值，训练文件共 42000 条数据。
首先尝试用kNN算法
点击打开kNN.py
先用前 41900 条数据作为训练集，后 100 条作为测试集，看看准确率。
import numpy as np
import pandas as pd
import kNN
# Load the data
def loadDataSet(train_path='C:/Users/Administrator/Desktop/train.csv',
                n_train=41900):
    """Split the labelled training CSV into a training part and a hold-out part.

    The CSV must contain a ``label`` column; every other column is treated
    as a pixel feature. Pixel values greater than 0 are set to 1.

    Args:
        train_path: Path (or file-like object) accepted by ``pd.read_csv``.
        n_train: Number of leading rows used for training; the remaining
            rows become the hold-out test set.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)`` of numpy arrays.
    """
    # Get the training set
    print('获取训练集...')
    # Parse the file once; both splits come from the same table.
    data = pd.read_csv(train_path)
    labels = np.array(data['label'])
    # `columns=` instead of a positional axis: pandas >= 2.0 rejects
    # drop('label', 1). np.array() copies, so the in-place binarization
    # below never touches the DataFrame.
    pixels = np.array(data.drop(columns='label'))
    pixels[pixels > 0] = 1
    train_x = pixels[:n_train]
    train_y = labels[:n_train]
    # Get the test set
    print('获取测试集...')
    test_x = pixels[n_train:]
    test_y = labels[n_train:]
    return train_x, train_y, test_x, test_y
# Handwritten-digit test
def testHandWritingClass():
    """Classify the hold-out samples with kNN and print the accuracy.

    Each test sample is passed to ``kNN.kNNClassify`` together with the
    whole training set and the constant 3 (presumably the number of
    neighbours -- confirm against kNN.py). The share of predictions equal
    to the true label is printed as a percentage.
    """
    # Load the data
    print('加载数据...')
    train_x, train_y, test_x, test_y = loadDataSet()

    # Training: nothing is done here, only the progress message is printed.
    print('训练中...')

    # Testing
    print('测试中...')
    total = len(test_x)
    hits = 0
    for sample, expected in zip(test_x, test_y):
        guess = kNN.kNNClassify(sample, train_x, train_y, 3)
        if guess == expected:
            hits += 1
    accuracy = float(hits) / total

    # Report the result
    print('输出结果...')
    print('分类准确率为: %.2f%%' % (accuracy * 100))


if __name__ == '__main__':
    testHandWritingClass()
输出结果:
加载数据...
获取训练集...
获取测试集...
训练中...
测试中...
输出结果...
分类准确率为: 99.00%
import numpy as np
import pandas as pd
import kNN
# Load the data
def loadDataSet(train_path='C:/Users/Administrator/Desktop/train.csv',
                test_path='C:/Users/Administrator/Desktop/test.csv'):
    """Load the labelled training CSV and the unlabelled test CSV.

    The training CSV must contain a ``label`` column; all of its other
    columns, and every column of the test CSV, are treated as pixel
    features. Pixel values greater than 0 are set to 1 in both sets.

    Args:
        train_path: Path (or file-like object) of the training CSV.
        test_path: Path (or file-like object) of the test CSV.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)``. The first three are
        numpy arrays; ``test_y`` is always an empty list because the test
        file carries no labels.
    """
    # Get the training set
    print('获取训练集...')
    trainingFile = pd.read_csv(train_path)
    # `columns=` instead of a positional axis: pandas >= 2.0 rejects
    # drop('label', 1). np.array() copies, so in-place edits are safe.
    train_x = np.array(trainingFile.drop(columns='label'))
    train_x[train_x > 0] = 1
    train_y = np.array(trainingFile['label'])
    # Get the test set
    print('获取测试集...')
    test_x = np.array(pd.read_csv(test_path))
    test_x[test_x > 0] = 1
    test_y = []
    return train_x, train_y, test_x, test_y
# Handwritten-digit test
def testHandWritingClass():
    """Predict a label for every test sample with kNN and write result.csv.

    Each test sample is passed to ``kNN.kNNClassify`` together with the
    whole training set and the constant 4 (presumably the number of
    neighbours -- confirm against kNN.py). The predictions are saved with
    1-based ``ImageId`` values and a ``Label`` column.
    """
    # Load the data; the fourth return value (test labels) is not used here.
    print('加载数据...')
    train_x, train_y, test_x, _ = loadDataSet()

    # Training: nothing is done here, only the progress message is printed.
    print('训练中...')

    # Testing
    print('测试中...')
    total = len(test_x)
    rows = []
    for done, sample in enumerate(test_x):
        label = kNN.kNNClassify(sample, train_x, train_y, 4)
        rows.append([done + 1, label])
        # Progress message every 100 samples.
        if done % 100 == 0:
            print('进度:', done, '/', total)

    # Write the result
    print('输出结果...')
    submission = pd.DataFrame(rows, columns=['ImageId', 'Label'])
    submission.to_csv('C:/Users/Administrator/Desktop/result.csv', index=False)


if __name__ == '__main__':
    testHandWritingClass()
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
# Load the data
def loadDataSet(train_path='C:/Users/Administrator/Desktop/train.csv',
                test_path='C:/Users/Administrator/Desktop/test.csv'):
    """Load and binarize the labelled training CSV and the unlabelled test CSV.

    The training CSV must contain a ``label`` column; all of its other
    columns, and every column of the test CSV, are treated as pixel
    features and passed through ``preprocessing.Binarizer`` with its
    default settings.

    Args:
        train_path: Path (or file-like object) of the training CSV.
        test_path: Path (or file-like object) of the test CSV.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)``. ``test_y`` is always
        an empty list because the test file carries no labels.
    """
    # Get the training set
    print('获取训练集...')
    trainingFile = pd.read_csv(train_path)
    # `columns=` instead of a positional axis: pandas >= 2.0 rejects
    # drop('label', 1).
    train_x = np.array(trainingFile.drop(columns='label'))
    # Keep the transformed array: calling only .fit() and discarding the
    # result leaves the pixel values unchanged.
    train_x = preprocessing.Binarizer().fit_transform(train_x)
    train_y = np.array(trainingFile['label'])
    # Get the test set
    print('获取测试集...')
    test_x = np.array(pd.read_csv(test_path))
    test_x = preprocessing.Binarizer().fit_transform(test_x)
    test_y = []
    return train_x, train_y, test_x, test_y
# Handwritten-digit test
def testHandWritingClass():
    """Fit a default KNeighborsClassifier, predict the test set, write result.csv.

    The predictions are saved with 1-based ``ImageId`` values and a
    ``Label`` column.
    """
    # Load the data; the fourth return value (test labels) is not used here.
    print('加载数据...')
    train_x, train_y, test_x, _ = loadDataSet()

    # Training
    print('训练中...')
    model = KNeighborsClassifier()
    model.fit(train_x, train_y)

    # Testing
    print('测试中...')
    labels = model.predict(test_x)

    # Write the result: pair each prediction with its 1-based row number.
    print('输出结果...')
    rows = list(zip(range(1, len(labels) + 1), labels))
    submission = pd.DataFrame(rows, columns=['ImageId', 'Label'])
    submission.to_csv('C:/Users/Administrator/Desktop/result.csv', index=False)


if __name__ == '__main__':
    testHandWritingClass()
最后把 result.csv 提交到 Kaggle，得分为 0.96800。