import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
导入单个数字
# Read one sample bitmap (folder 8, image number 400) into a pixel array.
sample_path = './data/8/8_400.bmp'
digit = plt.imread(sample_path)
# Echo the array so the notebook prints the raw pixel values.
digit
Out:
array([[255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255],
……
# Draw the sample image on a small 1x1 figure using the grayscale colormap.
plt.figure(figsize=(1,1))
plt.imshow(digit,cmap = 'gray')
查看数据集结构:一共10个文件夹,每个文件夹有500张图片,则总量一共5000张图片
全部导入,特征数据存入data,目标数据存入target
# Recall fit(X, y) from this morning's session:
#   data   -----> X  (feature data: one image array per sample)
#   target -----> y  (label data: the digit each image belongs to)
data = []
target = []
for i in range(10):
    # Each digit folder holds 500 images, numbered 1..500.
    for j in range(1, 501):
        # Files are laid out as ./data/<digit>/<digit>_<number>.bmp
        digit = plt.imread('./data/%d/%d_%d.bmp' % (i, i, j))
        data.append(digit)
        # The folder index doubles as the label for every image inside it.
        target.append(i)
# Sanity check: number of image arrays collected in `data`.
len(data)
Out:5000
# Sanity check: number of labels collected in `target` (should match `data`).
len(target)
Out:5000
# Spot check: pick one random sample, display it, and print its label.
# The position is drawn from [0, len(data)) instead of a hard-coded 5000 so
# the check keeps working if the number of loaded images changes.
index = np.random.randint(len(data))
index
plt.figure(figsize=(1, 1))
plt.imshow(data[index], cmap='gray')
# The printed label should match the digit shown in the image.
print(target[index])
使用KNN分类算法
from sklearn.neighbors import KNeighborsClassifier
# Create a k-nearest-neighbours classifier that looks at 5 neighbours.
knn = KNeighborsClassifier(n_neighbors=5)
# Train the algorithm on the loaded data.
# This call fails on purpose at this point in the notebook:
# ValueError: Found array with dim 3. Estimator expected <= 2.
# `data` is a list of 2-D images, so it has to be flattened to 2-D first.
knn.fit(data,target)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
in ()
1 # 使用算法,进行训练
----> 2 knn.fit(data,target)
# Inspect the container type: `data` is still a plain Python list here.
type(data)
Out: list
转化成ndarray
# Convert the list of images into a single ndarray.
X = np.array(data)
# The result is three levels deep: [[[...]]]
# i.e. [[sample 1], [sample 2], ...] where each sample is itself a 2-D image.
X.shape
Out: (5000, 28, 28)
# Pixels per 28x28 image, i.e. the number of features after flattening.
28*28
Out: 784
# Convert the list of labels into an ndarray and echo it.
y = np.array(target)
y
Out:array([0, 0, 0, ..., 9, 9, 9])
数据的形状改变,数据没变
# Flatten every image into one row so X becomes 2-D (samples x pixels),
# which is what the estimator accepts. Only the shape changes; the data
# itself is untouched.
# len(X) and -1 replace the hard-coded 5000 and 784, so the same line works
# for any number of images and any image size.
X = X.reshape(len(X), -1)
plt.figure(figsize=(1, 1))
# Reshape one row back to 28x28 to confirm the pixels survived the flattening.
plt.imshow(X[1600].reshape(28, 28))
打乱顺序
# Small demo of np.random.shuffle: it reorders the array in place and the
# array afterwards holds the same values in a different order.
nd = np.array((0, 1, 2, 3, 4, 9))
np.random.shuffle(nd)
nd
Out: array([1, 9, 4, 2, 0, 3])
# Build the row positions 0..4999 and echo them before shuffling.
index = np.arange(0, 5000)
index
# Shuffle the positions in place; the result is a random row order that can
# be applied to the features and the labels alike.
np.random.shuffle(index)
index
Out: array([ 608, 2911, 2475, ..., 4248, 4205, 2117])
# Reorder features and labels with the same shuffled index so every row of
# X stays paired with its label in y.
X, y = X[index], y[index]
# 5000 samples, each with 784 attributes.
# Seen as a linear model that is 784 unknowns:
#   f() = x0*w0 + x1*w1 + ... + x783*w783
X.shape
Out:(5000, 784)
举个例子(用三元一次方程组类比:每一行是一个样本,每一列是一个特征,对应一个未知数)
# A three-equation system used as a toy analogue of (X, y):
#   2x + 3y + 4z = 10
#    x - 2y + 3z = 8
#   3x -  y +  z = 7
# Each row of X1 holds the coefficients of one equation.
X1 = np.array([
    [2, 3, 4],
    [1, -2, 3],
    [3, -1, 1],
])
X1
# y1 holds the right-hand side of each equation, in the same row order.
y1 = np.array([10, 8, 7])
display(X1, y1)
Out:
array([[ 2, 3, 4],
[ 1, -2, 3],
[ 3, -1, 1]])
array([10, 8, 7])
划分训练和测试数据
# Split the 5000 samples in two: 4950 for training, 50 held out for testing.
# Slicing with [:-50] mirrors the X[-50:] slice used for prediction and
# scoring, so the training and test sets cannot overlap even if the dataset
# size changes.
knn.fit(X[:-50], y[:-50])
Out:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights='uniform')
# Predict labels for the 50 held-out samples (the last 50 rows of X).
y_ = knn.predict(X[-50:])
y_
Out:
array([1, 9, 9, 7, 1, 9, 1, 9, 3, 5, 7, 4, 3, 1, 0, 5, 0, 4, 0, 6, 3, 8,
9, 7, 1, 3, 1, 9, 3, 4, 0, 5, 5, 2, 4, 0, 7, 8, 0, 6, 1, 7, 0, 8,
1, 3, 3, 8, 8, 4])
# True labels of the same last 50 rows, for comparison with the predictions.
y[-50:]
Out:
array([3, 1, 5, 4, 4, 2, 0, 6, 0, 3, 4, 6, 2, 4, 2, 7, 3, 1, 9, 3, 0, 7,
5, 9, 9, 9, 3, 1, 8, 3, 3, 1, 6, 5, 2, 6, 4, 6, 9, 8, 6, 2, 5, 1,
5, 3, 5, 4, 9, 1])
展示后50张数字图片的真实值与预测值
# Show the 50 held-out images in a 10x5 grid, each titled with its true
# label and the label the classifier predicted.
plt.figure(figsize=(5 * 1, 10 * 2))
for i in range(50):
    axes = plt.subplot(10, 5, i + 1)
    # Rows 4950..4999 are the held-out samples; restore each to 28x28 to draw it.
    axes.imshow(X[4950 + i].reshape(28, 28))
    t = y[4950 + i]
    # y_ holds the 50 predictions made for the last 50 samples, so it is
    # indexed by i rather than by the absolute row number.
    p = y_[i]
    # Title layout: "True:<label>" on the first line, "Predict:<label>" below.
    axes.set_title('True:%d\nPredict:%d' % (t, p))
    axes.axis('off')
评分
# score ----> predicts on the given samples, then compares the predictions
# with the given true labels.
knn.score(X[-50:],y[-50:])
Out:
0.96
保存算法
# Save the trained algorithm so it can be reused later without retraining.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so import the standalone `joblib` package first and fall back to
# the bundled copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# "Model" here means the fitted estimator (the algorithm plus what it learned).
joblib.dump(knn, './digits.m')
Out:['./digits.m']
特征数据和目标数据保存至numpy的npy中方便日后调用
# X holds the 5000 images and y holds their labels; numpy can write both
# straight to .npy files so they can be reloaded later without re-reading
# the bitmaps.
np.save(file='./digits.npy', arr=X)
np.save(file='./digits_target.npy', arr=y)
各种调参
# Tuning run: 10 neighbours, trained on the first 4950 rows and scored on
# the held-out last 50 rows.
knn = KNeighborsClassifier(n_neighbors=10).fit(X[:4950], y[:4950])
knn.score(X[-50:], y[-50:])
Out:
0.94
# Tuning run: 50 neighbours, same train/test split as the other runs.
knn = KNeighborsClassifier(n_neighbors=50).fit(X[:4950], y[:4950])
knn.score(X[-50:], y[-50:])
Out:
0.94
# Tuning run: 500 neighbours, to see how a very large k affects the score.
knn = KNeighborsClassifier(n_neighbors=500).fit(X[:4950], y[:4950])
knn.score(X[-50:], y[-50:])
Out:
0.84
# Tuning run: a single neighbour, i.e. each test image takes the label of
# its closest training image.
knn = KNeighborsClassifier(n_neighbors=1).fit(X[:4950], y[:4950])
knn.score(X[-50:], y[-50:])
Out:
0.96