先说一下要用到的数据集:
数据集自取地址:
链接:https://pan.baidu.com/s/1Vd2ADHEalSNnuOEcPJD8gQ
提取码:3hk6
数据集构成:
0-9十个数字,总共1934个样本,以数字_n命名,每个样本为32*32大小的txt文件(事先将图片处理后二值化)
数据读取代码:
def img2vector(filename):
    """Read a 32x32 binarized digit text file into a (1, 1024) row vector.

    The file is expected to contain 32 lines, each with at least 32
    characters of '0'/'1'; character (i, j) lands at flat index 32*i + j.

    :param filename: path to one sample .txt file
    :return: numpy array of shape (1, 1024) holding the pixel values
    """
    returnVect = np.zeros((1, 1024))
    # Context manager guarantees the file is closed even on error
    # (the original version leaked the file handle).
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            # Only the first 32 characters of each line carry pixel data.
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
最后生成数据集形式:
X:1934*1024的矩阵,每行代表一个样本,1024列为每个样本的1024个像素点(二值表示)
Y:1934*1的矩阵,每一行对应相同下标X中样本的标签
数据集生成代码:
def trainData(trainPath):
    """Build the dataset matrices (X, Y) from a directory of digit samples.

    Each file in *trainPath* is named like "<label>_<n>.txt" and holds a
    32x32 binarized digit read via img2vector.

    :param trainPath: directory containing the sample .txt files
    :return: X of shape (n_samples, 1024) with one flattened image per row,
             Y of shape (n_samples, 1) with the matching integer labels
    """
    trainfile = os.listdir(trainPath)  # every file in the training directory
    n_samples = len(trainfile)
    Y = np.zeros((n_samples, 1))
    # One row per sample; 1024 columns = 32*32 pixels per image.
    X = np.zeros((n_samples, 1024))
    for i, fname in enumerate(trainfile):
        # The label is the leading digit of "<label>_<n>.txt".
        thislabel = fname.split(".")[0].split("_")[0]
        if thislabel:
            Y[i][0] = int(thislabel)  # store the label
        # os.path.join is portable, unlike manual "/" concatenation.
        X[i, :] = img2vector(os.path.join(trainPath, fname))
    return X, Y
关于K折交叉验证的原理其他博文都有详细说明,我这就不赘述了,直接上代码
(本人水平有限,实现方法繁琐,但是能用,手动狗头保命)
代码思路介绍:
1. 将每个数字对应的下标打乱
2. 由于每个数字样本数不一致,这里将其统一到200,不足的随机抽样本补充,多余的随机剔除
3. 按照打乱后的下标生成shuffle后的数据集
4. 按照每个数字40个样本的方式提取出五折子数据集
详细代码:
# import dataset
X, Y = trainData('data1')

# Count how many samples each digit (0-9) has; works for any dataset size,
# not just the original 1934 samples.
size = [0] * 10
for i in range(len(Y)):
    size[int(Y[i][0])] += 1

# shuffle and balance: bring every digit to exactly 200 samples.
# Samples for digit d occupy the contiguous index range [left, right),
# tracked with a running prefix sum instead of recomputing it per digit.
position = []
left = 0
for i in range(10):
    right = left + size[i]
    ran = list(range(left, right))
    random.shuffle(ran)  # randomize this digit's sample order
    # Too few samples: pad with randomly drawn duplicates from the range.
    while len(ran) < 200:
        ran.append(np.random.randint(left, right))
    # Too many: drop the surplus in one slice. The list is already
    # shuffled, so the removed samples are effectively random.
    del ran[200:]
    position.append(ran)
    left = right

# Materialize the balanced, shuffled dataset: 10 digits x 200 samples.
X_shuffled = np.zeros((2000, 1024))
for i in range(10):
    for j in range(200):
        X_shuffled[200 * i + j] = X[position[i][j]]

# split into 5 folds: each fold takes 40 consecutive samples per digit,
# giving 400 samples per fold with an identical label layout.
X_part = []
for i in range(5):
    X_split = np.zeros((400, 1024))
    for j in range(10):
        for k in range(40):
            X_split[40 * j + k, :] = X_shuffled[200 * j + 40 * i + k, :]
    X_part.append(X_split)

# Matching label list shared by every fold: digits 0-9, 40 samples each.
Y_part = []
for i in range(10):
    Y_part.extend([i] * 40)
1. RFC
# K-Folder: 5-fold cross-validation with a Random Forest classifier.
score = []
for i in range(5):
    # Fold i is the test set; the remaining four folds form the train set.
    X_test = X_part[i]
    Y_test = Y_part
    X_train = np.concatenate((X_part[(i + 1) % 5], X_part[(i + 2) % 5],
                              X_part[(i + 3) % 5], X_part[(i + 4) % 5]),
                             axis=0)
    # Every fold shares the same label layout (0-9, 40 each), so the
    # training labels are simply four copies of the per-fold label list.
    Y_train = Y_test * 4
    # NOTE(review): min_impurity_split was removed in scikit-learn 1.0 and
    # max_features='auto' in 1.3; 'sqrt' is the classifier equivalent of
    # the old 'auto'. Parameters left at defaults are omitted.
    clf = RandomForestClassifier(n_estimators=200, criterion='gini',
                                 min_samples_split=3, max_features='sqrt',
                                 n_jobs=1)
    # train
    clf.fit(X_train, Y_train)
    score.append(clf.score(X_test, Y_test))
    # Per-fold confusion matrix, shown as a heat map.
    Y_pred = clf.predict(X_test)
    cm = confusion_matrix(Y_test, Y_pred)
    plt.matshow(cm)
    plt.title('fold %d' % (i + 1))
    plt.show()
# Mean accuracy over the five folds.
print("Average Acc: %f" % (sum(score) / 5))
2. SVM
# Five-fold cross-validation with an RBF-kernel SVM.
score = []
for fold in range(5):
    # Hold out fold `fold`; train on the remaining four.
    X_test = X_part[fold]
    Y_test = Y_part
    train_parts = [X_part[(fold + shift) % 5] for shift in (1, 2, 3, 4)]
    X_train = np.concatenate(train_parts, axis=0)
    # Each fold carries the same label layout, so four copies suffice.
    Y_train = Y_test * 4
    clf = svm.SVC(C=200.0, kernel='rbf', degree=3, gamma='auto',
                  coef0=0.0, shrinking=True, probability=False, tol=0.001,
                  cache_size=200, class_weight=None, verbose=False,
                  max_iter=-1, decision_function_shape='ovr',
                  random_state=None)
    # train
    clf.fit(X_train, Y_train)
    score.append(clf.score(X_test, Y_test))
    # Visualize this fold's confusion matrix.
    cm = confusion_matrix(Y_test, clf.predict(X_test))
    plt.matshow(cm)
    plt.title('epoch %d' % (fold + 1))
    plt.show()
# Average accuracy across the five folds.
acc_sum = 0
for s in score:
    acc_sum += s
print("Average Acc: %f" % (acc_sum / 5))
3. KNN
# Five-fold cross-validation with a 3-nearest-neighbour classifier.
score = []
plt.figure()
for fold in range(5):
    # Test on fold `fold`; the other four folds are the training data.
    X_test = X_part[fold]
    Y_test = Y_part
    X_train = np.concatenate(
        [X_part[(fold + 1) % 5], X_part[(fold + 2) % 5],
         X_part[(fold + 3) % 5], X_part[(fold + 4) % 5]], axis=0)
    # Identical label layout per fold -> four copies for training labels.
    Y_train = Y_test * 4
    clf = KNeighborsClassifier(n_neighbors=3, weights='uniform',
                               algorithm='auto', leaf_size=30,
                               p=2, metric='minkowski', metric_params=None,
                               n_jobs=None)
    # train
    clf.fit(X_train, Y_train)
    score.append(clf.score(X_test, Y_test))
    # Plot the confusion matrix for this fold.
    predictions = clf.predict(X_test)
    plt.matshow(confusion_matrix(Y_test, predictions))
    plt.title('epoch %d' % (fold + 1))
    plt.show()
# Average accuracy across the five folds.
acc_sum = 0
for s in score:
    acc_sum += s
print("Average Acc: %f" % (acc_sum / 5))
# KNN
import random
import numpy as np
import os
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
# SVM
import random
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn import svm
# RFC
import random
import numpy as np
import os
from sklearn. ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix