MNIST数据集(Modified National Institute of Standards and Technology database,即修订版美国国家标准与技术研究院数据集)是大型的手写数字数据集,常用于训练各种图像处理系统,并广泛应用于机器学习领域的训练和测试.该数据集通过对NIST的原始数据集重新混合(re-mixing)而来:
数据集作者认为,NIST的训练数据集采集自美国人口普查局的员工,而测试数据集采集自美国高中生,因此NIST原始数据集并不完全适合机器学习实验.MNIST中来自NIST的黑白图像经过归一化处理,使其适配 $28\times 28$ 像素的边框,并进行了抗锯齿处理.
MNIST数据集包含60000张训练图片和10000张测试图片,其中一半的训练数据和一半的测试数据取自NIST的训练数据集,另外一半来自NIST的测试数据集.有大量的科技论文试图在该数据集上达到最低错误率,其中一篇论文使用卷积神经网络的分层系统,在MNIST数据集上的错误率为0.23%.在数据集作者的文章中,使用支持向量机方法的错误率为0.8%.与MNIST类似的扩展数据集EMNIST于2017年发布,该数据集包含240000张训练图片和40000张测试图片.
github函数
注:
为方便参数查看,定义的变量直接给出解释.
import tensorflow as tf
# Reset TensorFlow's default graph before anything else runs.
tf.reset_default_graph()
from tensorflow.examples.tutorials.mnist import input_data
import matplotlib.pyplot as plt
import numpy as np
# Load MNIST from ../mnist_data with one-hot encoded labels. The functions
# below read this module-level `mnist` object directly.
mnist = input_data.read_data_sets("../mnist_data", one_hot=True)
def mnist_train_datasets():
    """Print a summary of the MNIST training split.

    Reads the module-level ``mnist`` dataset and prints, in order: the Python
    type of the first image, the number of training examples, the shape of
    the image array, the first label, the shape of the first image and the
    shape of the first label. Takes no arguments and returns ``None``.
    """
    split = mnist.train
    images, labels = split.images, split.labels
    example_count = split.num_examples
    first_image, first_label = images[0], labels[0]

    # (template, value) pairs, printed in exactly this order.
    report = (
        ("type or image value: {}", type(first_image)),
        ("Train images numbers: {}", example_count),
        ("Train images shape: {}", images.shape),
        ("Train images label: {}", first_label),
        ("Train images value shape: {}", first_image.shape),
        ("Train images label shape: {}", first_label.shape),
    )
    for template, value in report:
        print(template.format(value))
def mnist_test_datasets():
    """Print a summary of the MNIST test split.

    Reads the module-level ``mnist`` dataset and prints, in order: the number
    of test examples, the shape of the image array, the first label, the
    shape of the first image and the shape of the first label. Takes no
    arguments and returns ``None``.
    """
    split = mnist.test
    images, labels = split.images, split.labels
    example_count = split.num_examples
    first_image, first_label = images[0], labels[0]

    # (template, value) pairs, printed in exactly this order.
    report = (
        ("Test data numbers: {}", example_count),
        ("Test images shape: {}", images.shape),
        ("Test images label: {}", first_label),
        ("Test images value shape: {}", first_image.shape),
        ("Test images label shape: {}", first_label.shape),
    )
    for template, value in report:
        print(template.format(value))
# Print the training-split summary first, then the test-split summary.
for summarize in (mnist_train_datasets, mnist_test_datasets):
    summarize()
Extracting ../mnist_data/train-images-idx3-ubyte.gz
Extracting ../mnist_data/train-labels-idx1-ubyte.gz
Extracting ../mnist_data/t10k-images-idx3-ubyte.gz
Extracting ../mnist_data/t10k-labels-idx1-ubyte.gz
type or image value: <class 'numpy.ndarray'>
Train images numbers: 55000
Train images shape: (55000, 784)
Train images label: [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
Train images value shape: (784,)
Train images label shape: (10,)
Test data numbers: 10000
Test images shape: (10000, 784)
Test images label: [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
Test images value shape: (784,)
Test images label shape: (10,)
from tensorflow.examples.tutorials.mnist import input_data

# Load MNIST from ../mnist_data with one-hot encoded labels.
mnist = input_data.read_data_sets("../mnist_data", one_hot=True)

# Training split: image array and label array.
mnist.train.images, mnist.train.labels
# Test split: image array and label array.
mnist.test.images, mnist.test.labels
import tensorflow as tf
# Reset TensorFlow's default graph before anything else runs.
tf.reset_default_graph()
from tensorflow.examples.tutorials.mnist import input_data
import matplotlib.pyplot as plt
import numpy as np
# Load MNIST from ../mnist_data with one-hot encoded labels. The plotting
# functions below read this module-level `mnist` object directly.
mnist = input_data.read_data_sets("../mnist_data", one_hot=True)
def mnist_train_datasets():
    """Display the first MNIST training image.

    Reads the module-level ``mnist`` dataset, prints the Python type of the
    first training image, reshapes its flat pixel vector to 28 x 28 x 1 and
    shows channel 0 with the ``Greys_r`` colormap. Takes no arguments and
    returns ``None``.
    """
    first_image = mnist.train.images[0]
    print("type or image value: {}".format(type(first_image)))
    # -1 lets NumPy infer the size of the trailing (channel) axis.
    pixels = first_image.reshape((28, 28, -1))
    plt.figure()
    # Plot the single channel as a 2-D array.
    plt.imshow(pixels[:, :, 0], cmap="Greys_r")
    plt.show()
def mnist_test_datasets():
    """Display the first MNIST test image.

    Reads the module-level ``mnist`` dataset, reshapes the first test image's
    flat pixel vector to 28 x 28 x 1 and shows channel 0 with the ``Greys_r``
    colormap. Takes no arguments and returns ``None``.
    """
    first_image = mnist.test.images[0]
    # -1 lets NumPy infer the size of the trailing (channel) axis.
    pixels = first_image.reshape((28, 28, -1))
    plt.figure()
    # Plot the single channel as a 2-D array.
    plt.imshow(pixels[:, :, 0], cmap="Greys_r")
    plt.show()
# Show the first training image, then the first test image.
for show_first_image in (mnist_train_datasets, mnist_test_datasets):
    show_first_image()
使用matplotlib绘制图像时,需要先提取通道数据再绘制,即[:,:,0];cmap取Greys_r可得到灰度图像.
from sklearn.metrics import confusion_matrix
# Reset TensorFlow's default graph. `tf` is not imported in this snippet, so
# it must already be in scope from an earlier `import tensorflow as tf`.
tf.reset_default_graph()
from tensorflow.examples.tutorials.mnist import input_data
import matplotlib.pyplot as plt
import numpy as np
def _bytes_feature(value):
    """Wrap ``value`` in a ``tf.train.Feature`` holding a one-element ``BytesList``."""
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)
def _int64_feature(value):
    """Wrap ``value`` in a ``tf.train.Feature`` holding a one-element ``Int64List``."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)
# Load MNIST from ../mnist_data with one-hot encoded labels. The functions
# below read this module-level `mnist` object directly.
mnist = input_data.read_data_sets("../mnist_data", one_hot=True)
def mnist_train_datasets():
    """Display the first MNIST training image and return its label.

    Reads the module-level ``mnist`` dataset, prints the Python type of the
    first training image, reshapes its flat pixel vector to 28 x 28 x 1 and
    shows channel 0 with the ``Greys_r`` colormap.

    Returns:
        The label of the first training image (``mnist.train.labels[0]``).
    """
    first_image = mnist.train.images[0]
    print("type or image value: {}".format(type(first_image)))
    # -1 lets NumPy infer the size of the trailing (channel) axis.
    pixels = first_image.reshape((28, 28, -1))
    plt.figure()
    # Plot the single channel as a 2-D array.
    plt.imshow(pixels[:, :, 0], cmap="Greys_r")
    plt.show()
    # The caller assigns this function's result, so hand the label back
    # instead of computing it and discarding it.
    # NOTE(review): only the first sample's label is returned, matching the
    # value the original computed last; confirm that is what the caller needs.
    return mnist.train.labels[0]
def mnist_test_datasets():
    """Display the first MNIST test image and return its label.

    Reads the module-level ``mnist`` dataset, reshapes the first test image's
    flat pixel vector to 28 x 28 x 1 and shows channel 0 with the ``Greys_r``
    colormap.

    Returns:
        The label of the first test image (``mnist.test.labels[0]``).
    """
    first_image = mnist.test.images[0]
    # -1 lets NumPy infer the size of the trailing (channel) axis.
    pixels = first_image.reshape((28, 28, -1))
    plt.figure()
    # Plot the single channel as a 2-D array.
    plt.imshow(pixels[:, :, 0], cmap="Greys_r")
    plt.show()
    # The caller assigns this function's result, so hand the label back
    # instead of computing it and discarding it.
    # NOTE(review): only the first sample's label is returned, matching the
    # value the original computed; confirm that is what the caller needs.
    return mnist.test.labels[0]
import os

train_label = mnist_train_datasets()
test_label = mnist_test_datasets()
# NOTE(review): the two arguments come from different dataset splits rather
# than from true vs. predicted classes; confirm this comparison is intended.
conf_mx = confusion_matrix(train_label, test_label)
# matshow opens its own figure by default, so a preceding plt.figure() would
# only leave an extra empty window behind.
plt.matshow(conf_mx, cmap=plt.cm.gray)
# savefig does not create missing directories; make sure the target exists.
os.makedirs("./images", exist_ok=True)
plt.savefig("./images/cofusion_image1.png", format="png")
plt.show()
(1) MNIST数据集包含训练集和测试集,图像数据存储为[784,1]的列向量,标签为[10, 1]的列向量,图像原始尺寸为(28, 28, 1);
(2) MNIST图像数据引入:from tensorflow.examples.tutorials.mnist import input_data
(3) MNIST数据读取:mnist = input_data.read_data_sets("../mnist_data", one_hot=True)
(4) 数据结构:
数据分类 | 图像数量/张 | 数据格式 | 图像向量 | 标签向量 | 图像尺寸 |
---|---|---|---|---|---|
训练集 | 55000 | numpy.ndarray | [784, 1] | [10, 1] | $28 \times 28$ |
测试集 | 10000 | numpy.ndarray | [784, 1] | [10, 1] | $28 \times 28$ |
[参考文献]
[1]https://blog.csdn.net/Orange_Spotty_Cat/article/details/80520839
[2]https://en.wikipedia.org/wiki/MNIST_database