Kaggle Dogs vs. Cats dataset address: kaggle
# Download the Kaggle dataset directly into Colab
!pip install -U -q kaggle
!mkdir -p ~/.kaggle
!echo '{"username":"<your username>","key":"<your API key>"}' > ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c dogs-vs-cats-redux-kernels-edition
Downloading test.zip to /content
98% 265M/271M [00:02<00:00, 125MB/s]
100% 271M/271M [00:02<00:00, 115MB/s]
Downloading train.zip to /content
99% 536M/544M [00:03<00:00, 192MB/s]
100% 544M/544M [00:03<00:00, 163MB/s]
Downloading sample_submission.csv to /content
0% 0.00/111k [00:00, ?B/s]
100% 111k/111k [00:00<00:00, 116MB/s]
# Unzip the archives
! unzip ./train.zip
! unzip ./test.zip
In total, the training set contains 25,000 images, half cats and half dogs. The test set contains 12,500 images with no cat/dog labels.
!ls ./train
As you can see, the image names follow the type.number.jpg format.
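Since every label is encoded in the filename, extracting it is a one-liner; for example (a hypothetical filename, the same rule is used in load_data below):

# Example: derive the binary label from a type.number.jpg filename
name = "cat.1234.jpg"
label = 0 if name.split(".")[0] == "cat" else 1   # cat -> 0, dog -> 1
print(label)  # 0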
# Import the libraries we need
import keras
import os
import shutil
import threading
import numpy as np
import cv2
import h5py
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from keras.models import *
from keras.layers import *
from keras.applications import *
from keras.preprocessing import *
from keras.preprocessing.image import *
Using TensorFlow backend.
Loading the data
def load_data(load_type="train"):
    if load_type == "train":
        imgs = []
        labels = []
        path = "./train/"
        img_names = os.listdir(path)
        for name in img_names:
            imgs.append("train/" + name)
            # cat -> 0, dog -> 1
            labels.append([0] if name[:3] == "cat" else [1])
        train_img_names, valid_img_names, train_labels, valid_labels = train_test_split(imgs, labels, test_size=0.2, random_state=42)
        return train_img_names, valid_img_names, train_labels, valid_labels
    else:
        # test set: no labels
        path = "./test"
        img_names = os.listdir(path)
        # keep the directory prefix so cv2.imread can locate the files later
        imgs = ["test/" + name for name in img_names]
        return imgs
train_img_names,valid_img_names,train_labels,valid_labels = load_data()
print(train_img_names[:5])
print(train_labels[:5])
print(len(train_img_names)+len(valid_img_names))
['train/dog.12105.jpg', 'train/cat.10129.jpg', 'train/dog.11009.jpg', 'train/dog.9346.jpg', 'train/dog.1599.jpg']
[[1], [0], [1], [1], [1]]
25000
With the data prepared, the next step is training. First, some preliminary setup:
# Define a custom Keras generator (a keras.utils.Sequence subclass)
class MOGenerator(keras.utils.Sequence):
    def __init__(self, data, n, des_size=(224, 224), means=None, stds=None,
                 is_directory=True, batch_size=32, shuffle=True, seed=0):
        '''
        data: tuple of (x, y); y may be omitted for unlabeled data
        n: dataset size
        des_size: target image size
        means: per-channel RGB means of the dataset, conceptually defaulting to the
               ImageNet means [103.939, 116.779, 123.68] (not applied in this implementation)
        batch_size: default is 32
        shuffle: randomize the order each epoch; default is True
        '''
        self.x = np.array(data[0])
        if len(data) >= 2:
            self.y = np.array(data[1])
        else:
            self.y = None
        self.n = n
        self.des_size = des_size
        self.is_directory = is_directory
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.lock = threading.Lock()
        # _set_index_array assigns self.index_array itself and returns nothing,
        # so call it directly instead of assigning its return value
        self._set_index_array()
        self.means = means
        self.stds = stds

    def reset_index(self):
        self.batch_index = 0

    def _set_index_array(self):
        self.index_array = np.arange(self.n)
        if self.shuffle:
            np.random.shuffle(self.index_array)

    def on_epoch_end(self):
        # reshuffle the index array after every epoch
        self._set_index_array()

    def __len__(self):
        # total number of batches per epoch
        return int(np.ceil(self.n / self.batch_size))

    def __getitem__(self, idx):
        index_array = self.index_array[self.batch_size * idx:
                                       self.batch_size * (idx + 1)]
        return self._data_generate(index_array)

    def _data_generate(self, index_array):
        # allocate the batch up front
        imgs = np.zeros((len(index_array), self.des_size[0], self.des_size[1], 3),
                        dtype=np.uint8)
        # read the images
        if self.is_directory:
            img_names = self.x[index_array]
            for name_index in range(len(img_names)):
                img = cv2.imread(img_names[name_index])
                if img is not None:
                    # OpenCV loads BGR; convert to RGB before resizing
                    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                    img = cv2.resize(img, self.des_size)
                    imgs[name_index] = img
        else:
            for i in range(len(index_array)):
                img = self.x[index_array[i]]
                img = cv2.resize(img, self.des_size)
                imgs[i] = img
        if self.y is None:
            return imgs
        return imgs, self.y[index_array]
We will use three models for transfer learning. They expect different input sizes, so we initialize generators of different sizes:
Model | Input size
---|---
Xception | (299, 299, 3)
InceptionV3 | (299, 299, 3)
ResNet50 | (224, 224, 3)
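Note that MOGenerator yields raw uint8 RGB images with no normalization; each backbone's own preprocess_input is instead baked into the graph as a Lambda layer (you will see this in the model definitions below), so every model receives inputs scaled the way its pretrained weights expect. A minimal sketch of the pattern:

# Sketch of the pattern used in the model definitions below: wrap the
# model-specific preprocessing in a Lambda layer so the generator can
# stay preprocessing-agnostic and feed raw uint8 RGB images.
inp = Input(shape=(224, 224, 3))
x = Lambda(resnet50.preprocess_input)(inp)  # ResNet50's Caffe-style channel swap + mean subtraction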
batch_size = 32
train_img_names,valid_img_names,train_labels,valid_labels = load_data()
test_img_names = load_data(load_type="test")
train_generator_224 = MOGenerator((train_img_names,train_labels), len(train_img_names), des_size=(224,224),
batch_size=batch_size, shuffle=True)
train_generator_299 = MOGenerator((train_img_names,train_labels), len(train_img_names), des_size=(299,299),
batch_size=batch_size, shuffle=True)
valid_generator_299 = MOGenerator((valid_img_names,valid_labels), len(valid_img_names), des_size=(299,299),
batch_size=batch_size, shuffle=False)
valid_generator_224 = MOGenerator((valid_img_names,valid_labels), len(valid_img_names), des_size=(224,224),
                                  batch_size=batch_size, shuffle=False)  # validation data should not be shuffled
# note the trailing commas below: MOGenerator expects a tuple, and the test set has no labels
test_generator_299 = MOGenerator((test_img_names,), len(test_img_names), des_size=(299,299),
                                 batch_size=batch_size, shuffle=False)
test_generator_224 = MOGenerator((test_img_names,), len(test_img_names), des_size=(224,224),
                                 batch_size=batch_size, shuffle=False)
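Before training, it is worth pulling one batch from a generator to confirm the shapes; a quick sanity check (sketch):

# Sanity check (sketch): labeled generators yield (images, labels) pairs,
# test generators yield images only.
x_batch, y_batch = train_generator_224[0]
print(x_batch.shape, y_batch.shape)   # expected: (32, 224, 224, 3) (32, 1)
x_test_batch = test_generator_299[0]
print(x_test_batch.shape)             # expected: (32, 299, 299, 3)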
We will perform transfer learning with three models (ResNet, Inception, and Xception).
Let's start with the first model: ResNet.
# Load the ResNet50 network. include_top: whether to keep the fully connected layers after the convolutions.
base_model = ResNet50(input_tensor=Lambda(resnet50.preprocess_input)(Input(shape=(224,224,3))),
                      weights="imagenet", include_top=False)
# Freeze all pretrained layers so their weights are not updated during training.
for layer in base_model.layers:
    layer.trainable = False
# Replace the head: with only two classes, a single sigmoid neuron is enough.
x = GlobalAveragePooling2D()(base_model.output)
x = Dropout(0.25)(x)
x = Dense(1, activation="sigmoid")(x)
# Instantiate the model
model = Model(base_model.input, x)
# Set the optimizer, loss function, and metrics
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])
# Train from the generator
model.fit_generator(train_generator_224, len(train_img_names)//batch_size, epochs=5,
                    validation_data=valid_generator_224, validation_steps=len(valid_img_names)//batch_size, shuffle=False)
Epoch 1/5
625/625 [==============================] - 101s 162ms/step - loss: 0.1142 - acc: 0.9540 - val_loss: 0.0467 - val_acc: 0.9840
Epoch 2/5
625/625 [==============================] - 97s 155ms/step - loss: 0.0671 - acc: 0.9747 - val_loss: 0.0397 - val_acc: 0.9873
Epoch 3/5
625/625 [==============================] - 98s 156ms/step - loss: 0.0633 - acc: 0.9752 - val_loss: 0.0371 - val_acc: 0.9875
Epoch 4/5
625/625 [==============================] - 98s 157ms/step - loss: 0.0607 - acc: 0.9773 - val_loss: 0.0479 - val_acc: 0.9847
Epoch 5/5
625/625 [==============================] - 98s 157ms/step - loss: 0.0582 - acc: 0.9769 - val_loss: 0.0403 - val_acc: 0.9873
Transfer learning with the Inception model
## inception
inception = inception_v3.InceptionV3(include_top=False, weights="imagenet",
                                     input_tensor=Lambda(inception_v3.preprocess_input)(Input(shape=(299,299,3))), pooling="avg")
output = inception.output
output = Dropout(0.25)(output)
prediction = Dense(1, activation="sigmoid")(output)
inception_model = Model(inputs=inception.input, outputs=prediction)
# Freeze the pretrained layers
for layer in inception.layers:
    layer.trainable = False
inception_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
inception_model.fit_generator(train_generator_299, (len(train_img_names) + batch_size - 1)//batch_size, epochs=5,
                              validation_data=valid_generator_299, validation_steps=len(valid_img_names)//batch_size)
W0812 06:20:46.242835 139830584244096 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3980: The name tf.nn.avg_pool is deprecated. Please use tf.nn.avg_pool2d instead.
Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
87916544/87910968 [==============================] - 1s 0us/step
Epoch 1/5
625/625 [==============================] - 125s 200ms/step - loss: 0.1636 - acc: 0.9425 - val_loss: 0.0436 - val_acc: 0.9894
Epoch 2/5
625/625 [==============================] - 118s 189ms/step - loss: 0.0878 - acc: 0.9687 - val_loss: 0.0647 - val_acc: 0.9817
Epoch 3/5
625/625 [==============================] - 118s 188ms/step - loss: 0.0818 - acc: 0.9692 - val_loss: 0.0571 - val_acc: 0.9851
Epoch 4/5
625/625 [==============================] - 118s 188ms/step - loss: 0.0756 - acc: 0.9715 - val_loss: 0.0531 - val_acc: 0.9863
Epoch 5/5
625/625 [==============================] - 118s 189ms/step - loss: 0.0780 - acc: 0.9715 - val_loss: 0.0611 - val_acc: 0.9833
Transfer learning with Xception
## xception
xcep = Xception(include_top=False, weights="imagenet",
                input_tensor=Lambda(xception.preprocess_input)(Input(shape=(299,299,3))), pooling="avg")
output = xcep.output
output = Dropout(0.25)(output)
prediction = Dense(1, activation="sigmoid")(output)
xcep_model = Model(inputs=xcep.input, outputs=prediction)
# Freeze the pretrained layers
for layer in xcep.layers:
    layer.trainable = False
xcep_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
xcep_model.fit_generator(train_generator_299, (len(train_img_names) + batch_size - 1)//batch_size, epochs=5,
                         validation_data=valid_generator_299, validation_steps=len(valid_img_names)//batch_size)
Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.4/xception_weights_tf_dim_ordering_tf_kernels_notop.h5
83689472/83683744 [==============================] - 1s 0us/step
Epoch 1/5
625/625 [==============================] - 280s 447ms/step - loss: 0.1085 - acc: 0.9721 - val_loss: 0.0816 - val_acc: 0.9796
Epoch 2/5
625/625 [==============================] - 274s 439ms/step - loss: 0.0508 - acc: 0.9847 - val_loss: 0.0775 - val_acc: 0.9783
Epoch 3/5
625/625 [==============================] - 275s 440ms/step - loss: 0.0452 - acc: 0.9851 - val_loss: 0.0530 - val_acc: 0.9869
Epoch 4/5
625/625 [==============================] - 275s 440ms/step - loss: 0.0392 - acc: 0.9868 - val_loss: 0.0516 - val_acc: 0.9871
Epoch 5/5
625/625 [==============================] - 274s 438ms/step - loss: 0.0411 - acc: 0.9857 - val_loss: 0.0785 - val_acc: 0.9775
After training, save each model's weights so they can be reloaded later for inference or further training.
model.save_weights("resnet.h5")
model.save_weights("xcep.h5")
model.save_weights("incep.h5")
Exporting the feature vectors
The dataset files are named type.num.jpg, but Keras's ImageDataGenerator expects images of different classes in separate directories, so we need to reorganize the dataset.
The approach taken here is to create symbolic links, which avoids copying the images and wasting unnecessary disk space.
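The target layout is the one flow_from_directory expects, with one subdirectory per class (and a single dummy class for the unlabeled test set):

    train2/
        cat/    cat.0.jpg, cat.1.jpg, ...
        dog/    dog.0.jpg, dog.1.jpg, ...
    valid2/
        cat/    ...
        dog/    ...
    test2/
        test/   1.jpg, 2.jpg, ...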
from keras.preprocessing.image import *
# List all image names in the training and test sets
train_filenames = os.listdir("train")
test_filenames = os.listdir("test")
# The filenames encode the class, so we can split the list directly
train_cat = list(filter(lambda x:x[:3] == 'cat', train_filenames))
train_dog = list(filter(lambda x:x[:3] == 'dog', train_filenames))
def rmrf_mkdir(dirname):
    # remove the directory if it exists, then recreate it empty
    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)

# Create the directories
rmrf_mkdir('train2')
os.mkdir('train2/cat')
os.mkdir('train2/dog')
rmrf_mkdir('valid2')
os.mkdir('valid2/cat')
os.mkdir('valid2/dog')
rmrf_mkdir('test2')
os.mkdir("test2/test")
for filename in test_filenames:
    # Create a symlink (no file is copied; it works like a shortcut)
    os.symlink("/content/test/"+filename, "/content/test2/test/"+filename)
for filename in train_cat[:-2500]:
    os.symlink("/content/train/"+filename, "/content/train2/cat/"+filename)
for filename in train_dog[:-2500]:
    os.symlink("/content/train/"+filename, "/content/train2/dog/"+filename)
for filename in train_cat[-2500:]:
    os.symlink("/content/train/"+filename, "/content/valid2/cat/"+filename)
for filename in train_dog[-2500:]:
    os.symlink("/content/train/"+filename, "/content/valid2/dog/"+filename)
# Use Keras's built-in generators, since the directory layout is now in place
# 224 is the input size for ResNet; 299 for Xception and Inception
gen = ImageDataGenerator()
train_generator_224 = gen.flow_from_directory("train2", (224,224), shuffle=False,
batch_size=batch_size,class_mode='binary')
valid_generator_224 = gen.flow_from_directory("valid2", (224,224), shuffle=False,
batch_size=batch_size,class_mode='binary')
test_generator_224 = gen.flow_from_directory("test2", (224,224), shuffle=False,
batch_size=batch_size, class_mode=None)
train_generator_299 = gen.flow_from_directory("train2", (299,299), shuffle=False,
batch_size=batch_size,class_mode='binary')
valid_generator_299 = gen.flow_from_directory("valid2", (299,299), shuffle=False,
batch_size=batch_size,class_mode='binary')
test_generator_299 = gen.flow_from_directory("test2", (299,299), shuffle=False,
batch_size=batch_size, class_mode=None)
Found 20000 images belonging to 2 classes.
Found 5000 images belonging to 2 classes.
Found 12500 images belonging to 1 classes.
Found 20000 images belonging to 2 classes.
Found 5000 images belonging to 2 classes.
Found 12500 images belonging to 1 classes.
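Note that flow_from_directory assigns class indices in alphabetical order of the subdirectory names, so 'cat' maps to 0 and 'dog' to 1, matching our earlier manual encoding. A quick check:

# Sketch: verify the class-to-index mapping used by flow_from_directory.
print(train_generator_224.class_indices)   # expected: {'cat': 0, 'dog': 1}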
Saving the feature vectors to disk
import h5py
# Save the feature vectors of each model
def write_feature(model_name, model, train_generator, train_labels, valid_generator, valid_labels, test_generator, batch_size=32):
    # Load the matching weights based on the model name
    if model_name == 'resnet_feature':
        model.load_weights('resnet.h5', by_name=True)
    elif model_name == 'inception_feature':
        model.load_weights('incep.h5', by_name=True)
    else:
        model.load_weights('xcep.h5', by_name=True)
    # Convert to numpy arrays
    train_labels = np.array(train_labels)
    valid_labels = np.array(valid_labels)
    # Run the forward pass to obtain the feature vectors
    train_feature = model.predict_generator(train_generator, int(np.ceil(train_generator.samples/batch_size)), verbose=1)
    valid_feature = model.predict_generator(valid_generator, int(np.ceil(valid_generator.samples/batch_size)), verbose=1)
    test_feature = model.predict_generator(test_generator, int(np.ceil(test_generator.samples/batch_size)), verbose=1)
    print("train_feature.shape:", train_feature.shape)
    print("valid_feature.shape:", valid_feature.shape)
    # Save to disk
    with h5py.File(model_name+'.h5', 'w') as file:
        file.create_dataset("train", data=train_feature, dtype="float32")
        file.create_dataset("train_labels", data=np.array(train_generator.classes), dtype="uint8")
        file.create_dataset("valid", data=valid_feature, dtype="float32")
        file.create_dataset("valid_labels", data=np.array(valid_generator.classes), dtype="uint8")
        file.create_dataset("test", data=test_feature, dtype="float32")
# resnet50
write_feature('resnet_feature',Model(inputs=model.input,outputs=model.layers[-3].output),
train_generator_224,train_labels,valid_generator_224,valid_labels,test_generator_224)
625/625 [==============================] - 85s 136ms/step
157/157 [==============================] - 21s 132ms/step
391/391 [==============================] - 53s 136ms/step
train_feature.shape: (20000, 2048)
valid_feature.shape: (5000, 2048)
# inception
write_feature('inception_feature',Model(inputs=inception_model.input,outputs=inception_model.layers[-3].output),
train_generator_299,train_labels, valid_generator_299,valid_labels,test_generator_299)
625/625 [==============================] - 96s 154ms/step
157/157 [==============================] - 24s 152ms/step
391/391 [==============================] - 60s 155ms/step
train_feature.shape: (20000, 2048)
valid_feature.shape: (5000, 2048)
# xception
write_feature('xception_feature',Model(inputs=xcep_model.input,outputs=xcep_model.layers[-3].output),
train_generator_299,train_labels,valid_generator_299,valid_labels,test_generator_299)
625/625 [==============================] - 228s 364ms/step
157/157 [==============================] - 57s 360ms/step
391/391 [==============================] - 142s 364ms/step
train_feature.shape: (20000, 2048)
valid_feature.shape: (5000, 2048)
Building the ensemble model
feature_files = ['resnet_feature.h5', 'inception_feature.h5', 'xception_feature.h5']
X_train = []
y_train = []
X_valid = []
y_valid = []
X_test = []
for file_name in feature_files:
    with h5py.File(file_name, 'r') as h:
        X_train.append(np.array(h['train']))
        X_valid.append(np.array(h['valid']))
        X_test.append(np.array(h['test']))
        y_train = np.array(h['train_labels'])
        y_valid = np.array(h['valid_labels'])
        print(np.array(h['train']).shape, np.array(h['valid']).shape, np.array(h['test']).shape)
# Concatenate the three 2048-d vectors into a single 6144-d vector per image
X_train = np.concatenate(X_train, axis=1)
X_valid = np.concatenate(X_valid, axis=1)
X_test = np.concatenate(X_test, axis=1)
print("last:", X_train.shape, X_valid.shape, X_test.shape)
(20000, 2048) (5000, 2048) (12500, 2048)
(20000, 2048) (5000, 2048) (12500, 2048)
(20000, 2048) (5000, 2048) (12500, 2048)
last: (20000, 6144) (5000, 6144) (12500, 6144)
Training the ensemble model
We train directly on the extracted feature vectors.
from sklearn.utils import shuffle
# Shuffle the data to randomize the order of the training samples
X_train, y_train = shuffle(X_train, y_train)
import keras.utils
input_tensor = Input(X_train.shape[1:])
x = input_tensor
x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)
concatenate_model = Model(inputs=input_tensor, outputs=x)
concatenate_model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
concatenate_model.fit(X_train, y_train, batch_size=128, epochs=5, validation_data=(X_valid, y_valid))
Train on 20000 samples, validate on 5000 samples
Epoch 1/5
20000/20000 [==============================] - 4s 176us/step - loss: 0.0430 - acc: 0.9850 - val_loss: 0.0093 - val_acc: 0.9978
Epoch 2/5
20000/20000 [==============================] - 1s 45us/step - loss: 0.0109 - acc: 0.9967 - val_loss: 0.0067 - val_acc: 0.9980
Epoch 3/5
20000/20000 [==============================] - 1s 46us/step - loss: 0.0085 - acc: 0.9970 - val_loss: 0.0053 - val_acc: 0.9980
Epoch 4/5
20000/20000 [==============================] - 1s 46us/step - loss: 0.0065 - acc: 0.9982 - val_loss: 0.0051 - val_acc: 0.9982
Epoch 5/5
20000/20000 [==============================] - 1s 45us/step - loss: 0.0057 - acc: 0.9983 - val_loss: 0.0052 - val_acc: 0.9986
Predicting the results
Hint: here we use the clip function, a very effective trick in competitions (note the emphasis on competitions; it makes little difference in real projects). It bounds the predictions away from 0 and 1: log loss penalizes confident wrong answers extremely harshly, so capping the extremes limits the damage any single mistake can do, lowering the loss and improving the leaderboard ranking.
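To see the effect on the numbers: a single confidently wrong prediction such as 0.0001 for a true dog costs -log(0.0001) ≈ 9.2, while the clipped value 0.005 costs only -log(0.005) ≈ 5.3. A quick illustration with sklearn's log_loss (a hypothetical mini-batch, values approximate):

from sklearn.metrics import log_loss

# One confidently wrong prediction dominates the average loss...
y_true = [1, 0, 1, 1]
print(log_loss(y_true, [0.0001, 0.01, 0.99, 0.99]))  # ~2.31
# ...while clipping caps the worst-case per-sample penalty.
print(log_loss(y_true, [0.005, 0.01, 0.99, 0.99]))   # ~1.33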
import pandas as pd
y_pred = concatenate_model.predict(X_test, verbose=1)
# clip the predictions into [0.005, 0.995]
y_pred = y_pred.clip(min=0.005, max=0.995)
df = pd.read_csv("sample_submission.csv")
image_size = (224, 224)
gen = ImageDataGenerator()
# rebuild the test generator (shuffle=False) only to recover the filename
# order that the feature vectors were extracted in
test_generator = gen.flow_from_directory("/content/test2/", image_size, shuffle=False,
                                         batch_size=16, class_mode=None)
for i, fname in enumerate(test_generator.filenames):
    # filenames look like test/1234.jpg; the number maps to the submission row
    index = int(fname[fname.rfind('/')+1:fname.rfind('.')])
    df.at[index-1, 'label'] = float(y_pred[i])   # .at replaces the deprecated set_value
df.to_csv('sample_submission.csv', index=None)
print(df.head(20))
12500/12500 [==============================] - 1s 69us/step
Found 12500 images belonging to 1 classes.
id label
0 1 0.986834
1 2 0.995000
2 3 0.995000
3 4 0.995000
4 5 0.005000
5 6 0.005000
6 7 0.005000
7 8 0.005000
8 9 0.005000
9 10 0.005000
10 11 0.005000
11 12 0.995000
12 13 0.005000
13 14 0.005000
14 15 0.005000
15 16 0.005000
16 17 0.995000
17 18 0.995000
18 19 0.005000
19 20 0.005000
Submitting the results
!kaggle competitions submit -c dogs-vs-cats-redux-kernels-edition -f sample_submission.csv -m "Message"
100% 306k/306k [00:01<00:00, 238kB/s]
Successfully submitted to Dogs vs. Cats Redux: Kernels Edition
Final score: 0.07584
Project link