4.1 The four branches of machine learning
4.1.1 Supervised learning 4.1.2 Unsupervised learning 4.1.3 Self-supervised learning 4.1.4 Reinforcement learning
4.2 Evaluating machine-learning models
4.2.1 Training, validation, and test sets 4.2.2 Things to keep in mind when evaluating models
4.3 Data preprocessing, feature engineering, and feature learning
4.3.1 Data preprocessing for neural networks 4.3.2 Feature engineering
4.4 Overfitting and underfitting
4.4.1 Reducing the network's size 4.4.2 Adding weight regularization (complete code)
4.4.3 Adding dropout regularization (complete code)
4.5 The universal workflow of machine learning
4.5.1 Defining the problem and assembling a dataset 4.5.2 Choosing a measure of success 4.5.3 Deciding on an evaluation protocol 4.5.4 Preparing the data
4.5.5 Developing a model that beats a baseline 4.5.6 Scaling up: developing a model that overfits 4.5.7 Regularizing the model and tuning hyperparameters
This chapter focuses on the practical concerns of solving machine-learning tasks: model evaluation, data preprocessing, feature engineering, and tackling overfitting.
Supervised learning (supervised learning): multiclass classification, binary classification, and scalar regression problems.
Unsupervised learning (unsupervised learning): dimensionality reduction (dimensionality reduction) and clustering (clustering).
Self-supervised learning (self-supervised learning): for example autoencoders (autoencoder), where the target is the unmodified input. Predicting the next frame of a video given past frames, or the next word in a text given the preceding words, likewise counts as temporally supervised learning (temporally supervised learning).
Depending on what you pay attention to, self-supervised learning can be reinterpreted as either unsupervised or supervised learning.
Reinforcement learning (reinforcement learning): applicable to self-driving cars, robotics, resource management, education, and more.
As the examples in the previous chapter showed, all three models began to overfit after only a few epochs of training. The purpose of machine learning is to obtain models that generalize (generalize), i.e. perform well on never-before-seen data.
Information leak (information leak): every time you tune a model's hyperparameters based on its performance on the validation set, some information about the validation data leaks into the model; repeated over many rounds, this quickly causes the model to overfit the validation set.
Three classic evaluation recipes: simple hold-out validation, K-fold validation, and iterated K-fold validation with shuffling.
1. Simple hold-out validation (hold-out validation)
Set apart some fraction of the data as a validation set, train the model on the remaining data, and evaluate it on the held-out portion.
# 4-1 Hold-out validation (book pseudocode: data, test_data, and get_model
# are assumed to be defined; train/evaluate are not real Keras API)
import numpy as np
num_validation_samples = 10000
# Shuffle the data
np.random.shuffle(data)
# Define the validation set
validation_data = data[:num_validation_samples]
data = data[num_validation_samples:]
training_data = data[:]
model = get_model()
model.train(training_data)
validation_score = model.evaluate(validation_data)
# Tune the model, retrain it, evaluate it, tune it again...
# Once hyperparameters are tuned, train the final model on all non-test data
model = get_model()
model.train(np.concatenate([training_data, validation_data]))
test_score = model.evaluate(test_data)
Drawback: if little data is available, the validation and test sets may contain too few samples to be statistically representative of the data.
2. K-fold validation (K-fold validation): split the data into K partitions of equal size. For each partition i, train a model on the remaining K - 1 partitions and evaluate it on partition i; the final score is the average of the K scores obtained.
# 4-2 K-fold cross-validation (same pseudocode conventions as listing 4-1)
k = 4
num_validation_samples = len(data) // k
np.random.shuffle(data)
validation_scores = []
for fold in range(k):
    # Select the validation partition for this fold
    validation_data = data[num_validation_samples * fold:
                           num_validation_samples * (fold + 1)]
    # Use the remainder of the data as training data
    training_data = np.concatenate(
        [data[:num_validation_samples * fold],
         data[num_validation_samples * (fold + 1):]])
    model = get_model()    # create a brand-new, untrained model
    model.train(training_data)
    validation_score = model.evaluate(validation_data)
    validation_scores.append(validation_score)
# Final validation score: the average of the K folds
validation_score = np.average(validation_scores)
# Train the final model on all non-test data available
model = get_model()
model.train(data)
test_score = model.evaluate(test_data)
3. Iterated K-fold validation with shuffling (iterated K-fold validation with shuffling)
Apply K-fold validation P times, shuffling the data before each run; in total you train and evaluate P x K models. This is computationally expensive, but it yields a more reliable estimate when little data is available, and it is very useful in Kaggle competitions.
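A minimal sketch of the procedure, reusing the book's pseudocode interface from listings 4-1 and 4-2 (data, get_model, and the train/evaluate methods are assumed to be defined; they are not real Keras API):
# Iterated K-fold validation with shuffling (sketch)
p = 3    # number of shuffled iterations
k = 4    # number of folds per iteration
all_scores = []
for iteration in range(p):
    # Reshuffle the data before every K-fold run
    np.random.shuffle(data)
    num_validation_samples = len(data) // k
    for fold in range(k):
        validation_data = data[num_validation_samples * fold:
                               num_validation_samples * (fold + 1)]
        training_data = np.concatenate(
            [data[:num_validation_samples * fold],
             data[num_validation_samples * (fold + 1):]])
        model = get_model()
        model.train(training_data)
        all_scores.append(model.evaluate(validation_data))
# Final score: the average over all P x K runs
validation_score = np.average(all_scores)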
Data preprocessing aims to make the raw data more amenable to neural networks; it includes vectorization, normalization, handling missing values, and feature extraction.
1. Vectorization (data vectorization)
All inputs and targets of a neural network must be tensors of floating-point data.
2. Value normalization
Feature values should be small (typically in the 0-1 range) and homogeneous (homogeneous), i.e. all features should take values in roughly the same range.
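A minimal numpy sketch of the usual recipe, normalizing each feature independently to zero mean and unit standard deviation (x is a hypothetical (samples, features) array):
# Feature-wise normalization (sketch)
import numpy as np
x = np.random.rand(100, 3) * [1.0, 100.0, 1000.0]   # features on very different scales
x -= x.mean(axis=0)   # center each feature on 0
x /= x.std(axis=0)    # scale each feature to unit standard deviation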
3. Handling missing values: with neural networks it is generally safe to input missing values as 0, as long as 0 isn't already a meaningful value, since the network can learn to ignore it.
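A sketch of that substitution on a hypothetical array with NaN gaps:
# Replacing missing values with 0 (sketch)
x = np.array([[1.0, np.nan, 3.0],
              [4.0, 5.0, np.nan]])
x[np.isnan(x)] = 0.0   # 0 acts as a neutral "missing" marker here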
Feature engineering: understand the problem at a higher level and hand-craft data representations that make the learning algorithm's job easier, e.g. reading the time from a clock image via the hands' angles rather than raw pixels (sketch below).
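A hypothetical sketch of that clock example, assuming the (x, y) tip coordinates of each hand relative to the clock center are already available (the helper name hand_angle is made up for illustration):
# Engineering "hand angle" features from raw coordinates (sketch)
import numpy as np
def hand_angle(x, y):
    # Clockwise angle from 12 o'clock, in degrees
    return np.degrees(np.arctan2(x, y)) % 360
hour_angle = hand_angle(0.5, 0.5)    # 45 degrees: hour hand halfway between 1 and 2
minute_angle = hand_angle(1.0, 0.0)  # 90 degrees: minute hand at 15 minutes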
Methods for mitigating overfitting: regularization (regularization).
# 4-3 Original model
from keras import models
from keras import layers
model = models.Sequential()
model.add(layers.Dense(16, activation = 'relu', input_shape=(10000,)))
model.add(layers.Dense(16, activation = 'relu'))
model.add(layers.Dense(1, activation = 'sigmoid'))
# 4-4 Version of the model with lower capacity
model = models.Sequential()
model.add(layers.Dense(4, activation = 'relu', input_shape=(10000,)))
model.add(layers.Dense(4, activation = 'relu'))
model.add(layers.Dense(1, activation = 'sigmoid'))
We now test this on the IMDB data used earlier.
# Testing on the IMDB data
# 3-1 Loading the IMDB dataset
from keras.datasets import imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
print(train_data[0])
print(train_labels[0])
# Quickly decode a review back into English words
word_index = imdb.get_word_index()
reverse_word_index = dict(
[(value, key) for (key, value) in word_index.items()])
decoded_review = ' '.join(
[reverse_word_index.get(i - 3, '?') for i in train_data[0]])
# 3-2 Encoding the integer sequences into a binary matrix
import numpy as np
def vectorize_sequences(sequences, dimension = 10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
print(x_train[0])
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')
# 3-7 Setting aside a validation set
x_val = x_train[:10000]
partial_x_train = x_train[10000:]
y_val = y_train[:10000]
partial_y_train = y_train[10000:]
# 4-3 Original model
from keras import models
from keras import layers
model1 = models.Sequential()
model1.add(layers.Dense(16, activation = 'relu', input_shape=(10000,)))
model1.add(layers.Dense(16, activation = 'relu'))
model1.add(layers.Dense(1, activation = 'sigmoid'))
# 4-4 Version of the model with lower capacity
model2 = models.Sequential()
model2.add(layers.Dense(4, activation = 'relu', input_shape=(10000,)))
model2.add(layers.Dense(4, activation = 'relu'))
model2.add(layers.Dense(1, activation = 'sigmoid'))
# 3-4 Compiling the models
model1.compile(optimizer = 'rmsprop',
loss = 'binary_crossentropy',
metrics = ['accuracy'])
model2.compile(optimizer = 'rmsprop',
loss = 'binary_crossentropy',
metrics = ['accuracy'])
history1 = model1.fit(partial_x_train,
partial_y_train,
epochs = 20,
batch_size = 512,
validation_data = (x_val, y_val))
history2 = model2.fit(partial_x_train,
partial_y_train,
epochs = 20,
batch_size = 512,
validation_data = (x_val, y_val))
# 3-9 Plotting the validation losses
import matplotlib.pyplot as plt
history1_dict = history1.history
history2_dict = history2.history
loss_values = history1_dict['loss']
val_loss_values1 = history1_dict['val_loss']
val_loss_values2 = history2_dict['val_loss']
epochs = range(1, len(loss_values) + 1)
plt.plot(epochs, val_loss_values1, '+', label = 'Original model')
plt.plot(epochs, val_loss_values2, 'bo', label = 'Smaller model')
plt.title('Original vs. smaller model: validation loss')
plt.xlabel('Epochs')
plt.ylabel('Validation Loss')
plt.legend()
plt.show()
Clearly, the smaller network starts overfitting later than the reference network.
# 4-5 Version of the model with higher capacity
model3 = models.Sequential()
model3.add(layers.Dense(512, activation = 'relu', input_shape=(10000,)))
model3.add(layers.Dense(512, activation = 'relu'))
model3.add(layers.Dense(1, activation = 'sigmoid'))
model3.compile(optimizer = 'rmsprop',
loss = 'binary_crossentropy',
metrics = ['accuracy'])
history3 = model3.fit(partial_x_train,
partial_y_train,
epochs = 20,
batch_size = 512,
validation_data = (x_val, y_val))
import matplotlib.pyplot as plt
history1_dict = history1.history
history3_dict = history3.history
loss_values = history1_dict['loss']
val_loss_values1 = history1_dict['val_loss']
val_loss_values3 = history3_dict['val_loss']
epochs = range(1, len(loss_values) + 1)
plt.plot(epochs, val_loss_values1, '+', label = 'Original model')
plt.plot(epochs, val_loss_values3, 'bo', label = 'Bigger model')
plt.title('Original vs. bigger model: validation loss')
plt.xlabel('Epochs')
plt.ylabel('Validation Loss')
plt.legend()
plt.show()
Clearly, the bigger network starts overfitting one epoch earlier, and its overfitting is also far more severe.
import matplotlib.pyplot as plt
history1_dict = history1.history
history2_dict = history2.history
history3_dict = history3.history
loss_values = history1_dict['loss']
val_loss_values1 = history1_dict['val_loss']
val_loss_values2 = history2_dict['val_loss']
val_loss_values3 = history3_dict['val_loss']
epochs = range(1, len(loss_values) + 1)
plt.plot(epochs, val_loss_values1, '+', label = 'Original model')
plt.plot(epochs, val_loss_values2, 'd', label = 'Smaller model')
plt.plot(epochs, val_loss_values3, 'bo', label = 'Bigger model')
plt.title('Original vs. smaller vs. bigger: validation loss')
plt.xlabel('Epochs')
plt.ylabel('Validation Loss')
plt.legend()
plt.show()
L1 regularization (L1 regularization): the added cost is proportional to the absolute value of the weight coefficients (the L1 norm of the weights).
L2 regularization (L2 regularization): the added cost is proportional to the square of the weight coefficients (the L2 norm of the weights); in the neural-network literature it is also called weight decay.
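As a sketch of what these penalties add to the total loss (lam plays the role of the 0.001 factor used in listing 4-6 below):
# L1/L2 penalty terms (sketch)
import numpy as np
def l2_penalty(weights, lam = 0.001):
    # adds lam * weight ** 2 for every coefficient
    return lam * np.sum(np.square(weights))
def l1_penalty(weights, lam = 0.001):
    # adds lam * abs(weight) for every coefficient
    return lam * np.sum(np.abs(weights))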
# 4-6 Adding L2 weight regularization to the model
from keras import models
from keras import layers
from keras import regularizers
model4 = models.Sequential()
# every coefficient in the layer's weight matrix adds 0.001 * weight_coefficient_value to the total network loss
model4.add(layers.Dense(16, kernel_regularizer = regularizers.l2(0.001), activation = 'relu', input_shape=(10000,)))
model4.add(layers.Dense(16, kernel_regularizer = regularizers.l2(0.001), activation = 'relu'))
model4.add(layers.Dense(1, activation = 'sigmoid'))
model4.compile(optimizer = 'rmsprop',
loss = 'binary_crossentropy',
metrics = ['accuracy'])
history4 = model4.fit(partial_x_train,
partial_y_train,
epochs = 20,
batch_size = 512,
validation_data = (x_val, y_val))
history1_dict = history1.history
history4_dict = history4.history
loss_values = history1_dict['loss']
val_loss_values1 = history1_dict['val_loss']
val_loss_values4 = history4_dict['val_loss']
epochs = range(1, len(loss_values) + 1)
plt.plot(epochs, val_loss_values1, '+', label = 'Original model')
plt.plot(epochs, val_loss_values4, 'bo', label = 'L2 regularization')
plt.title('L2 regularization')
plt.xlabel('Epochs')
plt.ylabel('Validation Loss')
plt.legend()
plt.show()
Dropout, developed by Geoffrey Hinton and his students, randomly sets a fraction of the layer's output values to 0 during training. In Keras you add it via a Dropout layer:
model.add(layers.Dropout(0.5))
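The mechanics behind that one-liner, as a numpy sketch on a hypothetical activation matrix layer_output:
# Dropout mechanics (sketch)
import numpy as np
layer_output = np.random.rand(4, 8)   # hypothetical (batch, features) activations
# At training time: randomly zero out 50% of the values
training_output = layer_output * np.random.randint(0, high = 2, size = layer_output.shape)
# At test time: drop nothing, but scale the output down by the dropout rate
test_output = layer_output * 0.5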
# 4-8 Adding dropout to the IMDB network
model = models.Sequential()
model.add(layers.Dense(16, activation = 'relu', input_shape=(10000,)))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(16, activation = 'relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation = 'sigmoid'))
# 4-8 Adding dropout to the IMDB network (complete code)
model5 = models.Sequential()
model5.add(layers.Dense(16, activation = 'relu', input_shape=(10000,)))
model5.add(layers.Dropout(0.5))
model5.add(layers.Dense(16, activation = 'relu'))
model5.add(layers.Dropout(0.5))
model5.add(layers.Dense(1, activation = 'sigmoid'))
model5.compile(optimizer = 'rmsprop',
loss = 'binary_crossentropy',
metrics = ['accuracy'])
history5 = model5.fit(partial_x_train,
partial_y_train,
epochs = 20,
batch_size = 512,
validation_data = (x_val, y_val))
history1_dict = history1.history
history5_dict = history5.history
loss_values = history1_dict['loss']
val_loss_values1 = history1_dict['val_loss']
val_loss_values5 = history5_dict['val_loss']
epochs = range(1, len(loss_values) + 1)
plt.plot(epochs, val_loss_values1, '+', label = 'Original model')
plt.plot(epochs, val_loss_values5, 'bo', label = 'Dropout')
plt.title('Dropout')
plt.xlabel('Epochs')
plt.ylabel('Validation Loss')
plt.legend()
plt.show()
Exercise: test L1 regularization, as well as L1 and L2 regularization combined; a sketch follows below.
regularizers.l1(0.001)
regularizers.l1_l2(l1=0.001, l2=0.001)
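One way the exercise could be wired up, reusing the architecture of listing 4-6 (the names model6 and model7 are placeholders; compile and fit them exactly as model4 above):
# Exercise sketch: L1 and combined L1+L2 regularization
from keras import models, layers, regularizers
model6 = models.Sequential()
model6.add(layers.Dense(16, kernel_regularizer = regularizers.l1(0.001),
                        activation = 'relu', input_shape=(10000,)))
model6.add(layers.Dense(16, kernel_regularizer = regularizers.l1(0.001),
                        activation = 'relu'))
model6.add(layers.Dense(1, activation = 'sigmoid'))
model7 = models.Sequential()
model7.add(layers.Dense(16, kernel_regularizer = regularizers.l1_l2(l1=0.001, l2=0.001),
                        activation = 'relu', input_shape=(10000,)))
model7.add(layers.Dense(16, kernel_regularizer = regularizers.l1_l2(l1=0.001, l2=0.001),
                        activation = 'relu'))
model7.add(layers.Dense(1, activation = 'sigmoid'))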