TensorFlow 2.0 笔记(六)—— 过拟合与欠拟合

Mismatch: ground-truth VS estimated

  • model capacity
    $$y = \beta_0 + \beta_1 x + \beta_2 x^2 + \beta_3 x^3 + \dots + \beta_n x^n + \epsilon$$

  • under-fitting: Estimated < ground-truth

    • train acc. is bad
    • test acc. is bad as well
  • over-fitting: Estimated > ground-truth

    • train loss and acc. are much better
    • test acc. is worse
    • => Generalization Performance

Detect over-fitting
split dataset: Train Set, Val Set, Test Set

# Load Fashion-MNIST through TensorFlow Datasets together with the metadata
# object that carries the size of every split.
dataset, metadata = tfds.load(
    'fashion_mnist',
    as_supervised=True,
    with_info=True,
)
train_dataset = dataset['train']
test_data = dataset['test']

# Reserve a fixed number of examples of the official train split for
# validation; the remainder is used for training.
num_val_examples = 10000
num_test_examples = metadata.splits['test'].num_examples
num_train_examples = metadata.splits['train'].num_examples - num_val_examples

# Split the train dataset into a train part and a validation part: the first
# num_train_examples elements train, the following num_val_examples validate.
train_data = train_dataset.take(num_train_examples)
val_data = train_dataset.skip(num_train_examples).take(num_val_examples)
# Alternative split that works directly on the arrays returned by Keras.
(x, y), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# One random permutation of the indices 0..59999: its first 50000 entries
# select the training examples and its last 10000 the validation examples,
# so the two parts never share an example.
idx = tf.random.shuffle(tf.range(60000))
train_idx, val_idx = idx[:50000], idx[-10000:]
x_train, y_train = tf.gather(x, train_idx), tf.gather(y, train_idx)
x_val, y_val = tf.gather(x, val_idx), tf.gather(y, val_idx)

# Wrap each part in a tf.data.Dataset of (image, label) pairs.
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
val_data = tf.data.Dataset.from_tensor_slices((x_val, y_val))
test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test))

train test trade-off
K-fold cross-validation

  • merge train/val sets
  • randomly sample 1/k as val set
# validation_split is only accepted for array/tensor inputs; Keras rejects it
# when the input is a tf.data.Dataset, so the merged train+val arrays are
# passed directly. Keras holds out the LAST 10% of the samples (taken before
# any shuffling) as the validation set, so shuffle the arrays beforehand if a
# random hold-out is wanted.
# NOTE(review): x and y must already be in the form the model expects
# (scaling, flattening, label encoding) — confirm against the model in use.
model.fit(x, y, epochs=10, validation_split=0.1, validation_freq=1)

Reduce Overfitting

  • More data
  • Constrain model complexity
    • shallow
    • regularization
  • Dropout
  • Data augmentation
  • Early Stopping
import os
import tensorflow as tf

# Lower the verbosity of TensorFlow's native logging.
# NOTE(review): this runs after `import tensorflow`; the variable is usually
# set before the import — confirm it still takes effect here.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

(x, y), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# One random permutation of the indices 0..59999: its first 50000 entries
# select the training examples and its last 10000 the validation examples.
idx = tf.random.shuffle(tf.range(60000))
train_idx, val_idx = idx[:50000], idx[-10000:]
x_train, y_train = tf.gather(x, train_idx), tf.gather(y, train_idx)
x_val, y_val = tf.gather(x, val_idx), tf.gather(y, val_idx)

# Wrap each part in a tf.data.Dataset of (image, label) pairs.
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
val_data = tf.data.Dataset.from_tensor_slices((x_val, y_val))
test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test))

# Lazy pairing of every image with its label over the full, un-split arrays.
train_val_data = zip(tf.convert_to_tensor(x), tf.convert_to_tensor(y))


def normalize(images, labels):
    """Turn one (image, label) example into model-ready tensors.

    The image is cast to float32, divided by 255 and reshaped to a flat
    vector of 28*28 values; the label is one-hot encoded with depth 10.

    Returns:
        A ``(flat_image, one_hot_label)`` tuple.
    """
    scaled = tf.cast(images, tf.float32) / 255.
    flat = tf.reshape(scaled, [28*28])
    return flat, tf.one_hot(labels, depth=10)


BATCH_SIZE = 100

# Train and validation pipelines: preprocess each example, shuffle with a
# buffer as large as the split, batch, then prefetch.
train_data = (
    train_data
    .map(normalize)
    .shuffle(50000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
val_data = (
    val_data
    .map(normalize)
    .shuffle(10000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# The test pipeline is only preprocessed and batched.
test_data = test_data.map(normalize).batch(BATCH_SIZE)


class MyDense(tf.keras.layers.Layer):
    """Minimal fully connected layer computing ``inputs @ kernel + bias``.

    Args:
        inp_dim: Size of the last dimension of the inputs.
        outp_dim: Number of output units.
    """

    def __init__(self, inp_dim, outp_dim):
        super().__init__()

        # add_weight replaces the deprecated add_variable alias. Keyword
        # arguments are used because the positional order of add_weight is
        # not the same across Keras versions.
        self.kernel = self.add_weight(name='w', shape=[inp_dim, outp_dim])
        self.bias = self.add_weight(name='b', shape=[outp_dim])

    def call(self, inputs, training=None):
        """Apply the affine map; ``training`` is accepted but not used."""
        return inputs @ self.kernel + self.bias


class MyModel(tf.keras.Model):
    """Five-layer perceptron: 784 -> 256 -> 128 -> 64 -> 32 -> 10.

    ReLU follows each of the first four layers; the output of the last
    layer is returned without an activation.
    """

    def __init__(self):
        super(MyModel, self).__init__()
        self.fc1 = MyDense(28*28, 256)
        self.fc2 = MyDense(256, 128)
        self.fc3 = MyDense(128, 64)
        self.fc4 = MyDense(64, 32)
        self.fc5 = MyDense(32, 10)

    def call(self, inputs, training=None, mask=None):
        """Run the forward pass; ``training`` and ``mask`` are not used."""
        hidden = inputs
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = tf.nn.relu(layer(hidden))

        return self.fc5(hidden)


model = MyModel()
# input_shape is (batch, features): a flat [None, 784], not a nested list.
model.build(input_shape=[None, 28*28])
model.summary()
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
              loss=tf.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Option 1: validate on the explicit validation pipeline.
# model.fit(train_data, epochs=5, validation_data=val_data, validation_freq=1)

# Option 2: let Keras hold out the last 10% of the merged train+val samples.
# validation_split requires array inputs (it is rejected for tf.data.Dataset),
# so the raw arrays get the same preprocessing as normalize(): scale to
# [0, 1], flatten to 784 features and one-hot encode the labels.
train_val_x = x.reshape(-1, 28*28).astype('float32') / 255.
train_val_y = tf.one_hot(y, depth=10).numpy()
model.fit(train_val_x, train_val_y, epochs=5, validation_split=0.1, validation_freq=1)

model.evaluate(test_data)

Regularization
$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y_i \ln \hat{y}_i + (1 - y_i)\ln(1 - \hat{y}_i)\right] + \lambda\sum_{i=1}^{n}|\theta_i|$$

  • L1-regularization
    $$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y_i \ln \hat{y}_i + (1 - y_i)\ln(1 - \hat{y}_i)\right] + \lambda\sum_{i=1}^{n}|\theta_i|$$
  • L2-regularization
    $$J(W; X, y) + \frac{1}{2}\lambda \cdot \|W\|^2$$
# Small binary classifier whose two hidden layers add an L2 penalty
# (factor 0.001) on their kernels to the training loss.
l2_model = tf.keras.models.Sequential([
    # input_shape describes ONE sample and must not include the batch
    # dimension, hence (784,) rather than [None, 784].
    tf.keras.layers.Dense(16, kernel_regularizer=tf.keras.regularizers.l2(0.001),
                          activation=tf.nn.relu,
                          input_shape=(28*28,)),
    tf.keras.layers.Dense(16, kernel_regularizer=tf.keras.regularizers.l2(0.001),
                          activation=tf.nn.relu),
    # Single sigmoid unit as the output.
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
])
for step, (x, y) in enumerate(train_data):
    with tf.GradientTape() as tape:
        # The forward pass must run inside the tape so that gradients can be
        # taken with respect to the model weights.
        # NOTE(review): assumes x still needs flattening to 784 features and
        # y holds integer class ids — confirm against how train_data is built
        # (drop the one_hot call if the labels are already one-hot).
        x = tf.reshape(x, (-1, 28*28))
        out = model(x, training=True)
        y_onehot = tf.one_hot(y, depth=10)
        loss = tf.reduce_mean(
            tf.losses.categorical_crossentropy(y_onehot, out, from_logits=True))

        # Manual L2 penalty: tf.nn.l2_loss of every trainable variable,
        # summed over all of them.
        loss_regularization = tf.reduce_sum(tf.stack(
            [tf.nn.l2_loss(p) for p in model.trainable_variables]))

        loss = loss + 0.0001 * loss_regularization

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

momentum
Plain gradient descent: $$w^{k+1} = w^k - \alpha\nabla f(w^k)$$
With momentum: $$z^{k+1} = \beta z^k + \nabla f(w^k)$$
$$w^{k+1} = w^k - \alpha z^{k+1}$$

# Three alternative optimizers. Every assignment rebinds `optimizer`, so as
# written only the last one (Adam) stays in effect — keep the one you need.
optimizer = tf.keras.optimizers.SGD(
    learning_rate=0.02,
    momentum=0.9,
)
optimizer = tf.keras.optimizers.RMSprop(
    learning_rate=0.02,
    momentum=0.9,
)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.02,
    beta_1=0.9,
    beta_2=0.999,
)

learning rate decay

INITIAL_LR = 0.2
NUM_EPOCHS = 100

optimizer = tf.keras.optimizers.SGD(learning_rate=INITIAL_LR)
for epoch in range(NUM_EPOCHS):
    # 1. compute the loss (omitted in this sketch)

    # 2. linear decay: the rate starts at INITIAL_LR in epoch 0 and shrinks
    #    by INITIAL_LR / NUM_EPOCHS with every further epoch
    remaining_epochs = NUM_EPOCHS - epoch
    optimizer.learning_rate = INITIAL_LR * remaining_epochs / NUM_EPOCHS

    # 3. update the weights (omitted in this sketch)

Early Stopping

  • Validation set to select parameters
  • Monitor validation performance
  • Stop at the highest val perf

Dropout

  • Learning less to learn better
  • Each connection is dropped with probability $p \in [0, 1]$
# MLP with hidden widths 256, 128, 64 and 32 (all ReLU). The first two hidden
# layers are each followed by Dropout(0.5); the last layer has 10 units and
# no activation.
model = tf.keras.Sequential()
for units, drop_rate in ((256, 0.5), (128, 0.5), (64, None), (32, None)):
    model.add(tf.keras.layers.Dense(units, activation=tf.nn.relu))
    if drop_rate is not None:
        model.add(tf.keras.layers.Dropout(drop_rate))
model.add(tf.keras.layers.Dense(10))
# Sketch of how the `training` flag is passed to a model containing Dropout.
for step, (x,y) in enumerate(train_data):
    with tf.GradientTape() as tape:
        # Flatten every image of the batch to 28*28 features.
        x = tf.reshape(x, (-1, 28*28))
        # Training step: call the model with training=True.
        out = model(x, training=True)
        # Validation would instead call: out = model(x, training=False)
    # Test: call the model with training=False.
    out = model(x, training=False)

Stochastic Gradient Descent

  • Stochastic
    • not random!
  • Deterministic
  • Not single usually
  • batch = 16, 32, 64, 128…
import os
import tensorflow as tf


# Lower the verbosity of TensorFlow's native logging.
# NOTE(review): this runs after `import tensorflow`; the variable is usually
# set before the import — confirm it still takes effect here.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# x, y are split into train/validation parts later; x_test, y_test feed the
# test pipeline.
(x, y), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()


def normalize(images, labels):
    """Scale and flatten one image; the label is passed through unchanged.

    The image is cast to float32, divided by 255 and reshaped to a flat
    vector of 28 * 28 values.

    Returns:
        A ``(flat_image, labels)`` tuple.
    """
    scaled = tf.cast(images, tf.float32) / 255.
    return tf.reshape(scaled, [28 * 28]), labels


# MLP with hidden widths 256, 128, 64 and 32 (all ReLU); the last layer has
# 10 units and no activation.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation=tf.nn.relu),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(64, activation=tf.nn.relu),
    tf.keras.layers.Dense(32, activation=tf.nn.relu),
    tf.keras.layers.Dense(10)
])

model.build(input_shape=[None, 28*28])
model.summary()
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

# Running metrics: validation accuracy, mean training loss, test accuracy.
acc_meter = tf.keras.metrics.Accuracy()
loss_meter = tf.keras.metrics.Mean()
test_meter = tf.keras.metrics.Accuracy()

BATCH_SIZE = 128
# The test pipeline is built once: it does not change between epochs.
test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_data = test_data.map(normalize).batch(BATCH_SIZE)

def _update_accuracy(dataset, meter):
    """Feed the model's argmax predictions over ``dataset`` into ``meter``."""
    for images, labels in dataset:
        probs = tf.nn.softmax(model(images), axis=1)
        meter.update_state(labels, tf.argmax(probs, axis=1))


for epoch in range(5):
    # Draw a fresh 50000/10000 train/validation split at the start of every
    # epoch from one random permutation of the indices 0..59999.
    perm = tf.random.shuffle(tf.range(60000))
    train_idx, val_idx = perm[:50000], perm[-10000:]

    # Build the datasets for this epoch's split.
    train_data = (
        tf.data.Dataset
        .from_tensor_slices((tf.gather(x, train_idx), tf.gather(y, train_idx)))
        .map(normalize)
        .shuffle(50000)
        .batch(BATCH_SIZE)
    )
    val_data = (
        tf.data.Dataset
        .from_tensor_slices((tf.gather(x, val_idx), tf.gather(y, val_idx)))
        .map(normalize)
        .shuffle(10000)
        .batch(BATCH_SIZE)
    )

    for step, (batch_x, batch_y) in enumerate(train_data):
        with tf.GradientTape() as tape:
            logits = model(batch_x)
            targets = tf.one_hot(batch_y, depth=10)
            loss_ce = tf.reduce_mean(
                tf.losses.categorical_crossentropy(targets, logits, from_logits=True))
            loss_meter.update_state(loss_ce)

        grads = tape.gradient(loss_ce, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Validation runs only on every 10th step.
        if step % 10 != 0:
            continue

        _update_accuracy(val_data, acc_meter)
        print(epoch, step, 'loss:', loss_meter.result().numpy(),
              'Evaluate Acc:', acc_meter.result().numpy())
        # Start the next reporting window from scratch.
        loss_meter.reset_states()
        acc_meter.reset_states()

    # Test accuracy at the end of every epoch.
    _update_accuracy(test_data, test_meter)
    print(epoch, 'Test Acc:', test_meter.result().numpy())
    test_meter.reset_states()

你可能感兴趣的:(机器学习,TensorFlow2.0)