Automatically inferring the input shape
x = tf.random.normal([4, 784])
net = tf.keras.layers.Dense(512)
out = net(x)                       # the first call builds the layer from x's shape
out.shape                          # TensorShape([4, 512])
net.kernel.shape, net.bias.shape   # (TensorShape([784, 512]), TensorShape([512]))
net.build(input_shape=(None, 784))  # build() can also be called explicitly; shape chosen to match x above
build() can be called multiple times; each call re-creates the weights for the given input shape.
You can also set the input shape in advance with build(); if a later input does not match it, the layer rejects it.
net = tf.keras.layers.Dense(10)
net.bias #AttributeError: 'Dense' object has no attribute 'bias'
net.get_weights() #[]
net.weights #[]
net.build(input_shape=(None, 4))
net.kernel.shape, net.bias.shape  # (TensorShape([4, 10]), TensorShape([10]))
net.build(input_shape=(None, 20))
net.kernel.shape, net.bias.shape  # (TensorShape([20, 10]), TensorShape([10]))
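A minimal sketch of the mismatch behaviour (shapes illustrative): after build(input_shape=(None, 20)) the kernel is (20, 10), so only inputs whose last dimension is 20 are accepted.
net(tf.random.normal([4, 20]))    # OK: (4, 20) @ (20, 10) -> (4, 10)
# net(tf.random.normal([4, 12]))  # error: last dimension 12 does not match the built kernel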
Multi-layer fully connected network
tf.keras.Sequential()
x = tf.random.normal([2, 3])
model = tf.keras.Sequential([
    tf.keras.layers.Dense(2, activation='relu'),
    tf.keras.layers.Dense(2, activation='relu'),
    tf.keras.layers.Dense(2),
])
model.build(input_shape=[None, 3])
model.summary()
for p in model.trainable_variables:
    print(p.name, p.shape)
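A quick forward pass through the built model, reusing the x defined above (a minimal sketch):
out = model(x)   # (2, 3) -> Dense(2) -> Dense(2) -> Dense(2)
out.shape        # TensorShape([2, 2])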
- $y \in R^d$
- $y_i \in [0, 1], \quad i = 0,1,\dots,d-1$
- $y_i \in [0, 1],\ \sum_{i=0}^{d-1}y_i=1, \quad i = 0,1,\dots,d-1$
- $y_i \in [-1, 1], \quad i = 0,1,\dots,d-1$
- linear regression
- naïve classification with MSE
- other general prediction
- out = relu(X@W + b)
▪ logits
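For the $y \in R^d$ case the raw network output is used directly as logits; a minimal sketch (layer width and input shape are arbitrary):
out = tf.keras.layers.Dense(3)(tf.random.normal([2, 4]))  # no activation: raw scores ("logits")
tf.reduce_min(out), tf.reduce_max(out)                     # unbounded real values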
a = tf.linspace(-6., 6., 10)
tf.sigmoid(a)
x = tf.random.normal([1, 28, 28]) * 5
tf.reduce_min(x), tf.reduce_max(x)   # values far outside (0, 1)
x = tf.sigmoid(x)
tf.reduce_min(x), tf.reduce_max(x)   # now squashed into (0, 1)
- binary classification
▪ y > 0.5 → 1
▪ y < 0.5 → 0
- Image Generation
▪ rgb
- out = relu(X@W + b)
- sigmoid
▪ tf.sigmoid
▪ tf.nn.sigmoid
$f(x)=\frac{1}{1+e^{-x}}$
- out' = sigmoid(out)
- sigmoid alone cannot enforce $\sum_i y_i = 1$ over the outputs
- softmax
Meet SoftMax
$\sigma(z)_j=\frac{e^{z_j}}{\sum_{k=1}^{K}e^{z_k}} \quad \text{for } j=1,2,\dots,K.$
Classification
a = tf.linspace(-2., 2, 5)
tf.nn.sigmoid(a)
tf.nn.softmax(a)
logits = tf.random.uniform([1, 10], minval=-2, maxval=2)
prob = tf.nn.softmax(logits, axis=1)
tf.reduce_sum(prob, axis=1)
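The softmax formula can be checked directly against tf.nn.softmax, reusing logits from above (a minimal sketch):
tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis=1, keepdims=True)  # matches prob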
- tanh
$\tanh(x)=\frac{\sinh(x)}{\cosh(x)}=\frac{e^x-e^{-x}}{e^x+e^{-x}}$
a = tf.linspace(-2., 2, 5)
tf.nn.tanh(a)
tf.tanh(a)
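tanh is a scaled and shifted sigmoid, which is easy to verify numerically (a minimal sketch):
2 * tf.sigmoid(2 * a) - 1  # identical to tf.tanh(a): tanh(x) = 2*sigmoid(2x) - 1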
Loss Functions
- MSE
- Cross Entropy Loss
- Hinge Loss
$\sum_i \max(0,\ 1-y_i \cdot h_\theta(x_i))$ (see the numerical sketch after the MSE code below)
- $loss=\frac{1}{N}\sum(y-out)^2$
- $L_{2\text{-}norm}=\sqrt{\sum(y-out)^2}$
y = tf.constant([1, 2, 3, 0, 2])
y = tf.one_hot(y, depth=4)
y = tf.cast(y, dtype=tf.float32)
out = tf.random.normal([5, 4])
loss1 = tf.reduce_mean(tf.square(y - out))
loss2 = tf.square(tf.norm(y - out)) / (5*4)    # same value via the L2 norm
loss3 = tf.reduce_mean(tf.losses.MSE(y, out))  # tf.losses.MSE is a function; MeanSquaredError is the class form
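The hinge loss listed above can be checked the same way; a minimal sketch with arbitrary, hypothetical labels in {-1, +1} and raw scores:
y_true = tf.constant([1., -1., 1., -1.])              # hypothetical labels
y_pred = tf.constant([0.8, 0.3, -0.5, -2.0])          # hypothetical scores h_theta(x)
tf.reduce_sum(tf.maximum(0., 1. - y_true * y_pred))   # summed hinge loss as in the formula
tf.losses.hinge(y_true, y_pred)                       # built-in form (mean over samples instead of sum)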
- Uncertainty
- measure of surprise
- lower entropy → more certainty
$Entropy = -\sum_i P(i)\log P(i)$
a = tf.fill([4], 0.25)
a * tf.math.log(a) / tf.math.log(2.)                   # log base 2 via change of base
-tf.reduce_sum(a * tf.math.log(a) / tf.math.log(2.))   # 2.0 bits: uniform, maximum uncertainty
a = tf.constant([0.1, 0.1, 0.1, 0.7])
-tf.reduce_sum(a * tf.math.log(a) / tf.math.log(2.))   # ≈ 1.36 bits
a = tf.constant([0.01, 0.01, 0.01, 0.97])
-tf.reduce_sum(a * tf.math.log(a) / tf.math.log(2.))   # ≈ 0.24 bits: lowest entropy, most certain
$H(p,q)=-\sum p(x)\log q(x)$
$H(p,q)=H(p)+D_{KL}(p\|q)$
for $p = q$:
$\quad$ Minima: $H(p,q)=H(p)$
for $p$: one-hot encoding:
$\quad h(p:[0,1,0])=-1\log 1=0$
$\quad H([0,1,0],[p_0,p_1,p_2])=0+D_{KL}(p\|q)=-1\log q_1$
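A quick numerical check of $H(p,q)=-\sum p(x)\log q(x)$ for a one-hot $p$ (a minimal sketch; the values of $q$ are arbitrary):
p = tf.constant([0., 1., 0.])
q = tf.constant([0.1, 0.7, 0.2])
-tf.reduce_sum(p * tf.math.log(q))        # -log(0.7) ≈ 0.357 (natural log)
tf.losses.categorical_crossentropy(p, q)  # same value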
Binary Classification
Single output
$H(P,Q)=-P(cat)\log Q(cat)-(1-P(cat))\log(1-Q(cat))$
$P(dog)=(1-P(cat))$
$H(P,Q)=-\sum_{i\in\{cat,dog\}}P(i)\log Q(i)=-P(cat)\log Q(cat)-P(dog)\log Q(dog)=-(y\log(p)+(1-y)\log(1-p))$
tf.losses.categorical_crossentropy([0, 1, 0, 0], [0.25, 0.25, 0.25, 0.25])  # -log(0.25) ≈ 1.386
tf.losses.categorical_crossentropy([0, 1, 0, 0], [0.1, 0.1, 0.8, 0.1])      # -log(0.1)  ≈ 2.303, confidently wrong
tf.losses.categorical_crossentropy([0, 1, 0, 0], [0.1, 0.7, 0.1, 0.1])      # -log(0.7)  ≈ 0.357
tf.losses.categorical_crossentropy([0, 1, 0, 0], [0.01, 0.97, 0.01, 0.01])  # -log(0.97) ≈ 0.030
tf.losses.CategoricalCrossentropy()([0, 1, 0, 0], [0.1, 0.7, 0.1, 0.1])
tf.losses.CategoricalCrossentropy()([0, 1], [0.9, 0.1])
tf.losses.BinaryCrossentropy()([1], [0.1])
tf.losses.binary_crossentropy([1], [0.1])
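The single-output formula $-(y\log(p)+(1-y)\log(1-p))$ can be verified by hand (a minimal sketch; y and p are arbitrary):
y, p = 1., 0.1
-(y * tf.math.log(p) + (1 - y) * tf.math.log(1 - p))  # ≈ 2.303
tf.losses.binary_crossentropy([y], [p])               # same value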
Categorical Cross Entropy
Why not MSE?
- sigmoid + MSE
▪ gradient vanishing (see the sketch below)
▪ converges slower
- However
▪ e.g. meta-learning still uses MSE
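A minimal sketch of the gradient-vanishing point, using a single scalar weight and a saturated sigmoid (all values are illustrative):
x = tf.constant(10.)                       # large input drives the sigmoid into saturation
y = tf.constant(0.)                        # target
w = tf.Variable(1.)
with tf.GradientTape(persistent=True) as tape:
    pred = tf.sigmoid(w * x)               # ~0.99995, far from the target
    mse = tf.square(y - pred)
    bce = tf.losses.binary_crossentropy([y], [pred])
tape.gradient(mse, w)                      # ~1e-3: the MSE gradient has nearly vanished
tape.gradient(bce, w)                      # ~10: cross entropy still gives a strong signal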
x = tf.random.normal([1, 784])
w = tf.random.normal([784, 2])
b = tf.zeros([2])
logits = x@w + b
prob = tf.math.softmax(logits, axis=1)
tf.losses.categorical_crossentropy([0, 1], logits, from_logits=True)  # recommended: pass raw logits
tf.losses.categorical_crossentropy([0, 1], prob)                      # passing probabilities can be numerically unstable
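The class form takes the same flag, so an equivalent and equally stable call is, as a minimal sketch:
tf.losses.CategoricalCrossentropy(from_logits=True)([0, 1], logits)  # same numerically stable path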