An RBF (radial basis function) network is a three-layer network with the following structure:
$$\underbrace{x}_{\mathbb{R}^s}\rightarrow\underbrace{[\phi_1(x;c_1),\dots,\phi_h(x;c_h)]^T}_{\text{hidden layer},\ \mathbb{R}^h}\rightarrow\underbrace{y}_{\mathbb{R}^t}$$
where each $\phi_i:\mathbb{R}^s\rightarrow\mathbb{R}$ is a basis function that is monotonically increasing/decreasing in the radial distance $d(x,c_i)$, e.g. Euclidean distance paired with a Gaussian:
$$\phi_i(x;c_i)=\exp\left(-\frac{\|x-c_i\|_2^2}{2\sigma_i^2}\right)$$
$c_i$ is the parameter of the $i$-th hidden unit, called its centre, a vector of the same dimension as the input $x$. The expression above can be read as the affinity/similarity between $x$ and each centre $c_i$. If the data are clustered and each $c_i$ is taken to be a cluster centre, it can be interpreted as the degree to which $x$ belongs to each cluster. The centres can be obtained beforehand by clustering, or constructed by some other strategy.
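As a quick numerical illustration (a minimal sketch with made-up values, independent of the code below), the Gaussian basis maps the distance to a centre into $(0, 1]$:

import numpy as np

x = np.array([1.0, 2.0, 3.0])   # hypothetical input, s = 3
c = np.array([1.0, 0.0, 3.0])   # hypothetical centre
var = 2.0                       # hypothetical sigma^2
# ||x - c||^2 = 4, so phi = exp(-4 / (2 * 2)) = exp(-1) ~= 0.368
phi = np.exp(-np.sum((x - c) ** 2) / (2 * var))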
This hidden-layer representation amounts to describing the data in a new basis, an idea similar to the anchor graph [6].
The output is a weighted sum of the hidden-layer outputs, i.e. an ordinary fully-connected layer.
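Written out, with weights $W\in\mathbb{R}^{t\times h}$ and bias $b\in\mathbb{R}^t$, the network computes

$$y = W\,[\phi_1(x;c_1),\dots,\phi_h(x;c_h)]^T + b$$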
Using Euclidean distance with a Gaussian as the example, the basic structure of an RBF network is as follows; the hidden units are the main part.
# from tensorflow import keras as K
# DIM_OUT = ...
# euclidean(A, B): pairwise squared Euclidean distances, defined in the full script below

class RBF_Cell(K.Model):
    """hidden unit: the basis function phi"""

    def __init__(self, centre, var):
        """RBF with Gaussian kernel
        centre: tensor, with same shape as input x
        var: scalar, variance of Gaussian
        """
        super(RBF_Cell, self).__init__()
        self.centre = tf.cast(centre[None, :], "float32")  # [1, s]
        self.coef = tf.cast(- 0.5 / var, "float32")

    def call(self, x):
        """[n, s] -> [n, 1]"""
        return tf.math.exp(self.coef * euclidean(x, self.centre))


class RBF_Net(K.Model):
    """RBF network"""

    def __init__(self, centres, variances):
        """X -> [rbf_i(X; C_i)] -> Y
        centres: [#centres, s]
        variances: [#centres]
        """
        super(RBF_Net, self).__init__()
        assert len(centres) == len(variances)
        self.rbf_list = [RBF_Cell(c, v) for c, v in zip(centres, variances)]
        self.fc = K.layers.Dense(DIM_OUT, input_shape=[centres.shape[0]])

    def call(self, x):
        """[n, s] -> [n, #centres] -> [n, t]"""
        x = tf.concat([rbf(x) for rbf in self.rbf_list], axis=1)
        return self.fc(x)
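A hypothetical usage sketch (shapes and values are illustrative only; it assumes numpy as np, DIM_OUT, and euclidean are defined as in the full script below):

# hypothetical: 5 random 3-D centres with unit variances
centres = np.random.rand(5, 3).astype(np.float32)  # [#centres, s]
variances = np.ones(5, dtype=np.float32)           # [#centres]
net = RBF_Net(centres, variances)
y = net(np.random.rand(8, 3).astype(np.float32))   # [8, s] -> [8, DIM_OUT]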
An RBF network can be used for interpolation / function approximation. Take fitting $y=\lg x$ as an example: generate a series of points $(x_i,y_i)_{i=1}^n$, sample the centres uniformly from $[\min\{x_i\},\max\{x_i\}]$, and compute the variance as $\sigma_i^2=\frac{d_{max}^2}{2m}$, where $d_{max}$ is the maximum distance between centres and $m$ is the number of centres.
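For instance, with the defaults used below ($m = 25$ centres sampled uniformly on $[0.1, 10]$), $d_{max} = 9.9$ and the heuristic gives $\sigma^2 = 9.9^2 / (2 \cdot 25) \approx 1.96$. A minimal check of the arithmetic:

import numpy as np
centres = np.linspace(0.1, 10, 25)
d_max = centres.max() - centres.min()   # max pairwise distance on a 1-D grid: 9.9
var = d_max ** 2 / (2 * len(centres))   # sigma^2 = d_max^2 / (2m) ~= 1.96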
import argparse
import tensorflow as tf
from tensorflow import keras as K
import numpy as np
import matplotlib.pyplot as plt
parser = argparse.ArgumentParser()
parser.add_argument("--low", type=float, default=0.1)
parser.add_argument("--high", type=float, default=10)
parser.add_argument("--n_data", type=int, default=50)
parser.add_argument("--n_centre", type=int, default=25)
parser.add_argument("--epoch", type=int, default=500)
args = parser.parse_args()
# generate data
X = np.linspace(args.low, args.high, args.n_data).astype(np.float32)  # [n]
Y = np.log10(X).astype(np.float32)  # [n]
print("X:", X.shape, ", Y:", Y.shape)
def euclidean(A, B=None, sqrt=False):
    """(squared) Euclidean distance, via ||a - b||^2 = a'a - 2 a'b + b'b"""
    if (B is None) or (B is A):
        aTb = tf.matmul(A, tf.transpose(A))
        aTa = bTb = tf.linalg.diag_part(aTb)
    else:
        aTb = tf.matmul(A, tf.transpose(B))
        aTa = tf.linalg.diag_part(tf.matmul(A, tf.transpose(A)))
        bTb = tf.linalg.diag_part(tf.matmul(B, tf.transpose(B)))
    D = aTa[:, None] - 2.0 * aTb + bTb[None, :]
    D = tf.maximum(D, 0.0)  # clip tiny negatives from numerical error
    if sqrt:
        # mask exact zeros to avoid the infinite gradient of sqrt at 0
        mask = tf.cast(tf.equal(D, 0.0), "float32")
        D = D + mask * 1e-16
        D = tf.math.sqrt(D)
        D = D * (1.0 - mask)
    return D
class RBF_Cell(K.Model):
    def __init__(self, centre, var):
        """RBF with Gaussian kernel
        centre: tensor, with same shape as input x
        var: scalar, variance of Gaussian
        """
        super(RBF_Cell, self).__init__()
        self.centre = tf.cast(centre[np.newaxis, :], "float32")  # [1, 1]
        self.coef = tf.cast(- 0.5 / var, "float32")

    def call(self, x):
        """[n, 1] -> [n, 1]"""
        return tf.math.exp(self.coef * euclidean(x, self.centre))
class RBF_Net(K.Model):
    def __init__(self, centres):
        super(RBF_Net, self).__init__()
        # centres are equally spaced, so euclidean(first, last) = d_max^2;
        # hence var = d_max^2 / (2m)
        var = 0.5 * euclidean(centres[-1:], centres[:1]) / centres.shape[0]
        print("var:", var)
        self.rbf_list = [RBF_Cell(c, var) for c in centres]
        self.fc = K.layers.Dense(1, input_shape=[centres.shape[0]])

    def call(self, x):
        """[n, 1] -> [n, #centres] -> [n, 1]"""
        x = tf.concat([rbf(x) for rbf in self.rbf_list], axis=1)
        return self.fc(x)
# sample centres uniformly over the input domain
centres = np.linspace(X.min(), X.max(), args.n_centre)[:, np.newaxis]  # [m, 1]
# print("centres:", centres.shape, '\n', centres)
model = RBF_Net(centres)
optimizer = K.optimizers.Adam()
X_hat = X[:, np.newaxis] # [n, 1]
Y_hat = Y[:, np.newaxis]
print("X^:", X_hat.shape)
loss_list = []
for epoch in range(args.epoch):
    with tf.GradientTape() as tape:
        z = model(X_hat)
        loss = tf.nn.l2_loss(Y_hat - z)
    grad = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grad, model.trainable_variables))
    loss_list.append(loss.numpy())
    print("epoch:", epoch, ", loss:", loss.numpy())
# visualisation
fig = plt.figure()
plt.title("y = lg(x)")
plt.scatter(X, Y, s=25)
plt.plot(X, model(X_hat).numpy())
plt.show()
fig.savefig("log10.png")
An RBF network can also be used for classification; the example below classifies MNIST. Here the number of centres equals the number of classes, with each centre given by the mean of the training samples of that class. Alternatively, the centres can be found by a clustering method such as k-means, in which case the number of centres need not equal the number of classes.
import argparse
import tensorflow as tf
from tensorflow import keras as K
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
parser = argparse.ArgumentParser()
parser.add_argument("--n_centre", type=int, default=10)
parser.add_argument("--n_per", type=int, default=512)
parser.add_argument("--n_class", type=int, default=10)
parser.add_argument("--epoch", type=int, default=10)
parser.add_argument("--batch_size", type=int, default=64)
args = parser.parse_args()
def euclidean(A, B=None, sqrt=False):
    """(squared) Euclidean distance, same as above"""
    if (B is None) or (B is A):
        aTb = tf.matmul(A, tf.transpose(A))
        aTa = bTb = tf.linalg.diag_part(aTb)
    else:
        aTb = tf.matmul(A, tf.transpose(B))
        aTa = tf.linalg.diag_part(tf.matmul(A, tf.transpose(A)))
        bTb = tf.linalg.diag_part(tf.matmul(B, tf.transpose(B)))
    D = aTa[:, None] - 2.0 * aTb + bTb[None, :]
    D = tf.maximum(D, 0.0)
    if sqrt:
        mask = tf.cast(tf.equal(D, 0.0), "float32")
        D = D + mask * 1e-16
        D = tf.math.sqrt(D)
        D = D * (1.0 - mask)
    return D
# data
mnist = K.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# cast to float32 up front so tf.matmul inside `euclidean` does not hit a dtype mismatch
x_train = (x_train / 255.0).astype(np.float32)  # [n, 28, 28]
x_test = (x_test / 255.0).astype(np.float32)
x_train = tf.reshape(x_train, [-1, 784])
x_test = tf.reshape(x_test, [-1, 784])
print("data:", type(x_train), x_train.shape, y_train.shape)
train_ds = tf.data.Dataset.from_tensor_slices(
    (x_train, y_train)).shuffle(10000).batch(args.batch_size)
test_ds = tf.data.Dataset.from_tensor_slices(
    (x_test, y_test)).batch(args.batch_size)
class RBF_Cell(K.Model):
    def __init__(self, centre, var):
        """RBF with Gaussian kernel
        centre: tensor, with same shape as input x
        var: scalar, variance of Gaussian
        """
        super(RBF_Cell, self).__init__()
        self.centre = tf.cast(centre[None, :], "float32")  # [1, d]
        self.coef = tf.cast(- 0.5 / var, "float32")

    def call(self, x):
        """[n, d] -> [n, 1]"""
        return tf.math.exp(self.coef * euclidean(x, self.centre))
class RBF_Net(K.Model):
    def __init__(self, centres, variances):
        """X -> [rbf_i(X; C_i)] -> Y
        centres: [n_centre, d]
        variances: [n_centre]
        """
        super(RBF_Net, self).__init__()
        self.rbf_list = [RBF_Cell(c, v) for c, v in zip(centres, variances)]
        self.fc = K.layers.Dense(args.n_class, input_shape=[centres.shape[0]])

    def call(self, x):
        """[n, d] -> [n, #centres] -> [n, n_class]"""
        feat = tf.concat([rbf(x) for rbf in self.rbf_list], axis=1)
        logit = self.fc(feat)
        return feat, logit
# construct centres (class means) and compute variances
centres, variances = [], []
for c in range(args.n_class):
    Xc = tf.boolean_mask(x_train, y_train == c)  # samples of class c
    EX = tf.reduce_mean(Xc, 0, keepdims=True)    # class mean as centre
    centres.append(EX)
    # Var(X) = E||X - EX||^2
    variances.append(tf.reduce_mean(euclidean(Xc, EX)))
centres, variances = np.vstack(centres), np.asarray(variances)
np.save("centres.npy", centres)
np.save("variances.npy", variances)
# centres = np.load("centres.npy")
# variances = np.load("variances.npy")
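# Untested alternative sketch: centres via k-means (using the KMeans import above);
# the number of centres then need not equal the number of classes.
# km = KMeans(n_clusters=args.n_centre, random_state=0).fit(x_train.numpy())
# centres = km.cluster_centers_.astype(np.float32)
# variances = np.array([tf.reduce_mean(euclidean(
#     tf.boolean_mask(x_train, km.labels_ == i), centres[i:i + 1])).numpy()
#     for i in range(args.n_centre)])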
print("centres:", centres.shape, ", variances:", variances.shape)
# model
model = RBF_Net(centres, variances)
criterion = K.losses.SparseCategoricalCrossentropy(
    from_logits=True)  # `Sparse`: labels are class indices, NOT one-hot
optimizer = K.optimizers.Adam()
train_loss = K.metrics.Mean(name='train_loss')
train_accuracy = K.metrics.SparseCategoricalAccuracy(name='train_accuracy')
test_loss = K.metrics.Mean(name='test_loss')
test_accuracy = K.metrics.SparseCategoricalAccuracy(name='test_accuracy')
# @tf.function
def train_step(images, labels):
    with tf.GradientTape() as tape:
        _, pred = model(images)
        loss = criterion(labels, pred)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_accuracy(labels, pred)
    return loss


# @tf.function
def test_step(images, labels):
    _, pred = model(images)
    t_loss = criterion(labels, pred)
    test_loss(t_loss)
    test_accuracy(labels, pred)
    pred = tf.argmax(pred, axis=1)
    labels = tf.cast(labels, "int64")
    n_correct = tf.reduce_sum(tf.cast(pred == labels, "float32"))
    return n_correct
loss_list, acc_list = [], []
for epoch in range(args.epoch):
    # reset the metrics at the start of each epoch
    train_loss.reset_states()
    train_accuracy.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()
    for images, labels in train_ds:
        l = train_step(images, labels)
        loss_list.append(l.numpy())
    n_corr = 0
    for images, labels in test_ds:
        _n_corr = test_step(images, labels)
        n_corr += _n_corr.numpy()
    acc_list.append(n_corr / y_test.shape[0])
    template = 'Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'
    print(template.format(epoch + 1,
                          train_loss.result(),
                          train_accuracy.result() * 100,
                          test_loss.result(),
                          test_accuracy.result() * 100))
# visualise loss
fig = plt.figure()
plt.title("loss")
plt.plot(np.arange(len(loss_list)), loss_list)
# plt.show()
fig.savefig("loss.png")
# visualise accuracy
fig = plt.figure()
plt.title("accuracy")
plt.plot(np.arange(len(acc_list)), acc_list)
# plt.show()
fig.savefig("accuracy.png")
# t-SNE visualisation of the hidden-layer features
fea_list = []
for i, (images, labels) in enumerate(test_ds):
    fea, _ = model(images)
    fea_list.append(fea.numpy())
    if i > 5:  # only the first few batches
        break
F = np.vstack(fea_list)
tsne = TSNE(n_components=2, init="pca", random_state=0)
F = tsne.fit_transform(F)
x_min, x_max = np.min(F, 0), np.max(F, 0)
F = (F - x_min) / (x_max - x_min)  # normalise to [0, 1]
fig = plt.figure()
plt.title("T-SNE")
for i in range(F.shape[0]):
    # test_ds is not shuffled, so y_test[i] matches F[i]
    plt.text(F[i, 0], F[i, 1], str(y_test[i]),
             color=plt.cm.Set1(y_test[i] / 10.))
fig.savefig("tsne.png")