KMeans是无监督的。当然也可以是有监督的。有监督形式非常简单。就是根据labels计算聚类中心即可。相当于无监督KMeans的半步迭代;反过来说,KMeans算法是把无监督学习转化成一系列监督学习的迭代过程。
本文贡献的是半监督KMeans。半监督KMeans可以充分利用已知的labels信息。在机器学习里,有利于将人类知识和机器从数据发现的知识相互融合。
输入:点集 $D_l=\{(x_i,c_i)\}$(有标签),$D_u=\{x_i'\}$(无标签)
输出:分类器(或聚类中心)
令类标签集 $C_l=\{c_i\}$,$C_u=C\setminus C_l$。下述迭代不改变已知标签,即始终保持 $\gamma(x_i)=c_i,\ (x_i,c_i)\in D_l$。
和无监督的KMeans相比,这里唯一复杂的是初始化。如果 $C_l$ 不包括所有类别,那么首先给 $C_u$ 中的类别指定聚类中心,例如在 $D_u$ 中用kmeans++随机选择;然后把 $D_l$ 中每个类的样本均值作为 $C_l$ 中对应类的聚类中心。(若 $C_u=\varnothing$,即所有类别都有标签样本,则算法退化为一个有监督的分类算法。)
#!/usr/bin/env python
"""
Semi K-Means
"""
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.cluster import KMeans, kmeans_plusplus
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils.validation import check_is_fitted
class SupervisedKMeans(ClassifierMixin, KMeans):
    """Supervised KMeans: one centroid per class, computed from the labels.

    Equivalent to half an iteration of unsupervised KMeans — the centroids
    are simply the per-class means of the labeled samples; prediction is
    nearest-centroid classification.
    """

    # Class labels; populated from `y` on the first `fit` call if not preset.
    classes = None

    def fit(self, X, y):
        """Compute one centroid per class as the mean of that class's samples.

        Args:
            X (array): samples, shape (n_samples, n_features)
            y (array-like): class labels, shape (n_samples,)

        Returns:
            self: the fitted model, with `centers_`/`cluster_centers_` set.
        """
        X = np.asarray(X)
        # A plain Python list for `y` would make `y == c` a scalar bool
        # instead of a boolean mask — coerce to ndarray first.
        y = np.asarray(y)
        if self.classes is None:
            self.classes = np.unique(y)
        self.centers_ = np.array([X[y == c].mean(axis=0) for c in self.classes])
        self.cluster_centers_ = self.centers_
        self.n_classes = len(self.classes)
        return self

    def predict(self, X):
        """Assign each sample the label of its nearest centroid.

        Returns an ndarray (the original returned a list, which broke
        elementwise comparison in `score` for list-typed `y`).
        """
        ed = euclidean_distances(X, self.cluster_centers_)
        return np.asarray(self.classes)[np.argmin(ed, axis=1)]

    def score(self, X, y):
        """Mean accuracy of `predict(X)` against the true labels `y`."""
        return np.mean(np.asarray(y) == self.predict(X))
class SemiKMeans(SupervisedKMeans):
    """Semi-supervised KMeans.

    Labeled samples keep their labels fixed throughout; unlabeled samples
    are reassigned to the nearest centroid each iteration, and centroids
    are recomputed from both groups.
    """

    def fit(self, Xl, yl, Xu):
        """Fit the semi-supervised model.

        Args:
            Xl (array): input variables with labels
            yl (array-like): labels for Xl
            Xu (array): input variables without labels

        Returns:
            self: the fitted model

        Raises:
            ValueError: if `yl` contains a label outside `classes`.
        """
        yl = np.asarray(yl)
        classes0 = np.unique(yl)  # classes actually seen in the labels
        if getattr(self, 'classes', None) is None:
            self.classes = np.arange(self.n_clusters)
        elif not all(c in self.classes for c in classes0):
            # Input validation: raise rather than assert (asserts vanish under -O).
            raise ValueError('yl has an element not in `classes`!')
        self.classes = np.asarray(self.classes)

        # Supervised half-step: centroids of the labeled classes,
        # row i of mu0 corresponds to classes0[i] (np.unique order).
        mu0 = SupervisedKMeans().fit(Xl, yl).centers_
        n_unknown = self.n_clusters - len(classes0)

        # Row k of `cluster_centers_` must correspond to `classes[k]`
        # (that is how `_fit` maps argmin indices to labels). The original
        # stacked (seeds, mu0) blindly, mislabeling the supervised
        # centroids whenever classes0 were not the trailing classes.
        centers = np.empty((self.n_clusters, np.shape(Xl)[1]))
        position = {c: k for k, c in enumerate(self.classes)}
        for c, mu in zip(classes0, mu0):
            centers[position[c]] = mu
        if n_unknown:
            # Seed the classes with no labeled samples via kmeans++ on Xu.
            seeds, _ = kmeans_plusplus(np.asarray(Xu), n_clusters=n_unknown)
            centers[np.isin(self.classes, classes0, invert=True)] = seeds
        self.cluster_centers_ = centers
        return self._fit(Xl, yl, Xu, self.cluster_centers_, self.classes)

    def _fit(self, Xl, yl, Xu, cluster_centers, classes):
        """Alternate label assignment and centroid update, up to `max_iter` rounds."""
        # np.vstack: np.row_stack is deprecated and removed in NumPy 2.0.
        X = np.vstack((Xl, Xu))
        yl = np.asarray(yl)
        yu = None
        for _ in range(self.max_iter):
            ED = euclidean_distances(Xu, cluster_centers)
            yu_new = np.asarray(classes)[np.argmin(ED, axis=1)]
            if yu is not None and np.array_equal(yu_new, yu):
                break  # assignments stable -> centroids are a fixed point
            yu = yu_new
            y = np.concatenate((yl, yu))
            cluster_centers = np.array([X[y == c].mean(axis=0) for c in classes])
        self.labels_ = np.concatenate((yl, yu))
        self.cluster_centers_ = cluster_centers
        return self

    def partial_fit(self, Xl, yl, Xu):
        """Continue iterating from the already-fitted centroids.

        The original referenced undefined names (`Xl, yl, Xu` were not
        parameters), which raised NameError on every call.
        """
        check_is_fitted(self, ('cluster_centers_',))
        return self._fit(Xl, yl, Xu, self.cluster_centers_, self.classes)
#!/usr/bin/env python
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.cluster import KMeans, kmeans_plusplus
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import train_test_split
from sklearn import datasets
from semi_kmeans import *
if __name__ == '__main__':
    # Load and split inside the guard so importing this script has no
    # side effects (the original loaded and split the data at import time).
    digits = datasets.load_digits()
    # Half the data is held out for testing; of the training half, only
    # 5% keeps its labels — the rest is treated as unlabeled.
    X_train, X_test, y_train, y_test = train_test_split(
        digits.data, digits.target, test_size=0.5)
    X_labeled, X_unlabeled, y_labeled, _ = train_test_split(
        X_train, y_train, test_size=0.95)

    km = SemiKMeans(n_clusters=10)
    km.fit(X_labeled, y_labeled, X_unlabeled)  # labels of X_unlabeled are unknown

    skm = SupervisedKMeans()
    skm.fit(X_labeled, y_labeled)

    print(f"""
# clusters: 10
# samples: {X_labeled.shape[0]} + {X_unlabeled.shape[0]}
SemiKMeans: {km.score(X_test, y_test)}
SupervisedKMeans: {skm.score(X_test, y_test)}
""")
某次运行的示例输出(随机划分,数值会有波动):

    # clusters: 10
    # samples: 44 + 854
    SemiKMeans: 0.7975528364849833
    SupervisedKMeans: 0.7675194660734149