几种 multi-view clustering 的指标代码,介绍见 [1-3],[4-6] 有实现。
由于聚类没有类顺序,而有些指标用到 ground-truth labels(如 accuracy 等分类指标),需要求聚类簇跟 ground-truth 类之间的对应关系。
总体来说就是跑一个匹配算法求最优匹配。记 ground-truth labels $y$ 为 `y_true`、聚类模型输出的 cluster assignment $y'$ 为 `y_assign`、匹配并调整顺序后的 assignment $y''$ 为 `y_adjust`,都是 $\mathbb{N}^n$ 的一维向量,长度 $n$ 是 instance 数。
算匹配之前,要求一个 cost 矩阵 $W$,在此之前先求一个 co-occurrence 矩阵(临时起的名)$C \in \mathbb{N}^{d \times c}$:
$$C_{ij} = \left|\{k \mid y'_k = i \wedge y_k = j,\ k = 1, \dots, n\}\right|$$
其中 $c$ 是 ground-truth 类数,$d$ 是聚类簇数,MVC 里一般 $c = d$。然后
$$W_{ij} = m - C_{ij}, \quad m = \max_{r,s} C_{rs}$$
有些数据集的 `y_true` 的 class ID 不是从 0 开始,所以这里假设 class ID、cluster ID 都是从 0 开始(否则需要先重新编号)。
# import numpy as np
def calc_cost_matrix(y_true, y_assign, n_classes, n_clusters):
    """Build the cost matrix W used to match clusters to ground-truth classes.

    A co-occurrence matrix C of shape [n_clusters, n_classes] is counted first,
    where C[i, j] is the number of samples assigned to cluster i whose true
    class is j. The cost is then W = C.max() - C, so the most frequent
    (cluster, class) pair gets cost 0.

    Input:
        y_true: [n], in {0, ..., n_classes - 1}
        y_assign: [n], in {0, ..., n_clusters - 1}
        n_classes: int, provide in case that y_true.max() + 1 != n_classes
        n_clusters: int, provide in case that y_assign.max() + 1 != n_clusters
    Output:
        W: [n_clusters, n_classes], dtype int64
    Raises:
        ValueError: if y_true and y_assign do not hold the same number of items
    """
    # np.asarray lets plain lists through; the int64 cast makes the values
    # usable as indices even when the labels were stored as floats.
    y_true = np.asarray(y_true).astype(np.int64).ravel()
    y_assign = np.asarray(y_assign).astype(np.int64).ravel()
    if y_assign.size != y_true.size:
        raise ValueError(
            "y_true and y_assign must have the same size, got "
            f"{y_true.size} and {y_assign.size}"
        )
    C = np.zeros((n_clusters, n_classes), dtype=np.int64)
    # np.add.at accumulates repeated (cluster, class) pairs; a plain
    # fancy-indexed "+=" would count each distinct pair only once.
    np.add.at(C, (y_assign, y_true), 1)
    return C.max() - C
[4] 用 munkres 包求,代码在 get_y_preds。
# import numpy as np
# from munkres import Munkres
def reorder_assignment(y_true, y_assign, n_classes, n_clusters):
    """(munkres) Re-order y_assign into y_adjust so it uses the class IDs of y_true.

    The cluster-to-class mapping is the assignment Munkres computes on the cost
    matrix returned by calc_cost_matrix.

    Input:
        y_true: [n], in {0, ..., c - 1}
        y_assign: [n], in {0, ..., d - 1}
        n_classes: int, provide in case that y_true.max() + 1 != n_classes
        n_clusters: int, provide in case that y_assign.max() + 1 != n_clusters
    Output:
        y_adjust: [n], in {0, ..., c - 1}, in same order as y_true
    """
    W = calc_cost_matrix(y_true, y_assign, n_classes, n_clusters)
    # Munkres documents its input as a list of lists, so hand it plain Python
    # lists rather than the ndarray.
    indices = Munkres().compute(W.tolist())
    # Clusters that receive no partner keep the initial value 0
    # (presumably only possible when n_clusters > n_classes -- TODO confirm).
    map_a2t = np.zeros(n_clusters, dtype=np.int64)
    # Fill by the row reported in each (row, column) pair instead of by list
    # position, so a result with fewer pairs than clusters is still applied
    # to the right clusters.
    for cluster_id, class_id in indices:
        map_a2t[cluster_id] = class_id
    # Cast so that label vectors stored as floats can be used as indices.
    y_adjust = map_a2t[np.asarray(y_assign).astype(np.int64)]
    return y_adjust
[5,6] 用 scipy.optimize.linear_sum_assignment 求,代码分别在 cluster_acc、ordered_cmat;[7] 也有示例。
# import numpy as np
# from scipy.optimize import linear_sum_assignment
def reorder_assignment(y_true, y_assign, n_classes, n_clusters):
    """(linear_sum_assignment) Re-order y_assign into y_adjust so it uses the class IDs of y_true.

    The cluster-to-class mapping is the assignment that linear_sum_assignment
    computes on the cost matrix returned by calc_cost_matrix.

    Input:
        y_true: [n], in {0, ..., c - 1}
        y_assign: [n], in {0, ..., d - 1}
        n_classes: int, provide in case that y_true.max() + 1 != n_classes
        n_clusters: int, provide in case that y_assign.max() + 1 != n_clusters
    Output:
        y_adjust: [n], in {0, ..., c - 1}, in same order as y_true
    """
    W = calc_cost_matrix(y_true, y_assign, n_classes, n_clusters)
    row_idx, col_idx = linear_sum_assignment(W)
    # Clusters that receive no partner keep the initial value 0
    # (presumably only possible when n_clusters > n_classes -- TODO confirm).
    map_a2t = np.zeros(n_clusters, dtype=np.int64)
    map_a2t[row_idx] = col_idx
    # calc_cost_matrix tolerates float-typed labels by casting; do the same
    # here, otherwise indexing with a float array raises IndexError.
    y_adjust = map_a2t[np.asarray(y_assign).astype(np.int64)]
    return y_adjust
这里区分「聚类」指标和「分类」指标,这一节聚类指标指不需重排 cluster assignment 以对齐 ground-truth label 顺序的指标。
Purity,介绍见 [1,2],范围 [0, 1],越大越好。[5] 有实现:purity。
# import numpy as np
# from sklearn.metrics import accuracy_score
def purity(y_true, y_assign):
    """Clustering purity: fraction of samples that carry their cluster's majority class.

    Every cluster votes for the ground-truth class that occurs most often among
    its members; purity is the share of samples whose class equals the vote of
    their cluster. Neither input is modified.

    Input:
        y_true: [n], ground-truth labels (any values np.unique can sort)
        y_assign: [n], cluster assignment (any values np.unique can sort)
    Output:
        purity: float in [0, 1], larger is better
    Raises:
        ValueError: if the inputs are empty or differ in size
    """
    y_true = np.asarray(y_true).ravel()
    y_assign = np.asarray(y_assign).ravel()
    if y_true.size != y_assign.size:
        raise ValueError(
            "y_true and y_assign must have the same size, got "
            f"{y_true.size} and {y_assign.size}"
        )
    if y_true.size == 0:
        raise ValueError("purity is undefined for empty input")
    # return_inverse yields dense 0..K-1 codes in one step, without writing
    # into y_true and without collisions between old and new label values.
    _, class_code = np.unique(y_true, return_inverse=True)
    _, cluster_code = np.unique(y_assign, return_inverse=True)
    class_code = class_code.ravel()
    cluster_code = cluster_code.ravel()
    contingency = np.zeros(
        (cluster_code.max() + 1, class_code.max() + 1), dtype=np.int64
    )
    # np.add.at accumulates repeated (cluster, class) pairs.
    np.add.at(contingency, (cluster_code, class_code), 1)
    # The row maximum is the number of members that agree with the majority
    # vote of that cluster.
    majority_hits = contingency.max(axis=1).sum()
    return float(majority_hits / y_true.size)
Normalized mutual information,介绍见 [1,3],范围 [0, 1],越大越好。[4,6] 用 sklearn.metrics.normalized_mutual_info_score(见 clustering_metric、calc_metrics),[5] 用 sklearn.metrics.v_measure_score(见 evaluate)。
关于 `v_measure_score`,scikit-learn 说:This score is identical to `normalized_mutual_info_score` with the 'arithmetic' option for averaging.
# from sklearn.metrics import normalized_mutual_info_score, v_measure_score
def nmi(y_true, y_assign):
    """Normalized mutual information between ground-truth labels and cluster assignment.

    Thin wrapper around normalized_mutual_info_score; y_assign is used as is,
    without re-ordering.
    """
    # v_measure_score(y_true, y_assign) is the alternative noted in the text.
    score = normalized_mutual_info_score(y_true, y_assign)
    return score
Adjusted mutual information,介绍见 [3],范围 [-1, 1],越大越好。[4] 用 sklearn.metrics.adjusted_mutual_info_score,见 clustering_metric。
# from sklearn.metrics import adjusted_mutual_info_score
def ami(y_true, y_assign):
    """Adjusted mutual information between ground-truth labels and cluster assignment.

    Thin wrapper around adjusted_mutual_info_score; y_assign is used as is,
    without re-ordering.
    """
    score = adjusted_mutual_info_score(y_true, y_assign)
    return score
Adjusted Rand index,介绍见 [1-3],范围 [-1, 1],越大越好。[4,6] 用 sklearn.metrics.adjusted_rand_score,见 clustering_metric、calc_metrics。
# from sklearn.metrics import adjusted_rand_score
def ari(y_true, y_assign):
    """Adjusted Rand index between ground-truth labels and cluster assignment.

    Thin wrapper around adjusted_rand_score; y_assign is used as is, without
    re-ordering.
    """
    score = adjusted_rand_score(y_true, y_assign)
    return score
这一节的分类指标指需先重排 assignment 得到 `y_adjust` 的指标。
Accuracy,范围 [0, 1],越大越好。[4] 用 sklearn.metrics.accuracy_score,[5,6] 手写,分别见 classification_metric、cluster_acc、ordered_cmat。
# from sklearn.metrics import accuracy_score
def acc(y_true, y_adjust):
    """Accuracy of the re-ordered assignment y_adjust against y_true.

    Thin wrapper around accuracy_score.
    """
    score = accuracy_score(y_true, y_adjust)
    return score
Precision,范围 [0, 1],越大越好。[4] 用 sklearn.metrics.precision_score,见 classification_metric。
# from sklearn.metrics import precision_score
def precision(y_true, y_adjust, average='macro'):
    """Precision of the re-ordered assignment y_adjust against y_true.

    Thin wrapper around precision_score; `average` is forwarded unchanged.
    """
    score = precision_score(y_true, y_adjust, average=average)
    return score
Recall,范围 [0, 1],越大越好。[4] 用 sklearn.metrics.recall_score,见 classification_metric。
# from sklearn.metrics import recall_score
def recall(y_true, y_adjust, average='macro'):
    """Recall of the re-ordered assignment y_adjust against y_true.

    Thin wrapper around recall_score; `average` is forwarded unchanged.
    """
    score = recall_score(y_true, y_adjust, average=average)
    return score
F1-score,范围 [0, 1],越大越好。[4] 用 sklearn.metrics.f1_score,见 classification_metric。
# from sklearn.metrics import f1_score
def f1_score(y_true, y_adjust, average='macro'):
    """F1-score of the re-ordered assignment y_adjust against y_true.

    Input:
        y_true: [n], ground-truth labels
        y_adjust: [n], cluster assignment already re-ordered to the class IDs of y_true
        average: str, averaging mode forwarded to sklearn's f1_score
    Output:
        the value returned by sklearn.metrics.f1_score
    """
    # This wrapper has the same name as sklearn's function, so the bare name
    # `f1_score` resolves to this def and calling it would recurse forever.
    # Import the sklearn implementation under an alias instead.
    from sklearn.metrics import f1_score as sklearn_f1_score
    return sklearn_f1_score(y_true, y_adjust, average=average)
写在一起方便调用
# evaluate.py
import numpy as np
from scipy.optimize import linear_sum_assignment
import sklearn.metrics as metrics
def calc_cost_matrix(y_true, y_assign, n_classes, n_clusters):
    """Build the cost matrix W used to match clusters to ground-truth classes.

    A co-occurrence matrix C of shape [n_clusters, n_classes] is counted first,
    where C[i, j] is the number of samples assigned to cluster i whose true
    class is j. The cost is then W = C.max() - C, so the most frequent
    (cluster, class) pair gets cost 0.

    Input:
        y_true: [n], in {0, ..., n_classes - 1}
        y_assign: [n], in {0, ..., n_clusters - 1}
        n_classes: int, provide in case that y_true.max() + 1 != n_classes
        n_clusters: int, provide in case that y_assign.max() + 1 != n_clusters
    Output:
        W: [n_clusters, n_classes], dtype int64
    Raises:
        ValueError: if y_true and y_assign do not hold the same number of items
    """
    # np.asarray lets plain lists through; the int64 cast makes the values
    # usable as indices even when the labels were stored as floats.
    y_true = np.asarray(y_true).astype(np.int64).ravel()
    y_assign = np.asarray(y_assign).astype(np.int64).ravel()
    if y_assign.size != y_true.size:
        raise ValueError(
            "y_true and y_assign must have the same size, got "
            f"{y_true.size} and {y_assign.size}"
        )
    C = np.zeros((n_clusters, n_classes), dtype=np.int64)
    # np.add.at accumulates repeated (cluster, class) pairs; a plain
    # fancy-indexed "+=" would count each distinct pair only once.
    np.add.at(C, (y_assign, y_true), 1)
    return C.max() - C
def reorder_assignment(y_true, y_assign, n_classes, n_clusters):
    """(linear_sum_assignment) Re-order y_assign into y_adjust so it uses the class IDs of y_true.

    The cluster-to-class mapping is the assignment that linear_sum_assignment
    computes on the cost matrix returned by calc_cost_matrix.

    Input:
        y_true: [n], in {0, ..., c - 1}
        y_assign: [n], in {0, ..., d - 1}
        n_classes: int, provide in case that y_true.max() + 1 != n_classes
        n_clusters: int, provide in case that y_assign.max() + 1 != n_clusters
    Output:
        y_adjust: [n], in {0, ..., c - 1}, in same order as y_true
    """
    W = calc_cost_matrix(y_true, y_assign, n_classes, n_clusters)
    row_idx, col_idx = linear_sum_assignment(W)
    # Clusters that receive no partner keep the initial value 0
    # (presumably only possible when n_clusters > n_classes -- TODO confirm).
    map_a2t = np.zeros(n_clusters, dtype=np.int64)
    map_a2t[row_idx] = col_idx
    # calc_cost_matrix tolerates float-typed labels by casting; do the same
    # here, otherwise indexing with a float array raises IndexError.
    y_adjust = map_a2t[np.asarray(y_assign).astype(np.int64)]
    return y_adjust
def purity(y_true, y_assign):
    """Clustering purity: fraction of samples that carry their cluster's majority class.

    Every cluster votes for the ground-truth class that occurs most often among
    its members; purity is the share of samples whose class equals the vote of
    their cluster. Neither input is modified.

    Input:
        y_true: [n], ground-truth labels (any values np.unique can sort)
        y_assign: [n], cluster assignment (any values np.unique can sort)
    Output:
        purity: float in [0, 1], larger is better
    Raises:
        ValueError: if the inputs are empty or differ in size
    """
    y_true = np.asarray(y_true).ravel()
    y_assign = np.asarray(y_assign).ravel()
    if y_true.size != y_assign.size:
        raise ValueError(
            "y_true and y_assign must have the same size, got "
            f"{y_true.size} and {y_assign.size}"
        )
    if y_true.size == 0:
        raise ValueError("purity is undefined for empty input")
    # return_inverse yields dense 0..K-1 codes in one step, without writing
    # into y_true and without collisions between old and new label values.
    _, class_code = np.unique(y_true, return_inverse=True)
    _, cluster_code = np.unique(y_assign, return_inverse=True)
    class_code = class_code.ravel()
    cluster_code = cluster_code.ravel()
    contingency = np.zeros(
        (cluster_code.max() + 1, class_code.max() + 1), dtype=np.int64
    )
    # np.add.at accumulates repeated (cluster, class) pairs.
    np.add.at(contingency, (cluster_code, class_code), 1)
    # The row maximum is the number of members that agree with the majority
    # vote of that cluster.
    majority_hits = contingency.max(axis=1).sum()
    return float(majority_hits / y_true.size)
def evaluate(y_true, y_assign, n_classes, n_clusters, average='macro'):
    """Compute all clustering and classification metrics in one call.

    Clustering metrics (purity, nmi, ami, ari) are computed on the raw
    assignment y_assign. Classification metrics (acc, precision, recall,
    f1-score) are computed on y_adjust, the assignment re-ordered by
    reorder_assignment to follow the class IDs of y_true.

    Input:
        y_true: [n], in {0, ..., n_classes - 1}
        y_assign: [n], in {0, ..., n_clusters - 1}
        n_classes: int, number of ground-truth classes
        n_clusters: int, number of clusters
        average: str, averaging mode forwarded to precision / recall / f1
    Output:
        dict with keys 'purity', 'nmi', 'ami', 'ari', 'acc', 'precision',
        'recall', 'f1-score'
    """
    y_adjust = reorder_assignment(y_true, y_assign, n_classes, n_clusters)
    # purity is evaluated first; give it a private copy so that any in-place
    # relabelling it performs cannot leak into the metrics computed after it
    # (y_adjust was derived from the labels as passed in) or into the
    # caller's array.
    y_true_for_purity = np.array(y_true, copy=True)
    return {
        # clustering
        'purity': purity(y_true_for_purity, y_assign),
        'nmi': metrics.normalized_mutual_info_score(y_true, y_assign),
        'ami': metrics.adjusted_mutual_info_score(y_true, y_assign),
        'ari': metrics.adjusted_rand_score(y_true, y_assign),
        # classification
        'acc': metrics.accuracy_score(y_true, y_adjust),
        'precision': metrics.precision_score(y_true, y_adjust, average=average),
        'recall': metrics.recall_score(y_true, y_adjust, average=average),
        'f1-score': metrics.f1_score(y_true, y_adjust, average=average),
    }