聚类的评价指标:准确率(ACC)和标准化互信息(NMI)
版本1
# -*- coding:utf-8 -*-
import math
import numpy as np
from sklearn import metrics
def cluster_acc(y_true, y_pred):
    """Return clustering accuracy under the best one-to-one label matching.

    Predicted cluster ids are arbitrary, so each predicted cluster is matched
    to at most one ground-truth class (and vice versa) such that the number
    of agreeing samples is maximal; that count divided by the sample count
    is returned.

    Args:
        y_true: 1-D array-like of ground-truth labels (any sortable values).
        y_pred: 1-D array-like of predicted cluster labels, same length.

    Returns:
        Accuracy in [0, 1] as a float.

    Raises:
        ValueError: if the inputs differ in length or are empty.
    """
    from scipy.optimize import linear_sum_assignment

    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            "y_true and y_pred must have the same length, got %d and %d"
            % (y_true.size, y_pred.size)
        )
    if y_pred.size == 0:
        raise ValueError("cluster_acc requires at least one sample")

    # Remap labels to dense 0..k-1 indices so the contingency table is sized
    # by the number of distinct labels, not by the largest label value, and
    # so negative / non-integer / string labels work as well.
    true_labels, true_idx = np.unique(y_true, return_inverse=True)
    pred_labels, pred_idx = np.unique(y_pred, return_inverse=True)

    # w[p, t] = number of samples in predicted cluster p with true class t.
    w = np.zeros((pred_labels.size, true_labels.size), dtype=np.int64)
    np.add.at(w, (pred_idx, true_idx), 1)

    # linear_sum_assignment minimises cost, so flip counts into costs.
    rows, cols = linear_sum_assignment(w.max() - w)
    return float(w[rows, cols].sum()) / y_pred.size
def cluster_nmi(y_true, y_pred):
    """Return the normalized mutual information 2*MI / (H_true + H_pred).

    The score is computed from the contingency table of the two labelings
    and is invariant to how the labels are named.

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: 1-D array-like of predicted cluster labels, same length.

    Returns:
        NMI as a float. When both labelings have zero entropy (each has at
        most one distinct label) 1.0 is returned, following scikit-learn's
        convention, instead of dividing by zero.

    Raises:
        ValueError: if the inputs differ in length.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            "y_true and y_pred must have the same length, got %d and %d"
            % (y_true.size, y_pred.size)
        )
    n_samples = y_true.size

    true_labels, true_idx = np.unique(y_true, return_inverse=True)
    pred_labels, pred_idx = np.unique(y_pred, return_inverse=True)

    # Both partitions are trivial: entropies are zero and the ratio is 0/0.
    if true_labels.size <= 1 and pred_labels.size <= 1:
        return 1.0

    # counts[t, p] = number of samples with true class t and predicted cluster p.
    counts = np.zeros((true_labels.size, pred_labels.size), dtype=np.int64)
    np.add.at(counts, (true_idx, pred_idx), 1)
    true_counts = counts.sum(axis=1).astype(float)
    pred_counts = counts.sum(axis=0).astype(float)

    # Mutual information: only non-empty cells contribute (0 * log 0 := 0),
    # which removes the need for an epsilon inside the logarithm.
    rows, cols = np.nonzero(counts)
    n_ij = counts[rows, cols].astype(float)
    ratio = n_samples * n_ij / (true_counts[rows] * pred_counts[cols])
    mi = float(np.sum(n_ij / n_samples * np.log2(ratio)))

    # Marginal entropies; every marginal count is > 0 by construction.
    p_true = true_counts / n_samples
    p_pred = pred_counts / n_samples
    h_true = -float(np.sum(p_true * np.log2(p_true)))
    h_pred = -float(np.sum(p_pred * np.log2(p_pred)))

    return 2.0 * mi / (h_true + h_pred)
if __name__ == '__main__':
    # Demo: 17 samples, three ground-truth classes versus a predicted clustering.
    y_true = np.array(
        [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]
    )  # ground truth
    y_pred = np.array(
        [1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 1, 1, 3, 3, 3]
    )  # prediction

    # Accuracy from this file's implementation.
    print(cluster_acc(y_true, y_pred))

    # Hand-written NMI, followed by scikit-learn's value for comparison.
    print(cluster_nmi(y_true, y_pred))
    print(
        metrics.normalized_mutual_info_score(
            y_true, y_pred, average_method='arithmetic'
        )
    )
版本2
import numpy as np
from sklearn import metrics
# L1:真实标签,L2:预测标签
def cluster_acc(L1, L2):
    """Return the fraction of positions where L1 and L2 hold the same label.

    This is a direct element-wise comparison: no cluster-to-class matching
    is performed, so the two labelings must already use the same label ids.

    Args:
        L1: array-like of ground-truth labels.
        L2: array-like of predicted labels, same number of elements.

    Returns:
        Fraction of agreeing positions as a float in [0, 1].

    Raises:
        ValueError: if the inputs differ in size or are empty.
    """
    # Convert first: comparing two plain lists with == yields a single bool
    # rather than an element-wise result.
    truth = np.asarray(L1).ravel()
    pred = np.asarray(L2).ravel()
    if truth.size != pred.size:
        raise ValueError(
            "L1 and L2 must have the same length, got %d and %d"
            % (truth.size, pred.size)
        )
    if pred.size == 0:
        raise ValueError("cluster_acc requires at least one sample")

    matches = np.sum(truth == pred)
    return float(matches) / pred.size
def cluster_nmi(L1, L2):
    """Return scikit-learn's normalized mutual information between two labelings.

    Args:
        L1: ground-truth labels.
        L2: predicted labels.

    Returns:
        The value of ``metrics.normalized_mutual_info_score(L1, L2)`` using
        arithmetic-mean normalization.
    """
    # Pin the normalization explicitly so the score does not depend on the
    # installed scikit-learn version's default.
    return metrics.normalized_mutual_info_score(
        L1, L2, average_method='arithmetic'
    )
参考链接
聚类的性能度量