python实现聚类评价指标

聚类的评价指标:准确率(ACC)和标准化互信息(NMI)
版本1

# -*- coding:utf-8 -*-

import math
import numpy as np
from sklearn import metrics
def cluster_acc(y_true, y_pred):
    """Return clustering accuracy under the best one-to-one label matching.

    Cluster ids are arbitrary, so predicted clusters are first matched to
    ground-truth classes with the Hungarian algorithm (maximising the number
    of agreeing samples); the accuracy is the matched count divided by the
    number of samples.

    Args:
        y_true: Array-like of ground-truth labels, one per sample.
        y_pred: Array-like of predicted cluster labels, one per sample.
            Labels may be any values ``np.unique`` can sort (ints, including
            negative ones, strings, ...); they are remapped to dense indices
            internally.

    Returns:
        The accuracy as a float in ``[0, 1]``.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Kept as a local import: the module header does not import scipy.
    from scipy.optimize import linear_sum_assignment

    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            "y_true and y_pred must have the same number of samples "
            "(got %d and %d)" % (y_true.size, y_pred.size)
        )
    if y_pred.size == 0:
        raise ValueError("y_true and y_pred must not be empty")

    # Map raw labels to 0..k-1 so that negative or sparse label values cannot
    # wrap around or inflate the contingency matrix.
    _, true_idx = np.unique(y_true, return_inverse=True)
    _, pred_idx = np.unique(y_pred, return_inverse=True)
    true_idx = true_idx.ravel()
    pred_idx = pred_idx.ravel()

    # Square contingency matrix: w[p, t] = #samples in cluster p with class t.
    dim = max(true_idx.max(), pred_idx.max()) + 1
    w = np.zeros((dim, dim))
    np.add.at(w, (pred_idx, true_idx), 1)

    # linear_sum_assignment minimises cost, so flip counts into costs.
    rows, cols = linear_sum_assignment(w.max() - w)
    return w[rows, cols].sum() / y_pred.size
def cluster_nmi(y_true, y_pred):
    """Return the normalized mutual information (NMI) of two labelings.

    The score is ``2 * I(X; Y) / (H(X) + H(Y))``, i.e. mutual information
    normalized by the arithmetic mean of the two entropies (logarithms are
    base 2, which cancels out in the ratio).

    Args:
        y_true: Array-like of ground-truth labels, one per sample.
        y_pred: Array-like of predicted cluster labels, one per sample.

    Returns:
        The NMI as a Python float. When both labelings consist of a single
        cluster (both entropies are zero) the result is ``1.0``, following
        the scikit-learn convention for this limit case.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            "y_true and y_pred must have the same number of samples "
            "(got %d and %d)" % (y_true.size, y_pred.size)
        )
    total = y_true.size  # number of samples
    if total == 0:
        raise ValueError("y_true and y_pred must not be empty")

    # Dense 0..k-1 indices for each labeling.
    _, true_idx = np.unique(y_true, return_inverse=True)
    _, pred_idx = np.unique(y_pred, return_inverse=True)
    true_idx = true_idx.ravel()
    pred_idx = pred_idx.ravel()
    n_true = int(true_idx.max()) + 1
    n_pred = int(pred_idx.max()) + 1

    # Both entropies are zero: the ratio is 0/0, treat as a perfect match.
    if n_true == 1 and n_pred == 1:
        return 1.0

    # Joint distribution p(x, y) from the contingency table, built in one pass
    # instead of re-scanning the labels for every (class, cluster) pair.
    joint = np.zeros((n_true, n_pred))
    np.add.at(joint, (true_idx, pred_idx), 1)
    joint /= total
    px = joint.sum(axis=1)
    py = joint.sum(axis=0)

    # Mutual information; empty cells contribute 0 (limit of p*log p).
    nonzero = joint > 0
    independent = np.outer(px, py)
    mi = float(np.sum(joint[nonzero] * np.log2(joint[nonzero] / independent[nonzero])))

    # Marginal entropies; every marginal is > 0 because it comes from np.unique.
    hx = -float(np.sum(px * np.log2(px)))
    hy = -float(np.sum(py * np.log2(py)))

    return 2.0 * mi / (hx + hy)

if __name__ == '__main__':
    # Small demo: 17 samples in three ground-truth classes (6, 6 and 5 items).
    y_true = np.repeat([1, 2, 3], [6, 6, 5])  # ground truth
    y_pred = np.array([
        1, 2, 1, 1, 1, 1,
        1, 2, 2, 2, 2, 3,
        1, 1, 3, 3, 3,
    ])  # predicted clusters

    # Accuracy after optimal cluster-to-class matching.
    print(cluster_acc(y_true, y_pred))
    # Hand-written NMI, followed by scikit-learn's value for comparison.
    print(cluster_nmi(y_true, y_pred))
    print(
        metrics.normalized_mutual_info_score(
            y_true, y_pred, average_method='arithmetic'
        )
    )

版本2

import numpy as np
from sklearn import metrics

# L1: ground-truth labels, L2: predicted labels
def cluster_acc(L1, L2):
    """Return the fraction of positions where ``L1`` and ``L2`` agree.

    This is a plain element-wise comparison: labels are NOT re-matched, so
    the predicted cluster ids must already use the ground-truth numbering.

    Args:
        L1: Array-like of ground-truth labels.
        L2: Array-like of predicted labels, same shape as ``L1``.

    Returns:
        Number of equal entries divided by ``len(L2)``.

    Raises:
        ValueError: If ``L1`` and ``L2`` have different shapes.
    """
    # Coerce to arrays: with plain lists ``L1 == L2`` is a single bool for the
    # whole list, which would silently yield a wrong score.
    true_labels = np.asarray(L1)
    pred_labels = np.asarray(L2)
    if true_labels.shape != pred_labels.shape:
        raise ValueError(
            "L1 and L2 must have the same shape (got %s and %s)"
            % (true_labels.shape, pred_labels.shape)
        )
    n_match = np.sum(true_labels == pred_labels)
    return n_match / len(pred_labels)


def cluster_nmi(L1, L2):
    """Return the normalized mutual information between two labelings.

    Thin wrapper that forwards to
    ``sklearn.metrics.normalized_mutual_info_score`` with its default
    settings.

    Args:
        L1: Ground-truth labels.
        L2: Predicted labels.
    """
    nmi_score = metrics.normalized_mutual_info_score(L1, L2)
    return nmi_score

参考链接
聚类的性能度量

你可能感兴趣的:(python)