机器学习Educoder平台实训-聚类性能评估指标

机器学习Educoder平台实训–聚类性能评估指标

平台上的其他实训题目都有答案,只有这个没有,我把题目贴出来方便对比,顺便分享答案,欢迎大佬来指正。

第1关:外部指标

任务描述
本关任务:填写python代码,完成calc_JC函数、calc_FM函数和calc_Rand函数分别实现计算JC系数、FM指数和Rand指数。

测试输入:
{'y_true':[0, 0, 0, 1, 1, 1], 'y_pred':[0, 0, 1, 1, 2, 2]}

预期输出:
0.285714 0.471405 0.666667

import numpy as np
def calc_JC(y_true, y_pred):
    '''
    Compute the Jaccard coefficient (JC) between two clusterings.

    Every unordered sample pair (i, j) with i < j is classified by whether
    the two samples share a cluster in each partition:
        a -- same cluster in y_true and same cluster in y_pred
        b -- different clusters in y_true, same cluster in y_pred
        c -- same cluster in y_true, different clusters in y_pred
    and JC = a / (a + b + c).

    :param y_true: cluster labels given by the reference model (ndarray or sequence)
    :param y_pred: cluster labels given by the clustering model, same length as y_true
    :return: JC coefficient; 0.0 when a + b + c == 0, i.e. no pair is grouped
             together by either partition (including inputs with < 2 samples)
    :raises ValueError: if y_true and y_pred have different lengths
    '''
    #******** Begin *******#
    n = len(y_true)
    if len(y_pred) != n:
        raise ValueError('y_true and y_pred must have the same length')
    # Count all three pair categories in a single pass over the pairs i < j.
    a = b = c = 0
    for i in range(n - 1):
        for j in range(i + 1, n):
            same_true = y_true[i] == y_true[j]
            same_pred = y_pred[i] == y_pred[j]
            if same_true and same_pred:
                a += 1
            elif same_pred:
                b += 1
            elif same_true:
                c += 1
    denominator = a + b + c
    # Degenerate case: avoid dividing by zero when nothing is co-clustered.
    if denominator == 0:
        return 0.0
    return a / denominator
    #******** End *******#
def calc_FM(y_true, y_pred):
    '''
    Compute the Fowlkes-Mallows index (FMI) between two clusterings.

    Every unordered sample pair (i, j) with i < j is classified by whether
    the two samples share a cluster in each partition:
        a -- same cluster in y_true and same cluster in y_pred
        b -- different clusters in y_true, same cluster in y_pred
        c -- same cluster in y_true, different clusters in y_pred
    and FMI = a / sqrt((a + b) * (a + c)).

    :param y_true: cluster labels given by the reference model (ndarray or sequence)
    :param y_pred: cluster labels given by the clustering model, same length as y_true
    :return: FM index; 0.0 when a == 0 (no pair is co-clustered in both
             partitions), which also avoids the 0/0 case
    :raises ValueError: if y_true and y_pred have different lengths
    '''
    #******** Begin *******#
    n = len(y_true)
    if len(y_pred) != n:
        raise ValueError('y_true and y_pred must have the same length')
    # Count all three pair categories in a single pass over the pairs i < j.
    a = b = c = 0
    for i in range(n - 1):
        for j in range(i + 1, n):
            same_true = y_true[i] == y_true[j]
            same_pred = y_pred[i] == y_pred[j]
            if same_true and same_pred:
                a += 1
            elif same_pred:
                b += 1
            elif same_true:
                c += 1
    # With a == 0 the index is 0 whenever it is defined; returning 0.0 also
    # covers the undefined 0/0 case instead of producing nan.
    if a == 0:
        return 0.0
    return a / np.sqrt((a + b) * (a + c))
    #******** End *******#
def calc_Rand(y_true, y_pred):
    '''
    Compute the (unadjusted) Rand index between two clusterings.

    A sample pair (i, j), i < j, is an agreement when both partitions put the
    two samples in the same cluster (a) or both put them in different
    clusters (d). With m samples, RI = 2 * (a + d) / (m * (m - 1)).

    :param y_true: cluster labels given by the reference model (ndarray or sequence)
    :param y_pred: cluster labels given by the clustering model, same length as y_true
    :return: Rand index; 1.0 when there are fewer than two samples, since
             there is no pair on which the partitions could disagree
    :raises ValueError: if y_true and y_pred have different lengths
    '''
    #******** Begin *******#
    m = len(y_true)
    if len(y_pred) != m:
        raise ValueError('y_true and y_pred must have the same length')
    # No pairs exist: avoid dividing by m * (m - 1) == 0.
    if m < 2:
        return 1.0
    agreements = 0
    for i in range(m - 1):
        for j in range(i + 1, m):
            same_true = bool(y_true[i] == y_true[j])
            same_pred = bool(y_pred[i] == y_pred[j])
            # Covers both a (same/same) and d (different/different).
            if same_true == same_pred:
                agreements += 1
    return (2 * agreements) / (m * (m - 1))
    #******** End *******#

第2关:内部指标

任务描述
本关任务:填写python代码,完成calc_DBI函数和calc_DI函数分别实现计算DB指数和Dunn指数。

测试说明
平台会对你编写的代码进行测试,期望您的代码根据输入来按顺序输出正确的DB指数和Dunn指数,以下为其中一个测试用例(feature表示待聚类数据的特征,pred表示聚类后数据所对应的簇):

测试输入:
{'feature':[[3,4],[6,9],[2,3],[3,4],[7,10],[8,11]], 'pred':[1, 2, 1, 1, 2, 2]}

预期输出:
0.204765 2.061553

import numpy as np
def calc_DBI(feature, pred):
    '''
    Compute the Davies-Bouldin index (DBI) of a clustering.

    For each cluster i, let avg(C_i) be the mean Euclidean distance from its
    samples to its centroid mu_i. Then
        DBI = (1 / k) * sum_i max_{j != i} (avg(C_i) + avg(C_j)) / ||mu_i - mu_j||
    where the maximum for cluster i is taken over every other cluster j (not
    only the neighbouring label), so the result is valid for any k >= 2.

    :param feature: sample features, array-like of shape (n_samples, n_features)
    :param pred: cluster label of each sample, array-like of length n_samples
    :return: DB index; 0.0 when there are fewer than two clusters
    '''
    #********* Begin *********#
    feature = np.asarray(feature, dtype=float)
    pred = np.asarray(pred)
    label_set = np.unique(pred)
    k = len(label_set)
    # The index compares pairs of clusters; with fewer than two there is no pair.
    if k < 2:
        return 0.0
    # Centroid and mean sample-to-centroid distance of every cluster.
    centroids = []
    scatter = []
    for label in label_set:
        members = feature[pred == label]
        center = members.mean(axis=0)
        centroids.append(center)
        scatter.append(np.mean(np.sqrt(np.sum(np.square(members - center), axis=1))))
    dbi = 0.0
    for i in range(k):
        # Worst (largest) similarity ratio between cluster i and any other cluster.
        worst = 0.0
        for j in range(k):
            if i == j:
                continue
            gap = np.sqrt(np.sum(np.square(centroids[i] - centroids[j])))
            ratio = (scatter[i] + scatter[j]) / gap
            if ratio > worst:
                worst = ratio
        dbi += worst
    return dbi / k
    #********* End *********#
def calc_DI(feature, pred):
    '''
    Compute the Dunn index (DI) of a clustering.

    DI = (smallest Euclidean distance between two samples that belong to
          different clusters)
         / (largest Euclidean distance between two samples that belong to
            the same cluster, i.e. the largest cluster diameter).

    All sample pairs are examined, so every pair of clusters contributes
    regardless of label order or the order in which samples appear.

    :param feature: sample features, array-like of shape (n_samples, n_features)
    :param pred: cluster label of each sample, array-like of length n_samples
    :return: Dunn index; inf when the largest diameter is 0 (no cluster holds
             two distinct points) or when there is no pair of samples from
             different clusters
    '''
    #********* Begin *********#
    feature = np.asarray(feature, dtype=float)
    pred = np.asarray(pred)
    n = len(feature)
    min_separation = np.inf  # closest pair taken from two different clusters
    max_diameter = 0.0       # farthest pair taken from the same cluster
    for i in range(n - 1):
        for j in range(i + 1, n):
            dist = np.sqrt(np.sum(np.square(feature[i] - feature[j])))
            if pred[i] == pred[j]:
                if dist > max_diameter:
                    max_diameter = dist
            elif dist < min_separation:
                min_separation = dist
    # Avoid dividing by a zero diameter; the ratio is unbounded in that case.
    if max_diameter == 0:
        return np.inf
    return min_separation / max_diameter
    #********* End *********#

第3关:sklearn中的聚类性能评估指标

测试说明
平台会对你编写的代码进行测试,期望您的代码根据输入来按顺序返回正确的Rand指数和FM指数(注意:对照下面的预期输出,实际返回顺序是FM指数在前、调整Rand指数(ARI)在后),以下为其中一个测试用例(字典中的y_true部分代表参考模型给出的簇划分,y_pred部分代表聚类模型给出的簇划分):

测试输入:
{'y_true':[0, 0, 1, 1],'y_pred':[1, 0, 1, 1]}

预期输出:
0.408248, 0.000000

from sklearn.metrics.cluster import fowlkes_mallows_score, adjusted_rand_score
def cluster_performance(y_true, y_pred):
    '''
    Score a clustering against a reference partition with scikit-learn.

    Note the order of the returned pair: the Fowlkes-Mallows score comes
    first, followed by the adjusted Rand index.

    :param y_true: cluster assignment given by the reference model
    :param y_pred: cluster assignment given by the clustering model
    :return: tuple (fowlkes_mallows_score, adjusted_rand_score)
    '''
    #********* Begin *********#
    fm_index = fowlkes_mallows_score(y_true, y_pred)
    adjusted_rand = adjusted_rand_score(y_true, y_pred)
    return fm_index, adjusted_rand
    #********* End *********#

你可能感兴趣的:(机器学习,机器学习,Educoder,聚类性能评估指标)