平台上的其他实训题目都有答案,只有这个没有,我把题目贴出来方便对比,顺便分享答案,欢迎大佬来指正。
任务描述
本关任务:填写python代码,完成calc_JC函数、calc_FM函数和calc_Rand函数分别实现计算JC系数、FM指数和Rand指数。
测试输入:
{'y_true': [0, 0, 0, 1, 1, 1], 'y_pred': [0, 0, 1, 1, 2, 2]}
预期输出:
0.285714 0.471405 0.666667
import numpy as np
def calc_JC(y_true, y_pred):
    """Compute the Jaccard coefficient (JC) between two clusterings.

    Every unordered pair of samples is classified by whether its two members
    share a cluster in each partition:

    * ``a`` - same cluster in ``y_true`` and same cluster in ``y_pred``
    * ``b`` - different clusters in ``y_true`` but same cluster in ``y_pred``
    * ``c`` - same cluster in ``y_true`` but different clusters in ``y_pred``

    The coefficient is ``a / (a + b + c)``.

    :param y_true: cluster labels given by the reference model (any indexable
        sequence, e.g. an ndarray or a list)
    :param y_pred: cluster labels given by the clustering model, same length
        as ``y_true``
    :return: the JC coefficient; 0.0 when ``a + b + c`` is zero (no pair is
        co-clustered in either partition), where the ratio is undefined
    :raises ValueError: if the two label sequences differ in length
    """
    #******** Begin *******#
    n = len(y_true)
    if len(y_pred) != n:
        raise ValueError("y_true and y_pred must have the same length")

    # Classify every unordered pair exactly once instead of rescanning the
    # full index grid separately for each counter.
    a = b = c = 0
    for i in range(n - 1):
        for j in range(i + 1, n):
            same_true = y_true[i] == y_true[j]
            same_pred = y_pred[i] == y_pred[j]
            if same_true and same_pred:
                a += 1
            elif same_pred:
                b += 1
            elif same_true:
                c += 1

    denominator = a + b + c
    if denominator == 0:
        # Avoid ZeroDivisionError when the ratio is undefined.
        return 0.0
    return a / denominator
    #******** End *******#
def calc_FM(y_true, y_pred):
    """Compute the Fowlkes-Mallows (FM) index between two clusterings.

    Every unordered pair of samples is classified by whether its two members
    share a cluster in each partition:

    * ``a`` - same cluster in ``y_true`` and same cluster in ``y_pred``
    * ``b`` - different clusters in ``y_true`` but same cluster in ``y_pred``
    * ``c`` - same cluster in ``y_true`` but different clusters in ``y_pred``

    The index is ``a / sqrt((a + b) * (a + c))``.

    :param y_true: cluster labels given by the reference model (any indexable
        sequence, e.g. an ndarray or a list)
    :param y_pred: cluster labels given by the clustering model, same length
        as ``y_true``
    :return: the FM index; 0.0 when no pair is co-clustered in both
        partitions (``a == 0``), which also covers a zero denominator
    :raises ValueError: if the two label sequences differ in length
    """
    #******** Begin *******#
    n = len(y_true)
    if len(y_pred) != n:
        raise ValueError("y_true and y_pred must have the same length")

    # Classify every unordered pair exactly once instead of rescanning the
    # full index grid separately for each counter.
    a = b = c = 0
    for i in range(n - 1):
        for j in range(i + 1, n):
            same_true = y_true[i] == y_true[j]
            same_pred = y_pred[i] == y_pred[j]
            if same_true and same_pred:
                a += 1
            elif same_pred:
                b += 1
            elif same_true:
                c += 1

    if a == 0:
        # Both factors of the denominator are >= a, so this guard also
        # prevents a 0/0 division that would otherwise produce nan.
        return 0.0
    return a / np.sqrt((a + b) * (a + c))
    #******** End *******#
def calc_Rand(y_true, y_pred):
    """Compute the Rand index between two clusterings.

    A pair of samples "agrees" when both partitions put its members in the
    same cluster (count ``a``) or both put them in different clusters
    (count ``d``).  With ``m`` samples the index is
    ``2 * (a + d) / (m * (m - 1))``, i.e. the fraction of agreeing pairs.

    :param y_true: cluster labels given by the reference model (any indexable
        sequence, e.g. an ndarray or a list)
    :param y_pred: cluster labels given by the clustering model, same length
        as ``y_true``
    :return: the Rand index; 1.0 when there are fewer than two samples and
        therefore no pair on which the partitions could disagree
    :raises ValueError: if the two label sequences differ in length
    """
    #******** Begin *******#
    m = len(y_true)
    if len(y_pred) != m:
        raise ValueError("y_true and y_pred must have the same length")
    if m < 2:
        # No pairs exist; avoid dividing by m * (m - 1) == 0.
        return 1.0

    # a + d: pairs on which the two partitions make the same decision.
    agreeing_pairs = 0
    for i in range(m - 1):
        for j in range(i + 1, m):
            same_true = y_true[i] == y_true[j]
            same_pred = y_pred[i] == y_pred[j]
            if bool(same_true) == bool(same_pred):
                agreeing_pairs += 1

    return (2 * agreeing_pairs) / (m * (m - 1))
    #******** End *******#
任务描述
本关任务:填写python代码,完成calc_DBI函数和calc_DI函数分别实现计算DB指数和Dunn指数。
测试说明
平台会对你编写的代码进行测试,期望您的代码根据输入来按顺序输出正确的DB指数和Dunn指数,以下为其中一个测试用例(feature表示待聚类数据的特征,pred表示聚类后数据所对应的簇):
测试输入:
{'feature': [[3,4],[6,9],[2,3],[3,4],[7,10],[8,11]], 'pred': [1, 2, 1, 1, 2, 2]}
预期输出:
0.204765 2.061553
import numpy as np
def calc_DBI(feature, pred):
    """Compute the Davies-Bouldin index (DBI) of a clustering.

    For each cluster ``i`` let ``avg(i)`` be the mean Euclidean distance of
    its members to its centroid ``mu_i``.  The index is the mean, over all
    clusters, of ``max_{j != i} (avg(i) + avg(j)) / dist(mu_i, mu_j)``.

    :param feature: features of the clustered samples, one row per sample
        (ndarray or nested list)
    :param pred: cluster label of each sample, same length as ``feature``
    :return: the DB index; 0.0 when fewer than two clusters are present,
        because no pair of clusters exists to compare
    :raises ValueError: if ``feature`` and ``pred`` differ in length
    """
    #********* Begin *********#
    points = np.asarray(feature, dtype=float)
    labels = np.asarray(pred)
    if len(points) != len(labels):
        raise ValueError("feature and pred must have the same length")

    cluster_ids = np.unique(labels)
    n_clusters = len(cluster_ids)
    if n_clusters < 2:
        return 0.0

    # Treat 1-D input as one feature per sample so axis=1 reductions work.
    points = points.reshape(len(labels), -1)

    # Centroid and mean member-to-centroid distance of every cluster.
    centroids = np.empty((n_clusters, points.shape[1]))
    scatter = np.empty(n_clusters)
    for idx, cluster_id in enumerate(cluster_ids):
        members = points[labels == cluster_id]
        centroids[idx] = members.mean(axis=0)
        offsets = members - centroids[idx]
        scatter[idx] = np.sqrt(np.sum(np.square(offsets), axis=1)).mean()

    # For each cluster take its worst ratio against ALL other clusters,
    # not just the neighbouring label, then average those maxima.
    total = 0.0
    for idx in range(n_clusters):
        others = np.arange(n_clusters) != idx
        gaps = centroids[others] - centroids[idx]
        centroid_dist = np.sqrt(np.sum(np.square(gaps), axis=1))
        # Coincident centroids give a non-finite ratio here.
        ratios = (scatter[others] + scatter[idx]) / centroid_dist
        total += ratios.max()
    return total / n_clusters
    #********* End *********#
def calc_DI(feature, pred):
    """Compute the Dunn index (DI) of a clustering.

    The index is the smallest Euclidean distance between two samples that
    belong to different clusters, divided by the largest Euclidean distance
    between two samples that belong to the same cluster.

    :param feature: features of the clustered samples, one row per sample
        (ndarray or nested list)
    :param pred: cluster label of each sample, same length as ``feature``
    :return: the Dunn index; ``inf`` when the largest intra-cluster distance
        is zero or when no pair of samples lies in different clusters
    :raises ValueError: if ``feature`` and ``pred`` differ in length
    """
    #********* Begin *********#
    points = np.asarray(feature, dtype=float)
    labels = np.asarray(pred)
    n_samples = len(labels)
    if len(points) != n_samples:
        raise ValueError("feature and pred must have the same length")
    if n_samples:
        # Treat 1-D input as one feature per sample so axis=1 works.
        points = points.reshape(n_samples, -1)

    min_separation = np.inf  # closest pair drawn from different clusters
    max_diameter = 0.0       # farthest pair drawn from the same cluster
    for i in range(n_samples - 1):
        # Distances from sample i to every later sample: each unordered pair
        # is visited once, regardless of which label has the lower index.
        dists = np.sqrt(np.sum(np.square(points[i + 1:] - points[i]), axis=1))
        same_cluster = labels[i + 1:] == labels[i]
        if same_cluster.any():
            max_diameter = max(max_diameter, dists[same_cluster].max())
        if not same_cluster.all():
            min_separation = min(min_separation, dists[~same_cluster].min())

    if max_diameter == 0:
        # Avoid dividing by zero when every cluster has zero diameter.
        return np.inf
    return min_separation / max_diameter
    #********* End *********#
测试说明
平台会对你编写的代码进行测试,期望您的代码根据输入来按顺序返回正确的Rand指数和FM指数,以下为其中一个测试用例(字典中的y_true部分代表参考模型给出的簇划分,y_pred部分代表聚类模型给出的簇划分)。注意:从下面的预期输出来看,实际的输出顺序是先FM指数(0.408248),后调整Rand指数(0.000000):
测试输入:
{'y_true': [0, 0, 1, 1], 'y_pred': [1, 0, 1, 1]}
预期输出:
0.408248, 0.000000
from sklearn.metrics.cluster import fowlkes_mallows_score, adjusted_rand_score
def cluster_performance(y_true, y_pred):
    """Score a predicted clustering against a reference clustering.

    Both scores are delegated to scikit-learn.

    :param y_true: cluster assignment from the reference model (described as
        an ndarray by the exercise)
    :param y_pred: cluster assignment from the clustering model (described as
        an ndarray by the exercise)
    :return: a 2-tuple ``(fowlkes_mallows_score, adjusted_rand_score)``, in
        that order
    """
    #********* Begin *********#
    fm_score = fowlkes_mallows_score(y_true, y_pred)
    ari_score = adjusted_rand_score(y_true, y_pred)
    return fm_score, ari_score
    #********* End *********#