Evaluation Metrics for Clustering (Unsupervised Learning)

For a detailed theoretical treatment, see other blog posts.

# coding:utf-8

from sklearn import metrics

"""
    聚类性能评估
"""
"""
    1、Adjusted Rand index (ARI)
    优点:
    1.1 对任意数量的聚类中心和样本数,随机聚类的ARI都非常接近于0;
    1.2 取值在[-1,1]之间,负数代表结果不好,越接近于1越好;
    1.3 可用于聚类算法之间的比较
    缺点:
    1.4 ARI需要真实标签
"""
labels_true = [0, 0, 0, 1, 1, 1]
labels_pre1 = [0, 0, 1, 1, 2, 2]
labels_pre2 = [1, 1, 2, 2, 3, 3]
labels_pre3 = [1, 1, 2, 2, 2, 1]
labels_pre4 = [1, 1, 1, 2, 2, 2]  # perfect clustering: the prediction matches the true classes exactly, so the score is 1.0

print(metrics.adjusted_rand_score(labels_true=labels_true, labels_pred=labels_pre1))
print(metrics.adjusted_rand_score(labels_true=labels_true, labels_pred=labels_pre2))
print(metrics.adjusted_rand_score(labels_true=labels_true, labels_pred=labels_pre3))
print(metrics.adjusted_rand_score(labels_true=labels_true, labels_pred=labels_pre4))
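"""
    A quick sanity check on the chance adjustment (an illustrative sketch, not
    part of the original examples): two independent random labelings score
    close to 0, and ARI is symmetric in its two arguments.
"""
import numpy as np

rng = np.random.RandomState(0)
print(metrics.adjusted_rand_score(rng.randint(0, 3, 300), rng.randint(0, 3, 300)))  # ~0.0
print(metrics.adjusted_rand_score(labels_pre4, labels_true))  # symmetric: 1.0 again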
print('--------------------------------------------------------------------------------')

"""
    2、Mutual Information based scores (MI) 互信息
    优点:除取值范围在[0,1]之间,其他同ARI;可用于聚类模型选择
    缺点:需要先验知识
"""
labels_true = [0, 0, 0, 1, 1, 1]
labels_pre5 = [0, 0, 1, 1, 2, 2]
labels_pre6 = [1, 1, 0, 0, 3, 3]
labels_pre7 = [1, 3, 2, 3, 2, 1]
labels_pre8 = [1, 1, 1, 2, 2, 2]  # perfect clustering: the prediction matches the true classes exactly, so the score is 1.0

print(metrics.adjusted_mutual_info_score(labels_true=labels_true, labels_pred=labels_pre5))
print(metrics.adjusted_mutual_info_score(labels_true=labels_true, labels_pred=labels_pre6))
print(metrics.adjusted_mutual_info_score(labels_true=labels_true, labels_pred=labels_pre7))
print(metrics.adjusted_mutual_info_score(labels_true=labels_true, labels_pred=labels_pre8))
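"""
    Note that the raw mutual information is neither normalized nor adjusted
    for chance (an illustrative sketch): for the perfect labeling above it
    returns log(2) rather than 1.0, while the normalized variant returns 1.0.
"""
print(metrics.mutual_info_score(labels_true, labels_pre8))             # log(2) ~= 0.693
print(metrics.normalized_mutual_info_score(labels_true, labels_pre8))  # 1.0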
print('-----------------------------------------------------------------------------------')

"""
    3、Homogeneity, completeness and V-measure
    优点:[0,1]之间
"""
labels_true = [0, 0, 0, 1, 1, 1]
labels_pre9 = [0, 0, 1, 1, 2, 2]
labels_pre10 = [0, 0, 0, 1, 1, 1]

print(metrics.homogeneity_score(labels_true, labels_pre9))
print(metrics.completeness_score(labels_true, labels_pre9))
print(metrics.completeness_score(labels_true, labels_pre10))
print(metrics.v_measure_score(labels_true, labels_pre9))
print(metrics.homogeneity_completeness_v_measure(labels_true, labels_pre9))
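"""
    Homogeneity and completeness are mirror images of each other (an
    illustrative check): swapping the argument order of one yields the other.
"""
print(metrics.homogeneity_score(labels_true, labels_pre9))   # == completeness_score(labels_pre9, labels_true)
print(metrics.completeness_score(labels_pre9, labels_true))  # same value as the line above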

"""
    Example: 不同评分保准所依赖的计算方式不同
"""
import numpy as np
import matplotlib.pyplot as plt
from time import time
from sklearn import metrics


def uniform_labelings_scores(score_func, n_samples, n_clusters_range, fixed_n_classes=None, n_runs=5, seed=42):
    """Score two independent random uniform labelings per run; if fixed_n_classes
    is given, score random labelings against one fixed random "ground truth"."""
    random_labels = np.random.RandomState(seed).randint
    scores = np.zeros((len(n_clusters_range), n_runs))

    if fixed_n_classes is not None:
        # one fixed "ground truth" labeling with a constant number of classes
        labels_a = random_labels(low=0, high=fixed_n_classes, size=n_samples)

    for i, k in enumerate(n_clusters_range):
        for j in range(n_runs):
            if fixed_n_classes is None:
                labels_a = random_labels(low=0, high=k, size=n_samples)
            labels_b = random_labels(low=0, high=k, size=n_samples)
            scores[i, j] = score_func(labels_a, labels_b)
    return scores


score_funcs = [
    metrics.adjusted_rand_score,
    metrics.v_measure_score,
    metrics.adjusted_mutual_info_score,
    metrics.mutual_info_score,
]

n_samples = 100
n_clusters_range = np.linspace(2, n_samples, 10).astype(int)

plt.figure(1)

plots = []
names = []
for score_func in score_funcs:
    print("Computing %s for %d values of n_clusters and n_samples=%d"
          % (score_func.__name__, len(n_clusters_range), n_samples))

    t0 = time()
    scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range)
    print("done in %0.3fs" % (time() - t0))
    plots.append(plt.errorbar(
        n_clusters_range, np.median(scores, axis=1), scores.std(axis=1))[0])
    names.append(score_func.__name__)

plt.title("Clustering measures for 2 random uniform labelings\n"
          "with equal number of clusters")
plt.xlabel('Number of clusters (Number of samples is fixed to %d)' % n_samples)
plt.ylabel('Score value')
plt.legend(plots, names)
plt.ylim(-0.05, 1.05)

n_samples = 1000
n_clusters_range = np.linspace(2, 100, 10).astype(int)
n_classes = 10

plt.figure(2)

plots = []
names = []
for score_func in score_funcs:
    print("Computing %s for %d values of n_clusters and n_samples=%d"
          % (score_func.__name__, len(n_clusters_range), n_samples))

    t0 = time()
    scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range,
                                      fixed_n_classes=n_classes)
    print("done in %0.3fs" % (time() - t0))
    plots.append(plt.errorbar(
        n_clusters_range, scores.mean(axis=1), scores.std(axis=1))[0])
    names.append(score_func.__name__)

plt.title("Clustering measures for random uniform labeling\n"
          "against reference assignment with %d classes" % n_classes)
plt.xlabel('Number of clusters (Number of samples is fixed to %d)' % n_samples)
plt.ylabel('Score value')
plt.ylim(-0.05, 1.05)
plt.legend(plots, names)
plt.show()
print('-----------------------------------------------------------------------------')

"""
    4、 Fowlkes-Mallows scores(FMI)
    优点:[0, 1]
"""
labels_true = [0, 0, 0, 1, 1, 1]
labels_pre11 = [0, 0, 1, 1, 2, 2]
labels_pre12 = [0, 0, 0, 1, 1, 1]
labels_pre13 = [0, 1, 2, 0, 3, 4]

print(metrics.fowlkes_mallows_score(labels_true, labels_pre11))
print(metrics.fowlkes_mallows_score(labels_true, labels_pre12))
print(metrics.fowlkes_mallows_score(labels_true, labels_pre13))
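"""
    FMI is the geometric mean of pairwise precision and recall. A minimal
    hand-rolled version that counts pairs of samples directly (an illustrative
    sketch; fmi_by_hand is not a sklearn function):
"""
from itertools import combinations
from math import sqrt


def fmi_by_hand(labels_true, labels_pred):
    tp = fp = fn = 0
    for i, j in combinations(range(len(labels_true)), 2):
        same_true = labels_true[i] == labels_true[j]
        same_pred = labels_pred[i] == labels_pred[j]
        if same_true and same_pred:
            tp += 1  # pair grouped together in both partitions
        elif same_pred:
            fp += 1  # grouped together only in the prediction
        elif same_true:
            fn += 1  # grouped together only in the ground truth
    return tp / sqrt((tp + fp) * (tp + fn)) if tp else 0.0


print(fmi_by_hand(labels_true, labels_pre11))  # matches fowlkes_mallows_score above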
print('------------------------------------------------------------------------------')

"""
    5、 Silhouette Coefficient
    优点:
    5.1 评分结果在[-1, +1]之间,评分结果越高,聚类结果越好
    5.2 评分很高时,簇的密度越高,划分越好,这也关系到一个聚类的标准性
    5.3 重要的是,这个评分标准不需要先验知识
"""
from sklearn import metrics
from sklearn import datasets
from sklearn.cluster import KMeans

iris = datasets.load_iris()
X = iris.data
y = iris.target

kmeans_model = KMeans(n_clusters=2).fit(X)
labels = kmeans_model.labels_
print(metrics.silhouette_score(X, labels, metric='euclidean'))
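"""
    To make the definition s = (b - a) / max(a, b) concrete, here is a
    hand-rolled silhouette value for a single sample (an illustrative sketch;
    silhouette_one is not a sklearn function), assuming Euclidean distance and
    a cluster with at least two members:
"""
import numpy as np


def silhouette_one(X, labels, idx):
    d = np.linalg.norm(X - X[idx], axis=1)  # distances from sample idx to all samples
    own = labels == labels[idx]
    a = d[own].sum() / (own.sum() - 1)      # mean distance to the rest of its own cluster
    b = min(d[labels == k].mean() for k in set(labels) if k != labels[idx])
    return (b - a) / max(a, b)


print(silhouette_one(X, labels, 0))  # agrees with metrics.silhouette_samples(X, labels)[0]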

"""
    Example: silhouette analysis on KMeans clustering
"""
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

X, y = make_blobs(n_samples=500, n_features=2, centers=4, cluster_std=1, center_box=(-10.0, 10.0), shuffle=True,
                  random_state=1)

range_n_clusters = [2, 3, 4, 5, 6]

for n_clusters in range_n_clusters:
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The first subplot is the silhouette plot
    # The silhouette coefficient can range from -1 to 1, but here the x-axis
    # is restricted to [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters + 1) * 10 inserts blank space between the silhouette bands
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    silhouette_avg = silhouette_score(X, cluster_labels)
    print('For n_clusters = %d, the average silhouette_score is: %f' % (n_clusters, silhouette_avg))

    # Compute the silhouette score for each individual sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Collect the silhouette scores of samples in cluster i, then sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Draw a vertical line at the average silhouette score of all samples
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # The second subplot shows the actual clusters that were formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors)

    # Cluster centers
    centers = clusterer.cluster_centers_
    # Draw white circles at the cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1],
                marker='o', c="white", alpha=1, s=200)

    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')
    plt.show()
print('-----------------------------------------------------------------------------')
"""
     6、Calinski-Harabaz Index
     优点:
     6.1 评分很高时,簇的密度越高,划分越好,这也关系到一个聚类的标准性
     6.2 评分是计算速度
"""
from sklearn import metrics
from sklearn import datasets
from sklearn.cluster import KMeans

iris = datasets.load_iris()
X = iris.data
y = iris.target

kmeans_model = KMeans(n_clusters=2, random_state=1).fit(X)
labels = kmeans_model.labels_
print(metrics.calinski_harabasz_score(X, labels))
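"""
    The score is the ratio of between-cluster to within-cluster dispersion,
    scaled by their degrees of freedom. A hand-rolled version (an illustrative
    sketch; ch_by_hand is not a sklearn function):
"""
def ch_by_hand(X, labels):
    n, k = len(X), len(set(labels))
    overall_mean = X.mean(axis=0)
    between = within = 0.0
    for c in set(labels):
        Xc = X[labels == c]  # samples assigned to cluster c
        center = Xc.mean(axis=0)
        between += len(Xc) * ((center - overall_mean) ** 2).sum()
        within += ((Xc - center) ** 2).sum()
    return (between / (k - 1)) / (within / (n - k))


print(ch_by_hand(X, labels))  # agrees with calinski_harabasz_score above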
