源码解读----之-----KMeans

(小白的个人理解,很多地方可能不准确,欢迎大家指正,向大家学习)



#encoding = utf-8
"""
@version:??
@author: xq
@contact:[email protected]
@file: k_means.py
@time: 2017/10/18 15:56
"""
import warnings

import numpy as np
import scipy.sparse as sp

from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils.extmath import row_norms, squared_norm, stable_cumsum
from sklearn.utils.sparsefuncs_fast import assign_rows_csr
from sklearn.utils import check_array
from sklearn.utils import check_random_state
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES
from sklearn.cluster import k_means

class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
    """K-Means clustering (annotated re-implementation of the scikit-learn API).

    Read more in the :ref:`User Guide `.
    Examples
    --------
    >>> from sklearn.cluster import KMeans
    >>> import numpy as np
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [4, 2], [4, 4], [4, 0]])
    >>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
    >>> kmeans.labels_
    array([0, 0, 0, 1, 1, 1], dtype=int32)
    >>> kmeans.predict([[0, 0], [4, 4]])
    array([0, 1], dtype=int32)
    >>> kmeans.cluster_centers_
    array([[ 1.,  2.],
           [ 4.,  2.]])

    """

    def __init__(self, n_clusters=8, init='k-means++', n_init=10,
                 max_iter=300, tol=1e-4, precompute_distances='auto',
                 verbose=0, random_state=None, copy_x=True,
                 n_jobs=1, algorithm='auto'):
        self.n_clusters = n_clusters  # number of clusters (k)
        self.init = init  # strategy for picking the initial centroids
        self.max_iter = max_iter  # maximum iterations of a single k-means run
        # NOTE: tol is the convergence tolerance on centroid movement,
        # NOT an iteration count (the original comment was wrong).
        self.tol = tol
        self.precompute_distances = precompute_distances  # whether to precompute distances
        self.n_init = n_init  # number of runs with different centroid seeds
        self.verbose = verbose  # verbosity mode
        self.random_state = random_state  # RNG (seed/state) for centroid initialization
        self.copy_x = copy_x  # whether to copy the input data before centering
        self.n_jobs = n_jobs  # number of parallel jobs
        self.algorithm = algorithm  # k-means variant to use

    def _check_fit_data(self, X):
        """Validate X for fitting: coerce to a float array/CSR matrix and
        require at least as many samples as clusters."""
        # check_array coerces X to a 2-D float64/float32 array (or CSR matrix).
        X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32])
        # Fewer samples than clusters makes the problem ill-posed.
        if X.shape[0] < self.n_clusters:
            raise ValueError("n_samples=%d should be >= n_clusters=%d" % (X.shape[0], self.n_clusters))
        return X

    def _check_test_data(self, X):
        """Validate X for predict/transform: it must have the same number of
        features as the fitted cluster centers."""
        X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES)
        n_samples, n_features = X.shape
        expected_n_features = self.cluster_centers_.shape[1]
        if n_features != expected_n_features:
            raise ValueError("Incorrect number of features. "
                             "Got %d features, expected %d" % (
                                 n_features, expected_n_features))

        return X

    def fit(self, X, y=None):
        """Compute k-means clustering.

        Parameters
        ----------
        X : array or sparse matrix, shape=(n_samples, n_features)
        y : Ignored

        Returns
        -------
        self : fitted estimator
        """
        # Turn self.random_state into an np.random.RandomState instance.
        random_state = check_random_state(self.random_state)
        X = self._check_fit_data(X)
        # k_means returns:
        #   cluster_centers_ : ndarray, shape (n_clusters, n_features)
        #   labels_          : int ndarray, shape (n_samples,); labels_[i] is the
        #                      index of the closest center for sample i
        #   inertia_         : float, sum of squared distances of samples to
        #                      their closest center
        #   n_iter_          : iterations of the best run (return_n_iter=True)
        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            k_means(
                X, n_clusters=self.n_clusters, init=self.init,
                n_init=self.n_init, max_iter=self.max_iter, verbose=self.verbose,
                precompute_distances=self.precompute_distances,
                tol=self.tol, random_state=random_state, copy_x=self.copy_x,
                n_jobs=self.n_jobs, algorithm=self.algorithm,
                return_n_iter=True)
        return self

    def fit_predict(self, X, y=None):
        """Fit the model to X and return the cluster index of each sample.

        Returns
        -------
        labels : array, shape [n_samples,]
        """
        return self.fit(X).labels_

    def fit_transform(self, X, y=None):
        """Fit the model and transform X to cluster-distance space.
        Equivalent to fit(X).transform(X), but more efficient.

        Returns
        -------
        X_new : array, shape [n_samples, k]; each column is the distance
            from the sample to one cluster center.
        """
        X = self._check_fit_data(X)
        return self.fit(X)._transform(X)

    def transform(self, X):
        """Transform X to cluster-distance space (requires a fitted model).

        Returns
        -------
        X_new : array, shape [n_samples, k]
        """
        check_is_fitted(self, 'cluster_centers_')
        X = self._check_test_data(X)
        return self._transform(X)

    def _transform(self, X):
        """Return the Euclidean distance from each sample to every center."""
        return euclidean_distances(X, self.cluster_centers_)

    def _labels_and_inertia(self, X):
        """Return (labels, inertia) of X against the fitted centers.

        labels[i] is the index of the closest center for sample i; inertia is
        the sum of squared distances of all samples to their closest center.
        (Replaces the call to sklearn's private _labels_inertia, which was
        never imported and raised NameError.)
        """
        sq_distances = euclidean_distances(X, self.cluster_centers_,
                                           squared=True)
        labels = sq_distances.argmin(axis=1).astype(np.int32)
        inertia = sq_distances[np.arange(sq_distances.shape[0]), labels].sum()
        return labels, inertia

    def predict(self, X):
        """Predict the closest cluster for each sample in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]

        Returns
        -------
        labels : array, shape [n_samples,]; index of the closest center
        """
        check_is_fitted(self, 'cluster_centers_')
        X = self._check_test_data(X)
        return self._labels_and_inertia(X)[0]

    def score(self, X, y=None):
        """Opposite of the value of X on the K-means objective.

        The k-means objective (inertia) is the sum of squared distances of
        samples to their closest center; score returns its negation so that
        "higher is better", following the scikit-learn scoring convention.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        y : Ignored

        Returns
        -------
        score : float
        """
        check_is_fitted(self, 'cluster_centers_')
        X = self._check_test_data(X)
        return -self._labels_and_inertia(X)[1]

测试:

# Test data: 20 GPS points (lat/lng) all sharing the same Id.
stopList= [{'Id': '50001','lat': 28.571906,'lng': 112.337788},
           {'Id': '50001','lat': 28.573678,'lng': 112.381103},
           { 'Id': '50001','lat': 28.571915,'lng': 112.337533},
           { 'Id': '50001','lat': 28.573978,'lng': 112.35765},
            { 'Id': '50001','lat': 28.572656,'lng': 112.3366},
           {'Id': '50001', 'lat': 28.578011, 'lng': 112.330688},
           {'Id': '50001', 'lat': 28.572228, 'lng': 112.335841},
           {'Id': '50001', 'lat': 28.57849, 'lng': 112.3338},
           {'Id': '50001', 'lat': 28.57239, 'lng': 112.336491},
           {'Id': '50001', 'lat': 28.577943, 'lng': 112.330995},
           {'Id': '50001', 'lat': 28.571921, 'lng': 112.337783},
           {'Id': '50001', 'lat': 28.572401, 'lng': 112.3359},
           {'Id': '50001', 'lat': 28.569629, 'lng': 112.34005},
           {'Id': '50001', 'lat': 28.588048, 'lng': 112.337783},
           {'Id': '50001', 'lat': 28.572035, 'lng': 112.335683},
           {'Id': '50001', 'lat': 28.560938, 'lng': 112.378183},
           {'Id': '50001', 'lat': 28.544781, 'lng': 112.494936},
           {'Id': '50001', 'lat': 28.572296, 'lng': 112.336288},
           {'Id': '50001', 'lat': 28.571951, 'lng': 112.337806},
           {'Id': '50001', 'lat': 28.571551, 'lng': 112.32685}]

print('共有%d个点'%len(stopList))
clustertest = clusterApi(stopList)# instantiate; NOTE(review): clusterApi is not defined in this snippet — presumably a helper from the article's other code, confirm
data = clustertest.initData()
# clustertest.k_meansUp()# optional: run the clustering and plot the result
model = KMeans(n_clusters=6)
testData =model.fit(data)
print('labels_',testData.labels_)# index of the closest centroid for each sample
print('fit.transform(data)',testData.transform(data))# distance from each sample to every centroid
print('*'*20)
print('predict',model.predict(data))# cluster index per sample (the original comment claiming "distances" here was wrong)
print('score',model.score(data))# score -0.000161647796631
#print('transform(data)',model.transform(data)) — same result as fit.transform above

输出结果:

共有20个点
labels_ [1 0 1 4 1 5 1 5 1 5 1 1 1 3 1 0 2 1 1 5]
fit.transform(data) [[ 0.04211125  0.00097656  0.15947     0.01616499  0.01997781  0.00859708]
 [ 0.00647778  0.04406992  0.11744154  0.0456432   0.02345784  0.0505931 ]
 [ 0.04235963  0.          0.15972395  0.01613546  0.02023865  0.00831513]
 [ 0.0229648   0.02068146  0.14035517  0.02434561  0.          0.02718635]
 [ 0.04336641  0.00069053  0.16076989  0.01545624  0.02108109  0.00717624]
 [ 0.05010537  0.00878906  0.16757369  0.0122752   0.02724767  0.00119604]
 [ 0.04407533  0.00119604  0.16144471  0.01594221  0.02185843  0.00676582]
 [ 0.04718424  0.00724238  0.16462249  0.01033497  0.02426714  0.00371864]
 [ 0.04344331  0.          0.16083217  0.01570111  0.02121637  0.00717624]
 [ 0.04979511  0.00854143  0.16726181  0.01221679  0.02695737  0.00169146]
 [ 0.04210559  0.00069053  0.15947747  0.01612068  0.01996587  0.00851347]
 [ 0.04402662  0.00097656  0.1614137   0.0157466   0.02180382  0.00673049]
 [ 0.03966215  0.00378221  0.15686524  0.01855469  0.01812569  0.01173908]
 [ 0.04671196  0.01612068  0.16299966  0.00069053  0.02433581  0.01356684]
 [ 0.04421575  0.00119604  0.16156872  0.01615023  0.02205389  0.00683594]
 [ 0.00651448  0.04255617  0.11786507  0.04866183  0.02432601  0.05007681]
 [ 0.11747199  0.16018453  0.          0.16300259  0.14035687  0.16738578]
 [ 0.0436459   0.00097656  0.16101736  0.01583718  0.02142882  0.00707587]
 [ 0.04209426  0.          0.15946252  0.01609107  0.01991804  0.00851347]
 [ 0.05296445  0.01019561  0.17020507  0.01979798  0.03088934  0.0061376 ]]
********************
predict [1 0 1 4 1 5 1 5 1 5 1 1 1 3 1 0 2 1 1 5]
score -0.000161647796631(注:score 是 inertia 的相反数,即所有样本到其最近簇中心距离平方和取负,数值越接近 0 表示聚类越紧凑,因此这里是一个很小的负数)




你可能感兴趣的:(算法,python)