(小白的个人理解,很多地方可能不准确,欢迎大家指正,向大家学习)
# -*- coding: utf-8 -*-
"""
@version:??
@author: xq
@contact:[email protected]
@file: k_means.py
@time: 2017/10/18 15:56
"""
import warnings

import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils.extmath import row_norms, squared_norm, stable_cumsum
from sklearn.utils.sparsefuncs_fast import assign_rows_csr
from sklearn.utils import check_array
from sklearn.utils import check_random_state
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES
from sklearn.cluster import k_means


class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
    """K-Means clustering

    Read more in the :ref:`User Guide`.

    Examples
    --------
    >>> from sklearn.cluster import KMeans
    >>> import numpy as np
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [4, 2], [4, 4], [4, 0]])
    >>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
    >>> kmeans.labels_
    array([0, 0, 0, 1, 1, 1], dtype=int32)
    >>> kmeans.predict([[0, 0], [4, 4]])
    array([0, 1], dtype=int32)
    >>> kmeans.cluster_centers_
    array([[ 1.,  2.],
           [ 4.,  2.]])
    """

    def __init__(self, n_clusters=8, init='k-means++', n_init=10,
                 max_iter=300, tol=1e-4, precompute_distances='auto',
                 verbose=0, random_state=None, copy_x=True,
                 n_jobs=1, algorithm='auto'):
        # All arguments are stored unchanged and forwarded to
        # ``sklearn.cluster.k_means`` by ``fit``.
        self.n_clusters = n_clusters  # number of clusters
        self.init = init  # how the initial centers are chosen
        self.max_iter = max_iter  # maximum number of iterations per run
        self.tol = tol  # convergence tolerance (not an iteration count)
        # whether distances are precomputed
        self.precompute_distances = precompute_distances
        # number of runs with different initial centers
        self.n_init = n_init
        self.verbose = verbose  # whether to output detailed information
        # generator (or seed) used to initialise the centers
        self.random_state = random_state
        self.copy_x = copy_x  # whether the input data is copied
        self.n_jobs = n_jobs  # number of processes to use
        self.algorithm = algorithm  # which k-means algorithm to use

    def _check_fit_data(self, X):
        """Validate training data: X needs at least n_clusters samples.

        Parameters
        ----------
        X : array-like or sparse matrix

        Returns
        -------
        X : the validated array, as returned by ``check_array``
            (float64/float32 dtypes requested, CSR accepted).

        Raises
        ------
        ValueError
            If the number of samples is smaller than ``n_clusters``.
        """
        X = check_array(X, accept_sparse='csr',
                        dtype=[np.float64, np.float32])
        if X.shape[0] < self.n_clusters:
            raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
                X.shape[0], self.n_clusters))
        return X

    def _check_test_data(self, X):
        """Validate data passed to transform/predict/score.

        Raises
        ------
        ValueError
            If the number of features of X differs from the number of
            features of the fitted ``cluster_centers_``.
        """
        X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES)
        n_samples, n_features = X.shape
        expected_n_features = self.cluster_centers_.shape[1]
        if n_features != expected_n_features:
            raise ValueError("Incorrect number of features. "
                             "Got %d features, expected %d" % (
                                 n_features, expected_n_features))
        return X

    def fit(self, X, y=None):
        """Compute k-means clustering.

        Parameters
        ----------
        X : array or sparse matrix, shape=(n_samples, n_features)
        y : target vector (not used by this method)

        Returns
        -------
        self
        """
        # Turn self.random_state into a np.random.RandomState instance.
        random_state = check_random_state(self.random_state)
        # Validate the data.
        X = self._check_fit_data(X)

        # Run the clustering. With return_n_iter=True, k_means returns:
        #   1. centroid: ndarray of centers, shape (n_clusters, n_features)
        #   2. label: integer ndarray, shape (n_samples,); label[i] is the
        #      index of the cluster closest to sample i
        #   3. inertia: float, the final value of the criterion (sum of
        #      squared distances of all training points to their closest
        #      center)
        #   4. best_n_iter: number of iterations of the best run
        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            k_means(
                X, n_clusters=self.n_clusters, init=self.init,
                n_init=self.n_init, max_iter=self.max_iter,
                verbose=self.verbose,
                precompute_distances=self.precompute_distances,
                tol=self.tol, random_state=random_state,
                copy_x=self.copy_x, n_jobs=self.n_jobs,
                algorithm=self.algorithm, return_n_iter=True)
        return self

    def fit_predict(self, X, y=None):
        """Fit the model on X and return the cluster index of each sample.

        Parameters
        ----------
        X : the data to cluster

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        return self.fit(X).labels_

    def fit_transform(self, X, y=None):
        """Compute the clustering and transform X to cluster-distance space.

        Equivalent to fit(X).transform(X).

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            The data to cluster.
        y : Ignored

        Returns
        -------
        X_new : array, shape [n_samples, k]
            X in the new space; each dimension is the distance from the
            sample to one of the cluster centers.
        """
        # Validate here so the already-converted X is what gets transformed.
        X = self._check_fit_data(X)
        return self.fit(X)._transform(X)

    def transform(self, X):
        """Transform X to cluster-distance space.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]

        Returns
        -------
        X_new : array, shape [n_samples, k]
            Distance from each sample to each cluster center.
        """
        check_is_fitted(self, 'cluster_centers_')

        X = self._check_test_data(X)
        return self._transform(X)

    def _transform(self, X):
        """Return the distances from X to the cluster centers."""
        return euclidean_distances(X, self.cluster_centers_)

    def _nearest_centers(self, X):
        """Return ``(labels, inertia)`` of X for the fitted centers.

        ``labels[i]`` is the index of the center closest to sample i and
        ``inertia`` is the sum of the squared distances from every sample
        to its closest center.

        This replaces the call to ``_labels_inertia``, a name that is never
        imported or defined in this module and therefore raised NameError.
        """
        sq_dist = euclidean_distances(X, self.cluster_centers_, squared=True)
        # int32 labels, matching the dtype shown in the class docstring.
        labels = sq_dist.argmin(axis=1).astype(np.int32)
        inertia = sq_dist[np.arange(sq_dist.shape[0]), labels].sum()
        return labels, inertia

    def predict(self, X):
        """Predict the closest cluster for each sample in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the closest cluster center for each sample.
        """
        check_is_fitted(self, 'cluster_centers_')

        X = self._check_test_data(X)
        return self._nearest_centers(X)[0]

    def score(self, X, y=None):
        """Opposite of the value of X on the K-means objective.

        The value returned is the negated sum of squared distances from
        each sample to its closest cluster center, so it is never positive
        and values closer to 0 mean the samples lie closer to the centers.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        y : Ignored

        Returns
        -------
        score : float
            Opposite of the value of X on the K-means objective.
        """
        check_is_fitted(self, 'cluster_centers_')

        X = self._check_test_data(X)
        return -self._nearest_centers(X)[1]
测试:
# Test data: (lat, lng) pairs; every stop shares the same 'Id'.
_stop_coords = [
    (28.571906, 112.337788),
    (28.573678, 112.381103),
    (28.571915, 112.337533),
    (28.573978, 112.35765),
    (28.572656, 112.3366),
    (28.578011, 112.330688),
    (28.572228, 112.335841),
    (28.57849, 112.3338),
    (28.57239, 112.336491),
    (28.577943, 112.330995),
    (28.571921, 112.337783),
    (28.572401, 112.3359),
    (28.569629, 112.34005),
    (28.588048, 112.337783),
    (28.572035, 112.335683),
    (28.560938, 112.378183),
    (28.544781, 112.494936),
    (28.572296, 112.336288),
    (28.571951, 112.337806),
    (28.571551, 112.32685),
]
stopList = [{'Id': '50001', 'lat': lat, 'lng': lng}
            for lat, lng in _stop_coords]
print('共有%d个点' % len(stopList))

# clusterApi is defined outside this file.
# NOTE(review): initData() presumably turns stopList into the array that
# KMeans consumes - confirm against the clusterApi source.
clustertest = clusterApi(stopList)  # instantiate
data = clustertest.initData()
# clustertest.k_meansUp()  # cluster and plot

model = KMeans(n_clusters=6)
testData = model.fit(data)
# Index of the cluster center each point belongs to.
print('labels_', testData.labels_)
# Distance from each point to each center.
print('fit.transform(data)', testData.transform(data))
print('*' * 20)
# Index of the closest cluster center for each point.
print('predict', model.predict(data))
print('score', model.score(data))  # score -0.000161647796631
# print('transform(data)', model.transform(data))  # same as fit.transform
输出结果:
共有20个点
labels_ [1 0 1 4 1 5 1 5 1 5 1 1 1 3 1 0 2 1 1 5]
fit.transform(data) [[ 0.04211125 0.00097656 0.15947 0.01616499 0.01997781 0.00859708]
[ 0.00647778 0.04406992 0.11744154 0.0456432 0.02345784 0.0505931 ]
[ 0.04235963 0. 0.15972395 0.01613546 0.02023865 0.00831513]
[ 0.0229648 0.02068146 0.14035517 0.02434561 0. 0.02718635]
[ 0.04336641 0.00069053 0.16076989 0.01545624 0.02108109 0.00717624]
[ 0.05010537 0.00878906 0.16757369 0.0122752 0.02724767 0.00119604]
[ 0.04407533 0.00119604 0.16144471 0.01594221 0.02185843 0.00676582]
[ 0.04718424 0.00724238 0.16462249 0.01033497 0.02426714 0.00371864]
[ 0.04344331 0. 0.16083217 0.01570111 0.02121637 0.00717624]
[ 0.04979511 0.00854143 0.16726181 0.01221679 0.02695737 0.00169146]
[ 0.04210559 0.00069053 0.15947747 0.01612068 0.01996587 0.00851347]
[ 0.04402662 0.00097656 0.1614137 0.0157466 0.02180382 0.00673049]
[ 0.03966215 0.00378221 0.15686524 0.01855469 0.01812569 0.01173908]
[ 0.04671196 0.01612068 0.16299966 0.00069053 0.02433581 0.01356684]
[ 0.04421575 0.00119604 0.16156872 0.01615023 0.02205389 0.00683594]
[ 0.00651448 0.04255617 0.11786507 0.04866183 0.02432601 0.05007681]
[ 0.11747199 0.16018453 0. 0.16300259 0.14035687 0.16738578]
[ 0.0436459 0.00097656 0.16101736 0.01583718 0.02142882 0.00707587]
[ 0.04209426 0. 0.15946252 0.01609107 0.01991804 0.00851347]
[ 0.05296445 0.01019561 0.17020507 0.01979798 0.03088934 0.0061376 ]]
********************
predict [1 0 1 4 1 5 1 5 1 5 1 1 1 3 1 0 2 1 1 5]
score -0.000161647796631(说明:score 返回的是 K-means 目标函数值的相反数,即所有样本到各自最近簇中心的距离平方和(inertia)取负,所以结果不会大于 0,越接近 0 表示样本离簇中心越近)