def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300, init='k-means++', verbose=False, random_state=None, tol=1e-4, precompute_distances=True): """A single run of k-means, assumes preparation completed prior. Parameters ---------- X: array-like of floats, shape (n_samples, n_features) The observations to cluster. n_clusters: int The number of clusters to form as well as the number of centroids to generate. max_iter: int, optional, default 300 Maximum number of iterations of the k-means algorithm to run. init: {'k-means++', 'random', or ndarray, or a callable}, optional Method for initialization, default to 'k-means++': 'k-means++' : selects initial cluster centers for k-mean clustering in a smart way to speed up convergence. See section Notes in k_init for more details. 'random': generate k centroids from a Gaussian with mean and variance estimated from the data. If an ndarray is passed, it should be of shape (k, p) and gives the initial centers. If a callable is passed, it should take arguments X, k and and a random state and return an initialization. tol: float, optional The relative increment in the results before declaring convergence. verbose: boolean, optional Verbosity mode x_squared_norms: array Precomputed x_squared_norms. precompute_distances : boolean, default: True Precompute distances (faster but takes more memory). random_state: integer or numpy.RandomState, optional The generator used to initialize the centers. If an integer is given, it fixes the seed. Defaults to the global numpy random number generator. Returns ------- centroid: float ndarray with shape (k, n_features) Centroids found at the last iteration of k-means. label: integer ndarray with shape (n_samples,) label[i] is the code or index of the centroid the i'th observation is closest to. inertia: float The final value of the inertia criterion (sum of squared distances to the closest centroid for all observations in the training set). n_iter : int Number of iterations run. 
""" #将random_state统一转化为np.random.RandomState.详参函数。 random_state = check_random_state(random_state) #定义变量。 best_labels, best_inertia, best_centers = None, None, None # init。初始化起始中心点。默认方法为kmeans++。详参函数。 centers = _init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms) # I don't know why. if verbose: print("Initialization complete") # Allocate memory to store the distances for each sample to its # closer center for reallocation in case of ties #为每个样本点到其最近中心距离分配存储空间。shape(x.shape[0],) distances = np.zeros(shape=(X.shape[0],), dtype=np.float64) # iterations.迭代。max_iter. for i in range(max_iter): # 保存现有中心点。 centers_old = centers.copy() # labels assignment is also called the E-step of EM # 分类。寻找最近中心,将样本点划分多各自的类即分派标签。 labels, inertia = \ _labels_inertia(X, x_squared_norms, centers, precompute_distances=precompute_distances, distances=distances) # computation of the means is also called the M-step of EM # 计算每个类的均值点作为中心。 if sp.issparse(X): centers = _k_means._centers_sparse(X, labels, n_clusters, distances) else: centers = _k_means._centers_dense(X, labels, n_clusters, distances) if verbose: print("Iteration %2d, inertia %.3f" % (i, inertia)) # 与best_inertia比较并更新best_inertia if best_inertia is None or inertia < best_inertia: best_labels = labels.copy() best_centers = centers.copy() best_inertia = inertia # 比较新旧中心点,看其是否达到精度。达到精度break。 if squared_norm(centers_old - centers) <= tol: if verbose: print("Converged at iteration %d" % i) break # 返回 标签,距离和,中心点,迭代次数。 return best_labels, best_inertia, best_centers, i + 1
# Next function of interest: _labels_inertia()