用 Python 实现 K-means 聚类

代码:

import numpy as np
import random
from math import sqrt


def dist(arr1, arr2):
    """
    Return the Euclidean distance between two points.

    :param arr1: coordinates of the first point (array-like or np.matrix row)
    :param arr2: coordinates of the second point, broadcastable against arr1
    :return: the distance as a Python float
    """
    # Work on plain float ndarrays: on np.matrix operands `*` / `**` mean
    # matrix product / matrix power, and unsigned integer dtypes would wrap
    # around on subtraction.
    diff = np.asarray(arr1, dtype=float) - np.asarray(arr2, dtype=float)
    # Square each component first and only then sum; squaring the sum of the
    # differences lets positive and negative components cancel each other.
    return sqrt(float(np.sum(diff * diff)))

def random_center(dataset, k):
    """
    Draw k random initial cluster centers inside the bounding box of the data.

    Each row of ``dataset`` is one sample. Every coordinate of every center is
    sampled uniformly between the minimum and the maximum of the matching
    column of ``dataset``.

    :param dataset: 2-D array-like or matrix, one sample per row
    :param k: number of cluster centers to generate
    :return: np.matrix of shape (k, number of columns) holding the centers
    """
    data = np.asarray(dataset)
    lower = data.min(axis=0)
    span = data.max(axis=0) - lower
    # Draw with shape (columns, k) and transpose, so the random numbers are
    # consumed one data column at a time (k values per column).
    fractions = np.random.random((data.shape[1], k)).T
    # np.asmatrix is used because the np.mat alias was removed in NumPy 2.0.
    return np.asmatrix(lower + span * fractions)


def kmeans(dataset, k):
    """
    Cluster the rows of ``dataset`` into k groups with the k-means iteration.

    Starting from random centers produced by ``random_center``, every sample
    is assigned to its nearest center as measured by ``dist``, and every
    center is then moved to the mean of the samples assigned to it. This
    repeats until a full pass leaves every assignment unchanged.

    :param dataset: 2-D array or matrix, one sample per row
    :param k: number of clusters
    :return: tuple ``(centers, clusterAssment)``. ``centers`` is the matrix of
        cluster centers, one row per cluster. ``clusterAssment`` is a
        (number of samples, 2) np.matrix: column 0 is the index of the cluster
        the sample belongs to, column 1 is the squared distance to that
        cluster's center at the time the sample was assigned.
    """
    centers = random_center(dataset, k)       # initial cluster centers
    num_data = dataset.shape[0]               # number of samples

    # np.asmatrix is used because the np.mat alias was removed in NumPy 2.0.
    clusterAssment = np.asmatrix(np.zeros((num_data, 2)))
    # Start every sample at the invalid label -1 so that a first-pass
    # assignment to cluster 0 is also detected as a change.
    clusterAssment[:, 0] = -1

    cluster_changed = True    # loop flag: stop once no assignment changes
    while cluster_changed:
        cluster_changed = False
        for i in range(num_data):
            min_dist = np.inf    # smallest distance seen so far
            min_index = -1       # cluster index of that smallest distance
            for j in range(k):
                dist_j = dist(dataset[i, :], centers[j, :])

                if dist_j < min_dist:
                    min_dist = dist_j
                    min_index = j
            if clusterAssment[i, 0] != min_index:
                cluster_changed = True
            clusterAssment[i, :] = min_index, min_dist ** 2

        # Move each center to the mean of the samples assigned to it.
        for cent in range(k):
            members = np.nonzero(clusterAssment[:, 0].A == cent)[0]   # .A: matrix -> ndarray
            if members.size == 0:
                # An empty cluster has no mean (np.mean would give NaN);
                # keep its previous center instead.
                continue
            centers[cent, :] = np.mean(dataset[members], axis=0)
    return centers, clusterAssment


# Demo run: 20 random samples with 5 integer features each (values 1..19),
# grouped into 3 clusters.
dataset = np.random.randint(low=1, high=20, size=(20, 5))

centers, clusterAssment = kmeans(dataset, k=3)

 

你可能感兴趣的:(算法,python,聚类,算法,python)