K-means 聚类的 Python 实现

import pandas as pd
import numpy as np
import xlrd
def manhattan_kmeans(data, init_rows=(1, 2, 10), tol=0.00001, max_iter=1000):
    """Cluster the rows of *data* with a K-means style iteration.

    The procedure mirrors the original script: every row is assigned to the
    centre with the smallest Manhattan (L1) distance, each centre is then
    moved to the arithmetic mean of its members, and the loop stops once no
    cluster's sum of squared deviations drops by more than *tol* when its
    centre is updated.

    Args:
        data: 2-D array-like of numeric values, one observation per row.
        init_rows: Row indices of *data* used as the initial centres. The
            number of clusters equals ``len(init_rows)``.
        tol: Largest per-cluster decrease of the squared error that still
            counts as "converged".
        max_iter: Upper bound on the number of assign/update rounds. L1
            assignment combined with mean updates is not guaranteed to
            settle, so the loop is capped instead of running forever.

    Returns:
        A tuple ``(centres, members)``. ``centres`` is a float array with one
        row per cluster. ``members`` is a list holding, for each cluster, a
        1-D integer array of the row indices assigned to it.

    Raises:
        ValueError: If *data* is not two-dimensional or *init_rows* is empty.
        IndexError: If an entry of *init_rows* is outside the rows of *data*.
    """
    # Work in float so that updated centres are not truncated when the
    # input happens to be an integer (or object) array.
    points = np.asarray(data, dtype=float)
    if points.ndim != 2:
        raise ValueError("data must be a 2-D array of observations")
    seed_rows = list(init_rows)
    if not seed_rows:
        raise ValueError("init_rows must name at least one row")

    n_clusters = len(seed_rows)
    # Fancy indexing copies, so updating centres never touches the data.
    centres = points[seed_rows].copy()
    members = [np.empty(0, dtype=int) for _ in range(n_clusters)]

    for _ in range(max_iter):
        # Manhattan distance of every row to every centre, shape (k, rows).
        distances = np.abs(points[None, :, :] - centres[:, None, :]).sum(axis=2)
        # argmin breaks ties in favour of the lowest cluster index.
        labels = np.argmin(distances, axis=0)
        # A list of index arrays: clusters rarely have equal sizes, and a
        # ragged np.array([...]) raises ValueError on NumPy >= 1.24.
        members = [np.flatnonzero(labels == i) for i in range(n_clusters)]

        converged = True
        for i, rows in enumerate(members):
            if rows.size == 0:
                # The mean of an empty cluster is NaN; keep the old centre.
                continue
            cluster = points[rows]
            # Squared error against the old centre, then the updated one.
            sse_before = np.sum((cluster - centres[i]) ** 2)
            centres[i] = cluster.mean(axis=0)
            sse_after = np.sum((cluster - centres[i]) ** 2)
            if sse_before - sse_after > tol:
                converged = False
        if converged:
            break

    return centres, members


if __name__ == "__main__":
    # Load the data
    df2 = pd.read_excel("test2.xlsx")
    data = np.array(df2)
    # Drop the first two columns
    data = data[:, 2:]
    # Number of clusters
    k = 3
    # Convergence threshold for the per-cluster squared-error improvement
    E = 0.00001
    # Rows 1, 2 and 10 seed the three cluster centres
    datak, kx = manhattan_kmeans(data, init_rows=(1, 2, 10), tol=E)
    print("success!")
    # Print each cluster's size, member row indices and centre
    for i in range(k):
        print(kx[i].shape)
        print(kx[i])
        print(datak[i])




测试数据
运行结果:
kmean聚类python实现_第1张图片
!!!
时间复杂度和空间复杂度可以改进!
抛砖引玉,希望大家积极评论!

你可能感兴趣的:(数据挖掘)