Python K-means 聚类算法实现

Python K-means 聚类算法实现_第1张图片 

 以上是K-means算法的大概流程图

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns

import random
import sys

from sklearn import metrics


# 初始化画板
def initPlot():
    sns.set(style="darkgrid", palette="muted",
            color_codes=True, font='SimHei')  # set( )设置主题,调色板更常用
    # mpl.rcParams['font.family'] = 'SimHei'
    plt.rcParams['axes.unicode_minus'] = False  # 解决坐标轴负数的负号显示问题


# 初始化聚类数据
def initData():
    df = pd.read_excel(r'./data/sort.xlsx', 'Sheet1')
    data = {}
    i = 0
    for index, row in df.iterrows():
        res = []
        for j in row:
            res.append(j)
        data[i] = res
        i += 1
    return data


# 选择新的聚类中心
def selectCenter(data=None, k=2, cluster=None, center=None):
    if data is None or cluster is None:
        return
    if len(center) <= 0:
        i = 0
        while i < k:
            rand = random.randint(0, len(data) - 1)
            if rand not in center:
                center.append(data[rand])
                subCluster = [data[rand]]
                cluster.append(subCluster)
                i += 1
    else:
        cluster.clear()
        for c in center:
            src = []
            src.append(c)
            cluster.append(src)


# 计算各个数据对象至各个数据集的聚类中心的最短距离
def calDistance(data=None, cluster=None, center=None):
    if data is None or cluster is None or center is None:
        return
    for obj in data:
        if obj not in center:
            src = data[obj]
            index = 0
            c = -1  # 聚类中心下标
            minED = sys.maxsize
            for cc in center:
                ed = calEuclideanDistance(data1=src, data2=cc)
                if ed < minED:
                    minED = ed
                    c = index
                index += 1
            cluster[c].append(src)


# 计算欧氏距离
def calEuclideanDistance(data1=None, data2=None):
    if data1 is None or data2 is None or len(data1) != len(data2):
        return -1
    res = 0
    for i in range(len(data1)):
        res += (data1[i] - data2[i]) ** 2
    return res ** .5


# 计算每个簇的数据对象的特征均值
def calClusterAvgDistance(cluster=None, k=2, center=None, dim=2):
    if cluster is None or center is None:
        return
    else:
        center.clear()
    for c in cluster:
        newCenter = [0] * dim
        for obj in c:
            index = 0
            for attr in obj:
                newCenter[index] += attr
                index += 1
        index = 0
        lenC = len(c)
        for newAttr in newCenter:
            newCenter[index] = newAttr / lenC
            index += 1
        center.append(newCenter)


# Sum of squared errors per cluster: for each cluster, the sum over its
# members of the squared Euclidean distance to that cluster's center.
def calSSE(center=None, cluster=None):
    """Return a list with one SSE value per cluster (center[i] pairs with
    cluster[i]), or None if either argument is missing."""
    if center is None or cluster is None:
        return
    SSE = []
    for c, group in zip(center, cluster):
        res = 0
        for point in group:
            # BUG FIX: the old code nested `for attrc in c: for attr in point:`
            # which summed (attr - attrc)**2 over ALL dimension pairs, not just
            # matching dimensions — inflating SSE with cross terms whenever
            # dim > 1. zip pairs each attribute with its own center component.
            for attr, attrc in zip(point, c):
                res += (attr - attrc) ** 2
        SSE.append(res)
    return SSE


# ---- Driver: run K-means, plot the clusters, report quality metrics ----
k = 2
data = initData()
center = []
cluster = []
iterators = 15  # fixed iteration budget; assignments stabilise well before this
for _ in range(iterators):
    selectCenter(data=data, k=k, cluster=cluster, center=center)
    calDistance(data=data, cluster=cluster, center=center)
    calClusterAvgDistance(cluster=cluster, k=k, center=center, dim=len(data[0]))

initPlot()
SSE = calSSE(center=center, cluster=cluster)
print(SSE)

# Scatter-plot each cluster (first two attributes) in its own colour.
for group in cluster:
    xs = [point[0] for point in group]
    ys = [point[1] for point in group]
    plt.scatter(xs, ys, s=75, alpha=.5)

plt.show()

# Calinski-Harabasz index: flatten the clusters into samples + labels.
y = []
src = []
for label, group in enumerate(cluster):
    for point in group:
        y.append(label)
        src.append(point)
print(metrics.calinski_harabasz_score(src, y))

可以发现，当循环迭代到一定次数后，无论再如何增加迭代次数，聚类结果都不会再发生改变，此时聚类过程才算结束。

以下是k=2时的结果图,CH指标为123.60521036512122

Python K-means 聚类算法实现_第2张图片

以下是k=3时的结果图，CH指标为97.93651104016449。

Python K-means 聚类算法实现_第3张图片

 

你可能感兴趣的:(AI算法,聚类,python,算法)