# Python实现KMeans聚类算法 (KMeans clustering implemented in Python)

import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.datasets import make_blobs
# Seeds NumPy's global RNG only; the stdlib `random` module used by
# KMeans.fit to draw its initial centers is not seeded here.
np.random.seed(123)

# make_blobs generates a synthetic dataset for clustering:
#   n_samples - total number of points to generate (m = 1000 here)
#   centers   - number of blob centers, i.e. the data can be split into 4 clusters
X, y = make_blobs(n_samples=1000, centers=4)

print(f'Shape of dataset:{X.shape}')

# Show the initial (unclustered) dataset
fig = plt.figure(figsize=(8, 6))  # figure size in inches: width 8, height 6
plt.scatter(X[:, 0], X[:, 1])
plt.title("Dataset with 4 clusters")
plt.xlabel("First feature")
plt.ylabel("Second feature")
plt.show()

class KMeans():
    """K-means clustering with randomly chosen initial centers.

    Attributes set by ``fit``:
        centers: float array of shape ``(k, n_features)`` holding the
            current cluster centers.
        initial_centers: copy of the centers as drawn before the first
            update step.
    """

    def __init__(self, n_clusters):
        """Store the number of clusters ``k`` to search for."""
        self.k = n_clusters

    def fit(self, data):
        """Fit the cluster centers to the training set ``data``.

        Args:
            data: 2-D array-like of shape ``(n_samples, n_features)``.

        Raises:
            ValueError: if ``data`` is not 2-D, or if ``k`` is not between
                1 and the number of samples.

        The method returns ``None``; results are stored on ``self.centers``
        and ``self.initial_centers``.
        """
        # Work on a float array: with integer input the centers would
        # inherit the integer dtype and every mean update would be truncated.
        data = np.asarray(data, dtype=float)
        if data.ndim != 2:
            raise ValueError(
                f"data must be 2-D (n_samples, n_features), got ndim={data.ndim}"
            )
        n_samples = data.shape[0]
        if not 1 <= self.k <= n_samples:
            raise ValueError(
                f"n_clusters must be between 1 and n_samples={n_samples}, "
                f"got {self.k}"
            )

        # Initialise the centers with k distinct, randomly chosen samples.
        chosen_rows = random.sample(range(n_samples), self.k)
        self.centers = data[chosen_rows]
        self.initial_centers = np.copy(self.centers)

        # Track the sample assignments; once they stop changing the centers
        # stop moving and clustering is finished.
        old_assigns = None
        n_iters = 0

        while True:
            new_assigns = self._assign(data)

            if old_assigns is not None and np.array_equal(new_assigns, old_assigns):
                print(f"Training finished after {n_iters} iterations")
                return

            old_assigns = new_assigns
            n_iters += 1

            for id_ in range(self.k):
                members = data[new_assigns == id_]
                # An empty cluster has no mean (it would be NaN and poison
                # every later distance computation), so keep its old center.
                if len(members):
                    self.centers[id_] = members.mean(axis=0)

    def _assign(self, data):
        """Return, for each row of ``data``, the index of its nearest center."""
        diffs = data[:, np.newaxis, :] - self.centers[np.newaxis, :, :]
        dists = np.sqrt(np.sum(diffs ** 2, axis=2))
        return np.argmin(dists, axis=1)

    def l2_distance(self, datapoint):
        """Return the Euclidean distance from ``datapoint`` to every center."""
        dists = np.sqrt(np.sum((self.centers - datapoint) ** 2, axis=1))
        return dists

    def classify(self, datapoint):
        """Return the index of the cluster center closest to ``datapoint``.

        ``np.argmin`` yields the position of the smallest distance.
        """
        dists = self.l2_distance(datapoint)
        return np.argmin(dists)

    def plot_clusters(self, data, labels=None):
        """Scatter-plot ``data`` with the initial and final centers.

        Args:
            data: 2-D array-like; the first two columns are plotted.
            labels: optional per-sample values used to colour the points
                (for example ground-truth classes). When omitted, points
                are coloured by the cluster this model assigns them to,
                so the method no longer depends on a module-level ``y``.
        """
        data = np.asarray(data, dtype=float)
        if labels is None:
            labels = self._assign(data)
        plt.figure(figsize=(12, 10))
        plt.title("Initial centers in black, final centers in red")
        plt.scatter(data[:, 0], data[:, 1], marker='.', c=labels)
        plt.scatter(self.centers[:, 0], self.centers[:, 1], c='r')
        plt.scatter(self.initial_centers[:, 0], self.initial_centers[:, 1], c='k')
        plt.show()

if __name__ == '__main__':
    # Fit a four-cluster model to the generated blobs, then plot the result.
    kmeans = KMeans(4)
    kmeans.fit(X)  # run the clustering algorithm
    kmeans.plot_clusters(X)

# As the plot can show, the clustering algorithm does not reach the global
# optimum on every run; running it repeatedly can help find the global optimum.

# 你可能感兴趣的:(机器学习,机器学习,聚类算法)