Python实现数据挖掘K-均值算法(欧氏距离)

算法实现思路

首先,根据指定的形心标志获取初始形心坐标,将所有形心存储为列表
其次,编写计算所有点到每个形心的距离的方法、根据所有点到每个形心的距离获取新簇的方法、得到新簇的形心的方法
计算所有点到每个形心的距离的方法:遍历形心列表,在内部遍历数据集,计算数据集中的每个点到该形心的欧氏距离平方,并将结果存储为字典:形心坐标为字典键,该形心到每个点的欧氏距离平方(以点标号为键的字典)为字典值。
根据所有点到每个形心的距离获取新簇的方法:根据所有点到每个形心的距离,将每个点划分到距离其最近的形心所对应的簇中,簇中存储的是点标号(A1、A2等)
得到新簇的形心:根据获得的新簇,计算每一簇所有点坐标的平均值,将其标记为新的形心
最后,编写主函数。首先,获取初始形心坐标;然后循环依次调用以下方法:计算所有点到每个形心的距离、获取新簇、获取新的形心,直至簇不再发生变化(此时形心也不再发生变化);最终获得的簇便是聚类结果。

源代码

# -*- coding: utf-8 -*-

"""
@Time        : 2020/12/8
@Author      : lixinci
@File        : 10_2_K均值算法
@Description :
"""
import copy
import operator

# Sample points for the clustering demo: point label -> (x, y) coordinate.
dataset = dict(
    A1=(2, 10),
    A2=(2, 5),
    A3=(8, 4),
    B1=(5, 8),
    B2=(7, 5),
    B3=(6, 4),
    C1=(1, 2),
    C2=(4, 9),
)


def get_initial_centroids(dataset, centroids):
    """
    Look up the coordinates of the points chosen as initial centroids.

    :param dataset: mapping from point label to its coordinate
    :param centroids: iterable of point labels to use as starting centroids
    :return: list holding ``dataset[label]`` for every label, in the given order
    """
    return [dataset[label] for label in centroids]


def get_all_distance(dataset, centroids):
    """
    Compute the squared Euclidean distance from every point to every centroid.

    :param dataset: mapping from point label to an (x, y) coordinate
    :param centroids: iterable of (x, y) centroid coordinates; each one is used
        as a dictionary key in the result
    :return: ``{centroid: {label: squared_distance}}``; a point whose coordinate
        equals the centroid gets the integer ``0``, every other value is
        rounded to 3 decimal places
    """
    distance_table = {}
    for center in centroids:
        # Squared distances from this centroid, keyed by point label.
        per_point = {}
        for label, point in dataset.items():
            if center == point:
                per_point[label] = 0
                continue
            dx = center[0] - point[0]
            dy = center[1] - point[1]
            # No square root is taken: only the squared distance is stored.
            per_point[label] = round(dx ** 2 + dy ** 2, 3)
        distance_table[center] = per_point
    return distance_table


def get_cluster(dataset, all_distance):
    """
    Assign every point to the cluster of its nearest centroid.

    :param dataset: mapping from point label to coordinate; only its keys
        (the labels) are used here
    :param all_distance: ``{centroid: {label: distance}}`` as produced by
        ``get_all_distance``; cluster ``i`` corresponds to its ``i``-th entry
    :return: list of clusters, each a list of point labels; when a point is
        equally close to several centroids it goes to the earliest one
    """
    per_centroid = list(all_distance.values())
    # One (initially empty) cluster per centroid, in centroid order.
    clusters = [[] for _ in per_centroid]
    for label in dataset:
        # Distances from this point to each centroid, in centroid order.
        distances = [table[label] for table in per_centroid]
        # list.index returns the first position holding the minimum value.
        nearest = distances.index(min(distances))
        clusters[nearest].append(label)
    return clusters


def get_centroids(dataset, cluster, previous_centroids=None):
    """
    Compute the centroid (per-axis mean of the coordinates) of every cluster.

    An empty cluster has no mean. Instead of failing with a bare
    ``ZeroDivisionError``, such a cluster keeps the centroid found at the same
    position in ``previous_centroids`` when that argument is supplied;
    otherwise a ``ValueError`` naming the empty cluster is raised.

    :param dataset: mapping from point label to an (x, y) coordinate
    :param cluster: list of clusters, each a list of point labels
    :param previous_centroids: optional sequence of centroids, one per cluster,
        used as the fallback for empty clusters
    :return: list of (x, y) centroids, one per cluster, each component rounded
        to 3 decimal places (fallback centroids are returned unchanged)
    :raises ValueError: if a cluster is empty and no fallback was supplied
    """
    centroids = []
    for index, members in enumerate(cluster):
        if not members:
            if previous_centroids is None:
                raise ValueError(
                    "cluster {} is empty, so its centroid is undefined; "
                    "pass previous_centroids to keep the old one".format(index + 1))
            centroids.append(previous_centroids[index])
            continue
        # Accumulate sequentially from 0.0 so results are always floats.
        sum_x, sum_y = 0.0, 0.0
        for label in members:
            point = dataset[label]
            sum_x += point[0]
            sum_y += point[1]
        size = len(members)
        centroids.append((round(sum_x / size, 3), round(sum_y / size, 3)))
    return centroids


def _print_clusters(clusters, centers):
    """
    Print one line per cluster: its 1-based number, its centroid and its labels.
    """
    for position, members in enumerate(clusters):
        header = "簇{}{}\t: ".format(position + 1, centers[position])
        # Every label is followed by two spaces, including the last one.
        print(header + "".join("{}  ".format(member) for member in members))


def main():
    """
    Run K-means on the module-level ``dataset`` starting from A1, B1 and C1.

    The clusters and their centroids are printed after every pass. The loop
    stops once a pass yields the same clusters as the previous pass; the
    reported pass count excludes that final confirming pass.
    """
    seed_labels = ['A1', 'B1', 'C1']
    centroids = get_initial_centroids(dataset, seed_labels)
    # Result of the previous pass, compared against to detect convergence.
    previous_cluster, previous_centroids = [], []
    rounds = 0
    while True:
        rounds += 1
        cluster = get_cluster(dataset, get_all_distance(dataset, centroids))
        centroids = get_centroids(dataset, cluster)
        print("第{}次循环后的簇为:".format(rounds))
        _print_clusters(cluster, centroids)
        print()
        if cluster == previous_cluster:
            break
        previous_cluster = copy.deepcopy(cluster)
        previous_centroids = copy.deepcopy(centroids)
    print("共计循环{}次, 最终的簇为:".format(rounds - 1))
    _print_clusters(previous_cluster, previous_centroids)


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

你可能感兴趣的:(Python学习,python,数据挖掘,机器学习,算法)