FCM聚类算法(模糊C均值算法)

相关学习链接

  1. 视频(https://www.bilibili.com/video/BV18J411a7yY?p=1)
  2. 博客:(https://blog.csdn.net/in_nocence/article/details/78306297)
  3. 文档:(https://wenku.baidu.com/view/ee968c00eff9aef8941e06a2.html)
  4. 代码:(https://blog.csdn.net/zwqhehe/article/details/75174918)

上面第一个链接视频是台湾一位老师讲的课程,可谓是相当详细,可以把一个人从零讲懂,里面的原理讲的很透彻,所有推导过程都讲了,虽然视频时间较长,但是强烈推荐看完前面三节的推导过程,一定会收获颇多。看完了该视频,基本上原理就很了解了,然后看一些博客和代码就会十分轻松,基本就掌握了FCM,以上也是我本人学习FCM的过程。

学习总结:

前段时间刚刚学习过K-means聚类算法,然后最近再看FCM,其实有了k-means的基础学起来要容易理解的多。它们的共同点是都属于无监督的聚类算法,算法流程基本一致:

  1. 指明聚类数量;
  2. 初始聚类中心;
  3. 计算每个点到聚类中心的距离,划分每个点的归属类别;
  4. 重新计算聚类中心;
  5. 判断条件是否达到,否则重复3.

FCM与k-means的区别在于,在计算每个点间的距离时并不是直接使用欧式距离,而是加入了每个点间的权重系数,也就是每个点间的距离等于欧式距离与权重系数之积,其它内容完全一致。稍微复杂的难点就是如何计算出每个点间的权重系数矩阵U,在上面学习链接中的视频和博客对计算U做了详细推导。

代码:

下面附上本人对参考链接里面的代码进行修改后的代码:

1.main.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/5/20 18:23
# @Author  : ystraw
# @Site    : 
# @File    : main.py
# @Software: PyCharm Community Edition
# @function: FCM 模糊C均值聚类

import copy
import math
import random
import time
import pandas as pd


# Upper bound for the random integers used to initialise the membership
# matrix U (see initialise_U); the magnitude is irrelevant because each
# row is normalised to sum to 1.
# NOTE: the original `global MAX` / `global Epsilon` statements at module
# level were no-ops and have been removed.
MAX = 10000.0
# Convergence threshold: iteration stops once no entry of U changes by
# more than Epsilon between consecutive iterations (see end_conditon).
Epsilon = 0.00000001

def import_data_format_iris(file):
    """
    Load the iris data set: the first four columns become the feature data
    and the last column (the species name) is mapped to an integer label.

    Data source: http://archive.ics.uci.edu/ml/machine-learning-databases/iris/

    :param file: path to the iris CSV file
    :return: (data, cluster_location) — data is a list of float feature
             vectors, cluster_location is a list of ints in {0, 1, 2}
    """
    data = []
    cluster_location = []
    with open(str(file), 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            current = line.split(",")
            # All columns except the last are numeric features.
            data.append([float(v) for v in current[:-1]])
            # BUG FIX: the line is already stripped, so the species name
            # carries no trailing newline; the original compared against
            # "Iris-setosa\n" / "Iris-versicolor\n", which never matched
            # and made every sample fall through to label 2.
            label = current[-1]
            if label == "Iris-setosa":
                cluster_location.append(0)
            elif label == "Iris-versicolor":
                cluster_location.append(1)
            else:
                cluster_location.append(2)
    print("加载数据完毕")
    return data, cluster_location

def randomise_data(data):
    """
    Shuffle *data* into a random order and return both the shuffled list
    and the permutation, so the order can later be undone by
    de_randomise_data().

    :param data: list of samples
    :return: (shuffled_data, order) — order[i] is the original index of
             the i-th shuffled sample
    """
    order = list(range(len(data)))
    random.shuffle(order)
    shuffled = [data[src] for src in order]
    return shuffled, order

def de_randomise_data(data, order):
    """
    Undo the shuffling performed by randomise_data().

    :param data: shuffled list of samples
    :param order: the permutation returned by randomise_data()
    :return: the samples restored to their original ordering
    """
    restored = [None] * len(data)
    for pos, original_index in enumerate(order):
        restored[original_index] = data[pos]
    return restored

def print_matrix(matrix):
    """
    Print a matrix one row per line.

    The parameter was renamed from ``list`` to avoid shadowing the
    builtin; callers pass it positionally, so this is backward-compatible.

    :param matrix: list of rows
    """
    for row in matrix:
        print(row)

def initialise_U(data, cluster_number):
    """
    Build the initial membership matrix U, of size n*k (n = number of
    samples, k = number of clusters).  Each row holds random values in
    (0, 1] summing to 1: k random integers in [1, MAX] are drawn and
    each is divided by their total.  Requires the module constant MAX.

    :param data: list of samples (only its length is used)
    :param cluster_number: number of clusters k
    :return: the n*k membership matrix as a list of lists
    """
    global MAX
    U = []
    for _ in range(len(data)):
        weights = [random.randint(1, int(MAX)) for _ in range(cluster_number)]
        total = float(sum(weights))
        U.append([w / total for w in weights])
    return U

def distance(point, center):
    """
    Euclidean distance between two points given as equal-length lists.

    Returns -1 when the dimensions differ (error-code convention kept
    for backward compatibility with the original implementation).

    :param point: first point
    :param center: second point
    :return: the Euclidean distance, or -1 on dimension mismatch
    """
    if len(point) != len(center):
        return -1
    squared = sum((p - c) ** 2 for p, c in zip(point, center))
    return math.sqrt(squared)

def end_conditon(U, U_old):
    """
    Convergence test: True when no entry of U moved by more than the
    module constant Epsilon since the previous iteration.

    :param U: current membership matrix
    :param U_old: membership matrix from the previous iteration
    :return: True if converged, otherwise False
    """
    global Epsilon
    rows = len(U)
    cols = len(U[0])
    return all(
        abs(U[i][j] - U_old[i][j]) <= Epsilon
        for i in range(rows)
        for j in range(cols)
    )

def normalise_U(U):
    """
    De-fuzzify U in place at the end of clustering: in each row, entries
    equal to the row maximum become 1 and all others become 0 (on a tie
    every maximal entry becomes 1, matching the original behaviour).

    :param U: membership matrix, modified in place
    :return: the same matrix object
    """
    for row in U:
        biggest = max(row)
        for idx, value in enumerate(row):
            row[idx] = 1 if value == biggest else 0
    return U

# m is typically chosen in the range [1.5, 2.5]
def fuzzy(data, cluster_number, m):
    """
    Run fuzzy C-means on *data* and return the de-fuzzified membership
    matrix together with the final cluster centres.

    :param data: list of feature vectors (lists of floats)
    :param cluster_number: number of clusters k
    :param m: fuzzifier exponent (> 1); larger m gives softer memberships
    :return: (U, C) — U has exactly one 1 per row after normalise_U,
             C is the list of cluster centres
    """
    # Random initial membership matrix (rows sum to 1).
    U = initialise_U(data, cluster_number)
    while True:
        # Keep a copy to test convergence against.
        U_old = copy.deepcopy(U)

        # --- update cluster centres ---
        # C[j][i] = sum_k U[k][j]^m * data[k][i] / sum_k U[k][j]^m
        C = []
        for j in range(cluster_number):
            current_cluster_center = []
            for i in range(len(data[0])):
                dummy_sum_num = 0.0
                dummy_sum_dum = 0.0
                for k in range(len(data)):
                    weight = U[k][j] ** m
                    dummy_sum_num += weight * data[k][i]
                    dummy_sum_dum += weight
                current_cluster_center.append(dummy_sum_num / dummy_sum_dum)
            C.append(current_cluster_center)

        # --- distance of every sample to every centre ---
        distance_matrix = []
        for i in range(len(data)):
            distance_matrix.append(
                [distance(data[i], C[j]) for j in range(cluster_number)])

        # --- update membership matrix ---
        exponent = 2 / (m - 1)
        for i in range(len(data)):
            # BUG FIX: when a sample coincides with a centre the original
            # code divided by zero; the standard FCM convention is to give
            # such a sample full membership in the coinciding centre(s).
            zero_hits = [j for j in range(cluster_number)
                         if distance_matrix[i][j] == 0]
            if zero_hits:
                for j in range(cluster_number):
                    U[i][j] = 1.0 / len(zero_hits) if j in zero_hits else 0.0
                continue
            for j in range(cluster_number):
                dummy = 0.0
                for k in range(cluster_number):
                    dummy += (distance_matrix[i][j] / distance_matrix[i][k]) ** exponent
                U[i][j] = 1 / dummy

        if end_conditon(U, U_old):
            print("结束聚类")
            break
    print("标准化 U")
    U = normalise_U(U)
    return U, C

def checker_iris(final_location):
    """
    Compare the clustering result with the iris ground truth: the data
    set contains 3 species of 50 consecutive samples each.  For every
    true group the dominant predicted cluster is counted as correct.

    :param final_location: de-fuzzified membership matrix (150 rows)
    :return: accuracy string, e.g. "准确度:89.33%"
    """
    right = 0.0
    for group in range(3):
        checker = [0] * 3
        for offset in range(50):
            row = final_location[group * 50 + offset]
            for col in range(len(final_location[0])):
                if row[col] == 1:
                    checker[col] += 1
        right += max(checker)
        print(right)
    answer = right / 150 * 100
    return "准确度:" + str(answer) + "%"

if __name__ == '__main__':
    # Load the data set to cluster (first row is the header, first
    # column is the index).
    dataset = pd.read_excel('./data/数据.xlsx', index_col=0, header=1)
    print(dataset.shape)

    # Shuffle the samples; keep the permutation so it can be undone.
    data, order = randomise_data(dataset.values)
    # print_matrix(data)

    start = time.time()

    # Run fuzzy C-means: returns membership matrix U and centres C.
    final_location, C = fuzzy(data, 5, 2)
    # print(final_location)
    # print(C)

    # Restore the original sample order.
    final_location = de_randomise_data(final_location, order)

    # Derive the 1-based cluster label of every sample.
    # BUG FIX: renamed the variable from ``type`` to avoid shadowing the builtin.
    labels = [nums.index(1) + 1 for nums in final_location]
    print(labels)
    dataset['类别'] = labels

    # Save the per-sample clustering result.
    dataset.to_excel('./data/result.xlsx')

    # Save the cluster centres; their columns are the feature columns
    # (everything except the '类别' column just appended).
    df = pd.DataFrame(C)
    df.columns = dataset.columns[:-1]
    print('聚类中心:\n', df)
    df.to_excel('./data/center.xlsx')

    print("用时:{0}".format(time.time() - start))

2.analysis.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/5/20 23:07
# @Author  : ystraw
# @Site    : 
# @File    : analysis.py
# @Software: PyCharm Community Edition
# @function: 绘图

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Make matplotlib render Chinese characters (SimHei font) correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.serif'] = ['SimHei']

# Radar chart of the cluster centres (customer-profile analysis).
def draw1():
    """
    Read the cluster centres from ./data/center.xlsx and draw one radar
    line per cluster, with one axis per feature column.  Saves the
    figure to ./image/ and shows it.
    """
    data = pd.read_excel('./data/center.xlsx', index_col=0)
    print(data.head())
    # Chinese labels and minus signs must render correctly.
    plt.rcParams['font.sans-serif'] = 'SimHei'
    plt.rcParams['axes.unicode_minus'] = False
    # One radar axis per feature column.
    labels = data.columns
    dataLenth = len(labels)
    # One row of values per cluster centre.
    data_radar = data.values
    # Evenly spaced angles around the circle, then a closed copy so each
    # polygon joins back to its first point.
    angles = np.linspace(0, 2*np.pi, dataLenth, endpoint=False)
    closed_angles = np.concatenate((angles, [angles[0]]))
    # Line style per cluster.
    style = ['r-', 'o--', 'g-.', 'b:', 'p-'] * 2
    for i in range(len(data_radar)):
        # Close each cluster's polygon.
        data_radar_1 = np.concatenate((data_radar[i], [data_radar[i][0]]))
        plt.polar(closed_angles, data_radar_1, style[i], linewidth=2)
    # Attach a feature label to each axis (one angle per label; the
    # closed angle array has one extra entry and is not used here).
    plt.thetagrids(angles * 180/np.pi, labels)
    # plt.ylim(0, 70)
    # BUG FIX: the legend needs one entry per cluster (row of
    # data_radar); the original iterated over range(dataLenth), i.e. the
    # number of feature columns.
    kflable = ['类别' + str(i+1) for i in range(len(data_radar))]
    plt.legend(kflable, bbox_to_anchor=(1.25, 1.15))
    plt.title(u'聚类中心分析雷达图')
    plt.savefig('./image/聚类中心分析雷达图.png')
    plt.show()

def draw2():
    """
    Read the per-sample clustering result from ./data/result.xlsx and
    draw a donut chart of the cluster size distribution.  Saves the
    figure to image/ and shows it.
    """
    data = pd.read_excel('./data/result.xlsx', index_col=0)
    print(data.head())

    # plt.figure(figsize=(8,5))
    # Cluster label of each sample.
    # BUG FIX: renamed from ``type`` to avoid shadowing the builtin.
    cluster_labels = data['类别']
    # Per-cluster counts and the matching cluster ids.
    x = cluster_labels.value_counts().values
    label = cluster_labels.value_counts().index.values
    print('结果统计:\n', cluster_labels.value_counts())

    # Colour per wedge.
    color = ['aqua', 'linen', 'lightcoral', 'olive', 'gold']
    # patches: the wedges; l_text: labels outside the pie; p_text: the
    # percentage texts inside the ring.
    patches, l_text, p_text = plt.pie(x, autopct='%3.1f%%', radius=0.5, pctdistance=0.85, colors=color, wedgeprops=dict(linewidth=2, width=0.3, edgecolor='w'))
    # Legend entry per cluster id.
    legend_text = ['类别' + str(label[i]) for i in range(len(label))]
    legend = plt.legend(legend_text, title='类别', loc='center right', bbox_to_anchor=(1.12, 0.8), fontsize=13)
    legend.get_title().set_fontsize(12)
    # Enlarge each percentage text and append its cluster name.
    for t, name in zip(p_text, legend_text):
        t.set_size(13)
        t.set_text(t.get_text() + '\n' + name)
    # plt.title('不同群占比图', fontsize=27)
    # Equal axis scales so the pie is drawn as a circle.
    plt.axis('equal')
    plt.savefig('image/类别统计饼图.png')
    plt.show()

if __name__ == '__main__':
    # Radar chart of the cluster centres.
    draw1()

    # Pie chart of the per-cluster sample counts.
    draw2()

你可能感兴趣的:(机器学习)