K-means聚类以及fuzzy c-means聚类的实现

一、K-means聚类

    在这里提及两种实现方法,1、使用机器学习库scikit-learn(sklearn)导入KMeans类进行聚类;2、自己用python语句实现K-means算法

1.sklearn快速方法:

from sklearn.cluster import KMeans
# X is a 2-D list of samples: one row per sample, one column per attribute.
X = [
    [0.0888, 0.5885],
    [0.1399, 0.8291],
    [0.0747, 0.4974],
    [0.0983, 0.5772],
    [0.1276, 0.5703],
    [0.1671, 0.5835],
    [0.1906, 0.5276],
    [0.1061, 0.5523],
    [0.2446, 0.4007],
    [0.1670, 0.4770],
    [0.2485, 0.4313],
    [0.1227, 0.4909],
    [0.1240, 0.5668],
    [0.1461, 0.5113],
    [0.2315, 0.3788],
    [0.0494, 0.5590],
    [0.1107, 0.4799],
    [0.2521, 0.5735],
    [0.1007, 0.6318],
    [0.1067, 0.4326],
    [0.1956, 0.4280],
]

# K-means clustering: n_clusters is the number of clusters, clf is the
# estimator instance, and y_pred is the list of labels (0, 1 or 2) that
# fitting assigns to the samples.
clf = KMeans(n_clusters=3)
y_pred = clf.fit_predict(X)

print(clf)
print(y_pred)

    其中:clf = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
random_state=None, tol=0.0001, verbose=0),我们可以自己设置所需要的参数,具体参照官方文档。(注:以上是旧版scikit-learn打印出的默认参数;n_jobs和precompute_distances在1.0及之后的版本中已被移除。)

2、用python语句实现K-means算法

    这里不使用一般大家都用的欧式距离作为聚类相似度,而是使用两个向量之间的夹角度数作为聚类相似度进行度量。

import numpy as np
class kmeans():
    """K-means clustering that uses the angle between vectors as the distance.

    Every sample and every centroid is treated as a vector from the origin;
    a sample belongs to the centroid with which it forms the smallest angle.
    """

    def __init__(self, k, centroids):
        '''
        k(int): number of clusters
        centroids(list): initial cluster centres, one vector per cluster,
            typically [[0,0,1],[0,1,0],[1,0,0]]
        '''
        self.num_clusters = k
        # Work on a float copy: the centres are overwritten while fitting, and
        # the caller's list (or an integer array) must not be modified or
        # truncated as a side effect.
        self.centroids = [[float(v) for v in center] for center in centroids]

    def get_results(self, data):
        '''
        Run the clustering until no sample changes cluster.

        data: 2-D sequence/array with one sample per row.
        Returns: (centroids, co_clusters) where centroids is the list of
            final cluster centres and co_clusters[j] is the list of the rows
            of data assigned to cluster j.
        '''
        points = np.asarray(data, dtype=float)
        m, _ = points.shape  # m: number of samples (raises if data is not 2-D)
        # -1 means "not assigned yet", so the first pass always registers a
        # change, even for samples whose nearest centroid is cluster 0.
        labels = [-1] * m
        co_clusters = [[] for _ in range(self.num_clusters)]
        change = True  # whether the centroids must be recomputed
        while change:
            change = False
            for i in range(m):
                minDist = 180.0  # smallest angle so far; 180 degrees is the maximum
                minIndex = 0
                for j in range(self.num_clusters):
                    dist = self.angel_distance(points[i], self.centroids[j])
                    if dist < minDist:
                        minDist = dist
                        minIndex = j
                if labels[i] != minIndex:
                    change = True
                    labels[i] = minIndex
            # One independent list per cluster; [[]] * k would alias a single
            # list and put every sample into every cluster.
            co_clusters = [[] for _ in range(self.num_clusters)]
            for i, label in enumerate(labels):
                co_clusters[label].append(data[i])
            for j in range(self.num_clusters):
                members = [i for i, label in enumerate(labels) if label == j]
                if not members:
                    # Empty cluster: keep the previous centre instead of
                    # dividing by zero and turning it into NaN.
                    continue
                self.centroids[j] = points[members].mean(axis=0).tolist()
        return self.centroids, co_clusters

    def angel_distance(self, vecA, vecB):
        '''Compute the angle between vecA and vecB.
        :param vecA(list): vector of the sample point
        :param vecB(list): the other vector (e.g. a centroid)
        :return: the angle in degrees (not radians); NaN when either vector
            has zero length, because the angle is undefined then
        '''
        a = np.asarray(vecA, dtype=float)
        b = np.asarray(vecB, dtype=float)
        norm_product = np.sqrt(np.dot(a, a)) * np.sqrt(np.dot(b, b))
        if norm_product == 0:
            return float("nan")
        # Rounding can push the cosine slightly outside [-1, 1], which would
        # make arccos return NaN for (anti-)parallel vectors.
        cos_angle = np.clip(np.dot(a, b) / norm_product, -1.0, 1.0)
        return np.degrees(np.arccos(cos_angle))

二、Fuzzy C-means聚类(FCM)

import pandas as pd
import numpy as np
import random
import operator
import math

class F_C_Clusters():
    """Fuzzy C-means (FCM) clustering of samples with three attributes.

    Each sample receives a degree of membership in every cluster instead of a
    single hard label; the memberships of one sample sum to 1.
    """

    def __init__(self, pop, k=10, max_iter=100, m=2.00):
        """Store the data set and the algorithm settings.

        pop: 2-D sequence with one sample per row and exactly three columns.
        k: number of clusters (default 10).
        max_iter: number of update iterations to run (default 100).
        m: fuzzy parameter, must be greater than 1 (default 2.00).
        Raises ValueError when k < 1 or m <= 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        if m <= 1:
            # The update exponent is 2 / (m - 1), undefined for m == 1.
            raise ValueError("m must be greater than 1")
        self.df = pd.DataFrame(pop, columns=['one', 'two', 'three'])
        # Number of Attributes (kept as originally defined; not used by the
        # methods of this class)
        self.num_attr = len(self.df.columns) - 1
        # Number of Clusters
        self.k = k
        # Maximum number of iterations
        self.MAX_ITER = max_iter
        # Number of data points (samples)
        self.n = len(self.df)
        # Fuzzy parameter
        self.m = m

    def initializeMembershipMatrix(self):
        """Return a random n x k membership matrix whose rows each sum to 1."""
        membership_mat = []
        for _ in range(self.n):
            weights = [random.random() for _ in range(self.k)]
            total = sum(weights)
            membership_mat.append([w / total for w in weights])
        return membership_mat

    def calculateClusterCenter(self, membership_mat):
        """Compute the cluster centres for the given membership matrix.

        Centre j is the mean of all samples weighted by membership ** m.
        Returns a list of k centres, each a list of attribute values.
        """
        # Convert the data once instead of calling df.iloc for every
        # (cluster, sample) pair.
        points = self.df.to_numpy(dtype=float)
        weights = np.asarray(membership_mat, dtype=float)[:self.n, :self.k] ** self.m
        totals = weights.sum(axis=0)
        cluster_centers = []
        for j in range(self.k):
            if totals[j] == 0:
                # No sample has any membership in this cluster; fall back to
                # the plain data mean rather than dividing by zero.
                center = points.mean(axis=0)
            else:
                center = np.dot(weights[:, j], points) / totals[j]
            cluster_centers.append(center.tolist())
        return cluster_centers

    def updateMembershipValue(self, membership_mat, cluster_centers):
        """Recompute the memberships from the current cluster centres.

        membership_mat is updated in place and also returned.
        """
        p = float(2 / (self.m - 1))
        points = self.df.to_numpy(dtype=float)
        centers = np.asarray(cluster_centers, dtype=float)[:self.k]
        for i in range(self.n):
            distances = np.linalg.norm(centers - points[i], axis=1)
            coincident = [j for j in range(self.k) if distances[j] == 0]
            if coincident:
                # The sample lies exactly on one or more centres: the ratio
                # formula would give 0/0, so share full membership among
                # those centres.
                share = 1.0 / len(coincident)
                for j in range(self.k):
                    membership_mat[i][j] = share if j in coincident else 0.0
                continue
            for j in range(self.k):
                den = sum(
                    math.pow(float(distances[j] / distances[c]), p)
                    for c in range(self.k)
                )
                membership_mat[i][j] = float(1 / den)
        return membership_mat

    def fuzzyCMeansClustering(self):
        """Run FCM for MAX_ITER iterations and return the membership matrix."""
        membership_mat = self.initializeMembershipMatrix()
        # Exactly MAX_ITER rounds (the former `<=` loop ran one extra).
        for _ in range(self.MAX_ITER):
            cluster_centers = self.calculateClusterCenter(membership_mat)
            membership_mat = self.updateMembershipValue(membership_mat, cluster_centers)
        return membership_mat

你可能感兴趣的:(机器学习,聚类,机器学习)