本文介绍 K-means 聚类的两种实现方法:1、使用机器学习库 scikit-learn(sklearn)中的 KMeans 类进行聚类;2、用 Python 自行实现 K-means 算法。
from sklearn.cluster import KMeans
# X is a 2-D list of sample data: one row per sample, one column per attribute.
X = [
    [0.0888, 0.5885],
    [0.1399, 0.8291],
    [0.0747, 0.4974],
    [0.0983, 0.5772],
    [0.1276, 0.5703],
    [0.1671, 0.5835],
    [0.1906, 0.5276],
    [0.1061, 0.5523],
    [0.2446, 0.4007],
    [0.1670, 0.4770],
    [0.2485, 0.4313],
    [0.1227, 0.4909],
    [0.1240, 0.5668],
    [0.1461, 0.5113],
    [0.2315, 0.3788],
    [0.0494, 0.5590],
    [0.1107, 0.4799],
    [0.2521, 0.5735],
    [0.1007, 0.6318],
    [0.1067, 0.4326],
    [0.1956, 0.4280],
]
# K-means clustering: n_clusters is the number of clusters and clf is the
# estimator instance. y_pred holds the label (0, 1 or 2) that the fitted
# model assigned to each sample.
clf = KMeans(n_clusters=3)
y_pred = clf.fit(X).labels_
print(clf)
print(y_pred)
其中,打印 clf 得到的默认参数为:KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
random_state=None, tol=0.0001, verbose=0)。我们可以根据需要自行设置这些参数,具体请参照官方文档(注:n_jobs 与 precompute_distances 在较新版本的 scikit-learn 中已被移除)。
这里不使用大家一般都用的欧氏距离作为聚类相似度,而是使用两个向量之间的夹角度数作为聚类相似度进行度量。
import numpy as np
class kmeans():
    """K-means clustering that measures similarity by the angle between vectors.

    Each sample is assigned to the centroid with which it forms the smallest
    angle (in degrees); each centroid is then replaced by the arithmetic mean
    of the samples assigned to it.
    """

    def __init__(self, k, centroids):
        """Store the cluster count and the initial centroids.

        :param k (int): number of clusters.
        :param centroids (list): initial centroids, one vector per cluster,
            e.g. [[0, 0, 1], [0, 1, 0], [1, 0, 0]].
        """
        self.num_clusters = k
        # Copy as floats so the caller's list is never mutated and integer
        # seed values cannot truncate the fractional means written later.
        self.centroids = [[float(v) for v in c] for c in centroids]

    def get_results(self, data, max_iter=None):
        """Run the clustering until the assignments stop changing.

        :param data: 2-D sequence of samples (m samples, n features each).
        :param max_iter (int | None): optional upper bound on the number of
            assign/update rounds; ``None`` (the default) means no bound.
        :return: tuple ``(centroids, co_clusters)`` where ``centroids`` is the
            list of final centroids and ``co_clusters[j]`` is the list of
            samples assigned to cluster ``j``.
        """
        m, n = np.shape(data)  # m: number of samples, n: feature dimension
        # -1 marks "not assigned yet", so the first pass always counts as a
        # change, even for samples that end up in cluster 0.
        labels = [-1] * m
        # One independent list per cluster ([[]] * k would alias a single list).
        co_clusters = [[] for _ in range(self.num_clusters)]
        rounds = 0
        changed = True
        while changed and (max_iter is None or rounds < max_iter):
            rounds += 1
            changed = False

            # Assignment step: pick the centroid with the smallest angle.
            for i in range(m):
                best_dist = float("inf")
                best_index = 0
                for j in range(self.num_clusters):
                    dist = self.angel_distance(data[i], self.centroids[j])
                    if dist < best_dist:
                        best_dist = dist
                        best_index = j
                if labels[i] != best_index:
                    changed = True
                    labels[i] = best_index

            # Update step: rebuild the member lists and recompute the means.
            co_clusters = [[] for _ in range(self.num_clusters)]
            for j in range(self.num_clusters):
                members = [data[i] for i in range(m) if labels[i] == j]
                co_clusters[j] = members
                if not members:
                    # Empty cluster: keep the previous centroid rather than
                    # dividing by zero and turning it into nan.
                    continue
                mean = np.mean(np.asarray(members, dtype=float), axis=0)
                self.centroids[j] = [float(v) for v in mean]
        return self.centroids, co_clusters

    def angel_distance(self, vecA, vecB):
        """Return the angle between vecA and vecB in degrees.

        :param vecA (list): vector of the sample point.
        :param vecB (list): the vector to compare against.
        :return: angle in degrees (not radians); ``nan`` when either vector
            has zero length, because the angle is undefined in that case.
        """
        a = np.asarray(vecA, dtype=float).ravel()
        b = np.asarray(vecB, dtype=float).ravel()
        norm_product = np.linalg.norm(a) * np.linalg.norm(b)
        if norm_product == 0:
            return float("nan")
        # Rounding can push the cosine slightly outside [-1, 1], which would
        # make arccos return nan; clip it back into the valid domain.
        cos_angle = np.clip(np.dot(a, b) / norm_product, -1.0, 1.0)
        return float(np.degrees(np.arccos(cos_angle)))
import pandas as pd
import numpy as np
import random
import operator
import math
class F_C_Clusters():
    """Fuzzy C-means clustering of a data set with three attributes per sample."""

    def __init__(self, pop, k=10, max_iter=100, m=2.00):
        """Load the data and store the clustering parameters.

        :param pop: 2-D data with three values per sample; loaded into a
            DataFrame with columns 'one', 'two', 'three'.
        :param k (int): number of clusters (default 10).
        :param max_iter (int): number of update iterations to run (default 100).
        :param m (float): fuzzy parameter, must be greater than 1 (default 2.0).
        :raises ValueError: if ``k`` is smaller than 1 or ``m`` is not above 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        if m <= 1:
            # The membership update uses the exponent 2 / (m - 1).
            raise ValueError("fuzzy parameter m must be greater than 1")
        self.df = pd.DataFrame(pop, columns=['one', 'two', 'three'])
        # Number of Attributes
        self.num_attr = len(self.df.columns) - 1
        # Number of Clusters
        self.k = k
        # Maximum number of iterations
        self.MAX_ITER = max_iter
        # Number of data points (samples)
        self.n = len(self.df)
        # Fuzzy parameter
        self.m = m
        # Centers computed by the most recent fuzzyCMeansClustering() round;
        # None until that method has run at least one iteration.
        self.cluster_centers = None

    def initializeMembershipMatrix(self):
        """Return a random n x k membership matrix whose rows each sum to 1."""
        membership_mat = []
        for _ in range(self.n):
            weights = [random.random() for _ in range(self.k)]
            total = sum(weights)
            membership_mat.append([w / total for w in weights])
        return membership_mat

    def calculateClusterCenter(self, membership_mat):
        """Compute the cluster centers from the current memberships.

        Each center is the mean of all samples weighted by their membership
        in that cluster raised to the power ``m``.

        :param membership_mat: n x k membership matrix.
        :return: list of k centers, each a list of attribute values.
        """
        points = self.df.to_numpy(dtype=float)                       # (n, d)
        weights = np.asarray(membership_mat, dtype=float) ** self.m  # (n, k)
        totals = weights.sum(axis=0)                                 # (k,)
        centers = (weights.T @ points) / totals[:, None]             # (k, d)
        return centers.tolist()

    def updateMembershipValue(self, membership_mat, cluster_centers):
        """Update the membership matrix in place from the cluster centers.

        :param membership_mat: n x k membership matrix; overwritten in place.
        :param cluster_centers: list of k cluster centers.
        :return: the same ``membership_mat`` object, holding the new values.
        """
        p = 2.0 / (self.m - 1)
        points = self.df.to_numpy(dtype=float)
        centers = np.asarray(cluster_centers, dtype=float)
        for i in range(self.n):
            distances = np.linalg.norm(centers - points[i], axis=1)  # (k,)
            at_center = distances == 0
            if at_center.any():
                # The sample coincides with one or more centers; the ratio
                # formula would be 0/0, so give those centers full membership
                # (shared equally) and every other center none.
                share = 1.0 / int(at_center.sum())
                row = [share if hit else 0.0 for hit in at_center]
            else:
                # ratios[j, c] = distances[j] / distances[c]
                with np.errstate(over="ignore"):
                    ratios = distances[:, None] / distances[None, :]
                    row = (1.0 / (ratios ** p).sum(axis=1)).tolist()
            for j in range(self.k):
                membership_mat[i][j] = row[j]
        return membership_mat

    def fuzzyCMeansClustering(self):
        """Run fuzzy C-means for MAX_ITER iterations.

        Starts from a random membership matrix, then alternates between
        recomputing the centers and updating the memberships. The centers
        computed in the last iteration are kept on ``self.cluster_centers``.

        :return: the final n x k membership matrix.
        """
        # Membership Matrix
        membership_mat = self.initializeMembershipMatrix()
        # Exactly MAX_ITER rounds (a `<=` counter loop would run one extra).
        for _ in range(self.MAX_ITER):
            self.cluster_centers = self.calculateClusterCenter(membership_mat)
            membership_mat = self.updateMembershipValue(membership_mat, self.cluster_centers)
        return membership_mat