一、K-mediods 算法步骤:
1、随机选取k个样本作为中心点
2、一层遍历每一个样本,二层遍历每一个中心样本,找出离该样本最近的中心样本
3、遍历每个中心样本,取出划分到该簇的所有样本,在簇内逐一考察候选样本,找出到簇内所有样本距离之和最小的样本,代替旧中心
4、直到达到指定训练次数或者样本分类结果不再变化,结束训练
# K-medoids clustering on the watermelon ("xigua") data set.
# Row layout: [id, feature1, feature2, label]; columns 1:3 are the features
# and the last column holds the assigned cluster index (-1 = unassigned).
import math

import numpy as np


def choice_center(data, k):
    """Randomly select k distinct samples from `data` as the initial medoids.

    Returns *copies* of the chosen rows: the original code stored references,
    so a later medoid swap silently overwrote a sample in `data`.
    """
    # replace=False guarantees k distinct rows (the original could pick the
    # same sample twice).
    centers = [list(data[i]) for i in np.random.choice(len(data), k, replace=False)]
    print("随机选取的中心点(第一次):\n", centers)
    return centers


def distance(a, b):
    """Euclidean distance between two equal-length numeric sequences."""
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))


def k_center(data_list, center):
    """Run K-medoids in place: label each row of `data_list` in its last column.

    `center` is a list of medoid rows with the same [id, f1, f2, label]
    layout as the data. Alternates assignment and medoid-update steps until
    no sample changes cluster, then prints the final medoids and labels.
    """
    changed = True
    while changed:
        changed = False
        # Assignment step: label every sample with its nearest medoid.
        for row in data_list:
            min_index = -2
            min_dis = math.inf
            for j, cen in enumerate(center):
                dis = distance(row[1:3], cen[1:3])
                if dis < min_dis:
                    min_dis = dis
                    min_index = j
            if row[-1] != min_index:
                changed = True  # a label moved, so run another round
                row[-1] = min_index
        print("分类结果111:", data_list)
        # Update step: the new medoid of each cluster is the member whose
        # summed distance to all cluster members is smallest.
        for k in range(len(center)):
            cluster = [row for row in data_list if row[-1] == k]
            if not cluster:
                continue  # empty cluster: keep the old medoid
            best_cost = sum(distance(row[1:3], center[k][1:3]) for row in cluster)
            for cand in cluster:
                cost = sum(distance(cand[1:3], other[1:3]) for other in cluster)
                if cost < best_cost:
                    best_cost = cost
                    # Rebind a *copy*; the original `center[k][:] = ...`
                    # wrote through an alias and corrupted a data row.
                    center[k] = list(cand)
    print("选中的最终中心点", center)
    # Final assignment with the converged medoids.
    for row in data_list:
        min_index = -2
        min_dis = math.inf
        for j, cen in enumerate(center):
            dis = distance(row[1:3], cen[1:3])
            if dis < min_dis:
                min_dis = dis
                min_index = j
        row[-1] = min_index
    print("分类结果222:", data_list)


def _load_data(path="xigua.txt"):
    """Read comma-separated rows; keep columns 0..2 and append label -1."""
    rows = []
    with open(path) as file:
        for line in file:
            row = [float(v) for v in line.strip().split(",")[0:3]]
            row.append(-1)
            rows.append(row)
    return rows


if __name__ == "__main__":
    # Guarded so importing this module no longer requires xigua.txt on disk.
    data_list = _load_data()
    centers = choice_center(data_list, 3)
    k_center(data_list, centers)
二、k-means算法:
1、随机选取k个样本作为中心向量
2、遍历每一个样本,以及每一个中心向量,对每一个样本进行类别的更新
3、取出同一类别的所有样本,求每一列的平均值,得到新的中心向量
4、直到达到指定训练次数,或者中心向量不再改变
import numpy as np


def kmeans(x, k, maxIt):
    """Cluster the rows of `x` into `k` groups with Lloyd's algorithm.

    Parameters
    ----------
    x : (numPoints, numDim) array of samples.
    k : number of clusters.
    maxIt : maximum number of update iterations.

    Returns the data with one extra trailing column holding each row's
    cluster label (1..k).
    """
    numPoints, numDim = x.shape
    dataset = np.zeros((numPoints, numDim + 1))  # extra column stores the label
    dataset[:, :-1] = x
    # replace=False picks k *distinct* rows; the original randint sampled with
    # replacement, so duplicate initial centroids (and NaN means from the
    # resulting empty clusters) were possible.
    centroids = dataset[np.random.choice(numPoints, size=k, replace=False)]
    centroids[:, -1] = range(1, k + 1)
    iteration = 0
    oldCentroids = None
    while not shouldStop(oldCentroids, centroids, iteration, maxIt):
        oldCentroids = np.copy(centroids)
        iteration += 1
        updataLable(dataset, centroids)  # reassign every sample
        centroids = getCentriods(dataset, k, centroids)  # recompute centers
    return dataset


def shouldStop(oldCentroids, centroids, iteration, maxIt):
    """Stop when maxIt iterations have run or the centroids stopped moving."""
    if iteration >= maxIt:  # was `>`, which allowed maxIt + 1 iterations
        return True
    # array_equal(None, arr) is False, so the first iteration never stops here.
    return np.array_equal(oldCentroids, centroids)


def updataLable(dataset, centroids):
    """Relabel every row of `dataset` (in place) with its nearest centroid."""
    numPoints, numDim = dataset.shape
    for i in range(numPoints):
        dataset[i, -1] = getLableFromClosestCentriod(dataset[i, :-1], centroids)


def getLableFromClosestCentriod(dataSetRow, centroids):
    """Return the label of the centroid closest (Euclidean norm) to the sample."""
    lable = centroids[0, -1]
    minDist = np.linalg.norm(dataSetRow - centroids[0, :-1])
    for i in range(1, centroids.shape[0]):
        dist = np.linalg.norm(dataSetRow - centroids[i, :-1])
        if dist < minDist:
            minDist = dist
            lable = centroids[i, -1]
    return lable


def getCentriods(dataset, k, centroids=None):
    """Return the k new centroids as the per-cluster column means.

    `centroids` (optional; default keeps the old call signature working)
    supplies the previous centroids so that an empty cluster keeps its old
    center instead of producing NaNs from np.mean over zero rows.
    """
    result = np.zeros((k, dataset.shape[1]))
    for i in range(1, k + 1):
        oneCluster = dataset[dataset[:, -1] == i, :-1]  # rows labelled i
        if len(oneCluster) == 0 and centroids is not None:
            result[i - 1, :-1] = centroids[i - 1, :-1]  # keep old center
        else:
            result[i - 1, :-1] = np.mean(oneCluster, axis=0)
        result[i - 1, -1] = i
    return result


if __name__ == "__main__":
    # Guarded so importing this module no longer runs the demo.
    x1 = np.array([1, 2])
    x2 = np.array([2, 1])
    x3 = np.array([4, 3])
    x4 = np.array([5, 4])
    x = np.vstack((x1, x2, x3, x4))
    result = kmeans(x, 2, 10)
    print("result", result)