import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm
import random
def get_center(df, method='random', cn=2):
    """Pick ``cn`` initial cluster centers from the rows of ``df``.

    Args:
        df: pandas DataFrame with one sample per row.
        method: ``'random'`` draws ``cn`` rows with ``df.sample`` (without
            replacement, the pandas default). ``'km++'`` picks the first
            center uniformly at random and then repeatedly adds the
            remaining row whose squared distance to its nearest
            already-chosen center is largest (a deterministic
            farthest-point variant of k-means++ seeding).
        cn: number of centers; must satisfy ``1 <= cn <= len(df)``.

    Returns:
        np.ndarray with ``cn`` rows, one per chosen center.

    Raises:
        ValueError: if ``cn`` is out of range or ``method`` is unknown.
    """
    count = len(df)
    # Explicit raise instead of assert so the check survives ``python -O``.
    if not count >= cn >= 1:
        raise ValueError(
            f"cn must be between 1 and the number of rows ({count}), got {cn}"
        )

    if method == 'random':
        return df.sample(cn).values

    if method == 'km++':
        points = df.values
        first = random.randint(0, count - 1)
        chosen = [first]
        # Squared distance from every row to its nearest chosen center.
        nearest_sq = ((points - points[first]) ** 2).sum(axis=1).astype(float)
        # Rows already used as centers are masked so argmax never re-picks them.
        nearest_sq[first] = -np.inf
        for _ in range(cn - 1):
            farthest = int(np.argmax(nearest_sq))
            chosen.append(farthest)
            to_new = ((points - points[farthest]) ** 2).sum(axis=1).astype(float)
            nearest_sq = np.minimum(nearest_sq, to_new)
            nearest_sq[farthest] = -np.inf
        return points[chosen]

    # Previously an unknown method fell through to an unbound local variable.
    raise ValueError(f"unknown method {method!r}; expected 'random' or 'km++'")
# -- Smoke test: seed three centers on random 2-D points and plot both.
a = pd.DataFrame(np.random.randn(20, 2))
c = get_center(a, method='km++', cn=3)
plt.scatter(a[0], a[1])
plt.scatter(c[:, 0], c[:, 1], marker='^', s=50)
plt.show()
def get_label(data, center, m='ojld'):
    """Return the index of the row of ``center`` closest to ``data``.

    Closeness is measured by squared Euclidean distance. ``m`` is accepted
    but never read by this function.
    """
    offsets = center - data
    squared_dist = np.sum(np.square(offsets), axis=1)
    return np.argmin(squared_dist)
# One k-means update step: assign rows to centers, then recompute centers.
def k_mean(df, center):
    """Assign every row to its nearest center and recompute the centers.

    Args:
        df: DataFrame of numeric features, one sample per row. It is not
            modified; a copy is used internally.
        center: array-like with one row per current cluster center and one
            column per feature of ``df``.

    Returns:
        (center, labels):
            center: float ndarray with the same number of rows as the input
                ``center``. Each row is the mean of the samples assigned to
                that cluster; a cluster that received no samples keeps its
                previous position instead of being dropped.
            labels: list with, for each row of ``df``, the index of the
                nearest *input* center (squared Euclidean distance).
    """
    old_center = np.array(center, dtype=float)
    points = df.values

    # (n_samples, n_centers) matrix of squared distances via broadcasting.
    sq_dist = ((points[:, None, :] - old_center[None, :, :]) ** 2).sum(axis=2)
    labels = list(sq_dist.argmin(axis=1))

    labelled = df.copy()
    labelled['label'] = labels
    means = labelled.groupby(by=['label']).mean()

    # groupby only yields labels that actually occur, so start from the old
    # centers and overwrite just the rows for non-empty clusters. This keeps
    # the number of centers stable and label indices aligned with rows.
    new_center = old_center.copy()
    new_center[means.index.to_numpy(dtype=int)] = means.values
    return new_center, labels
def c_l(X, c_n, c_init='random', iters=10):
    """Cluster ``X`` with k-means.

    Args:
        X: data to cluster, passed unchanged to ``get_center`` and ``k_mean``.
        c_n: number of cluster centers.
        c_init: initialisation method forwarded to ``get_center``.
        iters: maximum number of update steps; must be at least 1.

    Returns:
        (center, labels) from the last ``k_mean`` call. ``labels`` are the
        assignments computed against the centers that were passed into that
        call; once the centers have stopped moving they also match the
        returned ``center``.

    Raises:
        ValueError: if ``iters`` is less than 1 (no labels would exist).
    """
    if iters < 1:
        raise ValueError(f"iters must be at least 1, got {iters}")

    # Initialise the cluster centers.
    center = get_center(X, method=c_init, cn=c_n)
    for _ in range(iters):
        new_center, labels = k_mean(X, center)
        # The update is deterministic, so once an update leaves the centers
        # unchanged every further iteration would return the same result.
        converged = np.array_equal(new_center, center)
        center = new_center
        if converged:
            break
    return center, labels
# Clustering score: within-cluster distance relative to between-center distance.
def matric(df, cn, labels):
    """Return the ratio of within-cluster distance to between-center distance.

    Args:
        df: DataFrame of samples, one per row.
        cn: array of cluster centers, indexed by label.
        labels: per-row cluster label for ``df``.

    Returns:
        Sum of Euclidean distances from each sample to its cluster center,
        divided by the sum of Euclidean distances over all pairs of centers
        (plus a small constant so the denominator is never zero).
    """
    # Between-center term: every pair is visited twice, hence the halving.
    separation = 0
    for anchor in cn:
        to_all_centers = ((cn - anchor) ** 2).sum(axis=1) ** 0.5
        separation += to_all_centers.sum()
    separation *= 0.5

    # Within-cluster term: the label column is appended last, so dropping
    # the final column recovers the feature values.
    tagged = df.copy()
    tagged['label'] = labels
    spread = 0
    for cluster_id in sorted(set(labels)):
        members = tagged.loc[tagged['label'] == cluster_id].iloc[:, :-1].values
        to_center = ((members - cn[cluster_id]) ** 2).sum(axis=1) ** 0.5
        spread += to_center.sum()

    return spread / (separation + 10e-5)
# Combined test: two Gaussian blobs clustered into four groups.
a = np.random.randn(120, 2) + 2
b = np.random.randn(120, 2) - 1
d = np.concatenate((a, b), axis=0)
p = pd.DataFrame(d, columns=['l1', 'l2'])
cn, lb = c_l(p, c_n=4, c_init='km++', iters=10)
plt.scatter(p['l1'], p['l2'], label='point')
plt.scatter(cn[:, 0], cn[:, 1], label='center')
plt.legend()
plt.show()
# Visualise how the cluster centers move from one update step to the next.
cs = []
center = get_center(p, method='random', cn=3)
for _ in range(12):
    center, labels = k_mean(p, center)
    cs.append(center)

plt.figure(figsize=(9, 6))
# All samples, drawn as hollow orange circles.
plt.plot(p['l1'], p['l2'], 'o', c='orange', mfc='white')
# zip(*cs) regroups the history so each item is one center's positions over time.
for trail in zip(*cs):
    path = np.array(trail)
    plt.plot(path[:, 0], path[:, 1], '^-')
first_recorded, last_recorded = cs[0], cs[-1]
plt.plot(first_recorded[:, 0], first_recorded[:, 1], 'k^', label='start')
plt.plot(last_recorded[:, 0], last_recorded[:, 1], 'r^', label='end')
plt.legend()
plt.show()
# Effect of the number of clusters k on the clustering score.
m = []
K = range(2, 10)
for k in tqdm(K):
    # Cluster with the k under evaluation so each point on the curve
    # reflects a different cluster count.
    cn, lb = c_l(p, c_n=k, c_init='km++', iters=100)
    m.append(matric(p, cn, lb))
plt.plot(K, m)
plt.show()