K-means is a widely used prototype-based clustering algorithm. It looks for a partition of the dataset into clusters that minimizes the squared error between each sample and the mean of its cluster, and it does so by iteratively reassigning samples to clusters and updating the cluster means.
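Written out, the quantity being minimized is the within-cluster sum of squared errors (here for the Euclidean case p = 2, which the script below uses by default): with clusters $C_1, \dots, C_k$ and cluster means $\mu_i$,

$$
E = \sum_{i=1}^{k} \sum_{x \in C_i} \lVert x - \mu_i \rVert^{2},
\qquad
\mu_i = \frac{1}{|C_i|} \sum_{x \in C_i} x .
$$

For this Euclidean case, each reassignment step and each mean-update step can only keep E the same or lower it, which is why the iteration below terminates once no sample changes its cluster.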
conda create -n ML python==3.9
conda activate ML
conda install scikit-learn matplotlib seaborn
| Package | Version used in this experiment |
| --- | --- |
| matplotlib | 3.5.2 |
| numpy | 1.21.5 |
| python | 3.9.13 |
| scikit-learn | 1.0.2 |
| seaborn | 0.11.2 |
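To check that the created environment matches the versions in the table, the installed packages can be queried directly; a small throw-away snippet:

```python
import sys
import matplotlib
import numpy
import seaborn
import sklearn

# Print the versions installed in the activated ML environment.
print("python      ", sys.version.split()[0])
print("matplotlib  ", matplotlib.__version__)
print("numpy       ", numpy.__version__)
print("scikit-learn", sklearn.__version__)
print("seaborn     ", seaborn.__version__)
```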
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
import argparse
The implementation is built around a `Kmeans` class with the following members:

- `__init__`: initializes the parameters of K-means clustering, including the number of clusters `k`, the data `data`, the initialization mode `mode` (default `"random"`), the maximum number of iterations `max_iters`, the order `p` of the Minkowski distance, and the random seed `seed`.
- `minkowski_distance`: computes the Minkowski distance between two sample points.
- `center_init`: initializes the cluster centers according to the specified mode.
- `fit`: runs the K-means iterations, assigning each sample to its nearest cluster and updating the cluster centers until the stopping condition is met.
- `visualization`: visualizes the clustering result with Seaborn and Matplotlib.

class Kmeans(object):
def __init__(self, k, data: np.ndarray, mode="random", max_iters=0, p=2, seed=0):
self.k = k
self.data = data
self.mode = mode
self.max_iter = max_iters if max_iters > 0 else int(1e8)
self.p = p
self.seed = seed
self.centers = None
        self.clu_idx = np.zeros(len(self.data), dtype=np.int32)  # cluster index assigned to each sample
        self.clu_dist = np.zeros(len(self.data), dtype=np.float64)  # distance from each sample to its cluster center
The constructor stores the number of clusters `k`, the data `data`, the initialization mode `mode`, the maximum number of iterations `max_iters`, the distance order `p`, and the random seed `seed`. `self.centers` is initialized to `None`, indicating that the cluster centers have not been computed yet, while `self.clu_idx` and `self.clu_dist` are initialized to all-zero arrays that record, for each sample, its assigned cluster and its distance to that cluster's center.
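For instance, constructing the class with the defaults behaves as described above (a minimal sketch with made-up random data, assuming the complete `Kmeans` class from this post is in scope):

```python
import numpy as np

toy = np.random.rand(100, 2)            # 100 illustrative 2-D samples
km = Kmeans(k=3, data=toy, seed=42)     # mode="random", max_iters=0, p=2 by default
print(km.max_iter)                      # 100000000 -- max_iters=0 means "effectively unlimited"
print(km.centers)                       # None -- centers are only computed by center_init()/fit()
print(km.clu_idx[:5], km.clu_dist[:5])  # all zeros until fit() assigns clusters
```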
    def minkowski_distance(self, x, y=0):
        return np.linalg.norm(x - y, ord=self.p)
The Minkowski distance is computed directly with NumPy's `linalg.norm` function, where the `ord` argument specifies the order of the distance.
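To make the role of `ord` concrete, the same call yields the Manhattan, Euclidean and Chebyshev distances for p = 1, 2 and inf (a small sketch; the two points are made up):

```python
import numpy as np

x = np.array([1.0, 2.0])
y = np.array([4.0, 6.0])

print(np.linalg.norm(x - y, ord=1))       # 7.0 -> Manhattan distance (p = 1)
print(np.linalg.norm(x - y, ord=2))       # 5.0 -> Euclidean distance (p = 2)
print(np.linalg.norm(x - y, ord=np.inf))  # 4.0 -> Chebyshev distance (p = inf)
```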
    def center_init(self):
        random.seed(self.seed)
if self.mode == "random":
            ids = random.sample(range(len(self.data)), k=self.k)  # randomly draw k distinct sample indices
            self.centers = self.data[ids]  # use these k samples as the initial cluster centers
else:
            ids = [random.randrange(self.data.shape[0])]  # one random sample to start; randrange avoids randint's inclusive upper bound
            for _ in range(1, self.k):
                max_idx = 0
                max_dis = 0
                # greedily pick the sample whose total distance to the chosen centers is largest
                for i, x in enumerate(self.data):
                    if i in ids:
                        continue
                    dis = 0
                    for y in self.data[ids]:
                        dis += self.minkowski_distance(x, y)
                    if max_dis < dis:
                        max_dis = dis
                        max_idx = i
                ids.append(max_idx)
            self.centers = self.data[ids]
`center_init` supports two modes: in `"random"` mode it simply draws `k` samples at random and uses them as the initial cluster centers; in the other (`"far"`) mode it starts from one random sample and then repeatedly adds the sample whose total distance to the already-chosen centers is largest, so that the initial centers are spread far apart.
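To see the effect of the `"far"` mode, the centers can be initialized on a handful of points and inspected (a sketch with made-up toy points; it assumes the complete class defined in this post):

```python
import numpy as np

# Two tight groups plus one separate point, purely for illustration.
toy = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0], [0.0, 5.0]])

km = Kmeans(k=3, data=toy, mode="far", seed=0)
km.center_init()
print(km.centers)  # three mutually distant samples chosen as the initial centers
```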
    def fit(self):
        self.center_init()  # initialize the cluster centers
        for _ in range(self.max_iter):
            flag = False  # whether any sample changed its cluster in this iteration
            # iterate over every sample
            for i, x in enumerate(self.data):
                min_idx = -1  # index of the nearest center
                min_dist = np.inf  # smallest distance found so far
                for j, y in enumerate(self.centers):  # loop over every center and compute its distance to sample i
                    # distance from sample i to center j
                    dist = self.minkowski_distance(x, y)
                    if min_dist > dist:
                        min_dist = dist
                        min_idx = j
                if self.clu_idx[i] != min_idx:
                    # a sample switched clusters, so the centers must be updated again
                    flag = True
                # record the nearest center min_idx and the corresponding distance min_dist
                self.clu_idx[i] = min_idx
                self.clu_dist[i] = min_dist
            # once every sample is assigned, move each center to the mean of its cluster
            for i in range(self.k):
                x = self.data[self.clu_idx == i]
                if len(x) > 0:  # guard against an empty cluster, which would yield NaN
                    self.centers[i] = np.mean(x, axis=0)
            if not flag:
                break
def visualization(self, k=3):
current_palette = sns.color_palette()
sns.set_theme(context="talk", palette=current_palette)
for i in range(self.k):
x = self.data[self.clu_idx == i]
sns.scatterplot(x=x[:, 0], y=x[:, 1], alpha=0.8)
sns.scatterplot(x=self.centers[:, 0], y=self.centers[:, 1], marker="+", s=500)
plt.title("k=" + str(k))
plt.show()
def order_type(v: str):
if v.lower() in ("-inf", "inf"):
return -np.inf if v.startswith("-") else np.inf
else:
try:
return float(v)
except ValueError:
raise argparse.ArgumentTypeError("Unsupported value encountered")
def mode_type(v: str):
if v.lower() in ("random", "far"):
return v.lower()
else:
raise argparse.ArgumentTypeError("Unsupported value encountered")
`order_type` handles the `-p` command-line argument (the distance order), converting the string to a float and additionally accepting `inf`/`-inf`. `mode_type` handles the `--mode` argument (the initialization mode), accepting only the legal modes `random` and `far`. The command-line arguments are then parsed with `argparse`:
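As a quick illustration of what these converters accept (a sketch assuming `order_type` and `mode_type` from the script are in scope):

```python
print(order_type("2"))      # 2.0
print(order_type("inf"))    # inf   -> Chebyshev distance
print(order_type("-inf"))   # -inf
print(mode_type("Random"))  # 'random'
# order_type("abc") and mode_type("nearest") raise argparse.ArgumentTypeError
```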
parser = argparse.ArgumentParser(description="Kmeans Demo")
parser.add_argument("-k", type=int, default=3, help="The number of clusters")
parser.add_argument("--mode", type=mode_type, default="random", help="Initial centroid selection")
parser.add_argument("-m", "--max-iters", type=int, default=40, help="Maximum iterations")
parser.add_argument("-p", type=order_type, default=2., help="Distance measurement")
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument("--dataset", type=str, default="./kmeans.2.txt", help="Path to dataset")
args = parser.parse_args()
dataset = np.loadtxt(args.dataset)
model = Kmeans(k=args.k, data=dataset, mode=args.mode, max_iters=args.max_iters, p=args.p,
seed=args.seed)
model.fit()
# visualize the clustering result
model.visualization(k=args.k)
python kmeans.py -k 3 --mode random -m 40 -p 2 --seed 0 --dataset ./kmeans.2.txt
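The script reads the dataset with `np.loadtxt`, i.e. a plain-text file with one whitespace-separated 2-D sample per row. If `./kmeans.2.txt` is not at hand, a compatible file can be generated, for example with `make_blobs` from the already-installed scikit-learn (the blob parameters below are made up):

```python
import numpy as np
from sklearn.datasets import make_blobs

# 300 two-dimensional samples around 3 centers, saved in the
# whitespace-separated format that np.loadtxt expects.
X, _ = make_blobs(n_samples=300, n_features=2, centers=3, cluster_std=1.0, random_state=0)
np.savetxt("./kmeans.2.txt", X)
```

The complete script is reproduced below.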
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
import argparse
class Kmeans(object):
def __init__(self, k, data: np.ndarray, mode="random", max_iters=0, p=2, seed=0):
self.k = k
self.data = data
self.mode = mode
self.max_iter = max_iters if max_iters > 0 else int(1e8)
self.p = p
self.seed = seed
self.centers = None
        self.clu_idx = np.zeros(len(self.data), dtype=np.int32)  # cluster index assigned to each sample
        self.clu_dist = np.zeros(len(self.data), dtype=np.float64)  # distance from each sample to its cluster center
def minkowski_distance(self, x, y=0):
return np.linalg.norm(x - y, ord=self.p)
    # initialize the cluster centers
def center_init(self):
random.seed(self.seed)
if self.mode == "random":
            ids = random.sample(range(len(self.data)), k=self.k)  # randomly draw k distinct sample indices
            self.centers = self.data[ids]  # use these k samples as the initial cluster centers
else:
            ids = [random.randrange(self.data.shape[0])]  # one random sample to start; randrange avoids randint's inclusive upper bound
            for _ in range(1, self.k):
                max_idx = 0
                max_dis = 0
                # greedily pick the sample whose total distance to the chosen centers is largest
                for i, x in enumerate(self.data):
                    if i in ids:
                        continue
                    dis = 0
                    for y in self.data[ids]:
                        dis += self.minkowski_distance(x, y)
                    if max_dis < dis:
                        max_dis = dis
                        max_idx = i
                ids.append(max_idx)
            self.centers = self.data[ids]
def fit(self):
        self.center_init()  # initialize the cluster centers
        for _ in range(self.max_iter):
            flag = False  # whether any sample changed its cluster in this iteration
            # iterate over every sample
            for i, x in enumerate(self.data):
                min_idx = -1  # index of the nearest center
                min_dist = np.inf  # smallest distance found so far
                for j, y in enumerate(self.centers):  # loop over every center and compute its distance to sample i
                    # distance from sample i to center j
                    dist = self.minkowski_distance(x, y)
                    if min_dist > dist:
                        min_dist = dist
                        min_idx = j
                if self.clu_idx[i] != min_idx:
                    # a sample switched clusters, so the centers must be updated again
                    flag = True
                # record the nearest center min_idx and the corresponding distance min_dist
                self.clu_idx[i] = min_idx
                self.clu_dist[i] = min_dist
            # once every sample is assigned, move each center to the mean of its cluster
            for i in range(self.k):
                x = self.data[self.clu_idx == i]
                if len(x) > 0:  # guard against an empty cluster, which would yield NaN
                    self.centers[i] = np.mean(x, axis=0)
            if not flag:
                break
def visualization(self, k=3):
current_palette = sns.color_palette()
sns.set_theme(context="talk", palette=current_palette)
for i in range(self.k):
x = self.data[self.clu_idx == i]
sns.scatterplot(x=x[:, 0], y=x[:, 1], alpha=0.8)
sns.scatterplot(x=self.centers[:, 0], y=self.centers[:, 1], marker="+", s=500)
plt.title("k=" + str(k))
plt.show()
def order_type(v: str):
if v.lower() in ("-inf", "inf"):
return -np.inf if v.startswith("-") else np.inf
else:
try:
return float(v)
except ValueError:
raise argparse.ArgumentTypeError("Unsupported value encountered")
def mode_type(v: str):
if v.lower() in ("random", "far"):
return v.lower()
else:
raise argparse.ArgumentTypeError("Unsupported value encountered")
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Kmeans Demo")
parser.add_argument("-k", type=int, default=3, help="The number of clusters")
parser.add_argument("--mode", type=mode_type, default="random", help="Initial centroid selection")
parser.add_argument("-m", "--max-iters", type=int, default=40, help="Maximum iterations")
parser.add_argument("-p", type=order_type, default=2., help="Distance measurement")
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument("--dataset", type=str, default="./kmeans.2.txt", help="Path to dataset")
args = parser.parse_args()
dataset = np.loadtxt(args.dataset)
model = Kmeans(k=args.k, data=dataset, mode=args.mode, max_iters=args.max_iters, p=args.p,
                   seed=args.seed)
model.fit()
    # visualize the clustering result
model.visualization(k=args.k)
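Since scikit-learn is already part of the experiment environment, the result can be cross-checked against its built-in `KMeans`; this is only a sanity-check sketch and is not used by the script above:

```python
import numpy as np
from sklearn.cluster import KMeans

data = np.loadtxt("./kmeans.2.txt")
skl = KMeans(n_clusters=3, random_state=0).fit(data)
print(skl.cluster_centers_)  # should be close to model.centers (cluster order may differ)
print(skl.inertia_)          # within-cluster sum of squared errors, i.e. the objective E above
```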