KMeans属于无监督(即无标签)聚类算法,在不知道数据没有具体的划分标准时,通过物以类聚的方法,将相似数据放在一起。
import random
import matplotlib.pyplot as plt
points_num = 100
random_x = [random.randint(-100, 100) for _ in range(points_num)]
random_y = [random.randint(-100, 100) for _ in range(points_num)]
random_poinsts = [(x, y) for x, y in zip(random_x, random_y)]
plt.scatter(random_x, random_y)
(初始点的选择不影响最后的分类结果,只影响迭代次数)
#随机初始点
def generate_random_point(min_, max_):
return random.randint(min_, max_), random.randint(min_, max_)
#随机生成五个点
generate_random_point(-100, 100)
K = 5
#初始中心点
pervious_kernels = [generate_random_point(-100, 100) for _ in range(K)]
print(pervious_kernels)
#将五个点在图中画出
plt.scatter([pervious_kernels[0][0]], [pervious_kernels[0][1]], color='red', s=100)
plt.scatter([pervious_kernels[1][0]], [pervious_kernels[1][1]], color='green', s=100)
plt.scatter([pervious_kernels[2][0]], [pervious_kernels[2][1]], color='yellow', s=100)
plt.scatter([pervious_kernels[3][0]], [pervious_kernels[3][1]], color='blue', s=100)
plt.scatter([pervious_kernels[4][0]], [pervious_kernels[4][1]], color='purple', s=100)
#画出随机的点[x,y]
plt.scatter(random_x, random_y)
距离函数:欧氏距离
import numpy as np
#欧氏距离
def dis(x, y):
return np.sqrt((x[0] - y[0]) ** 2 + (x[1] - y[1]) ** 2)
kernel_colors = ['red', 'green', 'yellow', 'blue', 'purple']
#存放迭代更新后的中心点
new_kernels = []
#存放与相应的中心点为一类的点
groups = [[] for _ in range(K)]
#对所有点与现在的中心点求距离,并将其与最小距离的点归为同一类
for p in random_poinsts:
distances = [dis(p, k) for k in previous_kernels]
min_index = np.argmin(distances)
groups[min_index].append(p)
#画出当前的中心的点与其集合中的点
for i, p in enumerate(previous_kernels):
plt.scatter([p[0]], [p[1]], color=kernel_colors[i], s=100)
#plt.scatter([previous_kernels[1][0]], [previous_kernels[1][1]], color=kernel_colors[1], s=100)
#plt.scatter([previous_kernels[2][0]], [previous_kernels[2][1]], color=kernel_colors[2], s=100)
#当前点不一定是最终的中心点,对其进行更新
#根据所有点的均值求出其更新后的中心点
for i, g in enumerate(groups):
g_x = [_x for _x, _y in g]
g_y = [_y for _x, _y in g]
n_k_x, n_k_y = np.mean(g_x), np.mean(g_y)
new_kernels.append((n_k_x, n_k_y))
print(kernel_colors[i])
plt.scatter(g_x, g_y, color=kernel_colors[i])
plt.scatter([n_k_x], [n_k_y], color=kernel_colors[i], alpha=0.5, s=200)
print('根据新的Group获得的kernal和之前的kernel的距离是: {}'.format(dis((n_k_x, n_k_y), previous_kernels[i])))
#plt.scatter(random_x, random_y, s=10)
#更新中心点
previous_kernels = new_kernels
手肘法
matplotlib.pyplot 绘制双 y 轴图像
seaborn 绘制数据分布图
#商场顾客分类
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
#加载数据
data = pd.read_csv("Mall_Customers.csv")
# data = pd.DataFrame(data)
# print(data.describe(), data.info())
#取出需要分类的数据
train_x = data.iloc[:,1:5]
#标签编码, LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_x['Gender'] = le.fit_transform(train_x['Gender'])
# col = train_x.columns.to_list()
#如何取列名:https://www.cnblogs.com/wqbin/p/11845042.html
# 规范化到 [0,1] 空间
min_max_scaler = preprocessing.MinMaxScaler()
train_x1 = min_max_scaler.fit_transform(train_x)
pd.DataFrame(train_x1, columns =train_x.columns).to_csv('temp.csv', index=False)
# print(train_x2))
#正态分布的规范化
# normalizer = Normalizer()
# train_x1 = normalizer.fit_transform(train_x)
# print(train_x1)
#使用KMeans聚类
kmeans = KMeans(n_clusters = 3)
kmeans.fit(train_x)
pre_y = kmeans.predict(train_x)
#将数据与源数据拼接
results = pd.concat((train_x, pd.DataFrame(pre_y, columns = ['Group'])), axis = 1)
results.to_csv('Customers_Group.csv', index = False)
results.head(20)
#得到数据可视化内容
group = pd.DataFrame()
group_list = [group, group, group]
for i in range(3):
group = pd.DataFrame(results.iloc[results[results["Group"] == i].index])
group.columns = ['gender', 'age', 'income', 'spending','group']
group_list[i] = group.describe()
#得到描述统计的列表
print(group_list)
group_attr = {}
for i in range(3):
group_attr[i] = []
group_attr[i] += [group_list[i].spending['mean'], group_list[i].income['mean'], group_list[i].age['count']]
group_attr = pd.DataFrame(group_attr.values(), index = group_attr.keys(), columns = ['spending', 'income', 'count'])
group_attr
spending、income 在各群体中的均值,各群体中顾客的数量 count 如表所示。
对其进行可视化分析:
#Group特征分析
#新建画布可视化分析
fig = plt.figure(figsize = [18,18])
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题
plt.suptitle('商场顾客分类:KMeans聚类', fontsize=30)
#绘制第一张图:各群体的spending, income, count特征
#添加标题
ax1 = plt.subplot(311)
ax2 = ax1.twinx()
plt.title('各群体的spending, income, count特征', size=20)
#设置参数
bar_width = 0.2
bar_1 = group_attr.index.tolist()
bar_2 = [i+bar_width for i in bar_1]
bar_3 = [i+bar_width for i in bar_2]
#导入数据,绘制条形图
ax1.bar(bar_1, group_attr['spending'], width=bar_width, label='spending', alpha = 0.3)
ax1.bar(bar_2, group_attr['income'], width=bar_width, label='income', alpha = 0.3)
ax2.bar(bar_3, group_attr['count'], width=bar_width, label='count',color = 'green', alpha = 0.3)
#添加xy轴
# ax1.set_xlabel('Group',size = 20)
ax1.set_ylabel('元', size = 20)
ax2.set_ylabel('数量', size = 20)
#x轴刻度
ax1.set_xticks(bar_2)
ax1.set_xticklabels( ['group1', 'group2', 'group3'], size = 15)
# 图例
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
plt.legend(handles1+handles2, labels1+labels2, loc='upper right')
#网格
ax1.grid(alpha = 0.1)
#绘制第二张图:各群体的年龄分布
# # plt.subplot(3,3,4)
# sns.set(font='SimHei',font_scale=1.5) # 解决Seaborn中文显示问题并调整字体大小
# fig1, axes = plt.subplots(1,3, figsize=(18, 7))
# sns.set(color_codes=True, font_scale = 1.25)
# # sns.distplot(results.Age[[result.Group == 0].index], ax=axes[0])
# sns.distplot(results[results.Group == 0]['Age'].tolist(), rug_kws={"color": "g"}, kde_kws={ 'alpha' : 0.5, "lw":4, "label": "kde", 'color': 'g'}, hist_kws={"label" : 'hist'}, axlabel = 'group1:年龄分布', ax = axes[0])
# # plt.subplot(3,3,5)
# sns.distplot(results[results.Group == 1]['Age'].tolist(), rug_kws={"color": "g"}, kde_kws={ 'alpha' : 0.5, "lw":4, "label": "kde", 'color': 'g'}, hist_kws={"label" : 'hist'}, axlabel = 'group2:年龄分布', ax = axes[1])
# # plt.subplot(3,3,6)
# sns.distplot(results[results.Group == 2]['Age'].tolist(), rug_kws={"color": "g"}, kde_kws={ 'alpha' : 0.5, "lw":4, "label": "kde", 'color': 'g'}, hist_kws={"label" : 'hist'}, axlabel = 'group3:年龄分布', ax = axes[2])
for i in range(3):
plt.subplot(3,3, i +4)
plt.title('group'+str(i + 1)+':年龄分布特性', size = 20)
sns.distplot(results[results.Group == i]['Age'].tolist(), rug_kws={"color": "g"}, kde_kws={ 'alpha' : 0.5, "lw":4, "label": "kde", 'color': 'g'}, hist_kws={"label" : 'hist'}, axlabel = '')
group1:群体数量相对较少,相对年龄最小,消费能力最强,收入相对较高,但均用于消费;
group2:相对年龄跨度最大,属于普遍状态,但收入较低,消费能力强于收入能力,超前消费较多;
group3:稳健型消费者,收入高且收入大于消费,年龄在20 ~ 50岁之间,数量与 group1 数量基本相当,有剩余资金可以进行消费。