The K-Means algorithm introduced earlier requires specifying K (the number of clusters) up front. The Mean Shift clustering algorithm implemented in this post needs no preset number of clusters and places no constraints on cluster shape.
To better understand the algorithm, this post implements Mean Shift in Python.
A detailed introduction to the Mean Shift algorithm: https://en.wikipedia.org/wiki/Mean_shift
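The core of the algorithm is one update rule: shift every point to the mean of all samples within a fixed radius (the bandwidth), and repeat until nothing moves; points that converge to the same spot form one cluster, so no K is ever specified. A minimal sketch of that single update step, assuming a flat kernel (the function name mean_shift_step is illustrative, not from the code below):

import numpy as np

def mean_shift_step(point, data, bandwidth):
    # move `point` to the mean of all samples within `bandwidth` of it
    neighbors = data[np.linalg.norm(data - point, axis=1) < bandwidth]
    return neighbors.mean(axis=0)

Iterating this step drives each point to a local density peak; the number of distinct peaks is the number of clusters.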
import numpy as np
from sklearn.cluster import MeanShift
from matplotlib import pyplot
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets import make_blobs  # sklearn.datasets.samples_generator was removed in newer scikit-learn

fig = pyplot.figure()
ax = fig.add_subplot(111, projection='3d')

# generate 3 clusters of sample data
centers = [[2, 1, 3], [6, 6, 6], [10, 12, 9]]
x, _ = make_blobs(n_samples=200, centers=centers, cluster_std=1)
# for i in range(len(x)):
#     ax.scatter(x[i][0], x[i][1], x[i][2])

# cluster the data above
clf = MeanShift()
clf.fit(x)

labels = clf.labels_                    # the cluster each point belongs to
cluster_centers = clf.cluster_centers_  # the "center" of each cluster
# print(labels)
print(cluster_centers)

colors = ['r', 'g', 'b']
for i in range(len(x)):
    ax.scatter(x[i][0], x[i][1], x[i][2], c=colors[labels[i]])
ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], cluster_centers[:, 2], marker='*', c='k', s=200, zorder=10)
pyplot.show()
Mean Shift automatically split the data above into 3 clusters; the computed "centers" of the three clusters are:
[[ 1.97566619 1.04212548 3.02410725]
[ 6.01672157 6.18325271 5.96562957]
[ 10.14455378 12.02394435 9.03499578]]
# close to [[2,1,3], [6,6,6], [10,12,9]]; the more samples generated, the closer the match
# -*- coding:utf-8 -*-
import numpy as np
from matplotlib import pyplot
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets import make_blobs  # sklearn.datasets.samples_generator was removed in newer scikit-learn


class MeanShift(object):
    def __init__(self, bandwidth=4):
        # the bandwidth parameter is the radius around each point
        self.bandwidth_ = bandwidth

    def fit(self, data):
        centers = {}
        # treat every point as a center
        for i in range(len(data)):
            centers[i] = data[i]
        # print(centers)
        while True:
            new_centers = []
            for i in centers:
                in_bandwidth = []
                # take one center and collect the other points within its radius
                center = centers[i]
                for feature in data:
                    # the smaller self.bandwidth_, the more clusters you get
                    if np.linalg.norm(feature - center) < self.bandwidth_:
                        in_bandwidth.append(feature)
                # shift the center to the mean of the points in range
                new_center = np.average(in_bandwidth, axis=0)
                new_centers.append(tuple(new_center))
            # merge centers that landed on the same spot
            uniques = sorted(list(set(new_centers)))
            prev_centers = dict(centers)
            centers = {}
            for i in range(len(uniques)):
                centers[i] = np.array(uniques[i])
            # stop once no center moves any more
            optimized = True
            for i in centers:
                if not np.array_equal(centers[i], prev_centers[i]):
                    optimized = False
                if not optimized:
                    break
            if optimized:
                break
        self.centers_ = centers


if __name__ == '__main__':
    fig = pyplot.figure()
    ax = fig.add_subplot(111, projection='3d')

    centers = [[2, 1, 3], [6, 6, 6], [10, 12, 9]]
    x, _ = make_blobs(n_samples=18, centers=centers, cluster_std=1)

    clf = MeanShift()
    clf.fit(x)
    print(clf.centers_)

    for i in clf.centers_:
        ax.scatter(clf.centers_[i][0], clf.centers_[i][1], clf.centers_[i][2], marker='*', c='k', s=200, zorder=10)
    for i in range(len(x)):
        ax.scatter(x[i][0], x[i][1], x[i][2])
    pyplot.show()
Execution results:
The bandwidth parameter is the radius around each point. With bandwidth=20:
With bandwidth=2.5:
A suitable bandwidth can also be estimated from the data samples themselves.
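scikit-learn ships a helper for exactly this; a minimal sketch, assuming the blob data x from the first example (quantile=0.2 is an arbitrary illustrative choice):

from sklearn.cluster import estimate_bandwidth
bandwidth = estimate_bandwidth(x, quantile=0.2, n_samples=100)
print(bandwidth)
# the estimate can then be passed in as MeanShift(bandwidth=bandwidth)

The script below instead takes a home-grown approach to the same idea: it derives a bandwidth from the data and weights points by their distance from the center.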
# -*- coding:utf-8 -*-
import numpy as np
from matplotlib import pyplot
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets import make_blobs  # sklearn.datasets.samples_generator was removed in newer scikit-learn


class MeanShift(object):
    def __init__(self, bandwidth=None, bandwidth_step=100):
        self.bandwidth_ = bandwidth
        self.bandwidth_step_ = bandwidth_step

    def fit(self, data):
        # if no bandwidth is given, derive one from the data
        if self.bandwidth_ is None:
            all_data_center = np.average(data, axis=0)
            self.bandwidth_ = np.linalg.norm(all_data_center) / self.bandwidth_step_
            print(self.bandwidth_)
        centers = {}
        # treat every point as a center
        for i in range(len(data)):
            centers[i] = data[i]
        # print(centers)
        while True:
            new_centers = []
            for i in centers:
                in_bandwidth = []
                # take one center and weight every point by its distance to it
                center = centers[i]
                # weights in descending order: the closer a point, the larger its weight
                w = list(range(self.bandwidth_step_))[::-1]
                for feature in data:
                    distance = np.linalg.norm(feature - center)
                    if distance == 0:
                        distance = 0.000000001
                    w_index = int(distance / self.bandwidth_)
                    if w_index > self.bandwidth_step_ - 1:
                        w_index = self.bandwidth_step_ - 1
                    # add weight**2 copies of the point, so near points dominate the mean
                    in_bandwidth += (w[w_index] ** 2) * [feature]
                new_center = np.average(in_bandwidth, axis=0)
                new_centers.append(tuple(new_center))
            uniques = sorted(list(set(new_centers)))
            # merge centers that are within one bandwidth of each other
            tmp = []
            for i in uniques:
                for ii in uniques:
                    if i == ii:
                        pass
                    elif np.linalg.norm(np.array(i) - np.array(ii)) <= self.bandwidth_:
                        tmp.append(ii)
                        break
            for i in tmp:
                try:
                    uniques.remove(i)
                except ValueError:
                    pass
            prev_centers = dict(centers)
            centers = {}
            for i in range(len(uniques)):
                centers[i] = np.array(uniques[i])
            # stop once no center moves any more
            optimized = True
            for i in centers:
                if not np.array_equal(centers[i], prev_centers[i]):
                    optimized = False
                if not optimized:
                    break
            if optimized:
                break
        self.centers_ = centers

    def predict(self, data):
        # assign each point to its nearest center
        self.labels_ = {}
        for i in range(len(self.centers_)):
            self.labels_[i] = []
        for feature in data:
            distances = [np.linalg.norm(feature - self.centers_[center]) for center in self.centers_]
            clf = distances.index(min(distances))
            self.labels_[clf].append(feature)
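The class above ships without a driver; a minimal usage sketch mirroring the earlier example (the sample data and the prints are assumptions, not original code):

if __name__ == '__main__':
    x, _ = make_blobs(n_samples=30, centers=[[2, 1, 3], [6, 6, 6], [10, 12, 9]], cluster_std=1)
    clf = MeanShift()
    clf.fit(x)       # finds the centers using the data-derived bandwidth
    clf.predict(x)   # fills clf.labels_ with the points nearest each center
    print(clf.centers_)
    for i in clf.labels_:
        print('cluster', i, 'holds', len(clf.labels_[i]), 'points')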
Dataset: titanic.xls (a list of Titanic victims/survivors). Goal: cluster the passengers and see what the people in each group have in common.
# -*- coding:utf-8 -*-
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn import preprocessing
import pandas as pd
'''
Dataset: titanic.xls (a list of Titanic victims/survivors)
***Fields***
pclass: social class (1 = upper class; 2 = middle class; 3 = crew/working class)
survived: whether the passenger survived
name: name
sex: sex
age: age
sibsp: number of siblings/spouses aboard
parch: number of parents/children aboard
ticket: ticket number
fare: ticket price
cabin: cabin
embarked: port of embarkation
boat: lifeboat
body: body identification number
home.dest: home/destination
******
Goal: run Mean Shift on all fields except survived, see how many groups emerge, and what the people in each group have in common
'''
# load the data
df = pd.read_excel('titanic.xls')
# print(df.shape)  # (1309, 14)
# print(df.head())
# print(df.tail())
"""
pclass survived name sex \
0 1 1 Allen, Miss. Elisabeth Walton female
1 1 1 Allison, Master. Hudson Trevor male
2 1 0 Allison, Miss. Helen Loraine female
3 1 0 Allison, Mr. Hudson Joshua Creighton male
4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female
age sibsp parch ticket fare cabin embarked boat body \
0 29.0000 0 0 24160 211.3375 B5 S 2 NaN
1 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN
2 2.0000 1 2 113781 151.5500 C22 C26 S NaN NaN
3 30.0000 1 2 113781 151.5500 C22 C26 S NaN 135.0
4 25.0000 1 2 113781 151.5500 C22 C26 S NaN NaN
home.dest
0 St Louis, MO
1 Montreal, PQ / Chesterville, ON
2 Montreal, PQ / Chesterville, ON
3 Montreal, PQ / Chesterville, ON
4 Montreal, PQ / Chesterville, ON
"""
org_df = pd.DataFrame.copy(df)
# drop fields that are of no use here
df.drop(['body', 'name'], axis=1, inplace=True)
df = df.apply(pd.to_numeric, errors='ignore')  # convert numeric-looking object columns to numbers (convert_objects was removed from pandas)
df.fillna(0, inplace=True)  # replace NaN with 0

# map strings to numbers, e.g. {female: 1, male: 0}
df_map = {}  # keep the mappings
cols = df.columns.values
for col in cols:
    if df[col].dtype != np.int64 and df[col].dtype != np.float64:
        temp = {}
        x = 0
        for ele in set(df[col].values.tolist()):
            if ele not in temp:
                temp[ele] = x
                x += 1
        df_map[df[col].name] = temp
        df[col] = list(map(lambda val: temp[val], df[col]))

for key, value in df_map.items():
    print(key, value)
# print(df.head())

# unsupervised learning: don't use the label
x = np.array(df.drop(['survived'], axis=1).astype(float))
# standardize every feature column to zero mean and unit variance; note that scaling is per column
x = preprocessing.scale(x)
clf = MeanShift()
clf.fit(x)
labels = clf.labels_
cluster_centers = clf.cluster_centers_
print('labels:',labels)
print('cluster_centers:',cluster_centers)
n_cluster = len(np.unique(labels))
print('n_cluster:',n_cluster)
# attach each passenger's cluster label to the original dataframe
org_df['group'] = labels
survivals = {}
for i in range(n_cluster):
    temp_df = org_df[org_df['group'] == i]
    survival_cluster = temp_df[(temp_df['survived'] == 1)]
    survival = 1.0 * len(survival_cluster) / len(temp_df)
    survivals[i] = survival
print(survivals)
# MeanShift automatically split the data into three groups (sometimes it finds four); the survival rate of each group:
# {0: 0.37782982045277125, 1: 0.8333333333333334, 2: 0.1}
# you can dig into org_df and look for what the people in each group have in common
# print(org_df[ org_df['group'] == 2 ])
# print(org_df[ org_df['group'] == 2 ].describe())
org_df.to_excel('group.xls')
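As a starting point for that analysis, a small sketch that summarizes each group; the column selection here is an illustrative assumption, not part of the original post:

for g in range(n_cluster):
    grp = org_df[org_df['group'] == g]
    print('group', g, 'size', len(grp), 'survival rate', survivals[g])
    print(grp[['pclass', 'age', 'fare']].describe())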
Source: http://blog.topspeedsnail.com/archives/10366