使用 random.shuffle() 对数据进行洗牌,再按比例切分数据。参数说明——data:原数据列表;shuffle:是否在切分前随机打乱数据;ratio:测试集所占比例。
def split(data, shuffle=False, ratio=0.2):
    """Split a list of samples into train/test portions.

    Args:
        data: source list of samples.
        shuffle: whether to randomly shuffle before splitting.
        ratio: fraction of samples placed in the test set.

    Returns:
        (train_data, test_data). If data is empty or the test slice
        would be empty (offset < 1), returns (data, []).
    """
    n = len(data)
    offset = int(n * ratio)
    if n == 0 or offset < 1:
        return data, []
    if shuffle:
        # Shuffle a copy: the original called random.shuffle(data) on the
        # caller's list, mutating it in place as a surprising side effect.
        data = list(data)
        random.shuffle(data)
    # First `offset` items form the test set, the remainder the train set.
    train_data = data[offset:]
    test_data = data[:offset]
    return train_data, test_data
def loadData(filename):
    """Load a whitespace-delimited numeric text file as a NumPy array."""
    return np.loadtxt(filename)
def randomCent(data, k):
    """Pick k distinct rows of `data` as the initial cluster centers.

    Args:
        data: (m, n) array of samples.
        k: number of centers (assumed k <= m — TODO confirm with callers).

    Returns:
        (k, n) float array whose rows are sampled without replacement.
    """
    m, n = data.shape
    # Sample distinct indices. The original used int(np.random.uniform(0, m))
    # per center, which could select the same row twice and yield duplicate
    # centers (and therefore empty clusters during k-means).
    indexs = np.random.choice(m, k, replace=False)
    # Copy into a fresh float array so later center updates never write
    # back into `data`.
    Cpoints = data[indexs, :].astype(float)
    return Cpoints
# E: Euclidean, M: Manhattan, C: cosine distance; defaults to Euclidean.
def distance(x, y, method='E'):
    """Dispatch to the distance function selected by `method`.

    Args:
        x, y: the two vectors to compare.
        method: 'E' (Euclidean), 'M' (Manhattan) or 'C' (cosine).

    Raises:
        ValueError: for an unknown method code. (The original left `dist`
        unassigned in that case and crashed with UnboundLocalError.)
    """
    if method == 'E':
        return distE(x, y)
    elif method == 'M':
        return distM(x, y)
    elif method == 'C':
        return distC(x, y)
    raise ValueError("unknown distance method: %r" % (method,))
# Euclidean (L2) distance.
def distE(x, y):
    """Return the Euclidean distance between vectors x and y."""
    return np.linalg.norm(np.subtract(x, y), ord=2)
# Manhattan (L1) distance.
def distM(x, y):
    """Return the Manhattan (city-block) distance between x and y."""
    return np.linalg.norm(np.subtract(x, y), ord=1)
# Cosine distance.
def distC(x, y):
    """Return the cosine distance 1 - cos(x, y) between vectors x and y."""
    dot_xy = np.dot(x, y)
    norm_product = np.linalg.norm(x, ord=2) * np.linalg.norm(y, ord=2)
    return 1 - dot_xy / norm_product
def KMeans(data, k, method='E'):
    """Cluster `data` into k groups with Lloyd's algorithm.

    Args:
        data: (m, n) array of samples.
        k: number of clusters.
        method: distance code forwarded to distance() ('E'/'M'/'C').

    Returns:
        (clusters, Cpoints): length-m array of per-sample cluster indices
        and the (k, n) array of final cluster centers.
    """
    m, n = data.shape
    clusters = np.zeros(m)
    Cpoints = randomCent(data, k)
    Flag = True
    while Flag:
        Flag = False
        # Assignment step: attach every sample to its nearest center.
        for i in range(m):
            minDist = float("inf")
            minIndex = -1
            for j in range(k):
                # Distance from sample i to center j (Euclidean by default).
                dist = distance(Cpoints[j, :], data[i, :], method)
                if dist < minDist:
                    minDist = dist
                    minIndex = j
            # Any reassignment means we have not converged yet.
            if clusters[i] != minIndex:
                Flag = True
                clusters[i] = minIndex
        # Update step: move each center to the mean of its members.
        for i in range(k):
            ClusterPoints = data[clusters == i]
            # Guard against empty clusters: np.mean of an empty slice is
            # NaN and would poison the center forever; keep the previous
            # center instead (the original had no guard).
            if len(ClusterPoints) > 0:
                Cpoints[i, :] = np.mean(ClusterPoints, axis=0)
    return clusters, Cpoints
def showClusters(data, k, Cpoints, clusters):
    """Scatter-plot each sample colored by cluster and overlay the k centers.

    NOTE(review): only the first two columns of `data`/`Cpoints` are drawn,
    so the plot is meaningful for 2-D data.
    """
    m, n = data.shape
    # Marker/color codes for the sample points.
    mark = ['or', 'ob', 'og', 'ok', 'vr', 'vb', 'vg', 'vk']
    for i in range(m):
        # Cycle through the palette: the original indexed mark[markIndex]
        # directly and raised IndexError for more than 8 clusters.
        markIndex = int(clusters[i]) % len(mark)
        # Draw the sample point.
        plt.plot(data[i, 0], data[i, 1], mark[markIndex])
    # Distinct markers for the centers, to stand out from cluster points.
    mark = ['^b', '^g', '^k', '^r', '+b', '+g', '+k', '+r']
    for i in range(k):
        plt.plot(Cpoints[i, 0], Cpoints[i, 1], mark[i % len(mark)])
    plt.show()
def saveModel(obj, modelname):
    """Pickle `obj` to the file `modelname`.

    Uses a `with` block so the file is closed even if dump() raises
    (the original's open/close pair leaked the handle on error).
    Protocol 0 is kept for compatibility with the original file format.
    """
    with open(modelname, 'wb') as f:
        pickle.dump(obj, f, 0)
def loadModel(modelname):
    """Unpickle and return the object stored in `modelname`.

    Uses a `with` block so the file handle is closed even if load()
    raises (the original's open/close pair leaked the handle on error).
    """
    with open(modelname, 'rb') as f:
        Cpoints = pickle.load(f)
    return Cpoints
def fit(Cpoints, data, method='E'):
    """Assign every sample in `data` to its nearest center in `Cpoints`,
    plot the resulting clustering, and return the per-sample indices.

    Args:
        Cpoints: sequence of k cluster centers.
        data: (m, n) array of samples.
        method: distance code forwarded to tr.distance ('E'/'M'/'C').

    Returns:
        Length-m array of cluster indices, one per sample.
    """
    k = len(Cpoints)
    m, n = data.shape
    clusters = np.zeros(m)
    for sample_idx in range(m):
        # Track the closest center seen so far for this sample.
        best_dist = float("inf")
        best_center = -1
        for center_idx in range(k):
            d = tr.distance(Cpoints[center_idx], data[sample_idx], method)
            if d < best_dist:
                best_dist = d
                best_center = center_idx
        # Record the winning cluster for this sample.
        clusters[sample_idx] = best_center
    # Visualize the assignment.
    tr.showClusters(data, k, Cpoints, clusters)
    return clusters
def sse(Cpoints, data, clusters):
    """Sum over clusters of the mean distance from members to their center.

    NOTE(review): despite the name, this is not a squared error — it sums
    the *mean unsquared* distance per cluster, matching the original code.

    Args:
        Cpoints: sequence of k cluster centers.
        data: (m, n) array of samples.
        clusters: length-m array of per-sample cluster indices.

    Returns:
        Sum of (total distance / member count) across non-empty clusters.
        Empty clusters are skipped (the original divided 0/0 -> NaN,
        which poisoned the whole result).
    """
    m = data.shape[0]
    k = len(Cpoints)
    # Column 0: summed distance to the center; column 1: member count.
    sseAr = np.zeros((k, 2))
    for i in range(m):
        c = int(clusters[i])
        dist = tr.distance(Cpoints[c], data[i])
        # Accumulate distance and count for this sample's cluster.
        sseAr[c][0] += dist
        sseAr[c][1] += 1
    # Renamed the accumulator: the original local `sse` shadowed this
    # function's own name.
    total = 0
    for i in range(k):
        # Skip empty clusters to avoid 0/0 producing NaN.
        if sseAr[i][1] > 0:
            total += sseAr[i][0] / sseAr[i][1]
    return total
基于欧式距离分类
k = 3 | k = 4 | k = 5 |
---|---|---|
基于曼哈顿距离分类
k = 3 | k = 4 |
---|---|
基于余弦距离进行分类
k = 3 | k = 4 | k = 5 |
---|---|---|
基于欧式距离的分类评估
k = 3 | k = 4 | k = 5 |
---|---|---|
6.1702 | 7.1997 | 7.6863 |
基于曼哈顿距离的分类评估
k = 3 | k = 4 |
---|---|
6.1370 | 7.1116 |
基于余弦距离的分类评估
k = 3 | k = 4 | k = 5 |
---|---|---|
6.1005 | 7.4509 | 8.5941 |
在此数据集上,无论基于欧式距离、曼哈顿距离还是余弦距离,评估指标均在 k = 3 时取得最小值、表现最好,因此理论上最能解释该数据的 k 值应为 3。