聚类算法
- watermelon4.0.csv 西瓜数据集
- LVQ.py
- K-means.py
- GMM.py
- AGNES.py(初始化30个不同颜色的簇)
- AGNES.py
watermelon4.0.csv 西瓜数据集
1,0.697,0.460
2,0.774,0.376
3,0.634,0.264
4,0.608,0.318
5,0.556,0.215
6,0.403,0.237
7,0.481,0.149
7,0.666,0.091
8,0.437,0.211
9,0.666,0.091
10,0.243,0.267
11,0.245,0.057
12,0.343,0.099
13,0.639,0.161
14,0.657,0.198
15,0.360,0.370
16,0.593,0.042
17,0.719,0.103
18,0.359,0.188
19,0.339,0.241
20,0.282,0.257
21,0.748,0.232
22,0.714,0.346
23,0.483,0.312
24,0.478,0.437
25,0.525,0.369
26,0.751,0.489
27,0.532,0.472
28,0.473,0.376
29,0.725,0.445
30,0.446,0.459
LVQ.py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def _padded_limits(values, pad=0.05):
    """Return ``(low, high)`` spanning ``values`` with a fractional margin."""
    low, high = float(np.min(values)), float(np.max(values))
    # Fall back to a fixed margin when all values coincide.
    margin = (high - low) * pad or pad
    return (low - margin, high + margin)


def _nearest_prototype(x, prototypes):
    """Return the row index of the prototype closest (Euclidean) to ``x``."""
    return np.argmin(np.sqrt(np.sum(np.square(x - prototypes), axis=1)))


def LVQ(X1, y1, pNum, learningRate=0.1, xlim=None, ylim=None):
    """Train LVQ prototypes on labelled 2-D samples and plot the progress.

    A 3x3 grid of subplots is created. The first panel shows the samples
    coloured by ``y1`` together with the initial prototypes; the other eight
    show the nearest-prototype partition after 20, 50, 100, 200, 500, 1000,
    1500 and 2000 updates. The figure is not displayed here; the caller is
    expected to call ``plt.show()``.

    The function only uses its arguments: it does not read module-level
    ``X``, ``xlim`` or ``ylim``.

    Args:
        X1: Sample array; columns 0 and 1 are used as plot coordinates.
        y1: Class label per sample, indexable like ``X1``'s rows.
        pNum: Number of prototype vectors.
        learningRate: Step size of each prototype update.
        xlim: Optional ``(low, high)`` x-axis limits; derived from ``X1``
            with a small margin when omitted.
        ylim: Optional ``(low, high)`` y-axis limits; derived likewise.

    Returns:
        The trained prototype vectors, one per row.
    """
    n_samples = X1.shape[0]
    # Draw distinct samples as initial prototypes whenever enough exist, so two
    # prototypes never start on the same point.
    idx = np.random.choice(n_samples, pNum, replace=pNum > n_samples)
    # Fancy indexing copies, so prototype updates never modify X1; the float
    # cast keeps fractional updates from being truncated for integer input.
    p = X1[idx, :].astype(float)
    py = y1[idx]

    if xlim is None:
        xlim = _padded_limits(X1[:, 0])
    if ylim is None:
        ylim = _padded_limits(X1[:, 1])

    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    fig, ax = plt.subplots(3, 3, figsize=(12, 12), sharex='all', sharey='all')
    panels = ax.ravel()

    panels[0].scatter(X1[:, 0], X1[:, 1], c=y1)
    panels[0].scatter(p[:, 0], p[:, 1], marker='x', color='red', s=100)
    panels[0].set_title("初始化原型向量")
    panels[0].set_xlim(xlim)
    panels[0].set_ylim(ylim)

    snapshots = (20, 50, 100, 200, 500, 1000, 1500, 2000)
    j = 0
    for i in range(1, snapshots[-1] + 1):
        s = np.random.randint(n_samples)
        xi = X1[s]
        nearest = _nearest_prototype(xi, p)
        step = learningRate * (xi - p[nearest])
        # Pull the prototype towards a same-class sample, push it away otherwise.
        if y1[s] == py[nearest]:
            p[nearest] += step
        else:
            p[nearest] -= step

        if i in snapshots:
            j += 1
            clusters = [_nearest_prototype(x, p) for x in X1]
            panel = panels[j]
            panel.scatter(X1[:, 0], X1[:, 1], c=clusters)
            panel.scatter(p[:, 0], p[:, 1], marker='x', color='red', s=100)
            panel.set_title("迭代次数: %d" % i)
            panel.set_xlim(xlim)
            panel.set_ylim(ylim)
    return p
if __name__ == "__main__":
    data = pd.read_csv('watermelon4.0.csv', header=None)
    # Append a label column of zeros, then mark rows 9..21 (0-based) as class 1.
    data['y'] = np.zeros((len(data), 1), dtype=int)
    data.iloc[9:22, 3] = 1
    # Columns 1 and 2 are the two features, column 3 is the label just added.
    X = data.iloc[:, 1:3].values
    y = data.iloc[:, 3].values
    # Plot once so matplotlib's auto-scaled axis bounds can be reused as limits.
    plt.scatter(X[:, 0], X[:, 1], c=y)
    bounds = plt.axis()
    xlim = (bounds[0], bounds[1])
    ylim = (bounds[2], bounds[3])
    LVQ(X, y, 5)
    plt.show()
K-means.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Select the SimHei font for all figures; the titles and axis labels in this
# script are Chinese strings.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Disable the Unicode minus glyph for axis tick labels.
plt.rcParams['axes.unicode_minus'] = False
def distance(sample1, centers1):
    """Return the index of the row of ``centers1`` nearest to ``sample1``.

    Distances are Euclidean; on ties the lowest index wins (``np.argmin``).
    """
    offsets = sample1 - centers1
    euclidean = np.sqrt(np.square(offsets).sum(axis=1))
    return np.argmin(euclidean)
def clusters_show(clusters, center, step):
    """Plot the current partition and cluster centers in a new figure.

    Args:
        clusters: One list of 2-D points per cluster; empty clusters are
            skipped instead of raising.
        center: Array of centers, one per row (columns 0 and 1 are plotted).
        step: Iteration number shown in the title.
    """
    palette = ["g", "b", "y"]
    plt.figure(figsize=(8, 8))
    plt.title("迭代次数: {}".format(step))
    plt.xlabel("密度", loc="center")
    plt.ylabel("糖含量", loc="center")
    for i, cluster in enumerate(clusters):
        if len(cluster) == 0:
            # Nothing to draw; an empty array cannot be indexed by column.
            continue
        points = np.array(cluster)
        # Cycle through the palette so more than three clusters still plot.
        plt.scatter(points[:, 0], points[:, 1], c=palette[i % len(palette)], marker='.', s=150)
    # Centers are drawn once, last, so the markers stay visible on top.
    plt.scatter(center[:, 0], center[:, 1], marker='x', color='red', s=100)
def k_means(samples, k):
    """Cluster ``samples`` into ``k`` groups with Lloyd's k-means and plot each step.

    The initial centers are ``k`` distinct samples drawn with NumPy's global
    RNG. The initial state is drawn on the current figure and every iteration
    is drawn in a new figure through ``clusters_show``.

    Args:
        samples: Array of samples, one per row (columns 0 and 1 are plotted).
        k: Number of clusters; must not exceed the number of samples.

    Returns:
        Array of the final cluster centers, one per row.
    """
    data_number = len(samples)
    picks = np.random.choice(data_number, k, replace=False)
    # Float copy: averaged centers must not be truncated for integer input.
    center = samples[picks].astype(float)

    plt.title("初始化原型向量")
    plt.xlabel("密度", loc="center")
    plt.ylabel("糖含量", loc="center")
    plt.scatter(center[:, 0], center[:, 1], marker='x', color='red', s=100)
    plt.scatter(samples[:, 0], samples[:, 1], c='black')

    step = 0
    while True:
        clusters = [[] for _ in range(k)]
        for sample1 in samples:
            clusters[distance(sample1, center)].append(sample1)
        clusters_show(clusters, center, step)

        new_center = center.copy()
        for i, members in enumerate(clusters):
            # An empty cluster keeps its previous center instead of becoming NaN.
            if members:
                new_center[i] = np.mean(members, axis=0)
        step += 1

        # Converged only when every coordinate of every center stopped moving;
        # the check is re-evaluated from scratch on each iteration.
        converged = np.allclose(new_center, center)
        center = new_center
        if converged:
            break
    return center
def split_data(samples, centers1):
    """Group ``samples`` by nearest center.

    Returns a list with one sub-list per center; each sample is appended to
    the sub-list whose index ``distance`` reports as closest.
    """
    clusters = [[] for _ in range(len(centers1))]
    for point in samples:
        nearest = distance(point, centers1)
        clusters[nearest].append(point)
    return clusters
if __name__ == '__main__':
    # Seed NumPy's global RNG so the randomly drawn initial centers repeat.
    np.random.seed(5)
    data = pd.read_csv('watermelon4.0.csv', header=None)
    # Keep the two feature columns (1 and 2) as a plain array.
    sample = data.iloc[:, [1, 2]].to_numpy()
    centers = k_means(sample, 3)
    plt.show()
GMM.py
import numpy as np
from matplotlib import pyplot as plt
def createDataSet():
    """Return the built-in 30-sample, 2-feature data set as a (30, 2) array."""
    feature_0 = [
        0.697, 0.774, 0.634, 0.608, 0.556, 0.403, 0.481, 0.437, 0.666, 0.243,
        0.245, 0.343, 0.639, 0.657, 0.360, 0.593, 0.719, 0.359, 0.339, 0.282,
        0.748, 0.714, 0.483, 0.478, 0.525, 0.751, 0.532, 0.473, 0.725, 0.446,
    ]
    feature_1 = [
        0.460, 0.376, 0.264, 0.318, 0.215, 0.237, 0.149, 0.211, 0.091, 0.267,
        0.057, 0.099, 0.161, 0.198, 0.370, 0.042, 0.103, 0.188, 0.241, 0.257,
        0.232, 0.346, 0.312, 0.437, 0.369, 0.489, 0.472, 0.376, 0.445, 0.459,
    ]
    # Pair the two columns row by row.
    return np.column_stack((feature_0, feature_1))
def multiGaussian(x, n_clusters, miu, sigma):
    """Evaluate the multivariate normal density N(x | miu, sigma).

    Args:
        x: 1-D sample vector.
        n_clusters: Unused; kept so existing callers keep working. The
            normalising constant depends on the dimension of ``x``, not on
            the number of mixture components.
        miu: Mean vector with the same length as ``x``.
        sigma: Square covariance matrix matching that length.

    Returns:
        The density value at ``x``.
    """
    diff = np.asarray(x, dtype=float) - np.asarray(miu, dtype=float)
    n_features = diff.shape[-1]
    # (2*pi)^(d/2) * |sigma|^(1/2), with d the feature dimension.
    left = 1 / (pow(2 * np.pi, n_features / 2) * pow(np.linalg.det(sigma), 0.5))
    right = np.exp((-0.5) * diff.dot(np.linalg.pinv(sigma)).dot(diff))
    return left * right
def computeGamma(X, miu, sigma, alpha, multiGaussian):
    """Compute the posterior responsibility of every component for every sample.

    Args:
        X: Sample array, one sample per row.
        miu: Per-component means, indexable by component.
        sigma: Per-component covariances, indexable by component.
        alpha: Mixing coefficients; its length sets the number of components.
        multiGaussian: Density callable invoked as
            ``multiGaussian(sample, n_clusters, miu[j], sigma[j])``.

    Returns:
        Array of shape (n_samples, n_clusters); each row is the weighted
        densities divided by their sum.
    """
    n_samples = X.shape[0]
    n_clusters = len(alpha)
    gamma = np.zeros((n_samples, n_clusters))
    weighted = np.zeros(n_clusters)
    for row, sample in enumerate(X):
        for j in range(n_clusters):
            weighted[j] = alpha[j] * multiGaussian(sample, n_clusters, miu[j], sigma[j])
        # Normalise so the responsibilities of one sample sum to one.
        gamma[row, :] = weighted / np.sum(weighted)
    return gamma
class GMM():
    """Gaussian mixture model fitted with the EM algorithm.

    Relies on the module-level ``computeGamma`` and ``multiGaussian``.
    """

    def __init__(self, n_clusters, iter=50):
        """Store the number of components and the number of EM iterations."""
        self.n_clusters = n_clusters
        self.iter = iter
        # Scalar placeholders until fit() stores the learned parameters.
        self.miu = 0
        self.sigma = 0
        self.alpha = 0

    def _initial_means(self, data):
        """Return one starting mean per component."""
        n_samples, n_features = data.shape
        if self.n_clusters == 3 and n_features == 2:
            # Original fixed starting points, kept for the 3-component 2-D case.
            return np.array([[.403, .237],
                             [.714, .346],
                             [.532, .472]])
        # Otherwise start from samples spread evenly over the data's row order.
        picks = np.linspace(0, n_samples - 1, self.n_clusters).astype(int)
        return data[picks].astype(float)

    def fit(self, data):
        """Estimate mixing weights, means and covariances from ``data``.

        Args:
            data: 2-D array, one sample per row.
        """
        n_samples, n_features = data.shape
        alpha = np.ones(self.n_clusters) / self.n_clusters
        miu = self._initial_means(data)
        # Every component starts from the same diagonal covariance 0.1 * I.
        sigma = np.full((self.n_clusters, n_features, n_features), np.diag(np.full(n_features, 0.1)))
        for _ in range(self.iter):
            # E-step: responsibilities under the current parameters.
            gamma = computeGamma(data, miu, sigma, alpha, multiGaussian)
            weight = np.sum(gamma, axis=0)
            # M-step: re-estimate the parameters from the responsibilities.
            alpha = weight / n_samples
            for j in range(self.n_clusters):
                resp = gamma[:, j].reshape((n_samples, 1))
                miu[j] = np.sum(data * resp, axis=0) / weight[j]
                centered = data - miu[j]
                # Responsibility-weighted sum of outer products, in one product.
                sigma[j] = (centered * resp).T.dot(centered) / weight[j]
        self.miu = miu
        self.sigma = sigma
        self.alpha = alpha

    def predict(self, data):
        """Return, for each row of ``data``, the index of the most responsible component.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if np.isscalar(self.miu):
            raise RuntimeError("GMM.predict() called before fit()")
        pred = computeGamma(data, self.miu, self.sigma, self.alpha, multiGaussian)
        cluster_results = np.argmax(pred, axis=1)
        return cluster_results
if __name__ == '__main__':
    data = createDataSet()
    model = GMM(3, iter=100)
    model.fit(data)
    result = model.predict(data)
    # Samples coloured by predicted component, fitted means as red crosses.
    plt.scatter(*data.T, c=result)
    plt.scatter(*model.miu.T, marker='x', color='red')
    plt.show()
AGNES.py(初始化30个不同颜色的簇)
import colorsys
import random
import pandas as pd
import pylab as pl
def get_n_hls_colors(num):
    """Return ``num`` HLS colours with evenly spaced hues.

    Each colour is ``[h, l, s]`` with components scaled to 0..1: the hue is
    ``index / num``, lightness is random in [0.5, 0.6) and saturation is
    random in [0.9, 1.0).

    Args:
        num: Integer number of colours; values below 1 give an empty list.

    Returns:
        List of exactly ``num`` ``[h, l, s]`` lists.
    """
    hls_colors = []
    # Index-based loop: accumulating a float step can overshoot by one colour.
    for index in range(num):
        s = 90 + random.random() * 10
        l = 50 + random.random() * 10
        hls_colors.append([index / num, l / 100.0, s / 100.0])
    return hls_colors
def ncolors(num):
    """Return the colours from ``get_n_hls_colors(num)`` as ``[r, g, b]`` ints.

    Each channel is the HLS-to-RGB result multiplied by 255 and truncated.
    An empty list is returned when ``num`` is below 1.
    """
    if num < 1:
        return []
    rgb_colors = []
    for hue, lightness, saturation in get_n_hls_colors(num):
        channels = colorsys.hls_to_rgb(hue, lightness, saturation)
        rgb_colors.append([int(channel * 255.0) for channel in channels])
    return rgb_colors
def color(value):
    """Convert between an RGB tuple and an upper-case ``#RRGGBB`` string.

    A tuple of ints becomes a ``#``-prefixed hex string (two digits per
    element); a string is read as ``#RRGGBB`` and becomes a 3-tuple of ints.
    Any other type yields ``None``.
    """
    digit = "0123456789ABCDEF"
    if isinstance(value, tuple):
        return '#' + ''.join(digit[part // 16] + digit[part % 16] for part in value)
    if isinstance(value, str):
        # Hex pairs start at offsets 1, 3 and 5, right after the leading '#'.
        return tuple(
            digit.index(value[pos]) * 16 + digit.index(value[pos + 1])
            for pos in (1, 3, 5)
        )
    return None
def dist(a, b):
    """Return the Euclidean distance between the first two coordinates of ``a`` and ``b``."""
    dx = a[0] - b[0]
    dy = a[1] - b[1]
    return math.sqrt(math.pow(dx, 2) + math.pow(dy, 2))
def dist_min(Ci, Cj):
    """Single linkage: the smallest ``dist`` over all point pairs of two clusters."""
    pairwise = [dist(p, q) for p in Ci for q in Cj]
    return min(pairwise)
def dist_max(Ci, Cj):
    """Complete linkage: the largest ``dist`` over all point pairs of two clusters."""
    pairwise = [dist(p, q) for p in Ci for q in Cj]
    return max(pairwise)
def dist_avg(Ci, Cj):
    """Average linkage: the mean ``dist`` over all point pairs of two clusters."""
    total = 0
    for p in Ci:
        for q in Cj:
            total += dist(p, q)
    return total / (len(Ci) * len(Cj))
def find_Min(M):
    """Locate the smallest off-diagonal entry of a distance matrix.

    Args:
        M: List of rows; ``M[i][j]`` is the distance between clusters i and j.

    Returns:
        Tuple ``(i, j, value)`` of the first smallest entry with ``i != j``
        in row-major order, or ``(0, 0, inf)`` when there is none.
    """
    # Start from infinity so arbitrarily large distances are still found.
    best = float('inf')
    x = 0
    y = 0
    for i, row in enumerate(M):
        for j, value in enumerate(row):
            if i != j and value < best:
                best, x, y = value, i, j
    return (x, y, best)
def AGNES(dataset, dist, k):
    """Agglomerative (bottom-up) clustering down to ``k`` clusters.

    Every point starts as its own cluster; the two closest clusters, as
    reported by ``find_Min`` on the pairwise distance matrix, are merged
    until ``k`` clusters remain.

    Args:
        dataset: Iterable of points.
        dist: Cluster-to-cluster distance callable taking two lists of points
            (for example ``dist_min``, ``dist_max`` or ``dist_avg``).
        k: Target number of clusters; values below 1 are treated as 1.

    Returns:
        List of clusters, each a list of points. ``dataset`` is not modified.
    """
    C = [[point] for point in dataset]
    # A single cluster cannot be merged any further.
    target = max(k, 1)
    M = [[dist(ci, cj) for cj in C] for ci in C]
    while len(C) > target:
        x, y, _ = find_Min(M)
        C[x].extend(C[y])
        # Delete by position; list.remove() would delete the first *equal* cluster.
        del C[y]
        M = [[dist(ci, cj) for cj in C] for ci in C]
    return C
def c11():
    """Return 15 * 17 * 25 colour-like triples built from random samples.

    Three lists of distinct ints from 1..254 are sampled (sizes 15, 25, 17)
    and combined into ``(k, j, i)`` tuples, with the 25-element list varying
    fastest and the 15-element list slowest.
    """
    import random
    L1 = random.sample(range(1, 255), 15)
    L2 = random.sample(range(1, 255), 25)
    L3 = random.sample(range(1, 255), 17)
    return [(k, j, i) for i in L1 for j in L3 for k in L2]
def color1(value):
    """Convert an RGB tuple to ``#RRGGBB`` or a ``#RRGGBB`` string to a tuple.

    Tuples are rendered with upper-case hex digits, two per element; strings
    are parsed as three hex pairs after the leading character. Other input
    types fall through and return ``None``.
    """
    digit = [str(n) for n in range(10)] + list("ABCDEF")
    if isinstance(value, tuple):
        string = '#'
        for component in value:
            high, low = divmod(component, 16)
            string += digit[high] + digit[low]
        return string
    elif isinstance(value, str):
        channels = []
        for start in (1, 3, 5):
            channels.append(digit.index(value[start]) * 16 + digit.index(value[start + 1]))
        return tuple(channels)
def draw(C, c2):
    """Scatter-plot every cluster in its own colour and show the figure.

    Args:
        C: List of clusters, each a list of points whose first two
            coordinates are plotted.
        c2: Sequence of colours, one per cluster. Colours are reused
            cyclically when there are fewer colours than clusters; a built-in
            palette is used when ``c2`` is empty.
    """
    colValue = ['r', 'y', 'g', 'b', 'c', 'k', 'm']
    palette = list(c2) if len(c2) > 0 else colValue

    # Figure-wide settings and the duplicate check do not depend on the
    # cluster, so they run once instead of once per cluster.
    pl.rcParams['font.sans-serif'] = ['SimHei']
    pl.rcParams['axes.unicode_minus'] = False
    if len(set(c2)) == len(c2):
        print('列表里的元素互不重复!')
    else:
        print('列表里有重复的元素!')

    for i, cluster in enumerate(C):
        coo_X = [point[0] for point in cluster]
        coo_Y = [point[1] for point in cluster]
        pl.scatter(coo_X, coo_Y, marker='x', color=palette[i % len(palette)], label=i)
    pl.title("迭代次数:")
    pl.show()
def ColourDistance(rgb_1, rgb_2):
    """Return a weighted RGB distance between two colours.

    The red and blue squared differences are weighted by the mean of the two
    red channels; the green squared difference has a fixed weight of 4.
    """
    (red_a, green_a, blue_a), (red_b, green_b, blue_b) = rgb_1, rgb_2
    red_mean = (red_a + red_b) / 2
    d_red, d_green, d_blue = red_a - red_b, green_a - green_b, blue_a - blue_b
    red_term = (2 + red_mean / 256) * (d_red ** 2)
    green_term = 4 * (d_green ** 2)
    blue_term = (2 + (255 - red_mean) / 256) * (d_blue ** 2)
    return math.sqrt(red_term + green_term + blue_term)
import math
def colorSimilarity(rgb1, rgb2):
    """Return the Euclidean distance between two RGB colours, channels scaled by 1/256."""
    r1, g1, b1 = rgb1
    r2, g2, b2 = rgb2
    scaled = [(r1 - r2) / 256, (g1 - g2) / 256, (b1 - b2) / 256]
    return math.sqrt(sum(delta * delta for delta in scaled))
import time
def createRGB():
    """Return a random colour as a list of three ints, each in 0..255.

    Values come from NumPy's global RNG as it currently stands. The RNG is
    deliberately not reseeded here: seeding from the wall clock at one-second
    resolution on every call made all calls within the same second return the
    same colour and overwrote any seed set by the caller.
    """
    return [int(channel) for channel in np.random.randint(0, 256, size=3)]
def rgbOctToHex(rgbOct):
    """Render a sequence of ints as a ``#``-prefixed lower-case hex string.

    Each value contributes its ``hex()`` digits, left-padded to two characters.
    """
    pieces = ["#"]
    for channel in rgbOct:
        pieces.append(hex(channel)[2:].zfill(2))
    return "".join(pieces)
def createColors(num):
    """Return ``num`` hex colour strings that are mutually well separated.

    Random colours from ``createRGB`` are accepted only when their
    ``ColourDistance`` to every colour kept so far is at least 80. Progress
    is printed to stdout.
    """
    palette = []
    while len(palette) < num:
        candidate = createRGB()
        if not palette:
            # The first colour has nothing to be compared against.
            palette.append(candidate)
            continue
        for existing in palette:
            diff = ColourDistance(candidate, existing)
            if diff < 80:
                break
        else:
            # No kept colour was too close: accept the candidate.
            print("diff=", diff)
            palette.append(candidate)
    print("d=", palette)
    return [str(rgbOctToHex(rgb)) for rgb in palette]
import numpy as np
if __name__ == '__main__':
    data = pd.read_csv('watermelon4.0.csv', header=None)
    # Columns 1 and 2 are the two features of each sample.
    sample = data.iloc[:, 1:3].values
    dataset = list(map(tuple, sample))
    # Asking for 30 clusters on this data leaves almost every point on its own.
    C = AGNES(dataset, dist_min, 30)
    c2 = c11()
    d = createColors(30)
    print(d)
    draw(C, d)
AGNES.py
import math
import numpy as np
import pandas as pd
import pylab as pl
def dist(a, b):
    """Return the Euclidean distance between the first two coordinates of ``a`` and ``b``."""
    delta_x = a[0] - b[0]
    delta_y = a[1] - b[1]
    squared = math.pow(delta_x, 2) + math.pow(delta_y, 2)
    return math.sqrt(squared)
def dist_min(Ci, Cj):
    """Single linkage: the smallest ``dist`` over all point pairs of two clusters."""
    pair_distances = [dist(p, q) for p in Ci for q in Cj]
    return min(pair_distances)
def dist_max(Ci, Cj):
    """Complete linkage: the largest ``dist`` over all point pairs of two clusters."""
    pair_distances = [dist(p, q) for p in Ci for q in Cj]
    return max(pair_distances)
def dist_avg(Ci, Cj):
    """Average linkage: the mean ``dist`` over all point pairs of two clusters."""
    running = 0
    for p in Ci:
        for q in Cj:
            running += dist(p, q)
    pair_count = len(Ci) * len(Cj)
    return running / pair_count
def find_Min(M):
    """Locate the smallest off-diagonal entry of a distance matrix.

    Args:
        M: List of rows; ``M[i][j]`` is the distance between clusters i and j.

    Returns:
        Tuple ``(i, j, value)`` of the first smallest entry with ``i != j``
        in row-major order, or ``(0, 0, inf)`` when there is none.
    """
    # Start from infinity so arbitrarily large distances are still found.
    smallest = float('inf')
    x = 0
    y = 0
    for i, row in enumerate(M):
        for j, value in enumerate(row):
            if i != j and value < smallest:
                smallest, x, y = value, i, j
    return (x, y, smallest)
def AGNES(dataset, dist, k):
    """Agglomerative (bottom-up) clustering down to ``k`` clusters.

    Every point starts as its own cluster; the two closest clusters, as
    reported by ``find_Min`` on the pairwise distance matrix, are merged
    until ``k`` clusters remain.

    Args:
        dataset: Iterable of points.
        dist: Cluster-to-cluster distance callable taking two lists of points
            (for example ``dist_min``, ``dist_max`` or ``dist_avg``).
        k: Target number of clusters; values below 1 are treated as 1.

    Returns:
        List of clusters, each a list of points. ``dataset`` is not modified.
    """
    C = [[point] for point in dataset]
    # A single cluster cannot be merged any further.
    target = max(k, 1)
    M = [[dist(ci, cj) for cj in C] for ci in C]
    while len(C) > target:
        x, y, _ = find_Min(M)
        C[x].extend(C[y])
        # Delete by position; list.remove() would delete the first *equal* cluster.
        del C[y]
        M = [[dist(ci, cj) for cj in C] for ci in C]
    return C
def draw(C):
    """Scatter-plot every cluster in its own colour and show the figure.

    Args:
        C: List of clusters, each a list of points whose first two
            coordinates are plotted. Colours repeat cyclically when there are
            more clusters than palette entries.
    """
    colValue = ['r', 'y', 'g', 'b', 'c', 'k', 'm', 'peru']
    # Figure-wide settings do not depend on the cluster, so set them once.
    pl.rcParams['font.sans-serif'] = ['SimHei']
    pl.rcParams['axes.unicode_minus'] = False
    for i, cluster in enumerate(C):
        coo_X = [point[0] for point in cluster]
        coo_Y = [point[1] for point in cluster]
        pl.scatter(coo_X, coo_Y, marker='x', color=colValue[i % len(colValue)], label=i)
    # Report the actual number of clusters rather than a fixed "k=5".
    pl.title("fig4:聚类簇数k=%d" % len(C))
    pl.show()
if __name__ == '__main__':
    np.random.seed(5)
    data = pd.read_csv('watermelon4.0.csv', header=None)
    # Columns 1 and 2 are the two features of each sample.
    sample = data.iloc[:, 1:3].values
    dataset = list(map(tuple, sample))
    # Single-linkage clustering down to five clusters.
    C = AGNES(dataset, dist_min, 5)
    draw(C)