Note: clustering, a topic in unsupervised learning, covers many algorithms and their underlying theory, e.g. k-means, k-means++, hierarchical clustering, density-based clustering, and spectral clustering. Here we use TensorFlow to implement k-means and KNN (the KNN part takes a semi-supervised approach), mainly in order to learn the TensorFlow API; for detailed treatments of the individual algorithms, please look elsewhere.
Different stopping conditions are possible; the implementation below stops once the cluster assignments no longer change.
# coding: utf-8
import tensorflow as tf
import numpy as np
import time
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.datasets import make_circles
def bucket_mean(data, bucket_ids, num_buckets):
    # Sum the rows of data per bucket (bucket_ids assigns each row to a bucket; the ids need not be sorted),
    # then divide by the per-bucket row counts to obtain each bucket's mean
    total = tf.unsorted_segment_sum(data, bucket_ids, num_buckets)
    count = tf.unsorted_segment_sum(tf.ones_like(data), bucket_ids, num_buckets)
    return total / count
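# For illustration: tf.unsorted_segment_sum sums rows that share a segment id, e.g. (hypothetical values)
#   tf.unsorted_segment_sum([[1., 1.], [2., 2.], [3., 3.]], [0, 1, 0], 2)
# evaluates to [[4., 4.], [2., 2.]]: rows 0 and 2 fall into segment 0, row 1 into segment 1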
if __name__ == '__main__':
    # Step 1: construct the data set
    DATA_TYPE = 'blobs'
    N = 200
    # First fix K, the number of clusters
    # Number of clusters; if we choose circles, 2 will be enough
if (DATA_TYPE == 'circle'):
K = 2
else:
K = 4
    # Start the timer
    start = time.time()
    # Maximum number of iterations
    MAX_ITERS = 1000
    # Centers used to generate the blob data
    centers = [(-2, -2), (-2, 1.5), (1.5, -2), (2, 1.5)]
if (DATA_TYPE == 'circle'):
        # make_circles creates two concentric rings, one large and one small, of 200 points in total;
        # noise is the Gaussian noise level and factor is the inner/outer circle scale ratio
data, features = make_circles(n_samples=200,shuffle=True,noise=0.01,factor=0.4)
else:
        # Note: a single cluster_std value makes all clusters share the same variance
data, features = make_blobs(n_samples=200,centers=centers,n_features=2,cluster_std=0.8,shuffle=False,random_state=42)
    # print(type(data), data.shape, '\n')
    # print(features)
    fig, ax = plt.subplots(figsize=(10, 8), facecolor='w')
if DATA_TYPE == 'blobs':
# plt.subplot(1,2,1)
ax.scatter(np.asarray(centers).T[0],np.asarray(centers).T[1],marker = 'o',s=250)
        # To give the four classes different colours, pass the class labels via c=features
# plt.subplot(1,2,2)
ax.scatter(data.T[0],data.T[1],marker='*',s=100,c=features,cmap=plt.cm.coolwarm)
# plt.show()
    # Step 2: initialise the centroids, compute sample-centroid distances, update the centroids, and iterate
    points = tf.Variable(data)  # points holds the data set
    # cluster_assignments stores the cluster each sample is assigned to, initialised to 0
    cluster_assignments = tf.Variable(tf.zeros([N], dtype=tf.int64))
    # centroids holds the centroid coordinates, initialised to any K points; here we take the first K rows of data
    centroids = tf.Variable(tf.slice(points.initialized_value(), [0, 0], [K, 2]))
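    # (A seeding scheme such as k-means++, mentioned in the introduction, would spread the initial
    # centroids apart and typically converge in fewer iterations; the first K rows suffice for this demo.)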
    # Open a TensorFlow session
sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(centroids)
    # Step 3: define the distance computation, the update step, and the stopping condition
    # print(centroids, '\n')
    # print(tf.tile(centroids, [2, 1]))
    # Tile the centroids N times and every sample K times, so both tensors get shape N*K*2,
    # i.e. N samples by K clusters by 2 coordinates;
    # we can then compute the per-dimension distance from every sample to every centroid
rep_centroids = tf.reshape(tf.tile(centroids,[N,1]),[N,K,2])
    # print(rep_centroids)
rep_points = tf.reshape(tf.tile(points,[1,K]),[N,K,2])
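    # Shape bookkeeping: centroids (K, 2) --tile--> (N*K, 2) --reshape--> (N, K, 2);
    # points (N, 2) --tile--> (N, 2*K) --reshape--> (N, K, 2), so the two tensors line up elementwise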
    # print(tf.square(rep_points - rep_centroids))
sum_squares = tf.reduce_sum(tf.square(rep_points - rep_centroids),
reduction_indices=2)
    # print(sum_squares)
    # Find the nearest centroid for each sample; argmin returns the index of the closest cluster
best_centroids = tf.argmin(sum_squares,1)
    # We stop once the centroids no longer change; the flag below records whether any assignment changed
did_assignments_change = tf.reduce_any(tf.not_equal(best_centroids,cluster_assignments))
    # bucket_mean computes the updated centroids
means = bucket_mean(points,best_centroids,K)
    # Next, use control_dependencies to gate the centroid update on the convergence check.
    # tf.control_dependencies(control_inputs) declares execution-order dependencies between ops:
    # it returns a context manager, and any op created inside the with-block
    # runs only after the ops in control_inputs have run
with tf.control_dependencies([did_assignments_change]):
        # assign writes the new value into a variable
do_updates = tf.group(centroids.assign(means),cluster_assignments.assign(best_centroids))
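    # Net effect: each sess.run(do_updates) evaluates did_assignments_change first and only then writes
    # the new centroids and assignments, so the flag always compares against the pre-update state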
changed = True
iters = 0
fig, ax = plt.subplots()
if DATA_TYPE == 'blobs':
colourindexes = [2,1,4,3]
else:
colourindexes = [2,1]
while changed and iters < MAX_ITERS:
fig, ax = plt.subplots()
iters += 1
        [changed, _] = sess.run([did_assignments_change, do_updates])
[centers,assignments] = sess.run([centroids,cluster_assignments])
ax.scatter(sess.run(points).T[0],sess.run(points).T[1],marker='o',s=200,c=assignments,cmap=plt.cm.coolwarm)
# ax.scatter(np.asarray(centers).T[0],np.asarray(centers).T[1], marker = '^', s = 550, c = colourindexes, cmap=plt.cm.plasma)
ax.scatter(centers[:,0],centers[:,1],marker='^',s=550,c=colourindexes,cmap=plt.cm.plasma)
        ax.set_title('Iteration ' + str(iters))
plt.savefig('./My_pratise/Kmean_pic/kmeans' + str(iters) + '.png')
ax.scatter(sess.run(points).T[0],sess.run(points).T[1],marker = 'o',s = 200,c=assignments,cmap=plt.cm.coolwarm)
# plt.show()
end = time.time()
    print('Found in %.2f seconds' % (end - start), iters, 'iterations')
    print('Centroids:')
    print(centers)
    print('Cluster assignments:', assignments)
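As noted at the top, other stopping conditions are equally valid. Below is a minimal sketch, stopping once the largest centroid movement between iterations falls below a small tolerance; the names tol and centroid_shift are illustrative assumptions, not part of the listing above:

# Sketch: stop when the largest centroid movement falls below tol
centroid_shift = tf.reduce_max(tf.reduce_sum(tf.square(means - centroids), 1))
with tf.control_dependencies([centroid_shift]):
    do_updates = tf.group(centroids.assign(means),
                          cluster_assignments.assign(best_centroids))
tol = 1e-6
iters = 0
while iters < MAX_ITERS:
    iters += 1
    shift, _ = sess.run([centroid_shift, do_updates])
    if shift < tol:
        break

The second listing applies KNN to a noisy circles data set, where k-means does poorly.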
# coding:utf-8
import tensorflow as tf
import numpy as np
import time
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_circles
from sklearn.metrics import accuracy_score
if __name__ == '__main__':
    # Number of sample points
    N = 210
    # Number of classes
    K = 2
    # Maximum number of iterations
    MAX_ITERS = 1000
# cut = int(N * 0.7)
start = time.time()
    # Generate the data and split it into train/test sets
    # With this fairly large noise value we get a non-linearly separable circles data set,
    # which k-means handles poorly, so we use KNN here instead
data, features = make_circles(n_samples=N,shuffle=True,noise=0.12,factor=.4)
data_train, data_test, features_train, features_test = train_test_split(data,features,test_size=0.3,random_state=1)
fig, ax = plt.subplots()
ax.scatter(data_train[:,0],data_train[:,1],marker='o',s=100, c=features_train,cmap=plt.cm.coolwarm)
plt.plot()
plt.grid(True)
# plt.show()
    # points holds the data set
    points = tf.Variable(data)
    # cluster_assignments stores the cluster each sample is assigned to, initialised to 0
    cluster_assignments = tf.Variable(tf.zeros([N], dtype=tf.int64))
    # Create the session
sess = tf.Session()
sess.run(tf.global_variables_initializer())
test = []
    for i, j in zip(data_test, features_test):
        distances = tf.reduce_sum(tf.square(tf.subtract(i, data_train)), reduction_indices=1)
        neighbor = tf.argmin(distances, 0)
        # Uncomment the prints below for a rough look at individual predictions
        # print('neighbor: ', features_train[sess.run(neighbor)])
        # print('Real_value: ', j)
        test.append(features_train[sess.run(neighbor)])
    print(test)
    # The KNN accuracy can be checked via the accuracy_score call at the end
fig, ax = plt.subplots()
ax.scatter(data_test[:,0],data_test[:,1],marker='o',s=100,c=test,cmap=plt.cm.coolwarm)
plt.plot()
# final
end = time.time()
print ("Found in %.2f seconds" % (end-start))
print 'clusster assignment:', test
print '准确率:',accuracy_score(features_test,test)
plt.show()
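The loop above is 1-NN and creates new graph ops for every test point. Below is a minimal sketch of a vectorised variant with k > 1; the placeholder names, k = 3, and the majority vote are assumptions for illustration, not part of the original listing:

# Sketch: k-NN with k = 3, graph built once and fed through placeholders
x = tf.placeholder(tf.float64, shape=[2])              # one test point
train = tf.placeholder(tf.float64, shape=[None, 2])    # all training points
dists = tf.reduce_sum(tf.square(train - x), axis=1)
_, knn_idx = tf.nn.top_k(-dists, k=3)                  # top_k of -dists gives the k smallest distances
preds = []
for p in data_test:
    idx = sess.run(knn_idx, feed_dict={x: p, train: data_train})
    preds.append(np.bincount(features_train[idx]).argmax())  # majority vote over the k neighbours
print('k=3 accuracy:', accuracy_score(features_test, preds))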