用于聚类的数据集
%matplotlib inline
import scipy.io as sio
import matplotlib.pyplot as plt
'''
各种聚类数据
'''
#two_cluster
def two_cluster():
    """Load the two-cluster sample set from ``cluster_data/two_cluster.mat``.

    The ``'X'`` variable stored in the file is transposed before it is
    returned, so each row of the stored matrix becomes a column of the result.

    Returns:
        The transposed ``'X'`` array.
    """
    path = u'cluster_data/two_cluster.mat'
    return sio.loadmat(path)['X'].T
#three_cluster
def three_cluster():
    """Load the three-cluster sample set from ``cluster_data/three_cluster.mat``.

    The ``'X'`` variable stored in the file is transposed before it is
    returned, so each row of the stored matrix becomes a column of the result.

    Returns:
        The transposed ``'X'`` array.
    """
    path = u'cluster_data/three_cluster.mat'
    return sio.loadmat(path)['X'].T
#five_cluster
def five_cluster():
    """Load the five-cluster sample set from ``cluster_data/five_cluster.mat``.

    The file's ``'x'`` and ``'y'`` variables are stacked vertically (``'x'``
    on top) and the stack is transposed, so every column of the stored
    matrices becomes one row of the result.

    Returns:
        The transposed vertical stack of ``'x'`` and ``'y'``.
    """
    # Imported locally: the notebook cell that defines this loader imports
    # only scipy.io and matplotlib, so a module-level ``np`` is not guaranteed.
    import numpy as np

    contents = sio.loadmat(u'cluster_data/five_cluster.mat')
    x = contents['x']  # per the original author's note: two rows, n columns
    y = contents['y']  # per the original author's note: one row, n columns
    # Stack vertically first, then transpose.
    return np.vstack((x, y)).T
#spiral
def spiral():
    """Load a thinned copy of the spiral data set.

    Reads the ``'spiral'`` variable from ``cluster_data/spiral.mat``, keeps
    every third row (rows 0, 3, 6, ...) and reorders the columns so that the
    original columns 1 and 2 come first and the original column 0 comes last.

    Returns:
        The thinned array with its columns in the order (1, 2, 0).
    """
    samples = sio.loadmat(u'cluster_data/spiral.mat')['spiral']
    thinned = samples[0::3, :]  # keep one row out of every three
    # Column indexing performs the reordering without needing numpy in scope
    # (the defining notebook cell never imports it).
    return thinned[:, [1, 2, 0]]
#spiral_unbalance
def spiral_unbalance():
    """Load a thinned copy of the unbalanced spiral data set.

    Reads the ``'spiral_unbalance'`` variable from
    ``cluster_data/spiral_unbalance.mat``, keeps every third row (rows 0, 3,
    6, ...) and reorders the columns so that the original columns 1 and 2 come
    first and the original column 0 comes last.

    Returns:
        The thinned array with its columns in the order (1, 2, 0).
    """
    samples = sio.loadmat(u'cluster_data/spiral_unbalance.mat')['spiral_unbalance']
    thinned = samples[0::3, :]  # keep one row out of every three
    # Column indexing performs the reordering without needing numpy in scope
    # (the defining notebook cell never imports it).
    return thinned[:, [1, 2, 0]]
#ThreeCircles
def ThreeCircles():
    """Load a thinned copy of the three-circles data set.

    Reads the ``'ThreeCircles'`` variable from
    ``cluster_data/ThreeCircles.mat``, keeps every third row (rows 0, 3, 6,
    ...) and reorders the columns so that the original columns 1 and 2 come
    first and the original column 0 comes last.

    Returns:
        The thinned array with its columns in the order (1, 2, 0).
    """
    samples = sio.loadmat(u'cluster_data/ThreeCircles.mat')['ThreeCircles']
    thinned = samples[0::3, :]  # keep one row out of every three
    # Column indexing performs the reordering without needing numpy in scope
    # (the defining notebook cell never imports it).
    return thinned[:, [1, 2, 0]]
#Twomoons
def Twomoons():
    """Load a thinned copy of the two-moons data set and scatter-plot it.

    Reads the ``'Twomoons'`` variable from ``cluster_data/Twomoons.mat``, keeps
    every third row (rows 0, 3, 6, ...) and reorders the columns so that the
    original columns 1 and 2 come first and the original column 0 comes last.

    Returns:
        The thinned array with its columns in the order (1, 2, 0).
    """
    samples = sio.loadmat(u'cluster_data/Twomoons.mat')['Twomoons']
    thinned = samples[0::3, :]  # keep one row out of every three
    # Column indexing performs the reordering without needing numpy in scope
    # (the defining notebook cell never imports it).
    data = thinned[:, [1, 2, 0]]
    # NOTE(review): unlike the sibling loaders, this one also draws the data
    # on the current matplotlib axes. Kept because callers may rely on it, but
    # it looks like a debugging leftover -- confirm before removing.
    plt.scatter(data[:, 0], data[:, 1], c=data[:, 2])
    return data
#Twomoons1
def Twomoons1():
    """Load a thinned copy of the two-moons data set without plotting it.

    Reads the ``'Twomoons'`` variable from ``cluster_data/Twomoons.mat``, keeps
    every third row (rows 0, 3, 6, ...) and reorders the columns so that the
    original columns 1 and 2 come first and the original column 0 comes last.

    NOTE(review): this reads exactly the same file and variable as
    ``Twomoons()``; confirm whether a different data file was intended.

    Returns:
        The thinned array with its columns in the order (1, 2, 0).
    """
    samples = sio.loadmat(u'cluster_data/Twomoons.mat')['Twomoons']
    thinned = samples[0::3, :]  # keep one row out of every three
    # Column indexing performs the reordering without needing numpy in scope
    # (the defining notebook cell never imports it).
    return thinned[:, [1, 2, 0]]
def test():
    """Print the literal string ``test``; takes no arguments, returns None."""
    # Parenthesised single argument: prints identically on Python 2 and 3.
    print('test')
def show_all():
    """Plot all eight sample data sets in one 2x4 grid of scatter plots.

    A 16x8 figure is created, every loader is called once in a fixed order,
    and each returned array is drawn in its own subplot with column 0 on the
    x axis, column 1 on the y axis and column 2 as the point colour.

    Side effects:
        Draws on a new matplotlib figure and appends every loaded array to the
        module-level ``data_list``.
    """
    # Direct function references replace the original eval() on name strings.
    loaders = [two_cluster, three_cluster, five_cluster, spiral,
               spiral_unbalance, ThreeCircles, Twomoons, Twomoons1]

    plt.figure(figsize=(16, 8))
    loaded = [load() for load in loaders]
    # Backward compatibility: the notebook-level accumulator still receives
    # every data set loaded by this call.
    data_list.extend(loaded)

    # Plot what was loaded in *this* call, so repeated calls do not depend on
    # how many entries data_list already holds.
    for position, data in enumerate(loaded, 1):
        plt.subplot(2, 4, position)
        plt.scatter(data[:, 0], data[:, 1], c=data[:, 2])
# Module-level accumulator that show_all() adds each loaded data set to.
data_list = []
# Draw the overview figure of all sample data sets.
show_all()
使用scikit的kmeans进行聚类
%matplotlib inline
import scipy.io as sio
# Path of the MATLAB file to load. It is deliberately not called
# ``two_cluster``: that name belongs to the two_cluster() loader defined
# earlier in this notebook, and rebinding it to a string breaks show_all().
mat_path = u'cluster_data/two_cluster.mat'
data = sio.loadmat(mat_path)
# Parenthesised single argument: prints identically on Python 2 and 3.
print(data)
%matplotlib inline
import matplotlib.pyplot as plt
# Pull the 'X' variable out of the loaded .mat contents.
x = data['X']
# Row 2 of x is used as the per-point colour value.
cValue = x[2]
# Scatter row 0 against row 1, coloured by row 2.
plt.scatter(x[0],x[1],c=cValue)
from sklearn import cluster, datasets
# Feature matrix for k-means: rows 0 and 1 of x, transposed so that every
# sample is one row with two columns. x is already an array returned by
# loadmat, so it is sliced directly; numpy itself is not imported until a
# later cell of this notebook.
b = x[0:2].T
# Two clusters; random_state is pinned to a fixed seed.
y_pred = cluster.KMeans(n_clusters=2, random_state=170).fit_predict(b)
# Kept so the notebook-level cValue still holds row 2 of x after this cell.
cValue = x[2]
# Same scatter as before, now coloured by the predicted cluster index.
plt.scatter(x[0], x[1], c=y_pred)
scikit-learn教程
%matplotlib inline
import scipy.io as sio
# Path of the MATLAB file to load. It is deliberately not called
# ``two_cluster``: that name belongs to the two_cluster() loader defined
# earlier in this notebook.
spiral_path = u'cluster_data/spiral.mat'
# NOTE(review): this rebinds the notebook-level name ``spiral``, which also
# names the spiral() loader defined earlier. Later cells read this array
# through that name, so it is kept here.
spiral = sio.loadmat(spiral_path)['spiral']
spiral = spiral[0::3, :]  # keep one row out of every three
# Row count and column count; formatted so the output is the same on
# Python 2 and Python 3.
print('%d %d' % (len(spiral), len(spiral[0])))
cValue = spiral[:, 0]
print(cValue.shape)
color = ['b', 'y']
# Assumes every value in column 0 is a valid index into ``color`` -- TODO
# confirm against the data file.
cValue = [color[int(i)] for i in cValue]
plt.scatter(spiral[:, 1], spiral[:, 2], c=cValue)
使用kmeans结果
from sklearn import cluster, datasets
# Run k-means with two clusters on columns 1 and 2 of the spiral array;
# random_state is pinned to a fixed seed.
y_pred = cluster.KMeans(n_clusters=2, random_state=170).fit_predict(spiral[:,1:3])
# Plot the same two columns, coloured by the predicted cluster index.
plt.scatter(spiral[:,1],spiral[:,2],c=y_pred)
使用scipy进行聚类效果
# -*- coding: utf8 -*-
%matplotlib inline
import scipy.io as sio
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as hcluster
from sklearn.cluster import AgglomerativeClustering
import numpy.random as random
import numpy as np
import numpy.core.fromnumeric
def loadData():
    """Load the thinned spiral data set, report its size and scatter-plot it.

    Reads the ``'spiral'`` variable from ``cluster_data/spiral.mat``, keeps
    every third row, prints the row and column counts followed by the shape of
    column 0, and plots column 1 against column 2 with a colour chosen from
    column 0.

    The loaded array is local to this function: nothing is returned and no
    module-level name is assigned.
    """
    mat_path = u'cluster_data/spiral.mat'
    samples = sio.loadmat(mat_path)['spiral'][0::3, :]  # every third row
    # Formatted so the output is the same on Python 2 and Python 3.
    print('%d %d' % (len(samples), len(samples[0])))
    labels = samples[:, 0]
    print(labels.shape)
    palette = ['b', 'y']
    # Assumes every value in column 0 is a valid index into ``palette`` --
    # TODO confirm against the data file.
    point_colors = [palette[int(label)] for label in labels]
    plt.scatter(samples[:, 1], samples[:, 2], c=point_colors)
def spiralSample():
    """Compare two hierarchical clusterings of the spiral data side by side.

    Reads the module-level ``spiral`` array and draws three subplots in one
    row: the points coloured by column 0, by SciPy's ``fclusterdata`` result,
    and by scikit-learn's ``AgglomerativeClustering`` result. Columns 1 and 2
    are the plotted coordinates and the clustering input. The figure is shown
    before returning; nothing is returned.
    """
    points = spiral[:, 1:3]
    xs = spiral[:, 1]
    ys = spiral[:, 2]

    # Panel 1: the data coloured by the values stored in column 0.
    plt.subplot(131)
    plt.title(u'origal data')
    plt.scatter(xs, ys, c=spiral[:, 0])

    # Panel 2: SciPy. With criterion='maxclust', t is the maximum number of
    # flat clusters to form (two here), not a distance threshold.
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster
    y_pred = hcluster.fclusterdata(points, criterion='maxclust', t=2)
    plt.subplot(132)
    plt.title(u'use scipy to hierarchy cluster')
    plt.scatter(xs, ys, c=y_pred)

    # Panel 3: scikit-learn agglomerative clustering with Ward linkage.
    plt.subplot(133)
    plt.title(u'use scikit to hierarchy cluster')
    y_pred = AgglomerativeClustering(n_clusters=2, linkage='ward').fit_predict(points)
    plt.scatter(xs, ys, c=y_pred)

    plt.show()
# Draw the three-panel comparison; relies on the notebook-level ``spiral``
# array assigned in an earlier cell.
spiralSample()