python进行聚类(scikit-learn、scipy)

用于聚类的数据集
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio


'''
各种聚类数据
'''
#two_cluster
def two_cluster():
    """Load the two-cluster sample from ``cluster_data/two_cluster.mat``.

    Returns:
        The transpose of the ``'X'`` variable stored in the MAT-file.
    """
    mat_path = u'cluster_data/two_cluster.mat'
    contents = sio.loadmat(mat_path)
    return contents['X'].T
#three_cluster
def three_cluster():
    """Load the three-cluster sample from ``cluster_data/three_cluster.mat``.

    Returns:
        The transpose of the ``'X'`` variable stored in the MAT-file.
    """
    contents = sio.loadmat(u'cluster_data/three_cluster.mat')
    samples = contents['X'].T
    return samples
#five_cluster
def five_cluster():
    """Load the five-cluster sample from ``cluster_data/five_cluster.mat``.

    Returns:
        The ``'x'`` and ``'y'`` variables of the MAT-file stacked vertically
        (rows of ``x`` first, then rows of ``y``) and transposed, so the
        rows of ``x`` and ``y`` become the columns of the result.
    """
    contents = sio.loadmat(u'cluster_data/five_cluster.mat')
    # Per the original notes, 'x' holds 2 rows by n columns and 'y' holds
    # 1 row by n columns.
    coords = contents['x']
    labels = contents['y']
    stacked = np.vstack([coords, labels])
    return stacked.T
#spiral
def spiral():
    """Load a subsampled copy of ``cluster_data/spiral.mat``.

    Every third row of the ``'spiral'`` variable is kept (starting with the
    first row), and the columns are reordered so that original columns 1 and
    2 come first and original column 0 comes last.

    Returns:
        The subsampled array with its columns in the order 1, 2, 0.
    """
    table = sio.loadmat(u'cluster_data/spiral.mat')['spiral']
    sampled = table[::3]
    # Move original column 0 to the end; the plotting code in this file
    # uses column 2 of the result as the colour value.
    return sampled[:, [1, 2, 0]]
#spiral_unbalance
def spiral_unbalance():
    """Load a subsampled copy of ``cluster_data/spiral_unbalance.mat``.

    Keeps every third row of the ``'spiral_unbalance'`` variable (starting
    with the first row) and reorders the columns to 1, 2, 0.

    Returns:
        The subsampled array with original column 0 moved to the last
        position.
    """
    table = sio.loadmat(u'cluster_data/spiral_unbalance.mat')['spiral_unbalance']
    sampled = table[::3]
    # Original column 0 ends up as column 2 of the result.
    return sampled[:, [1, 2, 0]]
#ThreeCircles
def ThreeCircles():
    """Load a subsampled copy of ``cluster_data/ThreeCircles.mat``.

    Keeps every third row of the ``'ThreeCircles'`` variable (starting with
    the first row) and reorders the columns to 1, 2, 0.

    Returns:
        The subsampled array with original column 0 moved to the last
        position.
    """
    table = sio.loadmat(u'cluster_data/ThreeCircles.mat')['ThreeCircles']
    sampled = table[::3]
    # Original column 0 ends up as column 2 of the result.
    return sampled[:, [1, 2, 0]]
#Twomoons
def Twomoons():
    """Load a subsampled copy of ``cluster_data/Twomoons.mat``.

    Keeps every third row of the ``'Twomoons'`` variable (starting with the
    first row) and reorders the columns to 1, 2, 0.

    Unlike the previous version, this loader no longer calls
    ``plt.scatter`` itself. None of the sibling loaders plot, and
    ``show_all()`` draws every dataset in its own subplot, so the extra call
    only added a stray scatter to the current figure.

    Returns:
        The subsampled array with original column 0 moved to the last
        position.
    """
    path = u'cluster_data/Twomoons.mat'
    table = sio.loadmat(path)['Twomoons']
    sampled = table[0::3, :]  # keep every third row
    # Original column 0 ends up as column 2 of the result.
    return np.array([sampled[:, 1], sampled[:, 2], sampled[:, 0]]).T
#Twomoons1
def Twomoons1():
    """Load a subsampled copy of ``cluster_data/Twomoons.mat``.

    Keeps every third row of the ``'Twomoons'`` variable (starting with the
    first row) and reorders the columns to 1, 2, 0.

    NOTE(review): this reads the same file and variable as ``Twomoons()``;
    confirm whether a separate data file was intended.

    Returns:
        The subsampled array with original column 0 moved to the last
        position.
    """
    table = sio.loadmat(u'cluster_data/Twomoons.mat')['Twomoons']
    sampled = table[::3]
    # Original column 0 ends up as column 2 of the result.
    return sampled[:, [1, 2, 0]]
def test():
    """Print the literal text ``test`` to standard output."""
    # The parenthesised single-argument form produces the same output under
    # both Python 2 and Python 3.
    print('test')


def show_all():
    """Plot all eight sample datasets in a 2 x 4 grid of scatter plots.

    Each loader is called once. The loaded arrays are appended to the
    module-level ``data_list`` (which must already exist), and each one is
    drawn in its own subplot with columns 0 and 1 as coordinates and
    column 2 as the colour value.
    """
    plt.figure(figsize=(16,8))
    # Reference the loader functions directly rather than eval()-ing their
    # names; the names are still looked up when show_all() runs.
    loaders = [two_cluster, three_cluster, five_cluster, spiral,
               spiral_unbalance, ThreeCircles, Twomoons, Twomoons1]
    loaded = [loader() for loader in loaders]
    data_list.extend(loaded)
    # Plot the arrays loaded by this call, so a repeated call does not
    # silently reuse the first eight entries of data_list.
    for position, data in enumerate(loaded, 1):
        plt.subplot(2, 4, position)
        plt.scatter(data[:,0], data[:,1], c=data[:,2])
    
# Module-level list that show_all() appends every loaded dataset to; it has
# to be defined before show_all() is called.
data_list = []
show_all()

python进行聚类(scikit-learn、scipy)_第1张图片
 
  
使用scikit的kmeans进行聚类
%matplotlib inline
import scipy.io as sio
#matlab文件名  
two_cluster=u'cluster_data/two_cluster.mat'
data=sio.loadmat(two_cluster)
print data
%matplotlib inline
import matplotlib.pyplot as plt
# Take the 'X' variable from the loaded MAT-file: rows 0 and 1 are plotted
# as the coordinates and row 2 supplies the colour of each point.
x = data['X']
cValue = x[2]
plt.scatter(x[0],x[1],c=cValue)
from sklearn import cluster, datasets
# Transpose x so that each row is one sample, then keep only the first two
# columns (the plotted coordinates) as the clustering input.
b = np.array(x).T
b = b[:,0:2]

# Cluster into two groups with KMeans, using a fixed random_state.
y_pred = cluster.KMeans(n_clusters=2, random_state=170).fit_predict(b)

cValue = x[2]
# Same points as before, coloured by the predicted labels; cValue is not
# used by this call.
plt.scatter(x[0],x[1],c=y_pred)

数据集下载

scikit-learn教程

python进行聚类(scikit-learn、scipy)_第2张图片

%matplotlib inline
import scipy.io as sio
#matlab文件名  
two_cluster=u'cluster_data/spiral.mat'
spiral=sio.loadmat(two_cluster)['spiral']
spiral = spiral[0::3,:] #每隔3行取一个数据
print len(spiral),len(spiral[0])
cValue = spiral[:,0]
print cValue.shape
color = ['b','y']
cValue = [color[int(i)] for i in list(cValue)]
plt.scatter(spiral[:,1],spiral[:,2],c=cValue)

python进行聚类(scikit-learn、scipy)_第3张图片

使用kmeans结果

from sklearn import cluster, datasets

# Cluster the spiral points into two groups with KMeans, using columns 1
# and 2 (the plotted coordinates) as features and a fixed random_state.
y_pred = cluster.KMeans(n_clusters=2, random_state=170).fit_predict(spiral[:,1:3])

# Plot the same points coloured by the predicted labels.
plt.scatter(spiral[:,1],spiral[:,2],c=y_pred)

python进行聚类(scikit-learn、scipy)_第4张图片

使用scipy进行聚类效果

# -*- coding: utf8 -*-
%matplotlib inline
import scipy.io as sio
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as hcluster
from sklearn.cluster import AgglomerativeClustering
import numpy.random as random  
import numpy as np  
import numpy.core.fromnumeric  


def loadData():
    """Load the subsampled spiral dataset, plot it, and return it.

    Reads the ``'spiral'`` variable from ``cluster_data/spiral.mat``, keeps
    every third row, prints the resulting row/column counts and the shape of
    column 0, and draws a scatter plot of columns 1 and 2 coloured by
    column 0.

    Returns:
        The subsampled array. Earlier versions returned nothing, so the
        loaded data was lost to the caller; callers that ignore the return
        value are unaffected.
    """
    # Path of the MATLAB data file
    path = u'cluster_data/spiral.mat'
    spiral = sio.loadmat(path)['spiral']
    spiral = spiral[0::3, :]  # keep every third row
    # Formatted so the output is identical under Python 2 and Python 3.
    print('%d %d' % (len(spiral), len(spiral[0])))
    labels = spiral[:, 0]
    print(labels.shape)
    # Map each value of column 0 to a colour name by using it as an index.
    color = ['b', 'y']
    point_colors = [color[int(label)] for label in labels]
    plt.scatter(spiral[:, 1], spiral[:, 2], c=point_colors)
    return spiral


def spiralSample():
    """Show the spiral data next to two hierarchical clusterings of it.

    Reads the module-level ``spiral`` array (column 0 as the colour value,
    columns 1 and 2 as coordinates) and draws three side-by-side panels:
    the data coloured by column 0, the SciPy ``fclusterdata`` result, and
    the scikit-learn ``AgglomerativeClustering`` result. Finishes by calling
    ``plt.show()``.
    """
    xs = spiral[:,1]
    ys = spiral[:,2]
    features = spiral[:,1:3]

    # Panel 1: the data coloured by column 0.
    plt.subplot(131)
    plt.title(u'origal data')
    plt.scatter(xs, ys, c=spiral[:,0])

    # Panel 2: SciPy hierarchical clustering. With criterion='maxclust',
    # t is the maximum number of flat clusters to form (2 here), not a
    # distance threshold.
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster
    scipy_labels = hcluster.fclusterdata(features, criterion='maxclust', t=2)
    plt.subplot(132)
    plt.title(u'use scipy to hierarchy cluster')
    plt.scatter(xs, ys, c=scipy_labels)

    # Panel 3: scikit-learn agglomerative clustering with Ward linkage.
    plt.subplot(133)
    plt.title(u'use scikit to hierarchy cluster')
    sklearn_labels = AgglomerativeClustering(n_clusters=2, linkage='ward').fit_predict(features)
    plt.scatter(xs, ys, c=sklearn_labels)
    plt.show()
# Run the comparison. spiralSample() reads the global `spiral` array that
# was assigned earlier in this file.
spiralSample()

python进行聚类(scikit-learn、scipy)_第5张图片



你可能感兴趣的:(机器学习)