MDS算法

MDS(多维缩放)是一种非常传统的降维方法:以距离为标准,将高维坐标中的点投影到低维坐标中,并使各点之间的相对距离变化最小。较新的方法是t-SNE,它基于概率分布变化最小进行投影。

假定原始高维数据样本的距离矩阵为D,低维下的距离矩阵为Z。我们可以用优化算法选取初始点,用梯度下降法求最佳逼近,使得||D-Z||最小;也可以利用内积来求得低维映射。前者在样本较多时容易陷入局部最优;后者较稳定,但在样本不多时,效果比前者要差。

算法如下:


MDS算法_第1张图片

MDS算法_第2张图片MDS算法_第3张图片



代码如下。这里用了两种数据集:球形数据集和Iris数据集;算法分别用了上述基于内积求解的算法和sklearn中基于梯度下降求最优的算法,并进行了比较。总体来说,后者稳定,前者更快速。

import numpy
from sklearn import metrics,datasets,manifold
from scipy import optimize
from matplotlib import pyplot
import pandas
import collections

def generate_circle_data():
    """Generate three noisy, concentric spherical shells of 3-D points.

    Each shell holds 400 points laid out on a 20 x 20 grid of
    (azimuth, polar) angles. The radius of every point is a base value
    plus uniform noise drawn from ``numpy.random.rand``, so the output
    depends on NumPy's global random state.

    Returns:
        A ``(points, labels)`` tuple: ``points`` is a ``(1200, 3)`` float
        array and ``labels`` is a ``(1200,)`` integer array holding 0, 1
        or 2 for the inner, middle and outer shell respectively.
    """
    per_shell = 400
    # (base radius, noise scale, noise offset); radius = base + scale * U - offset
    # with U uniform on [0, 1).
    shell_specs = ((1, 0.5, 0.5), (3, 0.6, 0.6), (6, 1.1, 0.6))

    # 20 x 20 angular grid flattened into 400 directions.
    azimuth, polar = numpy.meshgrid(
        numpy.linspace(0, 2 * numpy.pi, 20),
        numpy.linspace(0, numpy.pi, 20),
    )
    azimuth = azimuth.ravel()
    polar = polar.ravel()
    sin_az, cos_az = numpy.sin(azimuth), numpy.cos(azimuth)
    sin_pol, cos_pol = numpy.sin(polar), numpy.cos(polar)

    points = numpy.zeros((per_shell * len(shell_specs), 3))
    for index, (base, scale, offset) in enumerate(shell_specs):
        radius = (base * numpy.ones((per_shell,))
                  + scale * numpy.random.rand(per_shell) - offset)
        rows = slice(index * per_shell, (index + 1) * per_shell)
        # Spherical -> Cartesian conversion.
        points[rows, 0] = radius * sin_az * sin_pol
        points[rows, 1] = radius * cos_az * sin_pol
        points[rows, 2] = radius * cos_pol

    labels = numpy.repeat([0, 1, 2], per_shell).astype('int')
    return points, labels


def get_data():
    """Load the Iris dataset bundled with scikit-learn.

    Returns:
        A ``(data, target)`` tuple taken from the ``data`` and ``target``
        attributes of the object returned by ``datasets.load_iris()``.
    """
    iris = datasets.load_iris()
    features, labels = iris.data, iris.target
    return features, labels

def calculate_distance(x,y):
    """Return the Euclidean distance between ``x`` and ``y``.

    The squared element-wise differences are summed over all elements
    before the square root is taken, so the result is a single scalar.
    """
    delta = x - y
    squared_sum = numpy.sum(delta ** 2)
    return numpy.sqrt(squared_sum)

def calculate_distance_matrix(x,y):
    """Return the pairwise distance matrix between samples of ``x`` and ``y``.

    Thin wrapper around ``sklearn.metrics.pairwise_distances`` called
    with its default metric.
    """
    return metrics.pairwise_distances(x, y)

def cal_B(D):
    """Double-centre a distance matrix into the inner-product matrix B.

    Implements the classical MDS identity

        B[i, j] = -1/2 * (D[i, j]**2 - mean_i - mean_j + mean_all)

    where ``mean_i`` / ``mean_j`` are the row / column means of the
    squared distances and ``mean_all`` is their overall mean.

    Args:
        D: square ``(n, n)`` array-like of pairwise distances.

    Returns:
        ``(n, n)`` float ndarray holding the inner-product matrix.

    Raises:
        ValueError: if ``D`` is not a square 2-D matrix.
    """
    # Cast to float so integer input is never floor-divided (Python 2).
    D = numpy.asarray(D, dtype=float)
    if D.ndim != 2 or D.shape[0] != D.shape[1]:
        raise ValueError(
            'D must be a square distance matrix, got shape %s' % (D.shape,))

    n = D.shape[0]
    DD = numpy.square(D)
    row_mean = numpy.sum(DD, axis=1) / n
    col_mean = numpy.sum(DD, axis=0) / n
    total_mean = numpy.sum(DD) / (n ** 2)

    # Broadcasting replaces the former xrange double loop, which was slow
    # and unavailable on Python 3.
    return (total_mean + DD
            - row_mean[:, numpy.newaxis]
            - col_mean[numpy.newaxis, :]) / (-2)
    

def MDS(data,n=2):
    """Embed ``data`` into ``n`` dimensions with classical (eigen) MDS.

    Steps: build the pairwise distance matrix, double-centre it into the
    inner-product matrix B, eigendecompose B, and scale the eigenvectors
    of the ``n`` largest eigenvalues by the square roots of those
    eigenvalues.

    Args:
        data: sample matrix passed to ``calculate_distance_matrix``
            (one sample per row).
        n: number of output dimensions.

    Returns:
        ndarray with one row per sample and one column per kept
        eigenvalue (at most ``n`` columns).
    """
    D = calculate_distance_matrix(data, data)
    B = cal_B(D)
    eigenvalues, eigenvectors = numpy.linalg.eigh(B)

    # Indices of the n largest eigenvalues, in descending order.
    top = numpy.argsort(-eigenvalues)[0:n]

    # Round-off can make a theoretically-zero eigenvalue slightly negative
    # (e.g. when n exceeds the rank of the data); clamp at zero so the
    # square root never yields NaN coordinates.
    top_values = numpy.maximum(eigenvalues[top], 0.0)

    # Scaling each eigenvector column by sqrt(eigenvalue) is equivalent to
    # (sqrt(diag(values)) . V^T)^T without building the diagonal matrix.
    return eigenvectors[:, top] * numpy.sqrt(top_values)


def test_iris():
    """Plot the Iris data embedded by the custom MDS and by sklearn's MDS.

    Left panel ('CUSTOM') shows the eigen-decomposition embedding from
    ``MDS``; right panel ('SKLEARN') shows ``manifold.MDS``. Classes
    0 / 1 / 2 are drawn as red stars, blue circles and green crosses.
    """
    class_styles = ((0, 'r*'), (1, 'bo'), (2, 'gx'))
    data, target = get_data()

    custom_embedding = MDS(data)
    pyplot.figure()

    pyplot.subplot(1, 2, 1)
    for label, fmt in class_styles:
        members = target == label
        pyplot.plot(custom_embedding[members, 0], custom_embedding[members, 1],
                    fmt, markersize=20)
    pyplot.title('CUSTOM')

    pyplot.subplot(1, 2, 2)
    sklearn_embedding = manifold.MDS(n_components=2).fit_transform(data)
    for label, fmt in class_styles:
        members = target == label
        pyplot.plot(sklearn_embedding[members, 0], sklearn_embedding[members, 1],
                    fmt, markersize=20)
    pyplot.title('SKLEARN')

    pyplot.show()

def test_ball():
    """Plot the spherical-shell data embedded by the custom and sklearn MDS.

    Left panel ('CUSTOM') shows the eigen-decomposition embedding from
    ``MDS``; right panel ('SKLEARN') shows ``manifold.MDS``. Shells
    0 / 1 / 2 are drawn as red stars, blue circles and green crosses.
    """
    class_styles = ((0, 'r*'), (1, 'bo'), (2, 'gx'))
    data, target = generate_circle_data()

    custom_embedding = MDS(data)
    pyplot.figure()

    pyplot.subplot(1, 2, 1)
    for label, fmt in class_styles:
        members = target == label
        pyplot.plot(custom_embedding[members, 0], custom_embedding[members, 1],
                    fmt, markersize=10)
    pyplot.title('CUSTOM')

    pyplot.subplot(1, 2, 2)
    sklearn_embedding = manifold.MDS(n_components=2).fit_transform(data)
    for label, fmt in class_styles:
        members = target == label
        pyplot.plot(sklearn_embedding[members, 0], sklearn_embedding[members, 1],
                    fmt, markersize=10)
    pyplot.title('SKLEARN')

    pyplot.show()

if __name__ == '__main__':
    # Script entry point: run the Iris comparison. To run the
    # spherical-shell comparison instead, call test_ball() here.
    test_iris()




MDS算法_第4张图片MDS算法_第5张图片





你可能感兴趣的:(机器学习,Python,算法,python,MDS,降维)