MDS(多维缩放,Multidimensional Scaling)是一种非常传统的降维方法:以距离为标准,将高维空间中的点投影到低维空间中,并使点与点之间的相对距离变化最小。较新的方法是t-SNE,它基于概率分布变化最小进行投影。
假定原始高维数据样本的距离矩阵为D,在低维下的距离矩阵为Z。我们可以用优化算法选取初始点,再用梯度下降法求最佳逼近,使得 $\|D-Z\|$ 最小;同时,也可以利用内积来求得低维映射。前者在样本较多时容易陷入局部最优;后者较稳定,但在样本不多时,效果比前者要差。
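算法如下(基于内积的经典MDS,与下文代码中的 cal_B 和 MDS 两个函数对应):
1. 计算 $m$ 个样本两两之间的距离矩阵 $D=(d_{ij})\in\mathbb{R}^{m\times m}$;
2. 对平方距离做双中心化,得到内积矩阵 $B$:$b_{ij}=-\frac{1}{2}\left(d_{ij}^{2}-d_{i\cdot}^{2}-d_{\cdot j}^{2}+d_{\cdot\cdot}^{2}\right)$,其中 $d_{i\cdot}^{2}=\frac{1}{m}\sum_{j}d_{ij}^{2}$,$d_{\cdot j}^{2}=\frac{1}{m}\sum_{i}d_{ij}^{2}$,$d_{\cdot\cdot}^{2}=\frac{1}{m^{2}}\sum_{i,j}d_{ij}^{2}$;
3. 对 $B$ 做特征值分解,取最大的 $n$ 个特征值构成对角矩阵 $\Lambda_{n}$,对应的特征向量构成矩阵 $V_{n}$;
4. 低维坐标为 $Z=V_{n}\Lambda_{n}^{1/2}$,每一行对应一个样本。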
代码如下。这里用了两种数据集:球形数据集和Iris数据集;算法分别用了上述基于内积求解的算法和sklearn中基于梯度下降求最优的算法,并进行了比较。总体来说,后者稳定,前者更快速。
import numpy
from sklearn import metrics,datasets,manifold
from scipy import optimize
from matplotlib import pyplot
import pandas
import collections
def generate_circle_data():
    """Generate a synthetic 3-D data set made of three noisy concentric shells.

    A 20 x 20 grid of spherical angles (400 directions) is reused for every
    shell; only the per-point radius changes. Since ``numpy.random.rand``
    draws from [0, 1), the radii fall in [0.5, 1.0) for class 0,
    [2.4, 3.0) for class 1 and [5.4, 6.5) for class 2.

    Returns:
        A tuple ``(points, labels)`` where ``points`` is a ``(1200, 3)`` float
        array of Cartesian coordinates and ``labels`` is a ``(1200,)`` integer
        array holding the shell index (0, 1 or 2) of each point.
    """
    per_shell = 400

    # Angular grid shared by all shells, flattened to 400 directions.
    azimuth, polar = numpy.meshgrid(
        numpy.linspace(0, 2 * numpy.pi, 20),
        numpy.linspace(0, numpy.pi, 20),
    )
    azimuth = azimuth.ravel()
    polar = polar.ravel()

    # (base radius, noise scale, noise offset) for each shell, inner to outer.
    shell_specs = ((1, 0.5, 0.5), (3, 0.6, 0.6), (6, 1.1, 0.6))

    points = numpy.zeros((per_shell * len(shell_specs), 3))
    labels = numpy.zeros((per_shell * len(shell_specs),))
    for shell, (base, scale, offset) in enumerate(shell_specs):
        # One random draw per shell, in inner-to-outer order.
        radius = (
            base * numpy.ones((per_shell,))
            + scale * numpy.random.rand(per_shell)
            - offset
        )
        rows = slice(shell * per_shell, (shell + 1) * per_shell)
        points[rows, 0] = radius * numpy.sin(azimuth) * numpy.sin(polar)
        points[rows, 1] = radius * numpy.cos(azimuth) * numpy.sin(polar)
        points[rows, 2] = radius * numpy.cos(polar)
        labels[rows] = shell

    return points, labels.astype('int')
def get_data():
    """Load the Iris data set bundled with scikit-learn.

    Returns:
        A tuple ``(features, labels)`` taken from the ``data`` and ``target``
        attributes of the object returned by ``datasets.load_iris()``.
    """
    iris = datasets.load_iris()
    features, labels = iris.data, iris.target
    return features, labels
def calculate_distance(x, y):
    """Return the Euclidean distance between *x* and *y*.

    Args:
        x: First point; must support ``x - y`` (e.g. a NumPy array or scalar).
        y: Second point, broadcast-compatible with ``x``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    delta = x - y
    squared_total = numpy.sum(delta ** 2)
    return numpy.sqrt(squared_total)
def calculate_distance_matrix(x, y):
    """Return the matrix of pairwise distances between rows of *x* and *y*.

    Thin wrapper around ``sklearn.metrics.pairwise_distances`` called with its
    default metric.

    Args:
        x: Samples, one per row.
        y: Samples, one per row.

    Returns:
        The array produced by ``metrics.pairwise_distances(x, y)``.
    """
    return metrics.pairwise_distances(x, y)
def cal_B(D):
    """Double-centre a distance matrix to obtain the inner-product matrix B.

    Computes ``B[i, j] = -(d_ij^2 - mean_i - mean_j + mean_all) / 2`` where
    ``d_ij^2`` is the squared distance, ``mean_i`` / ``mean_j`` are the row and
    column means of the squared distances and ``mean_all`` is their overall
    mean.

    The computation is vectorised (no Python-level loops, so it runs on both
    Python 2 and Python 3) and the input is converted to float so integer
    distance matrices are not subject to integer division.

    Args:
        D: Square ``(n, n)`` array-like of pairwise distances.

    Returns:
        An ``(n, n)`` float array B.

    Raises:
        ValueError: If ``D`` is not a square two-dimensional matrix.
    """
    D = numpy.asarray(D, dtype=float)
    if D.ndim != 2 or D.shape[0] != D.shape[1]:
        raise ValueError(
            'D must be a square 2-D distance matrix, got shape %r' % (D.shape,)
        )

    n = D.shape[0]
    if n == 0:
        # Nothing to centre; avoid a 0/0 when computing the means.
        return numpy.zeros((0, 0))

    squared = numpy.square(D)
    row_mean = numpy.sum(squared, axis=1) / n
    col_mean = numpy.sum(squared, axis=0) / n
    total_mean = numpy.sum(squared) / float(n * n)

    # Broadcast row means down the columns and column means across the rows;
    # the operand order mirrors the element-wise formula above.
    return (total_mean + squared - row_mean[:, None] - col_mean[None, :]) / (-2)
def MDS(data, n=2):
    """Embed *data* in *n* dimensions using the eigendecomposition of B.

    Steps: build the pairwise distance matrix of the samples, double-centre it
    with ``cal_B``, eigendecompose the result, keep the ``n`` largest
    eigenvalues with their eigenvectors, and scale each eigenvector by the
    square root of its eigenvalue.

    Args:
        data: Samples, one per row.
        n: Number of output dimensions (default 2).

    Returns:
        An array with one row per sample and up to ``n`` columns holding the
        low-dimensional coordinates, ordered by decreasing eigenvalue.
    """
    D = calculate_distance_matrix(data, data)
    B = cal_B(D)

    eigvals, eigvecs = numpy.linalg.eigh(B)

    # Indices of the n largest eigenvalues, largest first.
    top = numpy.argsort(-eigvals)[:n]
    top_vals = eigvals[top]
    top_vecs = eigvecs[:, top]

    # A selected eigenvalue can be negative (round-off, or n larger than the
    # number of positive eigenvalues); its square root would be NaN, so clip
    # at zero and let that axis collapse instead.
    scale = numpy.sqrt(numpy.maximum(top_vals, 0.0))

    # Scaling column k by scale[k] equals multiplying by diag(scale) without
    # building the dense diagonal matrix.
    return top_vecs * scale
def test_iris():
    """Plot the Iris data embedded in 2-D by ``MDS`` and by scikit-learn.

    The left panel ('CUSTOM') shows the output of this file's ``MDS``; the
    right panel ('SKLEARN') shows ``manifold.MDS(n_components=2)``. Each of the
    three target classes gets its own marker. Blocks in ``pyplot.show()``.
    """
    data, target = get_data()
    custom = MDS(data)
    pyplot.figure()

    # (class label, matplotlib format string) pairs, drawn in this order.
    class_styles = ((0, 'r*'), (1, 'bo'), (2, 'gx'))

    pyplot.subplot(1, 2, 1)
    for label, fmt in class_styles:
        mask = target == label
        pyplot.plot(custom[mask, 0], custom[mask, 1], fmt, markersize=20)
    pyplot.title('CUSTOM')

    pyplot.subplot(1, 2, 2)
    reference = manifold.MDS(n_components=2).fit_transform(data)
    for label, fmt in class_styles:
        mask = target == label
        pyplot.plot(reference[mask, 0], reference[mask, 1], fmt, markersize=20)
    pyplot.title('SKLEARN')

    pyplot.show()
def test_ball():
    """Plot the synthetic shell data embedded in 2-D by both MDS variants.

    Data comes from ``generate_circle_data``. The left panel ('CUSTOM') shows
    the output of this file's ``MDS``; the right panel ('SKLEARN') shows
    ``manifold.MDS(n_components=2)``. Each of the three target classes gets its
    own marker. Blocks in ``pyplot.show()``.
    """
    data, target = generate_circle_data()
    custom = MDS(data)
    pyplot.figure()

    # (class label, matplotlib format string) pairs, drawn in this order.
    class_styles = ((0, 'r*'), (1, 'bo'), (2, 'gx'))

    pyplot.subplot(1, 2, 1)
    for label, fmt in class_styles:
        mask = target == label
        pyplot.plot(custom[mask, 0], custom[mask, 1], fmt, markersize=10)
    pyplot.title('CUSTOM')

    pyplot.subplot(1, 2, 2)
    reference = manifold.MDS(n_components=2).fit_transform(data)
    for label, fmt in class_styles:
        mask = target == label
        pyplot.plot(reference[mask, 0], reference[mask, 1], fmt, markersize=10)
    pyplot.title('SKLEARN')

    pyplot.show()
if __name__ == '__main__':
    # Script entry point: show the Iris comparison. Call test_ball() here
    # instead to see the synthetic concentric-shell comparison.
    test_iris()