在设计算法或对算法进行改进时，不可避免地要对新算法进行测试，这时通常会用到 toy（玩具）类型的数据进行直观展示。最常用的数据类型是高斯型数据或直线型数据；同时要对数据点的顺序进行标注，以便观察样本点相互之间的关系。样本点之间的关系以近邻关系最为常用，它是很多算法取得优秀效果的保障。
import numpy as np
import matplotlib.pyplot as plt
def gaussian2d2c(n1, n2, mean1=(0, 0), mean2=(10, 10), cov1=None, cov2=None,
                 rng=None):
    """Draw a two-cluster Gaussian toy data set.

    With the default arguments this reproduces the original hard-coded
    setup: unit-covariance clusters centred at (0, 0) and (10, 10).

    Args:
        n1: Number of samples drawn for the first cluster.
        n2: Number of samples drawn for the second cluster.
        mean1: Mean of the first cluster; its length sets the dimension D.
        mean2: Mean of the second cluster (same length as ``mean1``).
        cov1: Covariance of the first cluster; identity when ``None``.
        cov2: Covariance of the second cluster; identity when ``None``.
        rng: Optional ``numpy.random.Generator`` or ``RandomState``. When
            ``None`` the global ``np.random`` state is used, so
            ``np.random.seed`` keeps working as before.

    Returns:
        Array of shape ``(D, n1 + n2)`` with one sample per column; the
        first ``n1`` columns belong to cluster 1, the rest to cluster 2.

    Raises:
        ValueError: If a covariance matrix is not positive semidefinite
            (``check_valid='raise'`` is passed to the sampler).
    """
    sampler = np.random if rng is None else rng
    if cov1 is None:
        cov1 = np.eye(len(mean1))
    if cov2 is None:
        cov2 = np.eye(len(mean2))

    # The size argument is the number of samples; each sample has as many
    # components as the mean vector.
    x1 = sampler.multivariate_normal(mean1, cov1, n1, check_valid='raise')
    print(x1.shape)
    x2 = sampler.multivariate_normal(mean2, cov2, n2, check_valid='raise')
    print(x2.shape)

    X = np.vstack((x1, x2))
    print(X.shape)
    # Transpose so that samples are stored column-wise (D x N).
    return X.T
# Demo: sample ten points per Gaussian cluster, scatter them, and write each
# point's column index next to its marker.
data = gaussian2d2c(10, 10)
epsilon = 0.01  # offset added to both label coordinates
plt.scatter(data[0], data[1])
for i, (px, py) in enumerate(data.T):
    plt.text(px + epsilon, py + epsilon, str(i))
plt.show()
import numpy as np
import matplotlib.pyplot as plt
def line2d2c(n, slope=2.0, intercepts=(1.0, 5.0), noise=0.1, rng=None):
    """Generate noisy samples from parallel straight lines in 2-D.

    With the default arguments this reproduces the original hard-coded
    setup: the lines ``y = 2 t + 1`` and ``y = 2 t + 5`` sampled at ``n``
    evenly spaced ``t`` in [0, 1], with Gaussian noise of scale 0.1 on y.

    Args:
        n: Number of samples per line.
        slope: Common slope of all lines.
        intercepts: One y-intercept per line; one block of ``n`` columns is
            produced for each entry, in order. Must not be empty.
        noise: Standard deviation of the Gaussian noise added to y.
        rng: Optional ``numpy.random.Generator`` or ``RandomState``. When
            ``None`` the global ``np.random`` state is used, so
            ``np.random.seed`` keeps working as before.

    Returns:
        Array of shape ``(2, n * len(intercepts))``; row 0 holds ``t`` and
        row 1 the noisy ``y`` values, one sample per column.
    """
    sampler = np.random if rng is None else rng
    t = np.linspace(0, 1, n)
    # Noise is drawn line by line, in intercept order, so the default call
    # consumes the random stream exactly like the original two-line version.
    lines = [
        np.vstack((t, slope * t + b + noise * sampler.standard_normal(n)))
        for b in intercepts
    ]
    return np.hstack(lines)
# Demo: sample ten points per line, scatter them, and write each point's
# column index next to its marker (uses the module-level `epsilon` offset).
data = line2d2c(10)
plt.scatter(data[0], data[1])
for i, (px, py) in enumerate(data.T):
    plt.text(px + epsilon, py + epsilon, str(i))
plt.show()
def kneightbors(X, k):
    """Return the indices of the k nearest neighbours of every sample.

    Args:
        X: Array-like of shape ``(D, N)`` with one sample per column.
        k: Number of neighbours to return per sample. Values larger than
            ``N - 1`` are truncated to ``N - 1``.

    Returns:
        Integer array of shape ``(min(k, N - 1), N)``. Column ``j`` lists the
        indices of the samples closest to sample ``j`` in squared Euclidean
        distance, nearest first; a sample is never listed as its own
        neighbour.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError('k must be non-negative, got {}'.format(k))
    # Work in floating point so the diagonal can be masked with inf below.
    X = np.asarray(X, dtype=float)
    D, N = X.shape
    print('LLE running on {} points in {} dimensions\n'.format(N,D))
    # Step 1: compute pairwise distances & find neighbours.
    print('-->Finding {} nearest neighbours.\n'.format(k))
    sq_norms = np.sum(X ** 2, axis=0)  # length-N vector of squared norms
    # |a - b|^2 = |a|^2 + |b|^2 - 2 a.b, built with broadcasting (N x N).
    distance = sq_norms[None, :] + sq_norms[:, None] - 2 * np.dot(X.T, X)
    # Exclude each point explicitly instead of assuming it sorts first:
    # with duplicate points or rounding error the self-distance is not
    # guaranteed to be the strict minimum of its column.
    np.fill_diagonal(distance, np.inf)
    # Stable sort makes tie-breaking deterministic (lower index first).
    index = np.argsort(distance, axis=0, kind='stable')
    n_neighbors = max(0, min(k, N - 1))
    neighborhood = index[:n_neighbors, :]  # k x N, self already excluded
    return neighborhood
# Demo: three nearest neighbours of every sample in `data`; print the
# neighbour indices for the first ten columns.
kfield = kneightbors(data, k=3)
print(kfield[:, 0:10])