数据归一化的意义:
主要在于消除量纲(单位与数值范围)对结果的影响:同一段时间用“天”表示是100/200,用“年”表示则是0.27/0.55,数值差距悬殊,数值大的特征会在计算中占主导
数据归一化的方法:
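最值归一(normalization):x_scale = (x - x_min) / (x_max - x_min),把数据映射到0~1之间
均值方差归一(standardization):x_scale = (x - x_mean) / x_std,把数据变换为均值为0、标准差为1的分布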
import numpy as np
# Draw 100 random integers in [0, 100).
x = np.random.randint(0, 100, size=100)

# Min-max scaling of a vector: map every value into [0, 1] with
#     x_scaled = (x - x_min) / (x_max - x_min)
# The result is kept in its own name so it is not computed and thrown away.
x_min, x_max = np.min(x), np.max(x)
x_scaled = (x - x_min) / (x_max - x_min)

# Min-max scaling of a matrix: 50 samples x 2 features, values in [0, 100).
X = np.random.randint(0, 100, (50, 2))
# Convert to float first: writing fractions into an integer array would
# truncate them.
X = np.array(X, dtype=float)

# Scale every feature (column) independently. X[:, 0] is the first feature,
# X[:, 1] the second; looping over X.shape[1] handles any number of features.
# NOTE: a constant column (max == min) would divide by zero here.
for i in range(X.shape[1]):
    col_min = np.min(X[:, i])
    col_max = np.max(X[:, i])
    X[:, i] = (X[:, i] - col_min) / (col_max - col_min)
import matplotlib.pyplot as plt
# Quick scatter plot of the samples: first feature on the horizontal axis,
# second feature on the vertical axis.
first_feature, second_feature = X[:, 0], X[:, 1]
plt.scatter(first_feature, second_feature)
plt.show()
# Fresh 50 x 2 float matrix of random integers in [0, 100) for the
# mean/variance scaling (standardization) demo.
X2 = np.array(np.random.randint(0, 100, (50, 2)), dtype=float)

# Apply x_scaled = (x - mean) / std to every column.
for col in range(X2.shape[1]):
    mu = np.mean(X2[:, col])
    sigma = np.std(X2[:, col])
    X2[:, col] = (X2[:, col] - mu) / sigma

plt.scatter(X2[:, 0], X2[:, 1])
plt.show()

# Sanity check: after scaling, a column's mean should be ~0 and its
# standard deviation ~1.
np.mean(X2[:, 0])
np.std(X2[:, 1])
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
# Load the iris dataset and keep its data/target arrays as X and y.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples as the test set; the fixed random_state keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)
from sklearn.preprocessing import StandardScaler
# Scaling follows the same workflow as training a model: fit() learns the
# statistics from the training data, transform() applies them.
standardScaler = StandardScaler().fit(X_train)
standardScaler.mean_   # per-feature mean learned by fit()
standardScaler.scale_  # describes the spread of the data; replaces std_

# Transform both splits with the statistics learned from the training set.
X_train_standard, X_test_standard = (
    standardScaler.transform(split) for split in (X_train, X_test)
)
import numpy as np
class StandardScaler:
    """Standardize features column by column: ``(x - mean) / std``.

    The statistics are learned from a training matrix with ``fit`` and then
    applied with ``transform`` to any matrix that has the same number of
    columns.

    Attributes:
        mean_: 1-D array holding the mean of every column seen by ``fit``,
            or ``None`` before fitting.
        scale_: 1-D array holding the standard deviation of every column
            seen by ``fit`` (1.0 for constant columns), or ``None`` before
            fitting.
    """

    def __init__(self):
        self.mean_ = None
        self.scale_ = None

    def fit(self, X):
        """Learn the per-column mean and standard deviation of ``X``.

        Args:
            X: 2-D NumPy array of shape (n_samples, n_features).

        Returns:
            The scaler itself, so calls can be chained.

        Raises:
            AssertionError: if ``X`` is not two-dimensional.
        """
        assert X.ndim == 2, "The dimension of X must be 2"
        # axis=0 reduces over the rows, giving one statistic per column.
        self.mean_ = np.mean(X, axis=0)
        scale = np.std(X, axis=0)
        # A constant column has zero spread and dividing by it would produce
        # NaN/inf. Use 1.0 instead so such a column is only centered.
        scale[scale == 0.0] = 1.0
        self.scale_ = scale
        return self

    def transform(self, X):
        """Standardize ``X`` with the statistics learned by ``fit``.

        Args:
            X: 2-D NumPy array with the same number of columns as the
                matrix passed to ``fit``.

        Returns:
            A new float array of the same shape as ``X``; ``X`` itself is
            not modified.

        Raises:
            AssertionError: if ``X`` is not two-dimensional, if ``fit`` has
                not been called yet, or if the column count does not match.
        """
        assert X.ndim == 2, "The dimension of X must be 2"
        assert self.mean_ is not None and self.scale_ is not None, \
            "must fit before transform"
        assert X.shape[1] == len(self.mean_), \
            "the feature number of X must be equal to mean_ and std_"
        # Broadcasting applies each column's mean and scale to that column.
        return (np.asarray(X, dtype=float) - self.mean_) / self.scale_

    # Backward-compatible alias: the method was originally defined under
    # this misspelled name.
    tranform = transform
def kd_tree(points, depth=0):
    """Recursively build a k-d tree from ``points``.

    At every level the points are ordered along one dimension and the median
    point becomes the node; the points before it form the left subtree and
    the points after it the right subtree.

    Args:
        points: sequence of points, each indexable by dimension and all of
            the same length. The sequence is not modified.
        depth: depth of the node being built; it selects the splitting
            dimension. Defaults to 0 (the root).

    Returns:
        The root node of the (sub)tree, or ``None`` when ``points`` is empty.
    """
    # itemgetter(i) builds a key function that picks coordinate i of a point.
    from operator import itemgetter

    if len(points) == 0:
        return None
    # len(points[0]) is the dimensionality of the data; the modulo makes the
    # splitting dimension cycle through all dimensions as the tree deepens.
    cutting_dim = depth % len(points[0])
    # Sort a copy along the splitting dimension so the caller's sequence is
    # left untouched.
    ordered = sorted(points, key=itemgetter(cutting_dim))
    # The median along that dimension becomes the root of this subtree.
    median_index = len(ordered) // 2
    # NOTE(review): Node is not defined in this part of the file; it is
    # assumed to take the point and expose assignable left/right attributes.
    node = Node(ordered[median_index])
    # Build both subtrees one level deeper.
    node.left = kd_tree(ordered[:median_index], depth + 1)
    node.right = kd_tree(ordered[median_index + 1:], depth + 1)
    return node
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.patches import Circle
from sklearn.neighbors import KDTree
# Fix the seed so the 100 random 2-D points are the same on every run.
np.random.seed(0)
points = np.random.random(size=(100, 2))
tree = KDTree(points)
point = points[0]

# kNN query: the 3 nearest neighbours of `point`.
dists, indices = tree.query([point], k=3)
print(dists, indices)

# Radius query: neighbours of `point` within r=0.2.
indices = tree.query_radius([point], r=0.2)
print(indices)
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')

# Outline the search radius (0.2) around the query point.
search_area = Circle(point, 0.2, color='r', fill=False)
ax.add_patch(search_area)

# Plot every point, then draw the query point again in red on top.
X, Y = (list(axis) for axis in zip(*points))
plt.scatter(X, Y)
plt.scatter([point[0]], [point[1]], c='r')
plt.show()