Week 3

Why normalize data:

    Features on different scales carry unequal influence, e.g. a duration expressed as 100 days / 200 days versus 0.27 years / 0.55 years: the same information, but the raw magnitudes differ hugely, so distance-based methods like kNN are dominated by the larger-scaled feature.

Normalization methods:

    Min-max normalization

    Mean-variance normalization (standardization); both formulas are written out below
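For reference, the two formulas the code below implements (x_min, x_max, x_mean, x_std are per-feature statistics):

    x_scaled = (x - x_min) / (x_max - x_min)   # min-max: maps into [0, 1]

    x_scaled = (x - x_mean) / x_std            # mean-variance: mean 0, std 1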

import numpy as np

# create 100 random integers in [0, 100)

x = np.random.randint(0,100,size=100)

# min-max normalization (vector)

# min-max formula: maps values into [0, 1]

(x - np.min(x)) / (np.max(x) - np.min(x))

# min-max normalization (matrix)

# a 50x2 matrix of integers in [0, 100)

X = np.random.randint(0,100,(50,2))

# convert the matrix to float so the normalized values can be stored

X = np.array(X, dtype=float)

# apply the min-max formula to each dimension (column by column)

# X[:,0] is the first column, i.e. the first feature

X[:,0] = (X[:,0] - np.min(X[:,0])) / (np.max(X[:,0]) - np.min(X[:,0]))

# X[:,1] is the second column, i.e. the second feature

X[:,1] = (X[:,1] - np.min(X[:,1])) / (np.max(X[:,1]) - np.min(X[:,1]))

# with n features, normalize every column in a loop:

for i in range(X.shape[1]):

    X[:,i] = (X[:,i] - np.min(X[:,i])) / (np.max(X[:,i]) - np.min(X[:,i]))

import matplotlib.pyplot as plt

# a quick scatter plot of the samples to inspect both axes

plt.scatter(X[:,0],X[:,1])

plt.show()
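sklearn also provides a ready-made scaler for this min-max mapping. A minimal sketch, assuming sklearn.preprocessing.MinMaxScaler (not used in the original notes), applied to a fresh matrix since X above is already normalized:

from sklearn.preprocessing import MinMaxScaler

X_raw = np.random.randint(0, 100, (50, 2)).astype(float)

minMaxScaler = MinMaxScaler()  # maps each column into [0, 1] by default

X_scaled = minMaxScaler.fit_transform(X_raw)  # same mapping as the manual loop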

# a fresh 50x2 float matrix for demonstrating mean-variance normalization

X2 = np.array(np.random.randint(0,100,(50,2)),dtype=float)

# apply the formula: mean-variance normalization column by column

for i in range(X2.shape[1]):

    X2[:,i] = (X2[:,i] - np.mean(X2[:,i])) / np.std(X2[:,i])

plt.scatter(X2[:,0],X2[:,1])

plt.show()


np.mean(X2[:,0])  # close to 0 after standardization

np.std(X2[:,1])  # close to 1 after standardization


import numpy as np

from sklearn import datasets

from sklearn.model_selection import train_test_split

iris = datasets.load_iris()

X = iris.data

y = iris.target

X_train,X_test,y_train,y_test = train_test_split(iris.data,iris.target,test_size=0.2,random_state=666)

from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()

# fitting the scaler follows the same fit/transform pattern as training a model

standardScaler.fit(X_train)

standardScaler.mean_  # per-feature means learned from the training set

standardScaler.scale_  # describes the spread of the data; replaces the deprecated std_ attribute
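A quick sanity check of what fit learned; this assumes StandardScaler uses the population standard deviation (ddof=0), which is also np.std's default:

np.allclose(standardScaler.mean_, np.mean(X_train, axis=0))  # True

np.allclose(standardScaler.scale_, np.std(X_train, axis=0))  # True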

# use transform to scale both sets with the training statistics

X_train_standard = standardScaler.transform(X_train)

X_test_standard = standardScaler.transform(X_test)
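To see why this matters, a minimal sketch feeding the standardized data into kNN; the classifier and k=3 are illustrative choices, not from the original notes:

from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=3)

knn_clf.fit(X_train_standard, y_train)

# the test set must be transformed with the training statistics, as above

knn_clf.score(X_test_standard, y_test)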

import numpy as np

class StandardScaler:

    def __init__(self):

        self.mean_ = None

        self.scale_ = None

    def fit(self, X):

        """根据训练数据集X获得数据的均值和方差"""

        assert X.ndim == 2, "The dimension of X must be 2"

        # compute the per-column mean and standard deviation

        self.mean_ = np.array([np.mean(X[:, i]) for i in range(X.shape[1])])

        self.scale_ = np.array([np.std(X[:, i]) for i in range(X.shape[1])])

        return self

    def transform(self, X):

        """standardize X using the fitted mean and scale"""

        assert X.ndim == 2, "The dimension of X must be 2"

        assert self.mean_ is not None and self.scale_ is not None, \

        "must fit before transform"

        assert X.shape[1] == len(self.mean_), \

        "the feature number of X must be equal to mean_ and std_"

        # allocate an empty float matrix of the same shape as X

        resX = np.empty(shape=X.shape, dtype=float)

        # standardize every column (dimension)

        for col in range(X.shape[1]):

            resX[:,col] = (X[:,col] - self.mean_[col]) / self.scale_[col]

        return resX
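A quick check of the hand-rolled class (note it shadows the sklearn StandardScaler imported earlier; the variable names below are illustrative):

my_scaler = StandardScaler()

my_scaler.fit(X_train)

X_train_my = my_scaler.transform(X_train)

# each column should now have mean ~0 and std ~1, matching sklearn's result

np.mean(X_train_my, axis=0), np.std(X_train_my, axis=0)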

from operator import itemgetter

# a minimal node type, assumed here because the original notes never define Node

class Node:

    def __init__(self, point, left=None, right=None):

        self.point, self.left, self.right = point, left, right

# points is the list of sample points; depth determines which dimension to cut on

def kd_tree(points, depth):   

    if 0 == len(points):

        return None

    # pick the cutting dimension; len(points[0]) is the data's actual dimensionality, so the modulo cycles through all dimensions

    cutting_dim = depth % len(points[0])

    # index of the median point along the cutting dimension

    medium_index = len(points) // 2

    # sort all points along the cutting dimension; itemgetter(i) pulls out the i-th coordinate of each point

    points.sort(key=itemgetter(cutting_dim)) 

    # the median point along this dimension becomes the root of this subtree

    node = Node(points[medium_index])

    # recursively build the left subtree with depth + 1

    node.left = kd_tree(points[:medium_index], depth + 1)   

    # recursively build the right subtree with depth + 1

    node.right = kd_tree(points[medium_index + 1:], depth + 1)   

    return node
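A small usage sketch of kd_tree; the six 2-D sample points are made up for illustration:

points = [(2, 3), (5, 4), (9, 6), (4, 7), (8, 1), (7, 2)]

root = kd_tree(points, 0)

print(root.point)  # (7, 2), the median along dimension 0

print(root.left.point, root.right.point)  # (5, 4) (9, 6)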

import numpy as np

from matplotlib import pyplot as plt

from matplotlib.patches import Circle

from sklearn.neighbors import KDTree

np.random.seed(0)

points = np.random.random((100, 2))

tree = KDTree(points)

point = points[0]

# kNN query: the 3 nearest neighbors of point (the query point itself is included, since it comes from the dataset)

dists, indices = tree.query([point], k=3)

print(dists, indices)

# radius query: indices of all points within r = 0.2 of point

indices = tree.query_radius([point], r=0.2)

print(indices)

fig = plt.figure()

ax = fig.add_subplot(111, aspect='equal')

ax.add_patch(Circle(point, 0.2, color='r', fill=False))

X, Y = [p[0] for p in points], [p[1] for p in points]

plt.scatter(X, Y)

plt.scatter([point[0]], [point[1]], c='r')

plt.show()



