Week 3

Why normalize data:

    Features on different scales carry unequal influence, e.g. a duration expressed as 100 days / 200 days versus 0.27 years / 0.55 years: the same information, but the raw magnitudes differ hugely, so distance-based methods like kNN are dominated by the larger-scaled feature.

Normalization methods:

    Min-max normalization

    Mean-variance normalization (standardization); both formulas are written out below
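For reference, the two formulas the code below implements (x_min, x_max, x_mean, x_std are per-feature statistics):

    x_scaled = (x - x_min) / (x_max - x_min)   # min-max: maps into [0, 1]

    x_scaled = (x - x_mean) / x_std            # mean-variance: mean 0, std 1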

import numpy as np

# create 100 random integers in [0, 100)

x = np.random.randint(0,100,size=100)

# min-max normalization (vector)

# min-max formula: maps values into [0, 1]

(x - np.min(x)) / (np.max(x) - np.min(x))

# min-max normalization (matrix)

# a 50x2 matrix of integers in [0, 100)

X = np.random.randint(0,100,(50,2))

# convert the matrix to float so the normalized values can be stored

X = np.array(X, dtype=float)

# apply the min-max formula to each dimension (column by column)

# X[:,0] is the first column, i.e. the first feature

X[:,0] = (X[:,0] - np.min(X[:,0])) / (np.max(X[:,0]) - np.min(X[:,0]))

# X[:,1] is the second column, i.e. the second feature

X[:,1] = (X[:,1] - np.min(X[:,1])) / (np.max(X[:,1]) - np.min(X[:,1]))

# with n features, normalize every column in a loop:

for i in range(X.shape[1]):

    X[:,i] = (X[:,i] - np.min(X[:,i])) / (np.max(X[:,i]) - np.min(X[:,i]))

import matplotlib.pyplot as plt

# a quick scatter plot of the samples to inspect both axes

plt.scatter(X[:,0],X[:,1])

plt.show()
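sklearn also provides a ready-made scaler for this min-max mapping. A minimal sketch, assuming sklearn.preprocessing.MinMaxScaler (not used in the original notes), applied to a fresh matrix since X above is already normalized:

from sklearn.preprocessing import MinMaxScaler

X_raw = np.random.randint(0, 100, (50, 2)).astype(float)

minMaxScaler = MinMaxScaler()  # maps each column into [0, 1] by default

X_scaled = minMaxScaler.fit_transform(X_raw)  # same mapping as the manual loop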

# a fresh 50x2 float matrix for demonstrating mean-variance normalization

X2 = np.array(np.random.randint(0,100,(50,2)),dtype=float)

# apply the formula: mean-variance normalization column by column

for i in range(X2.shape[1]):

    X2[:,i] = (X2[:,i] - np.mean(X2[:,i])) / np.std(X2[:,i])

plt.scatter(X2[:,0],X2[:,1])

plt.show()


np.mean(X2[:,0])  # close to 0 after standardization

np.std(X2[:,1])  # close to 1 after standardization


import numpy as np

from sklearn import datasets

from sklearn.model_selection import train_test_split

iris = datasets.load_iris()

X = iris.data

y = iris.target

X_train,X_test,y_train,y_test = train_test_split(iris.data,iris.target,test_size=0.2,random_state=666)

from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()

# fitting the scaler follows the same fit/transform pattern as training a model

standardScaler.fit(X_train)

standardScaler.mean_  # per-feature means learned from the training set

standardScaler.scale_  # describes the spread of the data; replaces the deprecated std_ attribute
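A quick sanity check of what fit learned; this assumes StandardScaler uses the population standard deviation (ddof=0), which is also np.std's default:

np.allclose(standardScaler.mean_, np.mean(X_train, axis=0))  # True

np.allclose(standardScaler.scale_, np.std(X_train, axis=0))  # True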

# use transform to scale both sets with the training statistics

X_train_standard = standardScaler.transform(X_train)

X_test_standard = standardScaler.transform(X_test)
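To see why this matters, a minimal sketch feeding the standardized data into kNN; the classifier and k=3 are illustrative choices, not from the original notes:

from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=3)

knn_clf.fit(X_train_standard, y_train)

# the test set must be transformed with the training statistics, as above

knn_clf.score(X_test_standard, y_test)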

import numpy as np

class StandardScaler:

    def __init__(self):

        self.mean_ = None

        self.scale_ = None

    def fit(self, X):

        """根据训练数据集X获得数据的均值和方差"""

        assert X.ndim == 2, "The dimension of X must be 2"

        # compute the per-column mean and standard deviation

        self.mean_ = np.array([np.mean(X[:, i]) for i in range(X.shape[1])])

        self.scale_ = np.array([np.std(X[:, i]) for i in range(X.shape[1])])

        return self

    def transform(self, X):

        """standardize X using the fitted mean and scale"""

        assert X.ndim == 2, "The dimension of X must be 2"

        assert self.mean_ is not None and self.scale_ is not None, \

        "must fit before transform"

        assert X.shape[1] == len(self.mean_), \

        "the feature number of X must be equal to mean_ and std_"

        # allocate an empty float matrix of the same shape as X

        resX = np.empty(shape=X.shape, dtype=float)

        # standardize every column (dimension)

        for col in range(X.shape[1]):

            resX[:,col] = (X[:,col] - self.mean_[col]) / self.scale_[col]

        return resX
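A quick check of the hand-rolled class (note it shadows the sklearn StandardScaler imported earlier; the variable names below are illustrative):

my_scaler = StandardScaler()

my_scaler.fit(X_train)

X_train_my = my_scaler.transform(X_train)

# each column should now have mean ~0 and std ~1, matching sklearn's result

np.mean(X_train_my, axis=0), np.std(X_train_my, axis=0)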

from operator import itemgetter

# a minimal node type, assumed here because the original notes never define Node

class Node:

    def __init__(self, point, left=None, right=None):

        self.point, self.left, self.right = point, left, right

# points is the list of sample points; depth determines which dimension to cut on

def kd_tree(points, depth):   

    if 0 == len(points):

        return None

    # pick the cutting dimension; len(points[0]) is the data's actual dimensionality, so the modulo cycles through all dimensions

    cutting_dim = depth % len(points[0])

    # index of the median point along the cutting dimension

    medium_index = len(points) // 2

    # sort all points along the cutting dimension; itemgetter(i) pulls out the i-th coordinate of each point

    points.sort(key=itemgetter(cutting_dim)) 

    # the median point along this dimension becomes the root of this subtree

    node = Node(points[medium_index])

    # recursively build the left subtree with depth + 1

    node.left = kd_tree(points[:medium_index], depth + 1)   

    # recursively build the right subtree with depth + 1

    node.right = kd_tree(points[medium_index + 1:], depth + 1)   

    return node
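A small usage sketch of kd_tree; the six 2-D sample points are made up for illustration:

points = [(2, 3), (5, 4), (9, 6), (4, 7), (8, 1), (7, 2)]

root = kd_tree(points, 0)

print(root.point)  # (7, 2), the median along dimension 0

print(root.left.point, root.right.point)  # (5, 4) (9, 6)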

import numpy as np

from matplotlib import pyplot as plt

from matplotlib.patches import Circle

from sklearn.neighbors import KDTree

np.random.seed(0)

points = np.random.random((100, 2))

tree = KDTree(points)

point = points[0]

# kNN query: the 3 nearest neighbors of point (the query point itself is included, since it comes from the dataset)

dists, indices = tree.query([point], k=3)

print(dists, indices)

# radius query: indices of all points within r = 0.2 of point

indices = tree.query_radius([point], r=0.2)

print(indices)

fig = plt.figure()

ax = fig.add_subplot(111, aspect='equal')

ax.add_patch(Circle(point, 0.2, color='r', fill=False))

X, Y = [p[0] for p in points], [p[1] for p in points]

plt.scatter(X, Y)

plt.scatter([point[0]], [point[1]], c='r')

plt.show()



