在使用机器学习或深度学习模型进行训练之前,通常需要先对数据进行预处理。本文记录与数据预处理相关的几个常用操作。
常用的两个数据预处理操作:
transforms.ToTensor(),
transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))
对这两个操作的理解均参考自以下文章:
https://zhuanlan.zhihu.com/p/414242338
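1、transforms.ToTensor() 功能
将形状为 (H, W, C) 的 PIL Image 或 numpy.ndarray 转换为形状为 (C, H, W) 的 torch.FloatTensor;当输入是 uint8 类型的 ndarray 时,同时把取值范围从 [0, 255] 缩放到 [0.0, 1.0]。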
# Cast to uint8 first: ToTensor only rescales [0, 255] -> [0.0, 1.0] for uint8 input.
x = np.array(x, dtype='uint8')
# The class is spelled `ToTensor`; `transforms.Totensor` raises AttributeError.
y = transforms.ToTensor()(x)
示例实现:
import torch
import numpy as np
from torchvision import transforms
import cv2
# Build a 5x5 three-channel test image whose row r (0-based) is filled with r + 1.
# The dtype must be uint8; otherwise transforms.ToTensor() does not rescale to [0, 1].
pixel_rows = [[[level] * 3 for _ in range(5)] for level in range(1, 6)]
data = np.array(pixel_rows, dtype='uint8')
print(data)
print(data.shape)  # (5, 5, 3): height, width, channels
to_tensor = transforms.ToTensor()
data = to_tensor(data)
print(data)
print(data.shape)  # torch.Size([3, 5, 5]): channels, height, width
输出:
tensor([[[0.0039, 0.0039, 0.0039, 0.0039, 0.0039],
[0.0078, 0.0078, 0.0078, 0.0078, 0.0078],
[0.0118, 0.0118, 0.0118, 0.0118, 0.0118],
[0.0157, 0.0157, 0.0157, 0.0157, 0.0157],
[0.0196, 0.0196, 0.0196, 0.0196, 0.0196]],
[[0.0039, 0.0039, 0.0039, 0.0039, 0.0039],
[0.0078, 0.0078, 0.0078, 0.0078, 0.0078],
[0.0118, 0.0118, 0.0118, 0.0118, 0.0118],
[0.0157, 0.0157, 0.0157, 0.0157, 0.0157],
[0.0196, 0.0196, 0.0196, 0.0196, 0.0196]],
[[0.0039, 0.0039, 0.0039, 0.0039, 0.0039],
[0.0078, 0.0078, 0.0078, 0.0078, 0.0078],
[0.0118, 0.0118, 0.0118, 0.0118, 0.0118],
[0.0157, 0.0157, 0.0157, 0.0157, 0.0157],
[0.0196, 0.0196, 0.0196, 0.0196, 0.0196]]])
2、transforms.Normalize() 功能
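transforms.Normalize(mean, std) 按通道对数据做标准化:将每个通道的每个值减去该通道的均值,再除以该通道的标准差(均值和标准差需要事先计算好并作为参数传入)。当传入的是数据自身的均值和标准差时,处理后每个通道的均值为0、标准差为1;这只是平移和缩放,不会改变数据分布的形状,因此并不意味着数据服从正态分布,取值范围也并非一定是[-1,1]。这样处理可以使模型更容易收敛。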
x = (x - mean) / std
计算均值和标准差:
import torch
import numpy as np
from torchvision import transforms
# Passed as the second argument of Get_Mean_Std below (the sample image has 3 channels).
Input_dimension = 3
def Get_Mean_Std(data, Input_dimension=9):
    """Compute the per-channel mean and standard deviation of one image tensor.

    Args:
        data: Tensor whose first three dimensions are (C, H, W), e.g. the
            output of ``transforms.ToTensor()``. It may be non-contiguous
            (for example a cropped view of a larger tensor).
        Input_dimension: Unused; kept only so existing callers keep working.

    Returns:
        Tuple ``(channel_mean, channel_std)`` of 1-D tensors of length C.
        The standard deviation uses ``Tensor.std``'s default settings.

    Raises:
        ValueError: If ``data`` has fewer than three dimensions.
    """
    # Unpacking three sizes keeps the original requirement of a (C, H, W) input.
    C, _, _ = data.shape[:3]
    # reshape (not view) so that non-contiguous inputs are handled as well.
    data = data.reshape(C, -1)
    print(data.shape)
    # After flattening, all H*W pixels live in dim 1; reduce over it to get
    # one mean and one standard deviation per channel.
    channel_mean = data.mean(1)
    channel_std = data.std(1)
    print(channel_mean, channel_std)
    return channel_mean, channel_std
# Reuse the single sample image from the earlier example.
# Without dtype uint8, ToTensor would not rescale the values to [0, 1].
data = np.array(
    [[[level] * 3 for _ in range(5)] for level in range(1, 6)],
    dtype='uint8',
)
data = transforms.ToTensor()(data)  # HWC ndarray -> CHW tensor
print(data.shape)
channel_mean, channel_std = Get_Mean_Std(data, Input_dimension)
# Normalise the image with its own per-channel statistics.
normalize = transforms.Normalize(channel_mean, channel_std)
x = normalize(data)
计算一批数据的mean和std:
import torch
import numpy as np
from torchvision import transforms
# Reuse the single sample image from the earlier example.
data = np.array(
    [[[level] * 3 for _ in range(5)] for level in range(1, 6)],
    dtype='uint8',
)
# Convert HWC -> CHW and rescale to [0, 1].
data = transforms.ToTensor()(data)
# Add a leading batch dimension: (C, H, W) -> (1, C, H, W).
data = data.unsqueeze(0)
nb_samples = 0.
# Per-channel accumulators: zero-initialised tensors of length 3.
channel_mean = torch.zeros(3)
channel_std = torch.zeros(3)
print(data.shape)
N, C, H, W = data.shape[:4]
# Flatten H and W so the tensor becomes (batch, channel, pixels).
data = data.view(N, C, -1)
print(data.shape)
# Per-sample statistics over the pixel axis (dim 2), then summed over the
# batch axis (dim 0).
# NOTE: the resulting std is the average of the per-sample stds, not the std
# taken over all pixels of all samples at once.
per_sample_mean = data.mean(2)
per_sample_std = data.std(2)
channel_mean += per_sample_mean.sum(0)
channel_std += per_sample_std.sum(0)
# Count the samples accumulated so far (1 here).
nb_samples += N
# Average the accumulated statistics over all samples.
channel_mean /= nb_samples
channel_std /= nb_samples
print(channel_mean, channel_std)
得到均值和标准差之后自己实现标准化:
# Rebuild the 5x5 three-channel sample image (row r holds the value r + 1).
data = np.array(
    [[[level] * 3 for _ in range(5)] for level in range(1, 6)],
    dtype='uint8',
)
data = transforms.ToTensor()(data)
# Standardise each channel in place with the channel_mean / channel_std
# tensors computed earlier in the file (one entry per channel).
for i, (mean_i, std_i) in enumerate(zip(channel_mean, channel_std)):
    data[i] = (data[i] - mean_i) / std_i
print(data)
输出:
tensor([[[-1.3856, -1.3856, -1.3856, -1.3856, -1.3856],
[-0.6928, -0.6928, -0.6928, -0.6928, -0.6928],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.6928, 0.6928, 0.6928, 0.6928, 0.6928],
[ 1.3856, 1.3856, 1.3856, 1.3856, 1.3856]],
[[-1.3856, -1.3856, -1.3856, -1.3856, -1.3856],
[-0.6928, -0.6928, -0.6928, -0.6928, -0.6928],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.6928, 0.6928, 0.6928, 0.6928, 0.6928],
[ 1.3856, 1.3856, 1.3856, 1.3856, 1.3856]],
[[-1.3856, -1.3856, -1.3856, -1.3856, -1.3856],
[-0.6928, -0.6928, -0.6928, -0.6928, -0.6928],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.6928, 0.6928, 0.6928, 0.6928, 0.6928],
[ 1.3856, 1.3856, 1.3856, 1.3856, 1.3856]]])
官方方法实现:
# Rebuild the 5x5 three-channel sample image (row r holds the value r + 1).
data = np.array(
    [[[level] * 3 for _ in range(5)] for level in range(1, 6)],
    dtype='uint8',
)
# Same standardisation as the manual loop, using torchvision's transforms.
to_tensor = transforms.ToTensor()
normalize = transforms.Normalize(channel_mean, channel_std)
data = normalize(to_tensor(data))
print(data)
3、StandardScaler工具
Sklearn中提供的标准化工具,针对每个特征维度(每一列)减去均值并除以标准差。处理后每个特征的均值为0、标准差为1(同样只是平移和缩放,并不意味着数据服从正态分布)。该方法不需要输入的数据是tensor。注意:StandardScaler 使用总体标准差(除以 n),而 torch 的 std 默认使用样本标准差(除以 n-1),因此两者的结果会略有差异。
参考博文
https://long97.blog.csdn.net/article/details/90549391?spm=1001.2101.3001.6661.1&utm_medium=distribute.pc_relevant_t0.none-task-blog-2%7Edefault%7ECTRLIST%7Edefault-1.pc_relevant_default&depth_1-utm_source=distribute.pc_relevant_t0.none-task-blog-2%7Edefault%7ECTRLIST%7Edefault-1.pc_relevant_default&utm_relevant_index=1
import numpy as np
from sklearn.preprocessing import StandardScaler
def standartize(data):
    """Standardise an (H, W, C) array channel by channel with StandardScaler.

    Args:
        data: Array whose first three dimensions are (H, W, C).

    Returns:
        Array of shape (H, W, C) holding the scaled values.
    """
    height, width, channels = data.shape[:3]
    # StandardScaler works on a 2-D (n_samples, n_features) array, so lay the
    # image out as one row per pixel and one column per channel.
    pixels = np.reshape(data, (-1, channels))
    # fit() learns the per-column mean and std; transform() centres and scales.
    scaled = StandardScaler().fit(pixels).transform(pixels)
    # Restore the original image layout.
    return np.reshape(scaled, (height, width, channels))
# 5x5 three-channel sample image (row r holds the value r + 1), default integer dtype.
data = np.array([[[level] * 3 for _ in range(5)] for level in range(1, 6)])
data = standartize(data)
print(data)