数据预处理

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

try:
    # Legacy imputer: deprecated in scikit-learn 0.20 and removed in 0.22.
    # Kept only so the name stays available on old installations.
    from sklearn.preprocessing import Imputer
except ImportError:
    Imputer = None

# Demo matrix: 10 rows x 5 columns of random integers drawn from [0, 100).
data = np.random.randint(low=0, high=100, size=(10, 5))

归一化:

# Min-max normalization: rescale every column of `data` into [0, 1]
# using (x - column_min) / (column_max - column_min).
col_min = data.min(axis=0)
col_range = data.max(axis=0) - col_min
# A constant column has a range of 0; dividing by it would produce NaN.
# Substitute 1 so such a column maps to 0 instead.
col_range = np.where(col_range == 0, 1, col_range)
res = (data - col_min) / col_range

# Same transformation via scikit-learn, for comparison with `res`.
mms = MinMaxScaler()
res2 = mms.fit_transform(data)

标准化:

# Standardization (z-score): centre every column of `data` on its mean
# and scale it by its standard deviation.
col_mean = data.mean(axis=0)
col_std = data.std(axis=0)  # NumPy default ddof=0, i.e. population std
# A constant column has a standard deviation of 0; dividing by it would
# produce NaN. Substitute 1 so such a column maps to 0 instead.
col_std = np.where(col_std == 0, 1, col_std)
res = (data - col_mean) / col_std

# Same transformation via scikit-learn, for comparison with `res`.
ss = StandardScaler()
res2 = ss.fit_transform(data)

缺失值填充:

# Missing-value imputation.
# Load the iris dataset and blank out two numeric cells so there is
# something to impute.
iris = sns.load_dataset('iris')
iris.iloc[1, 1] = np.nan
iris.iloc[2, 2] = np.nan

# SimpleImputer replaces the legacy `Imputer` class (removed from
# scikit-learn in 0.22). It always works column-wise, so the old `axis=0`
# argument is no longer needed. Each NaN is filled with the most frequent
# value of its column.
im = SimpleImputer(strategy='most_frequent')
# Drop the last column (the non-numeric species label) before imputing.
iris_filled = im.fit_transform(iris.iloc[:, :-1])
iris_filled

你可能感兴趣的:(数据预处理)