python机器学习包sklearn数据预处理

sklearn.preprocessing包提供了几个常见的实用函数和转换类,将原始特征向量改变为更适合下游机器学习的表示。

函数

功能
preprocessing.scale( ) 标准化
preprocessing.MinMaxScaler( ) 最大最小值标准化
preprocessing.StandardScaler( ) 数据标准化
preprocessing.MaxAbsScaler( ) 绝对值最大标准化
preprocessing.RobustScaler( ) 带离群值数据集标准化
preprocessing.QuantileTransformer( ) 使用分位数信息变换特征
preprocessing.PowerTransformer( ) 使用幂变换执行到正态分布的映射
preprocessing.Normalizer( ) 归一化(将每个样本缩放到单位范数)
preprocessing.OrdinalEncoder( ) 将分类特征转换为分类数值
preprocessing.LabelEncoder( ) 将目标标签编码为0到n_classes-1的整数
preprocessing.MultiLabelBinarizer( ) 多标签二值化
preprocessing.OneHotEncoder( ) 独热编码
preprocessing.KBinsDiscretizer( ) 将连续数据离散化
preprocessing.FunctionTransformer( ) 自定义特征处理函数
preprocessing.Binarizer( ) 特征二值化
preprocessing.PolynomialFeatures( ) 创建多项式特征
preprocessing.Normalizer( ) 归一化
preprocessing.Imputer( ) 弥补缺失值(sklearn 0.22起已移除,请改用sklearn.impute.SimpleImputer)

1.StandardScaler

# numpy: 1.21.1
# sklearn: 1.0.1

import numpy as np
from sklearn import preprocessing

## 1. StandardScaler
# Standardize each feature (column) as z = (x - u) / s, where u and s are the
# mean and standard deviation learned from the data passed to fit().
# Afterwards every column has zero mean and unit variance; the shape of the
# distribution itself is not changed.
# Note: statistics are computed along axis=0, and the input is expected to be
# a 2-D array of shape (n_samples, n_features).

X_train = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X_train)

# Sanity check: per-column mean should be ~0 and per-column std should be 1.
column_means = X_scaled.mean(axis=0)
column_stds = X_scaled.std(axis=0)
print(column_means, column_stds)

# 数据标准化用于分类预测示例
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Build a synthetic classification problem and hold out a test split.
X, y = make_classification(random_state=0)
print(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Chaining the scaler and the classifier in one pipeline means the scaler is
# fitted on the training split only and then re-applied to the test split.
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_train, y_train)  # apply scaling on training data

# Print the score on the held-out split; a bare expression would be silently
# discarded when this runs as a script.
print(pipe.score(X_test, y_test))

2. MinMaxScaler

## 2. MinMaxScaler: rescale every feature (column) to the [0, 1] range
X_train = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])
min_max_scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train_minmax = min_max_scaler.transform(X_train)
print(X_train_minmax)

# The test data is transformed with the min/max learned from X_train, so the
# result is not guaranteed to stay inside [0, 1].
X_test = np.array([-3., -1., 4.]).reshape(1, -1)
X_test_minmax = min_max_scaler.transform(X_test)
print(X_test_minmax)

3.MaxAbsScaler

## 3. MaxAbsScaler: map every feature (column) into the [-1, 1] range by
# dividing it by the largest absolute value seen in that column during fit.
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])

max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_maxabs = max_abs_scaler.fit_transform(X_train)
print(X_train_maxabs)

# The test data is scaled with the factors learned from X_train, so the
# result is not guaranteed to stay inside [-1, 1]; print it to show that.
X_test = np.array([[ -3., -1.,  4.]])
X_test_maxabs = max_abs_scaler.transform(X_test)
print(X_test_maxabs)

# Per-feature scaling factors learned from X_train.
print(max_abs_scaler.scale_)

4.QuantileTransformer

## 4.QuantileTransformer,属于非线性变换
# map the data to a uniform distribution with values between 0 and 1:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# n_quantiles defaults to 1000, which is more than the number of training
# samples here; scikit-learn would warn and clamp it to n_samples. Passing the
# sample count explicitly yields the same fit without the warning.
quantile_transformer = preprocessing.QuantileTransformer(
    n_quantiles=X_train.shape[0], random_state=0)
X_train_trans = quantile_transformer.fit_transform(X_train)
X_test_trans = quantile_transformer.transform(X_test)

# Compare the quantiles of one feature before and after the transform: the
# transformed values should be spread roughly uniformly over [0, 1].
percentiles = [0, 25, 50, 75, 100]
print(np.percentile(X_train[:, 1], percentiles))
print(np.percentile(X_train_trans[:, 1], percentiles))
print(X_train.shape)
print(X_train[:, 1].min())

5.Normalization

## 5. Normalization
# Each sample (row) is rescaled independently so that it has unit norm.
X = [
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
]

# Function form.
X_normalized = preprocessing.normalize(X, norm='l2')
print(X_normalized)

# Transformer form; fit() does nothing here, it only exists so the object can
# be used like any other transformer.
normalizer = preprocessing.Normalizer(norm='l2')
normalizer.fit(X)
X_normalized = normalizer.transform(X)
print(X_normalized)

6.序数编码

## 6.序数编码:分类特征 to 整数 (0 to n_categories - 1)
enc = preprocessing.OrdinalEncoder()
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)
enc.transform([['female', 'from US', 'uses Safari']])
X_transform = enc.transform(X)
print(X_transform)

# sklearn.impute: 缺失值处理模块
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# OrdinalEncoder passes np.nan through unchanged, so a following SimpleImputer
# can replace it with a sentinel code (-1 here). The demo data needs an actual
# missing entry, otherwise the imputer step has nothing to do.
X_missing = [['male'], ['female'], [np.nan], ['female']]
enc = Pipeline(steps=[("encoder", preprocessing.OrdinalEncoder()),
                      ("imputer", SimpleImputer(strategy="constant", fill_value=-1)),])
print(enc.fit_transform(X_missing))

7.独热编码

## 7. One-hot encoding
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc = preprocessing.OneHotEncoder().fit(X)

# Categories discovered for each input column.
print(enc.categories_)

X_transform = enc.transform(X)
print("One hot")
print(X_transform)  # as returned by transform()

# Dense view of the same encoding.
dense_encoding = X_transform.toarray()
print(dense_encoding)

# Declare the full category list of every feature up front, so values that do
# not occur in the fitted data still get their own column.
genders = ['female', 'male']
locations = ['from Africa', 'from Asia', 'from Europe', 'from US']
browsers = ['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari']
category_lists = [genders, locations, browsers]

X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc = preprocessing.OneHotEncoder(categories=category_lists).fit(X)
X_transform = enc.transform(X)
print(X_transform.toarray())

# A sample whose values were declared in the category lists but never fitted.
declared_only_sample = [['male', 'from Asia', 'uses Firefox']]
print(enc.transform(declared_only_sample).toarray())

# Column layout: the first 2 columns encode gender (01 or 10), the middle 4
# encode location, and the last 4 encode browser.

# handle_unknown='ignore': a category that was not seen during fit is encoded
# as all zeros for that feature.
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc = preprocessing.OneHotEncoder(handle_unknown='ignore').fit(X)
print("missing categorical features")
unseen_sample = [['female', 'from Asia', 'uses Chrome']]
x = enc.transform(unseen_sample).toarray()
print(x)

# Every feature here has exactly two values, so the first category column of
# each feature can be dropped, leaving a single 1/0 column per feature.
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
]
drop_enc = preprocessing.OneHotEncoder(drop='first')
drop_enc.fit(X)
print(drop_enc.categories_)
encoded_without_first = drop_enc.transform(X).toarray()
print(encoded_without_first)

# drop='if_binary' drops a column only for features with exactly two
# categories; features with more than two categories keep all their columns.
X = [
    ['male', 'US', 'Safari'],
    ['female', 'Europe', 'Firefox'],
    ['female', 'Asia', 'Chrome'],
]
drop_enc = preprocessing.OneHotEncoder(drop='if_binary')
drop_enc.fit(X)
print(drop_enc.categories_)

print(drop_enc.transform(X).toarray())

# Combine drop='if_binary' with handle_unknown='ignore' to encode a sample
# containing a category ('IE') that was not seen during fit.
drop_enc = preprocessing.OneHotEncoder(drop='if_binary', handle_unknown='ignore')
drop_enc.fit(X)
X_test = [['male', 'Europe', 'IE']]
print("with unknown value")
encoded_unknown = drop_enc.transform(X_test).toarray()
print(encoded_unknown)

# Missing values (None and np.nan) are treated as categories of their own.
X = [
    ['male', 'Safari'],
    ['female', None],
    [np.nan, 'Firefox'],
]
enc = preprocessing.OneHotEncoder(handle_unknown='error')
enc.fit(X)
print(enc.categories_)
encoded_with_missing = enc.transform(X).toarray()
print(encoded_with_missing)

# Map one-hot encoded rows back to the original category values.
# NOTE: `sparse=False` is the scikit-learn 1.0.x spelling; releases from 1.2
# on rename this parameter to `sparse_output`.
X = [
    ['male', 'US', 'Safari'],
    ['female', 'Europe', 'Firefox'],
    ['female', 'Asia', 'Chrome'],
]
drop_enc = preprocessing.OneHotEncoder(drop='if_binary', sparse=False,
                                       handle_unknown='ignore')
drop_enc.fit(X)

# 'IE' was not seen during fit, so its feature is encoded as all zeros.
X_test = [['male', 'US', 'IE']]
X_trans = drop_enc.transform(X_test)
print(X_trans)

decoded_rows = drop_enc.inverse_transform(X_trans)
print(decoded_rows)

8.PolynomialFeatures

## 8.PolynomialFeatures
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
# Expand the two input features [a, b] into all polynomial terms up to
# degree 2: [1, a, b, a^2, a*b, b^2].
X = np.arange(6).reshape(3, 2)
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
print(X_poly)

9.FunctionTransformer

## 9.FunctionTransformer:自定义转化
import numpy as np
from sklearn.preprocessing import FunctionTransformer
# Wrap an arbitrary callable as a transformer; validate=True makes the
# transformer check its input before calling the function.
transformer = FunctionTransformer(np.log1p, validate=True)
# log1p(x) = log(x + 1); taking the logarithm can bring skewed data somewhat
# closer to a normal distribution.
X = np.array([[0, 1], [2, 3]])
# Print the result so it is not silently discarded when run as a script.
print(transformer.transform(X))

参考:

https://zhuanlan.zhihu.com/p/393113910

https://scikit-learn.org/stable/modules/preprocessing.html#

你可能感兴趣的:(python,机器学习)