sklearn.preprocessing包提供了几个常见的实用函数和转换类,将原始特征向量改变为更适合下游机器学习的表示。
函数 | 功能 |
---|---|
preprocessing.scale( ) | 标准化 |
preprocessing.MinMaxScaler( ) | 最大最小值标准化 |
preprocessing.StandardScaler( ) | 数据标准化 |
preprocessing.MaxAbsScaler( ) | 绝对值最大标准化 |
preprocessing.RobustScaler( ) | 带离群值数据集标准化 |
preprocessing.QuantileTransformer( ) | 使用分位数信息变换特征 |
preprocessing.PowerTransformer( ) | 使用幂变换执行到正态分布的映射 |
preprocessing.Normalizer( ) | 归一化(将每个样本缩放为单位范数) |
preprocessing.OrdinalEncoder( ) | 将分类特征转换为分类数值 |
preprocessing.LabelEncoder( ) | 将目标标签编码为 0 到 n_classes-1 的整数 |
preprocessing.MultiLabelBinarizer( ) | 多标签二值化 |
preprocessing.OneHotEncoder( ) | 独热编码 |
preprocessing.KBinsDiscretizer( ) | 将连续数据离散化 |
preprocessing.FunctionTransformer( ) | 自定义特征处理函数 |
preprocessing.Binarizer( ) | 特征二值化 |
preprocessing.PolynomialFeatures( ) | 创建多项式特征 |
preprocessing.Normalizer( ) | 归一化(将每个样本缩放为单位范数) |
impute.SimpleImputer( ) | 弥补缺失值(旧的 preprocessing.Imputer 已在 sklearn 0.22 中移除,现位于 sklearn.impute 模块) |
# numpy: 1.21.1
# sklearn: 1.0.1
import numpy as np
from sklearn import preprocessing
## 1. StandardScaler
# Standardizes every feature (column) with z = (x - u) / s, so that each
# column of the result has zero mean and unit variance.
# Note: statistics are computed along axis=0; the input is a 2-D array of
# shape (n_samples, n_features).
X_train = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X_train)
# Per-column mean and standard deviation of the scaled data.
print(X_scaled.mean(axis=0), X_scaled.std(axis=0))
# Example: standardization used as the first step of a classification pipeline.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Synthetic classification data; random_state makes the draw reproducible.
X, y = make_classification(random_state=0)
print(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# The scaler is fitted on the training split only, inside the pipeline.
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_train, y_train)
# Print the score on the held-out split: a bare `pipe.score(...)` expression
# displays nothing when this file is run as a script.
print(pipe.score(X_test, y_test))
## 2. MinMaxScaler: rescale the data into [0, 1]
X_train = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit(X_train).transform(X_train)
print(X_train_minmax)
# The test row is transformed with the min/max learned from X_train, so the
# result is not guaranteed to stay inside [0, 1].
X_test = np.array([[-3., -1., 4.]])
X_test_minmax = min_max_scaler.transform(X_test)
print(X_test_minmax)
## 3. MaxAbsScaler: map the data into [-1, 1]
X_train = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_maxabs = max_abs_scaler.fit(X_train).transform(X_train)
print(X_train_maxabs)
# The test row reuses the scaler fitted on X_train, so X_test_maxabs is not
# guaranteed to stay inside [-1, 1].
X_test = np.array([[-3., -1., 4.]])
X_test_maxabs = max_abs_scaler.transform(X_test)
# Per-feature scale learned during fit.
print(max_abs_scaler.scale_)
## 4. QuantileTransformer (a non-linear transformation)
# Map the data to a uniform distribution with values between 0 and 1.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# The default n_quantiles (1000) is larger than the number of training rows,
# which makes scikit-learn emit a warning and clamp it to n_samples; asking
# for n_samples directly gives the same transform without the warning.
quantile_transformer = preprocessing.QuantileTransformer(
    n_quantiles=X_train.shape[0], random_state=0)
X_train_trans = quantile_transformer.fit_transform(X_train)
X_test_trans = quantile_transformer.transform(X_test)
# Quartiles, shape and minimum of the raw (untransformed) training data,
# using the second feature column.
print(np.percentile(X_train[:, 1], [0, 25, 50, 75, 100]))
print(X_train.shape)
print(X_train[:, 1].min())
## 5. Normalization
# Normalization scales each individual sample (row) to have unit norm.
X = [
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
]
# Function interface.
X_normalized = preprocessing.normalize(X, norm='l2')
print(X_normalized)
# Transformer interface; fit() does nothing for this transformer.
normalizer = preprocessing.Normalizer(norm='l2')
normalizer.fit(X)
X_normalized = normalizer.transform(X)
print(X_normalized)
## 6.序数编码:分类特征 to 整数 (0 to n_categories - 1)
enc = preprocessing.OrdinalEncoder()
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)
enc.transform([['female', 'from US', 'uses Safari']])
X_transform = enc.transform(X)
print(X_transform)
# sklearn.impute: 缺失值处理模块
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
enc = Pipeline(steps=[("encoder", preprocessing.OrdinalEncoder()),
("imputer", SimpleImputer(strategy="constant", fill_value=-1)),])
enc.fit_transform(X)
## 7. One-hot encoding
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc = preprocessing.OneHotEncoder().fit(X)
# Categories inferred from the data, one array per feature.
print(enc.categories_)
X_transform = enc.transform(X)
print("One hot")
# Sparse result first, then its dense form.
print(X_transform)
print(X_transform.toarray())
# Same encoder, but with the full category list of every feature given
# explicitly instead of being inferred from X.
genders = ['female', 'male']
locations = ['from Africa', 'from Asia', 'from Europe', 'from US']
browsers = ['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari']
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc = preprocessing.OneHotEncoder(categories=[genders, locations, browsers])
X_transform = enc.fit(X).transform(X)
print(X_transform.toarray())
print(enc.transform([['male', 'from Asia', 'uses Firefox']]).toarray())
# Note on the layout: the first two columns encode genders (01 or 10), the
# middle four encode locations and the last four encode browsers.
# handle_unknown='ignore': categories that were not seen during fit are
# encoded as all zeros instead of raising an error.
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc = preprocessing.OneHotEncoder(handle_unknown='ignore').fit(X)
print("missing categorical features")
x = enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()
print(x)
# Every feature below has exactly two values, so the first column of each
# feature can be dropped, leaving a single 1/0 column per feature.
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
]
drop_enc = preprocessing.OneHotEncoder(drop='first')
drop_enc.fit(X)
print(drop_enc.categories_)
print(drop_enc.transform(X).toarray())
# For features with more than two values, keep every column: 'if_binary'
# only drops a column for features that have exactly two categories.
X = [
    ['male', 'US', 'Safari'],
    ['female', 'Europe', 'Firefox'],
    ['female', 'Asia', 'Chrome'],
]
drop_enc = preprocessing.OneHotEncoder(drop='if_binary')
drop_enc.fit(X)
print(drop_enc.categories_)
print(drop_enc.transform(X).toarray())
# Combine drop with handle_unknown='ignore' and transform a row whose browser
# ('IE') was not seen during fit.
drop_enc = preprocessing.OneHotEncoder(drop='if_binary', handle_unknown='ignore')
drop_enc.fit(X)
X_test = [['male', 'Europe', 'IE']]
print("with unknown value")
print(drop_enc.transform(X_test).toarray())
# Missing values (None / np.nan) are treated as a category of their own.
X = [
    ['male', 'Safari'],
    ['female', None],
    [np.nan, 'Firefox'],
]
enc = preprocessing.OneHotEncoder(handle_unknown='error')
enc.fit(X)
print(enc.categories_)
print(enc.transform(X).toarray())
# Map a one-hot encoding back to the original feature values.
X = [
    ['male', 'US', 'Safari'],
    ['female', 'Europe', 'Firefox'],
    ['female', 'Asia', 'Chrome'],
]
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output`; `sparse` is kept here to match the sklearn 1.0.1 version
# recorded at the top of this script.
drop_enc = preprocessing.OneHotEncoder(
    drop='if_binary', sparse=False, handle_unknown='ignore')
drop_enc.fit(X)
# 'IE' is a browser value that was not present when fitting.
X_test = [['male', 'US', 'IE']]
X_trans = drop_enc.transform(X_test)
print(X_trans)
print(drop_enc.inverse_transform(X_trans))
## 8. PolynomialFeatures
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
# 3 samples x 2 features: [[0, 1], [2, 3], [4, 5]].
X = np.arange(6).reshape(3, 2)
# Expand the features into polynomial terms up to degree 2.
poly = PolynomialFeatures(degree=2)
print(poly.fit(X).transform(X))
## 9. FunctionTransformer: wrap a custom function as a transformer
import numpy as np
from sklearn.preprocessing import FunctionTransformer
# log1p(x) = log(x + 1); taking the logarithm brings the data somewhat closer
# to a normal distribution.
transformer = FunctionTransformer(np.log1p, validate=True)
X = np.array([[0, 1], [2, 3]])
# Print the transformed array; a bare expression shows nothing in a script.
print(transformer.transform(X))
参考:
https://zhuanlan.zhihu.com/p/393113910
https://scikit-learn.org/stable/modules/preprocessing.html#