import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler #归一化Normalization
from sklearn.preprocessing import StandardScaler #数据标准化
from sklearn.impute import SimpleImputer #填补缺失值(建议直接使用pandas)
from sklearn.preprocessing import LabelEncoder #标签专用,能够将分类转换为分类数值
from sklearn.preprocessing import OrdinalEncoder #特征专用,能够将分类特征转换为分类数值
from sklearn.preprocessing import OneHotEncoder #独热编码,创建哑变量
from sklearn.preprocessing import Binarizer #二值化与分段
# 2.1 Feature scaling (making the data dimensionless)
# Toy feature matrix: four samples (rows), two features (columns).
data = [
    [-1, 2],
    [-0.5, 6],
    [0, 10],
    [1, 18],
]
pd.DataFrame(data)  # display the raw matrix as a table
>> 0 1
0 -1.0 2
1 -0.5 6
2 0.0 10
3 1.0 18
# Min-max normalisation, keeping fit and transform as separate steps.
scaler = MinMaxScaler().fit(data)  # fit() learns min(x) and max(x) from the data
result = scaler.transform(data)    # transform() applies the learned rescaling
result
>>
array([[0. , 0. ],
[0.25, 0.25],
[0.5 , 0.5 ],
[1. , 1. ]])
scaler.inverse_transform(result)  # map the normalised values back to the original scale
>>
array([[-1. , 2. ],
[-0.5, 6. ],
[ 0. , 10. ],
[ 1. , 18. ]])
# One-line equivalent of the fit/transform steps above:
#     MinMaxScaler().fit(data).transform(data)
# The feature_range parameter selects the target interval. (0, 1) is the
# library default, so pass a different pair to normalise to another range.
MinMaxScaler(feature_range=(0, 1), copy=True)
# Standardisation.
scaler_ = StandardScaler().fit(data)  # fit() learns the mean and variance
result_ = scaler_.transform(data)
# Shortcut once the steps are familiar: StandardScaler().fit_transform(data)
# To undo the standardisation: scaler_.inverse_transform(result_)
result_
>>
array([[-1.18321596, -1.18321596],
[-0.50709255, -0.50709255],
[ 0.16903085, 0.16903085],
[ 1.52127766, 1.52127766]])
result_.mean() # the transformed result is an array; mean() with no axis averages all of its elements
>>0.0
result_.std() # std() returns the standard deviation (the square root of the variance), not the variance itself
>>1.0
# 2.2 Missing values (plain pandas is usually the simpler tool for this)
# NOTE: absolute Windows path on the author's machine; adjust before running.
data = pd.read_csv(
    r'E:\资料2018.12月始\债权融资\银行提供学习资料\python学习\数据1\Narrativedata.csv',
    index_col=0,  # use the first CSV column as the row index
)
data.head()
>>
Age Sex Embarked Survived
0 22.0 male S No
1 38.0 female C Yes
2 26.0 female S Yes
3 35.0 female S Yes
4 35.0 male S No
data.info()  # per-column dtype and non-null count; columns with fewer non-null rows than entries have gaps
>>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 714 non-null float64
1 Sex 891 non-null object
2 Embarked 889 non-null object
3 Survived 891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB
# scikit-learn transformers require a 2-D feature matrix, so reshape the Age
# column from (n,) to (n, 1).
Age = data.iloc[:,0].values.reshape(-1,1)
from sklearn.impute import SimpleImputer  # missing-value imputation
imp_mean = SimpleImputer().fit_transform(Age)  # default strategy: fill with the mean
imp_median = SimpleImputer(strategy='median').fit_transform(Age)  # fill with the median
imp_0 = SimpleImputer(strategy = 'constant', fill_value = 0).fit_transform(Age)  # fill with 0
# Use the median-filled values for Age. The imputer returns an (n, 1) array;
# flatten it so a single column receives a 1-D array instead of relying on
# pandas to squeeze a column vector during assignment.
data.loc[:,'Age'] = imp_median.ravel()
data.info()
>>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 891 non-null float64
1 Sex 891 non-null object
2 Embarked 889 non-null object
3 Survived 891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB
# Fill Embarked with its most frequent value (the mode).
Embarked = data.loc[:,"Embarked"].values.reshape(-1,1)  # 2-D, as the imputer requires
imp_mode = SimpleImputer(strategy = "most_frequent")
# fit_transform returns an (n, 1) object array; flatten it so the column is
# assigned a 1-D array rather than depending on pandas to squeeze it.
data.loc[:,"Embarked"] = imp_mode.fit_transform(Embarked).ravel()
data.info()
>>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 891 non-null float64
1 Sex 891 non-null object
2 Embarked 891 non-null object
3 Survived 891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB
# Filling missing values with pandas alone, on a fresh load of the same CSV.
data_ = pd.read_csv(
    r'E:\资料2018.12月始\债权融资\银行提供学习资料\python学习\数据1\Narrativedata.csv',
    index_col=0,
)
age_median = data_.iloc[:, 0].median()  # median of the first column (Age)
data_.iloc[:, 0] = data_.iloc[:, 0].fillna(age_median)
data_.info()
>>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 891 non-null float64
1 Sex 891 non-null object
2 Embarked 889 non-null object
3 Survived 891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB
# 2.3 Categorical features: encoding and dummy variables
from sklearn.preprocessing import LabelEncoder  # for labels only: maps classes to integer codes
y = data.loc[:, 'Survived']  # this is the label, not a feature matrix, so 1-D input is allowed
# Compact form: LabelEncoder().fit_transform(y)
le = LabelEncoder().fit(y)  # instantiate and fit on the labels
label = le.transform(y)     # transform() returns the encoded labels
le.classes_                 # the classes_ attribute lists the distinct classes found in the labels
>>array([0, 1, 2])
# Overwrite the last column (Survived) with its encoded labels.
data.iloc[:,-1] = label
data.head()
>>
Age Sex Embarked Survived
0 22.0 male S 0
1 38.0 female C 2
2 26.0 female S 2
3 35.0 female S 2
4 35.0 male S 0
from sklearn.preprocessing import OrdinalEncoder  # for features: maps categorical features to numeric codes
data_ = data.copy()  # encode a copy; `data` itself is not modified by the encoder below
categorical = data_.iloc[:, 1:-1]  # every column between the first and the last (Sex, Embarked)
OrdinalEncoder().fit(categorical).categories_  # inspect the categories found in each of those columns
>>[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]
# Instantiate, fit, transform and write the encoded columns back in a single statement.
data_.iloc[:,1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:,1:-1])
data_.head()
>>
Age Sex Embarked Survived
0 22.0 1.0 2.0 0
1 38.0 0.0 0.0 2
2 26.0 0.0 2.0 2
3 35.0 0.0 2.0 2
4 35.0 1.0 2.0 0
# One-hot encoding: creating dummy variables
data.head()
>>
Age Sex Embarked Survived
0 22.0 1.0 2.0 0
1 38.0 0.0 0.0 2
2 26.0 0.0 2.0 2
3 35.0 0.0 2.0 2
4 35.0 1.0 2.0 0
from sklearn.preprocessing import OneHotEncoder  # one-hot encoding: creates dummy variables
# NOTE(review): the printed feature names ('x0_0.0', ...) suggest X held
# numerically encoded values when this was run, whereas `data` as built by the
# statements shown still holds the original strings. Confirm which frame was meant.
X = data.iloc[:, 1:-1]
# One-step form: OneHotEncoder(categories='auto').fit_transform(X).toarray()
enc = OneHotEncoder(categories='auto')
enc = enc.fit(X)  # learn the categories of every column in X
result = enc.transform(X).toarray()  # toarray() converts the sparse result to a dense array
result
>>
array([[0., 1., 0., 0., 1.],
[1., 0., 1., 0., 0.],
[1., 0., 0., 0., 1.],
...,
[1., 0., 0., 0., 1.],
[0., 1., 1., 0., 0.],
[0., 1., 0., 1., 0.]])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
enc.get_feature_names_out() if hasattr(enc, "get_feature_names_out") else enc.get_feature_names()
>>array(['x0_0.0', 'x0_1.0', 'x1_0.0', 'x1_1.0', 'x1_2.0'], dtype=object)
# The one-hot matrix can still be decoded back to the values it was built from.
pd.DataFrame(enc.inverse_transform(result))
result.shape
>>(891, 5)
# Append the one-hot columns side by side, then drop the two source columns
# they replace.
onehot_frame = pd.DataFrame(result)
newdata = pd.concat([data, onehot_frame], axis=1)
newdata = newdata.drop(columns=["Sex", "Embarked"])
newdata.head()
>>
Age Survived 0 1 2 3 4
0 22.0 0 0.0 1.0 0.0 0.0 1.0
1 38.0 2 1.0 0.0 1.0 0.0 0.0
2 26.0 2 1.0 0.0 0.0 0.0 1.0
3 35.0 2 1.0 0.0 0.0 0.0 1.0
4 35.0 0 0.0 1.0 0.0 0.0 1.0
# Replace the positional one-hot column labels (0..4) with descriptive names.
newdata.columns = [
    "Age",
    "Survived",
    "Female",
    "Male",
    "Embarked_C",
    "Embarked_Q",
    "Embarked_S",
]
newdata.head()
>>
Age Survived Female Male Embarked_C Embarked_Q Embarked_S
0 22.0 0 0.0 1.0 0.0 0.0 1.0
1 38.0 2 1.0 0.0 1.0 0.0 0.0
2 26.0 2 1.0 0.0 0.0 0.0 1.0
3 35.0 2 1.0 0.0 0.0 0.0 1.0
4 35.0 0 0.0 1.0 0.0 0.0 1.0
# 2.4 Continuous features: binarisation and binning
data_2 = data.copy()
data_2.head()
>>
Age Sex Embarked Survived
0 22.0 1.0 2.0 0
1 38.0 0.0 0.0 2
2 26.0 0.0 2.0 2
3 35.0 0.0 2.0 2
4 35.0 1.0 2.0 0
from sklearn.preprocessing import Binarizer  # binarises continuous features
# Binarizer is a feature transformer, so it needs 2-D input: reshape the first
# column (Age) to (n, 1).
age_values = data_2.iloc[:, 0].values
X = age_values.reshape(-1, 1)
# Threshold 30: values above it map to 1, values at or below it map to 0.
# Despite its name, `transformer` holds the binarised array, not the estimator.
transformer = Binarizer(threshold=30).fit_transform(X)
transformer[:5]
>>
array([[0.],
[1.],
[0.],
[1.],
[1.]])
from sklearn.preprocessing import KBinsDiscretizer  # bins continuous features
X = data_2.iloc[:, 0].values.reshape(-1, 1)
# encode='ordinal': each bin of each feature is encoded as an integer, so the
# result keeps one column per feature, holding that feature's bin codes.
est = KBinsDiscretizer(
    n_bins=3,
    encode='ordinal',
    strategy='uniform',
)
est.fit_transform(X)
>>
.....
# Inspect the bins after the transform: a single column holding three bin codes.
set(est.fit_transform(X).ravel())
# encode='onehot': build dummy variables and return a sparse matrix with one
# column per bin; a sample has 1 in the column of its bin and 0 elsewhere.
est = KBinsDiscretizer(
    n_bins=3,
    encode='onehot',
    strategy='uniform',
)
# Inspect the bins after the transform: they are now dummy columns.
est.fit_transform(X).toarray()
>>
array([[1., 0., 0.],
[0., 1., 0.],
[1., 0., 0.],
...,
[0., 1., 0.],
[1., 0., 0.],
[0., 1., 0.]])