特征工程和数据预处理常用工具和方法

import pandas as pd

# Load train.csv; the recorded output below shows 891 rows and 12 columns.
train_data = pd.read_csv("train.csv")

train_data.shape   # shape is an attribute (property), not a method, so no parentheses
(891, 12)

# Summary statistics of the DataFrame's columns.
train_data.describe()

# Returns a copy of "Age" with missing values replaced by the column mean.
# The result is not assigned here, so train_data itself is left unchanged.
train_data["Age"].fillna(value=train_data["Age"].mean())

# NOTE(review): newer scikit-learn releases replace Imputer with
# sklearn.impute.SimpleImputer -- confirm against the installed version.
from sklearn.preprocessing import Imputer

help(Imputer)   # Imputer is a class; help() prints its documentation

# Build the imputer: axis=0 fills along columns, axis=1 fills along rows.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

# fit_transform runs two steps: fit reads the data and computes the statistic,
# transform fills in the missing entries. Use fit alone when only fitting is needed.
age = imp.fit_transform(train_data[["Age"]].values)

# .loc[rows, cols]: ":" selects every row and "Age" selects the column; the
# mean-filled Series on the right-hand side is written back into that column.
train_data.loc[:,"Age"] = train_data["Age"].fillna(value=train_data["Age"].mean())

train_data.info()

RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB

# Common feature-engineering operations

# Numeric features

## Magnitude (scale) transformations

# Log transform
import numpy as np
# np.log is a ufunc, so it is applied to the whole Series in one vectorized
# call instead of invoking a Python lambda once per element.
log_age = np.log(train_data["Age"])

# Min-max scaling (normalization): x_norm = (x - x_min) / (x_max - x_min)
from sklearn.preprocessing import MinMaxScaler
mm_scaler = MinMaxScaler()
fare_mm = mm_scaler.fit_transform(train_data[["Fare"]])

# Standardization with StandardScaler: x_std = (x - u) / σ, where u is the mean and σ the standard deviation
from sklearn.preprocessing import StandardScaler
sds = StandardScaler()
fare_sds = sds.fit_transform(train_data[["Fare"]])

# 1. Summary statistics
max_age = train_data["Age"].max()
min_age = train_data["Age"].min()

# Quantiles: 0.25 requests the first quartile
age_quarter_1 = train_data["Age"].quantile(0.25)

age_quarter_1
22.0

# Higher-order (polynomial) and interaction features
from sklearn.preprocessing import PolynomialFeatures

pnf = PolynomialFeatures(degree=2)  # degree is the polynomial order; 2 is the usual default
age_pnf = pnf.fit_transform(train_data[["SibSp", "Parch"]])

age_pnf   # six columns: [1, a, b, a^2, a*b, b^2] with a = SibSp, b = Parch
array([[ 1.,  1.,  0.,  1.,  0.,  0.],
       [ 1.,  1.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       ..., 
       [ 1.,  1.,  2.,  1.,  2.,  4.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.]])

# Discretization (binning / bucketing) with cut and qcut.
# cut makes equal-width bins: e.g. ages 1-100 split into 4 equal parts give 0-25, 26-50, ...,
# and each value is placed in the bin its age falls into.
# The 5 requests five equal-width intervals; the unique() transcript that follows lists only
# four of them because the (307.398, 409.863] interval does not appear among the values.
train_data.loc[:,"Fare_cut"] = pd.cut(train_data["Fare"],5)

train_data.head()
PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked	Fare_cut
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S	(-0.512, 102.466]
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C	(-0.512, 102.466]
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S	(-0.512, 102.466]
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S	(-0.512, 102.466]
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S	(-0.512, 102.466]

train_data["Fare_cut"].unique()
[(-0.512, 102.466], (204.932, 307.398], (102.466, 204.932], (409.863, 512.329]]
Categories (4, object): [(-0.512, 102.466] < (102.466, 204.932] < (204.932, 307.398] < (409.863, 512.329]]

# Equal-frequency binning with qcut: the cut points are chosen by quantile so that
# every interval holds (roughly) the same number of samples.
train_data.loc[:, "Fare_qcut"] = pd.qcut(train_data["Fare"], 5)

train_data["Fare_qcut"].unique()
[[0, 7.854], (39.688, 512.329], (7.854, 10.5], (10.5, 21.679], (21.679, 39.688]]
Categories (5, object): [[0, 7.854] < (7.854, 10.5] < (10.5, 21.679] < (21.679, 39.688] < (39.688, 512.329]]

# One-hot encoding. Drawback: each category becomes its own mostly-zero indicator
# column, so the features get sparse and the amount of data grows.
embarked_ohe = pd.get_dummies(train_data[['Embarked']])

embarked_ohe.head()
C	Q	S
0	0	0	1
1	1	0	0
2	0	0	1
3	0	0	1
4	0	0	1

# One-hot encode the Fare_qcut bins as well: one indicator column per interval.
fareqcut_ohe = pd.get_dummies(train_data["Fare_qcut"])

fareqcut_ohe.head()
[0, 7.854]	(7.854, 10.5]	(10.5, 21.679]	(21.679, 39.688]	(39.688, 512.329]
0	1	0	0	0	0
1	0	0	0	0	1
2	0	1	0	0	0
3	0	0	0	0	1
4	0	1	0	0	0

 对日期处理
#时间型的特征处理 对日期处理

car_sales = pd.read_csv("")
import pandas as pd

# Load train.csv into a fresh DataFrame and preview its first ten rows.
df_train = pd.read_csv('train.csv')
df_train.head(10)
PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
5	6	0	3	Moran, Mr. James	male	NaN	0	0	330877	8.4583	NaN	Q
6	7	0	1	McCarthy, Mr. Timothy J	male	54.0	0	0	17463	51.8625	E46	S
7	8	0	3	Palsson, Master. Gosta Leonard	male	2.0	3	1	349909	21.0750	NaN	S
8	9	1	3	Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)	female	27.0	0	2	347742	11.1333	NaN	S
9	10	1	2	Nasser, Mrs. Nicholas (Adele Achem)	female	14.0	1	0	237736	30.0708	NaN	C

df_train.describe()
PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

df_train.info()

RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB

df_train[['Age']].values
array([[ 22.  ],
       [ 38.  ],
       [ 26.  ],
       [ 35.  ],
       [ 35.  ],
       [   nan],
       [ 54.  ],
       [  2.  ],
       [ 27.  ],
       [ 14.  ],
       [  4.  ],
       [ 58.  ],
       [ 20.  ],
       [ 39.  ],
       [ 14.  ],
       [ 55.  ],
       [  2.  ],
       [   nan],
       [ 31.  ],
       [   nan],
       [ 35.  ],
       [ 34.  ],
       [ 15.  ],
       [ 28.  ],
       [  8.  ],
       [ 38.  ],
       [   nan],
       [ 19.  ],
       [   nan],
       [   nan],
       [ 40.  ],
       [   nan],
       [   nan],
       [ 66.  ],
       [ 28.  ],
       [ 42.  ],
       [   nan],
       [ 21.  ],
       [ 18.  ],
       [ 14.  ],
       [ 40.  ],
       [ 27.  ],
       [   nan],
       [  3.  ],
       [ 19.  ],
       [   nan],
       [   nan],
       [   nan],
       [   nan],
       [ 18.  ],
       [  7.  ],
       [ 21.  ],
       [ 49.  ],
       [ 29.  ],
       [ 65.  ],
       [   nan],
       [ 21.  ],
       [ 28.5 ],
       [  5.  ],
       [ 11.  ],
       [ 22.  ],
       [ 38.  ],
       [ 45.  ],
       [  4.  ],
       [   nan],
       [   nan],
       [ 29.  ],
       [ 19.  ],
       [ 17.  ],
       [ 26.  ],
       [ 32.  ],
       [ 16.  ],
       [ 21.  ],
       [ 26.  ],
       [ 32.  ],
       [ 25.  ],
       [   nan],
       [   nan],
       [  0.83],
       [ 30.  ],
       [ 22.  ],
       [ 29.  ],
       [   nan],
       [ 28.  ],
       [ 17.  ],
       [ 33.  ],
       [ 16.  ],
       [   nan],
       [ 23.  ],
       [ 24.  ],
       [ 29.  ],
       [ 20.  ],
       [ 46.  ],
       [ 26.  ],
       [ 59.  ],
       [   nan],
       [ 71.  ],
       [ 23.  ],
       [ 34.  ],
       [ 34.  ],
       [ 28.  ],
       [   nan],
       [ 21.  ],
       [ 33.  ],
       [ 37.  ],
       [ 28.  ],
       [ 21.  ],
       [   nan],
       [ 38.  ],
       [   nan],
       [ 47.  ],
       [ 14.5 ],
       [ 22.  ],
       [ 20.  ],
       [ 17.  ],
       [ 21.  ],
       [ 70.5 ],
       [ 29.  ],
       [ 24.  ],
       [  2.  ],
       [ 21.  ],
       [   nan],
       [ 32.5 ],
       [ 32.5 ],
       [ 54.  ],
       [ 12.  ],
       [   nan],
       [ 24.  ],
       [   nan],
       [ 45.  ],
       [ 33.  ],
       [ 20.  ],
       [ 47.  ],
       [ 29.  ],
       [ 25.  ],
       [ 23.  ],
       [ 19.  ],
       [ 37.  ],
       [ 16.  ],
       [ 24.  ],
       [   nan],
       [ 22.  ],
       [ 24.  ],
       [ 19.  ],
       [ 18.  ],
       [ 19.  ],
       [ 27.  ],
       [  9.  ],
       [ 36.5 ],
       [ 42.  ],
       [ 51.  ],
       [ 22.  ],
       [ 55.5 ],
       [ 40.5 ],
       [   nan],
       [ 51.  ],
       [ 16.  ],
       [ 30.  ],
       [   nan],
       [   nan],
       [ 44.  ],
       [ 40.  ],
       [ 26.  ],
       [ 17.  ],
       [  1.  ],
       [  9.  ],
       [   nan],
       [ 45.  ],
       [   nan],
       [ 28.  ],
       [ 61.  ],
       [  4.  ],
       [  1.  ],
       [ 21.  ],
       [ 56.  ],
       [ 18.  ],
       [   nan],
       [ 50.  ],
       [ 30.  ],
       [ 36.  ],
       [   nan],
       [   nan],
       [  9.  ],
       [  1.  ],
       [  4.  ],
       [   nan],
       [   nan],
       [ 45.  ],
       [ 40.  ],
       [ 36.  ],
       [ 32.  ],
       [ 19.  ],
       [ 19.  ],
       [  3.  ],
       [ 44.  ],
       [ 58.  ],
       [   nan],
       [ 42.  ],
       [   nan],
       [ 24.  ],
       [ 28.  ],
       [   nan],
       [ 34.  ],
       [ 45.5 ],
       [ 18.  ],
       [  2.  ],
       [ 32.  ],
       [ 26.  ],
       [ 16.  ],
       [ 40.  ],
       [ 24.  ],
       [ 35.  ],
       [ 22.  ],
       [ 30.  ],
       [   nan],
       [ 31.  ],
       [ 27.  ],
       [ 42.  ],
       [ 32.  ],
       [ 30.  ],
       [ 16.  ],
       [ 27.  ],
       [ 51.  ],
       [   nan],
       [ 38.  ],
       [ 22.  ],
       [ 19.  ],
       [ 20.5 ],
       [ 18.  ],
       [   nan],
       [ 35.  ],
       [ 29.  ],
       [ 59.  ],
       [  5.  ],
       [ 24.  ],
       [   nan],
       [ 44.  ],
       [  8.  ],
       [ 19.  ],
       [ 33.  ],
       [   nan],
       [   nan],
       [ 29.  ],
       [ 22.  ],
       [ 30.  ],
       [ 44.  ],
       [ 25.  ],
       [ 24.  ],
       [ 37.  ],
       [ 54.  ],
       [   nan],
       [ 29.  ],
       [ 62.  ],
       [ 30.  ],
       [ 41.  ],
       [ 29.  ],
       [   nan],
       [ 30.  ],
       [ 35.  ],
       [ 50.  ],
       [   nan],
       [  3.  ],
       [ 52.  ],
       [ 40.  ],
       [   nan],
       [ 36.  ],
       [ 16.  ],
       [ 25.  ],
       [ 58.  ],
       [ 35.  ],
       [   nan],
       [ 25.  ],
       [ 41.  ],
       [ 37.  ],
       [   nan],
       [ 63.  ],
       [ 45.  ],
       [   nan],
       [  7.  ],
       [ 35.  ],
       [ 65.  ],
       [ 28.  ],
       [ 16.  ],
       [ 19.  ],
       [   nan],
       [ 33.  ],
       [ 30.  ],
       [ 22.  ],
       [ 42.  ],
       [ 22.  ],
       [ 26.  ],
       [ 19.  ],
       [ 36.  ],
       [ 24.  ],
       [ 24.  ],
       [   nan],
       [ 23.5 ],
       [  2.  ],
       [   nan],
       [ 50.  ],
       [   nan],
       [   nan],
       [ 19.  ],
       [   nan],
       [   nan],
       [  0.92],
       [   nan],
       [ 17.  ],
       [ 30.  ],
       [ 30.  ],
       [ 24.  ],
       [ 18.  ],
       [ 26.  ],
       [ 28.  ],
       [ 43.  ],
       [ 26.  ],
       [ 24.  ],
       [ 54.  ],
       [ 31.  ],
       [ 40.  ],
       [ 22.  ],
       [ 27.  ],
       [ 30.  ],
       [ 22.  ],
       [   nan],
       [ 36.  ],
       [ 61.  ],
       [ 36.  ],
       [ 31.  ],
       [ 16.  ],
       [   nan],
       [ 45.5 ],
       [ 38.  ],
       [ 16.  ],
       [   nan],
       [   nan],
       [ 29.  ],
       [ 41.  ],
       [ 45.  ],
       [ 45.  ],
       [  2.  ],
       [ 24.  ],
       [ 28.  ],
       [ 25.  ],
       [ 36.  ],
       [ 24.  ],
       [ 40.  ],
       [   nan],
       [  3.  ],
       [ 42.  ],
       [ 23.  ],
       [   nan],
       [ 15.  ],
       [ 25.  ],
       [   nan],
       [ 28.  ],
       [ 22.  ],
       [ 38.  ],
       [   nan],
       [   nan],
       [ 40.  ],
       [ 29.  ],
       [ 45.  ],
       [ 35.  ],
       [   nan],
       [ 30.  ],
       [ 60.  ],
       [   nan],
       [   nan],
       [ 24.  ],
       [ 25.  ],
       [ 18.  ],
       [ 19.  ],
       [ 22.  ],
       [  3.  ],
       [   nan],
       [ 22.  ],
       [ 27.  ],
       [ 20.  ],
       [ 19.  ],
       [ 42.  ],
       [  1.  ],
       [ 32.  ],
       [ 35.  ],
       [   nan],
       [ 18.  ],
       [  1.  ],
       [ 36.  ],
       [   nan],
       [ 17.  ],
       [ 36.  ],
       [ 21.  ],
       [ 28.  ],
       [ 23.  ],
       [ 24.  ],
       [ 22.  ],
       [ 31.  ],
       [ 46.  ],
       [ 23.  ],
       [ 28.  ],
       [ 39.  ],
       [ 26.  ],
       [ 21.  ],
       [ 28.  ],
       [ 20.  ],
       [ 34.  ],
       [ 51.  ],
       [  3.  ],
       [ 21.  ],
       [   nan],
       [   nan],
       [   nan],
       [ 33.  ],
       [   nan],
       [ 44.  ],
       [   nan],
       [ 34.  ],
       [ 18.  ],
       [ 30.  ],
       [ 10.  ],
       [   nan],
       [ 21.  ],
       [ 29.  ],
       [ 28.  ],
       [ 18.  ],
       [   nan],
       [ 28.  ],
       [ 19.  ],
       [   nan],
       [ 32.  ],
       [ 28.  ],
       [   nan],
       [ 42.  ],
       [ 17.  ],
       [ 50.  ],
       [ 14.  ],
       [ 21.  ],
       [ 24.  ],
       [ 64.  ],
       [ 31.  ],
       [ 45.  ],
       [ 20.  ],
       [ 25.  ],
       [ 28.  ],
       [   nan],
       [  4.  ],
       [ 13.  ],
       [ 34.  ],
       [  5.  ],
       [ 52.  ],
       [ 36.  ],
       [   nan],
       [ 30.  ],
       [ 49.  ],
       [   nan],
       [ 29.  ],
       [ 65.  ],
       [   nan],
       [ 50.  ],
       [   nan],
       [ 48.  ],
       [ 34.  ],
       [ 47.  ],
       [ 48.  ],
       [   nan],
       [ 38.  ],
       [   nan],
       [ 56.  ],
       [   nan],
       [  0.75],
       [   nan],
       [ 38.  ],
       [ 33.  ],
       [ 23.  ],
       [ 22.  ],
       [   nan],
       [ 34.  ],
       [ 29.  ],
       [ 22.  ],
       [  2.  ],
       [  9.  ],
       [   nan],
       [ 50.  ],
       [ 63.  ],
       [ 25.  ],
       [   nan],
       [ 35.  ],
       [ 58.  ],
       [ 30.  ],
       [  9.  ],
       [   nan],
       [ 21.  ],
       [ 55.  ],
       [ 71.  ],
       [ 21.  ],
       [   nan],
       [ 54.  ],
       [   nan],
       [ 25.  ],
       [ 24.  ],
       [ 17.  ],
       [ 21.  ],
       [   nan],
       [ 37.  ],
       [ 16.  ],
       [ 18.  ],
       [ 33.  ],
       [   nan],
       [ 28.  ],
       [ 26.  ],
       [ 29.  ],
       [   nan],
       [ 36.  ],
       [ 54.  ],
       [ 24.  ],
       [ 47.  ],
       [ 34.  ],
       [   nan],
       [ 36.  ],
       [ 32.  ],
       [ 30.  ],
       [ 22.  ],
       [   nan],
       [ 44.  ],
       [   nan],
       [ 40.5 ],
       [ 50.  ],
       [   nan],
       [ 39.  ],
       [ 23.  ],
       [  2.  ],
       [   nan],
       [ 17.  ],
       [   nan],
       [ 30.  ],
       [  7.  ],
       [ 45.  ],
       [ 30.  ],
       [   nan],
       [ 22.  ],
       [ 36.  ],
       [  9.  ],
       [ 11.  ],
       [ 32.  ],
       [ 50.  ],
       [ 64.  ],
       [ 19.  ],
       [   nan],
       [ 33.  ],
       [  8.  ],
       [ 17.  ],
       [ 27.  ],
       [   nan],
       [ 22.  ],
       [ 22.  ],
       [ 62.  ],
       [ 48.  ],
       [   nan],
       [ 39.  ],
       [ 36.  ],
       [   nan],
       [ 40.  ],
       [ 28.  ],
       [   nan],
       [   nan],
       [ 24.  ],
       [ 19.  ],
       [ 29.  ],
       [   nan],
       [ 32.  ],
       [ 62.  ],
       [ 53.  ],
       [ 36.  ],
       [   nan],
       [ 16.  ],
       [ 19.  ],
       [ 34.  ],
       [ 39.  ],
       [   nan],
       [ 32.  ],
       [ 25.  ],
       [ 39.  ],
       [ 54.  ],
       [ 36.  ],
       [   nan],
       [ 18.  ],
       [ 47.  ],
       [ 60.  ],
       [ 22.  ],
       [   nan],
       [ 35.  ],
       [ 52.  ],
       [ 47.  ],
       [   nan],
       [ 37.  ],
       [ 36.  ],
       [   nan],
       [ 49.  ],
       [   nan],
       [ 49.  ],
       [ 24.  ],
       [   nan],
       [   nan],
       [ 44.  ],
       [ 35.  ],
       [ 36.  ],
       [ 30.  ],
       [ 27.  ],
       [ 22.  ],
       [ 40.  ],
       [ 39.  ],
       [   nan],
       [   nan],
       [   nan],
       [ 35.  ],
       [ 24.  ],
       [ 34.  ],
       [ 26.  ],
       [  4.  ],
       [ 26.  ],
       [ 27.  ],
       [ 42.  ],
       [ 20.  ],
       [ 21.  ],
       [ 21.  ],
       [ 61.  ],
       [ 57.  ],
       [ 21.  ],
       [ 26.  ],
       [   nan],
       [ 80.  ],
       [ 51.  ],
       [ 32.  ],
       [   nan],
       [  9.  ],
       [ 28.  ],
       [ 32.  ],
       [ 31.  ],
       [ 41.  ],
       [   nan],
       [ 20.  ],
       [ 24.  ],
       [  2.  ],
       [   nan],
       [  0.75],
       [ 48.  ],
       [ 19.  ],
       [ 56.  ],
       [   nan],
       [ 23.  ],
       [   nan],
       [ 18.  ],
       [ 21.  ],
       [   nan],
       [ 18.  ],
       [ 24.  ],
       [   nan],
       [ 32.  ],
       [ 23.  ],
       [ 58.  ],
       [ 50.  ],
       [ 40.  ],
       [ 47.  ],
       [ 36.  ],
       [ 20.  ],
       [ 32.  ],
       [ 25.  ],
       [   nan],
       [ 43.  ],
       [   nan],
       [ 40.  ],
       [ 31.  ],
       [ 70.  ],
       [ 31.  ],
       [   nan],
       [ 18.  ],
       [ 24.5 ],
       [ 18.  ],
       [ 43.  ],
       [ 36.  ],
       [   nan],
       [ 27.  ],
       [ 20.  ],
       [ 14.  ],
       [ 60.  ],
       [ 25.  ],
       [ 14.  ],
       [ 19.  ],
       [ 18.  ],
       [ 15.  ],
       [ 31.  ],
       [  4.  ],
       [   nan],
       [ 25.  ],
       [ 60.  ],
       [ 52.  ],
       [ 44.  ],
       [   nan],
       [ 49.  ],
       [ 42.  ],
       [ 18.  ],
       [ 35.  ],
       [ 18.  ],
       [ 25.  ],
       [ 26.  ],
       [ 39.  ],
       [ 45.  ],
       [ 42.  ],
       [ 22.  ],
       [   nan],
       [ 24.  ],
       [   nan],
       [ 48.  ],
       [ 29.  ],
       [ 52.  ],
       [ 19.  ],
       [ 38.  ],
       [ 27.  ],
       [   nan],
       [ 33.  ],
       [  6.  ],
       [ 17.  ],
       [ 34.  ],
       [ 50.  ],
       [ 27.  ],
       [ 20.  ],
       [ 30.  ],
       [   nan],
       [ 25.  ],
       [ 25.  ],
       [ 29.  ],
       [ 11.  ],
       [   nan],
       [ 23.  ],
       [ 23.  ],
       [ 28.5 ],
       [ 48.  ],
       [ 35.  ],
       [   nan],
       [   nan],
       [   nan],
       [ 36.  ],
       [ 21.  ],
       [ 24.  ],
       [ 31.  ],
       [ 70.  ],
       [ 16.  ],
       [ 30.  ],
       [ 19.  ],
       [ 31.  ],
       [  4.  ],
       [  6.  ],
       [ 33.  ],
       [ 23.  ],
       [ 48.  ],
       [  0.67],
       [ 28.  ],
       [ 18.  ],
       [ 34.  ],
       [ 33.  ],
       [   nan],
       [ 41.  ],
       [ 20.  ],
       [ 36.  ],
       [ 16.  ],
       [ 51.  ],
       [   nan],
       [ 30.5 ],
       [   nan],
       [ 32.  ],
       [ 24.  ],
       [ 48.  ],
       [ 57.  ],
       [   nan],
       [ 54.  ],
       [ 18.  ],
       [   nan],
       [  5.  ],
       [   nan],
       [ 43.  ],
       [ 13.  ],
       [ 17.  ],
       [ 29.  ],
       [   nan],
       [ 25.  ],
       [ 25.  ],
       [ 18.  ],
       [  8.  ],
       [  1.  ],
       [ 46.  ],
       [   nan],
       [ 16.  ],
       [   nan],
       [   nan],
       [ 25.  ],
       [ 39.  ],
       [ 49.  ],
       [ 31.  ],
       [ 30.  ],
       [ 30.  ],
       [ 34.  ],
       [ 31.  ],
       [ 11.  ],
       [  0.42],
       [ 27.  ],
       [ 31.  ],
       [ 39.  ],
       [ 18.  ],
       [ 39.  ],
       [ 33.  ],
       [ 26.  ],
       [ 39.  ],
       [ 35.  ],
       [  6.  ],
       [ 30.5 ],
       [   nan],
       [ 23.  ],
       [ 31.  ],
       [ 43.  ],
       [ 10.  ],
       [ 52.  ],
       [ 27.  ],
       [ 38.  ],
       [ 27.  ],
       [  2.  ],
       [   nan],
       [   nan],
       [  1.  ],
       [   nan],
       [ 62.  ],
       [ 15.  ],
       [  0.83],
       [   nan],
       [ 23.  ],
       [ 18.  ],
       [ 39.  ],
       [ 21.  ],
       [   nan],
       [ 32.  ],
       [   nan],
       [ 20.  ],
       [ 16.  ],
       [ 30.  ],
       [ 34.5 ],
       [ 17.  ],
       [ 42.  ],
       [   nan],
       [ 35.  ],
       [ 28.  ],
       [   nan],
       [  4.  ],
       [ 74.  ],
       [  9.  ],
       [ 16.  ],
       [ 44.  ],
       [ 18.  ],
       [ 45.  ],
       [ 51.  ],
       [ 24.  ],
       [   nan],
       [ 41.  ],
       [ 21.  ],
       [ 48.  ],
       [   nan],
       [ 24.  ],
       [ 42.  ],
       [ 27.  ],
       [ 31.  ],
       [   nan],
       [  4.  ],
       [ 26.  ],
       [ 47.  ],
       [ 33.  ],
       [ 47.  ],
       [ 28.  ],
       [ 15.  ],
       [ 20.  ],
       [ 19.  ],
       [   nan],
       [ 56.  ],
       [ 25.  ],
       [ 33.  ],
       [ 22.  ],
       [ 28.  ],
       [ 25.  ],
       [ 39.  ],
       [ 27.  ],
       [ 19.  ],
       [   nan],
       [ 26.  ],
       [ 32.  ]])

# NOTE(review): newer scikit-learn releases replace Imputer with
# sklearn.impute.SimpleImputer -- confirm against the installed version.
from sklearn.preprocessing import Imputer

# Print the class documentation (parameters, attributes, methods) for reference.
help(Imputer)
Help on class Imputer in module sklearn.preprocessing.imputation:

class Imputer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)
 |  Imputation transformer for completing missing values.
 |  
 |  Read more in the :ref:`User Guide `.
 |  
 |  Parameters
 |  ----------
 |  missing_values : integer or "NaN", optional (default="NaN")
 |      The placeholder for the missing values. All occurrences of
 |      `missing_values` will be imputed. For missing values encoded as np.nan,
 |      use the string value "NaN".
 |  
 |  strategy : string, optional (default="mean")
 |      The imputation strategy.
 |  
 |      - If "mean", then replace missing values using the mean along
 |        the axis.
 |      - If "median", then replace missing values using the median along
 |        the axis.
 |      - If "most_frequent", then replace missing using the most frequent
 |        value along the axis.
 |  
 |  axis : integer, optional (default=0)
 |      The axis along which to impute.
 |  
 |      - If `axis=0`, then impute along columns.
 |      - If `axis=1`, then impute along rows.
 |  
 |  verbose : integer, optional (default=0)
 |      Controls the verbosity of the imputer.
 |  
 |  copy : boolean, optional (default=True)
 |      If True, a copy of X will be created. If False, imputation will
 |      be done in-place whenever possible. Note that, in the following cases,
 |      a new copy will always be made, even if `copy=False`:
 |  
 |      - If X is not an array of floating values;
 |      - If X is sparse and `missing_values=0`;
 |      - If `axis=0` and X is encoded as a CSR matrix;
 |      - If `axis=1` and X is encoded as a CSC matrix.
 |  
 |  Attributes
 |  ----------
 |  statistics_ : array of shape (n_features,)
 |      The imputation fill value for each feature if axis == 0.
 |  
 |  Notes
 |  -----
 |  - When ``axis=0``, columns which only contained missing values at `fit`
 |    are discarded upon `transform`.
 |  - When ``axis=1``, an exception is raised if there are rows for which it is
 |    not possible to fill in the missing values (e.g., because they only
 |    contain missing values).
 |  
 |  Method resolution order:
 |      Imputer
 |      sklearn.base.BaseEstimator
 |      sklearn.base.TransformerMixin
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  fit(self, X, y=None)
 |      Fit the imputer on X.
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix}, shape (n_samples, n_features)
 |          Input data, where ``n_samples`` is the number of samples and
 |          ``n_features`` is the number of features.
 |      
 |      Returns
 |      -------
 |      self : object
 |          Returns self.
 |  
 |  transform(self, X)
 |      Impute all missing values in X.
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix}, shape = [n_samples, n_features]
 |          The input data to complete.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.base.BaseEstimator:
 |  
 |  __getstate__(self)
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  __setstate__(self, state)
 |  
 |  get_params(self, deep=True)
 |      Get parameters for this estimator.
 |      
 |      Parameters
 |      ----------
 |      deep : boolean, optional
 |          If True, will return the parameters for this estimator and
 |          contained subobjects that are estimators.
 |      
 |      Returns
 |      -------
 |      params : mapping of string to any
 |          Parameter names mapped to their values.
 |  
 |  set_params(self, **params)
 |      Set the parameters of this estimator.
 |      
 |      The method works on simple estimators as well as on nested objects
 |      (such as pipelines). The latter have parameters of the form
 |      ``__`` so that it's possible to update each
 |      component of a nested object.
 |      
 |      Returns
 |      -------
 |      self
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from sklearn.base.BaseEstimator:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.base.TransformerMixin:
 |  
 |  fit_transform(self, X, y=None, **fit_params)
 |      Fit to data, then transform it.
 |      
 |      Fits transformer to X and y with optional parameters fit_params
 |      and returns a transformed version of X.
 |      
 |      Parameters
 |      ----------
 |      X : numpy array of shape [n_samples, n_features]
 |          Training set.
 |      
 |      y : numpy array of shape [n_samples]
 |          Target values.
 |      
 |      Returns
 |      -------
 |      X_new : numpy array of shape [n_samples, n_features_new]
 |          Transformed array.


# Mean-impute along columns (axis=0); 'NaN' marks the entries treated as missing.
impu = Imputer(missing_values='NaN',strategy='mean',axis=0)
# fit_transform takes 2-D input of shape (n_samples, n_features), hence the one-column
# DataFrame df_train[['Age']]; per the original note, passing df_train[['Age']].values
# (the underlying array) gives the same result.
age =impu.fit_transform(df_train[['Age']])
age
array([[ 22.        ],
       [ 38.        ],
       [ 26.        ],
       [ 35.        ],
       [ 35.        ],
       [ 29.69911765],
       [ 54.        ],
       [  2.        ],
       [ 27.        ],
       [ 14.        ],
       [  4.        ],
       [ 58.        ],
       [ 20.        ],
       [ 39.        ],
       [ 14.        ],
       [ 55.        ],
       [  2.        ],
       [ 29.69911765],
       [ 31.        ],
       [ 29.69911765],
       [ 35.        ],
       [ 34.        ],
       [ 15.        ],
       [ 28.        ],
       [  8.        ],
       [ 38.        ],
       [ 29.69911765],
       [ 19.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 40.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 66.        ],
       [ 28.        ],
       [ 42.        ],
       [ 29.69911765],
       [ 21.        ],
       [ 18.        ],
       [ 14.        ],
       [ 40.        ],
       [ 27.        ],
       [ 29.69911765],
       [  3.        ],
       [ 19.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 29.69911765],
       [ 29.69911765],
       [ 18.        ],
       [  7.        ],
       [ 21.        ],
       [ 49.        ],
       [ 29.        ],
       [ 65.        ],
       [ 29.69911765],
       [ 21.        ],
       [ 28.5       ],
       [  5.        ],
       [ 11.        ],
       [ 22.        ],
       [ 38.        ],
       [ 45.        ],
       [  4.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 29.        ],
       [ 19.        ],
       [ 17.        ],
       [ 26.        ],
       [ 32.        ],
       [ 16.        ],
       [ 21.        ],
       [ 26.        ],
       [ 32.        ],
       [ 25.        ],
       [ 29.69911765],
       [ 29.69911765],
       [  0.83      ],
       [ 30.        ],
       [ 22.        ],
       [ 29.        ],
       [ 29.69911765],
       [ 28.        ],
       [ 17.        ],
       [ 33.        ],
       [ 16.        ],
       [ 29.69911765],
       [ 23.        ],
       [ 24.        ],
       [ 29.        ],
       [ 20.        ],
       [ 46.        ],
       [ 26.        ],
       [ 59.        ],
       [ 29.69911765],
       [ 71.        ],
       [ 23.        ],
       [ 34.        ],
       [ 34.        ],
       [ 28.        ],
       [ 29.69911765],
       [ 21.        ],
       [ 33.        ],
       [ 37.        ],
       [ 28.        ],
       [ 21.        ],
       [ 29.69911765],
       [ 38.        ],
       [ 29.69911765],
       [ 47.        ],
       [ 14.5       ],
       [ 22.        ],
       [ 20.        ],
       [ 17.        ],
       [ 21.        ],
       [ 70.5       ],
       [ 29.        ],
       [ 24.        ],
       [  2.        ],
       [ 21.        ],
       [ 29.69911765],
       [ 32.5       ],
       [ 32.5       ],
       [ 54.        ],
       [ 12.        ],
       [ 29.69911765],
       [ 24.        ],
       [ 29.69911765],
       [ 45.        ],
       [ 33.        ],
       [ 20.        ],
       [ 47.        ],
       [ 29.        ],
       [ 25.        ],
       [ 23.        ],
       [ 19.        ],
       [ 37.        ],
       [ 16.        ],
       [ 24.        ],
       [ 29.69911765],
       [ 22.        ],
       [ 24.        ],
       [ 19.        ],
       [ 18.        ],
       [ 19.        ],
       [ 27.        ],
       [  9.        ],
       [ 36.5       ],
       [ 42.        ],
       [ 51.        ],
       [ 22.        ],
       [ 55.5       ],
       [ 40.5       ],
       [ 29.69911765],
       [ 51.        ],
       [ 16.        ],
       [ 30.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 44.        ],
       [ 40.        ],
       [ 26.        ],
       [ 17.        ],
       [  1.        ],
       [  9.        ],
       [ 29.69911765],
       [ 45.        ],
       [ 29.69911765],
       [ 28.        ],
       [ 61.        ],
       [  4.        ],
       [  1.        ],
       [ 21.        ],
       [ 56.        ],
       [ 18.        ],
       [ 29.69911765],
       [ 50.        ],
       [ 30.        ],
       [ 36.        ],
       [ 29.69911765],
       [ 29.69911765],
       [  9.        ],
       [  1.        ],
       [  4.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 45.        ],
       [ 40.        ],
       [ 36.        ],
       [ 32.        ],
       [ 19.        ],
       [ 19.        ],
       [  3.        ],
       [ 44.        ],
       [ 58.        ],
       [ 29.69911765],
       [ 42.        ],
       [ 29.69911765],
       [ 24.        ],
       [ 28.        ],
       [ 29.69911765],
       [ 34.        ],
       [ 45.5       ],
       [ 18.        ],
       [  2.        ],
       [ 32.        ],
       [ 26.        ],
       [ 16.        ],
       [ 40.        ],
       [ 24.        ],
       [ 35.        ],
       [ 22.        ],
       [ 30.        ],
       [ 29.69911765],
       [ 31.        ],
       [ 27.        ],
       [ 42.        ],
       [ 32.        ],
       [ 30.        ],
       [ 16.        ],
       [ 27.        ],
       [ 51.        ],
       [ 29.69911765],
       [ 38.        ],
       [ 22.        ],
       [ 19.        ],
       [ 20.5       ],
       [ 18.        ],
       [ 29.69911765],
       [ 35.        ],
       [ 29.        ],
       [ 59.        ],
       [  5.        ],
       [ 24.        ],
       [ 29.69911765],
       [ 44.        ],
       [  8.        ],
       [ 19.        ],
       [ 33.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 29.        ],
       [ 22.        ],
       [ 30.        ],
       [ 44.        ],
       [ 25.        ],
       [ 24.        ],
       [ 37.        ],
       [ 54.        ],
       [ 29.69911765],
       [ 29.        ],
       [ 62.        ],
       [ 30.        ],
       [ 41.        ],
       [ 29.        ],
       [ 29.69911765],
       [ 30.        ],
       [ 35.        ],
       [ 50.        ],
       [ 29.69911765],
       [  3.        ],
       [ 52.        ],
       [ 40.        ],
       [ 29.69911765],
       [ 36.        ],
       [ 16.        ],
       [ 25.        ],
       [ 58.        ],
       [ 35.        ],
       [ 29.69911765],
       [ 25.        ],
       [ 41.        ],
       [ 37.        ],
       [ 29.69911765],
       [ 63.        ],
       [ 45.        ],
       [ 29.69911765],
       [  7.        ],
       [ 35.        ],
       [ 65.        ],
       [ 28.        ],
       [ 16.        ],
       [ 19.        ],
       [ 29.69911765],
       [ 33.        ],
       [ 30.        ],
       [ 22.        ],
       [ 42.        ],
       [ 22.        ],
       [ 26.        ],
       [ 19.        ],
       [ 36.        ],
       [ 24.        ],
       [ 24.        ],
       [ 29.69911765],
       [ 23.5       ],
       [  2.        ],
       [ 29.69911765],
       [ 50.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 19.        ],
       [ 29.69911765],
       [ 29.69911765],
       [  0.92      ],
       [ 29.69911765],
       [ 17.        ],
       [ 30.        ],
       [ 30.        ],
       [ 24.        ],
       [ 18.        ],
       [ 26.        ],
       [ 28.        ],
       [ 43.        ],
       [ 26.        ],
       [ 24.        ],
       [ 54.        ],
       [ 31.        ],
       [ 40.        ],
       [ 22.        ],
       [ 27.        ],
       [ 30.        ],
       [ 22.        ],
       [ 29.69911765],
       [ 36.        ],
       [ 61.        ],
       [ 36.        ],
       [ 31.        ],
       [ 16.        ],
       [ 29.69911765],
       [ 45.5       ],
       [ 38.        ],
       [ 16.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 29.        ],
       [ 41.        ],
       [ 45.        ],
       [ 45.        ],
       [  2.        ],
       [ 24.        ],
       [ 28.        ],
       [ 25.        ],
       [ 36.        ],
       [ 24.        ],
       [ 40.        ],
       [ 29.69911765],
       [  3.        ],
       [ 42.        ],
       [ 23.        ],
       [ 29.69911765],
       [ 15.        ],
       [ 25.        ],
       [ 29.69911765],
       [ 28.        ],
       [ 22.        ],
       [ 38.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 40.        ],
       [ 29.        ],
       [ 45.        ],
       [ 35.        ],
       [ 29.69911765],
       [ 30.        ],
       [ 60.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 24.        ],
       [ 25.        ],
       [ 18.        ],
       [ 19.        ],
       [ 22.        ],
       [  3.        ],
       [ 29.69911765],
       [ 22.        ],
       [ 27.        ],
       [ 20.        ],
       [ 19.        ],
       [ 42.        ],
       [  1.        ],
       [ 32.        ],
       [ 35.        ],
       [ 29.69911765],
       [ 18.        ],
       [  1.        ],
       [ 36.        ],
       [ 29.69911765],
       [ 17.        ],
       [ 36.        ],
       [ 21.        ],
       [ 28.        ],
       [ 23.        ],
       [ 24.        ],
       [ 22.        ],
       [ 31.        ],
       [ 46.        ],
       [ 23.        ],
       [ 28.        ],
       [ 39.        ],
       [ 26.        ],
       [ 21.        ],
       [ 28.        ],
       [ 20.        ],
       [ 34.        ],
       [ 51.        ],
       [  3.        ],
       [ 21.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 29.69911765],
       [ 33.        ],
       [ 29.69911765],
       [ 44.        ],
       [ 29.69911765],
       [ 34.        ],
       [ 18.        ],
       [ 30.        ],
       [ 10.        ],
       [ 29.69911765],
       [ 21.        ],
       [ 29.        ],
       [ 28.        ],
       [ 18.        ],
       [ 29.69911765],
       [ 28.        ],
       [ 19.        ],
       [ 29.69911765],
       [ 32.        ],
       [ 28.        ],
       [ 29.69911765],
       [ 42.        ],
       [ 17.        ],
       [ 50.        ],
       [ 14.        ],
       [ 21.        ],
       [ 24.        ],
       [ 64.        ],
       [ 31.        ],
       [ 45.        ],
       [ 20.        ],
       [ 25.        ],
       [ 28.        ],
       [ 29.69911765],
       [  4.        ],
       [ 13.        ],
       [ 34.        ],
       [  5.        ],
       [ 52.        ],
       [ 36.        ],
       [ 29.69911765],
       [ 30.        ],
       [ 49.        ],
       [ 29.69911765],
       [ 29.        ],
       [ 65.        ],
       [ 29.69911765],
       [ 50.        ],
       [ 29.69911765],
       [ 48.        ],
       [ 34.        ],
       [ 47.        ],
       [ 48.        ],
       [ 29.69911765],
       [ 38.        ],
       [ 29.69911765],
       [ 56.        ],
       [ 29.69911765],
       [  0.75      ],
       [ 29.69911765],
       [ 38.        ],
       [ 33.        ],
       [ 23.        ],
       [ 22.        ],
       [ 29.69911765],
       [ 34.        ],
       [ 29.        ],
       [ 22.        ],
       [  2.        ],
       [  9.        ],
       [ 29.69911765],
       [ 50.        ],
       [ 63.        ],
       [ 25.        ],
       [ 29.69911765],
       [ 35.        ],
       [ 58.        ],
       [ 30.        ],
       [  9.        ],
       [ 29.69911765],
       [ 21.        ],
       [ 55.        ],
       [ 71.        ],
       [ 21.        ],
       [ 29.69911765],
       [ 54.        ],
       [ 29.69911765],
       [ 25.        ],
       [ 24.        ],
       [ 17.        ],
       [ 21.        ],
       [ 29.69911765],
       [ 37.        ],
       [ 16.        ],
       [ 18.        ],
       [ 33.        ],
       [ 29.69911765],
       [ 28.        ],
       [ 26.        ],
       [ 29.        ],
       [ 29.69911765],
       [ 36.        ],
       [ 54.        ],
       [ 24.        ],
       [ 47.        ],
       [ 34.        ],
       [ 29.69911765],
       [ 36.        ],
       [ 32.        ],
       [ 30.        ],
       [ 22.        ],
       [ 29.69911765],
       [ 44.        ],
       [ 29.69911765],
       [ 40.5       ],
       [ 50.        ],
       [ 29.69911765],
       [ 39.        ],
       [ 23.        ],
       [  2.        ],
       [ 29.69911765],
       [ 17.        ],
       [ 29.69911765],
       [ 30.        ],
       [  7.        ],
       [ 45.        ],
       [ 30.        ],
       [ 29.69911765],
       [ 22.        ],
       [ 36.        ],
       [  9.        ],
       [ 11.        ],
       [ 32.        ],
       [ 50.        ],
       [ 64.        ],
       [ 19.        ],
       [ 29.69911765],
       [ 33.        ],
       [  8.        ],
       [ 17.        ],
       [ 27.        ],
       [ 29.69911765],
       [ 22.        ],
       [ 22.        ],
       [ 62.        ],
       [ 48.        ],
       [ 29.69911765],
       [ 39.        ],
       [ 36.        ],
       [ 29.69911765],
       [ 40.        ],
       [ 28.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 24.        ],
       [ 19.        ],
       [ 29.        ],
       [ 29.69911765],
       [ 32.        ],
       [ 62.        ],
       [ 53.        ],
       [ 36.        ],
       [ 29.69911765],
       [ 16.        ],
       [ 19.        ],
       [ 34.        ],
       [ 39.        ],
       [ 29.69911765],
       [ 32.        ],
       [ 25.        ],
       [ 39.        ],
       [ 54.        ],
       [ 36.        ],
       [ 29.69911765],
       [ 18.        ],
       [ 47.        ],
       [ 60.        ],
       [ 22.        ],
       [ 29.69911765],
       [ 35.        ],
       [ 52.        ],
       [ 47.        ],
       [ 29.69911765],
       [ 37.        ],
       [ 36.        ],
       [ 29.69911765],
       [ 49.        ],
       [ 29.69911765],
       [ 49.        ],
       [ 24.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 44.        ],
       [ 35.        ],
       [ 36.        ],
       [ 30.        ],
       [ 27.        ],
       [ 22.        ],
       [ 40.        ],
       [ 39.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 29.69911765],
       [ 35.        ],
       [ 24.        ],
       [ 34.        ],
       [ 26.        ],
       [  4.        ],
       [ 26.        ],
       [ 27.        ],
       [ 42.        ],
       [ 20.        ],
       [ 21.        ],
       [ 21.        ],
       [ 61.        ],
       [ 57.        ],
       [ 21.        ],
       [ 26.        ],
       [ 29.69911765],
       [ 80.        ],
       [ 51.        ],
       [ 32.        ],
       [ 29.69911765],
       [  9.        ],
       [ 28.        ],
       [ 32.        ],
       [ 31.        ],
       [ 41.        ],
       [ 29.69911765],
       [ 20.        ],
       [ 24.        ],
       [  2.        ],
       [ 29.69911765],
       [  0.75      ],
       [ 48.        ],
       [ 19.        ],
       [ 56.        ],
       [ 29.69911765],
       [ 23.        ],
       [ 29.69911765],
       [ 18.        ],
       [ 21.        ],
       [ 29.69911765],
       [ 18.        ],
       [ 24.        ],
       [ 29.69911765],
       [ 32.        ],
       [ 23.        ],
       [ 58.        ],
       [ 50.        ],
       [ 40.        ],
       [ 47.        ],
       [ 36.        ],
       [ 20.        ],
       [ 32.        ],
       [ 25.        ],
       [ 29.69911765],
       [ 43.        ],
       [ 29.69911765],
       [ 40.        ],
       [ 31.        ],
       [ 70.        ],
       [ 31.        ],
       [ 29.69911765],
       [ 18.        ],
       [ 24.5       ],
       [ 18.        ],
       [ 43.        ],
       [ 36.        ],
       [ 29.69911765],
       [ 27.        ],
       [ 20.        ],
       [ 14.        ],
       [ 60.        ],
       [ 25.        ],
       [ 14.        ],
       [ 19.        ],
       [ 18.        ],
       [ 15.        ],
       [ 31.        ],
       [  4.        ],
       [ 29.69911765],
       [ 25.        ],
       [ 60.        ],
       [ 52.        ],
       [ 44.        ],
       [ 29.69911765],
       [ 49.        ],
       [ 42.        ],
       [ 18.        ],
       [ 35.        ],
       [ 18.        ],
       [ 25.        ],
       [ 26.        ],
       [ 39.        ],
       [ 45.        ],
       [ 42.        ],
       [ 22.        ],
       [ 29.69911765],
       [ 24.        ],
       [ 29.69911765],
       [ 48.        ],
       [ 29.        ],
       [ 52.        ],
       [ 19.        ],
       [ 38.        ],
       [ 27.        ],
       [ 29.69911765],
       [ 33.        ],
       [  6.        ],
       [ 17.        ],
       [ 34.        ],
       [ 50.        ],
       [ 27.        ],
       [ 20.        ],
       [ 30.        ],
       [ 29.69911765],
       [ 25.        ],
       [ 25.        ],
       [ 29.        ],
       [ 11.        ],
       [ 29.69911765],
       [ 23.        ],
       [ 23.        ],
       [ 28.5       ],
       [ 48.        ],
       [ 35.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 29.69911765],
       [ 36.        ],
       [ 21.        ],
       [ 24.        ],
       [ 31.        ],
       [ 70.        ],
       [ 16.        ],
       [ 30.        ],
       [ 19.        ],
       [ 31.        ],
       [  4.        ],
       [  6.        ],
       [ 33.        ],
       [ 23.        ],
       [ 48.        ],
       [  0.67      ],
       [ 28.        ],
       [ 18.        ],
       [ 34.        ],
       [ 33.        ],
       [ 29.69911765],
       [ 41.        ],
       [ 20.        ],
       [ 36.        ],
       [ 16.        ],
       [ 51.        ],
       [ 29.69911765],
       [ 30.5       ],
       [ 29.69911765],
       [ 32.        ],
       [ 24.        ],
       [ 48.        ],
       [ 57.        ],
       [ 29.69911765],
       [ 54.        ],
       [ 18.        ],
       [ 29.69911765],
       [  5.        ],
       [ 29.69911765],
       [ 43.        ],
       [ 13.        ],
       [ 17.        ],
       [ 29.        ],
       [ 29.69911765],
       [ 25.        ],
       [ 25.        ],
       [ 18.        ],
       [  8.        ],
       [  1.        ],
       [ 46.        ],
       [ 29.69911765],
       [ 16.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 25.        ],
       [ 39.        ],
       [ 49.        ],
       [ 31.        ],
       [ 30.        ],
       [ 30.        ],
       [ 34.        ],
       [ 31.        ],
       [ 11.        ],
       [  0.42      ],
       [ 27.        ],
       [ 31.        ],
       [ 39.        ],
       [ 18.        ],
       [ 39.        ],
       [ 33.        ],
       [ 26.        ],
       [ 39.        ],
       [ 35.        ],
       [  6.        ],
       [ 30.5       ],
       [ 29.69911765],
       [ 23.        ],
       [ 31.        ],
       [ 43.        ],
       [ 10.        ],
       [ 52.        ],
       [ 27.        ],
       [ 38.        ],
       [ 27.        ],
       [  2.        ],
       [ 29.69911765],
       [ 29.69911765],
       [  1.        ],
       [ 29.69911765],
       [ 62.        ],
       [ 15.        ],
       [  0.83      ],
       [ 29.69911765],
       [ 23.        ],
       [ 18.        ],
       [ 39.        ],
       [ 21.        ],
       [ 29.69911765],
       [ 32.        ],
       [ 29.69911765],
       [ 20.        ],
       [ 16.        ],
       [ 30.        ],
       [ 34.5       ],
       [ 17.        ],
       [ 42.        ],
       [ 29.69911765],
       [ 35.        ],
       [ 28.        ],
       [ 29.69911765],
       [  4.        ],
       [ 74.        ],
       [  9.        ],
       [ 16.        ],
       [ 44.        ],
       [ 18.        ],
       [ 45.        ],
       [ 51.        ],
       [ 24.        ],
       [ 29.69911765],
       [ 41.        ],
       [ 21.        ],
       [ 48.        ],
       [ 29.69911765],
       [ 24.        ],
       [ 42.        ],
       [ 27.        ],
       [ 31.        ],
       [ 29.69911765],
       [  4.        ],
       [ 26.        ],
       [ 47.        ],
       [ 33.        ],
       [ 47.        ],
       [ 28.        ],
       [ 15.        ],
       [ 20.        ],
       [ 19.        ],
       [ 29.69911765],
       [ 56.        ],
       [ 25.        ],
       [ 33.        ],
       [ 22.        ],
       [ 28.        ],
       [ 25.        ],
       [ 39.        ],
       [ 27.        ],
       [ 19.        ],
       [ 29.69911765],
       [ 26.        ],
       [ 32.        ]])

import numpy as np

# Natural-log transform of Age. The double brackets keep the selection as a
# one-column DataFrame, so the result is a DataFrame too; rows whose Age is
# missing come out as NaN.
age_frame = df_train[['Age']]
log_age = age_frame.apply(np.log)
log_age
Age
0	3.091042
1	3.637586
2	3.258097
3	3.555348
4	3.555348
5	NaN
6	3.988984
7	0.693147
8	3.295837
9	2.639057
10	1.386294
11	4.060443
12	2.995732
13	3.663562
14	2.639057
15	4.007333
16	0.693147
17	NaN
18	3.433987
19	NaN
20	3.555348
21	3.526361
22	2.708050
23	3.332205
24	2.079442
25	3.637586
26	NaN
27	2.944439
28	NaN
29	NaN
...	...
861	3.044522
862	3.871201
863	NaN
864	3.178054
865	3.737670
866	3.295837
867	3.433987
868	NaN
869	1.386294
870	3.258097
871	3.850148
872	3.496508
873	3.850148
874	3.332205
875	2.708050
876	2.995732
877	2.944439
878	NaN
879	4.025352
880	3.218876
881	3.496508
882	3.091042
883	3.332205
884	3.218876
885	3.663562
886	3.295837
887	2.944439
888	NaN
889	3.258097
890	3.465736
891 rows × 1 columns


from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Double brackets select Fare as a one-column DataFrame (2-D), which is the
# input shape the scikit-learn scalers are given here.
fare_frame = df_train[['Fare']]

# Min-max scaling of Fare.
mms = MinMaxScaler()
mms.fit_transform(fare_frame)

# Standardisation of Fare.
SS = StandardScaler()
SS.fit_transform(fare_frame)

# Oldest passenger age. This must read the 'Age' column: reading 'Fare' here
# would report the highest ticket price under the name max_age.
max_age = df_train['Age'].max()
print(max_age)
80.0

min_age = df_train[['Age']].min()
print(min_age)
Age    0.42
dtype: float64

# family_size = SibSp + Parch + 1 (the +1 presumably counts the passenger).
relatives_aboard = df_train['SibSp'].add(df_train['Parch'])
df_train.loc[:, 'family_size'] = relatives_aboard.add(1)
df_train.head(10)
df_train['family_size']

from sklearn.preprocessing import PolynomialFeatures

# Polynomial and interaction features of SibSp and Parch (default degree 2).
# Both columns have to be passed together as X: the second positional argument
# of fit_transform is `y`, which PolynomialFeatures ignores, so passing Parch
# there would expand SibSp alone and produce no cross term.
PnF = PolynomialFeatures()
Poly_fit = PnF.fit_transform(df_train[['SibSp', 'Parch']])
Poly_fit  # columns: 1, SibSp, Parch, SibSp^2, SibSp*Parch, Parch^2
array([[ 1.,  1.,  0.,  1.,  0.,  0.],
       [ 1.,  1.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       ..., 
       [ 1.,  1.,  2.,  1.,  2.,  4.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.]])

# Discretisation (binning).
# pd.cut splits the value range into equal-width intervals, whereas pd.qcut
# chooses the edges from quantiles so the bins hold roughly equal row counts.
cutdata = pd.qcut(df_train['Age'], q=8)
cutdata

# The first argument is a one-dimensional array-like, hence the single brackets.
fare_bins = pd.cut(df_train['Fare'], bins=5)
df_train.loc[:, 'fare_cut'] = fare_bins
df_train.head(30)

df_train.info()

# Inspect the categories first: Embarked holds the three classes C, Q and S.
embarked = df_train['Embarked']
dfg = df_train.groupby(embarked)
dfg.describe()

# One-hot encoding: one indicator column per Embarked category.
embark_one_hot = pd.get_dummies(embarked)
embark_one_hot

car_time = pd.read_csv('car_data.csv')
car_time.head(10)

# Convert the object-typed date_t column (values such as "2012-12-31") into a
# real datetime column. The format is spelled out explicitly: an empty format
# string is not a meaningful pattern, and whether pandas tolerates it depends
# on the installed version.
car_time.loc[:, "date"] = pd.to_datetime(car_time["date_t"], format="%Y-%m-%d")
car_time.head()
date_t	cnt	date	month
0	2012-12-31	NaN	2012-12-31	12
1	2013-01-01	NaN	2013-01-01	1
2	2013-01-02	68.0	2013-01-02	1
3	2013-01-03	36.0	2013-01-03	1
4	2013-01-04	5565.0	2013-01-04	1

# Calendar features taken from the parsed date column.
date_parts = car_time["date"].dt
car_time.loc[:, "month"] = date_parts.month
car_time.loc[:, "dom"] = date_parts.day  # day of month
car_time.head()
date_t	cnt	date	month	dom
0	2012-12-31	NaN	2012-12-31	12	31
1	2013-01-01	NaN	2013-01-01	1	1
2	2013-01-02	68.0	2013-01-02	1	2
3	2013-01-03	36.0	2013-01-03	1	3
4	2013-01-04	5565.0	2013-01-04	1	4

car_time.loc[:,"dow"] = car_time["date"].dt.dayofweek
car_time.loc[:,"weekend"] = car_time["dow"].apply(lambda x: 1 if (x == 6 or x == 1) else 0)  #不能用 lambda x:x==1,右边是个函数,这个函数输出true
car_time.head(10)
date_t	cnt	date	month	dom	dow	weekend
0	2012-12-31	NaN	2012-12-31	12	31	0	0
1	2013-01-01	NaN	2013-01-01	1	1	1	1
2	2013-01-02	68.0	2013-01-02	1	2	2	0
3	2013-01-03	36.0	2013-01-03	1	3	3	0
4	2013-01-04	5565.0	2013-01-04	1	4	4	0
5	2013-01-05	4966.0	2013-01-05	1	5	5	0
6	2013-01-06	3346.0	2013-01-06	1	6	6	1
7	2013-01-07	3396.0	2013-01-07	1	7	0	0
8	2013-01-08	4146.0	2013-01-08	1	8	1	1
9	2013-01-09	3096.0	2013-01-09	1	9	2	0

# Special feature types: text.
# 1. Bag-of-words model.
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
        'This is the first document.',
        'This is the second second document.',
        'And the third one.',
        'Is this the first document?'
        ]

# CountVectorizer is constructed here without any arguments (all defaults).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Vocabulary learned from the corpus, in column order.
vectorizer.get_feature_names()
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

​

X.toarray() # columns follow ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']; each entry is that word's count
# In short, CountVectorizer counts how often each word occurs in each sentence and returns the counts as vectors:
# one row per sentence, one column per word. E.g. the 1 in the first column of the third row means "and" occurs once in the third sentence.
array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

# Count word n-grams of length 1 to 3 (single words plus two- and three-word
# sequences) instead of single words only.
ngram_bounds = (1, 3)
vec = CountVectorizer(ngram_range=ngram_bounds)
X_ngram = vec.fit_transform(corpus)
X_ngram.toarray()
array([[0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 0, 0, 1, 1,
        0, 0, 0, 0, 1, 1, 1, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        1, 1, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 1]], dtype=int64)

​

# TF-IDF: weighted term features instead of raw counts.
from sklearn.feature_extraction.text import TfidfVectorizer

tfid = TfidfVectorizer()
tfid_X = tfid.fit_transform(corpus)

# Vocabulary in column order.
tfid.get_feature_names()
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

tfid_X.toarray()  # weight of each word (column) within each sentence (row)
array([[ 0.        ,  0.43877674,  0.54197657,  0.43877674,  0.        ,
         0.        ,  0.35872874,  0.        ,  0.43877674],
       [ 0.        ,  0.27230147,  0.        ,  0.27230147,  0.        ,
         0.85322574,  0.22262429,  0.        ,  0.27230147],
       [ 0.55280532,  0.        ,  0.        ,  0.        ,  0.55280532,
         0.        ,  0.28847675,  0.55280532,  0.        ],
       [ 0.        ,  0.43877674,  0.54197657,  0.43877674,  0.        ,
         0.        ,  0.35872874,  0.        ,  0.43877674]])

tfid.get_feature_names()
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

df_train.head()
PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked	family_size	fare_cut
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S	2	(-0.512, 102.466]
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C	2	(-0.512, 102.466]
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S	1	(-0.512, 102.466]
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S	2	(-0.512, 102.466]
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S	1	(-0.512, 102.466]

# Build a combined feature from conditions: a passenger is flagged "alone"
# when both SibSp and Parch are zero.
no_sibsp = df_train['SibSp'].eq(0)
no_parch = df_train['Parch'].eq(0)
df_train.loc[:, "alone"] = no_sibsp & no_parch

df_train.head()
PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked	family_size	fare_cut	alone
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S	2	(-0.512, 102.466]	False
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C	2	(-0.512, 102.466]	False
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S	1	(-0.512, 102.466]	True
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S	2	(-0.512, 102.466]	False
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S	1	(-0.512, 102.466]	True

# Filter-style feature selection: score each feature against the target and
# keep the best-scoring ones.
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest

iris = load_iris()
X = iris.data
y = iris.target
X

# Keep the two features of X that score best with respect to the labels y.
# NOTE(review): no score_func is passed, so the selector's default scoring is
# used; that default is believed to be the ANOVA F-test (f_classif) rather
# than a correlation coefficient -- confirm against the scikit-learn docs.
selector = SelectKBest(k=2)
X_new = selector.fit_transform(X, y)
X_new.shape
(150, 2)

X_new

  #递归的特征筛选
# Wrapper-style selection: recursive feature elimination (RFE).
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# The random forest is the estimator RFE uses to judge feature importance.
rf = RandomForestClassifier()
rfe = RFE(estimator=rf, n_features_to_select=2)

# Fit the selector, then reduce X to the two retained features.
X_rfe = rfe.fit(X, y).transform(X)

X_rfe.shape
(150, 2)

X_rfe[:5,:]  # first five rows; presumably the last two columns of X were the ones kept
array([[ 1.4,  0.2],
       [ 1.4,  0.2],
       [ 1.3,  0.2],
       [ 1.5,  0.2],
       [ 1.4,  0.2]])

   #需要线性模型
# Embedded selection: let a regularised linear model pick the features.
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC   # a linear model is required here

# L1-regularised linear SVM, fitted once on the iris data.
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)

# prefit=True: reuse the already fitted estimator instead of fitting again.
model = SelectFromModel(lsvc, prefit=True)

X_embed = model.transform(X)

X_embed.shape
(150, 3)

​


你可能感兴趣的:(特征工程和数据预处理常用工具和方法)