from sklearn.model_selection import train_test_split

# Purely random 80/20 split; a fixed random_state makes it reproducible.
# (Fixed typo: "trian_test_split" -> "train_test_split".)
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
# Discretize median income into categories (ceil(income / 1.5)), capping
# everything >= 5 into category 5, so we can stratify the split on it.
income_cat = np.ceil(housing['median_income'] / 1.5)
housing['income_cat'] = income_cat.where(income_cat < 5, 5)
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified 80/20 split: keeps the income-category distribution the same in
# train and test sets (a purely random split can skew small strata).
# (Fixed typo "fro" -> "for" and restored the loop-body indentation.)
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
# Remove the helper column "income_cat" now that the stratified split is done.
# (Renamed the loop variable: "set" shadowed the built-in set type.)
for subset in (strat_train_set, strat_test_set):
    subset.drop(['income_cat'], axis=1, inplace=True)
# Work on a deep copy so exploration/visualization cannot alter the training set.
# (Fixed typo: "strat_ttrain_set" -> "strat_train_set".)
housing = strat_train_set.copy()
# Split the training set into predictors and the target column.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop("median_house_value", axis=1)  # predictors only
try:
    from sklearn.impute import SimpleImputer  # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

# Fill each numeric column's missing values with that column's median.
imputer = SimpleImputer(strategy="median")
# The imputer only handles numeric data, so drop the text attribute first.
housing_num = housing.drop('ocean_proximity', axis=1)
# alternatively: housing_num = housing.select_dtypes(include=[np.number])
imputer.fit(housing_num)
# transform the training set
X = imputer.transform(housing_num)
# imputer.transform returns a plain NumPy array, so rebuild a DataFrame with
# the original column names and index.
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing.index)
# OrdinalEncoder: its fit_transform() converts text categories into numeric codes.
# OneHotEncoder: one-hot encoding is still needed because ordinal codes can
# mislead a learning algorithm — e.g. it may assume category 4 is "farther"
# from category 1 than category 2 is, even though the categories are unordered.
# Use .reshape(-1, 1) to turn a 1-D array into a column matrix before encoding.
# extract feature
# Keep double brackets so housing_cat is a (n, 1) DataFrame, not a 1-D Series.
housing_cat = housing[['ocean_proximity']]
try:
    from sklearn.preprocessing import OrdinalEncoder
except ImportError:
    from future_encoders import OrdinalEncoder  # Scikit-Learn < 0.20

# Map each text category to an integer code.
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
try:
    from sklearn.preprocessing import OrdinalEncoder  # just to raise an ImportError if Scikit-Learn < 0.20
    from sklearn.preprocessing import OneHotEncoder
except ImportError:
    from future_encoders import OneHotEncoder  # Scikit-Learn < 0.20

cat_encoder = OneHotEncoder()
# fit_transform returns a SciPy sparse matrix; use .toarray() for a dense one.
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
# Note: FunctionTransformer needs validate=False because the data contains
# non-float values (validate defaults to False from Scikit-Learn 0.22 on).
from sklearn.preprocessing import FunctionTransformer

# Look up the column positions once, so add_extra_features can operate on a
# plain NumPy array (which has no column names).
rooms_ix, bedrooms_ix, population_ix, household_ix = [
    list(housing.columns).index(col)
    for col in ("total_rooms", "total_bedrooms", "population", "households")]
def add_extra_features(X, add_bedrooms_per_room=True, *, indices=None):
    """Append engineered ratio features to a 2-D numeric array.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Numeric housing data (column order given by ``indices``).
    add_bedrooms_per_room : bool, default True
        When True, also append the bedrooms/rooms ratio.
    indices : tuple of int, optional
        (rooms, bedrooms, population, households) column positions.
        Defaults to the module-level ``*_ix`` values, preserving the
        original behavior.

    Returns
    -------
    ndarray
        ``X`` with 2 (or 3, with the bedrooms ratio) extra columns appended.
    """
    if indices is None:
        indices = (rooms_ix, bedrooms_ix, population_ix, household_ix)
    r_ix, b_ix, p_ix, h_ix = indices
    rooms_per_household = X[:, r_ix] / X[:, h_ix]
    population_per_household = X[:, p_ix] / X[:, h_ix]
    if add_bedrooms_per_room:
        bedrooms_per_room = X[:, b_ix] / X[:, r_ix]
        return np.c_[X, rooms_per_household, population_per_household,
                     bedrooms_per_room]
    return np.c_[X, rooms_per_household, population_per_household]
# Wrap the feature-engineering function as a transformer; the bedrooms/rooms
# ratio is switched off here, so exactly two columns are appended.
attr_adder = FunctionTransformer(
    add_extra_features,
    validate=False,
    kw_args={"add_bedrooms_per_room": False},
)
extra = attr_adder.fit_transform(housing.values)
housing_extra_attribs = pd.DataFrame(
    extra,
    columns=list(housing.columns) + ["rooms_per_household", "population_per_household"],
    index=housing.index,
)
(5). 转换流水线
让上述转换按照一定流水线进行, 因为上述转化有一定顺序
在 sklearn 中 Pipeline 执行 fit 时, 前面的 n-1 项会调用 fit_transform, 最后一项只调用 fit。
scikit-learn 官方文档
管道机制在机器学习算法中得以应用的根源在于,参数集在新数据集(比如测试集)上的重复使用。
管道机制实现了对全部步骤的流式化封装和管理(streaming workflows with pipelines)。
注意:管道机制更像是编程技巧的创新,而非算法的创新。
Pipeline 的中间过程由scikit-learn相适配的转换器(transformer)构成,最后一步是一个estimator。比如,StandardScaler和 PCA transformer 构成 intermediate steps,LogisticRegression 作为最终的estimator。
当我们执行 pipe_lr.fit(X_train, y_train)时,首先由StandardScaler在训练集上执行 fit和transform方法,transformed后的数据又被传递给Pipeline对象的下一步,也即PCA()。和StandardScaler一样,PCA也是执行fit和transform方法,最终将转换后的数据传递给 LogisticRegression。整个流程如下图所示:
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jzDG9tsC-1580134498614)(evernotecid://CBA4164C-5A56-4141-899C-42B7F62EAC9F/appyinxiangcom/27727878/ENResource/p1)]
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  # standardisation

# Numeric preprocessing pipeline:
# 1. fill missing values with the median (text columns are handled separately)
# 2. add the engineered ratio features
# 3. standardise every column to zero mean / unit variance
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)