Chapter 2 – End-to-end Machine Learning project
1. Setup:同上忽略
2. Get the data:
#获取数据
# Load the housing dataset; load_housing_data() is defined outside this excerpt.
housing = load_housing_data()
# Preview the first few rows of the loaded data.
housing.head()
#数据集划分的几种方法:
1. 利用下标,选择对应行数据,区分train、test。
2. crc32、hashlib等接口。
3. 使用sklearn.model_selection中的train_test_split划分数据集。
4. 使用sklearn.model_selection中的StratifiedShuffleSplit划分数据集。
3. Discover and visualize the data to gain insights:
#可视化某些数据,从而发现数据规律
4. Prepare the data for Machine Learning algorithms:
#训练集中的样本、标签分离
#缺失值处理:
1. 调用dropna(),通过subset参数指定要检查的列,丢弃这些列中含缺失值的行。
2. 调用drop(),直接丢弃含缺失值的整列特征。
3. 按照平均值进行填充缺失值。调用接口fillna(替换值,inplace=True)。
4. 调用高级接口,SimpleImputer。
#编码处理,例如非数值型特征转换为数值型数据。
1. OrdinalEncoder
2. OneHotEncoder
#自定义transformer(转换器)
from sklearn.base import BaseEstimator, TransformerMixin
# column index
# Positional column indices of the source attributes in the array passed to
# CombinedAttributesAdder.transform().
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns.

    ``transform`` always appends ``rooms_per_household`` and
    ``population_per_household``; when ``add_bedrooms_per_room`` is true it
    also appends ``bedrooms_per_room``. The input columns are located by
    the module-level ``*_ix`` indices.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        # Stored under the same name as the constructor argument.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn from ``X``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` must support 2-D positional indexing (``X[:, i]``), e.g. a
        NumPy array; ``y`` is accepted but ignored.
        """
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]
# With add_bedrooms_per_room=False, transform() appends only the two
# per-household ratio columns.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
# transform() indexes columns positionally (X[:, i]), so housing.values is
# passed instead of the `housing` object itself.
housing_extra_attribs = attr_adder.transform(housing.values)
添加了两列数据。
#pipeline使用举例
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing, applied in this order: fill missing values with the
# column median, append the ratio features, then standardize.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# housing_num is defined outside this excerpt.
housing_num_tr = num_pipeline.fit_transform(housing_num)
缺失值处理、自定义处理器添加特征、数据标准化,使用pipeline打包处理。
#ColumnTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column names routed to each branch of the transformer.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Each entry is (name, transformer, columns): numeric columns go through
# num_pipeline, the categorical column through a one-hot encoder.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)
按列分别处理:数值列使用num_pipeline,类别列使用OneHotEncoder,由ColumnTransformer统一打包。
#旧方式处理数据,即设置两个pipeline,然后调用FeatureUnion进行组合。
from sklearn.base import BaseEstimator, TransformerMixin
# Create a class to select numerical or categorical columns
class OldDataFrameSelector(BaseEstimator, TransformerMixin):
    """Select the columns named in ``attribute_names`` and return ``.values``."""

    def __init__(self, attribute_names):
        # Column labels used to index X in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn from ``X``."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names].values``.

        ``X`` must support label-based column selection and expose
        ``.values`` (presumably a pandas DataFrame; confirm against callers).
        """
        return X[self.attribute_names].values
# Column lists fed to the two selector steps below.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
# Numeric branch: select the numeric columns, then impute, add the ratio
# features and standardize.
old_num_pipeline = Pipeline([
('selector', OldDataFrameSelector(num_attribs)),
('imputer', SimpleImputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])
# Categorical branch: select the categorical column, then one-hot encode it.
old_cat_pipeline = Pipeline([
('selector', OldDataFrameSelector(cat_attribs)),
# NOTE(review): newer scikit-learn releases renamed `sparse` to
# `sparse_output`; confirm which version these notes target.
('cat_encoder', OneHotEncoder(sparse=False)),
])
from sklearn.pipeline import FeatureUnion
# Combine the two branches into a single transformer.
old_full_pipeline = FeatureUnion(transformer_list=[
("num_pipeline", old_num_pipeline),
("cat_pipeline", old_cat_pipeline),
])
5. Select and train a model:
#构建一个线性回归模型
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
# Model evaluation: compare the predictions with the true labels.
# Mean squared error
# Mean absolute error
# Decision tree model
from sklearn.tree import DecisionTreeRegressor
# Fixed random_state so repeated runs use the same seed.
tree_reg = DecisionTreeRegressor(random_state=42)
6. Fine-tune your model:
#交叉验证,分别对比线性模型以及决策树
from sklearn.model_selection import cross_val_score
# cv=10 cross-validation of the decision tree, scored with negated MSE.
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
# The scores are negated MSE values, so flip the sign before taking the
# square root to obtain RMSE.
tree_rmse_scores = np.sqrt(-scores)
# Random forest regression model
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
用平均值平方估计
用交叉验证估计
#svm回归模型
from sklearn.svm import SVR

# Support vector regressor with a linear kernel.
svm_reg = SVR(kernel="linear")
用均值平方估计
#GridSearchCV网格搜索参数最优值
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 3 x 4 = 12 candidates that leave `bootstrap` untouched,
# then 2 x 3 = 6 candidates with `bootstrap` set to False.
param_grid = [
    dict(n_estimators=[3, 10, 30], max_features=[2, 4, 6, 8]),
    dict(bootstrap=[False], n_estimators=[3, 10], max_features=[2, 3, 4]),
]

forest_reg = RandomForestRegressor(random_state=42)

# 18 candidates x 5 folds = 90 training rounds in total.
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    return_train_score=True,
)
grid_search.fit(housing_prepared, housing_labels)
#随机搜索参数最优值
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Integer distributions to sample each hyperparameter from.
# NOTE(review): scipy's randint excludes `high`; confirm the intended ranges.
param_distribs = dict(
    n_estimators=randint(low=1, high=200),
    max_features=randint(low=1, high=8),
)

forest_reg = RandomForestRegressor(random_state=42)

# 10 sampled candidates, each evaluated with 5-fold cross-validation.
rnd_search = RandomizedSearchCV(
    forest_reg,
    param_distributions=param_distribs,
    n_iter=10,
    cv=5,
    scoring="neg_mean_squared_error",
    random_state=42,
)
rnd_search.fit(housing_prepared, housing_labels)
#特征重要性评估,适合随机森林
# Importance scores exposed by the best estimator found by the grid search.
best_forest = grid_search.best_estimator_
feature_importances = best_forest.feature_importances_

# Names for the columns appended by CombinedAttributesAdder.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]

# Old solution: cat_pipeline.named_steps["cat_encoder"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])

# Same column order as the prepared data: numeric, derived, then one-hot.
attributes = [*num_attribs, *extra_attribs, *cat_one_hot_attribs]
sorted(zip(feature_importances, attributes), reverse=True)
#最终模型
from sklearn.metrics import mean_squared_error

# Use the best estimator found by the grid search as the final model.
final_model = grid_search.best_estimator_

# Separate the test-set features from the target column.
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

# transform() only: the pipeline is applied as already fitted, not re-fitted
# on the test set.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
#计算95%置信度下RMSE的置信区间
from scipy import stats

confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
m = len(squared_errors)
mean = squared_errors.mean()

# Interval for the mean squared error from the t distribution; the square
# root converts both bounds to RMSE.
np.sqrt(
    stats.t.interval(
        confidence,
        m - 1,
        loc=np.mean(squared_errors),
        scale=stats.sem(squared_errors),
    )
)

# The same interval computed by hand from the t-score.
tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)
tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - tmargin), np.sqrt(mean + tmargin)

# Alternative using the z-score of the normal distribution.
zscore = stats.norm.ppf((1 + confidence) / 2)
zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - zmargin), np.sqrt(mean + zmargin)
7. Extra material:
A full pipeline with both preparation and prediction:
# Chain data preparation and the linear model into a single estimator.
full_pipeline_with_predictor = Pipeline(
    steps=[
        ("preparation", full_pipeline),
        ("linear", LinearRegression()),
    ]
)
full_pipeline_with_predictor.fit(housing, housing_labels)
full_pipeline_with_predictor.predict(some_data)
Model persistence using joblib:
# Newer scikit-learn releases no longer ship `sklearn.externals.joblib`, so
# try the standalone package first and fall back for older installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(my_model, "my_model.pkl")  # DIFF
# ...
# NOTE: joblib files are pickle-based; only load files from a trusted source.
my_model_loaded = joblib.load("my_model.pkl")  # DIFF
Example SciPy distributions for RandomizedSearchCV:
from scipy.stats import geom, expon

# Draw 10000 samples from each distribution with a fixed seed.
geom_distrib = geom(0.5).rvs(10000, random_state=42)
expon_distrib = expon(scale=1).rvs(10000, random_state=42)

# One 50-bin histogram per sample, each shown as its own figure.
for _sample in (geom_distrib, expon_distrib):
    plt.hist(_sample, bins=50)
    plt.show()
sklearn-api:
from sklearn.impute import SimpleImputer 缺失值处理器,高级api,statistics_显示对应设置值,支持平均值、中值、最多值。strategy显示当前选取策略。
.transform() 转化数据。
from sklearn.preprocessing import OrdinalEncoder 特征分类编码器,非数字类特征分类值转化为对应的数字。
.fit_transform() 拟合并转化数据
from sklearn.preprocessing import OneHotEncoder onehot格式的特征分类方式。
from sklearn.base import BaseEstimator,TransformerMixin 用于自定义转换器。
.fit() 拟合数据
from sklearn.pipeline import Pipeline 管道,用于数据处理打包。
from sklearn.preprocessing import StandardScaler 数据标准化
from sklearn.compose import ColumnTransformer 列转换器,参数是由(名称, 转换器, 对应列名列表)三元组组成的列表。
from sklearn.pipeline import FeatureUnion 将多个pipeline组合。
from sklearn.linear_model import LinearRegression sklearn中的LinearRegression(线性回归模型)
from sklearn.metrics import mean_squared_error 均方误差(MSE)
from sklearn.metrics import mean_absolute_error 平均绝对误差(MAE)
from sklearn.tree import DecisionTreeRegressor 决策树模型
from sklearn.model_selection import cross_val_score 交叉验证
from sklearn.ensemble import RandomForestRegressor 随机森林模型
from sklearn.svm import SVR svm回归模型
from sklearn.model_selection import GridSearchCV 模型最优参数查找器,best_params_,best_estimator_,cv_results_,.best_estimator_.feature_importances_
from sklearn.model_selection import RandomizedSearchCV 模型参数随机查找器
from scipy.stats import randint
from scipy import stats
numpy:
.random.permutation 打乱固定范围内的数据,例如12345->32154
.c_[] 按列拼接数组(使用方括号而非函数调用),类似于pandas的concat(axis=1)
.random.randn(4,4) 生成4*4的随机矩阵
pandas:
.read_csv() 从csv文件里读取数据
.head() 打印头部前5个数据
.info() 数据总结
.value_counts() 单列数据值统计
.describe() 数据分析
.iloc[] 按整数位置(下标)选择对应行数据,使用方括号而非函数调用
.ceil() 向上取整(numpy接口np.ceil),返回满足i>=x的最小整数i
.where() (condition, x, y)满足条件则输出x,否则输出y
.corr() 相关系数矩阵
.sort_values() 按照固定规则排序
from pandas.plotting import scatter_matrix pandas支持的画图接口
.copy() 复制数据
.drop() 丢弃数据
.isnull() 元素级显示数据是否为空值,空值为TRUE,否则为False
.dropna() 丢弃包含缺失值(NA)的行(默认axis=0;axis=1时丢弃列)
.median() 求中值
.fillna() 填充空值
.loc[] 根据行标签(名称)选择对应行数据,使用方括号而非函数调用
.isnull().any(axis=1) 按行显示是否包含空值,默认为按列显示。
.Series() 构造一维带标签的数组(pd.Series)
matplotlib:
.hist() 显示数据分布图
.plot() 画图
.get_cmap() 设置cmap属性
.legend() 图例
.mpimg.imread() 读取图像接口
.imshow()
.ylabel() 设置y轴参数
.xlabel() 设置x轴参数
.colorbar() 渐变条
.ax.set_yticklabels() 刻度设置
.set_label() 设置标签
.show() 展示描述后的图画
.savefig() 保存图画
.axis() 设置x、y轴的取值范围