1. Some Python programming experience.
2. Familiarity with Python's main scientific libraries, especially NumPy, pandas, and Matplotlib.
3. Preferably work in Jupyter. (If you don't have it, install Anaconda, which includes it.)
1. Just download the compressed file housing.tgz, which contains housing.csv (with all the data used in the book), and extract the CSV with tar xzf housing.tgz, or let the function below do it for you:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
Then just call the function. It's best to open Jupyter in Google Chrome; other browsers may throw errors (no permission to access the site).
fetch_housing_data()
2. Load the data with pandas; this returns a DataFrame containing all the data.
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

load_housing_data(HOUSING_PATH)  # take a look at the data
Take a look at the data structure:
# the housing data
housing = load_housing_data()
housing.head()
housing.info()
# summary statistics of the numerical attributes
housing.describe()
# a histogram for each numerical attribute
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20, 15))
plt.show()
3. Create a test set (usually 20% of the dataset; the larger the dataset, the smaller the ratio).
# to make this notebook's output identical at every run
import numpy as np
np.random.seed(42)

# For illustration only. Sklearn has train_test_split()
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

train_set, test_set = split_train_test(housing, 0.2)
len(train_set)
len(test_set)
from zlib import crc32

# keep an instance in the test set if the hash of its id falls in the
# lowest test_ratio fraction of the 32-bit hash space
def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]

# alternative implementations based on hashlib (the first works in Python 3,
# the bytearray variant in both Python 2 and 3)
import hashlib

def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio
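The payoff of hashing the identifier, rather than shuffling, is that each instance's assignment is stable across runs and across dataset updates. A minimal sanity check of the last test_set_check above (illustrative only; the ids here are synthetic):

# roughly test_ratio of all ids should land in the test set,
# and a given id always lands on the same side of the split
ids = np.arange(10000)
in_test = np.array([test_set_check(id_, 0.2) for id_ in ids])
print(in_test.mean())   # close to 0.2
print(in_test[:5])      # unchanged no matter how many rows are later added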
# use the row index as the ID
housing_with_id = housing.reset_index()   # adds an `index` column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

# or build a more stable ID from the most stable features
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
test_set.head()
4. Use Scikit-Learn for a random split, and for a test set drawn by stratified sampling:
# random split:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
# check the result:
test_set.head()
housing["median_income"].hist()

# bucket median income into five categories for stratification
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
housing["income_cat"].value_counts()
housing["income_cat"].hist()
# stratified sampling:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
# result: compare the income-category proportions
strat_test_set["income_cat"].value_counts() / len(strat_test_set)
housing["income_cat"].value_counts() / len(housing)
5. Next, compare the income-category proportions in the full dataset, the stratified test set, and the random test set.
def income_cat_proportions(data):
    return data["income_cat"].value_counts() / len(data)

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

compare_props = pd.DataFrame({
    "Overall": income_cat_proportions(housing),
    "Stratified": income_cat_proportions(strat_test_set),
    "Random": income_cat_proportions(test_set),
}).sort_index()
compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100
compare_props
In the results, only the random split shows noticeable skew. We can now drop the income_cat column to restore the data to its original state:
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)
First, to avoid damaging the training set, work on a copy:
housing = strat_train_set.copy()
1. Visualize the geographical data:
# geographical scatterplot
housing.plot(kind="scatter", x="longitude", y="latitude")
# save_fig("bad_visualization_plot")

# a better visualization that highlights high-density areas
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

# California housing prices: red is expensive, blue is cheap,
# and circle size represents the district's population
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             s=housing["population"]/100, label="population", figsize=(10, 7),
             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
             sharex=False)
plt.legend()
# save_fig("housing_prices_scatterplot")
2. Look for correlations:
# compute the correlation between every pair of numerical attributes
# (on recent pandas you may need housing.corr(numeric_only=True))
corr_matrix = housing.corr()
# how much each attribute correlates with the median house value
corr_matrix["median_house_value"].sort_values(ascending=False)

# scatter matrix: a histogram of each numerical attribute on the diagonal,
# and each attribute plotted against the others elsewhere
# from pandas.tools.plotting import scatter_matrix  # for older versions of pandas
from pandas.plotting import scatter_matrix
attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
# save_fig("scatter_matrix_plot")

# zoom in on the most promising attribute: median income
housing.plot(kind="scatter", x="median_income", y="median_house_value",
             alpha=0.1)
plt.axis([0, 16, 0, 550000])
# save_fig("income_vs_house_value_scatterplot")
3. Experiment with attribute combinations (feature extraction):
# create three new attributes
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

# look at the correlation matrix again
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

# visualize
housing.plot(kind="scatter", x="rooms_per_household", y="median_house_value",
             alpha=0.2)
plt.axis([0, 5, 0, 520000])
plt.show()
housing.describe()
First revert to a clean training set (and copy() the labels):
housing = strat_train_set.drop("median_house_value", axis=1) # drop labels for training set
housing_labels = strat_train_set["median_house_value"].copy()
1. Data cleaning (for missing values, I fill in the median of the training data):
# before-and-after comparison
sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
sample_incomplete_rows
median = housing["total_bedrooms"].median()
sample_incomplete_rows["total_bedrooms"].fillna(median, inplace=True)
sample_incomplete_rows
2. Scikit-Learn's design (doing the same with SimpleImputer):
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
# the median can only be computed on numerical attributes,
# so drop the text attribute first
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
imputer.statistics_
housing_num.median().values
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing_num.index)
housing_tr.loc[sample_incomplete_rows.index.values]
imputer.strategy
housing_tr.head()
3. Handling text and categorical attributes:
So far we have only dealt with numerical attributes; now let's look at the text attribute.
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

# convert the text categories to numbers
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]
# get the list of categories
ordinal_encoder.categories_

# convert to one-hot vectors
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot
# convert the sparse matrix output above into a dense NumPy array
housing_cat_1hot.toarray()
# check the categories
cat_encoder.categories_
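By default OneHotEncoder returns a SciPy sparse matrix, which saves memory when there are many categories. If you would rather get a dense array without calling toarray(), you can ask the encoder for it directly (a small optional sketch; note the parameter is named sparse in older scikit-learn versions and sparse_output from scikit-learn 1.2 onward):

# dense output directly, without toarray()
cat_encoder_dense = OneHotEncoder(sparse=False)  # sparse_output=False on sklearn >= 1.2
housing_cat_1hot_dense = cat_encoder_dense.fit_transform(housing_cat)
housing_cat_1hot_dense[:5]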
4. Custom transformers:
# a transformer that adds the combined attributes from earlier
from sklearn.base import BaseEstimator, TransformerMixin

# column indices in the NumPy array
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing to learn
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
5. Feature scaling (the actual scaling is done by the StandardScaler in the pipeline below; here we just rebuild a DataFrame from the transformer's output to inspect it):
# instead of hard-coding the indices above, look them up by name
col_names = "total_rooms", "total_bedrooms", "population", "households"
rooms_ix, bedrooms_ix, population_ix, households_ix = [
    housing.columns.get_loc(c) for c in col_names]  # get the column indices

housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns) + ["rooms_per_household", "population_per_household"],
    index=housing.index)
housing_extra_attribs.head()
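As for the scaling itself, the book discusses two common approaches: min-max scaling (normalization) and standardization. A minimal sketch of both on the imputed numerical data, for illustration only, since the pipeline in the next step already applies StandardScaler:

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# min-max scaling squeezes each attribute into the 0-1 range
minmax_demo = MinMaxScaler().fit_transform(housing_tr)
# standardization rescales to zero mean and unit variance,
# and is much less affected by outliers
std_demo = StandardScaler().fit_transform(housing_tr)
minmax_demo[0], std_demo[0]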
6. Transformation pipelines:
(the transformations need to be executed in the right order)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)
# apply all the transformations to the housing data at once
from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
housing_prepared.shape
Time to select and train machine learning algorithms:
We train three models in total: linear regression, a decision tree, and a random forest. After training, we evaluate which one generalizes better.
1. Train and evaluate on the training set:
# train a linear regression model
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# predict on a few training instances (probably not very accurate)
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
# measure the RMSE on the whole training set with scikit-learn
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
This result is not great (about 68628.198, a rather large error). Let's try a decision tree:
from sklearn.tree import DecisionTreeRegressor  # decision tree model
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
The result is 0.0, which almost certainly means severe overfitting.
2. Better evaluation using cross-validation:
# cross-validation exposes the overfitting
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)  # linear regression model
You will find that the decision tree really is overfitting, and its cross-validation score is even worse than linear regression's. Let's try a random forest:
# random forest
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
# this result looks much better; confirm with cross-validation
from sklearn.model_selection import cross_val_score
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
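If you prefer a compact summary over the raw list of scores, pandas can describe them (a small optional addition):

# summary statistics of the 10 cross-validation RMSE scores
pd.Series(forest_rmse_scores).describe()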
1. Grid search (to fine-tune the hyperparameters):
# find the best combination of hyperparameter values
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
grid_search.best_params_
grid_search.best_estimator_

# the evaluation score for every combination:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
2. Randomized search (better suited when the hyperparameter search space is large):
# randomized search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error',
                                random_state=42)
rnd_search.fit(housing_prepared, housing_labels)
rnd_search.best_params_
# all the results:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
3. Analyze the best model and its errors:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
Display the importance scores next to their corresponding attribute names:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
#cat_encoder = cat_pipeline.named_steps["cat_encoder"] # old solution
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)
4. Evaluate the system on the test set:
By now we finally have a decent system. Time for the final evaluation; this is the moment of truth.
# evaluate the final model on the test set
final_model = grid_search.best_estimator_

x_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

x_test_prepared = full_pipeline.transform(x_test)
final_predictions = final_model.predict(x_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse
The result is decent, but a single point estimate of the generalization error is not very reliable.
So compute a 95% confidence interval for the generalization error:
from scipy import stats  # 95% confidence interval for the generalization error
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))
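For intuition, the same interval can be computed by hand from the t-score; this sketch should agree with the stats.t.interval result above:

# manual computation of the same 95% interval
m = len(squared_errors)
mean = squared_errors.mean()
tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)
tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - tmargin), np.sqrt(mean + tmargin)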
# a single pipeline that does both the preparation and the prediction
full_pipeline_with_predictor = Pipeline([
    ("preparation", full_pipeline),
    ("linear", LinearRegression())
])
full_pipeline_with_predictor.fit(housing, housing_labels)
full_pipeline_with_predictor.predict(some_data)
Save the trained model so it can be reused later:
# save the trained model
my_model = full_pipeline_with_predictor
import joblib
joblib.dump(my_model, "my_model.pkl")
# ... later, possibly in another program ...
my_model_loaded = joblib.load("my_model.pkl")
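The reloaded pipeline behaves exactly like the original, so you can call predict on it right away (a quick check reusing some_data from earlier):

my_model_loaded.predict(some_data)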
I studied along with the book Hands-On Machine Learning; the above essentially follows its content, referenced below.
My analysis is far from comprehensive; if you spot any mistakes, please point them out in the comments. Thanks!
Full code: the version I typed up,
or: the original author's.
Finally:
Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow, 2nd Edition, by Aurélien Géron (a French author), published by O'Reilly, ISBN 978-1-492-03264-9.
I recommend buying a copy; it's excellent.