# Import the data-analysis toolkit
import pandas as pd

# Load the extended temperature data set
features = pd.read_csv('data/temps_extended.csv')
# Preview the first rows (only displayed when run as a notebook cell)
features.head(5)

print('数据规模',features.shape)
数据规模 (2191, 12)
- ws_1:前一天的风速。
- prcp_1:前一天的降水。
- snwd_1:前一天的积雪深度。
# Lay out a 2x2 grid of subplots.
# NOTE(review): `plt` (matplotlib.pyplot) and `dates` are not defined in this
# part of the file - they are assumed to come from an earlier cell; confirm.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize = (15,10))
fig.autofmt_xdate(rotation = 45)

# Historical average max temperature
ax1.plot(dates, features['average'])
ax1.set_xlabel('')
ax1.set_ylabel('Temperature (F)')
ax1.set_title('Historical Avg Max Temp')

# Wind speed of the previous day
ax2.plot(dates, features['ws_1'], 'r-')
ax2.set_xlabel('')
ax2.set_ylabel('Wind Speed (mph)')
ax2.set_title('Prior Wind Speed')

# Precipitation of the previous day
ax3.plot(dates, features['prcp_1'], 'r-')
ax3.set_xlabel('Date')
ax3.set_ylabel('Precipitation (in)')
ax3.set_title('Prior Precipitation')

# Snow depth of the previous day
ax4.plot(dates, features['snwd_1'], 'ro')
ax4.set_xlabel('Date')
ax4.set_ylabel('Snow Depth (in)')
ax4.set_title('Prior Snow Depth')

plt.tight_layout(pad=2)
# Create a season variable: map every calendar month to its season.
SEASON_BY_MONTH = {
    12: 'winter', 1: 'winter', 2: 'winter',
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
    9: 'fall', 10: 'fall', 11: 'fall',
}

# One label per row. An unexpected month value raises KeyError right here
# instead of silently yielding a list shorter than the frame.
seasons = [SEASON_BY_MONTH[month] for month in features['month']]

# With the season available we can analyse more things.
# Take an explicit copy of the column subset: assigning a new column onto a
# slice of `features` triggers pandas' SettingWithCopyWarning and, per the
# original author's note, made the later 'season' lookup fail with KeyError.
reduced_features = features[['temp_1', 'prcp_1', 'average', 'actual']].copy()
reduced_features['season'] = seasons

print(round(reduced_features.describe(),2))
print(reduced_features.loc[:,'season'])
# Two ways to attach the season column without SettingWithCopyWarning:

# 1. Work on an explicit copy of the slice, then assign the whole column.
reduced_features = reduced_features.copy()
reduced_features['season'] = seasons

# 2. Alternatively, assign through .loc in one vectorised call:
# reduced_features.loc[:, 'season'] = seasons

# The former row-by-row `.loc[k, 'season'] = seasons[k]` loop was dropped:
# it rewrote the values already stored by the column assignment above, one
# label lookup per row.
有了季节特征之后,如果想观察一下不同季节时上述各项特征的变化情况该怎么做呢?这里给大家推荐一个非常实用的绘图函数pairplot(),需要先安装seaborn工具包(pip install seaborn),它相当于是在Matplotlib的基础上进行封装,用起来更简单方便:
# Import seaborn
import seaborn as sns
sns.set(style="ticks", color_codes=True)

# Pick a colour palette (four colours, one per season)
palette = sns.xkcd_palette(['dark blue', 'dark green', 'gold', 'orange'])

# Draw the pairplot, coloured by season, with KDE curves on the diagonal.
# Run help(sns.pairplot) to see every available option.
# The earlier duplicate call used diag_kind='reg', which is not one of the
# documented values ('auto', 'hist', 'kde', None), so it was removed.
# NOTE(review): newer seaborn releases rename the KDE option `shade` to
# `fill`; switch the keyword if the installed version warns about it.
sns.pairplot(reduced_features, dropna=True, hue = 'season', diag_kind='kde',
             palette= palette, plot_kws=dict(alpha = 0.7),
             diag_kws=dict(shade=True));
# One-hot encode the categorical columns
features = pd.get_dummies(features)

# Separate the label column from the features
labels = features['actual']
features = features.drop('actual', axis = 1)

# Keep the feature names for later use
feature_list = list(features.columns)

# Convert to the array format expected by scikit-learn
import numpy as np

features = np.array(features)
labels = np.array(labels)

# Train/test split
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(features, labels,
                                                                            test_size = 0.25, random_state = 0)

print('训练集特征:', train_features.shape)
print('训练集标签:', train_labels.shape)
print('测试集特征:', test_features.shape)
print('测试集标签:', test_labels.shape)
训练集特征: (1643, 17)
训练集标签: (1643,)
测试集特征: (548, 17)
测试集标签: (548,)
# Import pandas
import pandas as pd

# To rule out the effect of the number of features, use only the features
# that also exist in the old data set (drop the three new weather columns).
original_feature_indices = [feature_list.index(feature) for feature in
                            feature_list if feature not in
                            ['ws_1', 'prcp_1', 'snwd_1']]

# Load the old data set
original_features = pd.read_csv('data/temps.csv')

original_features = pd.get_dummies(original_features)

import numpy as np

# Separate labels and features, then convert to arrays
original_labels = np.array(original_features['actual'])

original_features= original_features.drop('actual', axis = 1)

original_feature_list = list(original_features.columns)

original_features = np.array(original_features)

# Train/test split
from sklearn.model_selection import train_test_split

original_train_features, original_test_features, original_train_labels, original_test_labels = train_test_split(
    original_features, original_labels, test_size = 0.25, random_state = 42)

# Use the same tree model
from sklearn.ensemble import RandomForestRegressor

# Same hyper-parameters and random seed
rf = RandomForestRegressor(n_estimators= 100, random_state=0)

# Train on the training split of the old data set.
# (The `rf.fit(original_train_features` prefix was missing from the listing.)
rf.fit(original_train_features, original_train_labels);

# For a fair comparison, always evaluate on the same test set: the test split
# of the new data set, restricted to the old feature columns.
predictions = rf.predict(test_features[:,original_feature_indices])

# Mean absolute temperature error
errors = abs(predictions - test_labels)

print('平均温度误差:', round(np.mean(errors), 2), 'degrees.')

# MAPE
mape = 100 * (errors / test_labels)

# "Accuracy" is simply 100 minus the mean error percentage: higher is better
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')
平均温度误差: 4.67 degrees.
Accuracy: 92.2 %.
from sklearn.ensemble import RandomForestRegressor

# Drop the new features so the feature set matches the old data
original_train_features = train_features[:,original_feature_indices]

original_test_features = test_features[:, original_feature_indices]

rf = RandomForestRegressor(n_estimators= 100 ,random_state=0)

# Train on the (larger) new data set restricted to the old feature columns.
# (The `rf.fit(original_train_features` prefix was missing from the listing.)
rf.fit(original_train_features, train_labels);

# Predict
baseline_predictions = rf.predict(original_test_features)

# Results
baseline_errors = abs(baseline_predictions - test_labels)

print('平均温度误差:', round(np.mean(baseline_errors), 2), 'degrees.')

# MAPE
baseline_mape = 100 * np.mean((baseline_errors / test_labels))

# Accuracy
baseline_accuracy = 100 - baseline_mape
print('Accuracy:', round(baseline_accuracy, 2), '%.')
平均温度误差: 4.2 degrees.
Accuracy: 93.12 %.
# Now include the new features
from sklearn.ensemble import RandomForestRegressor

rf_exp = RandomForestRegressor(n_estimators= 100, random_state=0)
# (The `rf_exp.fit(train_features` prefix was missing from the listing.)
rf_exp.fit(train_features, train_labels)

# Same test set as before
predictions = rf_exp.predict(test_features)

# Evaluate
errors = abs(predictions - test_labels)

print('平均温度误差:', round(np.mean(errors), 2), 'degrees.')

# MAPE
mape = np.mean(100 * (errors / test_labels))

# Relative change against the baseline. abs() reports only the magnitude,
# so check the two MAPE values to see the direction of the change.
improvement_baseline = 100 * abs(mape - baseline_mape) / baseline_mape
print('特征增多后模型效果提升:', round(improvement_baseline, 2), '%.')

# Accuracy
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')
平均温度误差: 4.05 degrees.
特征增多后模型效果提升: 3.34 %.
Accuracy: 93.35 %.
# Importance score of every feature, taken from the fitted model
importances = list(rf_exp.feature_importances_)

# Pair each feature name with its importance (rounded to two decimals)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort by importance, highest first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print them; a plain loop replaces the list comprehension that was used
# only for its print side effect.
for pair in feature_importances:
    print('特征: {:20} 重要性: {}'.format(*pair))
特征: temp_1 重要性: 0.85 特征: average 重要性: 0.05 特征: ws_1 重要性: 0.02 特征: friend 重要性: 0.02 特征: year 重要性: 0.01 特征: month 重要性: 0.01 特征: day 重要性: 0.01 特征: prcp_1 重要性: 0.01 特征: temp_2 重要性: 0.01 特征: snwd_1 重要性: 0.0 特征: weekday_Fri 重要性: 0.0 特征: weekday_Mon 重要性: 0.0 特征: weekday_Sat 重要性: 0.0 特征: weekday_Sun 重要性: 0.0 特征: weekday_Thurs 重要性: 0.0 特征: weekday_Tues 重要性: 0.0 特征: weekday_Wed 重要性: 0.0
# Plot style
# (The `plt.style.use(` prefix was missing from the listing.)
plt.style.use('fivethirtyeight')

# Bar positions
x_values = list(range(len(importances)))

# Bar chart of the importances
# (The `plt.bar(x_values` prefix was missing from the listing.)
plt.bar(x_values, importances, orientation = 'vertical', color = 'r', edgecolor = 'k', linewidth = 1.2)

# Feature names on the x axis, written vertically
plt.xticks(x_values, feature_list, rotation='vertical')

# Axis labels and title
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('Variable Importances');

# Split the sorted (name, importance) pairs into two parallel lists
sorted_importances = [importance[1] for importance in feature_importances]
sorted_features = [importance[0] for importance in feature_importances]

# Cumulative importance
cumulative_importances = np.cumsum(sorted_importances)

# Line chart of the cumulative importance
plt.plot(x_values, cumulative_importances, 'g-')

# Dashed red line at the 0.95 level
plt.hlines(y = 0.95, xmin=0, xmax=len(sorted_importances), color = 'r', linestyles = 'dashed')

# X axis
plt.xticks(x_values, sorted_features, rotation = 'vertical')

# Axis labels and title
plt.xlabel('Variable')
plt.ylabel('Cumulative Importance')
plt.title('Cumulative Importances');

# How many features are needed to exceed 95% cumulative importance
print('Number of features for 95% importance:', np.where(cumulative_importances > 0.95)[0][0] + 1)

#Number of features for 95% importance: 5

# Names of the five most important features
important_feature_names = [feature[0] for feature in feature_importances[0:5]]
# Column indices of those features
important_indices = [feature_list.index(feature) for feature in important_feature_names]

# Rebuild the training and test sets with only those columns
important_train_features = train_features[:, important_indices]
important_test_features = test_features[:, important_indices]

# Resulting shapes
print('Important train features shape:', important_train_features.shape)
print('Important test features shape:', important_test_features.shape)
Important train features shape: (1643, 5)
Important test features shape: (548, 5)
# Retrain the model on the important features only.
# (The `rf_exp.fit(important_train_features` prefix was missing from the listing.)
rf_exp.fit(important_train_features, train_labels);

# Same test set, restricted to the same columns
predictions = rf_exp.predict(important_test_features)

# Evaluate
errors = abs(predictions - test_labels)

print('平均温度误差:', round(np.mean(errors), 2), 'degrees.')

mape = 100 * (errors / test_labels)

# Accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')
平均温度误差: 4.11 degrees.
Accuracy: 93.28 %.
# Measure the run time
import time

# This round uses all features
all_features_time = []

# A single run may be noisy, so average over 10 runs
for _ in range(10):
    start_time = time.time()
    # (The `rf_exp.fit(train_features` prefix was missing from the listing.)
    rf_exp.fit(train_features, train_labels)
    all_features_predictions = rf_exp.predict(test_features)
    end_time = time.time()
    all_features_time.append(end_time - start_time)

all_features_time = np.mean(all_features_time)
print('使用所有特征时建模与测试的平均时间消耗:', round(all_features_time, 2), '秒.')
使用所有特征时建模与测试的平均时间消耗: 0.7 秒.
# This round uses only the important features
reduced_features_time = []

# A single run may be noisy, so average over 10 runs
for _ in range(10):
    start_time = time.time()
    # (The `rf_exp.fit(important_train_features` prefix was missing from the listing.)
    rf_exp.fit(important_train_features, train_labels)
    reduced_features_predictions = rf_exp.predict(important_test_features)
    end_time = time.time()
    reduced_features_time.append(end_time - start_time)

reduced_features_time = np.mean(reduced_features_time)
# The message used to say "all features" (copied from the previous cell);
# this cell measures the reduced feature set.
print('使用部分重要特征时建模与测试的平均时间消耗:', round(reduced_features_time, 2), '秒.')
使用所有特征时建模与测试的平均时间消耗: 0.44 秒.
# Score each set of predictions
all_accuracy = 100 * (1- np.mean(abs(all_features_predictions - test_labels) / test_labels))
reduced_accuracy = 100 * (1- np.mean(abs(reduced_features_predictions - test_labels) / test_labels))

# Collect the results in a DataFrame
comparison = pd.DataFrame({'features': ['all (17)', 'reduced (5)'],
                           'run_time': [round(all_features_time, 2), round(reduced_features_time, 2)],
                           'accuracy': [round(all_accuracy, 2), round(reduced_accuracy, 2)]})

# Show the columns in a fixed order
comparison[['features', 'accuracy', 'run_time']]
features accuracy run_time
0 all (17) 93.35 0.70
1 reduced (5) 93.28 0.44
# Relative loss in accuracy when moving from all features to the reduced set
relative_accuracy_decrease = 100 * (all_accuracy - reduced_accuracy) / all_accuracy
print('相对accuracy下降:', round(relative_accuracy_decrease, 3), '%.')

# Relative saving in run time for the same change
relative_runtime_decrease = 100 * (all_features_time - reduced_features_time) / all_features_time
print('相对时间效率提升:', round(relative_runtime_decrease, 3), '%.')
相对accuracy下降: 0.071 %.
相对时间效率提升: 38.248 %.
# A stray copy of the summary-plot cell used to precede this code and read
# `model_comparison` before it was created; the plot itself follows once the
# frame has been built.

# Find the original feature indices
original_feature_indices = [feature_list.index(feature) for feature in
                            feature_list if feature not in
                            ['ws_1', 'prcp_1', 'snwd_1']]

# Create a test set of the original features
original_test_features = test_features[:, original_feature_indices]

# Time to train on original data set (1 year)
original_features_time = []

# Do 10 iterations and take average for all features
for _ in range(10):
    start_time = time.time()
    # (The `rf.fit(original_train_features` prefix was missing from the listing.)
    # NOTE(review): original_train_features must have the same number of rows
    # as original_train_labels; the baseline cell rebinds
    # original_train_features to a slice of the new data set - confirm which
    # array is live before running this cell.
    rf.fit(original_train_features, original_train_labels)
    original_features_predictions = rf.predict(original_test_features)
    end_time = time.time()
    original_features_time.append(end_time - start_time)

original_features_time = np.mean(original_features_time)

# Calculate mean absolute error for each model
original_mae = np.mean(abs(original_features_predictions - test_labels))
exp_all_mae = np.mean(abs(all_features_predictions - test_labels))
exp_reduced_mae = np.mean(abs(reduced_features_predictions - test_labels))

# Calculate accuracy for model trained on 1 year of data
original_accuracy = 100 * (1 - np.mean(abs(original_features_predictions - test_labels) / test_labels))

# Create a dataframe for comparison
model_comparison = pd.DataFrame({'model': ['original', 'exp_all', 'exp_reduced'],
                                 'error (degrees)': [original_mae, exp_all_mae, exp_reduced_mae],
                                 'accuracy': [original_accuracy, all_accuracy, reduced_accuracy],
                                 'run_time (s)': [original_features_time, all_features_time, reduced_features_time]})

# Order the dataframe
model_comparison = model_comparison[['model', 'error (degrees)', 'accuracy', 'run_time (s)']]
model error (degrees) accuracy run_time (s)
0 original 4.667628 92.202816 0.176642
1 exp_all 4.049051 93.349629 0.704933
2 exp_reduced 4.113084 93.283485 0.435311
# Summarise the comparison with plots.
# Overall layout: a single row of three charts reads best.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize = (16,5), sharex = True)

# X axis
x_values = [0, 1, 2]
# NOTE: this rebinds the module-level name `labels` to the model names.
labels = list(model_comparison['model'])
plt.xticks(x_values, labels)

# Font sizes
fontdict = {'fontsize': 18}
fontdict_yaxis = {'fontsize': 14}

# Prediction error of each model.
# (The `axN.bar(x_values` prefix was missing from all three bar calls.)
ax1.bar(x_values, model_comparison['error (degrees)'], color = ['b', 'r', 'g'], edgecolor = 'k', linewidth = 1.5)
ax1.set_ylim(bottom = 3.5, top = 4.5)
ax1.set_ylabel('Error (degrees) (F)', fontdict = fontdict_yaxis);
ax1.set_title('Model Error Comparison', fontdict= fontdict)

# Accuracy of each model
ax2.bar(x_values, model_comparison['accuracy'], color = ['b', 'r', 'g'], edgecolor = 'k', linewidth = 1.5)
ax2.set_ylim(bottom = 92, top = 94)
ax2.set_ylabel('Accuracy (%)', fontdict = fontdict_yaxis);
ax2.set_title('Model Accuracy Comparison', fontdict= fontdict)

# Run time of each model
ax3.bar(x_values, model_comparison['run_time (s)'], color = ['b', 'r', 'g'], edgecolor = 'k', linewidth = 1.5)
ax3.set_ylim(bottom = 0, top = 1)
ax3.set_ylabel('Run Time (sec)', fontdict = fontdict_yaxis);
ax3.set_title('Model Run-Time Comparison', fontdict= fontdict);
import pandas as pd
features = pd.read_csv('data/temps_extended.csv')

# One-hot encode, then separate the label from the features
features = pd.get_dummies(features)

labels = features['actual']
features = features.drop('actual', axis = 1)

feature_list = list(features.columns)

import numpy as np

features = np.array(features)
labels = np.array(labels)

from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(features, labels,
                                                                            test_size = 0.25, random_state = 42)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

print('{:0.1f} years of data in the training set'.format(train_features.shape[0] / 365.))
print('{:0.1f} years of data in the test set'.format(test_features.shape[0] / 365.))

# Keep only the features used for hyper-parameter tuning
important_feature_names = ['temp_1', 'average', 'ws_1', 'temp_2', 'friend', 'year']

important_indices = [feature_list.index(feature) for feature in important_feature_names]

important_train_features = train_features[:, important_indices]
important_test_features = test_features[:, important_indices]

print('Important train features shape:', important_train_features.shape)
print('Important test features shape:', important_test_features.shape)

# From here on the train/test sets and the feature list refer to the
# reduced feature set.
train_features = important_train_features[:]
test_features = important_test_features[:]

feature_list = important_feature_names[:]

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state = 42)

from pprint import pprint

# Print every parameter of the model
pprint(rf.get_params())

from sklearn.model_selection import RandomizedSearchCV

# Number of trees
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# How many features to consider at each split.
# 1.0 (all features) is what the legacy 'auto' option meant for
# RandomForestRegressor; 'auto' itself was removed in scikit-learn 1.3.
max_features = [1.0, 'sqrt']
# Maximum tree depth
max_depth = [int(x) for x in np.linspace(10, 20, num = 2)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples per leaf; no split may leave a child with fewer
min_samples_leaf = [1, 2, 4]
# Whether to bootstrap the samples
bootstrap = [True, False]

# Random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
- estimator:RandomizedSearchCV是一个通用的、并不是专为随机森林设计的函数,所以需要指定选择的算法模型是什么。
- param_distributions:参数的候选空间,上述代码中已经用字典格式给出了所需的参数分布。
- n_iter:随机寻找参数组合的个数,例如,n_iter=100,代表接下来要随机找100组参数的组合,在其中找到最好的。
- scoring:评估方法,按照该方法去找最好的参数组合。
- cv:交叉验证,之前已经介绍过。
- verbose:打印信息的数量,根据自己的需求。
- random_state:随机种子,为了使得结果能够一致,排除掉随机成分的干扰,一般都会指定成一个值,用你自己的幸运数字就好。
- n_jobs:多线程来跑这个程序,如果是−1,就会用所有的,但是可能会有点卡。即便把n_jobs设置成−1,程序运行得还是有点慢,因为要建立100次模型来选择参数,并且带有3折交叉验证,那就相当于300个任务。
# Randomly search for the best parameter combination
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter = 100, scoring='neg_mean_absolute_error',
                               cv = 3, verbose=2, random_state=42, n_jobs=-1)

# Run the search.
# (The `rf_random.fit(train_features` prefix was missing from the listing.)
rf_random.fit(train_features, train_labels)

# Best parameter combination found
rf_random.best_params_
{'n_estimators': 1400, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 10, 'bootstrap': True}
接下来,对比经过随机调参后的结果和用默认参数结果的差异,所有默认参数在API中都有说明,例如n_estimators:integer,optional (default=10),表示在随机森林模型中,默认要建立树的个数是10(注意:从scikit-learn 0.22版本开始,该默认值已改为100)。
def evaluate(model, test_features, test_labels):
    """Print the mean absolute error and the accuracy of a fitted model.

    Accuracy is defined as 100 minus the mean absolute percentage error.

    Args:
        model: Fitted estimator exposing ``predict(test_features)``.
        test_features: Features passed unchanged to ``model.predict``.
        test_labels: True target values, same length as the predictions.
            Assumed to contain no zeros, since each error is divided by
            its label.

    Returns:
        None. The two metrics are only printed.
    """
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape

    print('平均气温误差.',np.mean(errors))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
# Baseline: a forest with default hyper-parameters
base_model = RandomForestRegressor( random_state = 42)
# (The `base_model.fit(train_features` prefix was missing from the listing.)
base_model.fit(train_features, train_labels)
evaluate(base_model, test_features, test_labels)
平均气温误差. 3.829032846715329
Accuracy = 93.56%.
# Evaluate the best estimator found by the randomized search
best_random = rf_random.best_estimator_
evaluate(best_random, test_features, test_labels)
平均气温误差. 3.7145380641444214
Accuracy = 93.73%.
from sklearn.model_selection import GridSearchCV

# Grid search space, centred on the result of the randomized search.
# max_features=1.0 (all features) is what the legacy 'auto' option meant for
# RandomForestRegressor; 'auto' itself was removed in scikit-learn 1.3.
param_grid = {
    'bootstrap': [True],
    'max_depth': [8,10,12],
    'max_features': [1.0],
    'min_samples_leaf': [2,3, 4, 5,6],
    'min_samples_split': [3, 5, 7],
    'n_estimators': [800, 900, 1000, 1200]
}

# Base model
rf = RandomForestRegressor()

# Grid search
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                           scoring = 'neg_mean_absolute_error', cv = 3,
                           n_jobs = -1, verbose = 2)

# Run the search.
# (The `grid_search.fit(train_features` prefix was missing from the listing.)
grid_search.fit(train_features, train_labels)

# Best parameter combination found
grid_search.best_params_
{'bootstrap': True, 'max_depth': 12, 'max_features': 'auto', 'min_samples_leaf': 6, 'min_samples_split': 3, 'n_estimators': 900}
# Evaluate the best estimator found by the grid search
best_grid = grid_search.best_estimator_
evaluate(best_grid, test_features, test_labels)
平均气温误差. 3.6813587581120273
Accuracy = 93.78%.
# A second grid around the previous optimum.
# max_features=1.0 (all features) is what the legacy 'auto' option meant for
# RandomForestRegressor; 'auto' itself was removed in scikit-learn 1.3.
param_grid = {
    'bootstrap': [True],
    'max_depth': [12, 15, None],
    'max_features': [3, 4, 1.0],
    'min_samples_leaf': [5, 6, 7],
    'min_samples_split': [7,10,13],
    'n_estimators': [900, 1000, 1200]
}

# Base model
rf = RandomForestRegressor()

# Keep searching
grid_search_ad = GridSearchCV(estimator = rf, param_grid = param_grid,
                              scoring = 'neg_mean_absolute_error', cv = 3,
                              n_jobs = -1, verbose = 2)

# (The `grid_search_ad.fit(train_features` prefix was missing from the listing.)
grid_search_ad.fit(train_features, train_labels)

# Best parameter combination found
grid_search_ad.best_params_
{'bootstrap': True, 'max_depth': 12, 'max_features': 4, 'min_samples_leaf': 7, 'min_samples_split': 13, 'n_estimators': 1200}
# Evaluate the best estimator found by the second grid search
best_grid_ad = grid_search_ad.best_estimator_
evaluate(best_grid_ad, test_features, test_labels)
平均气温误差. 3.6642196127491156
Accuracy = 93.82%.
# Show every parameter of the final model
print('最终模型参数:\n')
pprint(best_grid_ad.get_params())
最终模型参数: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': 12, 'max_features': 4, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 7, 'min_samples_split': 13, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 1200, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
- 1.参数空间是非常重要的,它会对结果产生决定性的影响,所以在任务开始之前,需要选择一个大致合适的区间,可以参考一些相同任务论文中的经验值。
- 2.随机搜索相对更节约时间,尤其是在任务开始阶段,并不知道参数在哪一个位置,效果可能更好时,可以把参数间隔设置得稍微大一些,用随机方法确定一个大致的位置。
- 3.网格搜索相当于地毯式搜索,需要遍历参数空间中每一种可能的组合,相对速度更慢,可以搭配随机搜索一起使用。
- 4.调参的方法还有很多,例如贝叶斯优化,这个还是很有意思的,跟大家简单说一下,试想之前的调参方式,是不是每一个都是独立地进行,不会对之后的结果产生任何影响?贝叶斯优化的基本思想在于,每一个优化都是在不断积累经验,这样会慢慢得到最终的解应当在的位置,相当于前一步结果会对后面产生影响,如果大家对贝叶斯优化感兴趣,可以参考Hyperopt工具包,用起来很简便: 这是一份参考:
pip install hyperopt /* Looking in indexes: ... Successfully built networkx Installing collected packages: cloudpickle, networkx, tqdm, hyperopt Attempting uninstall: networkx Found existing installation: networkx 2.4 Uninstalling networkx-2.4: Successfully uninstalled networkx-2.4 Successfully installed cloudpickle-1.3.0 hyperopt-0.2.3 networkx-2.2 tqdm-4.45.0