from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, ShuffleSplit,GridSearchCV
from sklearn import model_selection
def _rank_features_by_grid_search(estimator, param_grid, label, sample_header,
                                  train_x, train_y, top_n_features):
    """Grid-search one estimator and rank features by its importances.

    Args:
        estimator: Unfitted scikit-learn estimator whose fitted form exposes
            ``feature_importances_``.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        label: Short model tag (e.g. ``'RF'``) inserted into the progress
            messages.
        sample_header: Line printed just before the first ten selected
            feature names.
        train_x: Training features. ``list(train_x)`` must yield one name per
            feature column (true for a pandas DataFrame).
        train_y: Training targets.
        top_n_features: Number of highest-importance features to keep.

    Returns:
        A ``(top_features, importance_table)`` tuple: a Series holding the
        names of the ``top_n_features`` most important features, and the
        full ``feature`` / ``importance`` DataFrame sorted by descending
        importance.
    """
    grid = model_selection.GridSearchCV(
        estimator, param_grid, n_jobs=-1, cv=5, verbose=1)
    grid.fit(train_x, train_y)

    print('Top N Features Best ' + label + ' Params:' + str(grid.best_params_))
    print('Top N Features Best ' + label + ' Score:' + str(grid.best_score_))
    print('Top N Features ' + label + ' Train Score:'
          + str(grid.score(train_x, train_y)))

    importance_table = pd.DataFrame({
        'feature': list(train_x),
        'importance': grid.best_estimator_.feature_importances_,
    }).sort_values('importance', ascending=False)
    top_features = importance_table.head(top_n_features)['feature']

    print(sample_header)
    print(str(top_features[:10]))
    return top_features, importance_table


def get_top_n_features(titanic_train_data_X, titanic_train_data_Y, top_n_features):
    """Select the most important features according to five tree-based models.

    A random forest, AdaBoost, extra-trees, gradient boosting and a single
    decision tree regressor are each tuned with a 5-fold ``GridSearchCV``,
    in that order. For every model the best parameters, best
    cross-validation score and training-set score are printed, and the
    ``top_n_features`` features with the highest importance are kept.

    Args:
        titanic_train_data_X: Training features. ``list(...)`` of this object
            is used as the feature names, so it is expected to be a pandas
            DataFrame.
        titanic_train_data_Y: Training targets.
        top_n_features: Number of top features taken from each model.

    Returns:
        A ``(features_top_n, features_importance)`` tuple.
        ``features_top_n`` is a Series with the de-duplicated union of every
        model's top features; ``features_importance`` is a DataFrame that
        stacks the sorted ``feature`` / ``importance`` tables of all five
        models.
    """
    # (estimator, parameter grid, message tag, header printed before sample).
    # The header strings are intentionally kept verbatim per model.
    model_specs = [
        (RandomForestRegressor(random_state=0),
         {'n_estimators': [400], 'min_samples_split': [2, 3], 'max_depth': [10]},
         'RF', 'Sample 10 Features from RF Classifier'),
        (AdaBoostRegressor(random_state=0),
         {'n_estimators': [500], 'learning_rate': [0.01, 0.1]},
         'Ada', 'Sample 10 Feature from Ada Classifier:'),
        (ExtraTreesRegressor(random_state=0),
         {'n_estimators': [500], 'min_samples_split': [3, 4], 'max_depth': [10]},
         'ET', 'Sample 10 Features from ET Classifier:'),
        (GradientBoostingRegressor(random_state=0),
         {'n_estimators': [500], 'learning_rate': [0.01, 0.1], 'max_depth': [10]},
         'GB', 'Sample 10 Feature from GB Classifier:'),
        (DecisionTreeRegressor(random_state=0),
         {'min_samples_split': [2, 4], 'max_depth': [10]},
         'DT', 'Sample 10 Features from DT Classifier:'),
    ]

    top_feature_series = []
    importance_tables = []
    for estimator, param_grid, label, sample_header in model_specs:
        # Every model, including the random forest, honours top_n_features.
        top_features, importance_table = _rank_features_by_grid_search(
            estimator, param_grid, label, sample_header,
            titanic_train_data_X, titanic_train_data_Y, top_n_features)
        top_feature_series.append(top_features)
        importance_tables.append(importance_table)

    # A feature picked by several models must appear only once in the result.
    features_top_n = pd.concat(
        top_feature_series, ignore_index=True).drop_duplicates()
    features_importance = pd.concat(importance_tables, ignore_index=True)
    return features_top_n, features_importance
# Number of top features requested from each model (passed as top_n_features).
feature_to_pick = 30
feature_top_n, feature_importance = get_top_n_features(train_input, train_output, feature_to_pick)
# Keep only the selected feature columns, using the same selection for the
# training and the test data so both end up with identical columns.
train_input = pd.DataFrame(train_input[feature_top_n])
test_data = pd.DataFrame(test_data[feature_top_n])