from sklearn.model_selection import GridSearchCV

# Candidate values for LogisticRegression's C parameter: 1e-05 up to (but not
# including) 3, in steps of 0.1.
param_grid = {'C': np.arange(1e-05, 3, 0.1)}

# Metrics recorded for every candidate; the dict keys are the labels under
# which each metric is reported.
scoring = {
    'Accuracy': 'accuracy',
    'AUC': 'roc_auc',
    'Log_loss': 'neg_log_loss',
}

# 10-fold grid search over C. With several metrics, `refit` names the one used
# to pick the final estimator.
gs = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring=scoring,
    refit='Accuracy',
    cv=10,
    return_train_score=True,
)
def train_model(model, param_grid=None, X=None, y=None, splits=5, repeats=5):
    """Cross-validate (and optionally grid-search) a model, then report its fit.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``.
    param_grid : dict or list of dict, optional
        Parameter grid handed to ``GridSearchCV``. When ``None`` or empty no
        search is run and ``model`` is only cross-validated.
    X, y : optional
        Training features and target. When ``y`` is ``None`` or empty both are
        loaded with ``get_training_data()``. ``y`` must expose ``.index``
        (e.g. a pandas Series); it is reused to index the predictions.
    splits, repeats : int
        ``n_splits`` and ``n_repeats`` of the ``RepeatedKFold`` splitter.

    Returns
    -------
    model
        The estimator fitted on all of ``X``/``y`` (the grid's best estimator
        when a search was run).
    cv_score : pandas.Series
        ``'mean'`` and ``'std'`` of the cross-validated ``rmse_scorer`` score;
        the mean is reported as an absolute value.
    grid_results : pandas.DataFrame or list
        ``cv_results_`` of the grid search, or ``[]`` when none was run.
    """
    # get unmodified training data, unless data to use already specified
    if y is None or len(y) == 0:
        X, y = get_training_data()

    # create cross-validation method
    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)

    # perform a grid search if param_grid given
    if param_grid is not None and len(param_grid) > 0:
        # setup grid search parameters
        gsearch = GridSearchCV(model, param_grid, cv=rkfold,
                               scoring=rmse_scorer, verbose=1,
                               return_train_score=True)

        # search the grid
        gsearch.fit(X, y)

        # extract best model from the grid
        model = gsearch.best_estimator_
        best_idx = gsearch.best_index_

        # get cv-scores for best model
        grid_results = pd.DataFrame(gsearch.cv_results_)
        cv_mean = abs(grid_results.loc[best_idx, 'mean_test_score'])
        cv_std = grid_results.loc[best_idx, 'std_test_score']

    # no grid search, just cross-val score for given model
    else:
        grid_results = []
        cv_results = cross_val_score(model, X, y, scoring=rmse_scorer,
                                     cv=rkfold)
        cv_mean = abs(np.mean(cv_results))
        cv_std = np.std(cv_results)

        # cross_val_score only fits clones of the estimator, so fit the model
        # itself on the full data; otherwise predict()/score() below run on
        # an unfitted estimator.
        model.fit(X, y)

    # combine mean and std cv-score in to a pandas series
    cv_score = pd.Series({'mean': cv_mean, 'std': cv_std})

    # predict y using the fitted model
    y_pred = model.predict(X)

    # print stats on model performance
    print('----------------------')
    print(model)
    print('----------------------')
    print('score=', model.score(X, y))
    print('rmse=', rmse(y, y_pred))
    print('cross_val: mean=', cv_mean, ', std=', cv_std)

    # residual plots
    y_pred = pd.Series(y_pred, index=y.index)
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    z = (resid - mean_resid) / std_resid
    # number of samples more than 3 standard deviations from the mean residual
    n_outliers = int((z.abs() > 3).sum())

    plt.figure(figsize=(15, 5))

    # panel 1: predicted vs. true values
    plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y_pred')
    plt.title('corr = {:.3f}'.format(np.corrcoef(y, y_pred)[0][1]))

    # panel 2: residual vs. true value
    plt.subplot(1, 3, 2)
    plt.plot(y, resid, '.')
    plt.xlabel('y')
    plt.ylabel('y - y_pred')
    plt.title('std resid = {:.3f}'.format(std_resid))

    # panel 3: histogram of standardised residuals
    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    plt.xlabel('z')
    plt.title('{:.0f} samples with z>3'.format(n_outliers))

    return model, cv_score, grid_results
def find_outliers(model, X, y, sigma=3):
    """Return the index labels of samples whose residual is an outlier.

    A sample is an outlier when the z-score of its residual
    (``y - model.predict(X)``) exceeds ``sigma`` in absolute value. Summary
    statistics are printed and three diagnostic plots are drawn.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``. If ``predict``
        fails, the model is fitted on ``X``/``y`` and prediction is retried.
    X : array-like
        Feature matrix.
    y : pandas.Series
        Target values; its index labels the predictions and the result.
    sigma : number, default 3
        Threshold on ``|z|`` above which a sample counts as an outlier.

    Returns
    -------
    pandas.Index
        Index labels of the outlying samples.
    """
    # predict y values using model
    try:
        raw_pred = model.predict(X)
    except Exception:
        # if predicting fails, try fitting the model first. Catch Exception
        # rather than everything so Ctrl-C / SystemExit are not swallowed.
        model.fit(X, y)
        raw_pred = model.predict(X)
    y_pred = pd.Series(raw_pred, index=y.index)

    # calculate residuals between the model prediction and true y values
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()

    # calculate z statistic, define outliers to be where |z|>sigma
    z = (resid - mean_resid) / std_resid
    outliers = z[abs(z) > sigma].index

    # print and plot the results
    print('R2=', model.score(X, y))
    print('rmse=', rmse(y, y_pred))
    print('---------------------------------------')

    print('mean of residuals:', mean_resid)
    print('std of residuals:', std_resid)
    print('---------------------------------------')

    print(len(outliers), 'outliers:')
    print(outliers.tolist())

    plt.figure(figsize=(15, 5))

    # panel 1: predicted vs. true values, outliers highlighted in red
    plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y_pred')

    # panel 2: residual vs. true value, outliers highlighted in red
    plt.subplot(1, 3, 2)
    plt.plot(y, resid, '.')
    plt.plot(y.loc[outliers], resid.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y - y_pred')

    # panel 3: histogram of z-scores with the outliers overlaid in red
    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('z')

    # plt.savefig('outliers.png')

    return outliers
def rmsle_cv(model):
    """Return the per-fold cross-validated RMSE of ``model``.

    Reads the module-level ``n_folds``, ``train`` and ``y_train``.
    NOTE(review): the name suggests ``y_train`` is already log-transformed,
    which would make this an RMSLE -- confirm against where it is built.

    Parameters
    ----------
    model : estimator
        Estimator accepted by ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold.
    """
    # Pass the splitter itself as `cv`. Calling .get_n_splits() on it returns
    # just the fold count, which discards shuffle=True and random_state=42.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, train.values, y_train,
                              scoring="neg_mean_squared_error", cv=kf)
    # The scorer yields negated MSE; flip the sign before taking the root.
    return np.sqrt(-neg_mse)
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    Parameters
    ----------
    y_true, y_pred : array-like
        NumPy arrays, pandas Series or plain lists/tuples of numbers. pandas
        inputs are subtracted as-is, so index alignment applies.

    Returns
    -------
    float
        ``sqrt(sum((y_pred - y_true) ** 2) / len(y_pred))``. For 2-D input the
        sum runs over rows, giving one value per column.

    Raises
    ------
    ZeroDivisionError
        If ``y_pred`` is empty.
    """
    # Plain sequences do not support element-wise subtraction; promote them.
    if isinstance(y_pred, (list, tuple)):
        y_pred = np.asarray(y_pred)
    if isinstance(y_true, (list, tuple)):
        y_true = np.asarray(y_true)

    n = len(y_pred)
    if n == 0:
        # Same exception type a plain 0 / 0 would give, with a clearer message.
        raise ZeroDivisionError("rmse is undefined for zero samples")

    # Subtract before converting so pandas inputs still align on their index,
    # then reduce in NumPy instead of looping with the builtin sum().
    diff = np.asarray(y_pred - y_true)
    sum_sq = np.sum(diff ** 2, axis=0)
    return np.sqrt(sum_sq / n)
# Scorer wrapping rmse, to be used in sklearn model fitting (train_model passes
# it as `scoring=` to GridSearchCV and cross_val_score).
# greater_is_better=False declares rmse a loss, i.e. lower values are better.
# NOTE(review): sklearn presumably reports such scores sign-flipped, which
# would explain the abs() that train_model applies to the CV mean -- confirm.
rmse_scorer = make_scorer(rmse, greater_is_better=False)
# Fit a PCA with default settings on X and plot how the cumulative explained
# variance ratio grows along the components.
pca = decomposition.PCA().fit(X)

plt.figure(figsize=(10, 7))
plt.plot(pca.explained_variance_ratio_.cumsum(), color='k', linewidth=2)
plt.xlabel('Number of components')
plt.ylabel('Total explained variance')
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode every ``object``-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    nan_as_category : bool, default True
        Forwarded to ``pd.get_dummies`` as ``dummy_na``: when true, missing
        values get an indicator column of their own.

    Returns
    -------
    df : pandas.DataFrame
        New frame with the object columns replaced by indicator columns.
    new_columns : list
        Labels of the columns absent from the input, in output order.
    """
    # A set makes the membership test below O(1) per output column.
    original_columns = set(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns
# Aggregations applied per SK_ID_BUREAU group: min/max/size of MONTHS_BALANCE
# plus the mean of every column listed in bb_cat.
bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
for col in bb_cat:
    bb_aggregations[col] = ['mean']

bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)

# Flatten the (column, statistic) pairs into single labels "<COLUMN>_<STAT>",
# with the statistic upper-cased.
bb_agg.columns = pd.Index(
    ['_'.join((name, stat.upper())) for name, stat in bb_agg.columns.tolist()]
)