这次训练数据来自下面文件:
POS_CASH_balance.csv
1_2_train.csv,聚合了(application_train.csv, bureau.csv)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
Load data
# Load the pre-aggregated train/test tables and the POS_CASH feature file.
train_data = pd.read_csv('data/1_2_train.csv')
test_data = pd.read_csv('data/1_2_test.csv')
pos_cash = pd.read_csv('data/pos_cash.csv')

# Report the raw shapes so the post-merge growth is easy to eyeball.
for _label, _frame in (('train_data ', train_data),
                       ('test_data ', test_data),
                       ('pos_cash', pos_cash)):
    print(_label, _frame.shape)
train_data (307511, 326)
test_data (48744, 325)
pos_cash (337252, 84)
聚合pos_cash.csv中的特征
# Left-join the POS_CASH aggregate features onto both tables by client id,
# keeping every application row even when it has no POS_CASH history.
train_data = pd.merge(train_data, pos_cash, on='SK_ID_CURR', how='left')
test_data = pd.merge(test_data, pos_cash, on='SK_ID_CURR', how='left')
# Notebook display cells: confirm the added feature columns.
train_data.shape
test_data.shape
def model(features, test_features, n_folds = 10):
    """Train a LightGBM binary classifier with K-fold cross-validation.

    Parameters
    ----------
    features : pd.DataFrame
        Training data; must contain 'SK_ID_CURR' and 'TARGET' columns.
    test_features : pd.DataFrame
        Test data; must contain 'SK_ID_CURR' and the same feature columns.
    n_folds : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    submission : pd.DataFrame
        'SK_ID_CURR' / 'TARGET' columns with fold-averaged test probabilities.
    metrics : pd.DataFrame
        Per-fold and overall train/valid AUC scores.
    feature_importances : pd.DataFrame
        Fold-averaged feature importances.
    """
    # Keep the id columns for the submission file.
    train_ids = features['SK_ID_CURR']
    test_ids = test_features['SK_ID_CURR']
    # Flatten TARGET to a Series (a one-column DataFrame triggers shape
    # warnings in fit() and roc_auc_score()).
    labels = features['TARGET'].astype(int)
    # Drop id/target so only model features remain.
    features = features.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
    test_features = test_features.drop(['SK_ID_CURR'], axis = 1)
    feature_names = list(features.columns)
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)
    # Fold-averaged test-set predictions.
    test_predictions = np.zeros(test_features.shape[0])
    # Out-of-fold predictions, used for the overall validation AUC.
    out_of_fold = np.zeros(features.shape[0])
    # Fold-averaged feature importances.
    feature_importance_values = np.zeros(len(feature_names))
    valid_scores = []
    train_scores = []
    for count, (train_indices, valid_indices) in enumerate(k_fold.split(features), start = 1):
        # KFold yields POSITIONAL indices, so select with .iloc, not .loc:
        # .loc only works by accident on a default RangeIndex and silently
        # mis-selects (or raises) on any other index.
        train_features = features.iloc[train_indices]
        train_labels = labels.iloc[train_indices]
        valid_features = features.iloc[valid_indices]
        valid_labels = labels.iloc[valid_indices]
        # Named `clf` so it does not shadow this function's own name.
        clf = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary',
                                 class_weight = 'balanced', learning_rate = 0.05,
                                 reg_alpha = 0.1, reg_lambda = 0.1,
                                 subsample = 0.8, n_jobs = -1, random_state = 50)
        # Early stopping on the validation fold; this file targets the
        # lightgbm 3.x API (early_stopping_rounds/verbose keyword args).
        clf.fit(train_features, train_labels, eval_metric = 'auc',
                eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                eval_names = ['valid', 'train'], categorical_feature = 'auto',
                early_stopping_rounds = 150, verbose = 200)
        best_iteration = clf.best_iteration_
        # Average the test predictions over the folds.
        test_predictions += clf.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / n_folds
        # Record out-of-fold predictions at the same positional slots.
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
        feature_importance_values += clf.feature_importances_ / n_folds
        # Best scores at the early-stopping iteration.
        valid_scores.append(clf.best_score_['valid']['auc'])
        train_scores.append(clf.best_score_['train']['auc'])
        # Free the fold's model and data before the next iteration.
        gc.enable()
        del clf, train_features, valid_features
        gc.collect()
        print("%d_fold is over" % count)
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    # Overall validation AUC from the combined out-of-fold predictions.
    valid_auc = roc_auc_score(labels, out_of_fold)
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})
    return submission, metrics, feature_importances
# Run the 10-fold CV pipeline on the merged data; keep the submission frame,
# per-fold train/valid AUC metrics, and fold-averaged feature importances.
submit7, metrics, feature_importances = model(train_data, test_data, n_folds=10)
Training until validation scores don't improve for 150 rounds.
[200] train's auc: 0.818082 valid's auc: 0.78171
[400] train's auc: 0.851286 valid's auc: 0.782229
Early stopping, best iteration is:
[327] train's auc: 0.840386 valid's auc: 0.782819
1_fold is over
Training until validation scores don't improve for 150 rounds.
[200] train's auc: 0.818656 valid's auc: 0.770237
[400] train's auc: 0.851714 valid's auc: 0.771625
Early stopping, best iteration is:
[274] train's auc: 0.832188 valid's auc: 0.771894
2_fold is over
Training until validation scores don't improve for 150 rounds.
[200] train's auc: 0.818428 valid's auc: 0.777017
[400] train's auc: 0.852307 valid's auc: 0.778279
[600] train's auc: 0.877749 valid's auc: 0.778526
Early stopping, best iteration is:
[527] train's auc: 0.869398 valid's auc: 0.778803
3_fold is over
Training until validation scores don't improve for 150 rounds.
[200] train's auc: 0.818863 valid's auc: 0.776356
[400] train's auc: 0.852349 valid's auc: 0.776951
Early stopping, best iteration is:
[353] train's auc: 0.845446 valid's auc: 0.777577
4_fold is over
Training until validation scores don't improve for 150 rounds.
[200] train's auc: 0.818512 valid's auc: 0.778107
[400] train's auc: 0.851824 valid's auc: 0.779782
[600] train's auc: 0.877395 valid's auc: 0.780289
Early stopping, best iteration is:
[472] train's auc: 0.861744 valid's auc: 0.780356
5_fold is over
Training until validation scores don't improve for 150 rounds.
[200] train's auc: 0.818 valid's auc: 0.78523
[400] train's auc: 0.851157 valid's auc: 0.787304
Early stopping, best iteration is:
[380] train's auc: 0.848455 valid's auc: 0.787651
6_fold is over
Training until validation scores don't improve for 150 rounds.
[200] train's auc: 0.817766 valid's auc: 0.781544
[400] train's auc: 0.851246 valid's auc: 0.782898
Early stopping, best iteration is:
[391] train's auc: 0.850062 valid's auc: 0.783152
7_fold is over
Training until validation scores don't improve for 150 rounds.
[200] train's auc: 0.818635 valid's auc: 0.773706
[400] train's auc: 0.852069 valid's auc: 0.775681
[600] train's auc: 0.877107 valid's auc: 0.775456
Early stopping, best iteration is:
[482] train's auc: 0.862897 valid's auc: 0.776128
8_fold is over
Training until validation scores don't improve for 150 rounds.
[200] train's auc: 0.81832 valid's auc: 0.780384
[400] train's auc: 0.851639 valid's auc: 0.782054
Early stopping, best iteration is:
[428] train's auc: 0.855628 valid's auc: 0.782321
9_fold is over
Training until validation scores don't improve for 150 rounds.
[200] train's auc: 0.819223 valid's auc: 0.771637
[400] train's auc: 0.852492 valid's auc: 0.772208
Early stopping, best iteration is:
[377] train's auc: 0.849162 valid's auc: 0.772556
10_fold is over
# Preview the first rows of the submission (SK_ID_CURR + predicted TARGET).
submit7.head()
SK_ID_CURR | TARGET | |
---|---|---|
0 | 100001 | 0.296470 |
1 | 100005 | 0.564654 |
2 | 100013 | 0.228887 |
3 | 100028 | 0.249874 |
4 | 100038 | 0.742904 |
# Write the submission file without the DataFrame index column.
submit7.to_csv('submit7.csv',index = False)
# Display the per-fold and overall AUC table.
metrics
fold | train | valid | |
---|---|---|---|
0 | 0 | 0.840386 | 0.782819 |
1 | 1 | 0.832188 | 0.771894 |
2 | 2 | 0.869398 | 0.778803 |
3 | 3 | 0.845446 | 0.777577 |
4 | 4 | 0.861744 | 0.780356 |
5 | 5 | 0.848455 | 0.787651 |
6 | 6 | 0.850062 | 0.783152 |
7 | 7 | 0.862897 | 0.776128 |
8 | 8 | 0.855628 | 0.782321 |
9 | 9 | 0.849162 | 0.772556 |
10 | overall | 0.851537 | 0.779188 |
# Index by feature name and rank ascending by averaged importance,
# then draw the full horizontal bar chart.
feature_importances = feature_importances.set_index('feature').sort_values('importance')
feature_importances.plot(kind='barh', figsize=(10, 200), title='Feature Importances')

# Features the model never split on (zero importance in every fold)
# carry no signal — remove them from both datasets.
weak_importance_features = feature_importances.index[feature_importances['importance'] == 0].tolist()
train_data = train_data.drop(weak_importance_features, axis=1)
test_data = test_data.drop(weak_importance_features, axis=1)