Kaggle 的一个比赛,题目见 https://www.kaggle.com/c/santander-customer-transaction-prediction
以下是我的初步解答:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import time
# Load the training and test sets.
train = pd.read_csv('/home/kesci/input/BK_TRAIN2242/train.csv')
train.head()
test = pd.read_csv('/home/kesci/input/BK_TEST3071/test.csv')
test.head()

# Target vector.
y = train['target']
# Feature matrix: drop the ID and label columns.
# NOTE: drop() without inplace returns a new frame — the original aliased
# X to train and then mutated BOTH via inplace=True.
X = train.drop(['ID_code', 'target'], axis=1)
X
import copy
# Test features: DataFrame.drop already returns a new frame, so no
# deepcopy is needed to keep `test` intact.
test2 = test.drop(['ID_code'], axis=1)
test2.head()
####################### LightGBM pipeline
# time.clock() was removed in Python 3.8; perf_counter() is the
# recommended high-resolution replacement for elapsed-time measurement.
start = time.perf_counter()
print("Loading Data ... ")

# Hold out 20% of the training data for validation (fixed seed for
# reproducibility).
train_x, valid_x, train_y, valid_y = train_test_split(
    X, y, test_size=0.2, random_state=1)
X_train = train_x
y_train = train_y
X_test = valid_x
y_test = valid_y

# Wrap the splits in LightGBM's Dataset format; the eval set references
# the training set so bin boundaries are shared.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# LightGBM hyper-parameters for binary classification.
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': {'binary_logloss', 'auc'},  # track both loss and AUC on the eval set
'num_leaves': 5,          # very shallow trees to limit overfitting
'max_depth': 6,
'min_data_in_leaf': 450,
'learning_rate': 0.1,
'feature_fraction': 0.9,  # sample 90% of features per tree
'bagging_fraction': 0.95, # sample 95% of rows every bagging_freq iterations
'bagging_freq': 5,
'lambda_l1': 1,
'lambda_l2': 0.001,  # NOTE(review): LARGER lambda_l2 means stronger L2 regularization — the original comment had this inverted
'min_gain_to_split': 0.2,
'verbose': 5,
'is_unbalance': True  # reweight classes for the imbalanced target
}
# Train the model with early stopping on the validation set.
print('Start training...')
# Stops when the eval metric has not improved for 500 consecutive rounds.
# NOTE(review): the early_stopping_rounds keyword was removed in
# lightgbm 4.x in favor of callbacks — verify the installed version.
gbm = lgb.train(params,
lgb_train,
num_boost_round=10000,
valid_sets=lgb_eval,
early_stopping_rounds=500)
print('Start predicting...')
preds = gbm.predict(test2, num_iteration=gbm.best_iteration)  # returns probabilities, not labels
# Probability cut-off for converting predictions to 0/1 labels.
threshold = 0.5
# (The original ran a thresholding loop here whose `result` was
# overwritten each iteration and never used — the real binarisation
# happens when building the submission below — so the dead loop is
# removed.)
# time.clock() was removed in Python 3.8; use perf_counter() instead.
end = time.perf_counter()
print("完成 用时:")
print(end - start)
# Binarise the predicted probabilities at the threshold.
predictions = [1 if p > threshold else 0 for p in preds]

# Build the submission frame: ID_code plus the predicted label.
# (pd.DataFrame(...) already constructs a new object — the original's
# copy.deepcopy was redundant.)
df = pd.DataFrame({'ID_code': test['ID_code'], 'target': predictions})
# index=False: Kaggle expects exactly the two submission columns; the
# default behavior writes the row index as a spurious extra column.
df.to_csv('out2.csv', index=False)
print('生成结果csv完成')
模型还有很多参数可以继续调节;若要进入前 10%,score 至少要达到 88% 以上。