Problem statement: given 200,000 training samples and 200,000 test samples, each with 200 features, perform binary classification. The required output is the probability that a sample belongs to class 1 (the positive class); the evaluation metric is the area under the ROC curve (AUC).
For an explanation of ROC/AUC, see this earlier post.
Dataset download link
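As a quick reminder of the metric (a minimal sketch with made-up labels and scores, not competition data), scikit-learn computes AUC directly:
from sklearn.metrics import roc_auc_score

# Hypothetical true labels and predicted probabilities of class 1
y_true  = [0, 0, 1, 1, 0, 1]
y_score = [0.1, 0.4, 0.35, 0.8, 0.2, 0.7]

# AUC = probability that a random positive is ranked above a random negative
print(roc_auc_score(y_true, y_score))  # 0.888...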
First, initialize and run some exploratory data analysis.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier,Pool
from IPython.display import display
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.patches as patch
import matplotlib.pyplot as plt
from sklearn.svm import NuSVR
from scipy.stats import norm
from sklearn import svm
import lightgbm as lgb
import xgboost as xgb
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import time
import glob
import sys
import os
import gc
# Initialization
fold_n = 4
# Stratified sampling: keep each class's proportion in every fold the same as in the full dataset
folds = StratifiedKFold(n_splits=fold_n, shuffle=True, random_state=30)
# Render figures inline in the notebook
%matplotlib inline
%precision 4
warnings.filterwarnings('ignore')       # suppress warnings
plt.style.use('ggplot')                 # plotting style
np.set_printoptions(suppress=True)      # print floats without scientific notation
pd.set_option("display.precision", 15)  # pandas display precision: 15 decimal places
print('pandas: {}'.format(pd.__version__))
print('numpy: {}'.format(np.__version__))
print('Python: {}'.format(sys.version))
pandas: 0.23.4
numpy: 1.15.1
Python: 3.7.0 (default, Jun 28 2018, 08:04:48) [MSC v.1912 64 bit (AMD64)]
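Why StratifiedKFold rather than plain KFold? A minimal sketch with synthetic labels at roughly the competition's 10% positive rate (the real folds below are built the same way):
rng = np.random.RandomState(0)
y_demo = (rng.rand(10000) < 0.1).astype(int)
X_demo = np.zeros((len(y_demo), 1))  # features are irrelevant to the split itself
for i, (tr_idx, va_idx) in enumerate(folds.split(X_demo, y_demo)):
    # every fold's positive rate matches the overall rate almost exactly
    print('fold', i, 'train %.4f valid %.4f' % (y_demo[tr_idx].mean(), y_demo[va_idx].mean()))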
# List the files in the data directory
print(os.listdir(r"D:/study/Python/Kaggle/SCTP/"))
['.ipynb_checkpoints', 'catboost_info', 'sample_submission.csv', 'SCTP-linear.py', 'SCTP-SVM.py', 'SCTP.py', 'SCTP_1.ipynb', 'test.csv', 'test.py', 'train.csv']
# Load the data
train= pd.read_csv(r"D:/study/Python/Kaggle/SCTP/train.csv")
test = pd.read_csv(r"D:/study/Python/Kaggle/SCTP/test.csv")
# Look at the sample submission file
sample_submission = pd.read_csv(r"D:/study/Python/Kaggle/SCTP/sample_submission.csv")
print(sample_submission.head())
ID_code target
0 test_0 0
1 test_1 0
2 test_2 0
3 test_3 0
4 test_4 0
# Dataset shapes
train.shape, test.shape, sample_submission.shape
((200000, 202), (200000, 201), (200000, 2))
# Peek at the first rows
train.head(5)
   ID_code  target   var_0    var_1    var_2   var_3    var_4    var_5   var_6    var_7  ...  var_190  var_191  var_192  var_193  var_194  var_195  var_196  var_197  var_198  var_199
0  train_0       0  8.9255  -6.7863  11.9081  5.0930  11.4607  -9.2834  5.1187  18.6266  ...   4.4354   3.9642   3.1364   1.6910  18.5227  -2.3978   7.8784   8.5635  12.7803  -1.0914
1  train_1       0 11.5006  -4.1473  13.8588  5.3890  12.3622   7.0433  5.6208  16.5338  ...   7.6421   7.7214   2.5837  10.9516  15.4305   2.0339   8.1267   8.7889  18.3560   1.9518
2  train_2       0  8.6093  -2.7457  12.0805  7.8928  10.5825  -9.0837  6.9427  14.6155  ...   2.9057   9.7905   1.6704   1.6858  21.6042   3.1417  -6.5213   8.2675  14.7222   0.3965
3  train_3       0 11.0604  -2.1518   8.9522  7.1957  12.5846  -1.8361  5.8428  14.9250  ...   4.4666   4.7433   0.7178   1.4214  23.0347  -1.2706  -2.9275  10.2922  17.9697  -8.9996
4  train_4       0  9.8369  -1.4834  12.8746  6.6375  12.2772   2.4486  5.9405  19.2514  ...  -1.4905   9.5214  -0.1508   9.1942  13.2876  -1.5121   3.9267   9.5031  17.9974  -8.8104
train.columns
Index(['ID_code', 'target', 'var_0', 'var_1', 'var_2', 'var_3', 'var_4',
'var_5', 'var_6', 'var_7',
...
'var_190', 'var_191', 'var_192', 'var_193', 'var_194', 'var_195',
'var_196', 'var_197', 'var_198', 'var_199'],
dtype='object', length=202)
# Summary statistics
train.describe()
        target    var_0    var_1    var_2    var_3    var_4     var_5   var_6    var_7    var_8  ...   var_190  var_191  var_192   var_193  var_194  var_195   var_196  var_197  var_198   var_199
count  200000   200000   200000   200000   200000   200000    200000  200000   200000   200000  ...    200000   200000   200000    200000   200000   200000    200000   200000   200000    200000
mean   0.1005  10.6799  -1.6276  10.7152   6.7965  11.0783   -5.0653  5.4089  16.5458   0.2842  ...    3.2344   7.4384   1.9278    3.3318  17.9938  -0.1421    2.3033   8.9082  15.8707   -3.3265
std    0.3007   3.0401   4.0500   2.6409   2.0433   1.6231    7.8633  0.8666   3.4181   3.3326  ...    4.5599   3.0233   1.4784    3.9920   3.1352   1.4294    5.4544   0.9216   3.0109   10.4380
min    0.0000   0.4084 -15.0434   2.1171  -0.0402   5.0748  -32.5626  2.3473   5.3497 -10.5055  ...  -14.0933  -2.6917  -3.8145  -11.7834   8.6944  -5.2610  -14.2096   5.9606   6.2993  -38.8528
25%    0.0000   8.4539  -4.7400   8.7225   5.2541   9.8832  -11.2004  4.7677  13.9438  -2.3178  ...   -0.0588   5.1574   0.8898    0.5846  15.6298  -1.1707   -1.9469   8.2528  13.8297  -11.2085
50%    0.0000  10.5248  -1.6081  10.5800   6.8250  11.1083   -4.8332  5.3851  16.4568   0.3937  ...    3.2036   7.3478   1.9013    3.3964  17.9579  -0.1727    2.4089   8.8882  15.9341   -2.8196
75%    0.0000  12.7582   1.3586  12.5167   8.3241  12.2611    0.9248  6.0030  19.1029   2.9379  ...    6.4062   9.5125   2.9495    6.2058  20.3965   0.8296    6.5567   9.5933  18.0647    4.8368
max    1.0000  20.3150  10.3768  19.3530  13.1883  16.6714   17.2516  8.4477  27.6918  10.1513  ...   18.4409  16.7165   8.4024   18.2818  27.9288   4.2729   18.3215  12.0004  26.0791   28.5007
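One thing the summary makes obvious: the features live on very different scales (standard deviations from under 1 to over 10). The tree models used below are scale-invariant, but the SVMs imported above are not; a minimal standardization sketch, in case you experiment with them:
from sklearn.preprocessing import StandardScaler

feature_cols = [c for c in train.columns if c.startswith('var_')]
# fit the scaler on training features only, then reuse the same transform on test
scaler = StandardScaler().fit(train[feature_cols])
train_scaled = scaler.transform(train[feature_cols])
test_scaled = scaler.transform(test[feature_cols])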
# Visualize the target counts
train['target'].value_counts().plot.bar()
# Swap in var_0 / var_1 / var_2 / ... to inspect each feature's distribution per class; most look roughly normal
f,ax=plt.subplots(1,2,figsize=(20,10))
train[train['target']==0].var_0.plot.hist(ax=ax[0],bins=20,edgecolor='black',color='red')
ax[0].set_title('target= 0')
x1=list(range(0,85,5))
ax[0].set_xticks(x1)
train[train['target']==1].var_0.plot.hist(ax=ax[1],color='green',bins=20,edgecolor='black')
ax[1].set_title('target= 1')
x2=list(range(0,85,5))
ax[1].set_xticks(x2)
plt.show()
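To make the "roughly normal" claim concrete, you can fit a Gaussian to one feature and overlay its density (a sketch reusing the scipy.stats.norm import from the setup cell):
mu, sigma = norm.fit(train['var_0'])  # maximum-likelihood mean and std
train['var_0'].plot.hist(bins=50, density=True, alpha=0.5)
xs = np.linspace(train['var_0'].min(), train['var_0'].max(), 200)
plt.plot(xs, norm.pdf(xs, mu, sigma))
plt.title('var_0 with fitted normal')
plt.show()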
# Distribution of the per-column means
train[train.columns[2:]].mean().plot(kind='hist')
plt.title('Mean Frequency')
# print(train.columns[2:])
# print(train[train.columns[2:]])
# print(train[train.columns[2:]].mean())
# Pie chart and count plot of the target
f,ax=plt.subplots(1,2,figsize=(18,8))
train['target'].value_counts().plot.pie(explode=[0,0.1],autopct='%1.1f%%',ax=ax[0],shadow=True)
ax[0].set_title('target')
ax[0].set_ylabel('')
sns.countplot('target',data=train,ax=ax[1])
ax[1].set_title('target')
plt.show()
# The classes are clearly imbalanced
sns.set(rc={'figure.figsize':(9,7)})
sns.distplot(train['target']);
plt.subplot(2,1,1)
sns.violinplot(data=train,x="target", y="var_0")
plt.subplot(2,1,2)
sns.violinplot(data=train,x="target", y="var_81")
# Check for missing values
def check_missing_data(df):
    flag = df.isna().sum().any()
    if flag == True:
        total = df.isnull().sum()
        percent = df.isnull().sum() / df.isnull().count() * 100
        output = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
        data_type = []
        # written by MJ Bahmani
        for col in df.columns:
            dtype = str(df[col].dtype)
            data_type.append(dtype)
        output['Types'] = data_type
        return np.transpose(output)
    else:
        return False
print(check_missing_data(train))
print(check_missing_data(test))
There are no missing values:
False
False
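For reference, a terser equivalent check (a trivial one-liner, not from the original post):
print(train.isna().any().any(), test.isna().any().any())  # False False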
train['target'].unique()
array([0, 1], dtype=int64)
# What do you do when the classes are imbalanced?
# The usual remedies are over-sampling the minority or under-sampling the majority, but neither helped in this competition.
train['target'].value_counts()
0 179902
1 20098
Name: target, dtype: int64
# Clearly class 0 dominates and class 1 is rare.
# Compute each class's share of the data.
def check_balance(df, target):
    # written by MJ Bahmani, for a binary target
    print('size of data is:', df.shape[0])
    for i in [0, 1]:
        print('for target {} ='.format(i))
        print(df[target].value_counts()[i] / df.shape[0] * 100, '%')

check_balance(train, 'target')
size of data is: 200000
for target 0 =89.95100000000001 %
for target 1 =10.049 %
# Skewness and kurtosis of the target
print("Skewness: %f" % train['target'].skew())
print("Kurtosis: %f" % train['target'].kurt())
Skewness: 2.657642
Kurtosis: 5.063112
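These two numbers are exactly what the imbalance predicts. For a Bernoulli(p) variable, skewness is (1-2p)/sqrt(p(1-p)) and excess kurtosis is (1-6p(1-p))/(p(1-p)); with p ≈ 0.10049 a quick check reproduces pandas' values (pandas' kurt() reports excess kurtosis):
p = train['target'].mean()  # ~0.10049
q = p * (1 - p)
print((1 - 2 * p) / q ** 0.5)  # ~2.6576, matches skew()
print((1 - 6 * q) / q)         # ~5.0631, matches kurt()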
X = train.drop(["target","ID_code"],axis = 1)
y = train['target']
X_test = test.drop('ID_code',axis = 1)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
rfc_model = RandomForestClassifier(random_state=0).fit(train_X, train_y)
# Use the eli5 library to compute permutation importance; the most important features appear at the top
import eli5
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(rfc_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names = val_X.columns.tolist())
Weight Feature
0.0002 ± 0.0002 var_110
0.0001 ± 0.0001 var_157
0.0001 ± 0.0001 var_162
0.0001 ± 0.0001 var_42
0.0001 ± 0.0002 var_170
0.0001 ± 0.0002 var_174
0.0001 ± 0.0001 var_188
0.0001 ± 0.0001 var_147
0.0001 ± 0.0001 var_197
0.0001 ± 0.0001 var_47
0.0001 ± 0.0001 var_95
0.0001 ± 0.0002 var_148
0.0001 ± 0.0001 var_93
0.0001 ± 0.0001 var_184
0.0001 ± 0.0001 var_133
0.0001 ± 0.0001 var_185
0.0001 ± 0.0001 var_183
0.0001 ± 0.0000 var_158
0.0001 ± 0.0001 var_150
0.0001 ± 0.0002 var_6
… 180 more …
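The weights are all close to zero, which says more about a default RandomForestClassifier on this data than about the features. If you would rather stay inside scikit-learn, version 0.22+ ships the same idea natively (a sketch; the post itself runs an older sklearn, so this is an alternative, not a drop-in):
from sklearn.inspection import permutation_importance  # requires sklearn >= 0.22

result = permutation_importance(rfc_model, val_X, val_y,
                                scoring='roc_auc', n_repeats=5, random_state=1)
# print the ten largest mean importances with their spread
for idx in result.importances_mean.argsort()[::-1][:10]:
    print(val_X.columns[idx], result.importances_mean[idx], '+/-', result.importances_std[idx])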
# Partial dependence plots: how a single feature affects the model's predictions
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
tree_model = DecisionTreeClassifier(random_state=0, max_depth=5, min_samples_split=5).fit(train_X, train_y)
features = [c for c in train.columns if c not in ['ID_code','target']]
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots
# Create the data that we will plot
pdp_goals = pdp.pdp_isolate(model=tree_model, dataset=val_X, model_features=features, feature='var_81')
# plot it
pdp.pdp_plot(pdp_goals, 'var_81')
plt.show()
# Create the data that we will plot
pdp_goals = pdp.pdp_isolate(model=tree_model, dataset=val_X, model_features=features, feature='var_82')
# plot it
pdp.pdp_plot(pdp_goals, 'var_82')
plt.show()
# Create the data that we will plot
pdp_goals = pdp.pdp_isolate(model=tree_model, dataset=val_X, model_features=features, feature='var_139')
# plot it
pdp.pdp_plot(pdp_goals, 'var_139')
plt.show()
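pdpbox can also plot two-feature interactions; a sketch assuming the same pdpbox 0.2-style API as the single-feature plots above (pdp_interact / pdp_interact_plot):
# Interaction between two of the features plotted individually above
interact = pdp.pdp_interact(model=tree_model, dataset=val_X,
                            model_features=features, features=['var_81', 'var_139'])
pdp.pdp_interact_plot(interact, ['var_81', 'var_139'], plot_type='contour')
plt.show()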
from sklearn import tree
import graphviz
# SHAP estimates each feature's contribution to an individual prediction
row_to_show = 5
data_for_prediction = val_X.iloc[row_to_show] # use 1 row of data here. Could use multiple rows if desired
data_for_prediction_array = data_for_prediction.values.reshape(1, -1)
rfc_model.predict_proba(data_for_prediction_array);
# The following code raised an error on my machine, so it stays commented out
# from sklearn import tree
# import graphviz
# import numpy as np
# import shap # package used to calculate Shap values
# # Create object that can calculate shap values
# explainer = shap.TreeExplainer(rfc_model)
# # Calculate Shap values
# shap_values = explainer.shap_values(data_for_prediction)
# X = train.drop(["target","ID_code"],axis = 1)
# y = train['target']
# train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from catboost import CatBoostClassifier,Pool
from sklearn import metrics
from IPython.display import display
import matplotlib.patches as patch
import matplotlib.pyplot as plt
from sklearn.svm import NuSVR
from scipy.stats import norm
from sklearn import svm
import lightgbm as lgb
import xgboost as xgb
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import time
import glob
import sys
import os
import gc
# Initialization
warnings.filterwarnings('ignore')       # suppress warnings
plt.style.use('ggplot')                 # plotting style
np.set_printoptions(suppress=True)      # print floats without scientific notation
pd.set_option("display.precision", 15)  # pandas display precision: 15 decimal places
# Use the raw data with stratified cross-validation: even with a large class imbalance, every fold keeps the original class ratio.
def loadData():
    # Load the data
    train = pd.read_csv(r"D:/study/Python/Kaggle/SCTP/train.csv")
    test = pd.read_csv(r"D:/study/Python/Kaggle/SCTP/test.csv")
    # Sample submission, for reference
    # sample_submission = pd.read_csv(r"D:/study/Python/Kaggle/SCTP/sample_submission.csv")
    # print(sample_submission.head())
    X = train.drop(["target", "ID_code"], axis=1)
    y = train['target']
    X_test = test.drop('ID_code', axis=1)
    # Round both feature sets to 3 decimal places
    X = round(X, 3)
    X_test = round(X_test, 3)
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
    return (X, y, train_X, val_X, train_y, val_y, X_test)

X, y, train_X, val_X, train_y, val_y, X_test = loadData()
print(X.head())
    var_0   var_1   var_2  var_3   var_4   var_5  var_6   var_7  var_8  var_9  ...  var_190  var_191  var_192  var_193  var_194  var_195  var_196  var_197  var_198  var_199
0   8.926  -6.786  11.908  5.093  11.461  -9.283  5.119  18.627 -4.920  5.747  ...    4.435    3.964    3.136    1.691   18.523   -2.398    7.878    8.564   12.780   -1.091
1  11.501  -4.147  13.859  5.389  12.362   7.043  5.621  16.534  3.147  8.085  ...    7.642    7.721    2.584   10.952   15.430    2.034    8.127    8.789   18.356    1.952
2   8.609  -2.746  12.080  7.893  10.582  -9.084  6.943  14.616 -4.919  5.952  ...    2.906    9.790    1.670    1.686   21.604    3.142   -6.521    8.268   14.722    0.396
3  11.060  -2.152   8.952  7.196  12.585  -1.836  5.843  14.925 -5.861  8.245  ...    4.467    4.743    0.718    1.421   23.035   -1.271   -2.928   10.292   17.970   -9.000
4   9.837  -1.483  12.875  6.638  12.277   2.449  5.940  19.251  6.265  7.678  ...   -1.490    9.521   -0.151    9.194   13.288   -1.512    3.927    9.503   17.997   -8.810

[5 rows x 200 columns]
def lgbModel(X, y, train_X, val_X, train_y, val_y, X_test):
    fold_n = 5
    # Stratified sampling: every fold keeps the original class ratio
    folds = StratifiedKFold(n_splits=fold_n, shuffle=True, random_state=30)
    # LightGBM parameters
    params = {'num_leaves': 13,
              'min_data_in_leaf': 42,
              'objective': 'binary',
              'max_depth': 16,
              'learning_rate': 0.0123,
              'boosting': 'gbdt',
              'bagging_freq': 5,
              'bagging_fraction': 0.8,
              'feature_fraction': 0.8201,
              'bagging_seed': 11,
              'reg_alpha': 1.728910519108444,
              'reg_lambda': 4.9847051755586085,
              'random_state': 42,
              'metric': 'auc',
              'verbosity': -1,
              'subsample': 0.81,
              'min_gain_to_split': 0.01077313523861969,
              'min_child_weight': 19.428902804238373,
              'num_threads': 28}
    # One column of test predictions per fold
    prediction_lgb = np.zeros((len(X_test), fold_n))
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid)
        model = lgb.train(params, train_data, num_boost_round=20000,
                          valid_sets=[train_data, valid_data],
                          verbose_eval=300, early_stopping_rounds=100)
        # y_pred_valid = model.predict(X_valid)
        prediction_lgb[:, fold_n] = model.predict(X_test, num_iteration=model.best_iteration)
    return prediction_lgb
def catBC(X, y, train_X, val_X, train_y, val_y, X_test):
    auc_catBC = []
    prediction_catBC = np.zeros((len(X_test), 4))
    fold_n = 4
    # Stratified sampling: every fold keeps the original class ratio
    folds = StratifiedKFold(n_splits=fold_n, shuffle=True, random_state=30)
    m = CatBoostClassifier(loss_function="Logloss",
                           eval_metric="AUC",
                           task_type="GPU",
                           learning_rate=0.01,
                           iterations=10000,
                           random_seed=42,
                           od_type="Iter",
                           depth=10,
                           early_stopping_rounds=500)
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        train_pool = Pool(X_train, y_train)
        valid_pool = Pool(X_valid, y_valid)
        m.fit(train_pool, eval_set=valid_pool, use_best_model=True, verbose=200, plot=True)
        # predict_proba returns two columns: P(class 0) and P(class 1)
        y_pred_1 = m.predict_proba(X_valid)
        auc1 = metrics.roc_auc_score(y_valid, y_pred_1[:, 1])
        print('Fold', fold_n, 'validation AUC:', auc1)
        auc_catBC.append(auc1)
        prediction_catBC[:, fold_n] = m.predict_proba(X_test)[:, 1]
    return auc_catBC, prediction_catBC
def xgbModel(X, y, train_X, val_X, train_y, val_y, X_test):
    auc_xgb = []
    prediction_xgb = np.zeros((len(X_test), 4))
    fold_n = 4
    # Stratified sampling: every fold keeps the original class ratio
    folds = StratifiedKFold(n_splits=fold_n, shuffle=True, random_state=30)
    model = xgb.XGBClassifier(max_depth=2,
                              n_estimators=15000,
                              colsample_bytree=0.3,
                              learning_rate=0.02,
                              objective='binary:logistic',
                              eval_metric='auc',
                              n_jobs=-1)
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)],
                  verbose=200,
                  early_stopping_rounds=500)
        y_pred_1 = model.predict_proba(X_valid)
        auc1 = metrics.roc_auc_score(y_valid, y_pred_1[:, 1])
        print('Fold', fold_n, 'validation AUC:', auc1)
        auc_xgb.append(auc1)
        prediction_xgb[:, fold_n] = model.predict_proba(X_test)[:, 1]
    return auc_xgb, prediction_xgb
X, y, train_X, val_X, train_y, val_y, X_test = loadData()
prediction_lgb = lgbModel(X, y, train_X, val_X, train_y, val_y, X_test)
auc_cat, prediction_catBC = catBC(X, y, train_X, val_X, train_y, val_y, X_test)
auc_xgb, prediction_xgb = xgbModel(X, y, train_X, val_X, train_y, val_y, X_test)
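The post stops at the raw per-fold test predictions. What typically comes next is averaging each model's folds and blending; a minimal sketch, assuming an unweighted average of the three models (the blend weights, or a rank-average instead, are a tuning choice, not something fixed by the post):
# average each model's per-fold predictions, then blend the models equally
pred_lgb = prediction_lgb.mean(axis=1)
pred_cat = prediction_catBC.mean(axis=1)
pred_xgb = prediction_xgb.mean(axis=1)

submission = pd.read_csv(r"D:/study/Python/Kaggle/SCTP/sample_submission.csv")
submission['target'] = (pred_lgb + pred_cat + pred_xgb) / 3
submission.to_csv('submission.csv', index=False)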