The overall idea:
1. The test set contains synthetic rows. How do we detect them? A row is real if at least one of its feature values occurs nowhere else in the test set; conversely, a row whose every feature value also appears in other rows is judged synthetic (a toy sketch follows this list).
2. Once the synthetic rows are identified, drop them and keep only the real test rows.
3. Concatenate the real test rows with the train rows, and for each feature count how often each value occurs across all of these samples.
4. For the synthetic test rows, fill these counts with 0.
5. For each variable, feed the raw value and its count (two features) into a model, so 200 models are trained in total, giving 200 sets of predictions.
6. Combine the 200 sets of predictions with logistic regression (LR) to obtain the final result.
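To make the rule in step 1 concrete, here is a minimal toy sketch (the values are made up purely for illustration). Rows 0 and 2 each contain a value that occurs exactly once in its column, so they count as real; rows 1 and 3 do not, so they count as synthetic:
import numpy as np
# Toy "test set": 4 samples x 2 features (hypothetical values)
toy = np.array([[1.0, 5.0],
                [2.0, 5.0],
                [3.0, 6.0],
                [2.0, 6.0]])
unique_count = np.zeros_like(toy)
for f in range(toy.shape[1]):
    # first-occurrence index and count of every distinct value in this column
    _, idx, cnt = np.unique(toy[:, f], return_index=True, return_counts=True)
    unique_count[idx[cnt == 1], f] += 1  # mark values that occur exactly once
real = np.argwhere(unique_count.sum(axis=1) > 0)[:, 0]
fake = np.argwhere(unique_count.sum(axis=1) == 0)[:, 0]
print(real, fake)  # -> [0 2] [1 3]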
Without further ado, here is the code:
# LOAD LIBRARIES
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd, numpy as np, gc
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import statsmodels.api as sm
# GET INDICES OF REAL TEST DATA FOR FE
#######################
# TAKEN FROM YAG320'S KERNEL
# https://www.kaggle.com/yag320/list-of-fake-samples-and-public-private-lb-split
test_path = '../input/test.csv'
df_test = pd.read_csv(test_path)
df_test.drop(['ID_code'], axis=1, inplace=True)
df_test = df_test.values
unique_samples = []
unique_count = np.zeros_like(df_test)
for feature in range(df_test.shape[1]):
    _, index_, count_ = np.unique(df_test[:, feature], return_counts=True, return_index=True)
    unique_count[index_[count_ == 1], feature] += 1
# Samples which have unique values are real; the others are fake
real_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0]
synthetic_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0]
print('Found',len(real_samples_indexes),'real test')
print('Found',len(synthetic_samples_indexes),'fake test')
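(On the actual Santander test file this comes out to 100,000 real and 100,000 fake rows, i.e. exactly half of the test set is synthetic.)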
###################
# LOAD TRAIN AND TEST WITH MEMORY-EFFICIENT DTYPES
d = {}
for i in range(200): d['var_'+str(i)] = 'float32'
d['target'] = 'uint8'
d['ID_code'] = 'object'
train = pd.read_csv('../input/train.csv', dtype=d)
test = pd.read_csv('../input/test.csv', dtype=d)
print('Loaded',len(train),'rows of train')
print('Loaded',len(test),'rows of test')
# FREQUENCY ENCODE
def encode_FE(df,col,test):
    cv = df[col].value_counts()
    nm = col+'_FE'
    df[nm] = df[col].map(cv)
    test[nm] = test[col].map(cv)
    test[nm].fillna(0,inplace=True)  # values unseen in comb (e.g. in synthetic rows) get count 0
    if cv.max()<=255:
        df[nm] = df[nm].astype('uint8')
        test[nm] = test[nm].astype('uint8')
    else:
        df[nm] = df[nm].astype('uint16')
        test[nm] = test[nm].astype('uint16')
    return
test['target'] = -1
comb = pd.concat([train,test.loc[real_samples_indexes]],axis=0,sort=True)
for i in range(200): encode_FE(comb,'var_'+str(i),test)
train = comb[:len(train)]; del comb
print('Added 200 new magic features!')
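As a quick sanity check of what encode_FE produces, here is a toy run with made-up values (not part of the original kernel; it reuses the encode_FE defined above):
# Toy demo of encode_FE (hypothetical values)
demo_comb = pd.DataFrame({'var_0': [0.1, 0.2, 0.2, 0.3]})
demo_test = pd.DataFrame({'var_0': [0.2, 0.9]})  # 0.9 never appears in demo_comb
encode_FE(demo_comb, 'var_0', demo_test)
print(demo_test)
#    var_0  var_0_FE
# 0    0.2         2    <- 0.2 occurs twice in demo_comb
# 1    0.9         0    <- unseen value, count filled with 0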
# LGBM PARAMETERS
param = {
    'learning_rate': 0.04,
    'num_leaves': 3,
    'metric': 'auc',
    'boost_from_average': 'false',
    'feature_fraction': 1.0,
    'max_depth': -1,
    'objective': 'binary',
    'verbosity': -10}
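Note that num_leaves=3 means every tree makes at most two splits. Since each model only sees one raw variable and its count, such tiny trees are enough to capture the interaction between value and frequency, which is presumably why the parameter is set so low.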
train2 = train.sample(frac=1,random_state=42)
evals_result = {}
num_vars = 200
# SAVE OUT-OF-FOLD PREDICTIONS
# (COLUMN 0 IS A CONSTANT 1, USED AS THE INTERCEPT BY THE LOGISTIC REGRESSION STACK)
all_oof = np.zeros((len(train2),num_vars+1))
all_oof[:,0] = np.ones(len(train2))
all_oofB = np.zeros((len(train2),num_vars+1))
all_oofB[:,0] = np.ones(len(train2))
# SAVE TEST PREDICTIONS
all_preds = np.zeros((len(test),num_vars+1))
all_preds[:,0] = np.ones(len(test))
all_predsB = np.zeros((len(test),num_vars+1))
all_predsB[:,0] = np.ones(len(test))
for j in range(num_vars):
    # MODEL WITH MAGIC
    features = ['var_'+str(j),'var_'+str(j)+'_FE']
    oof = np.zeros(len(train2))
    preds = np.zeros(len(test))
    # PLOT DENSITIES
    plt.figure(figsize=(16,5))
    plt.subplot(1,2,2)
    sns.distplot(train2[train2['target']==0]['var_'+str(j)], label = 't=0')
    sns.distplot(train2[train2['target']==1]['var_'+str(j)], label = 't=1')
    plt.legend()
    plt.yticks([])
    plt.xlabel('Var_'+str(j))
    # MAKE A GRID OF POINTS FOR LGBM TO PREDICT
    mn,mx = plt.xlim()
    mnFE = train2['var_'+str(j)+'_FE'].min()
    mxFE = train2['var_'+str(j)+'_FE'].max()
    step = 50
    stepB = train2['var_'+str(j)+'_FE'].nunique()
    w = (mx-mn)/step
    x = w * (np.arange(0,step)+0.5) + mn
    x2 = np.array([])
    for i in range(stepB):
        x2 = np.concatenate([x,x2])
    df = pd.DataFrame({'var_'+str(j):x2})
    df['var_'+str(j)+'_FE'] = mnFE + (mxFE-mnFE)/(stepB-1) * (df.index//step)
    df['pred'] = 0
    # 5-FOLD WITH MAGIC (200k train rows -> 5 folds of 40k)
    for k in range(5):
        valid = train2.iloc[k*40000:(k+1)*40000]
        train = train2[ ~train2.index.isin(valid.index) ]
        trn_data = lgb.Dataset(train[features], label=train['target'])
        val_data = lgb.Dataset(valid[features], label=valid['target'])
        model = lgb.train(param, trn_data, 750, valid_sets = [trn_data, val_data],
                          verbose_eval=False, evals_result=evals_result)
        x = evals_result['valid_1']['auc']
        best = x.index(max(x)) + 1  # num_iteration is 1-based, list index is 0-based
        oof[k*40000:(k+1)*40000] = model.predict(valid[features], num_iteration=best)
        preds += model.predict(test[features], num_iteration=best)/5.0
        df['pred'] += model.predict(df[features], num_iteration=best)/5.0
    val_auc = roc_auc_score(train2['target'],oof)
    print('VAR_'+str(j)+' with magic val_auc =',round(val_auc,5))
    all_oof[:,j+1] = oof
    all_preds[:,j+1] = preds
    x = df['pred'].values
    x = np.reshape(x,(stepB,step))
    x = np.flip(x,axis=0)
    # PLOT LGBM PREDICTIONS USING MAGIC
    plt.subplot(1,2,1)
    sns.heatmap(x, cmap='RdBu_r', center=0.0)
    plt.title('VAR_'+str(j)+' Predictions with Magic',fontsize=16)
    plt.xticks(np.linspace(0,49,5),np.round(np.linspace(mn,mx,5),1))
    plt.xlabel('Var_'+str(j))
    s = min(mxFE-mnFE+1,20)
    plt.yticks(np.linspace(mnFE,mxFE,s)-0.5,np.linspace(mxFE,mnFE,s).astype('int'))
    plt.ylabel('Count')
    plt.show()
    # MODEL WITHOUT MAGIC
    features = ['var_'+str(j)]
    oof = np.zeros(len(train2))
    preds = np.zeros(len(test))
    # PLOT DENSITIES
    plt.figure(figsize=(16,5))
    plt.subplot(1,2,2)
    sns.distplot(train2[train2['target']==0]['var_'+str(j)], label = 't=0')
    sns.distplot(train2[train2['target']==1]['var_'+str(j)], label = 't=1')
    plt.legend()
    plt.yticks([])
    plt.xlabel('Var_'+str(j))
    # MAKE A GRID OF POINTS FOR LGBM TO PREDICT
    mn,mx = plt.xlim()
    mnFE = train2['var_'+str(j)+'_FE'].min()
    mxFE = train2['var_'+str(j)+'_FE'].max()
    step = 50
    stepB = train2['var_'+str(j)+'_FE'].nunique()
    w = (mx-mn)/step
    x = w * (np.arange(0,step)+0.5) + mn
    x2 = np.array([])
    for i in range(stepB):
        x2 = np.concatenate([x,x2])
    df = pd.DataFrame({'var_'+str(j):x2})
    df['var_'+str(j)+'_FE'] = mnFE + (mxFE-mnFE)/(stepB-1) * (df.index//step)
    df['pred'] = 0
    # 5-FOLD WITHOUT MAGIC
    for k in range(5):
        valid = train2.iloc[k*40000:(k+1)*40000]
        train = train2[ ~train2.index.isin(valid.index) ]
        trn_data = lgb.Dataset(train[features], label=train['target'])
        val_data = lgb.Dataset(valid[features], label=valid['target'])
        model = lgb.train(param, trn_data, 750, valid_sets = [trn_data, val_data],
                          verbose_eval=False, evals_result=evals_result)
        x = evals_result['valid_1']['auc']
        best = x.index(max(x)) + 1  # num_iteration is 1-based, list index is 0-based
        oof[k*40000:(k+1)*40000] = model.predict(valid[features], num_iteration=best)
        preds += model.predict(test[features], num_iteration=best)/5.0
        df['pred'] += model.predict(df[features], num_iteration=best)/5.0
    val_auc = roc_auc_score(train2['target'],oof)
    print('VAR_'+str(j)+' without magic val_auc =',round(val_auc,5))
    all_oofB[:,j+1] = oof
    all_predsB[:,j+1] = preds
    x = df['pred'].values
    x = np.reshape(x,(stepB,step))
    x = np.flip(x,axis=0)
    # PLOT LGBM PREDICTIONS WITHOUT USING MAGIC
    plt.subplot(1,2,1)
    sns.heatmap(x, cmap='RdBu_r', center=0.0)
    plt.title('VAR_'+str(j)+' Predictions without Magic',fontsize=16)
    plt.xticks(np.linspace(0,49,5),np.round(np.linspace(mn,mx,5),1))
    plt.xlabel('Var_'+str(j))
    plt.yticks([])
    plt.ylabel('')
    plt.show()
# ENSEMBLE MODEL WITHOUT MAGIC
logrB = sm.Logit(train2['target'], all_oofB[:,:num_vars+1])
logrB = logrB.fit(disp=0)
ensemble_predsB = logrB.predict(all_oofB[:,:num_vars+1])
ensemble_aucB = roc_auc_score(train2['target'],ensemble_predsB)
print('##################')
print('Combined Model without magic Val_AUC=',round(ensemble_aucB,5))
print()
# ENSEMBLE MODEL WITH MAGIC
logr = sm.Logit(train2['target'], all_oof[:,:num_vars+1])
logr = logr.fit(disp=0)
ensemble_preds = logr.predict(all_oof[:,:num_vars+1])
ensemble_auc = roc_auc_score(train2['target'],ensemble_preds)
print('##################')
print('Combined Model with magic Val_AUC=',round(ensemble_auc,5))
print()
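For reference, the stacking step can be approximated with scikit-learn instead of statsmodels (an alternative sketch, not part of the original kernel): fit_intercept=False because column 0 of all_oof is already a constant 1, and a large C approximates the unpenalized Logit fit.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1e6, fit_intercept=False, max_iter=1000)
lr.fit(all_oof, train2['target'])
alt_preds = lr.predict_proba(all_oof)[:, 1]
print('sklearn stack Val_AUC =', round(roc_auc_score(train2['target'], alt_preds), 5))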
# SAVE PREDICTIONS TO CSV
print('Test predictions saved as submission.csv')
print('OOF predictions saved as oof_submission.csv')
print('Histogram of test predictions displayed below:')
sub = train2[['ID_code','target']].copy()
sub['predict'] = ensemble_preds
sub.reset_index(inplace=True)
sub.sort_values('index',inplace=True)
sub.to_csv('oof_submission.csv',index=False)
test_preds = logr.predict(all_preds[:,:num_vars+1])
sub = pd.read_csv('../input/sample_submission.csv')
sub['target'] = test_preds
sub.to_csv('submission.csv',index=False)
# DISPLAY HISTOGRAM OF PREDICTIONS
b = plt.hist(sub['target'], bins=200)
plt.show()