Kaggle | Allstate Claims Severity was a competition I entered on the side while working on Kaggle | Santander Product Recommendation. I only spent about two weeks on it in total, and the result turned out somewhat better than expected (73rd/3055, top 3%). Below is a summary of the competition.
Problem statement:
The task is to predict insurance claim severity from the given data. The training data consists of 116 categorical columns (cat1-cat116) and 14 continuous columns (cont1-cont14).
1. Data preprocessing:
The data is fairly clean and has no missing values, so no extra preprocessing was done.
2. Models:
xgboost: the target variable is transformed with log(x + 200) (the exact reason is not entirely clear; most likely it makes the target distribution more symmetric and easier to fit; a small skewness check illustrating this appears right after this list)
NN: a 4-layer neural network (built with Keras)
3. Feature engineering:
xgboost: jointly encode highly correlated categorical features
apply a Box-Cox transform to strongly skewed continuous features, standardize them, and add aggregate features such as the mean, range, and sum
select features by xgboost feature importance
NN: standardize continuous features
one-hot encode categorical features and store them in compressed (sparse) form
4. Parameter tuning: choosing the xgboost loss function, searching its hyperparameters, and tuning the NN settings
5. Ensemble:
I only took a weighted average of the xgboost and NN predictions (a minimal sketch of such a blend appears after the two scripts at the end of this post). Ensembling was probably the key to this problem: the teams near the top of the leaderboard almost certainly put a lot of work into their ensembles, e.g. stacking. A lesson learned for me.
6. Other solutions (top teams)
1st place solution | 2nd place solution | 3rd place solution | 4th place solution
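To back up the guess in point 2, here is a minimal check (not part of the competition pipeline) that compares the skewness of the raw loss with that of log(loss + 200). It assumes the same ../input/train.csv file the scripts below read, and uses scipy.stats.skew purely for illustration.

import numpy as np
import pandas as pd
from scipy.stats import skew

# illustrative only: compare skewness of the raw target and the log-shifted target
train = pd.read_csv('../input/train.csv')
print('raw loss skew:', skew(train['loss']))
print('log(loss + 200) skew:', skew(np.log(train['loss'] + 200)))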
My main code is below (first the XGBoost script, then the Keras NN script):
import pandas as pd
import numpy as np
import xgboost as xgb
import datetime
import itertools
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing
pd.options.mode.chained_assignment = None
# column-index groups used by the feature engineering below
multi_corr = [79,80,81,87,89,90,101,103,111]
two_corr = [2,3,9,10,11,12,13,23,36,57,72]
multi_cat_diff = [90,92,96,99,101,102,103,106,109,110,113,114,116]  # cat columns whose levels differ between train and test
skewed_num = [1,4,5,6,7,8,9,10,11,12,13]  # skewed cont columns to Box-Cox transform
cat2corr = [(29,30),(40,41),(43,45),(55,56),(8,65),(8,66),(104,106)]  # correlated cat pairs combined into joint features
two_avg1 = [1,2,3,4,5,6,7,9,10,11,12,13,14,16,23,24,25,26,27,28,36,38,40,44,50,53,57,72,73,76,79,80,81,82,87,89,90,103,111]  # cat columns combined pairwise
# custom xgboost objective: gradient/hessian of the "fair" loss,
# a smooth approximation to MAE on the log-shifted target
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    con = 2
    x = preds - labels
    grad = con * x / (np.abs(x) + con)
    hess = con ** 2 / (np.abs(x) + con) ** 2
    return grad, hess
# evaluation metric: MAE after undoing the log transform
# (the +200 shift cancels out of the absolute error, so it is not subtracted here)
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels))
# turn a concatenated category string (e.g. 'AB') into an integer code
# by summing the letter offsets from 'A'
def encode(charcode):
    r = 0
    ln = len(str(charcode))
    for i in range(ln):
        r += (ord(str(charcode)[i]) - ord('A'))
    return r + 1
def prepro(train, test, cont_feature):
    # join train and test so all transforms are applied consistently
    joined = pd.concat((train, test)).reset_index(drop=True)
    # Box-Cox transform for the skewed continuous features
    skewed_feats = ['cont' + str(i) for i in skewed_num]
    for feats in skewed_feats:
        joined[feats] = joined[feats] + 1
        joined[feats], lam = boxcox(joined[feats])
    # drop categorical levels that appear only in train or only in test
    multi_diff_feats = ['cat' + str(i) for i in multi_cat_diff]
    for column in multi_diff_feats:
        set_train = set(train[column].unique())
        set_test = set(test[column].unique())
        remove_train = set_train - set_test
        remove_test = set_test - set_train
        remove = remove_train.union(remove_test)

        def filter_cat(x):
            if x in remove:
                return np.nan
            return x

        joined[column] = joined[column].apply(lambda x: filter_cat(x), 1)
    # standardize the continuous features
    ss = StandardScaler()
    joined[cont_feature] = ss.fit_transform(joined[cont_feature].values)
    del train, test
    return joined
def feature_extract(joined, cont_feature):
    features = pd.DataFrame()
    features['id'] = joined['id']
    # log-shifted target; test rows keep NaN here
    features['loss'] = np.log(joined['loss'] + 200)
    # label-encode every categorical feature
    cat_sel = [n for n in joined.columns if n.startswith('cat')]
    for column in cat_sel:
        features[column] = pd.factorize(joined[column].values, sort=True)[0] + 1
    # continuous features plus simple row statistics
    for column in cont_feature:
        features[column] = joined[column]
    features['cont_avg'] = joined[cont_feature].mean(axis=1)
    features['cont_min'] = joined[cont_feature].min(axis=1)
    features['cont_max'] = joined[cont_feature].max(axis=1)
    # counts of 'A' and 'B' values over leading blocks of cat columns
    for i in [20, 40, 73]:
        cat_feats = ['cat' + str(k) for k in range(1, i)]
        idx = 'cat_' + 'sum_' + str(i)
        features[idx + '_A'] = joined[cat_feats].apply(lambda x: sum(x == 'A'), axis=1)
        features[idx + '_B'] = joined[cat_feats].apply(lambda x: sum(x == 'B'), axis=1)
    # combine correlated categorical pairs and encode the concatenation
    cat2_feats = [('cat' + str(i), 'cat' + str(j)) for (i, j) in cat2corr]
    for feat1, feat2 in cat2_feats:
        feat_comb = feat1 + '_' + feat2
        features[feat_comb] = joined[feat1] + joined[feat2]
        features[feat_comb] = features[feat_comb].apply(encode)
    # pairwise combinations of the selected categorical features
    cat2avg_feats = ['cat' + str(i) for i in two_avg1]
    for feat1, feat2 in itertools.combinations(cat2avg_feats, 2):
        feat_comb = feat1 + '_' + feat2
        features[feat_comb] = joined[feat1] + joined[feat2]
        features[feat_comb] = features[feat_comb].apply(encode)
    # split back into train (loss known) and test (loss NaN)
    train = features[features['loss'].notnull()]
    test = features[features['loss'].isnull()]
    del features, joined
    return train, test
# write an xgboost feature-map file (one feature per line)
def ceate_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    i = 0
    for feat in features:
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i = i + 1
    outfile.close()
# train a quick xgboost model and keep the 450 most important features
def feature_select(train, test):
    import operator
    params = {
        'min_child_weight': 100,
        'eta': 0.02,
        'colsample_bytree': 0.7,
        'max_depth': 12,
        'subsample': 0.7,
        'alpha': 1,
        'gamma': 1,
        'silent': 1,
        'objective': 'reg:linear',
        'verbose_eval': True,
        'seed': 12
    }
    rounds = 300
    y = train['loss']
    X = train.drop(['loss', 'id'], 1)
    xgtrain = xgb.DMatrix(X, label=y)
    bst = xgb.train(params, xgtrain, num_boost_round=rounds)
    feats = [x for x in train.columns if x not in ['id', 'loss']]
    print len(feats)
    outfile = open('xgb.fmap', 'w')
    i = 0
    for feat in feats:
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i = i + 1
    outfile.close()
    importance = bst.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
    feats = [a for (a, b) in importance]
    feats = feats[:450]
    print len(feats)
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    df.to_csv("../input/feat_sel/feat_importance.csv", index=False)
    train1 = train[['id', 'loss'] + feats]
    test1 = test[['id'] + feats]
    return train1, test1
# train one xgboost model on a single CV fold and predict the test set
def runXGB(train, test, index, RANDOM_STATE):
    train_index, test_index = index
    y = train['loss']
    X = train.drop(['loss', 'id'], 1)
    X_test = test.drop(['id'], 1)
    del train, test
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    xgtrain = xgb.DMatrix(X_train, label=y_train)
    xgval = xgb.DMatrix(X_val, label=y_val)
    xgtest = xgb.DMatrix(X_test)
    X_val = xgb.DMatrix(X_val)
    params = {
        'min_child_weight': 10,
        'eta': 0.01,
        'colsample_bytree': 0.7,
        'max_depth': 12,
        'subsample': 0.7,
        'alpha': 1,
        'gamma': 1,
        'silent': 1,
        'verbose_eval': True,
        'seed': RANDOM_STATE
    }
    rounds = 3000
    watchlist = [(xgtrain, 'train'), (xgval, 'eval')]
    model = xgb.train(params, xgtrain, rounds, watchlist, obj=logregobj, feval=evalerror, early_stopping_rounds=100)
    # undo the log(x + 200) transform before scoring / predicting
    cv_score = mean_absolute_error(np.exp(model.predict(X_val)) - 200, np.exp(y_val) - 200)
    predict = np.exp(model.predict(xgtest)) - 200
    print "iteration = %d" % (model.best_iteration)
    return predict, cv_score
if __name__ == '__main__':
    Generate_or_read = 0  # 0: generate features, otherwise read cached features
    feat_sel = 1          # 1: run xgboost-importance feature selection
    start_time = datetime.datetime.now()
    if Generate_or_read == 0:
        print "generate features..."
        train = pd.read_csv('../input/train.csv')
        test = pd.read_csv('../input/test.csv')
        test['loss'] = np.nan
        cont_feature = [n for n in train.columns if n.startswith('cont')]
        joined = prepro(train, test, cont_feature)
        train, test = feature_extract(joined, cont_feature)
        print train.shape, test.shape
        print datetime.datetime.now() - start_time
        if feat_sel == 1:
            print "feature select..."
            train, test = feature_select(train, test)
        train.to_csv("../input/feature/train.csv", index=False)
        test.to_csv("../input/feature/test.csv", index=False)
        print train.shape, test.shape
        print datetime.datetime.now() - start_time
    else:
        print "read features..."
        train = pd.read_csv("../input/feature/train.csv")
        test = pd.read_csv("../input/feature/test.csv")
        print train.shape, test.shape
    print "run model..."
    # 10-fold CV: average the test predictions over the folds
    nfolds = 10
    RANDOM_STATE = 113
    ids = test['id']
    predicts = np.zeros(ids.shape)
    kf = KFold(train.shape[0], n_folds=nfolds, shuffle=True, random_state=RANDOM_STATE)
    for i, index in enumerate(kf):
        print('Fold %d' % (i + 1))
        predict, cv_score = runXGB(train, test, index, RANDOM_STATE)
        print cv_score
        predicts += predict
        print datetime.datetime.now() - start_time
    predicts = predicts / nfolds
    submission = pd.DataFrame()
    submission['id'] = ids
    submission['loss'] = predicts
    submission.to_csv('../submit/submit_xgb.csv', index=False)
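# ----------------------------------------------------------------------
# Second script: the Keras neural-network model
# ----------------------------------------------------------------------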
import numpy as np
import pandas as pd
import subprocess
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
np.random.seed(123)
# mini-batch generator: converts sparse rows to dense arrays one batch at a time
def batch_generator(X, y, batch_size, shuffle):
    number_of_batches = np.ceil(X.shape[0] / float(batch_size))
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0
# prediction-time generator (no labels, no shuffling)
def batch_generatorp(X, batch_size, shuffle):
    number_of_batches = np.ceil(X.shape[0] / float(batch_size))
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0
## read data
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
index = list(train.index)
train = train.iloc[index]
# train = train.iloc[np.random.permutation(len(train))]  # optional: shuffle the training rows
## set test loss to NaN
test['loss'] = np.nan
## response and IDs
y = np.log(train['loss'].values + 200)
id_train = train['id'].values
id_test = test['id'].values
## stack train test
ntrain = train.shape[0]
tr_te = pd.concat((train, test), axis=0)
## Preprocessing and transforming to sparse data
sparse_data = []
f_cat = [f for f in tr_te.columns if 'cat' in f]
for f in f_cat:
    dummy = pd.get_dummies(tr_te[f].astype('category'))
    tmp = csr_matrix(dummy)
    sparse_data.append(tmp)
f_num = [f for f in tr_te.columns if 'cont' in f]
scaler = StandardScaler()
tmp = csr_matrix(scaler.fit_transform(tr_te[f_num]))
sparse_data.append(tmp)
del (tr_te, train, test)
## sparse train and test data
xtr_te = hstack(sparse_data, format='csr')
xtrain = xtr_te[:ntrain, :]
xtest = xtr_te[ntrain:, :]
print('Dim train', xtrain.shape)
print('Dim test', xtest.shape)
del (xtr_te, sparse_data, tmp)
## neural net
def nn_model():
    # 400 -> 200 -> 50 -> 1 fully connected net with PReLU, batch norm and dropout
    model = Sequential()
    model.add(Dense(400, input_dim=xtrain.shape[1], init='he_normal'))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.4))
    model.add(Dense(200, init='he_normal'))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Dense(50, init='he_normal'))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Dense(1, init='he_normal'))
    model.compile(loss='mae', optimizer='adadelta')
    return (model)
## cv-folds
nfolds = 5
folds = KFold(len(y), n_folds=nfolds, shuffle=True, random_state=111)
## train models
i = 0
nbags = 10
nepochs = 55
pred_oob = np.zeros(xtrain.shape[0])
pred_test = np.zeros(xtest.shape[0])
for (inTr, inTe) in folds:
    xtr = xtrain[inTr]
    ytr = y[inTr]
    xte = xtrain[inTe]
    yte = y[inTe]
    pred = np.zeros(xte.shape[0])
    # bag nbags networks per fold and average their predictions
    for j in range(nbags):
        model = nn_model()
        fit = model.fit_generator(generator=batch_generator(xtr, ytr, 128, True),
                                  nb_epoch=nepochs,
                                  samples_per_epoch=xtr.shape[0],
                                  validation_data=(xte.todense(), yte),
                                  verbose=0)
        temp = np.exp(model.predict_generator(generator=batch_generatorp(xte, 800, False),
                                              val_samples=xte.shape[0])[:, 0]) - 200
        pred += temp
        print("Fold val bagging score after", j + 1, "rounds is: ",
              mean_absolute_error(np.exp(yte) - 200, pred / (j + 1)))
        pred_test += np.exp(model.predict_generator(generator=batch_generatorp(xtest, 800, False),
                                                    val_samples=xtest.shape[0])[:, 0]) - 200
    pred /= nbags
    pred_oob[inTe] = pred
    score = mean_absolute_error(np.exp(yte) - 200, pred)
    i += 1
    print('Fold ', i, '- MAE:', score)
print('Total - MAE:', mean_absolute_error(np.exp(y) - 200, pred_oob))
## train predictions
df = pd.DataFrame({'id': id_train, 'loss': pred_oob})
df.to_csv('preds_oob.csv', index=False)
## test predictions
pred_test /= (nfolds * nbags)
df = pd.DataFrame({'id': id_test, 'loss': pred_test})
df.to_csv('submission_keras_shift_perm.csv', index=False)
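For reference, the weighted average mentioned in point 5 can be as simple as the following minimal sketch. It assumes the two submission files produced by the scripts above (../submit/submit_xgb.csv and submission_keras_shift_perm.csv); the 0.6/0.4 weights are illustrative rather than the values I actually used, and in practice the weight should be tuned on out-of-fold predictions.

import pandas as pd

# weighted average of the two models' submissions (weights are illustrative)
xgb_sub = pd.read_csv('../submit/submit_xgb.csv')
nn_sub = pd.read_csv('submission_keras_shift_perm.csv')

merged = xgb_sub.merge(nn_sub, on='id', suffixes=('_xgb', '_nn'))
w = 0.6  # illustrative weight for xgboost; tune on out-of-fold predictions
merged['loss'] = w * merged['loss_xgb'] + (1 - w) * merged['loss_nn']
merged[['id', 'loss']].to_csv('../submit/submit_blend.csv', index=False)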
GitHub: https://github.com/wenwu313/Kaggle-Solution