Reference source code: the Top-2 solution
The task is an 11-class classification problem over the following data.
Field | Name | Data Type | Description |
---|---|---|---|
USERID | User ID | VARCHAR2(50) | Unique identifier for a user |
current_type | Service plan | VARCHAR2(500) | / |
service_type | Plan type | VARCHAR2(10) | 0: 2/3G converged, 1: 2I2C, 2: 2G, 3: 3G, 4: 4G |
is_mix_service | Fixed-mobile converged plan | VARCHAR2(10) | 1: yes, 0: no |
online_time | Time on network | VARCHAR2(50) | / |
1_total_fee | Total billed amount, current month | NUMBER | Unit: yuan |
2_total_fee | Total billed amount, 1 month before | NUMBER | Unit: yuan |
3_total_fee | Total billed amount, 2 months before | NUMBER | Unit: yuan |
4_total_fee | Total billed amount, 3 months before | NUMBER | Unit: yuan |
month_traffic | Cumulative traffic, current month | NUMBER | Unit: MB |
many_over_bill | Consecutively over plan | VARCHAR2(500) | 1: yes, 0: no |
contract_type | Contract type | VARCHAR2(500) | ZBG_DIM.DIM_CBSS_ACTIVITY_TYPE |
contract_time | Contract duration | VARCHAR2(500) | / |
is_promise_low_consume | Committed minimum-spend user | VARCHAR2(500) | 1: yes, 0: no |
net_service | Network service category | VARCHAR2(500) | 20AAAAAA-2G |
pay_times | Number of payments | NUMBER | Unit: times |
pay_num | Payment amount | NUMBER | Unit: yuan |
last_month_traffic | Traffic carried over from last month | NUMBER | Unit: MB |
local_trafffic_month | Cumulative local data traffic, current month | NUMBER | Unit: MB |
local_caller_time | Local outgoing call duration | NUMBER | Unit: minutes |
service1_caller_time | Out-of-plan outgoing call duration | NUMBER | Unit: minutes |
service2_caller_time | Service2_caller_time | NUMBER | Unit: minutes |
gender | Gender | varchar2(100) | 01: male, 02: female |
age | Age | varchar2(100) | / |
complaint_level | Complaint severity | VARCHAR2(1000) | 1: normal, 2: important, 3: critical |
former_complaint_num | Total historical complaints | NUMBER | Unit: times |
former_complaint_fee | Historical remediation fee paid | NUMBER | Unit: fen (0.01 yuan) |
The features comprise raw features, statistical features, word2vec features, and stacking features.
The features are first split into categorical and numerical groups, and missing values are filled with 0.
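That preprocessing step is not shown in the excerpt; here is a minimal sketch, assuming hypothetical column lists (the name cate_feature reappears later as the list passed to LightGBM's categorical_feature):
# Minimal preprocessing sketch (not from the original source);
# the column lists below are illustrative assumptions.
cate_feature = ['service_type', 'contract_type', 'net_service',
                'gender', 'complaint_level']
num_feature = [c for c in data.columns
               if c not in cate_feature + ['user_id', 'label']]
data[num_feature] = data[num_feature].fillna(0)  # fill missing values with 0
for c in cate_feature:
    data[c] = data[c].astype('category')         # mark categorical columns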
Count statistics: occurrence counts of each value of monthly fee, traffic, time on network, etc., plus cross counts of each of these with the plan type and contract type;
Difference features: differences between certain related metrics;
Ratio features: the current month's fee as a share of the total fee, ratios among same-type traffic and fee quantities, and so on.
'''
data is the final-round train and test data; train_first is all of the
preliminary-round data plus the final-round train data.
'''
# ---------------------- Count features ------------------
# Count the number of samples for each value combination of the given
# features; generated feature names are collected in the global
# count_feature_list
def feature_count(data, features=[]):
    if len(set(features)) != len(features):
        print('duplicate features!')
        return data
    new_feature = 'count'
    for i in features:
        new_feature += '_' + i.replace('add_', '')
    temp = data.groupby(features).size()\
        .reset_index().rename(columns={0: new_feature})
    data = data.merge(temp, 'left', on=features)
    count_feature_list.append(new_feature)
    # For combinations involving service_type, also add the counts
    # computed on train_first as extra features
    if 'service_type' in features:
        temp_2 = train_first.groupby(features).size()\
            .reset_index().rename(columns={0: 'train_' + new_feature})
        data = data.merge(temp_2, 'left', on=features)
        count_feature_list.append('train_' + new_feature)
    return data
data = feature_count(data, ['1_total_fee'])
data = feature_count(data, ['2_total_fee'])
data = feature_count(data, ['3_total_fee'])
data = feature_count(data, ['4_total_fee'])
data = feature_count(data, ['former_complaint_fee'])
data = feature_count(data, ['pay_num'])
data = feature_count(data, ['contract_time'])
data = feature_count(data, ['last_month_traffic'])
data = feature_count(data, ['online_time'])
for i in ['service_type', 'contract_type']:
data = feature_count(data, [i, '1_total_fee'])
data = feature_count(data, [i, '2_total_fee'])
data = feature_count(data, [i, '3_total_fee'])
data = feature_count(data, [i, '4_total_fee'])
data = feature_count(data, [i, 'former_complaint_fee'])
data = feature_count(data, [i, 'pay_num'])
data = feature_count(data, [i, 'contract_time'])
data = feature_count(data, [i, 'last_month_traffic'])
data = feature_count(data, [i, 'online_time'])
# ---------------------- Count features ------------------
# Differences between adjacent months' fees
data['diff_total_fee_1'] = data['1_total_fee'] - data['2_total_fee']
data['diff_total_fee_2'] = data['2_total_fee'] - data['3_total_fee']
data['diff_total_fee_3'] = data['3_total_fee'] - data['4_total_fee']
# Payment amount minus the current month's bill
data['pay_num_1_total_fee'] = data['pay_num'] - data['1_total_fee']
data['last_month_traffic_rest'] = \
    data['month_traffic'] - data['last_month_traffic']
data.loc[data['last_month_traffic_rest'] < 0, 'last_month_traffic_rest'] = 0
# Over-plan data costs 15 yuan per GB: the estimated traffic fee
# as a share of the current month's bill
data['rest_traffic_ratio'] = \
    (data['last_month_traffic_rest'] * 15 / 1024) / data['1_total_fee']
# Mean, max, and min of the four monthly fees
total_fee = [str(i) + '_total_fee' for i in range(1, 5)]
data['total_fee_mean'] = data[total_fee].mean(axis=1)
data['total_fee_max'] = data[total_fee].max(axis=1)
data['total_fee_min'] = data[total_fee].min(axis=1)
# Share of each call-duration component
data['total_caller_time'] = \
    data['service2_caller_time'] + data['service1_caller_time']
data['service2_caller_ratio'] = \
    data['service2_caller_time'] / data['total_caller_time']
data['local_caller_ratio'] = \
    data['local_caller_time'] / data['total_caller_time']
# Share of each traffic component
data['total_month_traffic'] = \
    data['local_trafffic_month'] + data['month_traffic']
data['month_traffic_ratio'] = \
    data['month_traffic'] / data['total_month_traffic']
data['last_month_traffic_ratio'] = \
    data['last_month_traffic'] / data['total_month_traffic']
# Over-plan voice costs 0.15 yuan per minute: the bill net of the
# estimated over-plan call charges
data['1_total_fee_call_fee'] = \
    data['1_total_fee'] - data['service1_caller_time'] * 0.15
data['1_total_fee_call2_fee'] = \
    data['1_total_fee'] - data['service2_caller_time'] * 0.15
# Traffic fee this month = [traffic used - (last month's carry-over
# + this month's carry-over)] * over-plan per-MB rate (0.3 yuan/MB);
# subtracting it gives the bill net of traffic charges
data['1_total_fee_trfc_fee'] = data['1_total_fee'] - (
    data['month_traffic'] - 2 * data['last_month_traffic']) * 0.3
# Not applicable to service_type == 1 users
data.loc[data.service_type == 1, '1_total_fee_trfc_fee'] = None
The four monthly fees are converted to strings and encoded with word2vec, treating each sample's four fees as one sentence. The author encodes each price as a 10-dimensional vector; gensim makes word2vec straightforward.
from gensim.models import Word2Vec
import pandas as pd
import multiprocessing

# Each sample's four monthly fees form one "sentence"
sentence = []
for line in list(data[['1_total_fee', '2_total_fee',
                       '3_total_fee', '4_total_fee']].values):
    sentence.append([str(float(l)) for l in line])
# Train the word2vec model and generate the vectors (gensim 3.x API;
# gensim 4+ renamed size= to vector_size= and iter= to epochs=)
print('training...')
model = Word2Vec(sentence,
                 size=10,  # 10-dimensional vectors, as described above
                 window=2,
                 min_count=1,
                 workers=multiprocessing.cpu_count(),
                 iter=10)
print('outputting...')
for fea in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']:
    # Distinct fee values of this column
    values = set(data[fea].values)
    print(len(values))
    # One row per value: [value, 10 embedding dimensions]
    w2v = []
    for i in values:
        a = [i]
        a.extend(model[str(float(i))])
        w2v.append(a)
    out_df = pd.DataFrame(w2v)
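The excerpt stops after building out_df; attaching the vectors as features is left out. A minimal sketch of that missing step, continuing the loop body above (the w2v_* column names are hypothetical, not from the original source):
    # Hypothetical continuation of the loop body: name the columns and
    # left-join the 10-dimensional vectors onto data as per-value features.
    out_df.columns = [fea] + ['w2v_' + fea + '_' + str(d) for d in range(10)]
    data = data.merge(out_df, how='left', on=fea)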
For stacking, a LightGBM model is trained on the raw, statistical, and word2vec features above, using the combined preliminary- and final-round train set, and predicts class probabilities for the current (final-round) train and test data; the 11 class probabilities become 11 new features.
# All training data from the preliminary and final rounds
train_x = data[(data.data_type == 1)][feature]
train_y = data[(data.data_type == 1)].label
# The final-round train data (test rows have their label set to 0)
test_x = data[(data.data_type == 0) & (data.label != 0)][feature]
test_y = data[(data.data_type == 0) & (data.label != 0)].label
lgb_model = lgb.LGBMClassifier(
boosting_type="gbdt", num_leaves=120, reg_alpha=0, reg_lambda=0.,
max_depth=-1, n_estimators=2500, objective='multiclass', metric="None",
subsample=0.9, colsample_bytree=0.5, subsample_freq=1,
learning_rate=0.035, random_state=2018, n_jobs=10
)
lgb_model.fit(train_x, train_y, categorical_feature=cate_feature)
print(lgb_model.best_score_)
stacking_path = path + '/stack'
if not os.path.exists(stacking_path):
print(stacking_path)
os.makedirs(stacking_path)
# Predict 11-class probabilities for the final-round train and test data
train_proba = lgb_model.predict_proba(test_x)
test_proba = lgb_model.predict_proba(data[data.label == 0][feature])
print(len(train_proba), len(test_proba))
# Extract user_id and attach the class probabilities as stacking features
stacking_train = data[(data.data_type == 0) & (data.label != 0)][['user_id']]
stacking_test = data[data.label == 0][['user_id']]
for i in range(11):
stacking_train['stacking_' + str(i)] = train_proba[:, i]
stacking_test['stacking_' + str(i)] = test_proba[:, i]
stacking_train.to_csv(stacking_path + '/train.csv', index=False)
stacking_test.to_csv(stacking_path + '/test.csv', index=False)
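These two CSVs are then presumably read back and joined onto the feature table by user_id when training the downstream models; a minimal sketch (not from the original source):
# Hypothetical: merge the stacking probabilities back in as features.
stacking_feat = pd.concat([pd.read_csv(stacking_path + '/train.csv'),
                           pd.read_csv(stacking_path + '/test.csv')])
data = data.merge(stacking_feat, how='left', on='user_id')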
The model is LightGBM (GBDT-based). Classification is done both with a single model and with cross-validation, and the final label is decided by majority vote. Because service_type == 1 and service_type != 1 each restrict the set of possible classes, the two groups can be trained separately. First, a single model predicts the service_type == 1 samples on their own:
lgb_model = lgb.LGBMClassifier(
boosting_type="gbdt", num_leaves=152, reg_alpha=0, reg_lambda=0.,
max_depth=-1, n_estimators=1000, objective='multiclass',
class_weight='balanced', subsample=0.9, colsample_bytree=0.5,
subsample_freq=1, learning_rate=0.03, random_state=2018, n_jobs=10
)
lgb_model.fit(data[data.label != 0][feature], data[data.label != 0].label,
categorical_feature=cate_feature)
result_type1 = pd.DataFrame()
result_type1['user_id'] = \
data[(data.label == 0) & (data.service_type == 1)]['user_id']
result_type1['predict'] = lgb_model.predict(
    data[(data.label == 0) & (data.service_type == 1)][feature])
Then 5-fold cross-validation is used for classification, training on all of the data:
# Custom eval metric returning the squared macro-F1 (higher is better).
# LightGBM hands over the predictions flattened class-major, so reshape
# to (n_classes, n_samples) before the argmax; this subset has 8 classes.
def f1_macro(labels, preds):
    preds = np.argmax(preds.reshape(8, -1), axis=0)
    score = f1_score(y_true=labels, y_pred=preds, average='macro')
    return 'f1_macro', score ** 2, True
X = data[(data.label != 0) &
(data.label != 999999)][feature].reset_index(drop=True)
y = data[(data.label != 0) &
(data.label != 999999)].label.reset_index(drop=True)
label2current_service = dict(
zip(range(0, len(set(y))), sorted(list(set(y)))))
current_service2label = dict(
zip(sorted(list(set(y))), range(0, len(set(y)))))
cv_pred = []
skf = StratifiedKFold(n_splits=5, random_state=20181, shuffle=True)
for index, (train_index, test_index) in enumerate(skf.split(X, y)):
print(index)
lgb_model = lgb.LGBMClassifier(
boosting_type="gbdt", num_leaves=120,
reg_alpha=0, reg_lambda=0.,
max_depth=-1, n_estimators=800,
objective='multiclass', class_weight='balanced',
subsample=0.9, colsample_bytree=0.5, subsample_freq=1,
learning_rate=0.03, random_state=2018 + index,
n_jobs=10, metric="None", importance_type='gain'
)
train_x, test_x, train_y, test_y = \
X.loc[train_index], X.loc[test_index], \
y.loc[train_index], y.loc[test_index]
    # Restrict this fold to the service_type == 4 samples
    train_x = train_x[train_x.service_type == 4]
    train_y = train_y.loc[train_x.index]
    test_x = test_x[test_x.service_type == 4]
    test_y = test_y.loc[test_x.index]
    print(test_y.unique())
    eval_set = [(test_x, test_y)]
    lgb_model.fit(train_x, train_y, eval_set=eval_set, eval_metric=f1_macro,
                  categorical_feature=cate_feature)
y_test = lgb_model.predict(
data[(data.label == 0) & (data.service_type != 1)][feature])
y_test = pd.Series(y_test).map(current_service2label)
if index == 0:
cv_pred = np.array(y_test).reshape(-1, 1)
else:
cv_pred = np.hstack((cv_pred, np.array(y_test).reshape(-1, 1)))
# Majority vote across the five folds
submit = []
for line in cv_pred:
    submit.append(np.argmax(np.bincount(line)))
result = pd.DataFrame()
result['user_id'] = \
data[(data.label == 0) & (data.service_type != 1)]['user_id']
result['predict'] = submit
result['predict'] = result['predict'].map(label2current_service)
# A single special user whose class is the held-out label 999999
result.loc[result['user_id'] == '4VNcD6kE0sjnAvFX', 'predict'] = 999999
# Combine with the service_type == 1 predictions
result = result.append(result_type1)
result[['user_id', 'predict']].to_csv(
path + '/sub.csv', index=False)
The XGBoost model:
import xgboost as xgb

def XGB():
    clf = xgb.XGBClassifier(max_depth=8, learning_rate=0.05,
                            n_estimators=1000, silent=True,
                            objective="multi:softmax",
                            nthread=-1, gamma=0,
                            max_delta_step=0, subsample=1,
                            colsample_bytree=0.8, colsample_bylevel=0.8,
                            reg_alpha=0, reg_lambda=0.25,
                            scale_pos_weight=1, base_score=0.5, seed=1998,
                            missing=None, num_class=11, tree_method='exact',
                            predictor='cpu_predictor')
    return clf
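The training loop around this model is not included in the excerpt; a minimal usage sketch under that assumption (the variable names mirror the LightGBM cross-validation below and are hypothetical here):
# Hypothetical usage: fit on one fold's training split and predict.
clf = XGB()
clf.fit(X_train, y_train)    # X_train/y_train as in the CV loop below
pred = clf.predict(X_valid)  # multi:softmax returns class indices directly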
The LightGBM model:
import pandas as pd
import lightgbm as lgb
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import numpy as np
warnings.filterwarnings('ignore')
path = 'data/'
seed = 1995
n_splits = 10
class_ = 11
params = {
    "learning_rate": 0.05,   # tuned value
    "lambda_l2": 0.25,
    "num_iterations": 800,   # default is 100
    "max_depth": 8,
    "num_leaves": 256,       # default is 31; tuned value
    "min_data_in_leaf": 20,  # default is 20; tuned value
    "objective": "multiclass",
    "num_class": class_,
    "seed": seed,
    "num_threads": -1,
    "feature_fraction": 0.6,
    "feature_fraction_seed": 8,
    "max_bin": 400           # default is 255; tuned value
}
train = pd.read_csv(path + 'train.csv', low_memory=False)
test = pd.read_csv(path + 'test.csv', low_memory=False)
le = LabelEncoder()
y = le.fit_transform(train.pop('current_service'))
train_id = train.pop('user_id')
# Replace the '\N' missing-value marker with -9999
for i in train.columns:
    train[i] = train[i].replace("\\N", -9999)
    test[i] = test[i].replace("\\N", -9999)
# These columns arrive as strings; cast them to float
train['2_total_fee'] = train['2_total_fee'].astype(float)
test['2_total_fee'] = test['2_total_fee'].astype(float)
train['3_total_fee'] = train['3_total_fee'].astype(float)
test['3_total_fee'] = test['3_total_fee'].astype(float)
#train['new'] = train['last_month_traffic'].apply(lambda x:1 if x == 800 else 0)
#test['new'] = test['last_month_traffic'].apply(lambda x:1 if x == 800 else 0)
X = train
X_test = test[train.columns]
test_id = test['user_id']
X, y, X_test = X.values, y, X_test.values
# Custom eval metric for lgb.train: predictions arrive flattened
# class-major, so reshape to (num_class, n_samples) before the argmax
def f1_score_vali(preds, data_vali):
    labels = data_vali.get_label()
    preds = np.argmax(preds.reshape(class_, -1), axis=0)
    score_vali = f1_score(y_true=labels, y_pred=preds, average='macro')
    return 'f1_score', score_vali, True
score = []
cv_pred = []
skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
for index, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(index)
    print("training...")
    X_train, X_valid, y_train, y_valid = \
        X[train_index], X[test_index], y[train_index], y[test_index]
    train_data = lgb.Dataset(X_train, label=y_train)
    validation_data = lgb.Dataset(X_valid, label=y_valid)
    clf = lgb.train(params, train_data, num_boost_round=50000,
                    valid_sets=[validation_data], early_stopping_rounds=50,
                    feval=f1_score_vali, verbose_eval=1)
    pred = clf.predict(X_valid, num_iteration=clf.best_iteration)
    pred = [np.argmax(x) for x in pred]
    # Each fold's validation macro-F1 later serves as its voting weight
    score.append(f1_score(y_valid, pred, average='macro'))
    y_test = clf.predict(X_test, num_iteration=clf.best_iteration)
    y_test = [np.argmax(x) for x in y_test]
    cv_pred.append(y_test)
    print("training end")
cv_pred = np.array(cv_pred).T
print("voting...")
# Weighted vote: each fold's prediction is weighted by its validation F1
submit = []
for line in cv_pred:
tmp = np.zeros((class_,))
for id in range(len(line)):
tmp[line[id]] += score[id]
submit.append(np.argmax(tmp))
print("voting end")
print("saving...")
df_test = pd.DataFrame()
df_test['id'] = list(test_id.unique())
df_test['predict'] = le.inverse_transform(submit)
df_test.to_csv('result/lgb.csv',index=False)
print(score, np.mean(score))
print("saving end")