http://www.luyixian.cn/news_show_253681.aspx
lightGBM比XGBoost的1个改进之处在于对类别特征的处理, 不再需要将类别特征转为one-hot形式, 具体可参考这里.
在使用python API时(参考官方文档)
1.可以使用pd.DataFrame存放特征X, 每一列表示1个特征, 将类别特征设置为X[cat_cols].astype('category'). 这样模型在fit时会自动识别类别特征.
2.在模型的fit方法中传入参数categorical_feature, 指明哪些列是类别特征.
3.类别特征的值应为非负整数(不要求从0开始连续), 负值会被LightGBM当作缺失值处理.
import json
import os
import math
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import safe_config
import numpy as np
# Load the raw training window.
df = pd.read_csv('safe-0817-0822.csv')

# Text categorical columns: fill missing values with a sentinel token.
# NOTE: 'unkown' is misspelled but kept byte-identical on purpose — the
# exported label-encoder dict and any online consumers already use this
# exact token; changing it would break artifact compatibility.
df[safe_config.TEXT_CATEGORICAL_COLS] = df[safe_config.TEXT_CATEGORICAL_COLS].fillna('unkown')

# Numeric columns: impute with the column median and remember each fill
# value so the identical imputation can be replayed at serving time.
missing_value_dict = {}
for col in safe_config.NUMERIC_CATEGORICAL_COLS + safe_config.NUMERIC_COLS:
    fill_value = df[col].median()
    df[col] = df[col].fillna(fill_value)
    # Cast the numpy scalar to a plain Python float so the dict is
    # unambiguously JSON-serializable.
    missing_value_dict[col] = float(fill_value)

# Persist the imputation values for online/offline consistency.
with open("missing_value_fill.json", "w") as f:
    json.dump(missing_value_dict, f)
print("缺失值填充json完成...")
def encodeColumns(sdf, colnames):
    """Label-encode the given columns of ``sdf`` IN PLACE.

    Each listed column is cast to ``str`` and replaced with integer codes
    0..n-1 (LabelEncoder assigns codes in sorted order of the distinct
    string values).

    Args:
        sdf: DataFrame to encode. NOTE: it is aliased, not copied, so the
            caller's DataFrame is mutated.
        colnames: iterable of column names to encode.

    Returns:
        (df, labelEncoderDict): the same DataFrame, plus a mapping
        {column -> {original string value -> integer code}} suitable for
        export to the online service.
    """
    df = sdf  # alias, not a copy — intentional in-place mutation
    labelEncoderDict = {}
    for col in colnames:
        print(col)  # progress indicator for long-running encodes
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        # le.classes_ is sorted; the position of a class IS its code.
        labelEncoderDict[col] = {cls: i for i, cls in enumerate(le.classes_)}
    return df, labelEncoderDict
## Export helpers for the online (serving-side) JSON files.
def key_to_json(data):
    """Coerce a dict key into a JSON-compatible scalar.

    JSON-safe scalars pass through unchanged; tuples/frozensets are
    stringified; numpy scalars are converted to native Python types.

    Raises:
        TypeError: for any unsupported key type.
    """
    if data is None or isinstance(data, (bool, int, str, float)):
        return data
    if isinstance(data, (tuple, frozenset)):
        return str(data)
    if isinstance(data, np.integer):
        return int(data)
    # Fixes vs. original:
    #  - `np.float` was removed in numpy >= 1.24; use np.floating.
    #  - a float key must stay a float, not be truncated via int().
    if isinstance(data, np.floating):
        return float(data)
    raise TypeError(f"unsupported key type: {type(data)!r}")
def to_json(data):
    """Recursively convert ``data`` into JSON-serializable structures.

    JSON-safe scalars and sequences pass through; sets become sorted
    lists; numpy/plain floats become Python floats; dicts are converted
    key-wise via :func:`key_to_json` and value-wise recursively.

    Raises:
        TypeError: for any unsupported type.
    """
    if data is None or isinstance(data, (bool, int, tuple, range, str, list)):
        return data
    if isinstance(data, (set, frozenset)):
        return sorted(data)
    # Fixes vs. original: `np.float` was removed in numpy >= 1.24, and a
    # plain Python float is not in the pass-through tuple above, so it
    # must be handled here explicitly.
    if isinstance(data, (float, np.floating)):
        return float(data)
    if isinstance(data, dict):
        return {key_to_json(key): to_json(data[key]) for key in data}
    raise TypeError(f"unsupported type: {type(data)!r}")
# Encode every categorical column and persist the serving artifacts.
df, le_dict = encodeColumns(df, safe_config.CATEGORICAL_COLS)

# label encoder dict -> JSON (numpy keys/values coerced by to_json)
with open('./label_encoder_dict.json', 'w') as fp:
    json.dump(to_json(le_dict), fp)

# index=False: do not write the row index column.
df.to_csv('final_safe.csv', index=False, header=True)
import time

# Build the LightGBM Dataset for the native (Booster) training API.
# NOTE(review): `lgb` is only imported further down in this file and
# `x_train`/`y_train` are not defined before this point — this looks like a
# notebook dump whose cells are out of order; confirm the intended
# execution order before running as a script.
# X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.3)
lgb_train = lgb.Dataset(data=x_train,
                        label=y_train,
                        categorical_feature=safe_config.CATEGORICAL_COLS,
                        free_raw_data=False)
print('train')
# lgb_eval = lgb.Dataset(data=x_test,
#                        label=y_test,
#                        categorical_feature=config.CATEGORICAL_COLS,
#                        reference=lgb_train,
#                        free_raw_data=False)
# LightGBM training parameters for the native API.
# NOTE(review): several keys below are aliases of one another with
# conflicting values — 'feature_fraction' (0.8), 'sub_feature' (0.6) and
# 'colsample_bytree' (0.9) all map to the same feature_fraction setting.
# LightGBM resolves them by alias precedence; confirm which value actually
# takes effect and keep only one.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',  # binary xentropy regression
    'metric': {'binary_logloss','xentropy','auc'},
    'num_leaves': 200,  # <2^depth
    'max_depth':9,
    'learning_rate': 0.06,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'colsample_bytree': 0.9,  # alias of feature_fraction — see NOTE above
    'min_data_in_leaf': 300,
    'max_bin': 1000,
    # 'lambda':1,
    'lambda_l1': 0.1,  # L1 regularization
    # 'lambda_l2': 0.001,  # L2 regularization
    # 'min_data':100,
    'is_unbalance':True,  # auto-reweight the positive class
    # 'scale_pos_weight':10,
    'sub_feature':0.6,  # alias of feature_fraction — see NOTE above
    'bagging_freq': 5,  # perform bagging every k iterations
    'verbose': 1,  # <0 fatal only, =0 errors (warnings), >0 info
    'num_iterations':1000
}
# Train on the full training Dataset (no validation set passed), persist
# the model in LightGBM's text format, then score the held-out rows.
gbm = lgb.train(params,
                lgb_train)
gbm.save_model('model.txt')
# NOTE(review): `x_test` is not defined earlier in this file — presumably
# produced in another notebook cell; confirm before running end-to-end.
rst = gbm.predict(x_test)
import lightgbm as lgb
import pandas as pd
import numpy as np
# Feature columns fed to the LGBMClassifier below ('sex' is in fact
# treated as categorical at fit time despite the list's name).
NUMERIC_COLS = [
    'start_distance', 'price', 'time_diff','sex']
# Time-based split: rows strictly before `yesterday` train, the rest test.
# NOTE(review): `yesterday` is not defined anywhere in this file —
# presumably a datetime computed in another notebook cell; confirm.
df_train = df[(df['parsed_log_time'] < yesterday.strftime("%Y-%m-%d %H:%M:%S"))].drop_duplicates()
df_test = df[(df['parsed_log_time'] >= yesterday.strftime("%Y-%m-%d %H:%M:%S"))].drop_duplicates()
y_train = df_train['success']  # training label
y_test = df_test['success']  # testing label
X_train = df_train[NUMERIC_COLS]  # training dataset
X_test = df_test[NUMERIC_COLS]  # testing dataset
def map_value(x):
    """Collapse the -1 sentinel to 0; return every other value unchanged."""
    return 0 if x == -1 else x
# Normalize the 'sex' sentinel (-1 -> 0) and mark the column categorical
# so LightGBM treats it as a categorical feature.
# Vectorized Series.mask replaces the original row-wise
# DataFrame.agg(lambda row: map_value(row['sex']), axis=1), which invoked
# a Python lambda once per row — same result, one C-level pass.
X_train['sex'] = X_train['sex'].mask(X_train['sex'] == -1, 0)
X_train['sex'] = X_train['sex'].astype('category')
X_test['sex'] = X_test['sex'].mask(X_test['sex'] == -1, 0)
X_test['sex'] = X_test['sex'].astype('category')
# Maximum leaves per tree for the sklearn-API classifier below.
num_leaf = 128
# Save GBDT model as PMML (original note; no PMML export is visible here —
# NOTE(review): confirm where the PMML export actually happens).
# Train a binary LGBMClassifier on NUMERIC_COLS; 'sex' is declared
# categorical at fit time.
model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=num_leaf, reg_alpha=0.0, reg_lambda=1,
                           max_depth=7, n_estimators=100, objective='binary',
                           learning_rate=0.06, random_state=20, n_jobs=4)
model.fit(X_train, y_train,categorical_feature=['sex'])
# Print per-feature split importance, most important first.
print(pd.DataFrame({
    'column': NUMERIC_COLS,
    'importance': model.feature_importances_
}).sort_values(by='importance', ascending=False))
# Scratch accumulator: column name -> number of distinct values.
A = {}


def unqiue_element(x):
    """Store the distinct-value count of Series ``x`` in A under ``x.name``.

    (Name typo preserved — it is part of the existing call sites.)
    """
    A[x.name] = len(np.unique(x))

# df[NUMERIC_COLS].apply(unqiue_element)
# print(sorted(A.items(), key=lambda x: x[1], reverse=True))
# Score the test set and report standard binary-classification metrics.
predictions = model.predict(X_test)
from sklearn.metrics import precision_score, recall_score, roc_auc_score
print('正确率:', model.score(X_test, y_test))
print('精确率:', precision_score(y_test, predictions))
print('召回率:', recall_score(y_test, predictions))
print('auc值:', roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
# Positive-class probability for each test row.
df_test['p0'] = model.predict_proba(X_test)[:, 1]
# Rank candidates within each (driver, log-time) group: 'rk' ranks by the
# new model score (descending = best first); 'orderNew' ranks by the
# existing 'orderNum' (ascending) as the baseline ordering.
df_test['rk'] = df_test.groupby(['driver_id', 'parsed_log_time'])['p0'].rank(ascending=0, method='average')
df_test['orderNew'] = df_test.groupby(['driver_id', 'parsed_log_time'])['orderNum'].rank(ascending=1, method='average')
# Mean rank of the actually-successful rows under each ordering
# (lower = the ordering surfaces winners earlier).
print('测试集新排位平均值:', df_test[df_test['success'] == 1]['rk'].mean())
print('测试集老排位平均值:', df_test[df_test['success'] == 1]['orderNew'].mean())