lightgbm处理类别特征

lightGBM的categorical_feature(类别特征)使用

http://www.luyixian.cn/news_show_253681.aspx

lightGBM比XGBoost的1个改进之处在于对类别特征的处理, 不再需要将类别特征转为one-hot形式, 具体可参考上面的链接.

在使用python API时(参考官方文档)
1.可以使用pd.DataFrame存放特征X, 每一列表示1个特征, 将类别特征设置为X[cat_cols].astype('category'). 这样模型在fit时会自动识别类别特征.
2.在模型的fit方法中传入参数categorical_feature, 指明哪些列是类别特征.
3.类别特征的值应当编码为非负整数(推荐为从0开始的连续整数, 比如0,1,2,...), 不能是负数.

import json
import math
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import safe_config

# Load the raw training data.
# NOTE(review): relative path — assumes the script runs next to the CSV.
df = pd.read_csv('safe-0817-0822.csv')
# Fix: removed a duplicate `import safe_config` (already imported at the top).

# Fill missing text-categorical columns with a sentinel token.
# NOTE: 'unkown' is a typo for 'unknown', kept as-is because the encoded
# output and online service may already depend on this exact literal.
df[safe_config.TEXT_CATEGORICAL_COLS] = df[safe_config.TEXT_CATEGORICAL_COLS].fillna('unkown')

# Impute numeric columns with the column median and remember each fill
# value so the exact same imputation can be replayed online.
missing_value_dict = {}
for col in safe_config.NUMERIC_CATEGORICAL_COLS + safe_config.NUMERIC_COLS:
    fill_value = df[col].median()
    df[col] = df[col].fillna(fill_value)
    # Cast to a plain float: json.dump rejects some numpy scalar types.
    missing_value_dict[col] = float(fill_value)

# Persist the imputation values for online/offline consistency.
with open("missing_value_fill.json", "w") as f:
    json.dump(missing_value_dict, f)
    print("缺失值填充json完成...")


def encodeColumns(sdf, colnames):
    """Label-encode the given columns of *sdf* in place.

    Each column in *colnames* is cast to str and replaced by integer codes
    via sklearn's LabelEncoder.  Returns the (mutated) frame together with
    a mapping {column: {original_value: code}} for reuse online.
    """
    frame = sdf  # NOTE: mutates the caller's frame (no .copy(), as before)
    encoder_maps = {}
    for name in colnames:
        print(name)
        encoder = LabelEncoder()
        frame[name] = encoder.fit_transform(frame[name].astype(str))
        # classes_ is sorted; a class's position is exactly its code.
        encoder_maps[name] = {cls: idx for idx, cls in enumerate(encoder.classes_)}

    return frame, encoder_maps


## 导出线上文件

def key_to_json(data):
    """Coerce a dict key to a JSON-compatible scalar.

    JSON object keys must be plain scalars, so tuples/frozensets are
    stringified and numpy scalars unwrapped.

    Raises:
        TypeError: for any unsupported key type.
    """
    if data is None or isinstance(data, (bool, int, str, float)):
        return data
    if isinstance(data, (tuple, frozenset)):
        return str(data)
    if isinstance(data, np.integer):
        return int(data)
    # Fix 1: np.float was removed in NumPy 1.24 — use the np.floating ABC.
    # Fix 2: the original returned int(data) here, truncating float keys.
    if isinstance(data, np.floating):
        return float(data)
    raise TypeError


def to_json(data):
    """Recursively convert *data* into JSON-serializable structures.

    Sets become sorted lists, numpy floats become Python floats, and dicts
    are converted key-by-key via key_to_json.

    Raises:
        TypeError: for any unsupported type.
    """
    if data is None or isinstance(data, (bool, int, tuple, range, str, list)):
        return data
    if isinstance(data, (set, frozenset)):
        return sorted(data)
    # Fix: np.float was removed in NumPy 1.24 — use the np.floating ABC.
    if isinstance(data, np.floating):
        return float(data)
    if isinstance(data, dict):
        return {key_to_json(key): to_json(data[key]) for key in data}
    raise TypeError


# Encode every categorical column and capture the value->code mapping.
df, le_dict = encodeColumns(df, safe_config.CATEGORICAL_COLS)

# Persist the label-encoder mapping for the online service.
with open('./label_encoder_dict.json', 'w') as fp:
    json.dump(to_json(le_dict), fp)

# Export the fully encoded training table.
df.to_csv('final_safe.csv', index=False, header=True)

import time

# NOTE(review): x_train / y_train / x_test are not defined anywhere in this
# file (later code uses X_train / X_test) — this section looks pasted from
# another script; confirm the variable names before running.
lgb_train = lgb.Dataset(data=x_train,
                        label=y_train,
                        categorical_feature=safe_config.CATEGORICAL_COLS,
                        free_raw_data=False)
print('train')

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',      # binary / xentropy / regression
    'metric': {'binary_logloss', 'xentropy', 'auc'},
    'num_leaves': 200,          # should stay < 2**max_depth
    'max_depth': 9,
    'learning_rate': 0.06,
    # Fix: 'sub_feature' and 'colsample_bytree' are both aliases of
    # 'feature_fraction'; the original set all three to different values
    # (0.6 / 0.9 / 0.8), making the effective value ambiguous.  Keep one.
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'min_data_in_leaf': 300,
    'max_bin': 1000,
    'lambda_l1': 0.1,           # L1 regularization
    'is_unbalance': True,       # NOTE: mutually exclusive with scale_pos_weight
    'bagging_freq': 5,          # perform bagging every k iterations
    'verbose': 1,               # <0 fatal, =0 error/warning, >0 info
    'num_iterations': 1000,
}

gbm = lgb.train(params, lgb_train)

gbm.save_model('model.txt')

rst = gbm.predict(x_test)

 

import lightgbm as lgb
import pandas as pd
import numpy as np
# Feature columns for the classifier below.
# NOTE(review): despite the list name, 'sex' is treated as categorical
# later (astype('category') + categorical_feature=['sex']) — confirm naming.
NUMERIC_COLS = [
    'start_distance', 'price', 'time_diff','sex']

# Time-based split: rows before `yesterday` train, the rest test.
# NOTE(review): `yesterday` is not defined anywhere in this file —
# presumably a datetime supplied by the surrounding notebook; confirm.
df_train = df[(df['parsed_log_time'] < yesterday.strftime("%Y-%m-%d %H:%M:%S"))].drop_duplicates()
df_test = df[(df['parsed_log_time'] >= yesterday.strftime("%Y-%m-%d %H:%M:%S"))].drop_duplicates()

y_train = df_train['success']  # training label
y_test = df_test['success']  # testing label
X_train = df_train[NUMERIC_COLS]  # training dataset
X_test = df_test[NUMERIC_COLS]  # testing dataset
def map_value(x):
    """Map the placeholder value -1 to 0; every other value passes through."""
    return 0 if x == -1 else x

# Normalize the 'sex' placeholder (-1 -> 0) and mark the column categorical
# so LightGBM treats it as such.
# Fix: the original used DataFrame.agg(..., axis=1) — a Python-level pass
# over every full row — just to transform one column; Series.map is
# equivalent and far cheaper.
# NOTE(review): X_train/X_test are column slices of df_train/df_test, so
# these assignments may raise SettingWithCopyWarning — consider building
# them with .copy() where they are created.
X_train['sex'] = X_train['sex'].map(map_value).astype('category')

X_test['sex'] = X_test['sex'].map(map_value).astype('category')


num_leaf = 128

# Train a GBDT classifier; 'sex' is declared categorical at fit time.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=num_leaf,
    reg_alpha=0.0,
    reg_lambda=1,
    max_depth=7,
    n_estimators=100,
    objective='binary',
    learning_rate=0.06,
    random_state=20,
    n_jobs=4,
)
model.fit(X_train, y_train, categorical_feature=['sex'])

# Show feature importances, most important first.
importance_table = pd.DataFrame({
    'column': NUMERIC_COLS,
    'importance': model.feature_importances_,
}).sort_values(by='importance', ascending=False)
print(importance_table)

# Accumulator: distinct-value count per column name.
A = {}


def unqiue_element(x):
    """Record the number of distinct values in *x* under its name in A.

    (The 'unqiue' typo is kept — it is this function's public identifier.)
    """
    A[x.name] = np.unique(x).size


# df[NUMERIC_COLS].apply(unqiue_element)
# print(sorted(A.items(), key=lambda x: x[1], reverse=True))

# --- Evaluation on the held-out (most recent day) test set ---
predictions = model.predict(X_test)
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Fix: the original called model.predict_proba(X_test) twice (once for AUC,
# once for the 'p0' column); compute the positive-class probability once.
pos_proba = model.predict_proba(X_test)[:, 1]

print('正确率:', model.score(X_test, y_test))
print('精确率:', precision_score(y_test, predictions))
print('召回率:', recall_score(y_test, predictions))
print('auc值:', roc_auc_score(y_test, pos_proba))

# Per (driver, time) group: 'rk' ranks rows by predicted success
# probability (best = 1), 'orderNew' ranks by the original order position.
df_test['p0'] = pos_proba
df_test['rk'] = df_test.groupby(['driver_id', 'parsed_log_time'])['p0'].rank(ascending=False, method='average')
df_test['orderNew'] = df_test.groupby(['driver_id', 'parsed_log_time'])['orderNum'].rank(ascending=True, method='average')
print('测试集新排位平均值:', df_test[df_test['success'] == 1]['rk'].mean())
print('测试集老排位平均值:', df_test[df_test['success'] == 1]['orderNew'].mean())

 

你可能感兴趣的:(机器学习)