Digit Recognizer by LightGBM

用LightGBM和xgboost分别做了Kaggle的Digit Recognizer,尝试用GridSearchCV调了下参数,主要是对max_depth, learning_rate, n_estimators等参数进行调试,最后得分在0.9747左右。

能力有限,接下来也不知道该如何进一步调参。


另外xgboost的GridSearchCV还是不会用,如果有大神会的话,烦请告知。

贴下LightGBM的代码:

#!/usr/bin/python
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV

# LightGBM training configuration, consumed by lgb.train() in train_model().
params = {
    'task': 'train',
    'boosting_type': 'gbdt',      # gradient-boosted decision trees
    'objective': 'multiclass',    # 10-way digit classification
    'num_class': 10,              # digits 0-9
    'verbose': 0,
    'metric': 'multi_logloss',    # evaluation metric on the validation set
    'max_bin': 255,
    'max_depth': 7,
    'learning_rate': 0.3,
    'nthread': 4,
    # NOTE(review): 'n_estimators' is a sklearn-wrapper alias; lgb.train()
    # takes its round count from num_boost_round, so this entry is most
    # likely ignored here — confirm against the installed LightGBM version.
    'n_estimators': 85,
    #'feature_fraction': 0.8
}


def train_model(model_file='model/lgb'):
    """Train a LightGBM multiclass model on data/train.csv and save it.

    The training CSV is expected in Kaggle Digit Recognizer layout:
    first column is the digit label, remaining columns are pixels.

    :param model_file: path the trained booster is saved to.
    :return: the trained ``lgb.Booster``.
    """
    # print() calls (single-argument) work under both Python 2 and 3;
    # the original bare print statements were Python-2-only syntax.
    print("load data ...")
    dataset = pd.read_csv("data/train.csv", header=0)
    d_x = dataset.iloc[:, 1:].values  # pixel features
    d_y = dataset.iloc[:, 0].values   # digit labels (first column)
    train_X, test_X, train_Y, test_Y = train_test_split(
        d_x, d_y, test_size=0.33, random_state=42)

    lgb_train = lgb.Dataset(train_X, label=train_Y)
    # reference= lets the validation set reuse the training set's bin mappings.
    lgb_eval = lgb.Dataset(test_X, label=test_Y, reference=lgb_train)

    print("begin train...")
    bst = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_eval],
        num_boost_round=160,
        early_stopping_rounds=10)
    print("train end\nsaving...")
    bst.save_model(model_file)
    return bst


def create_submission():
    """Train the LightGBM model, predict data/test.csv, write submission.csv.

    Output format follows Kaggle Digit Recognizer: columns ImageId,Label
    with ImageId starting at 1.
    """
    # get model
    bst = train_model()

    # load test data
    test_df = pd.read_csv("data/test.csv", header=0)
    xg_test = test_df.iloc[:, :].values
    print("predicting...")
    pred = bst.predict(xg_test)  # shape (n_samples, 10): class probabilities
    print("predict end.")
    # create csv file
    print("create submission file...")
    # Pick the most probable class per row. The original
    # sum(i * round(p_i)) trick misfires whenever no probability rounds
    # to 1 (or more than one does), and its map() result is a lazy
    # iterator on Python 3, breaking the later len() calls.
    labels = np.argmax(pred, axis=1)
    np.savetxt(
        'submission.csv',
        np.c_[np.arange(1, len(labels) + 1), labels],
        delimiter=',',
        header='ImageId,Label',
        comments='',
        fmt='%d')
    print("----end----")


def tune_model():
    """Grid-search LightGBM hyper-parameters on the full training set.

    Runs 3-fold cross-validation over learning_rate, n_estimators and
    max_depth, then prints the full CV table, the best parameter
    combination and its mean CV score.
    """
    print("load data ...")
    dataset = pd.read_csv("data/train.csv", header=0)
    d_x = dataset.iloc[:, 1:].values  # pixel features
    d_y = dataset.iloc[:, 0].values   # digit labels

    print("create classifier...")
    param_grid = {
        #"reg_alpha": [0.3, 0.7, 0.9, 1.1],
        "learning_rate": [0.1, 0.25, 0.3],
        'n_estimators': [75, 80, 85, 90],
        'max_depth': [6, 7, 8, 9]
    }
    # Best found so far: max_depth=7, learning_rate=0.25.
    # (The old local `params` dict was never used and shadowed the
    # module-level one; setting model.n_classes was dead code because
    # GridSearchCV clones the estimator and n_classes is inferred at fit.)
    model = lgb.LGBMClassifier(
        boosting_type='gbdt', objective="multiclass", nthread=8, seed=42)
    print("run grid search...")
    searcher = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
    searcher.fit(d_x, d_y)
    # grid_scores_ was removed in scikit-learn 0.20; cv_results_ is the
    # supported replacement.
    print(searcher.cv_results_)
    print("=" * 30 + "\n")
    print(searcher.best_params_)
    print("=" * 30 + "\n")
    print(searcher.best_score_)
    print("end")


if __name__ == "__main__":
    # Entry point: runs the grid search by default; switch to
    # create_submission() to train and write submission.csv instead.
    #create_submission()
    tune_model()


另外,xgboost的代码:

# -*- coding: utf-8 -*-
#!/usr/bin/python
import codecs
import os
import time

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn import metrics
import sklearn.preprocessing as sp

# XGBoost training configuration, consumed by xgb.train() in train_model().
params = {
    "objective": "multi:softmax",  # predict class ids directly
    "eta": 0.25,                   # learning rate
    'max_depth': 7,
    'silent': 1,                   # suppress per-iteration logging
    'nthread': 4,
    'num_class': 10,               # digits 0-9
}


def train_model():
    """Train an XGBoost multi:softmax model on data/train.csv.

    Saves the booster to model/bst and returns it. The training CSV is
    in Kaggle layout: first column label, remaining columns pixels.

    :return: the trained ``xgb.Booster``.
    """
    # Single-argument print() works on both Python 2 and 3; the original
    # bare print statements were Python-2-only syntax.
    print("load data ...")
    dataset = pd.read_csv("data/train.csv", header=0)
    train_X = dataset.iloc[:, 1:].values  # pixel features
    train_Y = dataset.iloc[:, 0].values   # digit labels
    xg_train = xgb.DMatrix(train_X, label=train_Y)

    print("begin train...")
    # NOTE(review): only 10 boosting rounds — almost certainly underfits
    # this dataset; raise for a competitive score.
    bst = xgb.train(params, xg_train, 10)
    print("train end\nsaving...")
    bst.save_model("model/bst")
    return bst


def create_submission():
    """Train the XGBoost model, predict data/test.csv, write submission.csv.

    Output format follows Kaggle Digit Recognizer: columns ImageId,Label
    with ImageId starting at 1.
    """
    test_df = pd.read_csv("data/test.csv", header=0)
    xg_test = xgb.DMatrix(test_df.iloc[:, :].values)

    bst = train_model()

    print("predicting...")
    # multi:softmax returns the predicted class id per row (as floats).
    pred = bst.predict(xg_test)
    print("predict end.")
    # create csv file
    print("create submission file...")
    # (The old pandas DataFrame here was built but never written — the
    # to_csv call was commented out — so it has been dropped.)
    np.savetxt(
        'submission.csv',
        np.c_[np.arange(1, len(pred) + 1), pred.astype(int)],
        delimiter=',',
        header='ImageId,Label',
        comments='',
        fmt='%d')
    print("----end----")


def tune_parameters():
    """Grid-search the XGBoost learning rate on a small training subsample.

    Uses 3-fold CV over the first 100 rows (kept small so the search is
    quick to iterate on) and prints the CV table, best parameters and
    best score.
    """
    print("load data ...")
    dataset = pd.read_csv("data/train.csv", header=0)
    train_X = dataset.iloc[:100, 1:].values
    # iloc[:100, 0] yields the 1-D label vector sklearn expects; the
    # original iloc[:100, :1] produced a 2-D (100, 1) column, and the
    # unused DMatrix built from it has been removed.
    train_Y = dataset.iloc[:100, 0].values

    param_grid = {'learning_rate': [0.1, 0.4]}
    print("create classifier...")
    model = xgb.XGBClassifier(
        max_depth=6,
        learning_rate=0.1,
        n_estimators=10,
        silent=True,
        objective="multi:softmax",
        seed=36,
        nthread=8)
    # 'roc_auc' scoring is binary-only, and fitting XGBClassifier on
    # label_binarize'd (multilabel) targets fails outright — which is why
    # this grid search never worked. Plain accuracy handles multiclass.
    searcher = GridSearchCV(
        estimator=model, param_grid=param_grid, scoring='accuracy', cv=3)
    print("fitting ...")
    searcher.fit(train_X, train_Y)
    # grid_scores_ was removed in scikit-learn 0.20; use cv_results_.
    print(searcher.cv_results_)
    print(searcher.best_params_)
    print(searcher.best_score_)
    print("end...")


if __name__ == "__main__":
    # Entry point: run the learning-rate grid search.
    tune_parameters()








你可能感兴趣的:(什么?瓦力!)