用LightGBM和xgboost分别做了Kaggle的Digit Recognizer,尝试用GridSearchCV调了下参数,主要是对max_depth, learning_rate, n_estimators等参数进行调试,最终得分为0.9747。
能力有限,接下来也不知道该如何进一步调参。
另外xgboost的GridSearchCV还是不会用,如果有大神会的话,烦请告知。
贴下LightGBM的代码:
#!/usr/bin/python
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
# specify your configurations as a dict
# LightGBM training configuration: GBDT boosting, multiclass objective
# over the 10 digit classes, evaluated with multiclass log-loss.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='multiclass',
    num_class=10,
    verbose=0,
    metric='multi_logloss',
    max_bin=255,
    max_depth=7,
    learning_rate=0.3,
    nthread=4,
    n_estimators=85,
    # feature_fraction=0.8
)
def train_model(model_file='model/lgb'):
    """Train a LightGBM multiclass model on the Kaggle digit data.

    Reads ``data/train.csv`` (label in column 0, pixel features after),
    holds out a third of the rows for validation, trains with early
    stopping, and saves the booster to *model_file*.

    Returns:
        The trained ``lgb.Booster``.
    """
    print("load data ...")
    dataset = pd.read_csv("data/train.csv", header=0)
    d_x = dataset.iloc[:, 1:].values
    d_y = dataset.iloc[:, 0].values
    # Fixed random_state so the train/validation split is reproducible.
    train_X, test_X, train_Y, test_Y = train_test_split(
        d_x, d_y, test_size=0.33, random_state=42)
    lgb_train = lgb.Dataset(train_X, label=train_Y)
    lgb_eval = lgb.Dataset(test_X, label=test_Y, reference=lgb_train)
    print("begin train...")
    # Early stopping halts training when the validation multi_logloss
    # fails to improve for 10 consecutive rounds.
    bst = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_eval],
        num_boost_round=160,
        early_stopping_rounds=10)
    print("train end\nsaving...")
    bst.save_model(model_file)
    return bst
def create_submission():
    """Train the model, predict the Kaggle test set, write submission.csv.

    The submission has two columns: ImageId (1-based row index) and
    Label (the predicted digit).
    """
    bst = train_model()
    test_df = pd.read_csv("data/test.csv", header=0)
    xg_test = test_df.iloc[:, :].values
    print("predicting...")
    # For a multiclass booster, predict returns an
    # (n_samples, num_class) array of class probabilities.
    pred = bst.predict(xg_test)
    print("predict end.")
    print("create submission file...")
    # Take the most probable class directly.  The old
    # sum(i * round(p_i)) decode was fragile: it returns 0 whenever no
    # single probability rounds to 1, and map() is lazy under Python 3,
    # so len(pred) would raise TypeError.
    labels = np.argmax(pred, axis=1)
    np.savetxt(
        'submission.csv',
        np.c_[np.arange(1, len(labels) + 1), labels],
        delimiter=',',
        header='ImageId,Label',
        comments='',
        fmt='%d')
    print("----end----")
def tune_model():
    """Grid-search learning_rate / n_estimators / max_depth for LightGBM.

    Fits a 3-fold GridSearchCV over the full training CSV and prints the
    cross-validation results, the best parameter set and the best score.
    """
    print("load data ...")
    dataset = pd.read_csv("data/train.csv", header=0)
    d_x = dataset.iloc[:, 1:].values
    d_y = dataset.iloc[:, 0].values
    print("create classifier...")
    param_grid = {
        # "reg_alpha": [0.3, 0.7, 0.9, 1.1],
        "learning_rate": [0.1, 0.25, 0.3],
        'n_estimators': [75, 80, 85, 90],
        'max_depth': [6, 7, 8, 9],
    }
    # Best found so far: max_depth=7, learning_rate=0.25.
    # NOTE: the number of classes is inferred from d_y during fit();
    # assigning model.n_classes by hand (as the old code did) has no
    # effect, so it is dropped here along with an unused params dict.
    model = lgb.LGBMClassifier(
        boosting_type='gbdt', objective="multiclass", nthread=8, seed=42)
    print("run grid search...")
    searcher = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
    searcher.fit(d_x, d_y)
    # grid_scores_ was removed in scikit-learn 0.20; cv_results_ is the
    # supported replacement.
    print(searcher.cv_results_)
    print("=" * 30, '\n')
    print(searcher.best_params_)
    print("=" * 30, '\n')
    print(searcher.best_score_)
    print("end")


if __name__ == "__main__":
    # create_submission()
    tune_model()
另外,xgboost的代码:
# -*- coding: utf-8 -*-
#!/usr/bin/python
import codecs
import os
import time
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn import metrics
import sklearn.preprocessing as sp
# XGBoost training parameters: softmax multiclass over the 10 digits,
# learning rate (eta) 0.25, trees capped at depth 7, quiet output.
params = dict(
    objective="multi:softmax",
    eta=0.25,
    max_depth=7,
    silent=1,
    nthread=4,
    num_class=10,
)
def train_model():
    """Train an XGBoost multiclass model on the Kaggle digit data.

    Reads ``data/train.csv`` (label in column 0, pixel features after),
    trains for 10 boosting rounds on the whole set (no validation
    split), and saves the booster to ``model/bst``.

    Returns:
        The trained ``xgb.Booster``.
    """
    print("load data ...")
    dataset = pd.read_csv("data/train.csv", header=0)
    train_X = dataset.iloc[:, 1:].values
    train_Y = dataset.iloc[:, 0].values
    xg_train = xgb.DMatrix(train_X, label=train_Y)
    print("begin train...")
    bst = xgb.train(params, xg_train, 10)
    print("train end\nsaving...")
    bst.save_model("model/bst")
    return bst
def create_submission():
    """Train the model, predict the Kaggle test set, write submission.csv.

    With objective "multi:softmax", predict() already returns the class
    label for each row (not probabilities), so the predictions can be
    written out directly.
    """
    test_df = pd.read_csv("data/test.csv", header=0)
    xg_test = xgb.DMatrix(test_df.iloc[:, :].values)
    bst = train_model()
    print("predicting...")
    pred = bst.predict(xg_test)
    print("predict end.")
    print("create submission file...")
    # The old code also built a pandas DataFrame here but never used
    # it; only the np.savetxt call produces the file.
    np.savetxt(
        'submission.csv',
        np.c_[np.arange(1, len(pred) + 1), pred],
        delimiter=',',
        header='ImageId,Label',
        comments='',
        fmt='%d')
    print("----end----")
def tune_parameters():
    """Grid-search the XGBoost learning rate on a small data subsample.

    Uses only the first 100 rows of ``data/train.csv`` to keep the
    search fast, then prints the cross-validation results, best
    parameters and best score.
    """
    print("load data ...")
    dataset = pd.read_csv("data/train.csv", header=0)
    train_X = dataset.iloc[:100, 1:].values
    # .iloc[:100, 0] yields the 1-D label vector sklearn expects; the
    # old :1 slice produced an (n, 1) column array.
    train_Y = dataset.iloc[:100, 0].values
    param_grid = {'learning_rate': [0.1, 0.4]}
    print("create classifier...")
    model = xgb.XGBClassifier(
        max_depth=6,
        learning_rate=0.1,
        n_estimators=10,
        silent=True,
        objective="multi:softmax",
        seed=36,
        nthread=8)
    # 'roc_auc' is a binary metric; the old code label-binarized the
    # targets to force it, which turns the fit into a multilabel
    # problem XGBClassifier cannot handle.  Plain accuracy works
    # directly on the multiclass labels.  (An unused DMatrix was also
    # built here; GridSearchCV takes raw arrays, so it is dropped.)
    searcher = GridSearchCV(
        estimator=model, param_grid=param_grid, scoring='accuracy', cv=3)
    print("fitting ...")
    searcher.fit(train_X, train_Y)
    # grid_scores_ was removed in scikit-learn 0.20; use cv_results_.
    print(searcher.cv_results_, searcher.best_params_, searcher.best_score_)
    print("end...")


if __name__ == "__main__":
    tune_parameters()