import xgboost
from numpy import loadtxt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
data_set = loadtxt("pima-indians-diabetes.csv", delimiter=",")
# split the data into features X and label y
X = data_set[:, 0:8]
y = data_set[:, 8]
# split the data set into train and test sets
seed = 6
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
# fit the model
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# calculate the accuracy; predict() already returns class labels, so no rounding is needed
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 77.06%
# wrap the data in the format fit() expects; eval_set specifies the data used to evaluate the model each time a new tree is added
eval_set = [(X_test, y_test)]
# early_stopping_rounds: stop training once the loss has not improved for this many consecutive rounds
# eval_metric: the evaluation metric to monitor; here we use the log loss
# verbose: whether to print the loss after each round; set it to False to suppress the per-round output
model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="logloss", eval_set=eval_set, verbose=True)
# make predictions for the test data
y_pred = model.predict(X_test)
# evaluate predictions; as before, predict() returns class labels directly
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
[0] validation_0-logloss:0.660186
Will train until validation_0-logloss hasn't improved in 10 rounds.
[1] validation_0-logloss:0.634854
[2] validation_0-logloss:0.612239
[3] validation_0-logloss:0.593118
[4] validation_0-logloss:0.578303
[5] validation_0-logloss:0.564942
[6] validation_0-logloss:0.555113
[7] validation_0-logloss:0.54499
[8] validation_0-logloss:0.539151
[9] validation_0-logloss:0.531819
[10] validation_0-logloss:0.526065
[11] validation_0-logloss:0.51977
[12] validation_0-logloss:0.514979
[13] validation_0-logloss:0.50927
[14] validation_0-logloss:0.506086
[15] validation_0-logloss:0.503565
[16] validation_0-logloss:0.503591
[17] validation_0-logloss:0.500805
[18] validation_0-logloss:0.497605
[19] validation_0-logloss:0.495328
[20] validation_0-logloss:0.494777
[21] validation_0-logloss:0.494274
[22] validation_0-logloss:0.493333
[23] validation_0-logloss:0.492211
[24] validation_0-logloss:0.491936
[25] validation_0-logloss:0.490578
[26] validation_0-logloss:0.490895
[27] validation_0-logloss:0.490646
[28] validation_0-logloss:0.491911
[29] validation_0-logloss:0.491407
[30] validation_0-logloss:0.488828
[31] validation_0-logloss:0.487867
[32] validation_0-logloss:0.487297
[33] validation_0-logloss:0.487562
[34] validation_0-logloss:0.487788
[35] validation_0-logloss:0.487962
[36] validation_0-logloss:0.488218
[37] validation_0-logloss:0.489582
[38] validation_0-logloss:0.489334
[39] validation_0-logloss:0.490969
[40] validation_0-logloss:0.48978
[41] validation_0-logloss:0.490704
[42] validation_0-logloss:0.492369
Stopping. Best iteration:
[32] validation_0-logloss:0.487297
Accuracy: 78.35%
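After early stopping, the fitted model records which round was best. A minimal sketch of reading that back and predicting with only the trees up to that round (best_iteration, best_score and best_ntree_limit are attributes of the sklearn wrapper in the xgboost versions of this era; newer releases use iteration_range instead):
print(model.best_iteration)  # round with the lowest validation logloss, 32 above
print(model.best_score)      # the logloss at that round, 0.487297 above
# predict using only the trees up to the best round
y_pred_best = model.predict(X_test, ntree_limit=model.best_ntree_limit)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred_best) * 100.0))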
# plot the importance of each feature, computed from the trained trees
from xgboost import plot_importance
from matplotlib import pyplot
plot_importance(model)
pyplot.show()
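The importances behind the plot are also exposed directly on the model, and plot_importance accepts an importance_type argument ('weight', 'gain' or 'cover'); a quick sketch:
print(model.feature_importances_)  # one score per input column
plot_importance(model, importance_type="gain")  # rank by average gain instead of split count
pyplot.show()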
# I have not fully worked this out yet, so leaving it as an open question; it should be usable for decision trees as well.
from xgboost import plot_tree
import graphviz  # plot_tree requires the graphviz package to be installed
plot_tree(model, num_trees=5)
pyplot.show()
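To answer the open question above: plot_tree draws through matplotlib, while to_graphviz returns a graphviz object that can be rendered to a file; a minimal sketch, assuming the graphviz binaries are installed (the output filename is just an example):
from xgboost import to_graphviz
graph = to_graphviz(model, num_trees=5)  # same tree as above, as a graphviz object
graph.render("tree_5")                   # writes tree_5.pdf to the working directory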
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
param_grid = dict(learning_rate=learning_rate)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
print("%f with: %r" % (mean, param))
Best: -0.483304 using {'learning_rate': 0.1}
-0.689811 with: {'learning_rate': 0.0001}
-0.661827 with: {'learning_rate': 0.001}
-0.531155 with: {'learning_rate': 0.01}
-0.483304 with: {'learning_rate': 0.1}
-0.515642 with: {'learning_rate': 0.2}
-0.554158 with: {'learning_rate': 0.3}
The usual tuning order for XGBoost hyperparameters:
1. learning rate
2. tree parameters (a grid-search sketch over these follows below)
   max_depth
   min_child_weight
   subsample, colsample_bytree
   gamma
3. regularization parameters
   lambda
   alpha
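Following the same GridSearchCV pattern as above, the tree parameters from group 2 can be searched jointly; a sketch, with value ranges that are common starting points rather than anything from this run:
param_grid = dict(max_depth=[3, 5, 7, 9], min_child_weight=[1, 3, 5])
grid_search = GridSearchCV(XGBClassifier(learning_rate=0.1), param_grid,
                           scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X, y)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))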
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
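xgb1 sets n_estimators deliberately high; a common follow-up (a sketch, not part of the original run) is to fit it with the same early-stopping setup so the effective number of trees is chosen on the validation data:
xgb1.fit(X_train, y_train, early_stopping_rounds=50,
         eval_metric="logloss", eval_set=[(X_test, y_test)], verbose=False)
print(xgb1.best_iteration)  # the number of trees actually worth keeping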