Anaconda下xgboost安装和实现
xgboost安装:
1.下载xgboost安装包:https://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost
xgboost-0.82-cp27-cp27m-win32.whl
xgboost-0.82-cp27-cp27m-win_amd64.whl
xgboost-0.90-cp35-cp35m-win32.whl
xgboost-0.90-cp35-cp35m-win_amd64.whl
xgboost-0.90-cp36-cp36m-win32.whl
xgboost-0.90-cp36-cp36m-win_amd64.whl
xgboost-0.90-cp37-cp37m-win32.whl
xgboost-0.90-cp37-cp37m-win_amd64.whl
以上有多个版本可选:cp27 表示 Python 2.7,cp35/cp36/cp37 分别对应 Python 3.5/3.6/3.7;win32 为 32 位版本,win_amd64 为 64 位版本,请选择与本机 Python 版本和位数一致的安装包
2.下载完成后,将 .whl 文件放入 Python 安装目录下的 \Anaconda\Scripts 文件夹
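3.在存放 .whl 文件的目录下打开命令行,执行 pip install 加上所下载的文件名,例如 pip install xgboost-0.90-cp37-cp37m-win_amd64.whl(若直接执行 pip install xgboost,pip 会从 PyPI 在线下载安装,而不会使用已下载的安装包)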
其他相关包的安装 https://blog.csdn.net/Jarry_cm/article/details/92836423
#用xgb包能导出dump_model查看每棵树的形状
import xgboost as xgb
from sklearn.model_selection import train_test_split
from pandas import DataFrame
from xgboost.sklearn import XGBClassifier
from xgboost import plot_tree
import matplotlib.pyplot as plt
import numpy as np
from xgboost import plot_importance
# Load the tab-separated dataset from 'test.txt': column 1 of each row is the
# class label and columns 2 onward are numeric features (column 0 is unused).
data = []
labels = []
with open('test.txt') as ifile:
    for line in ifile:
        stripped = line.strip()
        if not stripped:
            # Skip blank lines (e.g. a trailing empty line); they would
            # otherwise raise IndexError on tokens[1] below.
            continue
        tokens = stripped.split('\t')
        data.append([float(tk) for tk in tokens[2:]])
        labels.append(tokens[1])
x = np.array(data)
labels = np.array(labels)
# Binary target: 1.0 where the label string is exactly '1', 0.0 otherwise.
y = np.zeros(labels.shape)
y[labels == '1'] = 1
# Split into training and test sets, holding out 30% of the rows for testing.
# NOTE(review): no random_state is passed, so the partition (and every metric
# derived from it) is presumably not reproducible between runs -- confirm
# whether that is intended.
split_parts = train_test_split(x, y, test_size=0.3)
x_train, x_test, y_train, y_test = split_parts
# Only the training matrix carries labels; the test matrix holds features only.
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test)
# Booster parameters handed to xgb.train.
# NOTE(review): 'silent' was superseded by 'verbosity' in later xgboost
# releases -- confirm against the installed version.
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 2,
    'lambda': 10,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'min_child_weight': 1,
    'eta': 0.025,
    'seed': 0,
    'nthread': 8,
    'silent': 1,
}
# The evaluation metric is reported on the training matrix only.
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist)
# Raw scores for the test rows, then hard 0/1 labels by thresholding at 0.5.
ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5).astype(int)
# AUC, threshold-based scores and the confusion matrix on the test set.
from sklearn import metrics
# AUC is computed from the raw scores (ypred); the remaining metrics use the
# thresholded 0/1 labels (y_pred).
print('AUC: %.4f' % metrics.roc_auc_score(y_test, ypred))
print('ACC: %.4f' % metrics.accuracy_score(y_test, y_pred))
print('Recall: %.4f' % metrics.recall_score(y_test, y_pred))
print('F1-score: %.4f' % metrics.f1_score(y_test, y_pred))
print('Precision: %.4f' % metrics.precision_score(y_test, y_pred))
# Print the confusion matrix: as a bare expression its value is discarded
# when the file runs as a script.
print(metrics.confusion_matrix(y_test, y_pred))
# Export the tree structure of the trained booster to a text file.
bst.dump_model(fout='dump_model.txt')
# Re-run prediction on the test matrix with pred_contribs=True to obtain the
# contribution values instead of the plain scores.
ypred_contribs = bst.predict(data=dtest, pred_contribs=True)
# Build a graphviz view of a single tree; num_trees selects which tree.
# NOTE(review): the returned object is neither stored nor rendered here.
xgb.to_graphviz(booster=bst, num_trees=1)
# Feature importance.
import operator
import pandas as pd
# Turn the booster's fscore mapping into (feature, fscore) pairs ordered by
# ascending score, and show them.
by_score = operator.itemgetter(1)
importance = bst.get_fscore()
importance = sorted(importance.items(), key=by_score)
print(importance)
# Tabulate the pairs, highest score first, and write them out as CSV.
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df = df.sort_values(by='fscore', ascending=False)
# To store scores as fractions of the total instead, divide the 'fscore'
# column by its sum before writing.
df.to_csv('feature_importance.txt', index=False)
# Compute the KS statistic.
from scipy.stats import ks_2samp


def get_ks(y_pred, y_true):
    """Return the two-sample KS statistic separating the two classes' scores.

    Args:
        y_pred: Sequence or array of predicted scores, one per sample.
        y_true: Sequence or array of true labels, aligned with ``y_pred``.
            Samples whose label equals 1 form one group; all others form
            the second group.

    Returns:
        The ``statistic`` field of ``scipy.stats.ks_2samp`` applied to the
        scores of the two groups.
    """
    # Coerce to arrays so boolean-mask indexing also works for plain lists.
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    positive_scores = y_pred[y_true == 1]
    other_scores = y_pred[y_true != 1]
    return ks_2samp(positive_scores, other_scores).statistic
# KS statistic of the test-set scores against the true test labels.
ks_test = get_ks(y_pred=ypred, y_true=y_test)