在 PySpark 中并行运行 xgboost 模型

from sklearn import datasets

# Load the iris dataset and keep only the first 100 rows of the feature matrix.
iris = datasets.load_iris()
data = iris.data[:100]
# The parenthesised form is a function call on Python 3 and, with a single
# argument, prints the same value under the Python 2 print statement.
print(data.shape)
# Expected shape: 100 samples, 4 features each.

# Labels for the same 100 rows.
# NOTE(review): this file trains with a binary objective on these labels, so
# the slice presumably contains exactly two classes -- confirm against the
# dataset ordering.
label = iris.target[:100]
print(label)

# Split the samples into a training set and a test set.
# `sklearn.cross_validation` was removed from scikit-learn in 0.20; the
# function now lives in `sklearn.model_selection`. The fallback keeps the
# script working on installations that predate the new module.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# A fixed random_state makes the split reproducible between runs.
train_x, test_x, train_y, test_y = train_test_split(data, label, random_state=0)

# Build the xgboost model.
import xgboost as xgb

# Wrap the numpy splits in DMatrix containers; the test matrix carries no labels.
dtrain = xgb.DMatrix(data=train_x, label=train_y)
dtest = xgb.DMatrix(data=test_x)

# xgboost model parameters: a gradient-boosted tree booster with a binary
# logistic objective, evaluated with AUC.
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 4,
    'lambda': 10,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'min_child_weight': 2,
    'eta': 0.025,
    'seed': 0,
    'nthread': 8,
    'silent': 1,
}

# Only the training matrix is watched, under the name 'train'.
watchlist = [(dtrain, 'train')]

# Train for 100 boosting rounds.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
)

# Predict on the held-out test matrix.
ypred = bst.predict(data=dtest)

# Save the trained model to disk, then load it back into a second Booster.
bst.save_model('/root/xgb2.model')
bst2 = xgb.Booster(model_file='/root/xgb2.model')

# Score the test set in parallel with Spark.
import numpy as np
from pyspark import SparkConf, SparkContext

# "local[*]" starts one worker thread per available core. The plain "local"
# master runs everything on a single worker thread, so the 5 partitions below
# would be processed one after another rather than in parallel.
conf = SparkConf().setMaster("local[*]").setAppName("My App")
sc = SparkContext(conf=conf)
try:
    # Distribute the test rows over 5 partitions.
    s = sc.parallelize(test_x, 5)

    def _predict_row(row):
        """Score a single feature row with the reloaded booster `bst2`."""
        # Reshape the row to (1, n_features) before wrapping it in a DMatrix.
        return bst2.predict(xgb.DMatrix(np.array(row).reshape((1, -1))))

    # Parallel prediction: one prediction array per test row, gathered on the
    # driver. The result is kept and printed instead of being discarded.
    spark_pred = s.map(_predict_row).collect()
    print(spark_pred)
finally:
    # Release the Spark resources even if the job fails.
    sc.stop()

你可能感兴趣的:(在PySpark的并行跑xgboost模型)