pyspark:GBDT

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import GBTClassifier
from pyspark.sql import Row
import pandas as pd
from sklearn import metrics

if __name__ == "__main__":
    # Train a gradient-boosted tree classifier on the breast-cancer CSV with
    # Spark ML, predict on a held-out split, and report sklearn metrics.
    appname = "GBTClassifier"
    master = "local[4]"
    conf = SparkConf().setAppName(appname).setMaster(master)  # Spark configuration
    spark = SparkSession.builder.config(conf=conf).getOrCreate()  # Spark session

    def to_labeled_row(values):
        """Build a Row whose label is the last value and features are the rest."""
        return Row(label=values[-1], features=Vectors.dense(values[:-1]))

    # try/finally guarantees the session is stopped even if a step fails.
    try:
        # Load the data. No schema is inferred, so every column is read as a
        # string.
        data = spark.read.csv('良恶性乳腺癌数据.csv', header=True)

        # Replace nulls with '0', then split the rows into train/test sets.
        dataSet = data.na.fill('0').rdd.map(list)
        trainData, testData = dataSet.randomSplit([0.7, 0.3], seed=7)
        trainingSet = trainData.map(to_labeled_row).toDF()
        train_num = trainingSet.count()
        print("训练样本数:{}".format(train_num))

        # Re-index the class labels and train the GBT model.
        stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
        si_model = stringIndexer.fit(trainingSet)
        train_tf = si_model.transform(trainingSet)
        train_tf.show(5)
        gbt = GBTClassifier(maxIter=50, labelCol="indexed", seed=7)
        gbtModel = gbt.fit(train_tf)

        # Print the model's feature importances and feature count.
        print("模型特征重要性:{}".format(gbtModel.featureImportances))
        print("模型特征数:{}".format(gbtModel.numFeatures))

        # Predict on the test set.
        testSet = testData.map(to_labeled_row).toDF()
        test_num = testSet.count()
        print("测试样本数:{}".format(test_num))
        # Reuse the indexer fitted on the training set. Fitting a new one on
        # the test set could assign different indices to the same labels
        # (StringIndexer orders labels by frequency), which would silently
        # corrupt every metric computed below.
        # NOTE: with the default handleInvalid setting, a test label that was
        # never seen during training raises an error here.
        test_tf = si_model.transform(testSet)
        predictResult = gbtModel.transform(test_tf)
        predictResult.show(5)

        # Pull the predictions into a pandas DataFrame before Spark stops.
        predictResult = predictResult.toPandas()
    finally:
        spark.stop()

    # Evaluate the predictions against the indexed ground-truth labels.
    y = list(predictResult['indexed'])
    y_pred = list(predictResult['prediction'])
    # Element 1 of each probability vector is the score for indexed class 1.
    y_predprob = [x[1] for x in predictResult['probability']]
    precision_score = metrics.precision_score(y, y_pred)  # precision
    recall_score = metrics.recall_score(y, y_pred)  # recall
    accuracy_score = metrics.accuracy_score(y, y_pred)  # accuracy
    f1_score = metrics.f1_score(y, y_pred)  # F1 score
    auc_score = metrics.roc_auc_score(y, y_predprob)  # ROC AUC
    print("精确率:", precision_score)
    print("召回率:", recall_score)
    print("准确率:", accuracy_score)
    print("F1分数:", f1_score)
    print("auc分数:", auc_score)

运行结果:

pyspark:GBDT_第1张图片

你可能感兴趣的:(python,机器学习,pyspark)