pyspark:随机森林

下面的示例使用 PySpark 的 RandomForestClassifier 对良恶性乳腺癌数据集进行二分类训练与预测,并用 scikit-learn 计算各项评估指标。废话不多说,直接上代码:

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql import Row
import pandas as pd
from sklearn import metrics

if __name__ == "__main__":
    # Script entry point: train a Spark ML random forest on a CSV data set,
    # predict on a held-out split, then evaluate the predictions with
    # scikit-learn metrics on the driver.
    appname = "RandomForestClassifier"
    master = "local[4]"
    conf = SparkConf().setAppName(appname).setMaster(master)  # Spark configuration
    spark = SparkSession.builder.config(conf=conf).getOrCreate()  # create the Spark session

    # Everything that needs a live Spark session happens inside this try
    # block; the session is stopped in `finally` only after the prediction
    # rows have been pulled back to the driver.
    try:
        # Load the data. No schema inference is requested, so the fill value
        # used below is the string '0'.
        data = spark.read.csv('良恶性乳腺癌数据.csv', header=True)

        # Build the training / test splits.
        dataSet = data.na.fill('0').rdd.map(list)  # replace nulls with '0'
        trainData, testData = dataSet.randomSplit([0.7, 0.3], seed=7)
        # The last column is the label; all preceding columns are features.
        trainingSet = trainData.map(
            lambda x: Row(label=x[-1], features=Vectors.dense(x[:-1]))
        ).toDF()
        train_num = trainingSet.count()
        print("训练样本数:{}".format(train_num))

        # Train the random forest on the index-encoded label.
        stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
        si_model = stringIndexer.fit(trainingSet)
        train_tf = si_model.transform(trainingSet)
        train_tf.show(5)
        rf = RandomForestClassifier(numTrees=100, labelCol="indexed", seed=7)
        rfModel = rf.fit(train_tf)

        # Report feature importances and the number of features.
        print("模型特征重要性:{}".format(rfModel.featureImportances))
        print("模型特征数:{}".format(rfModel.numFeatures))

        # Predict on the test split.
        testSet = testData.map(
            lambda x: Row(label=x[-1], features=Vectors.dense(x[:-1]))
        ).toDF()
        test_num = testSet.count()
        print("测试样本数:{}".format(test_num))
        # Reuse the indexer fitted on the training data. Re-fitting on the
        # test split could assign different indices to the same labels and
        # silently invalidate every metric computed below.
        test_tf = si_model.transform(testSet)
        predictResult = rfModel.transform(test_tf)
        predictResult.show(5)

        # Pull the predictions to the driver while the session is still alive.
        columns = predictResult.columns  # column names of the result table
        predictRows = predictResult.collect()
    finally:
        spark.stop()

    # Convert the collected rows into a pandas DataFrame.
    predictResult = pd.DataFrame(predictRows, columns=columns)

    # Evaluate the predictions.
    y = list(predictResult['indexed'])
    y_pred = list(predictResult['prediction'])
    # Probability assigned to the class with index 1.
    y_predprob = [x[1] for x in list(predictResult['probability'])]
    precision_score = metrics.precision_score(y, y_pred)  # precision
    recall_score = metrics.recall_score(y, y_pred)  # recall
    accuracy_score = metrics.accuracy_score(y, y_pred)  # accuracy
    f1_score = metrics.f1_score(y, y_pred)  # F1 score
    auc_score = metrics.roc_auc_score(y, y_predprob)  # ROC AUC
    print("精确率:", precision_score)
    print("召回率:", recall_score)
    print("准确率:", accuracy_score)
    print("F1分数:", f1_score)
    print("auc分数:", auc_score)

运行结果:

(图1:pyspark 随机森林运行结果截图)

如果对你有帮助,请点下赞,予人玫瑰手有余香!

时时仰望天空,理想就会离现实越来越近!

 

 

你可能感兴趣的:(python,pyspark,机器学习)