35. Machine Learning Applications: Pipeline-Based Random Forest Binary Classification

I. Introduction

[Figure 1: Pipeline-based random forest binary classification workflow]

This example builds a Spark ML Pipeline that chains StringIndexer, OneHotEncoder, and VectorAssembler feature stages with a RandomForestClassifier, evaluates the model by AUC, and then searches for the best hyperparameters with TrainValidationSplit and CrossValidator.

II. Spark ML Implementation

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf, col
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.classification import RandomForestClassifier


def SetLogger(sc):
    # Silence Spark's verbose INFO logging; keep only ERROR-level messages.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

def SetPath(sc):
    # Read from the local filesystem in local mode, otherwise from HDFS.
    global Path
    if sc.master[0:5] == "local":
        Path = "file:/home/hduser/pythonwork/PythonProject/"
    else:
        Path = "hdfs://master:9000/user/hduser/"
        
        
def CreateSparkContext():
    sparkConf = SparkConf() \
        .setAppName("MLPipelineRandomForestBinary") \
        .set("spark.ui.showConsoleProgress", "false")
    sc = SparkContext(conf=sparkConf)
    print("master=" + sc.master)
    SetLogger(sc)
    SetPath(sc)
    return sc

def replace_question(x):
    # The raw data marks missing values with "?"; replace them with "0".
    return "0" if x == "?" else x

if __name__ == "__main__":
    # Wrap the plain Python function in a Spark SQL UDF.
    replace_question_udf = udf(replace_question)
    print("MLPipelineRandomForestBinary")
    print("========initializing the SparkContext========")
    sc = CreateSparkContext()
    print("========initializing the SQLContext========")
    sqlContext = SQLContext(sc)
    print("========reading the data of csv========")
    row_df = sqlContext.read.format("csv") \
     .option("header", "true") \
     .option("delimiter", "\t") \
     .load(Path+"data/train.tsv")
     
    print("========cleaning the dataset========")
    # Keep url and alchemy_category as-is; in the remaining columns
    # (including the label) replace "?" with "0" and cast to double.
    df = row_df.select(
        ['url', 'alchemy_category'] +
        [replace_question_udf(col(column)).cast("double").alias(column)
         for column in row_df.columns[4:]])
    
    print("========generate the train datasets and test datasets========")
    train_df, test_df = df.randomSplit([0.7, 0.3])
    train_df.cache()
    test_df.cache() 
    print("========string indexer========")
    stringIndexer = StringIndexer(inputCol='alchemy_category', 
                                                outputCol="alchemy_category_Index")
    print("========one hot encoder========")
    encoder = OneHotEncoder(dropLast=False,
                                            inputCol='alchemy_category_Index',
                                            outputCol="alchemy_category_IndexVec")
    assemblerInputs =['alchemy_category_IndexVec']  + row_df.columns[4:-1] 
    print("========assembler========")
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
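
    # --- Sketch: previewing the assembled feature vector. This fits only the
    # three feature stages defined above (no classifier) and shows a few rows;
    # it is purely illustrative and not required by the rest of the script.
    featurePipeline = Pipeline(stages=[stringIndexer, encoder, assembler])
    featurePipeline.fit(train_df).transform(train_df) \
        .select("features").show(3, truncate=False)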
   
    rf = RandomForestClassifier(labelCol="label",
                                featuresCol="features", numTrees=10)

    evaluator = BinaryClassificationEvaluator(
        rawPredictionCol="rawPrediction",
        labelCol="label",
        metricName="areaUnderROC")

    rfpipeline = Pipeline(stages=[stringIndexer, encoder, assembler, rf])

    rfpipelineModel = rfpipeline.fit(train_df)
    rfpredicted = rfpipelineModel.transform(test_df)
    auc = evaluator.evaluate(rfpredicted)
    print("pipeline AUC=" + str(auc))
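
    # --- Sketch: inspecting the fitted forest. Assumption: the random forest
    # model is the fourth stage of the fitted pipeline (stages[3]).
    # featureImportances is part of RandomForestClassificationModel.
    rfModel = rfpipelineModel.stages[3]
    print("feature importances: " + str(rfModel.featureImportances))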

    
    print("找出最佳模型:TrainValidation")
    paramGrid = ParamGridBuilder()\
      .addGrid(rf.impurity, [ "gini","entropy"])\
      .addGrid(rf.maxDepth, [ 5,10,15])\
      .addGrid(rf.maxBins, [10, 15,20])\
      .addGrid(rf.numTrees, [10, 20,30])\
      .build()

    rftvs = TrainValidationSplit(estimator=rf, evaluator=evaluator,
                                 estimatorParamMaps=paramGrid, trainRatio=0.8)

    rftvs_pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, rftvs])
    rftvs_pipelineModel = rftvs_pipeline.fit(train_df)
    rftvspredictions = rftvs_pipelineModel.transform(test_df)
    auc = evaluator.evaluate(rftvspredictions)
    print("TrainValidation:" + str(auc))
    
    
    print("找出最佳模型:crossValidation")
    rfcv = CrossValidator(estimator=rf, evaluator=evaluator, 
                          estimatorParamMaps=paramGrid, numFolds=3)

    rfcv_pipeline = Pipeline(stages=[stringIndexer,encoder ,assembler, rfcv])
    rfcv_pipelineModel = rfcv_pipeline.fit(train_df)
    rfcvpredictions = rfcv_pipelineModel.transform(test_df)
    auc= evaluator.evaluate(rfcvpredictions)
    print("crossValidation:"+str(auc))

III. Output

19/06/02 14:42:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
MLPipelineRandomForestBinary
========initializing the SparkContext========
master=spark://master:7077
========initializing the SQLContext========
========reading the TSV data========
========cleaning the dataset========
========splitting into train and test sets========
========string indexer========
========one hot encoder========
========assembler========
Finding the best model: TrainValidationSplit
TrainValidation:0.748380647578
Finding the best model: CrossValidator
crossValidation:0.747620036483