37. Machine Learning Applications: A Decision Tree Regression Workflow

1. Introduction

[Figure 1: overview of the decision tree regression workflow]

This section builds a Spark ML Pipeline that trains a DecisionTreeRegressor on the bike-sharing hour.csv dataset (label column cnt), measures test RMSE, and then tunes maxDepth and maxBins with both TrainValidationSplit and CrossValidator.

2. Implementation Based on Spark ML

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf, col
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator

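# Route Spark's verbose logging to ERROR level so only real problems reach the console.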
def SetLogger(sc):
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

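# Pick the data path: local filesystem when running locally, HDFS when on the cluster.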
def SetPath(sc):
    global Path
    if sc.master[0:5] == "local":
        Path = "file:/home/hduser/pythonwork/PythonProject/"
    else:
        Path = "hdfs://master:9000/user/hduser/"
        
        
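# Build a SparkContext with the console progress bar disabled, then set up logging and paths.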
def CreateSparkContext():
    sparkConf = (SparkConf()
                 .setAppName("MLPiplineDecisionTreeRegressor")
                 .set("spark.ui.showConsoleProgress", "false"))
    sc = SparkContext(conf=sparkConf)
    print("master=" + sc.master)
    SetLogger(sc)
    SetPath(sc)
    return sc

def replace_question(x):
    # Map a "?" missing-value marker to "0"; hour.csv contains no "?" values,
    # so the UDF built from this helper goes unused below.
    return "0" if x == "?" else x

if __name__ == "__main__":
    # Register the helper as a Spark SQL UDF (defined for completeness; not applied below).
    replace_question = udf(replace_question)
    print("MLPiplineDecisionTreeRegressor")
    print("========initializing the SparkContext========")
    sc = CreateSparkContext()
    print("========initializing the SQLContext========")
    sqlContext = SQLContext(sc)
    print("========reading the data========")
    
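    # hour.csv is the UCI Bike Sharing dataset: one record per hour with
    # calendar and weather fields; cnt (total rentals that hour) is the label.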
    hour_df = sqlContext.read.format('csv') \
                        .option("header", 'true').load(Path + "data/hour.csv")

    # Drop columns that are identifiers or leak the label
    # (casual + registered add up exactly to cnt).
    hour_df = hour_df.drop("instant").drop("dteday") \
                     .drop("yr").drop("casual").drop("registered")

    # The CSV reader loads every column as a string; cast them all to double.
    hour_df = hour_df.select([col(column).cast("double").alias(column)
                              for column in hour_df.columns])
    
    
     
    print("========generate the train datasets and test datasets========")
    train_df, test_df = hour_df.randomSplit([0.7, 0.3])
    train_df.cache()
    test_df.cache()
    
    featuresCols = hour_df.columns[:-1]
    
    vectorAssembler = VectorAssembler(inputCols=featuresCols, outputCol="aFeatures")
    vectorIndexer = VectorIndexer(inputCol="aFeatures", outputCol="features", maxCategories=24)
    dt = DecisionTreeRegressor(labelCol="cnt",featuresCol= 'features')
    dt_pipeline = Pipeline(stages=[vectorAssembler,vectorIndexer ,dt])
    
    
    print("========start to training========")
    dt_pipelineModel = dt_pipeline.fit(train_df)
    
    print("========start to transforming========")
    predicted_df=dt_pipelineModel.transform(test_df)
    
    print("========评估模型的准确率========")
    evaluator = RegressionEvaluator(labelCol='cnt',
                                                        predictionCol='prediction',
                                                        metricName="rmse")
    predicted_df=dt_pipelineModel.transform(test_df)
    rmse = evaluator.evaluate(predicted_df)
    print("rms:"+str(rmse))
    
    print("使用TrainValidation进行训练评估找出最佳模型")
    paramGrid = ParamGridBuilder()\
     .addGrid(dt.maxDepth, [ 5,10,15,25])\
     .addGrid(dt.maxBins, [25,35,45,50])\
     .build()
    tvs = TrainValidationSplit(estimator=dt,evaluator=evaluator,
                  estimatorParamMaps=paramGrid,trainRatio=0.8) 
    tvs_pipeline = Pipeline(stages=[vectorAssembler,vectorIndexer ,tvs])
    tvs_pipelineModel =tvs_pipeline.fit(train_df)
    bestModel=tvs_pipelineModel.stages[2].bestModel
    predictions = tvs_pipelineModel.transform(test_df)
    rmse= evaluator.evaluate(predictions)
    print("rmse:"+str(rmse))
    
    print("using CrossValidator to find the best model")
    # 3-fold cross-validation over the same grid: slower than
    # TrainValidationSplit, but every training row is validated once.
    cv = CrossValidator(estimator=dt, evaluator=evaluator,
                        estimatorParamMaps=paramGrid, numFolds=3)
    cv_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])
    cv_pipelineModel = cv_pipeline.fit(train_df)
    predictions = cv_pipelineModel.transform(test_df)
    rmse = evaluator.evaluate(predictions)
    print("rmse:" + str(rmse))
   
    
    
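To see which grid point TrainValidationSplit actually picked, the bestModel extracted above can be inspected. A minimal sketch, assuming Spark 2.x, where the Python model object delegates to its JVM counterpart (on Spark 3+, bestModel.getMaxDepth() can be called directly):

# Hypothetical follow-up: read the winning hyperparameters off the best tree.
print("best maxDepth:" + str(bestModel._java_obj.getMaxDepth()))
print("best maxBins:" + str(bestModel._java_obj.getMaxBins()))

Sample output from a full run on the cluster: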
19/06/02 16:55:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
MLPiplineDecisionTreeRegressor
========initializing the SparkContext========
master=spark://master:7077
========initializing the SQLContext========
========reading the data========
========generating the train and test datasets========
========start training========
========start transforming========
========evaluating model accuracy========
rmse:95.8085403734
using TrainValidationSplit to find the best model
rmse:78.9553083546
using CrossValidator to find the best model
rmse:79.6279239782
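Both tuning strategies clearly beat the untuned tree, cutting test RMSE from about 95.8 to about 79; with this 4x4 grid, CrossValidator trains roughly three times as many models as TrainValidationSplit for a comparable result.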

 
