1. Introduction

This post builds a binary classifier with the Spark ML Pipeline API. The raw data (train.tsv, tab-separated, with a url column, a categorical alchemy_category column, numeric feature columns, and a binary label) is cleaned with a UDF that replaces "?" placeholders, the category is encoded via StringIndexer plus OneHotEncoder, all features are combined with VectorAssembler, and a RandomForestClassifier is trained on the result. The model is scored by AUC with BinaryClassificationEvaluator, and the hyperparameters are then tuned two ways: with TrainValidationSplit and with CrossValidator.
2. Spark ML Implementation
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf, col
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.classification import RandomForestClassifier
def SetLogger(sc):
    # Show only ERROR-level log messages from Spark and akka.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)
def SetPath(sc):
    # Read from the local filesystem in local mode, from HDFS otherwise.
    global Path
    if sc.master[0:5] == "local":
        Path = "file:/home/hduser/pythonwork/PythonProject/"
    else:
        Path = "hdfs://master:9000/user/hduser/"
def CreateSparkContext():
    sparkConf = SparkConf() \
        .setAppName("MLPiplineDecisionTreeBinary") \
        .set("spark.ui.showConsoleProgress", "false")
    sc = SparkContext(conf=sparkConf)
    print("master=" + sc.master)
    SetLogger(sc)
    SetPath(sc)
    return sc
def replace_question(x):
    # The raw data marks missing values with '?'; treat them as '0'.
    return "0" if x == "?" else x
if __name__ == "__main__":
    # Wrap the plain function as a Spark SQL UDF. A UDF returns StringType
    # by default, hence the cast("double") applied after it below.
    replace_question = udf(replace_question)
    print("MLPiplineDecisionTreeBinary")
    print("========initializing the SparkContext========")
    sc = CreateSparkContext()
    print("========initializing the SQLContext========")
    sqlContext = SQLContext(sc)
print("========reading the data of csv========")
row_df = sqlContext.read.format("csv") \
.option("header", "true") \
.option("delimiter", "\t") \
.load(Path+"data/train.tsv")
#print(row_df.take(5))
print("========filtration the datasets========")
#generate the train datasets and test datasets
df= row_df.select(
['url','alchemy_category' ]+
[replace_question(col(column)).cast("double").alias(column)
for column in row_df.columns[4:] ] )
print("========generate the train datasets and test datasets========")
train_df, test_df = df.randomSplit([0.7, 0.3])
train_df.cache()
test_df.cache()
print("========string indexer========")
stringIndexer = StringIndexer(inputCol='alchemy_category',
outputCol="alchemy_category_Index")
print("========one hot encoder========")
encoder = OneHotEncoder(dropLast=False,
inputCol='alchemy_category_Index',
outputCol="alchemy_category_IndexVec")
assemblerInputs =['alchemy_category_IndexVec'] + row_df.columns[4:-1]
print("========assembler========")
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
rf =RandomForestClassifier(labelCol="label",
featuresCol="features",numTrees=10)
evaluator = BinaryClassificationEvaluator(
rawPredictionCol="rawPrediction",
labelCol="label",
metricName="areaUnderROC" )
rfpipeline = Pipeline(stages=[stringIndexer,encoder ,assembler,rf ])
rfpipelineModel = rfpipeline.fit(train_df)
rfpredicted=rfpipelineModel.transform(test_df)
evaluator.evaluate(rfpredicted)
print("找出最佳模型:TrainValidation")
paramGrid = ParamGridBuilder()\
.addGrid(rf.impurity, [ "gini","entropy"])\
.addGrid(rf.maxDepth, [ 5,10,15])\
.addGrid(rf.maxBins, [10, 15,20])\
.addGrid(rf.numTrees, [10, 20,30])\
.build()
rftvs = TrainValidationSplit(estimator=rf, evaluator=evaluator,
estimatorParamMaps=paramGrid, trainRatio=0.8)
rftvs_pipeline = Pipeline(stages=[stringIndexer,encoder ,assembler, rftvs])
rftvs_pipelineModel =rftvs_pipeline.fit(train_df)
rftvspredictions = rftvs_pipelineModel.transform(test_df)
auc= evaluator.evaluate(rftvspredictions)
print("TrainValidation:"+str(auc))
print("找出最佳模型:crossValidation")
rfcv = CrossValidator(estimator=rf, evaluator=evaluator,
estimatorParamMaps=paramGrid, numFolds=3)
rfcv_pipeline = Pipeline(stages=[stringIndexer,encoder ,assembler, rfcv])
rfcv_pipelineModel = rfcv_pipeline.fit(train_df)
rfcvpredictions = rfcv_pipelineModel.transform(test_df)
auc= evaluator.evaluate(rfcvpredictions)
print("crossValidation:"+str(auc))
3. Run Output

19/06/02 14:42:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
MLPiplineDecisionTreeBinary
========initializing the SparkContext========
master=spark://master:7077
========initializing the SQLContext========
========reading the CSV data========
========filtering the dataset========
========generating the train and test datasets========
========string indexer========
========one hot encoder========
========assembler========
Finding the best model: TrainValidationSplit
TrainValidation:0.748380647578
Finding the best model: CrossValidator
crossValidation:0.747620036483
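Both tuning strategies end up at nearly the same test AUC (about 0.748). Since the fitted pipeline bundles the indexing, encoding, and assembling steps with the tuned forest, it can score new data in one transform call and be persisted as a unit. A minimal sketch, assuming the variables above; the save path is illustrative, and persisting a pipeline that contains a CrossValidatorModel requires Spark 2.3 or later.

from pyspark.ml import PipelineModel

# Look at a few individual predictions from the tuned pipeline (the
# probability and prediction columns are added by the forest model).
rfcvpredictions.select("url", "label", "probability", "prediction") \
    .show(5, truncate=False)

# Persist the fitted pipeline (preprocessing + tuned forest) and reload it.
# The path under Path is illustrative, not from the original post.
rfcv_pipelineModel.write().overwrite().save(Path + "model/rfcvPipelineModel")
reloaded = PipelineModel.load(Path + "model/rfcvPipelineModel")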