mmlspark-102 : Simple ML Pipelines

102 - Simplifying ML Pipelines with mmlspark

In the recipes below we train a classifier on the same task with two libraries, pyspark and mmlspark. We again use the Adult Census dataset to predict whether a person's income exceeds $50k; the data can be downloaded from https://www.kaggle.com/uciml/adult-census-income/data

1. Load the data

import warnings
import numpy as np
from pyspark import SparkConf
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType
warnings.filterwarnings('ignore')
spark = SparkSession.builder.appName("MyApp").config("spark.jars.packages", "Azure:mmlspark:0.17").getOrCreate()

data = spark.read.csv('hdfs:///user/hadoop/adult.csv',inferSchema=True, header=True)

data.limit(10).toPandas()
age workclass fnlwgt education education.num marital.status occupation relationship race sex capital.gain capital.loss hours.per.week native.country income
0 90 ? 77053 HS-grad 9 Widowed ? Not-in-family White Female 0 4356 40 United-States <=50K
1 82 Private 132870 HS-grad 9 Widowed Exec-managerial Not-in-family White Female 0 4356 18 United-States <=50K
2 66 ? 186061 Some-college 10 Widowed ? Unmarried Black Female 0 4356 40 United-States <=50K
3 54 Private 140359 7th-8th 4 Divorced Machine-op-inspct Unmarried White Female 0 3900 40 United-States <=50K
4 41 Private 264663 Some-college 10 Separated Prof-specialty Own-child White Female 0 3900 40 United-States <=50K
5 34 Private 216864 HS-grad 9 Divorced Other-service Unmarried White Female 0 3770 45 United-States <=50K
6 38 Private 150601 10th 6 Separated Adm-clerical Unmarried White Male 0 3770 40 United-States <=50K
7 74 State-gov 88638 Doctorate 16 Never-married Prof-specialty Other-relative White Female 0 3683 20 United-States >50K
8 68 Federal-gov 422013 HS-grad 9 Divorced Prof-specialty Not-in-family White Female 0 3683 40 United-States <=50K
9 41 Private 70037 Some-college 10 Never-married Craft-repair Unmarried White Male 0 3004 60 ? >50K

2a. Data preprocessing and feature extraction

2.1 Rename columns

The dots in the raw column names collide with Spark SQL's struct-field syntax, so replace them with underscores:
data = data.withColumnRenamed('education.num','education_num')\
           .withColumnRenamed('marital.status','marital_status')\
           .withColumnRenamed('capital.gain','capital_gain')\
           .withColumnRenamed('capital.loss','capital_loss')\
           .withColumnRenamed('hours.per.week','hours_per_week')\
           .withColumnRenamed('native.country','native_country')
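
The six renames can also be done generically; a minimal sketch that replaces every dot in the column names in one loop:

# rename every dotted column in one pass
for c in data.columns:
    if '.' in c:
        data = data.withColumnRenamed(c, c.replace('.', '_'))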
2.2 Data types
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: integer (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: integer (nullable = true)
 |-- capital_loss: integer (nullable = true)
 |-- hours_per_week: integer (nullable = true)
 |-- native_country: string (nullable = true)
 |-- income: string (nullable = true)

2.3 Add the label column (from income)

add_label = F.udf(lambda income : 0 if income == '<=50K' else 1 , IntegerType())

data = data.withColumn('label', add_label(data['income']))

data.groupBy('label').count().show()
+-----+-----+
|label|count|
+-----+-----+
|    1| 7841|
|    0|24720|
+-----+-----+
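
As an aside, the same label can be built from native column expressions instead of a Python UDF, which lets Spark optimize the whole plan; a minimal sketch (data_alt is just an illustrative name). The UDF version is kept above because section 2b reuses it inside a UDFTransformer:

# equivalent label column without a Python UDF
data_alt = data.withColumn('label', F.when(F.col('income') == '<=50K', 0).otherwise(1))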
2.4 Label encoding

workclass

# workclass distribution

work_class_count = data.groupBy('workclass').count()

work_class_count.show()
+----------------+-----+
|       workclass|count|
+----------------+-----+
|Self-emp-not-inc| 2541|
|       Local-gov| 2093|
|       State-gov| 1298|
|         Private|22696|
|     Without-pay|   14|
|     Federal-gov|  960|
|    Never-worked|    7|
|               ?| 1836|
|    Self-emp-inc| 1116|
+----------------+-----+

Encode workclass as integer categories:

def workclass_encode(workclass):
    # map each workclass string to a fixed integer index
    workdict = {'Self-emp-not-inc':0,
                'Local-gov': 1,
                'State-gov': 2,
                'Private': 3,
                'Without-pay' : 4,
                'Federal-gov': 5,
                'Never-worked' : 6,
                '?' : 7,
                'Self-emp-inc':8}
    return workdict[workclass]

workclass_encodeUDF = F.udf(workclass_encode, IntegerType())

data = data.withColumn('workclass_cate', workclass_encodeUDF(data['workclass']))
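
Note that the hard-coded dict raises a KeyError on any workclass value outside the mapping. A slightly more defensive variant (a sketch; the fallback to the '?' bucket is an arbitrary choice):

def workclass_encode_safe(workclass):
    workdict = {'Self-emp-not-inc': 0, 'Local-gov': 1, 'State-gov': 2,
                'Private': 3, 'Without-pay': 4, 'Federal-gov': 5,
                'Never-worked': 6, '?': 7, 'Self-emp-inc': 8}
    # .get() falls back to the '?' bucket instead of raising on unseen categories
    return workdict.get(workclass, 7)

workclass_encode_safeUDF = F.udf(workclass_encode_safe, IntegerType())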

education

A UDF is manageable while a feature has only a few distinct categories; once there are many, StringIndexer() is the better tool:

edu_encode = StringIndexer(inputCol='education', outputCol='education_cate')

edu_model = edu_encode.fit(data)

data = edu_model.transform(data)

marital_status

marital_encode = StringIndexer(inputCol='marital_status', outputCol='marital_cate')

marital_model = marital_encode.fit(data)

data = marital_model.transform(data)

occupation

occupation_encode = StringIndexer(inputCol='occupation', outputCol='occupation_cate')

occupation_model = occupation_encode.fit(data)

data = occupation_model.transform(data)

relationship

relationship_encode = StringIndexer(inputCol='relationship', outputCol='relationship_cate')

relationship_model = relationship_encode.fit(data)

data = relationship_model.transform(data)

race

race_encode = StringIndexer(inputCol='race', outputCol='race_cate')

race_model = race_encode.fit(data)

data = race_model.transform(data)

sex

sex_encode = StringIndexer(inputCol='sex', outputCol='sex_cate')

sex_model = sex_encode.fit(data)

data = sex_model.transform(data)

native_country

native_country_encode = StringIndexer(inputCol='native_country', outputCol='native_country_cate')

native_country_model = native_country_encode.fit(data)

data = native_country_model.transform(data)

That is the entire preprocessing workflow, done one step at a time.
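
Since the seven StringIndexer stages differ only in their column names, they could equally be generated in a comprehension; a sketch (note the generated output names follow the uniform pattern '<col>_cate', which differs slightly from a few of the names used above, e.g. 'marital_status_cate' vs 'marital_cate'):

cate_inputs = ['education', 'marital_status', 'occupation', 'relationship',
               'race', 'sex', 'native_country']
indexers = [StringIndexer(inputCol=c, outputCol=c + '_cate') for c in cate_inputs]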

2b. Preprocessing as a Pipeline

All of the steps above can be combined into a single Pipeline.

add_label_udf = F.udf(lambda income : 0 if income == '<=50K' else 1 , IntegerType())

def workclass_encode(workclass):
    workdict = {'Self-emp-not-inc':0,
                'Local-gov': 1,
                'State-gov': 2,
                'Private': 3,
                'Without-pay' : 4,
                'Federal-gov': 5,
                'Never-worked' : 6,
                '?' : 7,
                'Self-emp-inc':8}
    return workdict[workclass]

workclass_encodeUDF = F.udf(workclass_encode, IntegerType())

edu_encode = StringIndexer(inputCol='education', outputCol='education_cate')
marital_encode = StringIndexer(inputCol='marital_status', outputCol='marital_cate')
occupation_encode = StringIndexer(inputCol='occupation', outputCol='occupation_cate')
relationship_encode = StringIndexer(inputCol='relationship', outputCol='relationship_cate')
race_encode = StringIndexer(inputCol='race', outputCol='race_cate')
sex_encode = StringIndexer(inputCol='sex', outputCol='sex_cate')
native_country_encode = StringIndexer(inputCol='native_country', outputCol='native_country_cate')

Function → UDF → UDFTransformer, then assemble the Pipeline:

from mmlspark import UDFTransformer

incomeTransformer = UDFTransformer(inputCol='income', outputCol='label', udf=add_label_udf)
workclassTransformer = UDFTransformer(inputCol='workclass', outputCol='workclass_cate',udf=workclass_encodeUDF)
# reload the raw data so the pipeline runs from scratch
data = spark.read.csv('hdfs:///user/hadoop/adult.csv',inferSchema=True, header=True)

data = data.withColumnRenamed('education.num','education_num')\
           .withColumnRenamed('marital.status','marital_status')\
           .withColumnRenamed('capital.gain','capital_gain')\
           .withColumnRenamed('capital.loss','capital_loss')\
           .withColumnRenamed('hours.per.week','hours_per_week')\
           .withColumnRenamed('native.country','native_country')
from pyspark.ml import Pipeline

# fit all preprocessing stages as a single Pipeline
feat_encode_pipeline = Pipeline(stages=[workclassTransformer, edu_encode, marital_encode, occupation_encode,
                                        relationship_encode, race_encode, sex_encode, native_country_encode,
                                        incomeTransformer]).fit(data)

# transform data
data = feat_encode_pipeline.transform(data)

Split into train, test, and validation sets

data = data.drop('workclass','income','education','marital_status','occupation','relationship','race','sex','native_country')

train, test, validation = data.randomSplit([0.60, 0.20, 0.20], seed=422)

3a. Classify using pyspark's GBTClassifier

  • training
  • evaluation
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler    # assembles feature columns into a single vector
from pyspark.ml.evaluation import BinaryClassificationEvaluator


# this step could also be folded into a Pipeline; see the sketch after the predictions below
featureColumns = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
                  'hours_per_week', 'workclass_cate', 'education_cate', 'marital_cate',
                  'occupation_cate', 'relationship_cate', 'race_cate', 'sex_cate', 'native_country_cate']
assembler = VectorAssembler(inputCols = featureColumns,outputCol='features')

assembledTrain = assembler.transform(train)
assembledTest = assembler.transform(test)
assembledValidation = assembler.transform(validation)

assembledTest = assembledTest.select('label', 'features').withColumn('label', assembledTest.label.cast(IntegerType()))
assembledValidation = assembledValidation.select('label', 'features').withColumn('label',assembledValidation.label.cast(IntegerType()))
gbtc = GBTClassifier(maxIter=10, maxDepth=3, labelCol='label', seed=422,maxBins=64)

gbtc_model = gbtc.fit(assembledTrain)

test_pred = gbtc_model.transform(assembledTest)
valid_pred = gbtc_model.transform(assembledValidation)
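
As the comment before the VectorAssembler hinted, the assembler and the classifier can themselves be Pipeline stages, so assembly and prediction happen in one transform; a minimal sketch (gbtc_pipeline and test_pred_pl are illustrative names):

from pyspark.ml import Pipeline

# one fitted object that assembles the features and then predicts
gbtc_pipeline = Pipeline(stages=[assembler, gbtc]).fit(train)
test_pred_pl = gbtc_pipeline.transform(test)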
# evaluate AUC on the two held-out sets
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='label', metricName='areaUnderROC')
print('pyspark test       score : ',evaluator.evaluate(test_pred))
print('pyspark validation score : ',evaluator.evaluate(valid_pred))
pyspark test       score :  0.9084413650071548
pyspark validation score :  0.9079059839660263
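
Since the label is imbalanced (about 7.8k positives vs 24.7k negatives, per the counts in section 2.3), area under the precision-recall curve is a useful complement to ROC AUC; a sketch using the same evaluator class:

pr_evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                             labelCol='label', metricName='areaUnderPR')
print('pyspark validation PR score : ', pr_evaluator.evaluate(valid_pred))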

3b. Classify using mmlspark FindBestModel

  1. TrainClassifier : fit the candidate models on the train set

  2. FindBestModel : pick the best of those models on the test set according to a chosen metric

  3. ComputeModelStatistics : compute the final metrics on the validation set

from mmlspark import TrainClassifier, ComputeModelStatistics
from mmlspark import FindBestModel
train, test, validation = data.randomSplit([0.60, 0.20, 0.20], seed=2020)

# hyperparameter grid over maxDepth
maxDepths = [2, 3, 4, 5, 7]
gbtcs = [GBTClassifier(maxIter=10, maxDepth=d, labelCol='label', seed=422, maxBins=64)
         for d in maxDepths]

gbtc_models = [TrainClassifier(model=gbtc, labelCol="label").fit(train) for gbtc in gbtcs]
# Select the best model
bestModel = FindBestModel(evaluationMetric="AUC", models=gbtc_models).fit(test)

# Save model
#bestModel.write().overwrite().save("gbtc.mml")
# Get AUC on the validation dataset
predictions = bestModel.transform(validation)
metrics = ComputeModelStatistics().transform(predictions)
print("Best model's AUC on validation set = "
      + "{0:.2f}%".format(metrics.first()["AUC"] * 100))
Best model's AUC on validation set = 78.43%
