mmlspark
在下面的 Recipes 中,会在同一个任务上分别使用 pyspark 和 mmlspark 两个库各训练一个分类器。这里仍然使用 Adult Census 数据集,预测一个人的收入是否超过 $50k。
数据下载地址: https://www.kaggle.com/uciml/adult-census-income/data
import warnings
import numpy as np
from pyspark import SparkConf
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType
warnings.filterwarnings('ignore')
# Build (or reuse) the Spark session, requesting the MMLSpark package.
session_builder = SparkSession.builder.appName("MyApp")
session_builder = session_builder.config("spark.jars.packages", "Azure:mmlspark:0.17")
spark = session_builder.getOrCreate()

# Read the Adult Census CSV from HDFS; the first row is the header and
# column types are inferred.
data = spark.read.csv('hdfs:///user/hadoop/adult.csv', header=True, inferSchema=True)

# Pull the first ten rows into pandas for a quick look.
data.limit(10).toPandas()
 | age | workclass | fnlwgt | education | education.num | marital.status | occupation | relationship | race | sex | capital.gain | capital.loss | hours.per.week | native.country | income |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 90 | ? | 77053 | HS-grad | 9 | Widowed | ? | Not-in-family | White | Female | 0 | 4356 | 40 | United-States | <=50K |
1 | 82 | Private | 132870 | HS-grad | 9 | Widowed | Exec-managerial | Not-in-family | White | Female | 0 | 4356 | 18 | United-States | <=50K |
2 | 66 | ? | 186061 | Some-college | 10 | Widowed | ? | Unmarried | Black | Female | 0 | 4356 | 40 | United-States | <=50K |
3 | 54 | Private | 140359 | 7th-8th | 4 | Divorced | Machine-op-inspct | Unmarried | White | Female | 0 | 3900 | 40 | United-States | <=50K |
4 | 41 | Private | 264663 | Some-college | 10 | Separated | Prof-specialty | Own-child | White | Female | 0 | 3900 | 40 | United-States | <=50K |
5 | 34 | Private | 216864 | HS-grad | 9 | Divorced | Other-service | Unmarried | White | Female | 0 | 3770 | 45 | United-States | <=50K |
6 | 38 | Private | 150601 | 10th | 6 | Separated | Adm-clerical | Unmarried | White | Male | 0 | 3770 | 40 | United-States | <=50K |
7 | 74 | State-gov | 88638 | Doctorate | 16 | Never-married | Prof-specialty | Other-relative | White | Female | 0 | 3683 | 20 | United-States | >50K |
8 | 68 | Federal-gov | 422013 | HS-grad | 9 | Divorced | Prof-specialty | Not-in-family | White | Female | 0 | 3683 | 40 | United-States | <=50K |
9 | 41 | Private | 70037 | Some-college | 10 | Never-married | Craft-repair | Unmarried | White | Male | 0 | 3004 | 60 | ? | >50K |
# Replace the dots in column names with underscores, one rename at a time.
dotted_to_snake = [
    ('education.num', 'education_num'),
    ('marital.status', 'marital_status'),
    ('capital.gain', 'capital_gain'),
    ('capital.loss', 'capital_loss'),
    ('hours.per.week', 'hours_per_week'),
    ('native.country', 'native_country'),
]
for dotted_name, snake_name in dotted_to_snake:
    data = data.withColumnRenamed(dotted_name, snake_name)

# Print the resulting schema.
data.printSchema()
root
|-- age: integer (nullable = true)
|-- workclass: string (nullable = true)
|-- fnlwgt: integer (nullable = true)
|-- education: string (nullable = true)
|-- education_num: integer (nullable = true)
|-- marital_status: string (nullable = true)
|-- occupation: string (nullable = true)
|-- relationship: string (nullable = true)
|-- race: string (nullable = true)
|-- sex: string (nullable = true)
|-- capital_gain: integer (nullable = true)
|-- capital_loss: integer (nullable = true)
|-- hours_per_week: integer (nullable = true)
|-- native_country: string (nullable = true)
|-- income: string (nullable = true)
add label (income)
def _income_to_label(income):
    """Return 0 for the '<=50K' bracket and 1 for every other value."""
    if income == '<=50K':
        return 0
    return 1


# Wrap the mapping as an integer-typed UDF and derive the label column.
add_label = F.udf(_income_to_label, IntegerType())
income_column = data['income']
data = data.withColumn('label', add_label(income_column))

# Show how many rows fall under each label.
label_counts = data.groupBy('label').count()
label_counts.show()
+-----+-----+
|label|count|
+-----+-----+
| 1| 7841|
| 0|24720|
+-----+-----+
workclass
# Work class: count the rows for each distinct workclass value.
rows_by_workclass = data.groupBy('workclass')
work_class_count = rows_by_workclass.count()
work_class_count.show()
+----------------+-----+
| workclass|count|
+----------------+-----+
|Self-emp-not-inc| 2541|
| Local-gov| 2093|
| State-gov| 1298|
| Private|22696|
| Without-pay| 14|
| Federal-gov| 960|
| Never-worked| 7|
| ?| 1836|
| Self-emp-inc| 1116|
+----------------+-----+
对工作类型进行数值编码
# Integer code for every workclass value; built once at definition time
# rather than on every call (the function runs once per row as a UDF).
_WORKCLASS_CODES = {
    'Self-emp-not-inc': 0,
    'Local-gov': 1,
    'State-gov': 2,
    'Private': 3,
    'Without-pay': 4,
    'Federal-gov': 5,
    'Never-worked': 6,
    '?': 7,
    'Self-emp-inc': 8,
}


def workclass_encode(workclass):
    """Return the integer code for a workclass string.

    Args:
        workclass: the raw workclass value, or None.

    Returns:
        The code from the table above. A value that is not in the table
        (including None) gets the code of '?', the table's own
        unknown-value entry, instead of raising KeyError.
    """
    return _WORKCLASS_CODES.get(workclass, _WORKCLASS_CODES['?'])
# Expose workclass_encode as an integer-typed Spark UDF and use it to add
# the workclass_cate column.
workclass_encodeUDF = F.udf(workclass_encode, IntegerType())
workclass_column = data['workclass']
data = data.withColumn('workclass_cate', workclass_encodeUDF(workclass_column))
education
特征离散类别较少的时候,UDF还能应付, 较多的时候就要用 StringIndexer()
# Index the education strings into a numeric education_cate column.
edu_encode = StringIndexer(
    inputCol='education',
    outputCol='education_cate',
)
edu_model = edu_encode.fit(data)
data = edu_model.transform(data)
marital_status
# Index the marital_status strings into a numeric marital_cate column.
marital_encode = StringIndexer(
    inputCol='marital_status',
    outputCol='marital_cate',
)
marital_model = marital_encode.fit(data)
data = marital_model.transform(data)
occupation
# Index the occupation strings into a numeric occupation_cate column.
occupation_encode = StringIndexer(
    inputCol='occupation',
    outputCol='occupation_cate',
)
occupation_model = occupation_encode.fit(data)
data = occupation_model.transform(data)
relationship
# Index the relationship strings into a numeric relationship_cate column.
relationship_encode = StringIndexer(
    inputCol='relationship',
    outputCol='relationship_cate',
)
relationship_model = relationship_encode.fit(data)
data = relationship_model.transform(data)
race
# Index the race strings into a numeric race_cate column.
race_encode = StringIndexer(
    inputCol='race',
    outputCol='race_cate',
)
race_model = race_encode.fit(data)
data = race_model.transform(data)
sex
# Index the sex strings into a numeric sex_cate column.
sex_encode = StringIndexer(
    inputCol='sex',
    outputCol='sex_cate',
)
sex_model = sex_encode.fit(data)
data = sex_model.transform(data)
native_country
# Index the native_country strings into a numeric native_country_cate column.
native_country_encode = StringIndexer(
    inputCol='native_country',
    outputCol='native_country_cate',
)
native_country_model = native_country_encode.fit(data)
data = native_country_model.transform(data)
上面的操作都可以放在Pipeline中
def _label_from_income(income):
    """Return 0 for the '<=50K' bracket and 1 for every other value."""
    return 0 if income == '<=50K' else 1


# Integer-typed UDF that turns the income string into the binary label.
add_label_udf = F.udf(_label_from_income, IntegerType())
# Integer code for every workclass value; built once at definition time
# rather than on every call (the function runs once per row as a UDF).
_WORKCLASS_CODES = {
    'Self-emp-not-inc': 0,
    'Local-gov': 1,
    'State-gov': 2,
    'Private': 3,
    'Without-pay': 4,
    'Federal-gov': 5,
    'Never-worked': 6,
    '?': 7,
    'Self-emp-inc': 8,
}


def workclass_encode(workclass):
    """Return the integer code for a workclass string.

    Args:
        workclass: the raw workclass value, or None.

    Returns:
        The code from the table above. A value that is not in the table
        (including None) gets the code of '?', the table's own
        unknown-value entry, instead of raising KeyError.
    """
    return _WORKCLASS_CODES.get(workclass, _WORKCLASS_CODES['?'])
from mmlspark import UDFTransformer


def _string_indexer(source_col, encoded_col):
    """Build an unfitted StringIndexer reading source_col and writing encoded_col."""
    return StringIndexer(inputCol=source_col, outputCol=encoded_col)


# Integer-typed UDF around workclass_encode.
workclass_encodeUDF = F.udf(workclass_encode, IntegerType())

# One unfitted indexer per remaining string column; the pipeline fits them.
edu_encode = _string_indexer('education', 'education_cate')
marital_encode = _string_indexer('marital_status', 'marital_cate')
occupation_encode = _string_indexer('occupation', 'occupation_cate')
relationship_encode = _string_indexer('relationship', 'relationship_cate')
race_encode = _string_indexer('race', 'race_cate')
sex_encode = _string_indexer('sex', 'sex_cate')
native_country_encode = _string_indexer('native_country', 'native_country_cate')

# Wrap the two UDFs as transformers so they can be used as pipeline stages.
incomeTransformer = UDFTransformer(
    inputCol='income',
    outputCol='label',
    udf=add_label_udf,
)
workclassTransformer = UDFTransformer(
    inputCol='workclass',
    outputCol='workclass_cate',
    udf=workclass_encodeUDF,
)
# Read the raw CSV again and redo the dot-to-underscore column renames.
data = spark.read.csv('hdfs:///user/hadoop/adult.csv', header=True, inferSchema=True)
column_renames = (
    ('education.num', 'education_num'),
    ('marital.status', 'marital_status'),
    ('capital.gain', 'capital_gain'),
    ('capital.loss', 'capital_loss'),
    ('hours.per.week', 'hours_per_week'),
    ('native.country', 'native_country'),
)
for original_name, renamed in column_renames:
    data = data.withColumnRenamed(original_name, renamed)
# Pipeline is not among the file's top-level imports, so import it here;
# without this the Pipeline(...) call below raises NameError.
from pyspark.ml import Pipeline

# Fit all encoding stages as a single pipeline.
encoding_stages = [
    workclassTransformer,
    edu_encode,
    marital_encode,
    occupation_encode,
    relationship_encode,
    race_encode,
    sex_encode,
    native_country_encode,
    incomeTransformer,
]
feat_encode_pipeline = Pipeline(stages=encoding_stages).fit(data)

# Apply the fitted pipeline to add the encoded columns and the label.
data = feat_encode_pipeline.transform(data)

# Drop the raw string columns that the stages above read from.
data = data.drop('workclass', 'income', 'education', 'marital_status', 'occupation',
                 'relationship', 'race', 'sex', 'native_country')

# 60/20/20 random split with a fixed seed.
train, test, validation = data.randomSplit([0.60, 0.20, 0.20], seed=422)
pyspark
# GBTC
# (The heading above was fused onto the import line, which made it a syntax error.)
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler  # merges the feature columns into one vector column
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# This assembling step could also be added to the Pipeline.
featureColumns = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
                  'hours_per_week', 'workclass_cate', 'education_cate', 'marital_cate',
                  'occupation_cate', 'relationship_cate', 'race_cate', 'sex_cate', 'native_country_cate']
assembler = VectorAssembler(inputCols=featureColumns, outputCol='features')
assembledTrain = assembler.transform(train)
assembledTest = assembler.transform(test)
assembledValidation = assembler.transform(validation)

# Keep only label and features for the two held-out splits, with label cast to integer.
assembledTest = assembledTest.select('label', 'features').withColumn('label', assembledTest.label.cast(IntegerType()))
assembledValidation = assembledValidation.select('label', 'features').withColumn('label', assembledValidation.label.cast(IntegerType()))

# Train the gradient-boosted tree classifier on the training split.
gbtc = GBTClassifier(maxIter=10, maxDepth=3, labelCol='label', seed=422, maxBins=64)
gbtc_model = gbtc.fit(assembledTrain)
test_pred = gbtc_model.transform(assembledTest)
valid_pred = gbtc_model.transform(assembledValidation)

# Evaluate area under the ROC curve.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label', metricName='areaUnderROC')
# NOTE(review): the messages say "train"/"test", but the scores are computed on
# the test and validation splits respectively. The strings are left unchanged
# so they still match the recorded output; consider renaming them.
print('pyspark train score : ',evaluator.evaluate(test_pred))
print('pyspark test score : ',evaluator.evaluate(valid_pred))
pyspark train score : 0.9084413650071548
pyspark test score : 0.9079059839660263
mmlspark
FindBestModel
TrainClassifier: 在 train 数据集上训练模型(数据集划分为 train, test, validation)
FindBestModel: 根据特定的 metric 在 test 数据集上选择最好的模型
ComputeModelStatistics: 在 validation 数据集上计算模型的评估指标(validation dataset)
from mmlspark import TrainClassifier, ComputeModelStatistics
from mmlspark import FindBestModel
# New 60/20/20 split: candidates are fitted on train, compared on test,
# and the winner is scored on validation.
split_weights = [0.60, 0.20, 0.20]
train, test, validation = data.randomSplit(split_weights, seed=2020)

# Candidate maxDepth values, one GBTClassifier per value.
lrHyperParams = [2, 3, 4, 5, 7]
gbtcs = [
    GBTClassifier(maxIter=10, maxDepth=max_depth, labelCol='label', seed=422, maxBins=64)
    for max_depth in lrHyperParams
]

# Fit each candidate on the training split through TrainClassifier.
gbtc_models = [
    TrainClassifier(model=candidate, labelCol="label").fit(train)
    for candidate in gbtcs
]

# Select the best model by AUC on the test split.
best_model_finder = FindBestModel(evaluationMetric="AUC", models=gbtc_models)
bestModel = best_model_finder.fit(test)

# Save model
#bestModel.write().overwrite().save("gbtc.mml")

# Compute the statistics of the best model on the validation split and report AUC.
predictions = bestModel.transform(validation)
metrics = ComputeModelStatistics().transform(predictions)
validation_auc_percent = metrics.first()["AUC"] * 100
print("Best model's AUC on validation set = "
      + "{0:.2f}%".format(validation_auc_percent))
Best model's AUC on validation set = 78.43%