from pyspark.sql import SparkSession
import pyspark.sql.types as typ

spark = SparkSession.builder.getOrCreate()
labels = [
('INFANT_ALIVE_AT_REPORT', typ.IntegerType()),
('BIRTH_PLACE', typ.StringType()),
('MOTHER_AGE_YEARS', typ.IntegerType()),
('FATHER_COMBINED_AGE', typ.IntegerType()),
('CIG_BEFORE', typ.IntegerType()),
('CIG_1_TRI', typ.IntegerType()),
('CIG_2_TRI', typ.IntegerType()),
('CIG_3_TRI', typ.IntegerType()),
('MOTHER_HEIGHT_IN', typ.IntegerType()),
('MOTHER_PRE_WEIGHT', typ.IntegerType()),
('MOTHER_DELIVERY_WEIGHT', typ.IntegerType()),
('MOTHER_WEIGHT_GAIN', typ.IntegerType()),
('DIABETES_PRE', typ.IntegerType()),
('DIABETES_GEST', typ.IntegerType()),
('HYP_TENS_PRE', typ.IntegerType()),
('HYP_TENS_GEST', typ.IntegerType()),
('PREV_BIRTH_PRETERM', typ.IntegerType())
]
# Build the DataFrame schema from the list above (all fields non-nullable)
schema = typ.StructType([
    typ.StructField(e[0], e[1], False) for e in labels
])
# Read the gzipped CSV with the explicit schema
births = spark.read.csv('file:///Program Files/Pyproject/pyspark/data/births_transformed.csv.gz',
                        header=True,
                        schema=schema)
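As a quick sanity check, you can confirm that the file loaded with the expected schema and row count:
# Verify the load
births.printSchema()
print(births.count())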
import pyspark.ml.feature as ft
# Use the OneHotEncoder to encode the BIRTH_PLACE column. The encoder does not accept StringType columns: it can only handle numeric types, so first cast the column to IntegerType
births = births.withColumn('BIRTH_PLACE_INT',
births['BIRTH_PLACE'].cast(typ.IntegerType()))
# Create the transformer
encoder = ft.OneHotEncoder(inputCol='BIRTH_PLACE_INT',
outputCol='BIRTH_PLACE_VEC')
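As a standalone check of the encoder (a sketch: since Spark 3.0 OneHotEncoder is an Estimator, hence the .fit() call; in Spark 2.x it was a pure Transformer and .transform() alone sufficed):
# Sketch: inspect the one-hot output outside the pipeline
encoded = encoder.fit(births).transform(births)
encoded.select('BIRTH_PLACE_INT', 'BIRTH_PLACE_VEC').show(5)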
Create a single column that gathers all the features together, using the VectorAssembler method:
featuresCreator = ft.VectorAssembler(
    inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
    outputCol='features'
)
The inputCols parameter passed to the VectorAssembler object is a list of all the columns to be combined together to form the outputCol, the 'features' column.
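To see what VectorAssembler does in isolation, here is a minimal toy example (the DataFrame and column names are hypothetical, not part of the births data):
# Sketch: VectorAssembler packs the listed columns into one vector column
toy = spark.createDataFrame([(1.0, 2.0), (3.0, 4.0)], ['a', 'b'])
ft.VectorAssembler(inputCols=['a', 'b'], outputCol='features') \
    .transform(toy) \
    .show()  # the features column holds [1.0,2.0] and [3.0,4.0]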
Next, use a logistic regression model.
import pyspark.ml.classification as cl
# Create the model
logistic = cl.LogisticRegression(maxIter=10,
regParam=0.01,
labelCol='INFANT_ALIVE_AT_REPORT')
If the target column were named 'label', we would not have to specify the labelCol parameter. And if the output of featuresCreator were not named 'features', we would have to specify featuresCol, most conveniently by calling getOutputCol() on the featuresCreator object.
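For illustration, wiring featuresCol explicitly would look like this (a sketch; redundant here, since the assembler's output is already named 'features'):
# Sketch: explicit wiring of the features column to the assembler's output
logistic_explicit = cl.LogisticRegression(
    maxIter=10,
    regParam=0.01,
    featuresCol=featuresCreator.getOutputCol(),  # 'features' in this case
    labelCol='INFANT_ALIVE_AT_REPORT')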
Build the pipeline and fit the model.
from pyspark.ml import Pipeline
(Figure: conceptual diagram of the pipeline.)
pipeline = Pipeline(stages=[encoder,
                            featuresCreator,
                            logistic])
Before fitting the model, we need to split the dataset into a training set and a test set. The DataFrame API provides the .randomSplit() method:
births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)
The seed parameter supplies a seed to the randomizer, making the split reproducible.
The dataset can be split into more than two subsets, returning one DataFrame per weight; the weights are normalized if they do not sum to 1.
train, test, val = births.randomSplit([0.7, 0.2, 0.1], seed=666)
The line above randomly puts 70% of the births data into the training set and 20% into the test set, while the val DataFrame holds the remaining 10%.
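You can verify the proportions (approximately, since the assignment is random):
# Sketch: check the relative sizes of the three splits
total = births.count()
for name, df in [('train', train), ('test', test), ('val', val)]:
    print(name, df.count() / total)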
# Run the pipeline and estimate the model:
model = pipeline.fit(births_train)
test_model = model.transform(births_test)
The Pipeline object's .fit() method takes the training dataset as input. The births_train dataset is first passed to the encoder object; the DataFrame created at the encoder stage is then passed to featuresCreator, which creates the 'features' column; finally, that output is passed to the logistic object, which estimates the final model.
The .fit() method returns a PipelineModel object that can be used for prediction; we obtain the predictions by passing the previously created test dataset to the .transform() method. A sample of test_model, via the following command:
test_model.take(1)
[Row(INFANT_ALIVE_AT_REPORT=0, BIRTH_PLACE='1', MOTHER_AGE_YEARS=13, FATHER_COMBINED_AGE=99, CIG_BEFORE=0, CIG_1_TRI=0, CIG_2_TRI=0, CIG_3_TRI=0, MOTHER_HEIGHT_IN=66, MOTHER_PRE_WEIGHT=133, MOTHER_DELIVERY_WEIGHT=135, MOTHER_WEIGHT_GAIN=2, DIABETES_PRE=0, DIABETES_GEST=0, HYP_TENS_PRE=0, HYP_TENS_GEST=0, PREV_BIRTH_PRETERM=0, BIRTH_PLACE_INT=1, BIRTH_PLACE_VEC=SparseVector(9, {1: 1.0}), features=SparseVector(24, {0: 13.0, 1: 99.0, 6: 66.0, 7: 133.0, 8: 135.0, 9: 2.0, 16: 1.0}), rawPrediction=DenseVector([1.0573, -1.0573]), probability=DenseVector([0.7422, 0.2578]), prediction=0.0)]
The logistic regression model appends several columns to the output: rawPrediction is the value of the linear combination of the features and the β coefficients, probability is the calculated probability for each class, and prediction is the final class assignment.
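To look at just those three columns:
# Show only the prediction-related columns
test_model.select('rawPrediction', 'probability', 'prediction') \
    .show(3, truncate=False)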
import pyspark.ml.evaluation as ev
# Use BinaryClassificationEvaluator to check the model's performance:
evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability',
                                             labelCol='INFANT_ALIVE_AT_REPORT')
The rawPredictionCol can either be the rawPrediction column produced by the estimator or the probability column.
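For instance, an equivalent evaluator built on rawPrediction instead would be (a sketch):
# Sketch: evaluate on the rawPrediction column rather than probability
evaluator_raw = ev.BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='INFANT_ALIVE_AT_REPORT')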
print(evaluator.evaluate(test_model,
                         {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_model,
                         {evaluator.metricName: 'areaUnderPR'}))
0.7401301847095617
0.7139354342365674
PySpark allows the Pipeline definition to be saved for later use: it saves not only the pipeline structure but also the definitions of all the Transformers and Estimators:
pipelinePath = './infant_oneHotEncoder_Logistic_Pipeline'
pipeline.write().overwrite().save(pipelinePath)
So you can load it later and use .fit() and predict straight away:
loadedPipeline = Pipeline.load(pipelinePath)
loadedPipeline.fit(births_train).transform(births_test).take(1)
[Row(INFANT_ALIVE_AT_REPORT=0, BIRTH_PLACE='1', MOTHER_AGE_YEARS=13, FATHER_COMBINED_AGE=99, CIG_BEFORE=0, CIG_1_TRI=0, CIG_2_TRI=0, CIG_3_TRI=0, MOTHER_HEIGHT_IN=66, MOTHER_PRE_WEIGHT=133, MOTHER_DELIVERY_WEIGHT=135, MOTHER_WEIGHT_GAIN=2, DIABETES_PRE=0, DIABETES_GEST=0, HYP_TENS_PRE=0, HYP_TENS_GEST=0, PREV_BIRTH_PRETERM=0, BIRTH_PLACE_INT=1, BIRTH_PLACE_VEC=SparseVector(9, {1: 1.0}), features=SparseVector(24, {0: 13.0, 1: 99.0, 6: 66.0, 7: 133.0, 8: 135.0, 9: 2.0, 16: 1.0}), rawPrediction=DenseVector([1.0573, -1.0573]), probability=DenseVector([0.7422, 0.2578]), prediction=0.0)]
The code above produces the same result.
You can also save the estimated model. In that case, you save the PipelineModel rather than the Pipeline:
from pyspark.ml import PipelineModel
modelPath = './infant_oneHotEncoder_Logistic_PipelineModel'
model.write().overwrite().save(modelPath)
loadedPipelineModel = PipelineModel.load(modelPath)
test_reloadedModel = loadedPipelineModel.transform(births_test)
test_reloadedModel.take(1)
[Row(INFANT_ALIVE_AT_REPORT=0, BIRTH_PLACE='1', MOTHER_AGE_YEARS=13, FATHER_COMBINED_AGE=99, CIG_BEFORE=0, CIG_1_TRI=0, CIG_2_TRI=0, CIG_3_TRI=0, MOTHER_HEIGHT_IN=66, MOTHER_PRE_WEIGHT=133, MOTHER_DELIVERY_WEIGHT=135, MOTHER_WEIGHT_GAIN=2, DIABETES_PRE=0, DIABETES_GEST=0, HYP_TENS_PRE=0, HYP_TENS_GEST=0, PREV_BIRTH_PRETERM=0, BIRTH_PLACE_INT=1, BIRTH_PLACE_VEC=SparseVector(9, {1: 1.0}), features=SparseVector(24, {0: 13.0, 1: 99.0, 6: 66.0, 7: 133.0, 8: 135.0, 9: 2.0, 16: 1.0}), rawPrediction=DenseVector([1.0573, -1.0573]), probability=DenseVector([0.7422, 0.2578]), prediction=0.0)]
The first model you build is almost never the best possible one, and simply looking at a set of metrics and accepting the model because it passes a pre-set performance threshold is not a scientific way of finding the best model.
The idea behind hyperparameter tuning is to find the best parameters for the model: for example, the maximum number of iterations for logistic regression, or the maximum depth of a decision tree.
There are two approaches to finding the best parameters for a model: grid search and train-validation splitting.
Grid search is an exhaustive algorithm that loops through a defined list of parameter values, estimates each individual model, and selects the best one according to a given evaluation metric.
If you define too many parameters to optimize, or too many values for those parameters, selecting the best model can take a long time, because the number of models to estimate grows rapidly as parameters and parameter values are added.
# Load the .tuning subpackage
import pyspark.ml.tuning as tune
# Specify the model and the list of parameter values to loop through
logistic = cl.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT')
grid = tune.ParamGridBuilder() \
    .addGrid(logistic.maxIter, [2, 10, 50]) \
    .addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \
    .build()
First specify the model whose parameters you want to optimize, then decide which parameters to optimize and which values to test. We use the ParamGridBuilder() object from the .tuning subpackage, adding parameters to the grid with the .addGrid() method: the first argument is the parameter object of the model to optimize, and the second is the list of values to loop through. Finally, calling .build() on the ParamGridBuilder builds the grid.
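The grid built this way is simply a list of parameter maps, one per candidate model, so you can check how many models will be estimated:
print(len(grid))  # 3 values of maxIter x 3 values of regParam = 9 models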
evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability',
                                             labelCol='INFANT_ALIVE_AT_REPORT')
We use the BinaryClassificationEvaluator again. Next, create the logic that will do the validation for us:
cv = tune.CrossValidator(estimator=logistic,
                         estimatorParamMaps=grid,
                         evaluator=evaluator)
The CrossValidator needs the estimator, the estimatorParamMaps, and the evaluator. The model loops through the grid of values, estimates each model, and compares their performance using the evaluator; by default it performs 3-fold cross-validation.
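If you want a different number of folds, set numFolds explicitly (a sketch; the choice of 5 is hypothetical):
# Sketch: a cross-validator with an explicit fold count
cv_5fold = tune.CrossValidator(estimator=logistic,
                               estimatorParamMaps=grid,
                               evaluator=evaluator,
                               numFolds=5)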
# The CrossValidator holds the logistic model itself, so build a
# transformer-only pipeline for the remaining stages
pipeline = Pipeline(stages=[encoder, featuresCreator])
data_transformer = pipeline.fit(births_train)
# 寻找最佳参数组合
cvModel = cv.fit(data_transformer.transform(births_train))
data_test = data_transformer.transform(births_test)
results = cvModel.transform(data_test)
print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))
To see which parameters the best model used, extract the parameter maps and their average metrics, then sort by the metric:
results = [
    (
        [
            {key.name: paramValue}
            for key, paramValue in zip(params.keys(),
                                       params.values())
        ], metric)
    for params, metric in zip(
        cvModel.getEstimatorParamMaps(),
        cvModel.avgMetrics)
]
sorted(results,
       key=lambda el: el[1],
       reverse=True)[0]
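Train-validation splitting, mentioned earlier as the second approach, works with the same grid and evaluator; instead of k folds, TrainValidationSplit estimates each candidate once against a single random train/validation split (a minimal sketch, reusing the objects defined above):
# Sketch: the train-validation-split counterpart of the CrossValidator
tvs = tune.TrainValidationSplit(estimator=logistic,
                                estimatorParamMaps=grid,
                                evaluator=evaluator,
                                trainRatio=0.75)  # 75% train, 25% validation
tvsModel = tvs.fit(data_transformer.transform(births_train))
print(evaluator.evaluate(tvsModel.transform(data_test)))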