本文主要分为两大部分:
第一部分是训练模型、交叉验证调参以及保存模型;
第二部分是加载模型并在测试数据集上评估效果,以及获得特征重要性排序。
# -*- coding:utf-8 -*-
### Load the training window and pick the feature columns
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("RF").enableHiveSupport().getOrCreate()
data = spark.sql("SELECT * FROM db.data where event_date >= '2019-06-21' and event_date <= '2019-06-23'")
# Null handling: 0.0 for numeric columns, '0' for string columns.
# ('column', ... are placeholders -- substitute the real column names.)
data = data.na.fill(value=0.0, subset=['column', ...])
data = data.na.fill(value='0', subset=['column', ...])
# Feature columns: everything between the leading 21 meta columns and the
# trailing column (assumed to be the label/partition column).
featureList = list(data.columns[21:-1])
### Feature engineering
import pyspark.ml.feature as ft
# String-index the label plus the two categorical columns; each indexer is
# fitted on the data as transformed so far (handleInvalid='skip' drops rows
# with categories unseen at fit time).
for src_col, dst_col in [("label", "indexedLabel"),
                         ("e_localday", "indexedE_localday"),
                         ("is_organic", "indexedIs_organic")]:
    indexer = ft.StringIndexer(inputCol=src_col, outputCol=dst_col, handleInvalid='skip')
    data = indexer.fit(data).transform(data)
# Inverse mapping from indexed label back to the original string;
# handy when inspecting prediction output.
labelConverter = ft.IndexToString(inputCol="indexedLabel", outputCol="predictedLabel")
# Assemble all feature columns into a single vector column.
# NOTE(review): featureList was captured BEFORE the indexed columns were added,
# so indexedE_localday / indexedIs_organic never enter 'features' -- confirm intended.
featuresCreator = ft.VectorAssembler(inputCols=featureList, outputCol='features')
### Create the estimator
import pyspark.ml.classification as cl
RFclassifier = cl.RandomForestClassifier(labelCol='indexedLabel')
### Grid search: hyper-parameter candidates for cross-validation
import pyspark.ml.tuning as tune
grid = (tune.ParamGridBuilder()
        .addGrid(RFclassifier.maxDepth, [6, 8, 10])
        .addGrid(RFclassifier.numTrees, [20, 25, 30])
        .addGrid(RFclassifier.featureSubsetStrategy, ['onethird', 'sqrt', 'log2'])
        .build())
### Pipeline whose only stage assembles the feature vector
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[featuresCreator])
data_transformer = pipeline.fit(data)
## Model fitting and performance evaluation
import pyspark.ml.evaluation as ev
# Binary evaluator scoring the 'probability' column against the indexed label
# (default metric is areaUnderROC).
evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='indexedLabel')
# 5-fold cross-validation over the parameter grid
cv = tune.CrossValidator(estimator=RFclassifier,
                         estimatorParamMaps=grid,
                         evaluator=evaluator,
                         numFolds=5)
# Assemble the feature vector once, then fit and score on the same DataFrame
assembled = data_transformer.transform(data)
cvModel = cv.fit(assembled)
prediction = cvModel.transform(assembled)
# To inspect predictions (first 10 rows), uncomment:
#results = prediction.select("advertising_id", "prediction", "probability", "label")
#results.show(10)
### Show the best hyper-parameter combination found by cross-validation.
# Pair each grid point (a {Param: value} map) with its mean CV metric,
# rendering each Param by its short name for readability.
parameters = [
    ([{param.name: value} for param, value in params.items()], metric)
    for params, metric in zip(cvModel.getEstimatorParamMaps(),
                              cvModel.avgMetrics)
]
# Best combination = highest average metric (areaUnderROC by default).
print(max(parameters, key=lambda el: el[1]))
# Save the winning model so the test script can reload it.
from pyspark.ml import PipelineModel  # unused here; kept from the original
RFmodelPath = './model/rfModel'
# NOTE(review): the test script loads '/user/ai_data/zhangxin/model/rfModel';
# confirm this relative path resolves to the same storage location.
cvModel.bestModel.write().overwrite().save(RFmodelPath)
第二部分:加载训练好的模型,并在测试数据集上验证效果。
# -*- coding:utf-8 -*-
### Load the held-out window and repeat the training-time preprocessing
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("RF_test").enableHiveSupport().getOrCreate()
test_data = spark.sql("SELECT * FROM db.data where event_date >= '2019-06-24' and event_date <= '2019-06-28'")
# Same null handling as the training script (placeholder column lists).
test_data = test_data.na.fill(value=0.0, subset=['column', ...])
test_data = test_data.na.fill(value='0', subset=['column', ...])
# Same feature-column slice as training.
featureList = list(test_data.columns[21:-1])
import pyspark.ml.feature as ft
# Re-index the label and the two categorical columns.
# NOTE(review): fitting fresh StringIndexers on test data may produce a
# different category->index mapping than training; reusing the fitted
# training indexers would be safer -- confirm.
for src_col, dst_col in [("label", "indexedLabel"),
                         ("e_localday", "indexedE_localday"),
                         ("is_organic", "indexedIs_organic")]:
    indexer = ft.StringIndexer(inputCol=src_col, outputCol=dst_col, handleInvalid='skip')
    test_data = indexer.fit(test_data).transform(test_data)
# Assemble the feature vector exactly as in training.
featuresCreator = ft.VectorAssembler(inputCols=featureList, outputCol='features')
test_data = featuresCreator.transform(test_data)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Default rawPredictionCol is 'rawPrediction'; only the label column is overridden.
evaluator = BinaryClassificationEvaluator().setLabelCol('indexedLabel')
### Load the model saved by the training script
from pyspark.ml.classification import RandomForestClassificationModel
rf = RandomForestClassificationModel.load("/user/ai_data/zhangxin/model/rfModel")
# Score once and evaluate both metrics on the same predictions
# (the original transformed the test set twice).
predictions = rf.transform(test_data)
# AUC (areaUnderROC, the evaluator's default metric)
print(evaluator.evaluate(predictions))
# AP (areaUnderPR)
print(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"}))
### Print each feature's importance in the model, sorted by weight descending.
features_important = rf.featureImportances  # SparseVector over feature slots
# Dense importance value per feature-vector slot index.
importances = features_important.toArray()
# Map vector slot index -> original column name via the VectorAssembler
# metadata attached to the 'features' column.
name_index = test_data.schema["features"].metadata["ml_attr"]["attrs"]
# The original code zipped the sparse vector's nonzero *indices* against the
# *dense* value array, silently misaligning and truncating the name->weight
# pairs; indexing the dense array by each attribute's 'idx' is the correct
# mapping and covers every numeric feature.
kv = {attr['name']: importances[attr['idx']] for attr in name_index['numeric']}
print(sorted(kv.items(),
             key=lambda el: el[1],
             reverse=True))