Random Forest with Cross-Validation in PySpark

This walkthrough has two parts:

Part 1 trains the model, tunes hyperparameters with cross-validation, and saves the best model.

Part 2 loads the saved model, evaluates it on a test set, and ranks the feature importances.

# -*- coding:utf-8 -*-

### Load the data and select the feature columns
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("RF").enableHiveSupport().getOrCreate()

data = spark.sql("SELECT * FROM db.data where event_date >= '2019-06-21' and event_date <= '2019-06-23'")
# Fill nulls in numeric columns
data = data.na.fill(value=0.0, subset=['column', ...])
# Fill nulls in string columns
data = data.na.fill(value='0', subset=['column', ...])

featureList = []
columns = data.columns
# Take the feature columns (this slice is specific to the table's column layout)
featureList.extend(columns[21:-1])

### Feature processing
import pyspark.ml.feature as ft

# Index categorical (string) features
labelIndexer = ft.StringIndexer(inputCol="label", outputCol="indexedLabel", handleInvalid='skip')
data = labelIndexer.fit(data).transform(data)
e_localday_indexed = ft.StringIndexer(inputCol="e_localday", outputCol="indexedE_localday", handleInvalid='skip')
data = e_localday_indexed.fit(data).transform(data)
is_organic_indexed = ft.StringIndexer(inputCol="is_organic", outputCol="indexedIs_organic", handleInvalid='skip')
data = is_organic_indexed.fit(data).transform(data)
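
# Note (not in the original): featureList was sliced before these indexers ran,
# so the indexed columns above are not automatically included; if e_localday and
# is_organic are meant as model inputs, their indexed versions would need to be
# appended, e.g. featureList.extend(['indexedE_localday', 'indexedIs_organic'])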

# Reverse mapping from indexed labels back to the original strings; useful when inspecting predictions
labelConverter = ft.IndexToString(inputCol="indexedLabel", outputCol="predictedLabel")

# Build the feature vector with VectorAssembler
featuresCreator = ft.VectorAssembler(inputCols=featureList, outputCol='features')

### Create the estimator
import pyspark.ml.classification as cl

RFclassifier = cl.RandomForestClassifier(
    labelCol='indexedLabel')
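
# featuresCol defaults to 'features', which matches the VectorAssembler
# output defined above.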

### Grid search: find the best hyperparameters via cross-validation
import pyspark.ml.tuning as tune

grid = tune.ParamGridBuilder()\
    .addGrid(RFclassifier.maxDepth, [6, 8, 10]) \
    .addGrid(RFclassifier.numTrees, [20, 25, 30])\
    .addGrid(RFclassifier.featureSubsetStrategy, ['onethird', 'sqrt', 'log2'])\
    .build()
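
# Three values per parameter gives 3*3*3 = 27 combinations; combined with the
# 5-fold cross-validation below, that is 27*5 = 135 model fits, so grid size
# drives training cost.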
    
### Create a pipeline
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[featuresCreator])
data_transformer = pipeline.fit(data)
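
# The pipeline only assembles the feature vector; the classifier itself is
# tuned by the CrossValidator below, so this stateless preprocessing runs
# once up front instead of inside every fold.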

## Fit the model & evaluate performance

# Evaluate model performance with BinaryClassificationEvaluator
import pyspark.ml.evaluation as ev

# Binary classification evaluator (defaults to areaUnderROC); the model's
# probability column is used here as the ranking score
evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='indexedLabel')

# 5-fold cross-validation
cv = tune.CrossValidator(
    estimator=RFclassifier, 
    estimatorParamMaps=grid, 
    evaluator=evaluator,
    numFolds=5
)
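
# Spark 2.3+ also accepts a parallelism argument on CrossValidator (e.g.
# parallelism=4) to evaluate several parameter combinations concurrently.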

# Fit; this runs the full grid search
cvModel = cv.fit(data_transformer.transform(data))

# Score the data with the best model found by cross-validation
prediction = cvModel.transform(data_transformer.transform(data))

#results = prediction.select("advertising_id", "prediction", "probability", "label")
# Inspect predictions (first 10 rows)
#results.show(10)

# Show the best parameter combination: pair every parameter map in the grid
# with its average cross-validation metric and take the highest-scoring one.
parameters = [
    ([{param.name: value} for param, value in params.items()], metric)
    for params, metric in zip(
        cvModel.getEstimatorParamMaps(),
        cvModel.avgMetrics
    )
]

print(sorted(parameters, key=lambda el: el[1], reverse=True)[0])
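
In Spark 3.0+ the tuned values can also be read directly off the best model (older releases do not expose the training params on the fitted Python model); a small sketch, assuming that Spark version:

best = cvModel.bestModel
# getOrDefault also accepts a param name as a string
print(best.getOrDefault('maxDepth'),
      best.getOrDefault('numTrees'),
      best.getOrDefault('featureSubsetStrategy'))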

# Save the best model. Because the CrossValidator tuned the classifier alone,
# bestModel is a RandomForestClassificationModel rather than a PipelineModel.

RFmodelPath = './model/rfModel'
cvModel.bestModel.write().overwrite().save(RFmodelPath)
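
Because the VectorAssembler sits outside the CrossValidator, only the classifier is saved above, and the test script below has to repeat the preprocessing by hand. A minimal sketch of an alternative (not what this post does): bundle the assembler and the best model into one PipelineModel so a single artifact can be loaded later.

from pyspark.ml import PipelineModel

# Both stages are fitted Transformers, so they can be composed directly
full_model = PipelineModel(stages=[featuresCreator, cvModel.bestModel])
full_model.write().overwrite().save('./model/rfPipelineModel')  # hypothetical path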

Load the trained model and evaluate it on the test set.

# -*- coding:utf-8 -*-

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("RF_test").enableHiveSupport().getOrCreate()


test_data = spark.sql("SELECT * FROM db.data where event_date >= '2019-06-24' and event_date <= '2019-06-28'")
# Same preprocessing steps as during training
test_data = test_data.na.fill(value=0.0, subset=['column', ...])
test_data = test_data.na.fill(value='0', subset=['column', ...])

featureList = []
columns = test_data.columns
featureList.extend(columns[21:-1])

import pyspark.ml.feature as ft

labelIndexer = ft.StringIndexer(inputCol="label", outputCol="indexedLabel", handleInvalid='skip')
test_data = labelIndexer.fit(test_data).transform(test_data)
e_localday_indexed = ft.StringIndexer(inputCol="e_localday", outputCol="indexedE_localday", handleInvalid='skip')
test_data = e_localday_indexed.fit(test_data).transform(test_data)
is_organic_indexed = ft.StringIndexer(inputCol="is_organic", outputCol="indexedIs_organic", handleInvalid='skip')
test_data = is_organic_indexed.fit(test_data).transform(test_data)

featuresCreator = ft.VectorAssembler(inputCols=featureList, outputCol='features')
test_data = featuresCreator.transform(test_data)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator().setLabelCol('indexedLabel')

# Load the trained model (the path must point to where the model was saved)
from pyspark.ml.classification import RandomForestClassificationModel
rf = RandomForestClassificationModel.load("/user/ai_data/zhangxin/model/rfModel")

# AUC (areaUnderROC, the evaluator's default metric)
print(evaluator.evaluate(rf.transform(test_data)))
# AP (areaUnderPR)
print(evaluator.evaluate(rf.transform(test_data), {evaluator.metricName: "areaUnderPR"}))

# Get each feature's importance from the model and print them sorted by
# weight in descending order. featureImportances is a vector indexed by the
# feature's position in the assembled vector; VectorAssembler stores the
# name-to-index mapping in the column metadata.
importances = rf.featureImportances.toArray()
attrs = test_data.schema["features"].metadata["ml_attr"]["attrs"]

# Map every feature name to its importance. Besides 'numeric', the metadata
# may also contain 'binary' or 'nominal' groups depending on the inputs.
kv = {}
for group in attrs.values():
    for attr in group:
        kv[attr["name"]] = importances[attr["idx"]]

print(sorted(kv.items(), key=lambda el: el[1], reverse=True))


 
