At work we usually train models with sklearn rather than with pyspark, so scoring a sklearn-trained model inside a pyspark job is a common pain point. Once you are comfortable with pyspark, the problem turns out to be easy to solve by wrapping the model in a UDF. Below I walk through a sklearn baseline, two ways of calling the sklearn model from pyspark (a plain UDF versus a broadcast variable), and then, for comparison, training and predicting directly with the algorithms in the pyspark.ml module. I plan to keep optimizing this scoring setup over time.
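The core pattern in a nutshell, as a minimal sketch (the names clf and sdf are illustrative: a fitted sklearn classifier and a Spark DataFrame with numeric columns 'a', 'b', 'c'):

import pyspark.sql.functions as F
from pyspark.sql.types import FloatType

def score(a, b, c):
    # clf is captured in the closure and shipped to the executors with the UDF
    return float(clf.predict_proba([[a, b, c]])[0, 1])

sdf = sdf.withColumn('score', F.udf(score, FloatType())(F.col('a'), F.col('b'), F.col('c')))

The walkthrough below builds this up step by step, including the broadcast optimization.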
import joblib
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import time
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType
from pyspark.ml.feature import VectorAssembler
import pyspark.sql.functions as F
from contextlib import contextmanager
# Timing helper
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{}: {:.0f}s elapsed".format(title, time.time() - t0))
# Build a toy dataset
def generate_model_data():
    data, target = make_blobs(
        n_samples=50000, n_features=3, centers=2, random_state=123, cluster_std=[0.6, 0.7])
    return pd.DataFrame(data, columns=['a', 'b', 'c']), pd.DataFrame(target, columns=['target'])
X, Y = generate_model_data()
# 70/30 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, Y.values.ravel(), test_size=0.3, random_state=123)
# Build the model
clf = LogisticRegression(random_state=123)
# Train the model
clf.fit(X_train, y_train)
# Persist the model
joblib.dump(clf, 'lr_demo.pkl')
'''1'''
# Baseline: predict with sklearn directly and evaluate with AUC
with timer('sklearn baseline prediction time'):
    y_predprob = clf.predict_proba(X_test)[:, 1]
print("AUC Score (test): {:.2%}".format(roc_auc_score(y_test, y_predprob)))
# Regenerate a larger dataset for the Spark scoring test
def generate_model_data():
    data, target = make_blobs(
        n_samples=100000, n_features=3, centers=2, random_state=123, cluster_std=[0.6, 0.7])
    return pd.DataFrame(data, columns=['a', 'b', 'c']), pd.DataFrame(target, columns=['target'])
X, Y = generate_model_data()
# Start the Spark session
spark = SparkSession.builder.appName('pyspark_ml_test').getOrCreate()
# Convert the pandas DataFrame to a Spark DataFrame
spark_df = spark.createDataFrame(X)
df_assembler = VectorAssembler(inputCols=['a', 'b', 'c'], outputCol="features")  # assemble the feature columns into one vector
spark_df_feature = df_assembler.transform(spark_df)
spark_df_feature.printSchema()
spark_df_feature.show(1, truncate=False)
'''2''' # 每次读取模型
model_base= joblib.load('lr_demo.pkl')
def predictor(features):
y_predprob = model_base.predict_proba([features])[:, 1]
return float(y_predprob[0])
udf_predictor = F.udf(predictor,FloatType())
with timer('原始预测处理时间'):
spark_df_feature = spark_df_feature.withColumn('prediction',udf_predictor(F.col('features')))
spark_df_feature.show()
'''3'''  # Broadcast the model to the Spark executors once, then predict
sc = SparkContext.getOrCreate()
model_sc = joblib.load('lr_demo.pkl')
model_sc = sc.broadcast(model_sc)
def predictor(features):
    # read the model from the broadcast variable instead of the task closure
    y_predprob = model_sc.value.predict_proba([features.toArray()])[:, 1]
    return float(y_predprob[0])
udf_predictor = F.udf(predictor, FloatType())
with timer('broadcast UDF prediction time'):
    spark_df_feature = spark_df_feature.withColumn('prediction', udf_predictor(F.col('features')))
    spark_df_feature.count()
spark_df_feature.show()
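Both UDFs above score one row at a time, which pays Python invocation overhead per record. If the cluster runs Spark 3.x with pyarrow installed (an assumption; the original does not state versions), a vectorized pandas_udf can score a whole Arrow batch in a single sklearn call. Note that it consumes the raw columns directly rather than the assembled features vector, since pandas_udf does not accept VectorUDT input:

from pyspark.sql.functions import pandas_udf

@pandas_udf(FloatType())
def udf_predictor_batch(a: pd.Series, b: pd.Series, c: pd.Series) -> pd.Series:
    X = pd.concat([a, b, c], axis=1).values         # (batch_size, 3) feature matrix
    proba = model_sc.value.predict_proba(X)[:, 1]   # one sklearn call per Arrow batch
    return pd.Series(proba).astype('float32')

with timer('pandas_udf batch prediction time'):
    spark_df_feature = spark_df_feature.withColumn(
        'prediction_batch', udf_predictor_batch(F.col('a'), F.col('b'), F.col('c')))
    spark_df_feature.count()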
The second approach: skip sklearn entirely and train with pyspark.ml. First a RandomForestClassifier, as a self-contained script:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler  # feature assembly
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
spark = SparkSession.builder.appName('random_forest').getOrCreate()
# Build the data
from sklearn.datasets import make_blobs
def generate_model_data():
    data, target = make_blobs(
        n_samples=50000, n_features=3, centers=2, random_state=123, cluster_std=[0.6, 0.7])
    df = pd.DataFrame(data, columns=['a', 'b', 'c'])
    df['target'] = target
    return df
df = generate_model_data()
df = spark.createDataFrame(df)
# Inspect the data
print((df.count(), len(df.columns)))
df.printSchema()
df.describe().show()
df.groupBy('target').count().show()
# Feature engineering
df_assembler = VectorAssembler(inputCols=['a', 'b', 'c'], outputCol="features")  # assemble the feature columns into one vector
df = df_assembler.transform(df)
df.printSchema()
df.show(5, truncate=False)
# Train/test split
model_df = df.select(['features', 'target'])
train_df, test_df = model_df.randomSplit([0.75, 0.25])
train_df.count()
train_df.groupBy('target').count().show()
test_df.groupBy('target').count().show()
# Build the model
rf_classifier = RandomForestClassifier(labelCol='target', numTrees=50).fit(train_df)
rf_predictions = rf_classifier.transform(test_df)
rf_predictions.show()
# Inspect the result
print(rf_classifier.featureImportances)  # per-feature importance weights
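featureImportances comes back as a Vector indexed by assembly order, so mapping it onto the input column names makes it readable; a small illustrative helper:

for name, weight in zip(['a', 'b', 'c'], rf_classifier.featureImportances.toArray()):
    print('{}: {:.4f}'.format(name, weight))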
# Model performance
rf_predictions.groupBy('prediction').count().show()
rf_predictions.select(['probability', 'target', 'prediction']).show(10, False)
# Multiclass evaluator -- accuracy
rf_accuracy = MulticlassClassificationEvaluator(labelCol='target', metricName='accuracy').evaluate(rf_predictions)
print('The accuracy of RF on test data is {0:.2%}'.format(rf_accuracy))
# Multiclass evaluator -- weighted precision
rf_precision = MulticlassClassificationEvaluator(labelCol='target', metricName='weightedPrecision').evaluate(rf_predictions)
print('The precision rate on test data is {0:.2%}'.format(rf_precision))
# AUC (uses the rawPrediction column by default)
rf_auc = BinaryClassificationEvaluator(labelCol='target').evaluate(rf_predictions)
print('The AUC of RF on test data is {0:.2%}'.format(rf_auc))
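BinaryClassificationEvaluator reads the rawPrediction column by default; it accepts the probability column just as well if you point rawPredictionCol at it, e.g.:

rf_auc_prob = BinaryClassificationEvaluator(
    labelCol='target', rawPredictionCol='probability').evaluate(rf_predictions)
print('The AUC from the probability column is {0:.2%}'.format(rf_auc_prob))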
# Save the model
rf_classifier.save("./spark_RF_model")
# Reload and reuse the model
rf = RandomForestClassificationModel.load("./spark_RF_model")
model_predictions = rf.transform(test_df)
model_predictions.show()
The same pipeline with a GBTClassifier, again as a self-contained script:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler  # feature assembly
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import GBTClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
spark = SparkSession.builder.appName('gbdt').getOrCreate()
# Build the data
from sklearn.datasets import make_blobs
def generate_model_data():
    data, target = make_blobs(
        n_samples=50000, n_features=3, centers=2, random_state=123, cluster_std=[0.6, 0.7])
    df = pd.DataFrame(data, columns=['a', 'b', 'c'])
    df['target'] = target
    return df
df = generate_model_data()
df = spark.createDataFrame(df)
# Inspect the data
print((df.count(), len(df.columns)))
df.printSchema()
df.describe().show()
df.groupBy('target').count().show()
# Feature engineering
df_assembler = VectorAssembler(inputCols=['a', 'b', 'c'], outputCol="features")  # assemble the feature columns into one vector
df = df_assembler.transform(df)
df.printSchema()
df.show(5, truncate=False)
# Train/test split
model_df = df.select(['features', 'target'])
train_df, test_df = model_df.randomSplit([0.75, 0.25])
train_df.count()
train_df.groupBy('target').count().show()
test_df.groupBy('target').count().show()
# Build the model
gbdt_classifier = GBTClassifier(labelCol='target').fit(train_df)
gbdt_predictions = gbdt_classifier.transform(test_df)
gbdt_predictions.show()
# Inspect the result
print(gbdt_classifier.featureImportances)  # per-feature importance weights
# Model performance
gbdt_predictions.groupBy('prediction').count().show()
gbdt_predictions.select(['probability', 'target', 'prediction']).show(10, False)
# Multiclass evaluator -- accuracy
gbdt_accuracy = MulticlassClassificationEvaluator(labelCol='target', metricName='accuracy').evaluate(gbdt_predictions)
print('The accuracy of GBDT on test data is {0:.2%}'.format(gbdt_accuracy))
# Multiclass evaluator -- weighted precision
gbdt_precision = MulticlassClassificationEvaluator(labelCol='target', metricName='weightedPrecision').evaluate(gbdt_predictions)
print('The precision rate on test data is {0:.2%}'.format(gbdt_precision))
# AUC (uses the rawPrediction column by default)
gbdt_auc = BinaryClassificationEvaluator(labelCol='target').evaluate(gbdt_predictions)
print('The AUC of GBDT on test data is {0:.2%}'.format(gbdt_auc))
# Save the model
gbdt_classifier.save("./spark_gbdt_model")
# Reload and reuse the model
gbdt = GBTClassificationModel.load("./spark_gbdt_model")
model_predictions = gbdt.transform(test_df)
model_predictions.show()
Reference: Machine Learning with Spark