之前的文章(pyspark lightGBM1和pyspark lightGBM2)介绍了pyspark下lightGBM算法的实现,本文将重点介绍如何保存训练好的模型,以及如何重新加载该模型进行预测,直接上代码:
# Train a LightGBM classifier on a small toy DataFrame, evaluate it on the
# training data, and save the trained model in LightGBM's native format.
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession

# Configure Spark and create the SparkSession object.
spark = SparkSession.builder.master('yarn').appName('StringIndexerDemo').getOrCreate()

# Create a simple DataFrame: four distinct rows followed by 47 repetitions of
# the (0, "a", "p"), (1, "b", "u") pair (98 rows in total, same order as the
# original hand-written literal).
rows = [
    (0, "a", "s"), (1, "b", "o"),
    (0, "a", "g"), (1, "b", "l"),
] + [(0, "a", "p"), (1, "b", "u")] * 47
df = spark.createDataFrame(rows, ["id", "category", "name"])

# Index the two string columns: "category" becomes the label column and
# "name" becomes a numeric feature.
indexer1 = StringIndexer(inputCol='category', outputCol='categoryIndex')
indexer2 = StringIndexer(inputCol='name', outputCol='nameIndex')
pipeline = Pipeline(stages=[indexer1, indexer2])
model = pipeline.fit(df)
df = model.transform(df)

# Assemble the feature columns into the single vector column used for training.
assembler = VectorAssembler(
    inputCols=["id", "nameIndex"],
    outputCol="features",
)
df = assembler.transform(df)

# Generate classifier.
# NOTE(review): this import is deliberately left after the SparkSession is
# created, matching the original statement order -- confirm whether your
# deployment needs that ordering before hoisting it to the top of the file.
from mmlspark import LightGBMClassifier

classifier = LightGBMClassifier(
    featuresCol="features",
    labelCol="categoryIndex",
    learningRate=0.3,
    numIterations=10,
    numLeaves=100,
)
glbModel = classifier.fit(df)
pred = glbModel.transform(df)
pred.show()

# metricName must be set explicitly: the evaluator's default metric is "f1",
# while the value below is reported as accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="categoryIndex",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(pred)
print("Accuracy = %g" % accuracy)

# Persist the trained booster in LightGBM's native text format.
glbModel.saveNativeModel("/home/shenzhou/test_frame_model.txt", overwrite=True)
执行结果:
+---+--------+----+-------------+---------+---------+--------------------+--------------------+----------+
| id|category|name|categoryIndex|nameIndex| features| rawPrediction| probability|prediction|
+---+--------+----+-------------+---------+---------+--------------------+--------------------+----------+
| 0| a| s| 0.0| 2.0|[0.0,2.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| o| 1.0| 5.0|[1.0,5.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| g| 0.0| 3.0|[0.0,3.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| l| 1.0| 4.0|[1.0,4.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
+---+--------+----+-------------+---------+---------+--------------------+--------------------+----------+
Accuracy = 1
这里glbModel的类型为mmlspark.LightGBMClassifier.LightGBMClassificationModel,调用其成员函数saveNativeModel即可保存模型,保存路径需为HDFS下可访问的文件夹地址。
调用模型部分代码:
# Rebuild the toy DataFrame, load a previously saved native LightGBM model,
# and evaluate its predictions.
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession

# Configure Spark and create the SparkSession object.
spark = SparkSession.builder.master('yarn').appName('StringIndexerDemo').getOrCreate()

# Create a simple DataFrame: four distinct rows followed by 47 repetitions of
# the (0, "a", "p"), (1, "b", "u") pair (98 rows in total, same order as the
# original hand-written literal).
rows = [
    (0, "a", "s"), (1, "b", "o"),
    (0, "a", "g"), (1, "b", "l"),
] + [(0, "a", "p"), (1, "b", "u")] * 47
df = spark.createDataFrame(rows, ["id", "category", "name"])

# Index the two string columns: "category" becomes the label column and
# "name" becomes a numeric feature.
indexer1 = StringIndexer(inputCol='category', outputCol='categoryIndex')
indexer2 = StringIndexer(inputCol='name', outputCol='nameIndex')
pipeline = Pipeline(stages=[indexer1, indexer2])
model = pipeline.fit(df)
df = model.transform(df)

# Assemble the feature columns into the single vector column the model reads.
assembler = VectorAssembler(
    inputCols=["id", "nameIndex"],
    outputCol="features",
)
df = assembler.transform(df)

# NOTE(review): this import is deliberately left after the SparkSession is
# created, matching the original statement order -- confirm whether your
# deployment needs that ordering before hoisting it to the top of the file.
from mmlspark.LightGBMClassifier import LightGBMClassificationModel

# loadNativeModelFromFile is called on the class itself and returns the model.
# The path must be the location the model was saved to; the training script
# saves to "/home/shenzhou/test_frame_model.txt", so the same path is used here.
lgbModel = LightGBMClassificationModel.loadNativeModelFromFile(
    "/home/shenzhou/test_frame_model.txt"
)
pred = lgbModel.transform(df)
pred.show()

# metricName must be set explicitly: the evaluator's default metric is "f1",
# while the value below is reported as accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="categoryIndex",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(pred)
print("Accuracy = %g" % accuracy)
打印结果:
+---+--------+----+-------------+---------+---------+--------------------+--------------------+----------+
| id|category|name|categoryIndex|nameIndex| features| rawPrediction| probability|prediction|
+---+--------+----+-------------+---------+---------+--------------------+--------------------+----------+
| 0| a| s| 0.0| 2.0|[0.0,2.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| o| 1.0| 5.0|[1.0,5.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| g| 0.0| 3.0|[0.0,3.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| l| 1.0| 4.0|[1.0,4.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
| 0| a| p| 0.0| 1.0|[0.0,1.0]|[3.78240462265661...|[0.97773895925226...| 0.0|
| 1| b| u| 1.0| 0.0|[1.0,0.0]|[-3.7824046226566...|[0.02226104074773...| 1.0|
+---+--------+----+-------------+---------+---------+--------------------+--------------------+----------+
Accuracy = 1
这里调用loadNativeModelFromFile加载模型。注意它是静态函数,因此需要直接通过类LightGBMClassificationModel来调用,其返回值即为加载好的模型对象。