DataFrame: Spark ML takes DataFrames from Spark SQL as its dataset abstraction. A DataFrame can hold a variety of data types; for example, a single DataFrame can have different columns storing text, feature vectors, true labels, and predictions.
Transformer: a Transformer is an algorithm that turns one DataFrame into another DataFrame. For example, an ML model is a Transformer that transforms a DataFrame with features into a DataFrame with predictions.
Estimator: an Estimator is an algorithm that is fit on a DataFrame to produce a Transformer. For example, a learning algorithm is an Estimator that trains on a DataFrame and produces a model.
Pipeline: a Pipeline chains multiple Transformers and Estimators together to specify a single ML workflow.
Parameter: all Transformers and Estimators share a common API for specifying parameters (see the type-level sketch after these definitions).
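To make these relationships concrete, here is a small type-level sketch. It only restates the definitions above in terms of the Spark 1.6-era API that this article's examples appear to use, and the value names are illustrative, not part of any example below.
// Sketch: how the core abstractions relate at the type level (Spark 1.6-era ml package).
import org.apache.spark.ml.{Estimator, Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
// An Estimator[M] exposes fit(DataFrame): M, where M is the fitted model.
val lrAsEstimator: Estimator[LogisticRegressionModel] = new LogisticRegression()
// A Pipeline is itself an Estimator; fitting it yields a PipelineModel, and every
// fitted model is a Transformer exposing transform(DataFrame): DataFrame.
val pipelineAsEstimator: Estimator[PipelineModel] = new Pipeline()
The first complete example, which follows, exercises exactly this fit/transform contract with a LogisticRegression Estimator.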
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.sql.Row
// Prepare training data from a list of (label, features) tuples.
val training = sqlContext.createDataFrame(Seq(
  (1.0, Vectors.dense(0.0, 1.1, 0.1)),
  (0.0, Vectors.dense(2.0, 1.0, -1.0)),
  (0.0, Vectors.dense(2.0, 1.3, 1.0)),
  (1.0, Vectors.dense(0.0, 1.2, -0.5))
)).toDF("label", "features")
// Create a LogisticRegression instance. This instance is an Estimator.
val lr = new LogisticRegression()
// Print out the parameters, documentation, and any default values.
println("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
// We may set parameters using setter methods.
lr.setMaxIter(10)
  .setRegParam(0.01)
// Learn a LogisticRegression model. This uses the parameters stored in lr.
val model1 = lr.fit(training)
// Since model1 is a Model (i.e., a Transformer produced by an Estimator),
// we can view the parameters it used during fit().
// This prints the parameter (name: value) pairs, where names are unique IDs for this
// LogisticRegression instance.
println("Model 1 was fit using parameters: " + model1.parent.extractParamMap)
// We may alternatively specify parameters using a ParamMap,
// which supports several methods for specifying parameters.
val paramMap = ParamMap(lr.maxIter -> 20)
  .put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter.
  .put(lr.regParam -> 0.1, lr.threshold -> 0.55) // Specify multiple Params.
// One can also combine ParamMaps.
val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name
val paramMapCombined = paramMap ++ paramMap2
// Now learn a new model using the paramMapCombined parameters.
// paramMapCombined overrides all parameters set earlier via lr.set* methods.
val model2 = lr.fit(training, paramMapCombined)
println("Model 2 was fit using parameters: " + model2.parent.extractParamMap)
// Prepare test data.
val test = sqlContext.createDataFrame(Seq(
  (1.0, Vectors.dense(-1.0, 1.5, 1.3)),
  (0.0, Vectors.dense(3.0, 2.0, -0.1)),
  (1.0, Vectors.dense(0.0, 2.2, -1.5))
)).toDF("label", "features")
// Make predictions on test data using the Transformer.transform() method.
// LogisticRegression.transform will only use the 'features' column.
// Note that model2.transform() outputs a 'myProbability' column instead of the usual
// 'probability' column since we renamed the lr.probabilityCol parameter previously.
model2.transform(test)
.select("features", "label", "myProbability", "prediction")
.collect()
.foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
println(s"($features, $label) -> prob=$prob, prediction=$prediction")
}
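As a small aside (an addition beyond the original walkthrough), the fitted model also exposes its learned parameters, which is handy for a quick sanity check; coefficients and intercept are members of ml.classification.LogisticRegressionModel in Spark 1.6 and later.
// Sketch: inspect the learned parameters of the fitted model.
println(s"Coefficients: ${model2.coefficients}  Intercept: ${model2.intercept}")
The next example moves from a single Estimator to a full Pipeline of stages.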
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.Row
// Prepare training documents from a list of (id, text, label) tuples.
val training = sqlContext.createDataFrame(Seq(
(0L, "a b c d e spark", 1.0),
(1L, "b d", 0.0),
(2L, "spark f g h", 1.0),
(3L, "hadoop mapreduce", 0.0)
)).toDF("id", "text", "label")
// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
val tokenizer = new Tokenizer()
.setInputCol("text")
.setOutputCol("words")
val hashingTF = new HashingTF()
  .setNumFeatures(1000)
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("features")
val lr = new LogisticRegression()
  .setMaxIter(10)
  .setRegParam(0.01)
val pipeline = new Pipeline()
  .setStages(Array(tokenizer, hashingTF, lr))
// Fit the pipeline to training documents.
val model = pipeline.fit(training)
// now we can optionally save the fitted pipeline to disk
model.save("/tmp/spark-logistic-regression-model")
// we can also save this unfit pipeline to disk
pipeline.save("/tmp/unfit-lr-model")
// and load it back in during production
val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model")
// Prepare test documents, which are unlabeled (id, text) tuples.
val test = sqlContext.createDataFrame(Seq(
(4L, "spark i j k"),
(5L, "l m n"),
(6L, "mapreduce spark"),
(7L, "apache hadoop")
)).toDF("id", "text")
// Make predictions on test documents.
model.transform(test)
.select("id", "text", "probability", "prediction")
.collect()
.foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
println(s"($id, $text) --> prob=$prob, prediction=$prediction")
}
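The reloaded PipelineModel behaves exactly like the fitted model it was saved from, so a quick check (not part of the original example) is to run the same test set through sameModel and compare the output with the predictions above.
// Sketch: predictions from the reloaded pipeline should match model.transform(test) above.
sameModel.transform(test)
  .select("id", "text", "probability", "prediction")
  .show()
The next example wraps a whole Pipeline in a CrossValidator to do model selection over a grid of parameters.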
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.Row
// Prepare training data from a list of (id, text, label) tuples.
val training = sqlContext.createDataFrame(Seq(
(0L, "a b c d e spark", 1.0),
(1L, "b d", 0.0),
(2L, "spark f g h", 1.0),
(3L, "hadoop mapreduce", 0.0),
(4L, "b spark who", 1.0),
(5L, "g d a y", 0.0),
(6L, "spark fly", 1.0),
(7L, "was mapreduce", 0.0),
(8L, "e spark program", 1.0),
(9L, "a e c l", 0.0),
(10L, "spark compile", 1.0),
(11L, "hadoop software", 0.0)
)).toDF("id", "text", "label")
// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
val tokenizer = new Tokenizer()
.setInputCol("text")
.setOutputCol("words")
val hashingTF = new HashingTF()
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("features")
val lr = new LogisticRegression()
  .setMaxIter(10)
val pipeline = new Pipeline()
  .setStages(Array(tokenizer, hashingTF, lr))
// We use a ParamGridBuilder to construct a grid of parameters to search over.
// With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
// this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
val paramGrid = new ParamGridBuilder()
  .addGrid(hashingTF.numFeatures, Array(10, 100, 1000))
  .addGrid(lr.regParam, Array(0.1, 0.01))
  .build()
// We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
// This will allow us to jointly choose parameters for all Pipeline stages.
// A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
// Note that the evaluator here is a BinaryClassificationEvaluator and its default metric
// is areaUnderROC.
val cv = new CrossValidator()
  .setEstimator(pipeline)
  .setEvaluator(new BinaryClassificationEvaluator)
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(2) // Use 3+ in practice
// Run cross-validation, and choose the best set of parameters.
val cvModel = cv.fit(training)
// Prepare test documents, which are unlabeled (id, text) tuples.
val test = sqlContext.createDataFrame(Seq(
(4L, "spark i j k"),
(5L, "l m n"),
(6L, "mapreduce spark"),
(7L, "apache hadoop")
)).toDF("id", "text")
// Make predictions on test documents. cvModel uses the best model found (lrModel).
cvModel.transform(test)
.select("id", "text", "probability", "prediction")
.collect()
.foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
println(s"($id, $text) --> prob=$prob, prediction=$prediction")
}
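To see which grid point CrossValidator actually selected (the original guide does not show this), cvModel.bestModel can be unwrapped; the casts below assume the three-stage pipeline defined above, and avgMetrics holds the cross-validated metric (areaUnderROC here) for each of the 6 grid points.
// Sketch: inspect the winning parameter combination chosen by CrossValidator.
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.classification.LogisticRegressionModel
val bestPipeline = cvModel.bestModel.asInstanceOf[PipelineModel]
val bestNumFeatures = bestPipeline.stages(1).asInstanceOf[HashingTF].getNumFeatures
val bestRegParam = bestPipeline.stages(2).asInstanceOf[LogisticRegressionModel].getRegParam
println(s"Best numFeatures = $bestNumFeatures, best regParam = $bestRegParam")
println("Average metric per grid point: " + cvModel.avgMetrics.mkString(", "))
The final example uses TrainValidationSplit, a cheaper alternative to CrossValidator that evaluates each parameter combination on a single train/validation split instead of k folds.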
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
// Prepare training and test data.
val data = sqlContext.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt")
val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)
val lr = new LinearRegression()
// We use a ParamGridBuilder to construct a grid of parameters to search over.
// TrainValidationSplit will try all combinations of values and determine best model using
// the evaluator.
val paramGrid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.1, 0.01))
  .addGrid(lr.fitIntercept)
  .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
  .build()
// In this case the estimator is simply the linear regression.
// A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(lr)
  .setEvaluator(new RegressionEvaluator)
  .setEstimatorParamMaps(paramGrid)
  // 80% of the data will be used for training and the remaining 20% for validation.
  .setTrainRatio(0.8)
// Run train validation split, and choose the best set of parameters.
val model = trainValidationSplit.fit(training)
// Make predictions on test data. model is the model with combination of parameters
// that performed best.
model.transform(test)
.select("features", "label", "prediction")
.show()
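To attach a single number to the held-out performance (an addition beyond the original snippet), the same RegressionEvaluator can score the test predictions; by default it uses the "label" and "prediction" columns and reports RMSE.
// Sketch: evaluate the test-set predictions (RegressionEvaluator defaults to the rmse metric).
val evaluator = new RegressionEvaluator()
val rmse = evaluator.evaluate(model.transform(test))
println(s"Test RMSE = $rmse")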