SparkML之分类(二)logistics回归

前面已经陈述过logistic的理论的了,在此就不赘述了(http://blog.csdn.net/legotime/article/details/51312393)

Logistic 函数(分类时有个名字叫Sigmoid函数)如下:


logistic函数早期是用于人口预测的。但随着人们对其的应用扩展,开始慢慢应用于分类问题,而且是神经网络中一个

经常使用的过渡函数,图1是将logistic函数


图1

它的原理是:在分二类的情况下,当h的计算值大于0.5时,让h等于1,h的计算值小于等于于0.5时,让h等于0。這样

对于输入一个X

SparkML之分类(二)logistics回归_第1张图片

那么结果就分类  0 或 1,所以达到了分类的效果。当然logistic函数可以应用于多个类的情况。

-----------------------------------------------------------------------------------------------------

spark Logistic模型训练图

SparkML之分类(二)logistics回归_第2张图片


-------------------------------------------------------------------------------------------------------

源码分析

package org.apache.spark.mllib.classification

import org.apache.spark.SparkContext
import org.apache.spark.annotation.Since
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.classification.impl.GLMClassificationModel
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.mllib.linalg.BLAS.dot
import org.apache.spark.mllib.optimization._
import org.apache.spark.mllib.pmml.PMMLExportable
import org.apache.spark.mllib.regression._
import org.apache.spark.mllib.util.{DataValidators, Loader, Saveable}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.storage.StorageLevel

/**
 * 利用(Multinomial/Binary)logistic回归来训练分类模型
 *
 * @param weights 特征的权重
 * @param intercept 偏置(二元回归的时候是一个值,在多元回归的时候会和特征融合在一起.)
 * @param numFeatures 特征的维度
 * @param numClasses 多元回归分析中的类分类问题的可能结果的个数。默认情况下,它是二元Logistic回归,numclasses将被设置为2。
 */
@Since("0.8.0")
class LogisticRegressionModel @Since("1.3.0") (
    @Since("1.0.0") override val weights: Vector,
    @Since("1.0.0") override val intercept: Double,
    @Since("1.3.0") val numFeatures: Int,
    @Since("1.3.0") val numClasses: Int)
  extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable
  with Saveable with PMMLExportable {

  if (numClasses == 2) {
    require(weights.size == numFeatures,
      s"LogisticRegressionModel with numClasses = 2 was given non-matching values:" +
      s" numFeatures = $numFeatures, but weights.size = ${weights.size}")
  } else {
    val weightsSizeWithoutIntercept = (numClasses - 1) * numFeatures
    val weightsSizeWithIntercept = (numClasses - 1) * (numFeatures + 1)
    require(weights.size == weightsSizeWithoutIntercept || weights.size == weightsSizeWithIntercept,
      s"LogisticRegressionModel.load with numClasses = $numClasses and numFeatures = $numFeatures" +
      s" expected weights of length $weightsSizeWithoutIntercept (without intercept)" +
      s" or $weightsSizeWithIntercept (with intercept)," +
      s" but was given weights of length ${weights.size}")
  }

  private val dataWithBiasSize: Int = weights.size / (numClasses - 1)

  private val weightsArray: Array[Double] = weights match {
    case dv: DenseVector => dv.values
    case _ =>
      throw new IllegalArgumentException(
        s"weights only supports dense vector but got type ${weights.getClass}.")
  }

  /**
   * 构建一个LogisticRegressionModel,权重和偏置都是二维的。
   */
  @Since("1.0.0")
  def this(weights: Vector, intercept: Double) = this(weights, intercept, weights.size, 2)

  private var threshold: Option[Double] = Some(0.5)

  /**
   * 设置 阈值,对于二分类情况下。這个阈值用于当y大于它时,就在 positive,当小于它时,就分来negative
   * 默认情况之恶个這个阈值设置为 0.5
   */
  @Since("1.0.0")
  def setThreshold(threshold: Double): this.type = {
    this.threshold = Some(threshold)
    this
  }

  /**
   *返回的阈值(如果有的话),用于将原始预测分数转换为0 / 1预测。它仅用于二进制分类。
   */
  @Since("1.3.0")
  def getThreshold: Option[Double] = threshold

  /**
   * 清除阈值,以便“预测”将输出预测值。
   * 它仅用于二进制分类
   */
  @Since("1.0.0")
  def clearThreshold(): this.type = {
    threshold = None
    this
  }

  override protected def predictPoint(
      dataMatrix: Vector,
      weightMatrix: Vector,
      intercept: Double) = {
    require(dataMatrix.size == numFeatures)

    // 如果 dataMatrix和 weightMatrix 具有相同的维度, 那么它是二分类的logistic回归
    if (numClasses == 2) {
      val margin = dot(weightMatrix, dataMatrix) + intercept
      val score = 1.0 / (1.0 + math.exp(-margin))
      threshold match {
        case Some(t) => if (score > t) 1.0 else 0.0
        case None => score
      }
    } else {
      /**
       * Compute and find the one with maximum margins. If the maxMargin is negative, then the
       * prediction result will be the first class.
       *
       * PS, if you want to compute the probabilities for each outcome instead of the outcome
       * with maximum probability, remember to subtract the maxMargin from margins if maxMargin
       * is positive to prevent overflow.
       */
      var bestClass = 0
      var maxMargin = 0.0
      val withBias = dataMatrix.size + 1 == dataWithBiasSize
      (0 until numClasses - 1).foreach { i =>
        var margin = 0.0
        dataMatrix.foreachActive { (index, value) =>
          if (value != 0.0) margin += value * weightsArray((i * dataWithBiasSize) + index)
        }
        // Intercept is required to be added into margin.
        if (withBias) {
          margin += weightsArray((i * dataWithBiasSize) + dataMatrix.size)
        }
        if (margin > maxMargin) {
          maxMargin = margin
          bestClass = i + 1
        }
      }
      bestClass.toDouble
    }
  }

  @Since("1.3.0")
  override def save(sc: SparkContext, path: String): Unit = {
    GLMClassificationModel.SaveLoadV1_0.save(sc, path, this.getClass.getName,
      numFeatures, numClasses, weights, intercept, threshold)
  }

  override protected def formatVersion: String = "1.0"

  override def toString: String = {
    s"${super.toString}, numClasses = ${numClasses}, threshold = ${threshold.getOrElse("None")}"
  }
}

@Since("1.3.0")
object LogisticRegressionModel extends Loader[LogisticRegressionModel] {

  @Since("1.3.0")
  override def load(sc: SparkContext, path: String): LogisticRegressionModel = {
    val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path)
    // Hard-code class name string in case it changes in the future
    val classNameV1_0 = "org.apache.spark.mllib.classification.LogisticRegressionModel"
    (loadedClassName, version) match {
      case (className, "1.0") if className == classNameV1_0 =>
        val (numFeatures, numClasses) = ClassificationModel.getNumFeaturesClasses(metadata)
        val data = GLMClassificationModel.SaveLoadV1_0.loadData(sc, path, classNameV1_0)
        // numFeatures, numClasses, weights are checked in model initialization
        val model =
          new LogisticRegressionModel(data.weights, data.intercept, numFeatures, numClasses)
        data.threshold match {
          case Some(t) => model.setThreshold(t)
          case None => model.clearThreshold()
        }
        model
      case _ => throw new Exception(
        s"LogisticRegressionModel.load did not recognize model with (className, format version):" +
        s"($loadedClassName, $version).  Supported:\n" +
        s"  ($classNameV1_0, 1.0)")
    }
  }
}

/**
 * 用随机梯度下降算法来训练二分类的logitic回归的分类模型
 * 默认情况下用L2正则化,它可以通过[[LogisticRegressionWithSGD.optimizer]].来改变
 * note:二分类以上的K分类的logistic回归分类 ,Lables 可以为 {0, 1, ..., k - 1}
 */
@Since("0.8.0")
@deprecated("Use ml.classification.LogisticRegression or LogisticRegressionWithLBFGS", "2.0.0")
class LogisticRegressionWithSGD private[mllib] (
    private var stepSize: Double,
    private var numIterations: Int,
    private var regParam: Double,
    private var miniBatchFraction: Double)
  extends GeneralizedLinearAlgorithm[LogisticRegressionModel] with Serializable {

  private val gradient = new LogisticGradient()
  private val updater = new SquaredL2Updater()
  @Since("0.8.0")
  override val optimizer = new GradientDescent(gradient, updater)
    .setStepSize(stepSize)
    .setNumIterations(numIterations)
    .setRegParam(regParam)
    .setMiniBatchFraction(miniBatchFraction)
  override protected val validators = List(DataValidators.binaryLabelValidator)

  /**
   * 构建一个默认情况下的逻辑回归,默认参数是{stepSize: 1.0,numIterations: 100, regParm: 0.01, miniBatchFraction: 1.0}.
   */
  @Since("0.8.0")
  def this() = this(1.0, 100, 0.01, 1.0)

  override protected[mllib] def createModel(weights: Vector, intercept: Double) = {
    new LogisticRegressionModel(weights, intercept)
  }
}

/**
 * 最先用的方法是随机梯度下降
 * NOTE: Logistic 回归的label应该是 {0, 1}
 */
@Since("0.8.0")
@deprecated("Use ml.classification.LogisticRegression or LogisticRegressionWithLBFGS", "2.0.0")
object LogisticRegressionWithSGD {
  // NOTE(shivaram): We use multiple train methods instead of default arguments to support
  // Java programs.

  /**
   * 给定一个 pair RDD(label, features) 训练一个logistic回归模型。我们通过特定步长来固定迭代次数。
   * 每次迭代用miniBatchFraction来计算梯度。
   * NOTE: Labels used in Logistic Regression should be {0, 1}
   
   *
   * @param input RDD of (label, array of features) pairs.
   * @param numIterations Number of iterations of gradient descent to run.(迭代次数)
   * @param stepSize Step size to be used for each iteration of gradient descent.(步长)
   * @param miniBatchFraction Fraction of data to be used per iteration.(一次用于迭代的数据量)
   * @param initialWeights Initial set of weights to be used. Array should be equal in size to
   *        the number of features in the data.
   */
  @Since("1.0.0")
  def train(
      input: RDD[LabeledPoint],
      numIterations: Int,
      stepSize: Double,
      miniBatchFraction: Double,
      initialWeights: Vector): LogisticRegressionModel = {
    new LogisticRegressionWithSGD(stepSize, numIterations, 0.0, miniBatchFraction)
      .run(input, initialWeights)
  }

  /**
  /**
   * 给定一个 pair RDD(label, features) 训练一个logistic回归模型。我们通过特定步长来固定迭代次数。
   * 每次迭代用miniBatchFraction来计算梯度。
   * NOTE: Labels used in Logistic Regression should be {0, 1}
   *
   * @param input RDD of (label, array of features) pairs.
   * @param numIterations Number of iterations of gradient descent to run.
   * @param stepSize Step size to be used for each iteration of gradient descent.

   * @param miniBatchFraction Fraction of data to be used per iteration.
   */
  @Since("1.0.0")
  def train(
      input: RDD[LabeledPoint],
      numIterations: Int,
      stepSize: Double,
      miniBatchFraction: Double): LogisticRegressionModel = {
    new LogisticRegressionWithSGD(stepSize, numIterations, 0.0, miniBatchFraction)
      .run(input)
  }

  /**
  /**
   * 给定一个 pair RDD(label, features) 训练一个logistic回归模型。我们通过特定步长来固定迭代次数。
   * 每次迭代用miniBatchFraction来计算梯度。
   * NOTE: Labels used in Logistic Regression should be {0, 1}
   *
   * @param input RDD of (label, array of features) pairs.
   * @param stepSize Step size to be used for each iteration of Gradient Descent.

   * @param numIterations Number of iterations of gradient descent to run.
   * @return a LogisticRegressionModel which has the weights and offset from training.
   */
  @Since("1.0.0")
  def train(
      input: RDD[LabeledPoint],
      numIterations: Int,
      stepSize: Double): LogisticRegressionModel = {
    train(input, numIterations, stepSize, 1.0)
  }

  /**
  /**
   * 给定一个 pair RDD(label, features) 训练一个logistic回归模型。我们通过特定步长来固定迭代次数。
   * 每次迭代用miniBatchFraction来计算梯度。
   * NOTE: Labels used in Logistic Regression should be {0, 1}
   *
   * @param input RDD of (label, array of features) pairs.
   * @param numIterations Number of iterations of gradient descent to run.
   * @return a LogisticRegressionModel which has the weights and offset from training.
   */
  @Since("1.0.0")
  def train(
      input: RDD[LabeledPoint],
      numIterations: Int): LogisticRegressionModel = {
    train(input, numIterations, 1.0, 1.0)
  }
}

/**
 * 用Limited-memory BFGS算法来训练二分类/K分类的logitic回归的分类模型,默认情况下是用L2正则化
 * note:二分类以上的K分类的logistic回归分类 ,Lables 可以为 {0, 1, ..., k - 1}
 * 早期是用 LogisticRegressionWithLBFGS来实现正则化,包括偏置。如果updates是(L1Updater, or SquaredL2Updater) ,
 * 那么它应该是来自 ml.LogisticRegression
 * 否则就是现在的 mllib下的广义线性算法(GeneralizedLinearAlgorithm)来训练,
 */
@Since("1.1.0")
class LogisticRegressionWithLBFGS
  extends GeneralizedLinearAlgorithm[LogisticRegressionModel] with Serializable {

  this.setFeatureScaling(true)

  @Since("1.1.0")
  override val optimizer = new LBFGS(new LogisticGradient, new SquaredL2Updater)

  override protected val validators = List(multiLabelValidator)

  private def multiLabelValidator: RDD[LabeledPoint] => Boolean = { data =>
    if (numOfLinearPredictor > 1) {
      DataValidators.multiLabelValidator(numOfLinearPredictor + 1)(data)
    } else {
      DataValidators.binaryLabelValidator(data)
    }
  }

  /**
   * 在多分类(k)的logistic回归中,设置用于类分类问题的可能结果的数量。默认情况下k = 2
   */
  @Since("1.3.0")
  def setNumClasses(numClasses: Int): this.type = {
    require(numClasses > 1)
    numOfLinearPredictor = numClasses - 1
    if (numClasses > 2) {
      optimizer.setGradient(new LogisticGradient(numClasses))
    }
    this
  }

  override protected def createModel(weights: Vector, intercept: Double) = {
    if (numOfLinearPredictor == 1) {
      new LogisticRegressionModel(weights, intercept)
    } else {
      new LogisticRegressionModel(weights, intercept, numFeatures, numOfLinearPredictor + 1)
    }
  }

  /**
   * Run Logistic Regression with the configured parameters on an input RDD
   * of LabeledPoint entries.
   *
   *
   * 如果在之前声明了更新的方法是 ml包下面的,那么就是,如果不是那么选择的是mllib包下的更新方法
   */
  override def run(input: RDD[LabeledPoint]): LogisticRegressionModel = {
    run(input, generateInitialWeights(input), userSuppliedWeights = false)
  }

  /**
   * Run Logistic Regression with the configured parameters on an input RDD
   * of LabeledPoint entries.
   *
   *
   * 如果在之前声明了更新的方法是 ml包下面的,那么就是,如果不是那么选择的是mllib包下的更新方法
   */
   *note:因为在ml包下没有配置LBFGS更新方法,所以optimizer.setNumCorrections()是无效的
   */
  override def run(input: RDD[LabeledPoint], initialWeights: Vector): LogisticRegressionModel = {
    run(input, initialWeights, userSuppliedWeights = true)
  }

  private def run(input: RDD[LabeledPoint], initialWeights: Vector, userSuppliedWeights: Boolean):
      LogisticRegressionModel = {
    // ml's Logistic regression only supports binary classification currently.
    if (numOfLinearPredictor == 1) {
      def runWithMlLogisitcRegression(elasticNetParam: Double) = {
        // Prepare the ml LogisticRegression based on our settings
        val lr = new org.apache.spark.ml.classification.LogisticRegression()
        lr.setRegParam(optimizer.getRegParam())
        lr.setElasticNetParam(elasticNetParam)
        lr.setStandardization(useFeatureScaling)
        if (userSuppliedWeights) {
          val uid = Identifiable.randomUID("logreg-static")
          lr.setInitialModel(new org.apache.spark.ml.classification.LogisticRegressionModel(
            uid, initialWeights.asML, 1.0))
        }
        lr.setFitIntercept(addIntercept)
        lr.setMaxIter(optimizer.getNumIterations())
        lr.setTol(optimizer.getConvergenceTol())
        // Convert our input into a DataFrame
        val sqlContext = new SQLContext(input.context)
        import sqlContext.implicits._
        val df = input.map(_.asML).toDF()
        // Determine if we should cache the DF
        val handlePersistence = input.getStorageLevel == StorageLevel.NONE
        // Train our model
        val mlLogisticRegresionModel = lr.train(df, handlePersistence)
        // convert the model
        val weights = Vectors.dense(mlLogisticRegresionModel.coefficients.toArray)
        createModel(weights, mlLogisticRegresionModel.intercept)
      }
      optimizer.getUpdater() match {
        case x: SquaredL2Updater => runWithMlLogisitcRegression(0.0)
        case x: L1Updater => runWithMlLogisitcRegression(1.0)
        case _ => super.run(input, initialWeights)
      }
    } else {
      super.run(input, initialWeights)
    }
  }
}

SparkML实验

import org.apache.spark.mllib.classification.{LogisticRegressionModel, LogisticRegressionWithLBFGS}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils


object LinearRegressionWithSGDExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample").setMaster("local")
    val sc = new SparkContext(conf)


    val data = MLUtils.loadLibSVMFile(sc, "C:\\Users\\alienware\\IdeaProjects\\sparkCore\\data\\mllib\\sample_libsvm_data.txt")

    // Split data into training (60%) and test (40%).
    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Run training algorithm to build the model
    val model = new LogisticRegressionWithLBFGS()
      .setNumClasses(2)
      .run(training)


    // Compute raw scores on the test set.
    val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }
    predictionAndLabels.foreach(println)

    // Get evaluation metrics.
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val precision = metrics.precision  println("Precision = " + precision)

    // Save and load model
    model.save(sc, "target/tmp/scalaLogisticRegressionWithLBFGSModel")
    val sameModel = LogisticRegressionModel.load(sc,
      "target/tmp/scalaLogisticRegressionWithLBFGSModel")


    sc.stop()
  }
}
//预测数据和实际数据
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(0.0,0.0)
(0.0,0.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(0.0,0.0)
(0.0,0.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
Precision = 1.0

你可能感兴趣的:(源码,spark机器学习)