Spark MLlib RandomForest (Random Forest) Modeling and Prediction

The goal is to detect anomalous users, but we had no samples already labeled as normal or anomalous. So after cleaning the raw data, doing feature engineering, and reducing dimensionality, we clustered it with Mahout/R and used the clusters to assign labels. The original plan was to continue with Mahout/R for classification, but Mahout was too slow, and after implementing KNN and RandomForest in R we found they could not be applied to the real project, so we switched to MLlib.

Below is the R code for extracting the positive and negative samples:
#1. Label the anomalous cluster found by k-means as 1 (positive samples) and the other clusters as 0 (negative samples)
#1.1 Normal clusters are labeled 0, used as negative samples
#%in% usage: a %in% table tests whether the values of a are contained in table, returning TRUE if so and FALSE otherwise
levels(t_knn$cluster)[levels(t_knn$cluster) %in% c("1","2","11","4","5","6","7","8","9","10","12","13","14","15","16")] <- "0"

#1.2 The anomalous cluster is labeled 1, used as positive samples
levels(t_knn$cluster)[levels(t_knn$cluster) %in% c("3")] <- "1"
#1.3 Extract the positive and negative samples
negative_samples<-t_knn[which(t_knn$cluster=="0"),]
positive_samples<-t_knn[which(t_knn$cluster=="1"),]
#levels(negative_samples$cluster)

#################################################################################
#2. Randomly draw 250 samples from each of the positive and negative sets to form a 500-sample set v3
set.seed(1)
#sample(x, size, replace = FALSE, prob = NULL) samples without replacement by default
ns <- negative_samples[sample(1:nrow(negative_samples),250),]
ps <- positive_samples[sample(1:nrow(positive_samples),250),]
v3 <- rbind(ps,ns)
#levels(v3$cluster)
#Drop the first column, which is the user id
vv3<-v3[,2:23]
#Write to a local file
#write.table(vv3, file ="v3.csv",sep = ',',row.names = F, col.names = F, quote = F)
Then upload v3.csv to HDFS, for example with: hdfs dfs -put v3.csv /test/mllib/ (matching the path the Scala code reads below).

Below is the Spark MLlib RandomForest implementation:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.rdd.RDD

/**
  * Created by drguo on 2016/11/18.
  */
object RunRF {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("rf")
    val sc = new SparkContext(sparkConf)
    //Read the data from HDFS
    val rawData = sc.textFile("hdfs://192.168.1.64:8020/test/mllib/v3.csv")
    val data = rawData.map{ line =>
      val values = line.split(",").map(_.toDouble)
      //init returns every element except the last; these are the features
      //Vectors.dense builds a dense vector
      val feature = Vectors.dense(values.init)
      val label = values.last
      LabeledPoint(label, feature)
    }
    //Split into training, cross-validation (CV), and test sets: 80%, 10%, and 10%
    //The 10% CV set is used to pick the best parameters for the model trained on the training set
    //The test set evaluates the model built with the best parameters chosen on the CV set
    val Array(trainData, cvData, testData) = data.randomSplit(Array(0.8, 0.1, 0.1))
    trainData.cache()
    cvData.cache()
    testData.cache()

    //Train the random forest: 2 classes, no categorical features (empty Map), 20 trees,
    //"auto" feature-subset strategy, Gini impurity, maximum depth 4, and 32 bins
    val model = RandomForest.trainClassifier(trainData, 2, Map[Int, Int](), 20, "auto", "gini", 4, 32)
    val metrics = getMetrics(model, cvData)
    println("-----------------------------------------confusionMatrix-----------------------------------------------------")
    //Confusion matrix and overall precision of the model
    println(metrics.confusionMatrix)
    println("---------------------------------------------precision-------------------------------------------------")
    println(metrics.precision)

    println("-----------------------------------------(precision,recall)---------------------------------------------------")
    //Precision and recall for each class (precision = TP / (TP + FP), recall = TP / (TP + FN))
    (0 until 2).map(target => (metrics.precision(target), metrics.recall(target))).foreach(println)
    //Save the model to HDFS
    model.save(sc,"hdfs://192.168.1.64:8020/tmp/RFModel")

  }

  /**
    * @param model the trained random forest model
    * @param data  the cross-validation data set
    * */
  def getMetrics(model: RandomForestModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    //Have the model predict each CV sample's feature vector, pairing each prediction with the true label
    val predictionsAndLabels = data.map { d =>
      (model.predict(d.features), d.label)
    }
    //Hand the pairs to MulticlassMetrics, which can score the classifier's predictions in several ways
    new MulticlassMetrics(predictionsAndLabels)
  }
  /**
    * Search the training data for the best parameter combination
    * @param trainData the training data set
    * @param cvData the cross-validation data set
    * */
  def getBestParam(trainData: RDD[LabeledPoint], cvData: RDD[LabeledPoint]): Unit = {
    val evaluations = for (impurity <- Array("gini", "entropy");
                           depth <- Array(1, 20);
                           bins <- Array(10, 300)) yield {
      val model = RandomForest.trainClassifier(trainData, 2, Map[Int, Int](), 20, "auto", impurity, depth, bins)
      val metrics = getMetrics(model, cvData)
      ((impurity, depth, bins), metrics.precision)
    }
    evaluations.sortBy(_._2).reverse.foreach(println)
  }
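
  //Note: getBestParam is declared but never called in main. To actually run the grid
  //search (an assumption about the intended usage, not shown in the original), invoke
  //it after the splits are cached, e.g. getBestParam(trainData, cvData); it prints every
  //(impurity, depth, bins) combination sorted by CV precision, best first.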

  /**
    * Simulate predicting on new data, version 1: batch prediction over an RDD of vectors.
    * (Wrapped in a method that takes the SparkContext; in the original, this code sat at
    * object level, where the sc defined inside main was not in scope.)
    */
  def predictNewData1(sc: SparkContext): Unit = {
    val rawData = sc.textFile("hdfs://192.168.1.64:8020/test/mllib/v3.csv")

    val pdata = rawData.map{ line =>
      //Convert to a vector, dropping the label (init drops the last element, i.e. the label)
      Vectors.dense(line.split(",").map(_.toDouble).init)
    }
    //Load the saved model
    val rfModel = RandomForestModel.load(sc, "hdfs://192.168.1.64:8020/tmp/RFModel")
    //Predict on the whole RDD at once
    val preLabel = rfModel.predict(pdata)
    preLabel.take(10).foreach(println)
  }

  /**
    * Simulate predicting on new data, version 2: predict record by record and append
    * the predicted label to the original line.
    */
  def predictNewData2(sc: SparkContext): Unit = {
    val rfModel = RandomForestModel.load(sc, "hdfs://192.168.1.64:8020/tmp/RFModel")
    val rawData = sc.textFile("hdfs://192.168.1.64:8020/test/mllib/v3.csv")
    val dataAndPreLabel = rawData.map{ line =>
      //Convert to a vector, dropping the label (init drops the last element, i.e. the label)
      val vecData = Vectors.dense(line.split(",").map(_.toDouble).init)
      val preLabel = rfModel.predict(vecData)
      line + "," + preLabel
    }//.saveAsTextFile("....")
    dataAndPreLabel.take(10).foreach(println)
  }
}
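
Since both prediction methods now take a SparkContext, the saved model can be used from a separate driver without retraining. A minimal sketch (the RunPredict object name and app name are my own additions; the HDFS paths are the ones used above):

import org.apache.spark.{SparkConf, SparkContext}

object RunPredict {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("rf-predict"))
    //Each helper loads the model from hdfs://192.168.1.64:8020/tmp/RFModel
    //and prints the first ten predictions
    RunRF.predictNewData1(sc)
    RunRF.predictNewData2(sc)
    sc.stop()
  }
}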

References:
http://blog.csdn.net/mr__fang/article/details/51967852
http://www.ibm.com/developerworks/cn/opensource/os-cn-spark-random-forest/
http://blog.csdn.net/qq1010885678/article/details/51336094
http://www.uml.org.cn/sjjm/201511172.asp
http://www.xiaohei.info/2015/05/27/spark-11-mllib/
http://blog.csdn.net/xubo245/article/details/51498698
http://blog.csdn.net/chencheng12077/article/details/52766274
