使用 Spark(Scala)计算信息熵、信息增益、信息增益率

找了一下,没有找到现成的代码,于是自己写了一个。速度还可以接受,可以用来做简单的特征选择。

import org.apache.spark.rdd.RDD

/**
 * Entropy-based scoring helpers for categorical features stored in Spark RDDs.
 *
 * All entropies are expressed in bits (logarithm base 2). Every method launches
 * one or more Spark jobs over its input, so callers that score the same RDD
 * several times will usually want to persist it first.
 *
 * Closures passed to Spark only capture local values (never `this`), because
 * this class is not `Serializable`.
 */
class FeatureSelect {

  /**
   * Shannon entropy of the value distribution in `data`.
   *
   * @param data one categorical value per record
   * @return the entropy in bits; 0.0 for an empty RDD
   */
  def entropyRDD(data: RDD[String]): Double = {
    // Long counters: an Int counter would silently wrap past ~2.1 billion records.
    val counts = data.map(value => (value, 1L)).reduceByKey(_ + _).values
    // The total is taken from the aggregated counts rather than a separate
    // count() over the raw data.
    val total = counts.fold(0L)(_ + _)
    if (total == 0L) 0.0
    else {
      val ln2 = Math.log(2)
      counts.map { n =>
        val p = n.toDouble / total
        -p * (Math.log(p) / ln2)
      }.sum()
    }
  }

  /**
   * Conditional entropy H(label | feature).
   *
   * @param data pairs of (label, feature)
   * @return the conditional entropy in bits; 0.0 for an empty RDD
   */
  def conditionalEntropy(data: RDD[(String, String)]): Double = {
    val ln2 = Math.log(2)
    val (weightedEntropy, total) = data
      .map { case (label, feature) => ((feature, label), 1L) }
      .reduceByKey(_ + _)
      // Only the per-label counts are needed from here on; the label itself is dropped.
      .map { case ((feature, _), n) => (feature, List(n)) }
      .reduceByKey(_ ::: _)
      .values
      .map { labelCounts =>
        val featureTotal = labelCounts.sum
        val featureEntropy = labelCounts.map { n =>
          val p = n.toDouble / featureTotal
          -p * (Math.log(p) / ln2)
        }.sum
        // Weight each feature value's label entropy by how many records carry it.
        (featureTotal * featureEntropy, featureTotal)
      }
      .fold((0.0, 0L)) { case ((w1, n1), (w2, n2)) => (w1 + w2, n1 + n2) }

    if (total == 0L) 0.0 else weightedEntropy / total
  }

  /**
   * Information gain of the feature with respect to the label:
   * H(label) - H(label | feature).
   *
   * @param data pairs of (label, feature)
   * @return the information gain in bits
   */
  def infoGain(data: RDD[(String, String)]): Double =
    entropyRDD(data.map(_._1)) - conditionalEntropy(data)

  /**
   * Information gain ratio: the information gain divided by the entropy of the
   * feature itself (the "split information").
   *
   * @param data pairs of (label, feature)
   * @return the gain ratio; 0.0 when the feature's entropy is zero (a constant
   *         feature or an empty RDD), where the plain quotient would be NaN
   */
  def infoRatio(data: RDD[(String, String)]): Double = {
    val splitInfo = entropyRDD(data.map(_._2))
    // A feature with zero entropy cannot split the data, so it scores 0.0
    // instead of the 0/0 = NaN the unguarded division would produce.
    if (splitInfo == 0.0) 0.0 else infoGain(data) / splitInfo
  }
}

 

你可能感兴趣的:(数据结构与算法,机器学习)