NLP处理-Spark中的HashingTF与CountVectorizer模型

http://spark.apache.org/docs/latest/ml-features.html#tf-idf

import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.SparkSession
 
import scala.collection.mutable
import scala.io.Source
 
/**
  * Created by xubc on 2017/6/3.
  *
  * Small TF-IDF demo on three hard-coded sentences:
  * tokenize -> bag-of-words term counts (CountVectorizer) -> IDF rescaling,
  * then print, for every sentence, its tokens, raw term-frequency vector,
  * the vocabulary words behind the non-zero indices, and the TF-IDF vector.
  */
object TestX {

  /**
    * Runs the demo on a local Spark session and prints the results to stdout.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local[5]")
      .appName(this.getClass.getName.stripSuffix("$"))
      .getOrCreate()

    // Always release the local Spark context, even if a stage fails.
    try {
      val sentenceData = spark.createDataFrame(Seq(
        (0.0, "Hi I heard about are Spark"),
        (1.0, "I wish Java could use case spark classes"),
        (2.0, "Logistic regression regression models are neat I")
      )).toDF("label", "sentence")

      val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
      val wordsData = tokenizer.transform(sentenceData)

      // HashingTF bag-of-words model (alternative to CountVectorizer).
      // Note: HashingTF has no vocabulary, so the index -> word lookup below
      // would have to be dropped when switching to it.
      //    val hashingTF = new HashingTF()
      //      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(100)
      //    val featurizedData = hashingTF.transform(wordsData)

      // CountVectorizer bag-of-words model: learns a vocabulary from the corpus
      // and maps each document to a vector of term counts.
      val cvModel: CountVectorizerModel = new CountVectorizer()
        .setInputCol("words").setOutputCol("rawFeatures")
        .fit(wordsData)
      val featurizedData = cvModel.transform(wordsData)

      val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
      val idfModel = idf.fit(featurizedData)

      val rescaledData = idfModel.transform(featurizedData)
      rescaledData.printSchema()

      val vocabulary = cvModel.vocabulary
      println(vocabulary.mkString(","))
      rescaledData.show(false)

      // Collect to the driver before printing: Dataset.foreach runs on executor
      // tasks, so its println output is unordered, can interleave between
      // partitions, and is not shown on the driver console on a real cluster.
      // The demo data is three rows, so collecting is cheap.
      rescaledData.collect().foreach { row =>
        val label = row.getAs[Double]("label")
        val str = row.getAs[String]("sentence")
        // getSeq avoids casting to a concrete (version-specific) collection class.
        val words = row.getSeq[String](row.fieldIndex("words")).mkString(",")
        val tf = row.getAs[SparseVector]("rawFeatures")
        // Map the non-zero term indices back to the vocabulary words they stand for.
        val originWords = tf.indices.map(i => vocabulary(i)).mkString(",")
        // Named tfidf so it does not shadow the IDF estimator `idf` above.
        val tfidf = row.getAs[SparseVector]("features")
        println(
          s"""$label   $str
             | $words
             | $tf    $originWords
             | $tfidf""".stripMargin)
      }
    } finally {
      spark.stop()
    }
  }
}

 

你可能感兴趣的:(推荐系统)