// Reference: Spark MLlib official documentation
package lean_mllib
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.Vectors
/**
* Created by wjf on 2016/8/15.
*/
object TestCountVector {
  val spark = MLLibConf.spark

  def main(args: Array[String]): Unit = {
    // Uncomment the example you want to run.
    // testCountVectorizer()
    // removeStopWord()
    // nGram()
    // testPCA()
    // testPolynomialExpansion()
    // testDiscreteCosineTransform()
    // testStringIndexer()
    testOneHotEncoder()
  }

  def testCountVectorizer(): Unit = {
    val df = spark.createDataFrame(Seq(
      (0, Array("a", "b", "C", "c")),
      (1, Array("a", "a", "b", "b", "c", "C"))
    )).toDF("id", "words")
    // Fit a vocabulary from the corpus: keep at most 3 terms, each of which
    // must appear in at least 2 documents.
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)
    // Alternatively, build a model from an a-priori vocabulary.
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")
    cvModel.transform(df).select("features", "words").show(false)
    cvm.transform(df).select("features", "words").show(false)
  }
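
  // Not from the original: CountVectorizerModel also exposes the fitted
  // vocabulary, which maps output vector indices back to terms. A minimal
  // sketch reusing the same spark session:
  def showVocabulary(): Unit = {
    val df = spark.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")
    val model = new CountVectorizer().setInputCol("words").setOutputCol("features").fit(df)
    // vocabulary(i) is the term counted at index i of the output vectors
    model.vocabulary.zipWithIndex.foreach { case (term, idx) => println(s"$idx -> $term") }
  }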
  def removeStopWord(): Unit = {
    val remover = new StopWordsRemover().setInputCol("raw").setOutputCol("filtered")
    val dataSet = spark.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "balloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )).toDF("id", "raw")
    remover.transform(dataSet).show()
  }
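
  // Not from the original: StopWordsRemover can also use a custom stop-word
  // list via setStopWords; loadDefaultStopWords returns the built-in list for
  // a given language. A minimal sketch extending the English defaults:
  def removeCustomStopWords(): Unit = {
    val custom = StopWordsRemover.loadDefaultStopWords("english") :+ "saw"
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")
      .setStopWords(custom)
    val dataSet = spark.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "balloon"))
    )).toDF("id", "raw")
    // "saw" is now filtered out along with the default stop words
    remover.transform(dataSet).show()
  }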
  def nGram(): Unit = {
    val wordDataFrame = spark.createDataFrame(Seq(
      (0, Array("Hi", "I", "heard", "about", "Spark")),
      (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
      (2, Array("Logistic", "regression", "models", "are", "neat"))
    )).toDF("label", "words")
    // n must not exceed the sentence length, otherwise the n-gram list is empty.
    val ngram = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams")
    val ngramDataFrame = ngram.transform(wordDataFrame)
    // The ngrams column holds an array of strings; read it back as Seq[String].
    ngramDataFrame.take(3).map(_.getAs[Seq[String]]("ngrams").toList).foreach(println)
  }
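
  // Not from the original: a small sketch contrasting n-gram sizes on one
  // sentence, to make the point above concrete. A 5-word sentence yields
  // 4 bigrams and 3 trigrams; any n > 5 would yield an empty list.
  def compareNGramSizes(): Unit = {
    val df = spark.createDataFrame(Seq(
      (0, Array("Hi", "I", "heard", "about", "Spark"))
    )).toDF("label", "words")
    Seq(2, 3).foreach { n =>
      val ngram = new NGram().setN(n).setInputCol("words").setOutputCol("ngrams")
      ngram.transform(df).select("ngrams").show(false)
    }
  }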
  def testPCA(): Unit = {
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )
    // Tuple1 produces a single column, so name exactly one column here.
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
    df.take(10).foreach(println)
    // Project the 5-dimensional vectors onto their first principal component.
    val pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(1).fit(df)
    val pcaDF = pca.transform(df)
    pcaDF.take(10).foreach(println)
    val result = pcaDF.select("pcaFeatures")
    result.show()
  }
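
  // Not from the original: a minimal sketch of inspecting the fitted PCAModel.
  // pc holds the principal-components matrix, and (since Spark 2.0)
  // explainedVariance gives the proportion of variance per component.
  def inspectPCAModel(): Unit = {
    val data = Array(
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0),
      Vectors.dense(0.0, 1.0, 0.0, 7.0, 0.0)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
    val model = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(2).fit(df)
    println(model.pc)                // 5 x 2 matrix of component loadings
    println(model.explainedVariance) // variance explained by each component
  }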
  def testPolynomialExpansion(): Unit = {
    val data = Array(
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(0.6, -1.1)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
    // Expand (x, y) into all monomials up to degree 3:
    // x, x^2, x^3, y, xy, x^2y, y^2, xy^2, y^3 (9 output features).
    val polynomialExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(3)
    val polyDF = polynomialExpansion.transform(df)
    polyDF.select("polyFeatures").take(3).foreach(println)
  }
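
  // Not from the original: for n input features expanded to degree d, the
  // output has C(n + d, d) - 1 features (every monomial except the constant
  // term). A minimal helper to check the count above (n = 2, d = 3 -> 9):
  def polyExpansionSize(n: Int, d: Int): Int = {
    // C(a, b) via the exact recurrence C(a, b) = C(a - 1, b - 1) * a / b
    def binomial(a: Int, b: Int): Int = if (b == 0) 1 else binomial(a - 1, b - 1) * a / b
    binomial(n + d, d) - 1
  }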
  def testDiscreteCosineTransform(): Unit = {
    val data = Seq(
      Vectors.dense(0.0, 1.0, -2.0, 3.0),
      Vectors.dense(-1.0, 2.0, 4.0, -7.0),
      Vectors.dense(14.0, -2.0, -5.0, 1.0)
    )
    data.foreach(println)
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
    // take(n) collects rows to the driver; if n is too large, the driver may run out of memory.
    df.take(10).foreach(println)
    val dct = new DCT().setInputCol("features").setOutputCol("featuresDCT").setInverse(false)
    val dctDF = dct.transform(df)
    dctDF.select("featuresDCT").show(3)
  }
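
  // Not from the original: setInverse(true) applies the inverse DCT, so
  // chaining the two transformers should recover the input vectors up to
  // floating-point error. A minimal round-trip sketch:
  def testInverseDCT(): Unit = {
    val df = spark.createDataFrame(Seq(Tuple1(Vectors.dense(0.0, 1.0, -2.0, 3.0)))).toDF("features")
    val forward = new DCT().setInputCol("features").setOutputCol("dct").setInverse(false)
    val inverse = new DCT().setInputCol("dct").setOutputCol("recovered").setInverse(true)
    // "features" and "recovered" should match (up to rounding)
    inverse.transform(forward.transform(df)).select("features", "recovered").show(false)
  }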
  def testStringIndexer(): Unit = {
    val df = spark.createDataFrame(Seq(
      (0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")
    )).toDF("id", "category")
    df.take(6).foreach(println)
    // Encode string labels as indices ordered by frequency (most frequent -> 0.0).
    val indexer = new StringIndexer().setInputCol("category").setOutputCol("categoryIndex").fit(df)
    val indexed = indexer.transform(df)
    indexed.take(6).foreach(println)
    // IndexToString reads the labels from the column metadata written by StringIndexer.
    val converter = new IndexToString().setInputCol("categoryIndex").setOutputCol("originalCategory")
    val converted = converter.transform(indexed)
    converted.select("id", "categoryIndex", "originalCategory").show()
  }
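
  // Not from the original: by default StringIndexer throws on labels it did
  // not see during fit; setHandleInvalid("skip") drops those rows instead.
  // A minimal sketch with a label ("d") absent from the training data:
  def indexUnseenLabels(): Unit = {
    val train = spark.createDataFrame(Seq((0, "a"), (1, "b"))).toDF("id", "category")
    val test = spark.createDataFrame(Seq((0, "a"), (1, "d"))).toDF("id", "category")
    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .setHandleInvalid("skip") // the row with "d" is dropped rather than failing
      .fit(train)
    indexer.transform(test).show()
  }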
  def testOneHotEncoder(): Unit = {
    val df = spark.createDataFrame(Seq(
      (0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"), (6, "b")
    )).toDF("id", "category")
    val indexer = new StringIndexer().setInputCol("category").setOutputCol("categoryIndex").fit(df)
    val indexed = indexer.transform(df)
    // Map each category index to a sparse indicator vector (last category dropped by default).
    val encoder = new OneHotEncoder().setInputCol("categoryIndex").setOutputCol("categoryVec")
    val encoded = encoder.transform(indexed)
    encoded.select("id", "categoryVec").show()
  }
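
  // Not from the original: OneHotEncoder drops the last category by default
  // to keep the encoding linearly independent; setDropLast(false) keeps all
  // categories. A minimal sketch:
  def testOneHotEncoderKeepLast(): Unit = {
    val df = spark.createDataFrame(Seq((0, "a"), (1, "b"), (2, "c"))).toDF("id", "category")
    val indexed = new StringIndexer().setInputCol("category").setOutputCol("categoryIndex")
      .fit(df).transform(df)
    val encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec")
      .setDropLast(false) // emit a full-length indicator vector per row
    encoder.transform(indexed).select("id", "categoryVec").show()
  }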
}