spark ml 特征工程实现

package com.sparkMLlibStudy.model

import java.util

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.ml.stat.{ChiSquareTest, Correlation}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, SparkSession, functions}
import org.apache.spark.sql.functions.col


/**
  *
  */
object BasicStatisticsModel {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()

    /**
      * 计算皮尔逊线性相关系数和皮尔斯曼系数、卡方检验
      */
    val data = Seq(
      Vectors.sparse(4, Seq((0, 1.0), (3, -2.0))),
      Vectors.dense(4.0, 5.0, 0.0, 3.0),
      Vectors.dense(6.0, 7.0, 0.0, 8.0),
      Vectors.sparse(4, Seq((0, 9.0), (3, 1.0)))
    )

    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
//    df.show()
//    df.printSchema()
    /**
      * pearson = E(X)E(Y)/(E(x2)-E2(x))1/2*(E(Y2)-E2(Y))1/2
      * 变量服从正太分布
      */
    val Row(coff1: Matrix) = Correlation.corr(df,
      "features").head
//    println(s"Pearson correlation matrix:\n $coff1")

    val Row(coff2: Matrix) = Correlation.corr(df,
      "features", "spearman").head
//    println(s"Spearman correlation matrix:\n $coff2")

    /**
      * 卡方检验
      */
    val data1 = Seq(
      (0.0, Vectors.dense(0.5, 10.0)),
      (0.0, Vectors.dense(1.5, 20.0)),
      (1.0, Vectors.dense(1.5, 30.0)),
      (0.0, Vectors.dense(3.5, 30.0)),
      (0.0, Vectors.dense(3.5, 40.0)),
      (1.0, Vectors.dense(3.5, 40.0))
    )

    val df1 = spark.createDataFrame(data1).toDF("label","features")
    val chi = ChiSquareTest.test(df1, "features",
      "label").head
//    println(s"pValues = ${chi.getAs[Vector](0)}")
//    println(s"degreesOfFreedom ${chi.getSeq[Int](1).mkString("[",",","]")}")
//    println(s"statistics ${chi.getAs[Vector](2)}")

    /**
      * tf-idf(term frequency-inverse document frequency)
      * 一种广泛用于文本挖掘的特征向量方法,用户反映术语对语料库中文档重要性
      * tf(Term Frequency):表示一个term与某个document的相关性
      * idf(Inverse Document Frequency):表示一个term表示document的主题的权重大小
      * tf(t,d)词频
      * idf(t,D)=log((|D|+1)/(DF(t,D)+1)),其中|D|表示文件集总数,DF词出现(t,D)文档数量
      * tfidf(t,d,D)=tf(t,d)*idf(t,D)
      * 示例:
      * 一篇文件总词语是100个,词语“胡歌”出现了3次,那么“胡歌”一词在该文件中的词频TF(t,d)=3/100;
      * 如果“胡歌”一词在1000份文件中出现过,而文件总数是10000,其文件频率DF(t,D)=log((10000+1)/(1000+1))
      * 那么“胡歌”一词在该文件集的tf-idf分数为TF(t,d)*DF(t,D)
      */
    val sentence = spark.createDataFrame(
      Seq(
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat")
      )
    ).toDF("label","sentence")

    val tk = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val words = tk.transform(sentence)
//    words.show()

    val hashingTF = new HashingTF()
      .setInputCol("words")
      .setOutputCol("rawFeatures")
      .setNumFeatures(20)

    val featurized = hashingTF.transform(words)
//    featurized.show()

    val idf = new IDF().setInputCol("rawFeatures")
      .setOutputCol("features")
    val idfModel = idf.fit(featurized)

    val rescaled = idfModel.transform(featurized)
//    rescaled.show()

    /**
      * word2vec:采用代表文档的单词序列训练word2VecModel
      * word2vec该模型将每个单词映射到唯一固定长度向量,此向量用于预测,文档相似度计算等
      */
    val documentDF = spark.createDataFrame(
      Seq(
        "Hi I heard about Spark".split(" "),
        "I wish Java could use case classes".split(" "),
        "Logistic regression models are neat".split(" ")
      ).map(Tuple1.apply)
    ).toDF("text")

//    词映射到向量
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)

    val result = model.transform(documentDF)
//    result.collect().foreach{
//      case Row(text: Seq[_],features: Vector)=>
//        println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n")
//    }

    /**
      * CountVectorizer and CountVectorizerModel
      * 旨在通过计数将一个文档转换为向量
      * 当不存在先验字典,CountVectorizer可以作为Estimator提取词汇,并生成CountVectorizerModel
      * 该模型产生关于文档词汇的稀疏特征向量,可以传递给其他像LDA算法
      */

    val df2 = spark.createDataFrame(
      Seq(
        (0, Array("a","b","c")),
        (1,Array("a","b","c","a"))
      )
    ).toDF("id","words")

//    拟合语料库CountVectorizerModel
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df2)

//    使用先验词表定义CountVectorizerModel
    val cvm = new CountVectorizerModel(Array("a","b","c"))
      .setInputCol("words")
      .setOutputCol("features")

//    cvModel.transform(df2).show(false)

    /**
      * FeatureHasher
      * 特征散列将一组分类或数字特征投影到指定纬度的特征向量中(通常小于原始特征空间的特征向量)
      * 数字特征
      * 字符串特征:onehot编码
      */
    val df3 = spark.createDataFrame(
      Seq(
        (2.2, true, "1", "foo"),
        (3.3, false, "2", "bar"),
        (4.4, false, "3", "baz"),
        (5.5, false, "4", "foo")
      )
    ).toDF("real","bool","stringNum","string")

    val hasher = new FeatureHasher()
      .setInputCols("real","bool","stringNum","string")
      .setOutputCol("features")

    val featurizedHash = hasher.transform(df3)
//    featurizedHash.show(false)

    /**
      * 分词器:文本拆分为单个术语(单词)的过程
      * RegexTokenizer允许基于正则表达式匹配更高级标记化
      */
    val tkz = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W")

    val countTk = functions.udf{ (words: Seq[String]) =>words.length}
    val tokenizer = tkz.transform(sentence)
    tokenizer.select("sentence","words")
      .withColumn("tokens", countTk(tokenizer("words")))
//      .show(false)

    val regexTokenized = regexTokenizer.transform(sentence)
    regexTokenized.select("sentence", "words")
      .withColumn("tokens",countTk(regexTokenized("words")))
//      .show(false)

    /**
      * stopwords
      * 停止词是应该从输入中排除的词,通常是因为词经常出现并且没有那么多含义
      */
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val data2 = spark.createDataFrame(
      Seq(
        (0, Seq("I","saw","the","red","balloon")),
        (1, Seq("Mary", "had", "a", "little", "lamb"))
      )
    ).toDF("id", "raw")

//    remover.transform(data2).show(false)

    /**
      * n-gram代表由n个字组成的句子
      * 利用上下文中相邻词间的搭配信息,
      * 在需要把连续无空格的拼音、笔划,或代表字母或笔划的数字,
      * 转换成汉字串(即句子)时,可以计算出具有最大概率的句子,
      * 从而实现到汉字的自动转换,无需用户手动选择,
      * 避开了许多汉字对应一个相同的拼音(或笔划串,或数字串)的重码问题。
      * 该模型基于这样一种假设,
      * 第N个词的出现只与前面N-1个词相关,
      * 而与其它任何词都不相关,整句的概率就是各个词出现概率的乘积。
      */
    val ngram = new NGram().setN(2)
      .setInputCol("raw")
      .setOutputCol("ngrams")

    val ngramDF = ngram.transform(data2)
//    ngramDF.select("ngrams").show(false)

    /**
      * 二值化:将数值特征阈值化为二进制(0/1)特征过程
      * Binarizer采用公共参数inputCol和outputCol以及二值化的阈值
      * 大于阈值的特征被二进制为1,等于或小于阈值的特征被二值化为0
      */
    val data3 = Array((0, 0.1), (1, 0.8), (2, 0.2))
    val dataFrame = spark.createDataFrame(data3).toDF("id", "feature")
    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")
      .setThreshold(0.5)

    val binarizedDataFrame = binarizer.transform(dataFrame)
//    println(s"Binarizer output with Threshold = ${binarizer.getThreshold}")
//    binarizedDataFrame.show()

    /**
      * PCA
      * 通过正交变换将线性相关变量转换为线性不相关变量
      */
    val data4 = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )
    val df4= spark.createDataFrame(data4.map
    (Tuple1.apply)).toDF("features")

    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(3)
      .fit(df4)

    val res = pca.transform(df4).select("pcaFeatures")
//    res.show(false)

    /**
      * PolynomialExpansion:多项式扩展
      */
    val polyExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(3)

    val polyDF = polyExpansion.transform(df)
//    polyDF.show(false)

    /**
      * Discrete Cosine Transform(DCT):离散余弦变换
      * 离散余弦变换将时域中长度为N实值序列变换为频域中另一长度为N实值序列
      */
    val dct = new DCT()
      .setInputCol("features")
      .setOutputCol("featuresDCT")
      .setInverse(false)

    val dctDF = dct.transform(df4)
//    dctDF.select("featuresDCT").show(false)

    /**
      * StringIndexer:
      * 将一列字符串标签编码成一列下标标签,下标范围是[0,标签数量)
      * 顺序是标签的出现频率
      * IndexToString:
      * 和StringIndexer是对称的,将一列下标标签映射回一列包含原始字符串的标签
      * 常用于StringIndexer生产下标,通过下标训练模型,通过IndexToString
      * 从预测出下标列重新获得原始标签
      */
    val data5 = spark.createDataFrame(
      Seq((0, "a"),(1, "b"),(2, "c"),(3, "a"),(4, "a"),
        (5, "c"))
    ).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")

    val indexed = indexer.fit(data5).transform(data5)
//    println(s"Transformed string column '${indexer
//      .getInputCol}'" + s"to indexed column '${indexer
//      .getOutputCol}'")
//    indexed.show(false)

    val inputColSchema = indexed.schema(indexer.getOutputCol)
//    println(s"StringIndexer will store labels in output " +
//      s"column meatadata:${Attribute.fromStructField(inputColSchema).toString()}\n")

    val converter = new IndexToString()
      .setInputCol("categoryIndex")
      .setOutputCol("originalCategory")

    val converted = converter.transform(indexed)
//    println(s"Transformed indexed column '${converter.getInputCol}' back to original string " +
//      s"column '${converter.getOutputCol}' using labels in metadata")
//    converted.select("id", "categoryIndex", "originalCategory").show()

    /**
      * OneHot编码
      * 将表示为标签索引的分类特征映射到二进制向量
      * 此编码允许期望连续特征的算法使用分类特征(Logistic回归)的算法使用分类特征
      */
    val data6 = spark.createDataFrame(
      Seq(
        (0.0, 1.0),
        (1.0, 0.0),
        (2.0, 1.0),
        (0.0, 2.0),
        (0.0, 1.0),
        (2.0, 0.0)
      )
    ).toDF("categoryIndex1","categoryIndex2")

    val encoder = new OneHotEncoderEstimator()
      .setInputCols(Array("categoryIndex1", "categoryIndex2"))
      .setOutputCols(Array("categoryVec1","categoryVec2"))
    val modelOH = encoder.fit(data6)

    val encoded = modelOH.transform(data6)
//    encoded.show(false)

    /**
      * VectorIndexer
      * 对数据集特征向量中的类别(离散值)特征(index categorical features
      * categorial features)进行编码;
      * 提高决策树或随机森林等ML方法的分类效果
      */
    val data7 = spark.read.format("libsvm").load("/opt/modules/spark-2.3.1/data/mllib/sample_libsvm_data.txt")

    val indexer1 = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexed")
      .setMaxCategories(10)

    val indexerModel = indexer1.fit(data7)
    val categoricalFeatures:Set[Int] = indexerModel.categoryMaps.keys.toSet
//    println(s"Chose ${categoricalFeatures.size} " +
//      s"categorical features: ${categoricalFeatures.mkString(", ")}")

//    使用转换为索引的分类值创建索引新列
    val indexedData = indexerModel.transform(data7)
//    indexedData.show(false)

    /**
      * interaction
      * transformer,接受向量或双值列,并生成一个向量列,其中包含每个输入列的一个值
      */
    val data8 = spark.createDataFrame(Seq(
      (1, 1, 2, 3, 8, 4, 5),
      (2, 4, 3, 8, 7, 9, 8),
      (3, 6, 1, 9, 2, 3, 6),
      (4, 10, 8, 6, 9, 4, 5),
      (5, 9, 2, 7, 10, 7, 3),
      (6, 1, 1, 4, 2, 8, 4)
    )).toDF("id1", "id2", "id3", "id4", "id5", "id6", "id7")
    val assembler1 = new VectorAssembler()
      .setInputCols(Array("id2", "id3", "id4"))
      .setOutputCol("vec1")

    val assembled1 = assembler1.transform(data8)

    val assembler2 = new VectorAssembler()
      .setInputCols(Array("id5", "id6", "id7"))
      .setOutputCol("vec2")

    val assembled2 = assembler2.transform(assembled1)
      .select("id1", "vec1", "vec2")

    val interaction = new Interaction()
      .setInputCols(Array("id1", "vec1", "vec2"))
      .setOutputCol("interactedCol")

    val interacted = interaction.transform(assembled2)

//    interacted.show(false)

    /**
      * Normalizer正则化
      * Normalizer是一个转换器,它可以将多行向量输入转化为统一的形式
      * 参数为p(默认值:2)来指定正则化中使用的p-norm
      * 正则化操作可以使输入数据标准化并提高后期学习算法的效果
      */
    val data9 = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.5, -1.0)),
      (1, Vectors.dense(2.0, 1.0, 1.0)),
      (2, Vectors.dense(4.0, 10.0, 2.0))
    )).toDF("id", "features")

//    使用L1正则标准化每列向量
    val normalizer = new Normalizer()
      .setInputCol("features")
      .setOutputCol("normFeatures")
      .setP(1.0)

    val l1NormData = normalizer.transform(data9)
//    println("Normalized using L^1 norm")
//    l1NormData.show()

//    无限大正则
    val lInfNormData = normalizer.transform(data9, normalizer.p -> Double.PositiveInfinity)
//    println("Normalized using L^inf norm")
//    lInfNormData.show()

    /**
      * MinMaxScaler
      * 标准化
      */
    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

//    创建MinMaxScalerModel
    val scalerModel = scaler.fit(data9)

//    [min,max]标准化
    val scaled = scalerModel.transform(data9)
//    println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]")
//    scaled.select("features", "scaledFeatures").show(false)

    /**
      * MaxAbsScaler
      * 通过每列最大绝对值归一化[-1,1]
      */
    val scalerMA = new MaxAbsScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // 创建MaxAbsScalerModel
    val scml = scalerMA.fit(data9)

    // 归一化至[-1, 1]
    val scaledData = scml.transform(data9)
//    scaledData.select("features", "scaledFeatures").show(false)

    /**
      * Bucketizer
      * 分箱(分段处理):连续值转换为离散类别
      */
    val splits = Array(Double.NegativeInfinity, -0.5,
      0.0, 0.5, Double.PositiveInfinity)
    val data10 = Array(-999.9, -0.5, -0.3, 0.0, 0.2, 999.9)
    val df5 = spark.createDataFrame(data10.map(Tuple1.apply))
      .toDF("features")

    val bucketizer = new Bucketizer()
      .setInputCol("features")
      .setOutputCol("bucketedFeatures")
      .setSplits(splits)

    val bucketed = bucketizer.transform(df5)
//    println(s"Bucketizer output with ${bucketizer.getSplits.length-1} buckets")
//    bucketed.show()

    /**
      * ElementwiseProduct
      * 对输入向量的每个元素乘以一个权重向量的每个元素,对输入向量每个元素逐个进行放缩
      */
    val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
    val transformer = new ElementwiseProduct()
      .setScalingVec(transformingVector)
      .setInputCol("features")
      .setOutputCol("transformedVector")

//    批量转换矢量以创建新列
//    transformer.transform(data9).show()

    val df6 = spark.createDataFrame(
      Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")

    val sqlTrans = new SQLTransformer().setStatement(
      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

//    sqlTrans.transform(df6).show()

    /**
      * VectorAssembler
      * 一个transformer,将多列数据转化为单列的向量列
      * 原始数据集里,经常会包含一些非指标数据,如 ID,Description 等
      * 为方便后续模型进行特征输入,需要部分列的数据转换为特征向量,并统一命名
      */
    val df7 = spark.createDataFrame(
      Seq(
        (0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0),
        (0, 18, 1.0, Vectors.dense(0.0, 10.0), 0.0))
    ).toDF("id", "hour", "mobile", "userFeatures", "clicked")

    val assembler = new VectorAssembler()
      .setInputCols(Array("hour","mobile","userFeatures"))
      .setOutputCol("features")

    val output = assembler.transform(df7)
//    println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
//    output.select("features", "clicked").show(false)

    /**
      * VectorSizeHint
      * VectorSizeHint允许用户显式指定列的向量大小
      * 以便VectorAssembler或可能需要知道向量大小的其他变换器可以将该列用作输入
      */
    val sizeHint = new VectorSizeHint()
      .setInputCol("userFeatures")
      .setHandleInvalid("skip")
      .setSize(3)

    val datasetWithSize = sizeHint.transform(df7)
//    println("Rows where 'userFeatures' is not the right size are filtered out")
//    datasetWithSize.show(false)

    val outputHint = assembler.transform(datasetWithSize)
//    println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
//    output.select("features", "clicked").show(false)

    /**
      * QuantileDiscretizer
      * 采用具有连续特征的列,并输出具有分箱分类特征的列
      */
    val df8 = spark.createDataFrame(
      Array((0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2))
    ).toDF("id", "hour")
    val discretizer = new QuantileDiscretizer()
      .setInputCol("hour")
      .setOutputCol("result")
      .setNumBuckets(3)

    val ret = discretizer.fit(df8).transform(df8)
//    ret.show(false)

    /**
      * Imputer使用缺失值所在的列的平均值或中值来完成数据集中的缺失值
      * 输入列应为DoubleType或FloatType
      */
    val df9 = spark.createDataFrame(
      Seq(
        (1.0, Double.NaN),
        (2.0, Double.NaN),
        (Double.NaN, 3.0),
        (4.0, 4.0),
        (5.0, 5.0)
      )
    ).toDF("a", "b")

    val imputer = new Imputer()
      .setInputCols(Array("a","b"))
      .setOutputCols(Array("out_a","out_b"))

    val modelIm = imputer.fit(df9)
//    modelIm.transform(df9).show(false)

    /**
      * VectorSlicer
      * 一个转换器输入特征向量,输出原始特征向量子集
      * VectorSlicer接收带有特定索引的向量列,通过对这些索引的值进行筛选得到新的向量集
      */
    val data11 = util.Arrays.asList(
      Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))),
      Row(Vectors.dense(-2.0, 2.3, 0.0))
    )
    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])

    val df10 = spark.createDataFrame(data11, StructType(Array(attrGroup.toStructField())))

    val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")
    slicer.setIndices(Array(1)).setNames(Array("f3"))

    val outputSL = slicer.transform(df10)
//    outputSL.show(false)

    /**
      *  RFormula
      *  RFormula通过R模型公式来选择列。支持R操作中的部分操作,包括‘~’, ‘.’, ‘:’, ‘+’以及‘-‘
      *  产生一个向量特征列以及一个double或者字符串标签列
      */
    val df11 = spark.createDataFrame(Seq(
      (7, "US", 18, 1.0),
      (8, "CA", 12, 0.0),
      (9, "NZ", 15, 0.0)
    )).toDF("id", "country", "hour", "clicked")

    val formula = new RFormula()
      .setFormula("clicked ~ country + hour")
      .setFeaturesCol("features")
      .setLabelCol("label")

    val outputFR = formula.fit(df11).transform(df11)
//    outputFR.select("features", "label").show(false)

    /**
      * ChiSqSelector:卡方特征选择
      * 适用于带有类别特征的标签数据
      * ChiSqSelector根据独立卡方检验,然后选取类别标签主要依赖的特征
      * 它类似于选取最有预测能力的特征
      */
    val df12 = spark.createDataFrame(
      Seq(
        (7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
        (8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
        (9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)
      )
    ).toDF("id", "features", "clicked")

    val selector = new ChiSqSelector()
      .setNumTopFeatures(1)
      .setFeaturesCol("features")
      .setLabelCol("clicked")
      .setOutputCol("selectedFeatures")

    val relt = selector.fit(df12).transform(df12)
//    relt.show(false)

    /**
      * Locality Sensitive Hashing (LSH)
      * 一类重要的散列技术,常用于聚类,近似最近邻搜索和大数据集的异常检测
      * 使用一系列函数(“LSH系列”)将数据点哈希到桶中,使得彼此接近的数据点在相同的桶中具有高概率,而数据点是远离彼此很可能在不同的桶中
      * 在度量空间(M,d)中,其中M是集合,d是M上的距离函数,LSH族是满足以下属性的函数族h:
      * ∀p,q∈M
      * d(p,q)≤r1⇒Pr(h(p)=h(q))≥p1
      * d(p,q)≥r2⇒Pr(h(p)=h(q))≤p2
      * 则(r1, r2, p1, p2)-sensitive
      * 1.Bucketed Random Projection for Euclidean Distance
      * d(x,y) = sqrt(X,Y) = sqrt(sum((xi-yi)^2))
      * h(x) = |xv/r|
      * r是用户定义的桶长度
      * 桶长度可以用来控制散列桶的平均大小(从而控制桶的数量)
      * 2.MinHash for Jaccard Distance
      * MinHash是用于计算Jaccard距离的LSH族
      * d(A,B) = 1 -|A ∩ B| / |A ∪ B|
      * MinHash 对集合中的每个元素应用随机哈希函数g,并取所有哈希值的最小值:
      * h(A) = min(g(a))  ,a∈A
      * https://www.cnblogs.com/maybe2030/p/4953039.html
      * */
    val dfA = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 1.0)),
      (1, Vectors.dense(1.0, -1.0)),
      (2, Vectors.dense(-1.0, -1.0)),
      (3, Vectors.dense(-1.0, 1.0))
    )).toDF("id", "features")

    val dfB = spark.createDataFrame(Seq(
      (4, Vectors.dense(1.0, 0.0)),
      (5, Vectors.dense(-1.0, 0.0)),
      (6, Vectors.dense(0.0, 1.0)),
      (7, Vectors.dense(0.0, -1.0))
    )).toDF("id", "features")

    val key = Vectors.dense(1.0, 0.0)

    val brp = new BucketedRandomProjectionLSH()
      .setBucketLength(2.0)
      .setNumHashTables(3)
      .setInputCol("features")
      .setOutputCol("hashes")

    val modelBR = brp.fit(dfA)
//    特征转换
//    println("The hashed dataset where hashed values are stored in the column 'hashes':")
//    modelBR.transform(dfA).show(false)

//    计算输入行的局部敏感哈希值,然后执行近似值相似性加入
//    我们可以通过传入已经转换过的数据集来避免计算哈希值
//    1.欧几里得距离
//    println("Approximately joining dfA and dfB on Euclidean distance smaller than 1.5:")
//    modelBR.approxSimilarityJoin(dfA, dfB, 1.5, "EuclideanDistance")
//      .select(col("datasetA.id").alias("idA"), col("datasetB.id").alias("idB"),
//        col("EuclideanDistance")).show(false)

//    println("Approximately searching dfA for 2 nearest neighbors of the key:")
//    modelBR.approxNearestNeighbors(dfA, key, 2).show(false)

    val mh = new MinHashLSH()
      .setNumHashTables(5)
      .setInputCol("features")
      .setOutputCol("hashes")

    val modelMH = mh.fit(dfA)

//    println("The hashed dataset where hashed values are stored in the column 'hashes':")
//    modelMH.transform(dfA).show(false)

//    println("Approximately joining dfA and dfB on Jaccard distance smaller than 0.6:")
//    modelMH.approxSimilarityJoin(dfA, dfB, 0.6, "JaccardDistance")
//      .select(col("datasetA.id").alias("idA"),
//        col("datasetB.id").alias("idB"),
//        col("JaccardDistance")).show(false)

//    println("Approximately searching dfA for 2 nearest neighbors of the key:")
//    modelMH.approxNearestNeighbors(dfA, key, 2).show(false)


  }
}

 

你可能感兴趣的:(hadoop,spark,特征工程)