Spark ItemCF recommendation algorithm (MLlib), cosine similarity implementation:
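For reference (the standard definition, not spelled out in the original post): the core step below, RowMatrix.columnSimilarities, computes the cosine similarity between the rating columns of two items,

    sim(i, j) = (c_i · c_j) / (||c_i|| * ||c_j||)

where c_i is the column vector of all user ratings for item i.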

package com.keyboard.pro_test

import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.mllib.linalg
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, IndexedRow, MatrixEntry, RowMatrix}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SaveMode, SparkSession}
import scala.collection.mutable.ArrayBuffer

/**
  * Build item-to-item similarities and use them for recommendation.
  * Basic logic: take the items a user has rated (bought), compute the
  * similarity between every other item and those items, and sort from
  * high to low; recommend the items the user has not yet rated.
  * SUCCESS
  */
object sparkMllibItemCF {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val spark = SparkSession
      .builder()
      .appName("sparkMllibItemCF")
      .enableHiveSupport()
      .getOrCreate()
    import spark.implicits._

    val start_time = args(0).toString       // partition (pt) lower bound
    val end_time = args(1).toString         // partition (pt) upper bound, also used as the output pt
    val filter_num_left = args(2).toInt     // keep users with more than this many ratings
    val filter_num_right = args(3).toInt    // keep users with fewer than this many ratings
    val outputpath = args(4).toString       // parquet output path
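    // e.g. args = Array("20240101", "20240107", "5", "500", "/tmp/itemcf_out")
    // (hypothetical values for illustration; the pt format depends on the source table)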

    /**
      * Select rating data for the given date range and keep only users
      * whose rating count falls inside the configured bounds.
      */
    val df_theme_filter = spark.sql(
      s"""
         |SELECT srv.xaid
         |FROM AAA srv
         |WHERE srv.pt >= '$start_time'
         |  AND srv.pt <= '$end_time'
         |GROUP BY srv.xaid
         |HAVING count(srv.tid) > $filter_num_left
         |   AND count(srv.tid) < $filter_num_right
      """.stripMargin)
    df_theme_filter.createOrReplaceTempView("df_theme_filter")
    df_theme_filter.show()

    /**
      * Join the related tables and apply the display filter.
      */
    val df_theme_filter2 = spark.sql(
      s"""
         |SELECT sr3.xaid AS xaid,
         |       sr3.tid AS tid,
         |       max(cast(sr3.score AS int)) AS score
         |FROM df_theme_filter tf
         |LEFT JOIN BBB sr3 ON sr3.xaid = tf.xaid
         |JOIN CCC lw2 ON sr3.tid = lw2.id
         |WHERE lw2.is_show = '1'
         |GROUP BY sr3.xaid,
         |         sr3.tid
      """.stripMargin)
    df_theme_filter2.show()

    /**
      * Use StringIndexer to map the string-typed xaid and tid to numeric
      * indices (one Double per distinct value, 0.0 for the most frequent),
      * then select xaidIndex, tidIndex, score as the training fields.
      */
    val indexer_xaid = new StringIndexer()
      .setInputCol("xaid")
      .setOutputCol("xaidIndex")
    val data1 = indexer_xaid.fit(df_theme_filter2).transform(df_theme_filter2)
    val indexer_tid = new StringIndexer()
      .setInputCol("tid")
      .setOutputCol("tidIndex")
    val data2 = indexer_tid.fit(data1).transform(data1)
    data2.show()
    data2.createOrReplaceTempView("source_ratings")
    val target_ratings = spark.sql("select xaidIndex,tidIndex,score from source_ratings")
    target_ratings.show()

    /**
      * Keep xaid/tid-to-index mapping tables for the joins at the end.
      */
    val xaid_index_map =  data2.select("xaid","xaidIndex").distinct()
    xaid_index_map.createOrReplaceTempView("map_xaid")
    val tid_index_map =  data2.select("tid","tidIndex").distinct()
    tid_index_map.createOrReplaceTempView("map_tid")

    /**
      * MatrixEntry represents one entry of a distributed matrix:
      * a (i: Long, j: Long, value: Double) tuple, where i is the row index,
      * j is the column index, and value is the rating.
      **/
    val parseData: RDD[MatrixEntry] = target_ratings.rdd.map { line =>
      val xaidIndex = line(0).toString.toDouble.toLong // row    = user index
      val tidIndex = line(1).toString.toDouble.toLong  // column = item index
      val score = line(2).toString.toDouble            // value  = rating
      MatrixEntry(xaidIndex, tidIndex, score)
    }

    /**
      * CoordinateMatrix is the Spark MLlib type for sparse user_item_rating data.
      * RowMatrix.columnSimilarities computes column-to-column cosine similarity,
      * i.e. item-to-item similarity here.
      */
    val ratings = new CoordinateMatrix(parseData)
    val matrix: RowMatrix = ratings.toRowMatrix()
    val similarities: CoordinateMatrix = matrix.columnSimilarities()
    val indexdsimilar: RDD[(Int, linalg.Vector)] = similarities.toIndexedRowMatrix().rows.map{
      case IndexedRow(index, vector) => (index.toInt,vector)
    }
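    // Note: columnSimilarities() returns an upper-triangular matrix, so each
    // similar pair (i, j) appears only once, with i < j. For very large item
    // counts, the approximate overload columnSimilarities(threshold), which
    // uses DIMSUM sampling, trades accuracy for speed; that is an optional
    // tweak, not used in this job.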

    /**
      * Flatten indexdsimilar: RDD[(Int, linalg.Vector)] into
      * (tidIndex1, tidIndex2, similarity, pt) tuples, keeping only non-zero entries.
      */
    val recommends_for_all_tids: RDD[(Int, Int, Double, String)] = indexdsimilar.flatMap {
      case (tidIndex, vector) =>
        vector.toArray.zipWithIndex.collect {
          case (similar, t) if similar != 0 => (tidIndex, t, similar, end_time)
        }
    }

    val df = recommends_for_all_tids.toDF("tidIndex1","tidIndex2","score","pt")
    df.createOrReplaceTempView("df_table")
    df.show()

    val result_df = spark.sql(
      s"""
         |SELECT d.tidIndex1,
         |    mt1.tid AS tid1,
         |    d.tidIndex2,
         |    mt2.tid AS tid2,
         |    d.score,
         |    '$end_time' AS pt
         |FROM df_table AS d
         |LEFT JOIN map_tid mt1 ON d.tidIndex1 = mt1.tidIndex
         |LEFT JOIN map_tid mt2 ON d.tidIndex2 = mt2.tidIndex
      """.stripMargin)
    result_df.show()

    result_df
      .write
      .mode(SaveMode.Overwrite)
      .format("parquet")
      .save(outputpath)

    spark.stop()
  }
}
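
To sanity-check the core similarity step without the Hive tables, here is a minimal self-contained sketch with toy data (hypothetical values) that can be pasted into spark-shell:

import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}

// Toy ratings: MatrixEntry(userIndex, itemIndex, score)
val entries = sc.parallelize(Seq(
  MatrixEntry(0, 0, 5.0), MatrixEntry(0, 1, 3.0), // user 0 rated items 0 and 1
  MatrixEntry(1, 0, 4.0), MatrixEntry(1, 2, 1.0), // user 1 rated items 0 and 2
  MatrixEntry(2, 1, 2.0), MatrixEntry(2, 2, 5.0)  // user 2 rated items 1 and 2
))
val sims = new CoordinateMatrix(entries).toRowMatrix().columnSimilarities()
// Each entry is (item_i, item_j, cosine similarity), upper triangle only (i < j)
sims.entries.collect().foreach(println)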

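The header comment describes the final recommendation step (score the items a user has not rated by their similarity to the items the user has rated), but this job stops at writing the item-item similarity table. A rough downstream sketch, assuming hypothetical views user_ratings(xaid, tid, score) and item_sims(tid1, tid2, score) built from the parquet output above:

// Hypothetical downstream step, not part of the original job.
// item_sims must contain both (tid1, tid2) and (tid2, tid1) orientations,
// since columnSimilarities only emits the upper triangle.
val recs = spark.sql(
  """
    |SELECT r.xaid, s.tid2 AS rec_tid, SUM(r.score * s.score) AS rec_score
    |FROM user_ratings r
    |JOIN item_sims s ON r.tid = s.tid1
    |LEFT ANTI JOIN user_ratings seen
    |  ON seen.xaid = r.xaid AND seen.tid = s.tid2
    |GROUP BY r.xaid, s.tid2
  """.stripMargin)
// Take the top N per user downstream, e.g. with
// row_number() OVER (PARTITION BY xaid ORDER BY rec_score DESC).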