标签练习

package com.org.de.tagGenerator

import com.alibaba.fastjson.JSON
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Tag generator.
  *
  * Reads a tab-separated text file whose lines look like
  * `<busId>\t<json>`, where the JSON document carries an `extInfoList` array
  * and the first entry of that array holds the review tags in its `values`
  * array. For every merchant (`busId`) the job counts how often each tag
  * occurs, keeps the most frequent tags, orders merchants by the count of
  * their most frequent tag (descending) and prints one
  * `(busId,tag1(count1),tag2(count2),...)` tuple per merchant.
  */
object TGTest {

    /** Input file used when no path is supplied on the command line. */
    private val DefaultInputPath: String = "file:///d:/temptags.txt"

    /** Maximum number of tags reported per merchant. */
    private val TopN: Int = 5

    /**
      * Runs the tag-counting job on a local Spark master.
      *
      * @param args optional; `args(0)` overrides the input path, otherwise
      *             [[DefaultInputPath]] is read
      */
    def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
            .setAppName("WordCountScala")
            .setMaster("local[4]")
        val sc = new SparkContext(conf)
        try {
            val inputPath = args.headOption.getOrElse(DefaultInputPath)

            // ((busId, tag), occurrences). A tuple key is used instead of a
            // "busId_tag" string so that ids or tags containing '_' or ','
            // survive the round trip unchanged.
            val tagCounts = sc.textFile(inputPath)
                .flatMap(line => extractTags(line))
                .map(busIdAndTag => (busIdAndTag, 1))
                .reduceByKey(_ + _)

            // (busId, top tags ordered by descending count).
            val topTagsPerMerchant = tagCounts
                .map { case ((busId, tag), count) => (busId, (tag, count) :: Nil) }
                .reduceByKey(_ ++ _)
                .mapValues(tags => tags.sortBy(_._2)(Ordering[Int].reverse).take(TopN))

            // Order merchants by the count of their most frequent tag. A single
            // partition keeps the printed output in one global order.
            val ranked = topTagsPerMerchant.sortBy(
                entry => entry._2.headOption.map(_._2).getOrElse(0),
                ascending = false,
                numPartitions = 1)

            ranked
                .mapValues(tags => formatTags(tags))
                .foreach(println)
        } finally {
            // Release the context even when the job fails.
            sc.stop()
        }
    }

    /**
      * Extracts `(busId, tag)` pairs from one input line.
      *
      * Lines without a second tab-separated column, documents without a
      * non-empty `extInfoList`, entries without `values`, and null or empty
      * tags all yield no pairs. Malformed JSON is not caught here and still
      * fails the job.
      *
      * NOTE(review): only the first `extInfoList` entry is read, as before;
      * the sample data suggests it is the one titled "contentTags" - confirm
      * that this ordering is guaranteed.
      *
      * @param line raw input line
      * @return one pair per tag found on the line, possibly empty
      */
    private def extractTags(line: String): List[(String, String)] = {
        val fields = line.split("\t")
        if (fields.length < 2) {
            Nil
        } else {
            val busId = fields(0)
            // Every fastjson accessor below may return null, hence the Option wrapping.
            val tagArray = for {
                root    <- Option(JSON.parseObject(fields(1)))
                extInfo <- Option(root.getJSONArray("extInfoList"))
                if extInfo.size() > 0
                first   <- Option(extInfo.getJSONObject(0))
                values  <- Option(first.getJSONArray("values"))
            } yield values

            tagArray.toList.flatMap { values =>
                (0 until values.size())
                    .flatMap(i => Option(values.getString(i)))
                    .filter(_.nonEmpty)
                    .map(tag => (busId, tag))
            }
        }
    }

    /**
      * Renders tag counts as `tag1(count1),tag2(count2),...`.
      *
      * @param tags tag/count pairs in display order
      * @return the comma-separated description, empty for an empty list
      */
    private def formatTags(tags: List[(String, Int)]): String =
        tags.map { case (tag, count) => s"${tag}(${count})" }.mkString(",")
}

你可能感兴趣的:(标签练习)