Simple exercises with Spark RDD operators

Data format:


From left to right, the columns are: job ID, job title, salary range, work region, job summary, years of experience, education requirement, job details, and so on. (Note that the code below reads the work region from index 4 of the tab-split array; check the column indices against your actual file.)

Count the five regions with the most job postings

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object FirstTest {
  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("first")
    val context: SparkContext = new SparkContext(sparkConf)
    val value: RDD[String] = context.textFile("E:\\idea\\IdeaProjects\\Spark-Project\\Spark-Core\\src\\main\\resources\\cqbigdata_job.csv")

    // Approach 1: map each line to a (region, 1) pair, sum the pairs with
    // reduceByKey (which combines map-side before the shuffle), then sort
    // descending by count and take the first five.
    val value1: RDD[(String, Int)] = value.map(
      data => {
        val strings: Array[String] = data.split("\t")
        (strings(4), 1) // index 4 holds the work region in this file
      }
    )
    val value2: Array[(String, Int)] = value1.reduceByKey(_ + _).sortBy(_._2, false).take(5)
    value2.foreach(println)

    // Approach 2: groupByKey gathers every 1 per region (shuffling each
    // pair across the network), then the size of each group is the count.
    val value3: RDD[(String, Iterable[Int])] = value1.groupByKey()
    val value4: RDD[(String, Int)] = value3.mapValues(data => data.size)
    val value5 = value4.sortBy(_._2, false).take(5)
    value5.foreach(println)

    // Approach 3: countByKey is an action that returns the counts to the
    // driver as a Map, so it is only suitable for a small set of keys.
    val stringToLong: collection.Map[String, Long] = value1.countByKey()
    val tuples: List[(String, Long)] = stringToLong.toList.sortBy(_._2)(Ordering.Long.reverse).take(5)
    println(tuples)

    context.stop()
  }
}
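
As a side note, sortBy(...).take(5) sorts the entire RDD before taking five elements. RDD.top computes the same result by keeping only the largest candidates per partition and merging them on the driver, which skips the full sort. A minimal sketch, reusing value1 from above:

    // Alternative to sortBy(...).take(5): top() keeps the largest
    // elements per partition and merges them on the driver.
    val top5: Array[(String, Int)] = value1
      .reduceByKey(_ + _)
      .top(5)(Ordering.by[(String, Int), Int](_._2)) // rank pairs by their count
    top5.foreach(println)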

Compute each region's share of the total job postings


    // count() gives the total number of postings; toDouble keeps the
    // division below from being integer division (which would yield 0).
    val total = value1.count().toDouble
    val value6 = value1
      .reduceByKey(_ + _)
      .mapValues(_ / total) // each region's count as a fraction of the total
      .sortBy(_._2, false)

    value6.foreach(println)
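
If you want readable percentages instead of raw fractions, a mapValues with an f-interpolator formats them; a small sketch, reusing value6 from above:

    // Format each share as a percentage string with two decimals,
    // e.g. 0.2345 becomes "23.45%"
    val formatted = value6.mapValues(share => f"${share * 100}%.2f%%")
    formatted.foreach(println)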


Filter the job postings for the Chongqing region


    // Split each line into its columns once, then filter on the region
    // column (index 4) and print a few fields.
    val value7: RDD[Array[String]] = value.map(data => data.split("\t"))

    value7
      .filter(data => data(4) == "重庆")
      .foreach(data => println("Job title: " + data(1) + "\t" + "Region: " + data(4) + "\t" + "Salary: " + data(2)))
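
One caveat: RDD.foreach runs on the executors, so on a real cluster these printlns end up in the executors' stdout rather than the driver console (with local[*] you happen to see them). Collecting first prints on the driver; a sketch that assumes the Chongqing subset is small enough to fit in driver memory:

    // collect() pulls the filtered rows back to the driver, so the
    // println output appears in the driver console even on a cluster
    value7
      .filter(data => data(4) == "重庆")
      .collect()
      .foreach(data => println(s"Job title: ${data(1)}\tRegion: ${data(4)}\tSalary: ${data(2)}"))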

