Reading a CSV with Spark when a field value contains a JSON string

Requirement: count how many times each key appears across the JSON strings stored in the CSV.

CSV data:
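The original sample file is not reproduced here. Purely as an illustration, assuming a header row and a stat_json column holding a quoted JSON string (key names taken from the output below), a couple of rows might look like:

id,stat_json
1,"{""event"":""play"",""type"":""click"",""start"":1532001000,""end"":1532001060}"
2,"{""event"":""stop"",""type"":""click"",""start"":1532002000,""end"":1532002030}"

Note the doubled inner quotes: with escape set to ", a quote inside an already quoted value is escaped by doubling it, which is exactly what the read options below configure.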

Code:

package com.rm1024.scala

import com.alibaba.fastjson.JSON
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

import scala.collection.JavaConverters._

object JsonParserTest {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val spark = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    val path = "/Users/lijiayan/Desktop/bad7b804-6de6-4aa0-ba74-e7f485fbf41c.csv"
   
    // Note: the explicit "com.databricks.spark.csv" format is redundant on
    // Spark 2.x, where .csv(path) already selects the built-in csv source.
    spark.read
      .option("inferSchema", value = false)
      .option("header", value = true)
      .option("nullValue", "\\N")
      // Single character used for escaping quotes inside an already quoted value.
      // See the DataFrameReader docs:
      // https://spark.apache.org/docs/2.0.2/api/java/org/apache/spark/sql/DataFrameReader.html#option(java.lang.String,%20boolean)
      .option("escape", "\"")
      .option("quoteAll", "true")
      .option("sep", ",")
      .csv(path)
      .createTempView("test")
    
    // Only stat_json is needed downstream, so project just that column.
    val sql =
      """
        |select stat_json from test
      """.stripMargin

    spark.sql(sql).rdd
      .flatMap { row =>
        val jsonStr = row.getAs[String]("stat_json")
        // nullValue = "\N" can produce null cells; skip them so that
        // JSON.parseObject does not throw a NullPointerException.
        if (jsonStr == null || jsonStr.isEmpty) Iterator.empty
        else JSON.parseObject(jsonStr).keySet().asScala.iterator
      }
      .map((_, 1))
      .reduceByKey(_ + _)
      .foreach(println)
    
    spark.stop()
  }
}
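For comparison, the same count can be expressed without dropping to the RDD API. A minimal Dataset-based sketch, assuming the same stat_json column and fastjson on the classpath (the file path is a placeholder):

import com.alibaba.fastjson.JSON
import org.apache.spark.sql.SparkSession

import scala.collection.JavaConverters._

object JsonKeyCountDataset {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    val df = spark.read
      .option("header", value = true)
      .option("escape", "\"")
      .csv("/path/to/file.csv") // placeholder path

    df.select("stat_json").as[String]
      .filter(_ != null)
      .flatMap(JSON.parseObject(_).keySet().asScala)
      // A flatMap over a Dataset[String] yields a single column named "value".
      .groupBy("value")
      .count()
      .show(100, truncate = false)

    spark.stop()
  }
}

Staying in the Dataset API keeps the column names and lets Catalyst optimize the aggregation, at the cost of routing the per-row JSON parsing through an encoder.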

Output:
(kqid,11)
(event,100)
(stime,60)
(start,100)
(buynum,46)
(pn,8)
(source,32)
(type,100)
(skuid,46)
(price,46)
(code,21)
(end,100)
(filmid,53)
(kq,6)
(url,49)
(stoptime,60)
(binded,1)
(id,60)
(eventType,3)
(browser,17)
(eventName,3)
(ltime,60)
(receive,2)
(couponId,1)
