Spark 对 JSON 复杂和嵌套数据结构的操作

目录

示例:对json字符串进行操作

对应功能的使用:从Json取值-----get_json_object

 to_json()将获取的数据转化为json格式


示例:对json字符串进行操作

对应功能的使用:从Json取值-----get_json_object

import org.apache.spark.{SPARK_REVISION, SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

/**
 * A single device record: a numeric row id paired with the device payload.
 *
 * @param id     row identifier
 * @param device device payload; `JsonStu2` populates it with a raw JSON string
 */
case class DeviceData(
  id: Int,
  device: String
)

/**
 * Demo: extract individual fields from raw JSON strings with `get_json_object`.
 *
 * Builds a small in-memory DataFrame of `(id, json)` rows and projects every top-level JSON
 * attribute into its own column. `get_json_object` returns the extracted value as a string,
 * so the resulting columns are all string-typed.
 */
object jsonStu {
  /**
   * Runs the demo on a local Spark session.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("activeDemo").setMaster("local[*]")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()

    import spark.implicits._
    import org.apache.spark.sql.functions._

    // try/finally so the session is released even when one of the steps below throws.
    try {
      // Wrap the raw JSON strings in a two-column DataFrame: (id, json).
      val eventsFromJSONDF = Seq(
        (0, """{"device_id": 0, "device_type": "sensor-ipad", "ip": "68.161.225.1", "cca3": "USA", "cn": "United States", "temp": 25, "signal": 23, "battery_level": 8, "c02_level": 917, "timestamp" :1475600496 }"""),
        (1, """{"device_id": 1, "device_type": "sensor-igauge", "ip": "213.161.254.1", "cca3": "NOR", "cn": "Norway", "temp": 30, "signal": 18, "battery_level": 6, "c02_level": 1413, "timestamp" :1475600498 }"""),
        (2, """{"device_id": 2, "device_type": "sensor-ipad", "ip": "88.36.5.1", "cca3": "ITA", "cn": "Italy", "temp": 18, "signal": 25, "battery_level": 5, "c02_level": 1372, "timestamp" :1475600500 }"""),
        (3, """{"device_id": 3, "device_type": "sensor-inest", "ip": "66.39.173.154", "cca3": "USA", "cn": "United States", "temp": 47, "signal": 12, "battery_level": 1, "c02_level": 1447, "timestamp" :1475600502 }"""),
        (4, """{"device_id": 4, "device_type": "sensor-ipad", "ip": "203.82.41.9", "cca3": "PHL", "cn": "Philippines", "temp": 29, "signal": 11, "battery_level": 0, "c02_level": 983, "timestamp" :1475600504 }"""),
        (5, """{"device_id": 5, "device_type": "sensor-istick", "ip": "204.116.105.67", "cca3": "USA", "cn": "United States", "temp": 50, "signal": 16, "battery_level": 8, "c02_level": 1574, "timestamp" :1475600506 }"""),
        (6, """{"device_id": 6, "device_type": "sensor-ipad", "ip": "220.173.179.1", "cca3": "CHN", "cn": "China", "temp": 21, "signal": 18, "battery_level": 9, "c02_level": 1249, "timestamp" :1475600508 }"""),
        (7, """{"device_id": 7, "device_type": "sensor-ipad", "ip": "118.23.68.227", "cca3": "JPN", "cn": "Japan", "temp": 27, "signal": 15, "battery_level": 0, "c02_level": 1531, "timestamp" :1475600512 }"""),
        (8, """ {"device_id": 8, "device_type": "sensor-inest", "ip": "208.109.163.218", "cca3": "USA", "cn": "United States", "temp": 40, "signal": 16, "battery_level": 9, "c02_level": 1208, "timestamp" :1475600514 }"""),
        (9, """{"device_id": 9, "device_type": "sensor-ipad", "ip": "88.213.191.34", "cca3": "ITA", "cn": "Italy", "temp": 19, "signal": 11, "battery_level": 0, "c02_level": 1171, "timestamp" :1475600516 }""")
      ).toDF("id", "json")

      eventsFromJSONDF.printSchema()
      eventsFromJSONDF.show(false)

      // Extract each JSON attribute by path. Every attribute is projected exactly once so the
      // result has no duplicate column names.
      val frame: DataFrame = eventsFromJSONDF.select(
        get_json_object($"json", "$.device_id").as("device_id"),
        get_json_object($"json", "$.device_type").as("device_type"),
        get_json_object($"json", "$.ip").as("ip"),
        get_json_object($"json", "$.cca3").as("cca3"),
        get_json_object($"json", "$.cn").as("cn"),
        get_json_object($"json", "$.temp").as("temp"),
        get_json_object($"json", "$.signal").as("signal"),
        get_json_object($"json", "$.battery_level").as("battery_level"),
        get_json_object($"json", "$.c02_level").as("c02_level"),
        get_json_object($"json", "$.timestamp").as("timestamp")
      )
      frame.printSchema()
      frame.show(false)
    } finally {
      // Stopping the session also stops its SparkContext; no separate sc.stop() is needed.
      spark.stop()
    }
  }

}

 to_json()将获取的数据转化为json格式

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Demo: serialize Dataset rows back into JSON strings with `to_json`, plus column
 * casting through `selectExpr`.
 */
object JsonStu2 {
  /**
   * Runs the demo on a local Spark session.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("activeDemo").setMaster("local[*]")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()

    import spark.implicits._
    import org.apache.spark.sql.functions._
    import org.apache.spark.sql.types._

    // try/finally so the session is released even when one of the steps below throws.
    try {
      // Typed Dataset of (id, device) where `device` holds the raw JSON string.
      val eventsDS = Seq(
        (0, """{"device_id": 0, "device_type": "sensor-ipad", "ip": "68.161.225.1", "cca3": "USA", "cn": "United States", "temp": 25, "signal": 23, "battery_level": 8, "c02_level": 917, "timestamp" :1475600496 }"""),
        (1, """{"device_id": 1, "device_type": "sensor-igauge", "ip": "213.161.254.1", "cca3": "NOR", "cn": "Norway", "temp": 30, "signal": 18, "battery_level": 6, "c02_level": 1413, "timestamp" :1475600498 }"""),
        (2, """{"device_id": 2, "device_type": "sensor-ipad", "ip": "88.36.5.1", "cca3": "ITA", "cn": "Italy", "temp": 18, "signal": 25, "battery_level": 5, "c02_level": 1372, "timestamp" :1475600500 }"""),
        (3, """{"device_id": 3, "device_type": "sensor-inest", "ip": "66.39.173.154", "cca3": "USA", "cn": "United States", "temp": 47, "signal": 12, "battery_level": 1, "c02_level": 1447, "timestamp" :1475600502 }"""),
        (4, """{"device_id": 4, "device_type": "sensor-ipad", "ip": "203.82.41.9", "cca3": "PHL", "cn": "Philippines", "temp": 29, "signal": 11, "battery_level": 0, "c02_level": 983, "timestamp" :1475600504 }"""),
        (5, """{"device_id": 5, "device_type": "sensor-istick", "ip": "204.116.105.67", "cca3": "USA", "cn": "United States", "temp": 50, "signal": 16, "battery_level": 8, "c02_level": 1574, "timestamp" :1475600506 }"""),
        (6, """{"device_id": 6, "device_type": "sensor-ipad", "ip": "220.173.179.1", "cca3": "CHN", "cn": "China", "temp": 21, "signal": 18, "battery_level": 9, "c02_level": 1249, "timestamp" :1475600508 }"""),
        (7, """{"device_id": 7, "device_type": "sensor-ipad", "ip": "118.23.68.227", "cca3": "JPN", "cn": "Japan", "temp": 27, "signal": 15, "battery_level": 0, "c02_level": 1531, "timestamp" :1475600512 }"""),
        (8, """ {"device_id": 8, "device_type": "sensor-inest", "ip": "208.109.163.218", "cca3": "USA", "cn": "United States", "temp": 40, "signal": 16, "battery_level": 9, "c02_level": 1208, "timestamp" :1475600514 }"""),
        (9, """{"device_id": 9, "device_type": "sensor-ipad", "ip": "88.213.191.34", "cca3": "ITA", "cn": "Italy", "temp": 19, "signal": 11, "battery_level": 0, "c02_level": 1171, "timestamp" :1475600516 }"""),
        (10, """{"device_id": 10, "device_type": "sensor-igauge", "ip": "68.28.91.22", "cca3": "USA", "cn": "United States", "temp": 32, "signal": 26, "battery_level": 7, "c02_level": 886, "timestamp" :1475600518 }"""),
        (11, """{"device_id": 11, "device_type": "sensor-ipad", "ip": "59.144.114.250", "cca3": "IND", "cn": "India", "temp": 46, "signal": 25, "battery_level": 4, "c02_level": 863, "timestamp" :1475600520 }"""),
        (12, """{"device_id": 12, "device_type": "sensor-igauge", "ip": "193.156.90.200", "cca3": "NOR", "cn": "Norway", "temp": 18, "signal": 26, "battery_level": 8, "c02_level": 1220, "timestamp" :1475600522 }"""),
        (13, """{"device_id": 13, "device_type": "sensor-ipad", "ip": "67.185.72.1", "cca3": "USA", "cn": "United States", "temp": 34, "signal": 20, "battery_level": 8, "c02_level": 1504, "timestamp" :1475600524 }"""),
        (14, """{"device_id": 14, "device_type": "sensor-inest", "ip": "68.85.85.106", "cca3": "USA", "cn": "United States", "temp": 39, "signal": 17, "battery_level": 8, "c02_level": 831, "timestamp" :1475600526 }"""),
        (15, """{"device_id": 15, "device_type": "sensor-ipad", "ip": "161.188.212.254", "cca3": "USA", "cn": "United States", "temp": 27, "signal": 26, "battery_level": 5, "c02_level": 1378, "timestamp" :1475600528 }"""),
        (16, """{"device_id": 16, "device_type": "sensor-igauge", "ip": "221.3.128.242", "cca3": "CHN", "cn": "China", "temp": 10, "signal": 24, "battery_level": 6, "c02_level": 1423, "timestamp" :1475600530 }"""),
        (17, """{"device_id": 17, "device_type": "sensor-ipad", "ip": "64.124.180.215", "cca3": "USA", "cn": "United States", "temp": 38, "signal": 17, "battery_level": 9, "c02_level": 1304, "timestamp" :1475600532 }"""),
        (18, """{"device_id": 18, "device_type": "sensor-igauge", "ip": "66.153.162.66", "cca3": "USA", "cn": "United States", "temp": 26, "signal": 10, "battery_level": 0, "c02_level": 902, "timestamp" :1475600534 }"""),
        (19, """{"device_id": 19, "device_type": "sensor-ipad", "ip": "193.200.142.254", "cca3": "AUT", "cn": "Austria", "temp": 32, "signal": 27, "battery_level": 5, "c02_level": 1282, "timestamp" :1475600536 }""")
      ).toDF("id", "device").as[DeviceData]

      eventsDS.printSchema()
      eventsDS.show()
      println("------------------------------")

      // Pack every column of a row into a struct and serialize it as one JSON string, e.g.
      // before writing the result back to Kafka or saving it as a Parquet file.
      val frameJsonStr: DataFrame = eventsDS.select(to_json(struct($"*"))).toDF("devices")
      frameJsonStr.printSchema()
      frameJsonStr.show(3, false)

      println("---------------------------------")
      // selectExpr with a single SQL cast expression.
      val frame2: DataFrame = eventsDS.selectExpr("CAST(device as String)")
      frame2.printSchema()
      frame2.show()

      println("-------------------------------------")
      // selectExpr with several expressions at once.
      val frame3: DataFrame = eventsDS.selectExpr("CAST(id as Int)", "CAST(device as String)")
      frame3.printSchema()
      frame3.show()

      // Flat (non-nested) schema of the device JSON. It is only consumed by the optional
      // from_json example below, which is currently disabled.
      val jsonSchema: StructType = new StructType()
        .add("battery_level", LongType)
        .add("c02_level", LongType)
        .add("cca3", StringType)
        .add("cn", StringType)
        .add("device_id", LongType)
        .add("device_type", StringType)
        .add("signal", LongType)
        .add("ip", StringType)
        .add("temp", LongType)
        .add("timestamp", TimestampType)

      /*val frame: DataFrame = eventsDS.select(from_json($"device", jsonSchema) as "devices")
//      .select($"devices.battery_level",$"devices.ip")
        .select($"devices.*")

      frame.printSchema()
      frame.show(false)*/
    } finally {
      // Stopping the session also stops its SparkContext; no separate sc.stop() is needed.
      spark.stop()
    }
  }

}

 对嵌套结构的处理(使用样例类和炸裂函数来解决):

/**
 * Demo: flatten a nested JSON log into one row per event.
 *
 * Each line of `in/op.log` is expected to look like `id|json`. The JSON is read in stages:
 * the top-level `cm`, `ap` and `et` attributes are extracted first, then the fields of `cm`,
 * and finally the `et` array is parsed with `from_json` and exploded into one row per event.
 */
object JsonStu3op {
  /**
   * Runs the demo on a local Spark session.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("activeDemo").setMaster("local[*]")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext

    import spark.implicits._
    import org.apache.spark.sql.functions._
    import org.apache.spark.sql.types._

    // try/finally so the session is released even when one of the steps below throws.
    try {
      val rawLines = sc.textFile("in/op.log")
      rawLines.foreach(println)

      // Split on the FIRST '|' only, so a '|' inside the JSON payload is preserved, and skip
      // lines that contain no separator instead of failing on a missing second element.
      val idAndJson = rawLines
        .map(_.split("\\|", 2))
        .collect { case Array(id, json) => (id, json) }
      idAndJson.foreach(println)

      val jsonStrDF: DataFrame = idAndJson.toDF("id", "value")
      jsonStrDF.printSchema()
      jsonStrDF.show(false)

      // Stage 1: pull the three top-level attributes out of the raw JSON string.
      val jsonObj: DataFrame = jsonStrDF.select(
        $"id",
        get_json_object($"value", "$.cm").as("cm"),
        get_json_object($"value", "$.ap").as("ap"),
        get_json_object($"value", "$.et").as("et")
      )
      jsonObj.printSchema()
      jsonObj.show(false)

      // Reads one attribute of the `cm` JSON object as a string column.
      def cmField(name: String) = get_json_object($"cm", "$." + name)

      // Stage 2: expand the `cm` object into individual columns; cast() converts the extracted
      // strings to numeric types where needed.
      val jsonobj2: DataFrame = jsonObj.select(
        $"id",
        $"ap",
        cmField("ln").cast(DoubleType).as("ln"),
        cmField("sv").as("sv"),
        cmField("os").as("os"),
        cmField("g").as("g"),
        cmField("mid").cast(IntegerType).as("mid"),
        cmField("nw").as("nw"),
        // NOTE(review): this key is the digit "1". If the log actually uses the letter "l",
        // this column is always null -- confirm against in/op.log before changing it.
        cmField("1").as("1"),
        cmField("vc").cast(IntegerType).as("vc"),
        cmField("hw").as("hw"),
        cmField("ar").as("ar"),
        cmField("uid").cast(IntegerType).as("uid"),
        cmField("t").as("t"),
        cmField("la").cast(DoubleType).as("la"),
        cmField("md").as("md"),
        cmField("vn").as("vn"),
        cmField("ba").as("ba"),
        cmField("sr").as("sr"),
        $"et"
      )
      jsonobj2.printSchema()
      jsonobj2.show()

      // Stage 3: parse the `et` JSON array into an array of (ett, en, kv) structs.
      val eventSchema = ArrayType(
        StructType(
          Seq(
            StructField("ett", StringType),
            StructField("en", StringType),
            StructField("kv", StringType)
          )
        )
      )
      // withColumn + drop keeps every column extracted above (including `ln`, which a
      // hand-written column list previously left out) and replaces raw `et` with `events`.
      val jsonStrObj3: DataFrame = jsonobj2
        .withColumn("events", from_json($"et", eventSchema))
        .drop("et")
      jsonStrObj3.printSchema()
      jsonStrObj3.show(false)

      // Stage 4: explode the array so that every event gets its own row.
      val jsonobj4: DataFrame = jsonStrObj3.withColumn("events", explode($"events"))
      jsonobj4.printSchema()
      jsonobj4.show(false)

      // Stage 5: lift the struct fields to top-level columns and drop the struct itself.
      println("------------jsonobj5--------------------")
      val jsonobj5: DataFrame = jsonobj4
        .withColumn("ett", $"events.ett")
        .withColumn("en", $"events.en")
        .withColumn("kv", $"events.kv")
        .drop("events")
      jsonobj5.printSchema()
      jsonobj5.show()

      // Show the rows of each event type in turn. Each subset could be persisted at this
      // point, e.g. with a project-specific JDBC helper (not part of this file).
      Seq("loading", "ad", "notification", "active_background", "comment", "praise")
        .foreach { eventName =>
          println("--------------------------")
          jsonobj5.filter($"en" === eventName).show()
        }
    } finally {
      // Stopping the session also stops its SparkContext; no separate sc.stop() is needed.
      spark.stop()
    }
  }

}

你可能感兴趣的:(spark,json,数据结构)