Contents
Example: operating on JSON strings
Extracting values from JSON: get_json_object
Converting query results back to JSON: to_json()
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SparkSession}
// Case class matching the (id, json-string) tuples built below
case class DeviceData(id: Int, device: String)
object jsonStu {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setAppName("activeDemo").setMaster("local[*]")
val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
val sc: SparkContext = spark.sparkContext
import spark.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
// Define a flat (non-nested) JSON schema (applied in the from_json sketch after this object)
val jsonSchema: StructType = new StructType()
.add("battery_level", LongType)
.add("c02_level", LongType)
.add("cca3", StringType)
.add("cn", StringType)
.add("device_id", LongType)
.add("device_type", StringType)
.add("signal", LongType)
.add("ip", StringType)
.add("temp", LongType)
.add("timestamp", TimestampType)
// Turn the JSON strings into a DataFrame of (id, json) rows
val eventsFromJSONDF = Seq(
(0, """{"device_id": 0, "device_type": "sensor-ipad", "ip": "68.161.225.1", "cca3": "USA", "cn": "United States", "temp": 25, "signal": 23, "battery_level": 8, "c02_level": 917, "timestamp" :1475600496 }"""),
(1, """{"device_id": 1, "device_type": "sensor-igauge", "ip": "213.161.254.1", "cca3": "NOR", "cn": "Norway", "temp": 30, "signal": 18, "battery_level": 6, "c02_level": 1413, "timestamp" :1475600498 }"""),
(2, """{"device_id": 2, "device_type": "sensor-ipad", "ip": "88.36.5.1", "cca3": "ITA", "cn": "Italy", "temp": 18, "signal": 25, "battery_level": 5, "c02_level": 1372, "timestamp" :1475600500 }"""),
(3, """{"device_id": 3, "device_type": "sensor-inest", "ip": "66.39.173.154", "cca3": "USA", "cn": "United States", "temp": 47, "signal": 12, "battery_level": 1, "c02_level": 1447, "timestamp" :1475600502 }"""),
(4, """{"device_id": 4, "device_type": "sensor-ipad", "ip": "203.82.41.9", "cca3": "PHL", "cn": "Philippines", "temp": 29, "signal": 11, "battery_level": 0, "c02_level": 983, "timestamp" :1475600504 }"""),
(5, """{"device_id": 5, "device_type": "sensor-istick", "ip": "204.116.105.67", "cca3": "USA", "cn": "United States", "temp": 50, "signal": 16, "battery_level": 8, "c02_level": 1574, "timestamp" :1475600506 }"""),
(6, """{"device_id": 6, "device_type": "sensor-ipad", "ip": "220.173.179.1", "cca3": "CHN", "cn": "China", "temp": 21, "signal": 18, "battery_level": 9, "c02_level": 1249, "timestamp" :1475600508 }"""),
(7, """{"device_id": 7, "device_type": "sensor-ipad", "ip": "118.23.68.227", "cca3": "JPN", "cn": "Japan", "temp": 27, "signal": 15, "battery_level": 0, "c02_level": 1531, "timestamp" :1475600512 }"""),
(8, """ {"device_id": 8, "device_type": "sensor-inest", "ip": "208.109.163.218", "cca3": "USA", "cn": "United States", "temp": 40, "signal": 16, "battery_level": 9, "c02_level": 1208, "timestamp" :1475600514 }"""),
(9, """{"device_id": 9, "device_type": "sensor-ipad", "ip": "88.213.191.34", "cca3": "ITA", "cn": "Italy", "temp": 19, "signal": 11, "battery_level": 0, "c02_level": 1171, "timestamp" :1475600516 }""")).toDF("id", "json")
eventsFromJSONDF.printSchema()
eventsFromJSONDF.show(false)
// Extract fields with get_json_object; note that every extracted column comes back as a string
val frame: DataFrame = eventsFromJSONDF.select(
get_json_object($"json", "$.device_id").as("device_id"),
get_json_object($"json", "$.device_type").as("device_type"),
get_json_object($"json", "$.ip").as("ip"),
get_json_object($"json", "$.ip").as("ip"),
get_json_object($"json", "$.cca3").as("cca3"),
get_json_object($"json", "$.cn").as("cn"),
get_json_object($"json", "$.temp").as("temp"),
get_json_object($"json", "$.signal").as("signal"),
get_json_object($"json", "$.battery_level").as("battery_level"),
get_json_object($"json", "$.c02_level").as("c02_level"),
get_json_object($"json", "$.timestamp").as("timestamp"),
)
frame.printSchema()
frame.show(false)
spark.stop() // stopping the session also stops the underlying SparkContext
}
}
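get_json_object always returns string columns. To parse into typed columns in one pass, the jsonSchema defined above can be applied with from_json instead; a minimal sketch, reusing eventsFromJSONDF and jsonSchema from jsonStu:

// Parse each JSON string against the schema, then flatten the struct into typed top-level columns
val typedDF: DataFrame = eventsFromJSONDF
  .select(from_json($"json", jsonSchema).as("device"))
  .select($"device.*")
typedDF.printSchema() // numeric fields come back as LongType; timestamp (epoch seconds) as TimestampType
typedDF.show(false)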
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SparkSession}
object JsonStu2 {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setAppName("activeDemo").setMaster("local[*]")
val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
val sc: SparkContext = spark.sparkContext
import spark.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
// Schema for the device JSON; only the commented-out from_json example below uses it
val jsonSchema: StructType = new StructType()
.add("battery_level", LongType)
.add("c02_level", LongType)
.add("cca3", StringType)
.add("cn", StringType)
.add("device_id", LongType)
.add("device_type", StringType)
.add("signal", LongType)
.add("ip", StringType)
.add("temp", LongType)
.add("timestamp", TimestampType)
val eventsDS = Seq(
(0, """{"device_id": 0, "device_type": "sensor-ipad", "ip": "68.161.225.1", "cca3": "USA", "cn": "United States", "temp": 25, "signal": 23, "battery_level": 8, "c02_level": 917, "timestamp" :1475600496 }"""),
(1, """{"device_id": 1, "device_type": "sensor-igauge", "ip": "213.161.254.1", "cca3": "NOR", "cn": "Norway", "temp": 30, "signal": 18, "battery_level": 6, "c02_level": 1413, "timestamp" :1475600498 }"""),
(2, """{"device_id": 2, "device_type": "sensor-ipad", "ip": "88.36.5.1", "cca3": "ITA", "cn": "Italy", "temp": 18, "signal": 25, "battery_level": 5, "c02_level": 1372, "timestamp" :1475600500 }"""),
(3, """{"device_id": 3, "device_type": "sensor-inest", "ip": "66.39.173.154", "cca3": "USA", "cn": "United States", "temp": 47, "signal": 12, "battery_level": 1, "c02_level": 1447, "timestamp" :1475600502 }"""),
(4, """{"device_id": 4, "device_type": "sensor-ipad", "ip": "203.82.41.9", "cca3": "PHL", "cn": "Philippines", "temp": 29, "signal": 11, "battery_level": 0, "c02_level": 983, "timestamp" :1475600504 }"""),
(5, """{"device_id": 5, "device_type": "sensor-istick", "ip": "204.116.105.67", "cca3": "USA", "cn": "United States", "temp": 50, "signal": 16, "battery_level": 8, "c02_level": 1574, "timestamp" :1475600506 }"""),
(6, """{"device_id": 6, "device_type": "sensor-ipad", "ip": "220.173.179.1", "cca3": "CHN", "cn": "China", "temp": 21, "signal": 18, "battery_level": 9, "c02_level": 1249, "timestamp" :1475600508 }"""),
(7, """{"device_id": 7, "device_type": "sensor-ipad", "ip": "118.23.68.227", "cca3": "JPN", "cn": "Japan", "temp": 27, "signal": 15, "battery_level": 0, "c02_level": 1531, "timestamp" :1475600512 }"""),
(8, """ {"device_id": 8, "device_type": "sensor-inest", "ip": "208.109.163.218", "cca3": "USA", "cn": "United States", "temp": 40, "signal": 16, "battery_level": 9, "c02_level": 1208, "timestamp" :1475600514 }"""),
(9, """{"device_id": 9, "device_type": "sensor-ipad", "ip": "88.213.191.34", "cca3": "ITA", "cn": "Italy", "temp": 19, "signal": 11, "battery_level": 0, "c02_level": 1171, "timestamp" :1475600516 }"""),
(10, """{"device_id": 10, "device_type": "sensor-igauge", "ip": "68.28.91.22", "cca3": "USA", "cn": "United States", "temp": 32, "signal": 26, "battery_level": 7, "c02_level": 886, "timestamp" :1475600518 }"""),
(11, """{"device_id": 11, "device_type": "sensor-ipad", "ip": "59.144.114.250", "cca3": "IND", "cn": "India", "temp": 46, "signal": 25, "battery_level": 4, "c02_level": 863, "timestamp" :1475600520 }"""),
(12, """{"device_id": 12, "device_type": "sensor-igauge", "ip": "193.156.90.200", "cca3": "NOR", "cn": "Norway", "temp": 18, "signal": 26, "battery_level": 8, "c02_level": 1220, "timestamp" :1475600522 }"""),
(13, """{"device_id": 13, "device_type": "sensor-ipad", "ip": "67.185.72.1", "cca3": "USA", "cn": "United States", "temp": 34, "signal": 20, "battery_level": 8, "c02_level": 1504, "timestamp" :1475600524 }"""),
(14, """{"device_id": 14, "device_type": "sensor-inest", "ip": "68.85.85.106", "cca3": "USA", "cn": "United States", "temp": 39, "signal": 17, "battery_level": 8, "c02_level": 831, "timestamp" :1475600526 }"""),
(15, """{"device_id": 15, "device_type": "sensor-ipad", "ip": "161.188.212.254", "cca3": "USA", "cn": "United States", "temp": 27, "signal": 26, "battery_level": 5, "c02_level": 1378, "timestamp" :1475600528 }"""),
(16, """{"device_id": 16, "device_type": "sensor-igauge", "ip": "221.3.128.242", "cca3": "CHN", "cn": "China", "temp": 10, "signal": 24, "battery_level": 6, "c02_level": 1423, "timestamp" :1475600530 }"""),
(17, """{"device_id": 17, "device_type": "sensor-ipad", "ip": "64.124.180.215", "cca3": "USA", "cn": "United States", "temp": 38, "signal": 17, "battery_level": 9, "c02_level": 1304, "timestamp" :1475600532 }"""),
(18, """{"device_id": 18, "device_type": "sensor-igauge", "ip": "66.153.162.66", "cca3": "USA", "cn": "United States", "temp": 26, "signal": 10, "battery_level": 0, "c02_level": 902, "timestamp" :1475600534 }"""),
(19, """{"device_id": 19, "device_type": "sensor-ipad", "ip": "193.200.142.254", "cca3": "AUT", "cn": "Austria", "temp": 32, "signal": 27, "battery_level": 5, "c02_level": 1282, "timestamp" :1475600536 }""")).toDF("id", "device").as[DeviceData]
eventsDS.printSchema()
eventsDS.show()
println("------------------------------")
// Use to_json() to turn each row back into a JSON string, e.g. before writing the result to Kafka or saving it as a Parquet file
val frameJsonStr: DataFrame = eventsDS.select(to_json(struct($"*"))).toDF("devices")
frameJsonStr.printSchema()
frameJsonStr.show(3, false)
println("---------------------------------")
// selectExpr takes SQL expressions; device is already a string here, so the CAST is a no-op
val frame2: DataFrame = eventsDS.selectExpr("CAST(device as String)")
frame2.printSchema()
frame2.show()
println("-------------------------------------")
// Multiple expressions can be passed at once
val frame3: DataFrame = eventsDS.selectExpr("CAST(id as Int)", "CAST(device as String)")
frame3.printSchema()
frame3.show()
/*val frame: DataFrame = eventsDS.select(from_json($"device", jsonSchema) as "devices")
// .select($"devices.battery_level",$"devices.ip")
.select($"devices.*")
frame.printSchema()
frame.show(false)*/
spark.stop() // stopping the session also stops the underlying SparkContext
}
}
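As the comment in JsonStu2 notes, the JSON strings produced by to_json are typically handed to a sink. A minimal sketch of both options (the output path, broker address, and topic are hypothetical placeholders; the Kafka write also needs the spark-sql-kafka connector on the classpath):

// Save the JSON strings as Parquet (path is a placeholder)
frameJsonStr.write.mode("overwrite").parquet("out/devices_json")

// Or publish to Kafka; the Kafka sink expects a string "value" column
frameJsonStr.selectExpr("CAST(devices AS STRING) AS value")
  .write
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092") // hypothetical broker
  .option("topic", "devices")                          // hypothetical topic
  .save()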
Handling nested structures (solved with a schema for the nested array plus the explode function):
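JsonStu3op reads in/op.log, where each line is an id and a JSON payload separated by '|'. A hypothetical sample line (values invented; field names taken from the code below):

1|{"cm":{"ln":-55.0,"sv":"V2.9.6","os":"8.0.4","g":"test@gmail.com","mid":489,"nw":"3G","l":"es","vc":4,"hw":"640*960","ar":"MX","uid":489,"t":"1593123253541","la":5.2,"md":"sumsung-18","vn":"1.3.4","ba":"Sumsung","sr":"I"},"ap":"app","et":[{"ett":"1593050051366","en":"loading","kv":{"action":"1"}}]}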
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}

object JsonStu3op {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setAppName("activeDemo").setMaster("local[*]")
val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
val sc: SparkContext = spark.sparkContext
import spark.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
val optionRDD: RDD[String] = sc.textFile("in/op.log")
optionRDD.foreach(println)
// Each line looks like id|jsonPayload, so split on '|' into an (id, jsonString) pair
val option1: RDD[(String, String)] = optionRDD.map(x => {
  val arr: Array[String] = x.split('|')
  (arr(0), arr(1))
})
option1.foreach(println)
// val jsonStr: RDD[String] = option1.map(x => x._2)
val jsonStrDF: DataFrame = option1.toDF("id", "value")
jsonStrDF.printSchema()
jsonStrDF.show(false)
// First level of extraction: pull the three top-level fields out of the raw JSON as strings
val jsonObj: DataFrame = jsonStrDF.select($"id"
  , get_json_object($"value", "$.cm").as("cm")
  , get_json_object($"value", "$.ap").as("ap")
  , get_json_object($"value", "$.et").as("et")
)
jsonObj.printSchema()
jsonObj.show(false)
// Second level: flatten the cm object into individual columns
val jsonobj2: DataFrame = jsonObj.select($"id", $"ap"
, get_json_object($"cm", "$.ln").cast(DoubleType).as("ln")
, get_json_object($"cm", "$.sv").as("sv")
, get_json_object($"cm", "$.os").as("os")
, get_json_object($"cm", "$.g").as("g")
, get_json_object($"cm", "$.mid").cast(IntegerType).as("mid") //使用cast来转换类型
, get_json_object($"cm", "$.nw").as("nw")
, get_json_object($"cm", "$.1").as("1")
, get_json_object($"cm", "$.vc").cast(IntegerType).as("vc")
, get_json_object($"cm", "$.hw").as("hw")
, get_json_object($"cm", "$.ar").as("ar")
, get_json_object($"cm", "$.uid").cast(IntegerType).as("uid")
, get_json_object($"cm", "$.t").as("t")
, get_json_object($"cm", "$.la").cast(DoubleType).as("la")
, get_json_object($"cm", "$.md").as("md")
, get_json_object($"cm", "$.vn").as("vn")
, get_json_object($"cm", "$.ba").as("ba")
, get_json_object($"cm", "$.sr").as("sr")
, $"et"
)
jsonobj2.printSchema()
jsonobj2.show()
// Handle the nested structure: define a schema for the elements of the et array
val jsonStrObj3: DataFrame = jsonobj2.select(
$"id", $"ap"
, $"sv", $"os", $"g", $"mid", $"nw", $"1", $"vc", $"hw", $"ar", $"uid", $"t", $"la", $"md", $"vn", $"ba", $"sr"
, from_json($"et", ArrayType(
StructType(
StructField("ett", StringType) :: StructField("en", StringType) :: StructField("kv", StringType) :: Nil
)
)
).as("events")
)
jsonStrObj3.printSchema()
jsonStrObj3.show(false)
// Explode the events array so that each element becomes its own row, which makes the data easy to store
val jsonobj4: DataFrame = jsonStrObj3.withColumn("events", explode($"events"))
jsonobj4.printSchema()
jsonobj4.show(false)
// Pull the struct fields out into columns; withColumn lets you choose the column names
println("------------jsonobj5--------------------")
val jsonobj5: DataFrame = jsonobj4.withColumn("ett", $"events.ett")
.withColumn("en", $"events.en")
.withColumn("kv", $"events.kv")
.drop("events")
jsonobj5.printSchema()
jsonobj5.show()
// Split the rows by event type (en) into separate DataFrames, one per target table
val loadDF: Dataset[Row] = jsonobj5.filter($"en" === "loading")
val adDF: Dataset[Row] = jsonobj5.filter($"en" === "ad")
val notifDF: Dataset[Row] = jsonobj5.filter($"en" === "notification")
val actDF: Dataset[Row] = jsonobj5.filter($"en" === "active_background")
val commentDF: Dataset[Row] = jsonobj5.filter($"en" === "comment")
val praiseDF: Dataset[Row] = jsonobj5.filter($"en" === "praise")
println("--------------------------")
loadDF.show()
// jdbcUtils.dataFrameToMysql(loadDF,jdbcUtils.table_loading_json,1)
println("--------------------------")
adDF.show()
// jdbcUtils.dataFrameToMysql(adDF,jdbcUtils.table_ad_json,1)
println("--------------------------")
notifDF.show()
// jdbcUtils.dataFrameToMysql(notifDF,jdbcUtils.table_notification_json,1)
println("--------------------------")
actDF.show()
// jdbcUtils.dataFrameToMysql(actDF,jdbcUtils.table_active_background_json,1)
println("--------------------------")
commentDF.show()
// jdbcUtils.dataFrameToMysql(commentDF,jdbcUtils.table_comment_json,1)
println("--------------------------")
praiseDF.show()
// jdbcUtils.dataFrameToMysql(praiseDF,jdbcUtils.table_praise_json,1)
spark.stop() // stopping the session also stops the SparkContext
}
}
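The commented-out jdbcUtils.dataFrameToMysql calls suggest each per-event DataFrame is persisted to MySQL. A minimal sketch with Spark's built-in JDBC writer instead (URL, credentials, and table name are hypothetical placeholders, not the original jdbcUtils API):

// Hypothetical connection details; replace with real ones
val props = new java.util.Properties()
props.setProperty("user", "root")
props.setProperty("password", "secret")
props.setProperty("driver", "com.mysql.cj.jdbc.Driver")

// Append the loading events to a MySQL table (database/table names are placeholders)
loadDF.write
  .mode("append")
  .jdbc("jdbc:mysql://localhost:3306/events_db", "table_loading_json", props)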