// Scala Spark: manually building a DataFrame with complex types (ArrayType, StructType)
package biReportJob.zt
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructType, _}
import org.apache.spark.sql.{Row, SparkSession}
/**
 * Demo job: builds a small DataFrame by hand from an `RDD[Row]` plus an explicit
 * schema, attaches a JSON string column, and parses that string into a nested
 * array-of-struct column with `from_json`.
 */
object TestMain {
  // Runs once when the object is initialised: restrict Spark's own logging to WARN and above.
  Logger.getLogger("org.apache.spark").setLevel(Level.WARN)

  /**
   * Entry point. Creates a local Hive-enabled session, runs the demo and prints
   * the schema and contents of each intermediate DataFrame to stdout.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").
      config("hive.metastore.uris", "thrift://hdp02:9083").
      appName("OrderReceivedAmountCheckTestAppend").
      enableHiveSupport().getOrCreate()

    // Stop the session even if the demo throws, so the SparkContext is not leaked.
    try {
      import spark.implicits._

      // Explicit schema for the hand-built rows.
      // Alternative, building an all-string schema from a field-name string:
      //   "name age".split(" ").map(fieldName => StructField(fieldName, StringType, nullable = true))
      val fields = Array(
        StructField("name", StringType, nullable = true),
        StructField("age", IntegerType, nullable = true)
      )
      // Array.toString only yields the JVM identity string, so render the elements explicitly.
      println(fields.mkString("Array(", ", ", ")"))
      val schema = StructType(fields)

      val rowsRdd: RDD[Row] = spark.sparkContext.parallelize(
        Seq(
          Row("xiaoming", 30),
          Row("ling", 28)
        )
      )

      // Pair the rows with the schema to obtain a DataFrame.
      val peopleDataFrame = spark.createDataFrame(rowsRdd, schema)

      // UDF that ignores its input and always returns the same JSON document.
      val sampleJson =
        """[{"k":"kkk","v":100,"animal_interpretation":{"is_large_animal":true,"is_mammal":false}}]"""
      val constantJsonUdf = udf((_: Any) => sampleJson)

      // Schema of the JSON document: array<struct<k, v, animal_interpretation: struct<...>>>.
      val animalInterpretationType = StructType(Seq(
        StructField("is_large_animal", BooleanType, nullable = true),
        StructField("is_mammal", BooleanType, nullable = true)
      ))
      val arrayStruct = ArrayType(
        StructType(Seq(
          StructField("k", StringType, nullable = true),
          StructField("v", DoubleType, nullable = true),
          StructField("animal_interpretation", animalInterpretationType)
        )),
        containsNull = true
      )

      // Add a column holding the JSON as a plain string.
      val withJsonString = peopleDataFrame.withColumn("jsonstr", constantJsonUdf(col("age")))
      withJsonString.printSchema()
      withJsonString.show()

      // Add a column that parses the string into the nested array -> struct type.
      val withJsonData = withJsonString.withColumn("jsonData", from_json($"jsonstr", arrayStruct))
      withJsonData.printSchema()
      withJsonData.show(false)
    } finally {
      spark.stop()
    }
  }

  /**
   * Sample output of the two printSchema()/show() pairs
   * (the initial field listing printed by println is omitted here).
   *
   * root
   * |-- name: string (nullable = true)
   * |-- age: integer (nullable = true)
   * |-- jsonstr: string (nullable = true)
   *
   * +--------+---+--------------------+
   * | name|age| jsonstr|
   * +--------+---+--------------------+
   * |xiaoming| 30|[{"k":"kkk","v":1...|
   * | ling| 28|[{"k":"kkk","v":1...|
   * +--------+---+--------------------+
   *
   * root
   * |-- name: string (nullable = true)
   * |-- age: integer (nullable = true)
   * |-- jsonstr: string (nullable = true)
   * |-- jsonData: array (nullable = true)
   * | |-- element: struct (containsNull = true)
   * | | |-- k: string (nullable = true)
   * | | |-- v: double (nullable = true)
   * | | |-- animal_interpretation: struct (nullable = true)
   * | | | |-- is_large_animal: boolean (nullable = true)
   * | | | |-- is_mammal: boolean (nullable = true)
   *
   * +--------+---+----------------------------------------------------------------------------------------+-----------------------------+
   * |name |age|jsonstr |jsonData |
   * +--------+---+----------------------------------------------------------------------------------------+-----------------------------+
   * |xiaoming|30 |[{"k":"kkk","v":100,"animal_interpretation":{"is_large_animal":true,"is_mammal":false}}]|[[ kkk, 100.0, [true, false]]]|
   * |ling |28 |[{"k":"kkk","v":100,"animal_interpretation":{"is_large_animal":true,"is_mammal":false}}]|[[ kkk, 100.0, [true, false]]]|
   * +--------+---+----------------------------------------------------------------------------------------+-----------------------------+
   */
}