// Scala Spark: manually building complex DataFrame types (ArrayType, StructType)

package biReportJob.zt
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructType, _}
import org.apache.spark.sql.{Row, SparkSession}


object TestMain {
  // Quiet Spark's internal logging down to warnings.
  Logger.getLogger("org.apache.spark").setLevel(Level.WARN)

  /**
    * Demonstrates building a DataFrame from a programmatic schema and then
    * parsing a JSON string column into a nested complex type
    * (array -> struct -> struct) with `from_json`.
    */
  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().master("local[*]").
      config("hive.metastore.uris", "thrift://hdp02:9083").
      appName("OrderReceivedAmountCheckTestAppend").
      enableHiveSupport().getOrCreate()
    import spark.implicits._

    // Generate the schema programmatically instead of from a case class.
    val fields = Array(StructField("name", StringType, nullable = true), StructField("age", IntegerType, nullable = true))
    //    val schemaString = "name age"
    //    val fields = schemaString.split(" ") .map(fieldName => StructField(fieldName, StringType, nullable = true))
    // FIX: `println(fields)` printed the array's default toString (a JVM
    // reference such as "[Lorg.apache.spark.sql.types.StructField;@1a2b3c"),
    // not its contents. mkString renders the actual StructFields.
    println(fields.mkString("Array(", ", ", ")"))
    val schema = StructType(fields)

    val rowsRdd: RDD[Row] = spark.sparkContext.parallelize(
      Seq(
        Row("xiaoming", 30),
        Row("ling", 28)
      )
    )
    // Convert records of the RDD (people) to Rows.
    val peopleDataFrame = spark.createDataFrame(rowsRdd, schema)
    //    val peopleDataFrame = Seq(Person("Andy", 32)).toDS()
    //    val udf_null = udf((s: Any) => null)

    // UDF returning a fixed JSON string; the input column value is ignored.
    val udf_2 = udf((s: Any) => {
      """[{"k":"kkk","v":100,"animal_interpretation":{"is_large_animal":true,"is_mammal":false}}]"""
    })

    // Schema for the JSON column: an array of structs containing a nested struct.
    val arrayStruct = ArrayType(StructType(Seq(
      StructField("k", StringType, true), StructField("v", DoubleType, true),
      StructField("animal_interpretation", StructType(Seq(
        StructField("is_large_animal", BooleanType, true),
        StructField("is_mammal", BooleanType, true)
      )))
    )), true)

    // Add a plain string column holding the JSON text.
    val df1 = peopleDataFrame.withColumn("jsonstr", udf_2(col("age")))
    df1.printSchema()
    df1.show()

    // Parse the string column into the nested array<struct<...>> type.
    val df2 = df1.withColumn("jsonData", from_json($"jsonstr", arrayStruct))
    df2.printSchema()
    df2.show(false)

    // FIX: release the SparkSession (and the local[*] cluster) when done.
    spark.stop()
  }

  /**
    * output
    *
    * root
    * |-- name: string (nullable = true)
    * |-- age: integer (nullable = true)
    * |-- jsonstr: string (nullable = true)
    *
    * +--------+---+--------------------+
    * |    name|age|             jsonstr|
    * +--------+---+--------------------+
    * |xiaoming| 30|[{"k":"kkk","v":1...|
    * |    ling| 28|[{"k":"kkk","v":1...|
    * +--------+---+--------------------+
    *
    * root
    * |-- name: string (nullable = true)
    * |-- age: integer (nullable = true)
    * |-- jsonstr: string (nullable = true)
    * |-- jsonData: array (nullable = true)
    * |    |-- element: struct (containsNull = true)
    * |    |    |-- k: string (nullable = true)
    * |    |    |-- v: double (nullable = true)
    * |    |    |-- animal_interpretation: struct (nullable = true)
    * |    |    |    |-- is_large_animal: boolean (nullable = true)
    * |    |    |    |-- is_mammal: boolean (nullable = true)
    *
    * +--------+---+----------------------------------------------------------------------------------------+-----------------------------+
    * |name    |age|jsonstr                                                                                 |jsonData                     |
    * +--------+---+----------------------------------------------------------------------------------------+-----------------------------+
    * |xiaoming|30 |[{"k":"kkk","v":100,"animal_interpretation":{"is_large_animal":true,"is_mammal":false}}]|[[ kkk, 100.0, [true, false]]]|
    * |ling    |28 |[{"k":"kkk","v":100,"animal_interpretation":{"is_large_animal":true,"is_mammal":false}}]|[[ kkk, 100.0, [true, false]]]|
    * +--------+---+----------------------------------------------------------------------------------------+-----------------------------+
    */


}

 

// Tags: scala, spark, ArrayType, StructType, DataFrame