原文链接(很nice的一篇文章):https://blog.csdn.net/wang_wbq/article/details/79678168
数组/列表(array)、字典(map)这两种数据类型的索引
首先我们还是先构造数据结构与DataFrame: |
---|
scala> case class A(a: String, b: Int) defined class A
scala> case class B(c: List[A], d: Map[String, A], e: Map[Int, String], f: Map[A, String]) defined class B
scala> def a_gen(i: Int) = A(s"str_$i", i) a_gen: (i: Int)A
scala> def b_gen(i: Int) = B((1 to 10).map(a_gen).toList, (1 to 10).map(j => s"key_$j" -> a_gen(j)).toMap, (1 to 10).map(j => j -> s"value_$j").toMap, (1 to 10).map(j => a_gen(j) -> s"value_$j").toMap) b_gen: (i: Int)B
scala> val data = (1 to 10).map(b_gen)
scala> val df = spark.createDataFrame(data) df: org.apache.spark.sql.DataFrame = [c: array<struct<a:string,b:int>>, d: map<string,struct<a:string,b:int>> ... 2 more fields] scala> df.show +--------------------+--------------------+--------------------+--------------------+ | c| d| e| f| +--------------------+--------------------+--------------------+--------------------+ |[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...| |[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...| |[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...| |[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...| |[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...| |[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...| |[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...| |[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...| |[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...| |[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...| +--------------------+--------------------+--------------------+--------------------+ |
--主要看schema信息 |
---|
scala> df.printSchema root:c字段是array类型,d字段是map嵌套struct类型,e是map类型,f是map类型key是struct类型 |-- c: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- a: string (nullable = true) | | |-- b: integer (nullable = false) |-- d: map (nullable = true) | |-- key: string | |-- value: struct (valueContainsNull = true) | | |-- a: string (nullable = true) | | |-- b: integer (nullable = false) |-- e: map (nullable = true) | |-- key: integer | |-- value: string (valueContainsNull = true) |-- f: map (nullable = true) | |-- key: struct | |-- value: string (valueContainsNull = true) | | |-- a: string (nullable = true) | | |-- b: integer (nullable = false) |
//同样可以使用expr("c['a']")或col("c")("a")的方式获得相同的结果。 |
---|
scala> df.select("c.a").show(10, false) +-----------------------------------------------------------------------+ |a | +-----------------------------------------------------------------------+ |[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]| |[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]| |[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]| |[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]| |[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]| |[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]| |[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]| |[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]| |[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]| |[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]| +-----------------------------------------------------------------------+
scala> df.select("c.a").printSchema root |-- a: array (nullable = true) | |-- element: string (containsNull = true) |
//这里介绍一个很有用的表达式explode,它能把数组中的元素展开成多行数据 //比如: //> SELECT explode(array(10, 20)); // 10 // 20 //还有一个比较有用的函数是(posexplode),顾名思义,这个函数会增加一列原始数组的索引 |
---|
scala> df.select(expr("explode(c.a)")).show +------+ | col| +------+ | str_1| | str_2| | str_3| | str_4| | str_5| | str_6| | str_7| | str_8| | str_9| |str_10| | str_1| | str_2| | str_3| | str_4| | str_5| | str_6| | str_7| | str_8| | str_9| |str_10| +------+ only showing top 20 rows
scala> df.select(expr("explode(c.a)")).printSchema root |-- col: string (nullable = true)
scala> df.select(expr("explode(c)")).show +------------+ | col| +------------+ | [str_1, 1]| | [str_2, 2]| | [str_3, 3]| | [str_4, 4]| | [str_5, 5]| | [str_6, 6]| | [str_7, 7]| | [str_8, 8]| | [str_9, 9]| |[str_10, 10]| | [str_1, 1]| | [str_2, 2]| | [str_3, 3]| | [str_4, 4]| | [str_5, 5]| | [str_6, 6]| | [str_7, 7]| | [str_8, 8]| | [str_9, 9]| |[str_10, 10]| +------------+ only showing top 20 rows
scala> df.select(expr("explode(c)")).printSchema root |-- col: struct (nullable = true) | |-- a: string (nullable = true) | |-- b: integer (nullable = false) |
//inline也是一个非常有用的函数,它可以把array[struct[XXX]]直接展开成XXX |
---|
scala> df.select(expr("inline(c)").as(Seq("a","b"))).show +------+---+ | a| b| +------+---+ | str_1| 1| | str_2| 2| | str_3| 3| | str_4| 4| | str_5| 5| | str_6| 6| | str_7| 7| | str_8| 8| | str_9| 9| |str_10| 10| | str_1| 1| | str_2| 2| | str_3| 3| | str_4| 4| | str_5| 5| | str_6| 6| | str_7| 7| | str_8| 8| | str_9| 9| |str_10| 10| +------+---+ only showing top 20 rows
scala> df.select(expr("inline(c)")).printSchema root |-- a: string (nullable = true) |-- b: integer (nullable = false) |
1、点表达式 a.b 2、中括号表达式 expr("a['b']") 3、小括号表达式 col("a")("b") 三种写法取得的结果相同,只是最后取得的列名不同 |
---|
scala> df.select(expr("posexplode(d)")).printSchema root |-- pos: integer (nullable = false) // 索引 |-- key: string (nullable = false) |-- value: struct (nullable = true) | |-- a: string (nullable = true) | |-- b: integer (nullable = false)
scala> df.select(expr("posexplode(e)")).printSchema root |-- pos: integer (nullable = false) |-- key: integer (nullable = false) |-- value: string (nullable = true)
scala> df.select(expr("posexplode(f)")).show +---+------------+--------+ |pos| key| value| +---+------------+--------+ | 0| [str_8, 8]| value_8| | 1|[str_10, 10]|value_10| | 2| [str_3, 3]| value_3| | 3| [str_1, 1]| value_1| | 4| [str_6, 6]| value_6| | 5| [str_5, 5]| value_5| | 6| [str_7, 7]| value_7| | 7| [str_2, 2]| value_2| | 8| [str_4, 4]| value_4| | 9| [str_9, 9]| value_9| | 0| [str_8, 8]| value_8| | 1|[str_10, 10]|value_10| | 2| [str_3, 3]| value_3| | 3| [str_1, 1]| value_1| | 4| [str_6, 6]| value_6| | 5| [str_5, 5]| value_5| | 6| [str_7, 7]| value_7| | 7| [str_2, 2]| value_2| | 8| [str_4, 4]| value_4| | 9| [str_9, 9]| value_9| +---+------------+--------+
scala> df.select(expr("posexplode(f)")).printSchema root |-- pos: integer (nullable = false) |-- key: struct (nullable = false) | |-- a: string (nullable = true) | |-- b: integer (nullable = false) |-- value: string (nullable = true) |
//我们可以使用点表达式去用map的key取value //如果key不存在这行数据会为null |
---|
scala> df.select("d.key_1").show +----------+ | key_1| +----------+ |[str_1, 1]| |[str_1, 1]| |[str_1, 1]| |[str_1, 1]| |[str_1, 1]| |[str_1, 1]| |[str_1, 1]| |[str_1, 1]| |[str_1, 1]| |[str_1, 1]| +----------+
scala> df.select("d.key_1").printSchema root |-- key_1: struct (nullable = true) | |-- a: string (nullable = true) | |-- b: integer (nullable = false)
|
//数字为key同样可以使用 //对于数字来讲,expr("e[1]")、expr("e['1']")、col("e")(1)、col("e")("1")这四种表达式都可用 //只是最后取得的列名不同 |
---|
scala> df.select("e.1").show +-------+ | 1| +-------+ |value_1| |value_1| |value_1| |value_1| |value_1| |value_1| |value_1| |value_1| |value_1| |value_1| +-------+
scala> df.select("e.1").printSchema root |-- 1: string (nullable = true) |
最有意思的就是f这个map了,我们用struct作为map的key 这种情况下,我们可以用namedExpressionSeq表达式类构造这个struct |
---|
scala> df.select(expr("f[('str_1' AS a, 1 AS b)]")).show +---------------------------------------------+ |f[named_struct(a, str_1 AS `a`, b, 1 AS `b`)]| +---------------------------------------------+ | value_1| | value_1| | value_1| | value_1| | value_1| | value_1| | value_1| | value_1| | value_1| | value_1| +---------------------------------------------+ scala> df.select(expr("f[('str_1' AS a, 1 AS b)]")).printSchema root |-- f[named_struct(a, str_1 AS `a`, b, 1 AS `b`)]: string (nullable = true) |
以上这种构造方式当然不是凭空想出来的,依据呢当然还是我之前提到的另一个博客里介绍的查看方式
https://blog.csdn.net/wang_wbq/article/details/79673780
primaryExpression valueExpression namedExpression |
从上面我们可以看出:
1、中括号里需要放置valueExpression
2、valueExpression可以是一个primaryExpression
3、primaryExpression可以是一个'(' namedExpression (',' namedExpression)+ ')'结构
4、namedExpression又是一个exp AS alias的结构