// 1 Create the SparkSession
val spark = SparkSession.builder()
  .appName("spark_demo")
  .master("local[3]")
  .getOrCreate()
import spark.implicits._

// 2 Build a small in-memory data source
val rows = Array((1, 20), (2, 18), (3, 16))
val df = spark.sparkContext.makeRDD(rows).toDF("id", "age")

// 3 Convert to a single JSON-string column named "value" and print it
df.toJSON.show()
+--------------------+
| value|
+--------------------+
|{"id":1,"age":20}|
|{"id":2,"age":18}|
|{"id":3,"age":16}|
+--------------------+
-- Option 01: wrap the chosen columns into a struct and serialize it to JSON
select
    to_json(struct(name, age)) as json_str
from (
    select
        'aa' as name,
        'bb' as sex,
        18 as age
) aa;
{"name":"aa","age":18}
1. key中含有特殊字符
// 1 JSON string under test: one key ("company.code") contains the special character "."
val str = "{\"name\":\"shy\",\"company.code\":23}"
// 2 json_tuple CAN extract such a key — it matches key names literally (output below: 23)
spark.sql(s"select json_tuple('${str}','company.code') as company_code").show()
+------------+
|company_code|
+------------+
| 23|
+------------+
// 3 get_json_object CANNOT extract such a key: in a JsonPath the "." is a path
//   separator, so '$.company.code' is read as field "code" nested under object
//   "company" and the lookup yields null. (The original line omitted the "$."
//   path prefix entirely, which also returns null but for the wrong reason;
//   "$$" escapes a literal "$" inside an s-interpolated string.)
spark.sql(s"select get_json_object('${str}','$$.company.code') as company_code").show()
+------------+
|company_code|
+------------+
| null|
+------------+
------------------------方式01----------------------------------------------
-- Build the sample input
with tmp as (
    select
        111 as id,
        '[{"_yz":1,"_qg":"08","_ve":"88.0.4324.181"},{"_yz":1,"_qg":"00","_ve":"2.0.0"}]' as eparam
),
-- Explode the JSON array into one string element per row
tmp2 as (
    select
        id,
        explode(from_json(eparam, 'array< string >')) as e
    from tmp
)
-- Extract individual fields from each array element
select
    id,
    get_json_object(e, "$._qg") as qg,
    get_json_object(e, "$._ve") as ve
from tmp2;
------------------------方式02--------------------------------------
-- Build the sample input
with tmp as (
    select
        111 as id,
        '[{"_yz":1,"_qg":"08","_ve":"88.0"},{"_yz":1,"_qg":"00","_ve":"2.0.0"}]' as eparam
),
-- Explode the JSON array, parsing each element as a typed struct up front
tmp2 as (
    select
        id,
        explode(from_json(eparam, 'array< struct<`_qg`: string, `_ve`: string > >')) as e
    from tmp
)
-- Struct fields are then plain dot-accessible columns
select
    id,
    e._qg as qg,
    e._ve as ve
from tmp2;
------------------------方式03--------------------------------------
with tmp as ( -- build the sample input: "metrics" maps a metric id to an array of {tag, cnt} objects
select '{"metrics":{"2":[{"tag":"promise","cnt":1},{"tag":"promise","cnt":4}],"3":[{"tag":"req","cnt":1}]},"tag":"","type":4}' as value
),
tmp2 as ( -- explode the parsed map: one row per (key, value) pair, as columns code / values
select
code,
values
from
tmp
lateral view explode(from_json( get_json_object(value,'$.metrics'),'map< string, array< struct<`tag`: string, `cnt`: int > > >')) as code,values
)
select -- process each struct array with higher-order functions
code
,transform(values, x -> x.cnt) as cnt -- project the cnt field out of every struct
,aggregate(values, 0, (acc, x) -> acc + x.cnt) as sum -- fold: total cnt per code
from
tmp2
;
// Schema definition, style 01: chained .add calls
val schema_01 = new StructType()
  .add("id", IntegerType)
  .add("name", StringType)
  .add("age", IntegerType)

// Schema definition, style 02: explicit StructField list.
// NOTE: the original notes reused the name schema_01 here, which would not
// compile (duplicate val in the same scope); renamed to schema_02.
val schema_02 = StructType(
  List(
    StructField("id", IntegerType),
    StructField("name", StringType),
    StructField("age", IntegerType)
  )
)
// Read a JSON file with an explicit schema, skipping the cost of
// Spark's automatic schema inference pass
val jsonDF = spark.read
  .schema(schema_01)
  .json("file_path")
-- Option 01: query the JSON files directly via the json datasource
-- (fixed typo: original said `svae_path`; every other snippet uses `save_path`)
select * from json.`save_path`;
-- Option 02: read the raw text lines and pick fields out with json_tuple
select
    id,
    name
from
    text.`save_path`
    lateral view json_tuple(value, 'id', 'name') tmp as id, name
;
-- Option 03: read the raw text lines and parse each whole line with from_json.
-- NOTE: a text.`path` relation exposes exactly one column, named `value`;
-- the original selected non-existent columns (`eparam` in the CTE, a bare
-- `id` outside it), which would fail analysis. Fixed to use `value`.
with tmp as (
    select
        from_json(value, 'struct<`id`: string, `name`: string >') as e
    from
        text.`save_path`
)
select
    e.id,
    e.name
from
    tmp
;
-- Option 04: parse a nested document (struct containing an array of structs).
-- NOTE: the schema literal was garbled to 'struct>>' in the original notes
-- (angle-bracket contents stripped); reconstructed from the JSON sample.
with tmp as (
    select
        from_json(
            '{"teacher": "alice", "student": [{"name": "bob", "rank": 1}, {"name": "charlie", "rank": 2}]}',
            'struct<teacher: string, student: array<struct<name: string, rank: int>>>'
        ) as info
)
select
    info.teacher
    ,info.student
from
    tmp
;
-- Register a temporary view directly over the JSON files on disk
CREATE TEMPORARY VIEW people
USING json
OPTIONS (path 'D:\\data\\*', multiline true);
-- Query it like an ordinary table
SELECT name, company FROM people
参考链接:
【Coding】SparkSQL读写JSON文件 - 知乎
-- Option 01: strip the JSON punctuation, then go string -> map -> keys
with normalized as (
    -- format the JSON string: {"a":0, "b":12} -> a:0,b:12
    select regexp_replace( '{"a":0, "b":12}', '\\{|\\"|\\}|\\s', '' ) as value_01
),
as_map as (
    -- string -> map
    select str_to_map( value_01, ',', ':' ) as value_02
    from normalized
)
-- take the map's keys
select map_keys( value_02) value_03
from as_map;
["a","b"]
-- Option 02 (roughly twice as fast as option 01): parse straight into a map and take its keys
select map_keys( from_json('{"_yz":1,"_qg":89}', 'map< string,string >') ) as map_tmp;
["_yz","_qg"]
-- Option 03 (available since Spark 3.1.1): dedicated json_object_keys function
SELECT json_object_keys('{"f1":"abc","f2":{"f3":"a", "f4":"b"}}');
["f1","f2"]
获取map值的写法不同
-- Option 01: bracket syntax for map lookup
with tmp as (
    select
        from_json('{"_yz":1,"_qg":89,"_ve":"88.0.4324.181"}', 'map< string,string >') as e
)
select
    e['_yz'] as yz
from tmp;
-- Option 02: dot syntax for map lookup
with tmp as (
    select
        from_json('{"_yz":1,"_qg":89,"_ve":"88.0.4324.181"}', 'map< string,string >') as e
)
select
    e._yz as yz
from tmp;
-- Style 01: one lateral view per source JSON column
select
    mcc,
    mnc,
    lang
from table_name
    lateral view json_tuple(value, 'mcc', 'mnc') extra_cols as mcc, mnc
    lateral view json_tuple(value2, 'lang') extra_cols_2 as lang
where mcc = '618'
limit 10
;
-- Style 02: chain lateral views to drill into nested JSON.
-- Fixes to the original: explode() takes one argument and req_data is still a
-- string, so it must be parsed with from_json first (original also had a
-- stray ")" and no view alias); json_tuple extracts three keys, so three
-- column aliases are required (original listed only name, age).
select
    name,
    age
from
    table_name
    lateral view json_tuple(json_str, 'req_data') tmp as req_data
    lateral view explode(from_json(req_data, 'array< string >')) tmp3 as data
    lateral view json_tuple(data, 'name', 'age', 'tuid') tmp2 as name, age, tuid
;
-- Style 03: json_tuple directly in the select list;
-- performance is on par with the lateral view form
select
    json_tuple(value, 'gaid', 'ts', 'tid') as (gazj, ts, tid)
from text.`path`
;
schema_of_json(json[, options]) - Returns schema in the DDL format of JSON string.
> SELECT schema_of_json('[{"col":0}]');
ARRAY<STRUCT<col: BIGINT>>
> SELECT schema_of_json('[{"col":01}]', map('allowNumericLeadingZeros', 'true'));
ARRAY<STRUCT<col: BIGINT>>
-- 能解析的样式
{'spark.app.id': '123', 'spark.app.name': 'spark://master'}
-- 不能解析的样式
{\"spark.app.id\": \"123\", \"spark.app.name\": \"spark://master\"}
-- 1 from_json cannot parse a JSON string containing escape characters
select from_json('{"_yz":1,"_qg":89}', 'map< string,string >') as map_tmp;
{"_qg":"89","_yz":"1"}
-- same payload with escaped quotes: parsing fails and returns NULL
-- (original line had the backslashes misplaced: '{"\_yz\":...' -- fixed so
-- every quote inside the literal is consistently escaped)
select from_json('{\"_yz\":1,\"_qg\":89}', 'map< string,string >') as map_tmp;
NULL
-- 2 json格式读取文件,key不存在会报错
-- 1 lateral view json_tuple vs from_json, side by side
-- slower, but more tolerant of malformed input
with tmp as (
    select
        gazj,
        ts,
        tid
    from text.`path`
        lateral view json_tuple(value, 'gaid', 'ts', 'tid') tmp as gazj, ts, tid
)
select count(1) from tmp;
-- faster, but cannot parse strings containing escape characters
with tmp as (
    select
        from_json(value, 'struct<`gaid`: string,`ts`: bigint,`tid`: bigint>') as e
    from text.`path`
),
tmp2 as (
    select
        e.gaid as gazj,
        e.ts as ts,
        e.tid as tid
    from tmp
)
select count(1) from tmp2;