spark操作json数据

一、转json串

1. dataframe转成json串

// 1 Get (or create) the SparkSession entry point
val spark = SparkSession
  .builder()
  .appName("spark_demo")
  .master("local[3]")
  .getOrCreate()
import spark.implicits._

// 2 Build the test data: (id, age) tuples -> DataFrame
val arr = Array((1, 20), (2, 18), (3, 16))
val df=spark.sparkContext.makeRDD(arr).toDF("id","age")

// 3 toJSON serializes each row into a single string column named "value"
df.toJSON.show()

+--------------------+
|            value|
+--------------------+
|{"id":1,"age":20}|
|{"id":2,"age":18}|
|{"id":3,"age":16}|
+--------------------+

2. spark-sql转json串

-- 方式01 (approach 01): wrap the wanted columns in struct(), then serialize with to_json()
select
    to_json( struct(name, age) ) as json_str
from (
    select
        'aa' as name,
        'bb' as sex, -- not placed in the struct, so it is absent from the json output
        18 as age
) aa;
{"name":"aa","age":18}

二、 特殊字符

1. key中含有特殊字符

// 1 Test json string whose key contains the special character "."
val str = "{\"name\":\"shy\",\"company.code\":23}"

// 2 json_tuple CAN extract it -- it matches the key name literally
spark.sql(s"select json_tuple('${str}','company.code') as company_code").show()
+------------+
|company_code|
+------------+
|          23|
+------------+

// 3 get_json_object CANNOT extract it -- "." is its json-path separator, so a key containing "." is unreachable (result: null)
spark.sql(s"select get_json_object('${str}','company.code')  as company_code").show()
+------------+
|company_code|
+------------+
|        null|
+------------+

三、数组操作

------------------------方式01 (approach 01: elements kept as raw json strings)---
-- build the test data
with tmp as  (
    select 
        111 id,
        '[{"_yz":1,"_qg":"08","_ve":"88.0.4324.181"},{"_yz":1,"_qg":"00","_ve":"2.0.0"}]'  as eparam
),
-- explode the array; 'array< string >' leaves each element as an unparsed json string
tmp2 as (
    select 
        id,
        explode(from_json(eparam, 'array< string >')) as e
from tmp 
)
-- parse the fields of each element with get_json_object
select 
    id,
    get_json_object(e, "$._qg") as qg,
    get_json_object(e, "$._ve") as ve 
from tmp2;


------------------------方式02 (approach 02: typed struct elements)---------
-- build the test data
with tmp as  (
    select 
        111 id,
        '[{"_yz":1,"_qg":"08","_ve":"88.0"},{"_yz":1,"_qg":"00","_ve":"2.0.0"}]'  as eparam
),
-- explode the array; the struct schema parses each element up front,
-- so fields can later be read with plain dot notation
tmp2 as (
    select 
        id,
        explode(from_json(eparam, 'array< struct<`_qg`: string, `_ve`: string > >')) as e
from tmp 
)
-- read the struct fields directly (no per-row json parsing needed here)
select 
    id,
    e._qg as qg,
    e._ve as ve 
from tmp2;


------------------------方式03 (approach 03: map of arrays of structs)------
with tmp as ( -- build the test data
	select '{"metrics":{"2":[{"tag":"promise","cnt":1},{"tag":"promise","cnt":4}],"3":[{"tag":"req","cnt":1}]},"tag":"","type":4}' as value
),
tmp2 as ( -- explode the "metrics" object into (key, array) pairs via a map schema
  select
      code,
      values
  from 
      tmp
      lateral view explode(from_json( get_json_object(value,'$.metrics'),'map< string, array< struct<`tag`: string, `cnt`: int > > >')) as code,values
)
select -- process each array with higher-order functions
	code
    ,transform(values, x -> x.cnt) as cnt -- project cnt out of every element
    ,aggregate(values, 0, (acc, x) -> acc + x.cnt) as sum -- fold the cnt values into a total
from
	tmp2
; 

四、 spark读取json文件

1. dataframe指定schema读取

// Schema style 01: build incrementally with add()
val schema_01 = new StructType()
    .add("id", IntegerType)
    .add("name", StringType)
    .add("age", IntegerType)

// Schema style 02: equivalent schema from an explicit StructField list.
// fix: the original redeclared `val schema_01`, which does not compile
// twice in the same scope -- renamed to schema_02.
val schema_02 = StructType(
		List(
			StructField( "id", IntegerType ),
			StructField( "name", StringType ),
			StructField( "age", IntegerType )
		)
	)

// Read the json file with an explicit schema, avoiding the cost of
// Spark's automatic schema inference pass over the data
val jsonDF = spark
   .read
   .schema(schema_01)
   .json("file_path")

2. spark-sql读取json文件

-- 方式01 (approach 01): query a json file directly by path
-- fix: typo `svae_path` -> `save_path` (matches the paths used by 方式02/03 below)
select * from json.`save_path`;


-- 方式02 (approach 02): read as text (one `value` column per line), then parse with json_tuple
select
    id,
    name
from
    text.`save_path`
    lateral view json_tuple(value, 'id', 'name') tmp as id,name
;


-- 方式03 (approach 03): read as text, then parse with from_json + a struct schema
-- NOTE(review): a `text.` source exposes only a single `value` column, but this
-- query reads `id` and `eparam` -- presumably copied from a wider source; verify
-- the column names against the actual data before reusing.
with tmp as (
    select 
        id,
        from_json(eparam, 'struct<`id`: string, `name`: string >') as e
from
    text.`save_path` 
)
select 
    id,
    e.id,
    e.name 
from
    tmp
;

-- 方式04 (approach 04): nested schema -- a struct containing an array of structs
-- fix: the schema string was garbled to 'struct>>' (angle-bracket content
-- stripped, likely by html rendering); reconstructed from the sample json.
with tmp as (
    select
        from_json(
            '{"teacher": "alice", "student": [{"name": "bob", "rank": 1}, {"name": "charlie", "rank": 2}]}',
            'struct<teacher: string, student: array<struct<name: string, rank: int>>>'
        ) as info
)
select
    info.teacher
    ,info.student
from
    tmp
;

3. 多行形式的json

-- create the view: `multiline true` lets one json record span multiple lines
CREATE TEMPORARY VIEW people USING json OPTIONS (path 'D:\\data\\*', multiline true);

-- query the view
select name, company from people

参考链接:

【Coding】SparkSQL读写JSON文件 - 知乎

五、json的key值

--方式01 (approach 01): strip json punctuation with regexp, convert to a map, take its keys
select map_keys( value_02) value_03 -- get the keys of the map
from (
    select str_to_map( value_01, ',', ':' ) value_02  -- string -> map
    from (
        select regexp_replace( '{"a":0, "b":12}', '\\{|\\"|\\}|\\s', '' ) value_01   -- normalize the json string: {"a":0,"b":12} --> a:0,b:12
        ) a
    ) aa;

["a","b"]


--方式02 (approach 02): from_json to a map, then map_keys -- about twice as fast as approach 01
select  map_keys( from_json('{"_yz":1,"_qg":89}', 'map< string,string >') ) as map_tmp;
["_yz","_qg"]


--方式03 (approach 03): json_object_keys builtin -- available in spark 3.1.1 and later
SELECT json_object_keys('{"f1":"abc","f2":{"f3":"a", "f4":"b"}}');
 ["f1","f2"]

六 json转map

获取map值的写法不同

-- 方式01 (approach 01): read the map value with bracket syntax e['key']
with tmp as (
    select 
        from_json('{"_yz":1,"_qg":89,"_ve":"88.0.4324.181"}', 'map< string,string >') as e
)
select e['_yz'] yz from tmp;


-- 方式02 (approach 02): read the map value with dot syntax e.key
-- (Spark resolves dot access on a map column to a key lookup)
with tmp as (
    select 
        from_json('{"_yz":1,"_qg":89,"_ve":"88.0.4324.181"}', 'map< string,string >') as e
)
select e._yz yz from tmp;

七 解析json写法参考

-- 写法01 (pattern 01): one lateral view per json source column; extracted
-- aliases (mcc) are usable in the WHERE clause
select
    mcc,
    mnc,
    lang
from
    table_name
    lateral view json_tuple(value, 'mcc', 'mnc') extra_cols as mcc, mnc
    lateral view json_tuple(value2, 'lang') extra_cols_2 as lang
where
    mcc = '618'
limit 10
;


-- 写法02 (pattern 02): chained lateral views -- extract the array field,
-- explode it, then parse each element.
-- fixes to the original: explode() takes a single array argument, so the raw
-- string must first be parsed with from_json (original had a bogus second
-- argument, an unbalanced parenthesis, and no view alias); json_tuple extracts
-- 3 fields, so 3 aliases are required.
select
    name,
    age
from 
    table_name
    lateral view json_tuple(json_str, 'req_data') tmp as req_data
    lateral view explode(from_json(req_data, 'array< string >')) tmp1 as data
    lateral view json_tuple(data,'name','age', 'tuid') tmp2 as name,age,tuid
;


-- 写法03 (pattern 03): json_tuple directly in the select list with a multi-column
-- alias -- performance is on par with the lateral view form
select
    json_tuple(value, 'gaid', 'ts', 'tid') as (gazj, ts, tid)
from
    text.`path`
;

八 获取json的schema

schema_of_json(json[, options]) - Returns schema in the DDL format of JSON string.

> SELECT schema_of_json('[{"col":0}]');
 ARRAY<STRUCT<col: BIGINT>>
> SELECT schema_of_json('[{"col":01}]', map('allowNumericLeadingZeros', 'true'));
 ARRAY<STRUCT<col: BIGINT>>

九 json_tuple 和 get_json_object

1. json_tuple 

-- 能解析的样式
{'spark.app.id': '123', 'spark.app.name': 'spark://master'}


-- 不能解析的样式
{\"spark.app.id\": \"123\", \"spark.app.name\": \"spark://master\"}

十 注意事项

-- 1 from_json cannot parse a json string that contains escape characters
select from_json('{"_yz":1,"_qg":89}', 'map< string,string >') as map_tmp;
{"_qg":"89","_yz":"1"}
-- fix: escape every quote consistently (the original left the opening quote
-- of the first key unescaped: '{"\_yz\":...')
select from_json('{\"_yz\":1,\"_qg\":89}', 'map< string,string >') as map_tmp;
NULL

-- 2 json格式读取文件,key不存在会报错

十一 高效写法

-- 1 lateral view json_tuple vs from_json comparison
-- lower throughput, but more tolerant of input (handles escaped strings)
with tmp as (
select
    gazj,
    ts,
    tid
from
    text.`path`
    lateral view json_tuple(value, 'gaid', 'ts', 'tid') tmp as gazj, ts, tid
)
select count(1) from tmp;

-- higher throughput, but cannot parse strings containing escape characters
-- (see section 十 above)
with tmp as (
    select
        from_json(value, 'struct<`gaid`: string,`ts`: bigint,`tid`: bigint>') as e
    from
        text.`path`
),
tmp2 as (
    select 
        e.gaid as gazj
        ,e.ts as ts
        ,e.tid as tid
    from
        tmp
)
select count(1) from tmp2;





你可能感兴趣的:(spark,spark)