Given the following JSON:
{
  "status": "0x0000",
  "msg": "执行成功",
  "result": "通过",
  "score": "0",
  "engineName": "credit_unit_salim",
  "versionCode": "20200702credit_salim",
  "versionId": 356307673651200,
  "engineId": 355251417716736,
  "outputFields": [
    {
      "code": "return_reason",
      "name": "输出打回原因",
      "value": "null"
    },
    {
      "code": "deny_days",
      "name": "输出拒绝天数",
      "value": "0"
    },
    {
      "code": "deny_reason",
      "name": "输出拒绝原因",
      "value": "null"
    },
    {
      "code": "decision",
      "name": "输出决策",
      "value": "forward_manual"
    },
    {
      "code": "limit",
      "name": "输出授信额度",
      "value": "0"
    },
    {
      "code": "cash_limit",
      "name": "现金贷款额度",
      "value": "0"
    }
  ],
  "inputFields": [
    {
      "indo_id_check": "DEDY DWI SETYAWAN",
      "indo_identical_accuracy_ktp": "-2.0",
      "indo_mobile_number_approving": "1",
      "indo_name_diff_id_check": "0",
      "indo_name_diff_ocr": "1",
      "indo_nik_approving": "1",
      "indo_nik_diff_employee_nik": "0",
      "indo_nik_diff_ocr": "1",
      "indo_ocr_name": "DEDY DWI SEVYAWAN",
      "indo_ocr_nik": "3525051812850002",
      "indo_reject_his_nik": "0",
      "indo_reject_his_tel": "0",
      "同一个申请下return次数": "0"
    }
  ],
  "outputFieldInfo": [
    {
      "输出打回原因": "null",
      "输出拒绝天数": "0",
      "输出拒绝原因": "null",
      "输出决策": "forward_manual",
      "输出授信额度": "0",
      "现金贷款额度": "0"
    }
  ]
}
We want to pull the data out of the inputFields array. How do we get it?
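The spark.sql snippets below leave out the source table (the original ran against a Hive table; note the commented-out from clause further down). As a minimal self-contained setup, we can register the sample JSON as a one-row temp view; the view name t_result_catalog and the trimmed payload are assumptions of this sketch:

# Setup sketch (assumption: the original read from a Hive table; here we stuff a
# trimmed copy of the sample JSON into a one-row temp view instead).
raw = '{"inputFields":[{"indo_id_check":"DEDY DWI SETYAWAN","indo_ocr_nik":"3525051812850002"}]}'
spark.createDataFrame([(raw,)], ['json']).createOrReplaceTempView('t_result_catalog')
# When running the queries below, append "from t_result_catalog" to each one.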
SQL approach: take the array out of the JSON, treating each element of the array as a map.
spark.sql("""
select
from_json(data, 'array).printSchema()
root
|-- r: array (nullable = true)
| |-- element: map (containsNull = true)
| | |-- key: string
| | |-- value: string (valueContainsNull = true)
array
spark.sql("""
select schema_of_json(json)
""").toPandas()
#array
obile_number_approving:string,indo_name_diff_id_check:string,indo_name_diff_
ocr:string,indo_nik_approving:string,indo_nik_diff_employee_nik:string,indo_
nik_diff_ocr:string,indo_ocr_name:string,indo_ocr_nik:string,indo_reject_his
_nik:string,indo_reject_his_tel:string,同⼀个申请下return次数:string>>
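Note that schema_of_json expects a foldable (literal) string argument, so to have Spark infer the schema you can pass a sample document through lit; a sketch with a trimmed sample:

from pyspark.sql.functions import schema_of_json, lit

# Sketch: infer the schema of a literal JSON sample via the DataFrame API.
sample = '[{"indo_ocr_nik":"3525051812850002"}]'  # trimmed copy of the inputFields array
spark.range(1).select(schema_of_json(lit(sample)).alias('s')).show(truncate=False)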
The result obtained this way can't be used directly, though; it needs some reshaping. If every element of the array had a fixed set of fields, we could define the element type as a struct. In our data, however, the fields of each array element are not fixed, and we haven't enumerated them all, so we define the element as a map and later unpack it with explode_outer for further processing. Checking the Spark API, MapType's constructor takes two arguments, the key type and the value type; here we simply use the most general type, String, for both.
from pyspark.sql.functions import *
from pyspark.sql.types import *

schema = ArrayType(MapType(StringType(), StringType()))

spark.sql("""
select json_tuple(json, 'inputFields') items
""").withColumn(
    'items', from_json('items', schema)
).toPandas()
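On recent Spark versions, from_json also accepts the schema as a DDL-format string, so the explicit ArrayType/MapType construction can be skipped; a sketch:

# Sketch: the same schema written as a DDL string instead of ArrayType/MapType.
spark.sql("""
select json_tuple(json, 'inputFields') items
""").withColumn(
    'items', from_json('items', 'array<map<string,string>>')
).printSchema()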
spark.sql("""
select json_tuple(json, 'inputFields') items
-- from atome_id_mysql_snapshot_ruleengine.t_result_catalog limit 1
""").withColumn(
'items',
from_json('items', schema)
).withColumn(
'item', explode('items')
).withColumn(
'keys', map_keys('item')
).withColumn(
'values', map_values('item')
).withColumn(
'k_v', arrays_zip('keys', 'values')
).withColumn(
'kv', explode_outer('k_v')
).printSchema()
root
|-- items: array (nullable = true)
| |-- element: map (containsNull = true)
| | |-- key: string
| | |-- value: string (valueContainsNull = true)
|-- item: map (nullable = true)
| |-- key: string
| |-- value: string (valueContainsNull = true)
|-- keys: array (nullable = true)
| |-- element: string (containsNull = true)
|-- values: array (nullable = true)
| |-- element: string (containsNull = true)
|-- k_v: array (nullable = true)
| |-- element: struct (containsNull = false)
| | |-- keys: string (nullable = true)
| | |-- values: string (nullable = true)
|-- kv: struct (nullable = true)
| |-- keys: string (nullable = true)
| |-- values: string (nullable = true)
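On Spark 2.4+, the map_keys/map_values/arrays_zip combination can be collapsed into a single map_entries call, which yields an equivalent array of structs (with field names key/value instead of keys/values); a sketch:

# Sketch (Spark 2.4+): map_entries produces array<struct<key,value>> directly,
# replacing the map_keys/map_values/arrays_zip combination above.
spark.sql("""
select json_tuple(json, 'inputFields') items
""").withColumn(
    'items', from_json('items', schema)
).withColumn(
    'item', explode('items')
).withColumn(
    'kv', explode_outer(map_entries('item'))
).printSchema()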
df = spark.sql("""
select json_tuple(json, 'inputFields') items
""").withColumn(
'items',
from_json('items', schema)
).withColumn(
'item', explode('items')
)
# df.printSchema()
# df.select(expr("posexplode(d)")).printSchema
df.select(expr('explode(item)')).toPandas() # 将map 展开 posexplode会多⼀个
pos的字段
| key | value |
| --- | --- |
| indo_id_check | DEDY DWI SETYAWAN |
| indo_identical_accuracy_ktp | -2.0 |
| indo_mobile_number_approving | 1 |
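From here, a single input field can be pulled out by filtering the exploded key/value rows; a sketch, picking indo_ocr_nik as an example:

# Sketch: grab one field out of the exploded key/value pairs.
df.select(expr('explode(item)')) \
    .where(col('key') == 'indo_ocr_nik') \
    .show()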