参考 https://drill.apache.org/docs/json-data-model/
假设有原始数据在hdfs上:
hdfs://dc1:8020/wx/mytest/ia/2017/0208/details/part-00000
文件中每行是一条json记录(共多条),实际上由spark的saveAsTextFile方法生成。
单条记录的格式如下(为便于阅读已格式化为多行,并删除了部分数据)
{
"afterOpenDay": 9,
"basic": {
"availableMoney": 24063.51344060898,
"closeReason": 0,
"cutEarning": 0,
"end_date": "20170222",
"ism_id": "170208206199185",
"losePercentage": 0,
"profitPercentage": 0,
"start_date": "20170209",
"tm_close": -1,
"totalMoney": 23600,
"user_id": "8888"
},
"closeDay": true,
"dailySummary": [
{
"TN": 0,
"annualProfitRate": 0,
"asset": 23515.820100307465,
"commission": 75.1798996925354,
"cost": 21560.179899692535,
"day": "20170209",
"floatProfit": -84.1798996925354,
"freeMoney": 2039.8201003074646,
"maketValue": 21476,
"profitRate": -0.0035669449022260762
},
{
"TN": 1,
"annualProfitRate": 0,
"asset": 23585.904140472412,
"commission": 81.09585952758789,
"cost": 20668.095859527588,
"day": "20170210",
"floatProfit": -14.09585952758789,
"freeMoney": 2931.904140472412,
"maketValue": 20654,
"profitRate": -0.0005972821833723683
},
{
"TN": 2,
"annualProfitRate": 0,
"asset": 23830.72134065628,
"commission": 88.27865934371948,
"cost": 18535.27865934372,
"day": "20170213",
"floatProfit": 230.72134065628052,
"freeMoney": 5064.7213406562805,
"maketValue": 18766,
"profitRate": 0.009776327993910192
},
{
"TN": 3,
"annualProfitRate": 0,
"asset": 23887.72134065628,
"commission": 88.27865934371948,
"cost": 18535.27865934372,
"day": "20170214",
"floatProfit": 287.7213406562805,
"freeMoney": 5064.7213406562805,
"maketValue": 18823,
"profitRate": 0.012191582231198327
},
{
"TN": 4,
"annualProfitRate": 0,
"asset": 23652.72134065628,
"commission": 88.27865934371948,
"cost": 18535.27865934372,
"day": "20170215",
"floatProfit": 52.72134065628052,
"freeMoney": 5064.7213406562805,
"maketValue": 18588,
"profitRate": 0.002233955112554259
},
{
"TN": 5,
"annualProfitRate": 0,
"asset": 23716.917340755463,
"commission": 94.08265924453735,
"cost": 17737.082659244537,
"day": "20170216",
"floatProfit": 116.91734075546265,
"freeMoney": 5862.917340755463,
"maketValue": 17854,
"profitRate": 0.004954124608282316
},
{
"TN": 6,
"annualProfitRate": 0,
"asset": 23595.554340839386,
"commission": 100.44565916061401,
"cost": 16380.445659160614,
"day": "20170217",
"floatProfit": -4.445659160614014,
"freeMoney": 7219.554340839386,
"maketValue": 16376,
"profitRate": -0.00018837538816161075
},
{
"TN": 7,
"annualProfitRate": 0,
"asset": 23780.802600860596,
"commission": 106.1973991394043,
"cost": 15649.197399139404,
"day": "20170220",
"floatProfit": 180.8026008605957,
"freeMoney": 7950.802600860596,
"maketValue": 15830,
"profitRate": 0.007661127155109988
},
{
"TN": 8,
"annualProfitRate": 0,
"asset": 24011.805600643158,
"commission": 113.19439888000488,
"cost": 13659.194399356842,
"day": "20170221",
"floatProfit": 411.80560064315796,
"freeMoney": 9940.805600643158,
"maketValue": 14071,
"profitRate": 0.01744938985776093
}
]
}
(1)下载并启动apache drill
bin/drill-embedded
(2)配置名为dfs的storage plugin(dc1是机器的hostname)
http://dc1:8047/storage
{
"type": "file",
"enabled": true,
"connection": "hdfs://dc1:8020",
"config": null,
"workspaces": {
"root": {
"location": "/",
"writable": false,
"defaultInputFormat": null
},
"tmp": {
"location": "/tmp",
"writable": true,
"defaultInputFormat": null
},
"ism": {
"location": "/wx/mytest/ia/2017",
"writable": true,
"defaultInputFormat": "json"
}
},
"formats": {
"psv": {
"type": "text",
"extensions": [
"tbl"
],
"delimiter": "|"
},
"csv": {
"type": "text",
"extensions": [
"csv"
],
"delimiter": ","
},
"tsv": {
"type": "text",
"extensions": [
"tsv"
],
"delimiter": "\t"
},
"httpd": {
"type": "httpd",
"logFormat": "%h %t \"%r\" %>s %b \"%{Referer}i\"",
"timestampFormat": null
},
"parquet": {
"type": "parquet"
},
"json": {
"type": "json",
"extensions": [
"json"
]
},
"avro": {
"type": "avro"
},
"sequencefile": {
"type": "sequencefile",
"extensions": [
"seq"
]
},
"csvh": {
"type": "text",
"extensions": [
"csvh"
],
"extractHeader": true,
"delimiter": ","
}
}
}
(3)修改配置
http://dc1:8047/options
将 store.json.read_numbers_as_double 改为 true。原因是我这边的json数据中,有的浮点数被输出成了整数形式(如5.0直接输出为5),同一字段中既有整数又有浮点数,会导致错误“DATA_READ ERROR: You tried to write a Float8 type when you are using a ValueWriter of type ...”
(4)执行sql语句。这里的dfs.ism表示使用名为dfs的storage plugin中配置的工作空间(workspace)ism;
a.basic.ism_id表示取json文件中basic字段(basic是OBJECT类型)里面的ism_id字段
0: jdbc:drill:zk=local> select a.basic.ism_id as ism_id,a.dailySummary.asset as asset from dfs.ism.`0208/details/part-00000` a limit 10;
+------------------+---------------------+
| ism_id | asset |
+------------------+---------------------+
| 170208206199185 | 23515.820100307465 |
| 170208206199187 | 23585.904140472412 |
| 170208206199188 | 23830.72134065628 |
| 170208206199189 | 23887.72134065628 |
| 170208206199191 | 23652.72134065628 |
| 170208206199196 | 23716.917340755463 |
| 170208206199199 | 23595.554340839386 |
| 170208206199201 | 23780.802600860596 |
| 170208206199206 | 24011.805600643158 |
| 170208206199209 | 24063.51344060898 |
+------------------+---------------------+
10 rows selected (0.898 seconds)
[思考问题]上述字段中,如果遇到数组应该如何处理?
比如,要查询dailySummary 中的每日资产asset?
参考Drill官方文档,使用子查询(nested query)和FLATTEN函数,
FLATTEN用于将数组扁平化,即把1行数据按数组元素拆分成多行数据。
0: jdbc:drill:zk=local> select b.ism_id,b.daily.asset as asset from (select a.basic.ism_id as ism_id,FLATTEN(a.dailySummary) as daily from dfs.ism.`0208/details/part-00000` a ) b limit 20;
+------------------+---------------------+
| ism_id | asset |
+------------------+---------------------+
| 170208206199185 | 23515.820100307465 |
| 170208206199185 | 23585.904140472412 |
| 170208206199185 | 23830.72134065628 |
| 170208206199185 | 23887.72134065628 |
| 170208206199185 | 23652.72134065628 |
| 170208206199185 | 23716.917340755463 |
| 170208206199185 | 23595.554340839386 |
| 170208206199185 | 23780.802600860596 |
| 170208206199185 | 24011.805600643158 |
| 170208206199185 | 24063.51344060898 |
| 170208206199187 | 20130.834299087524 |
| 170208206199187 | 19987.834299087524 |
| 170208206199187 | 20333.938299179077 |
| 170208206199187 | 20277.938299179077 |
| 170208206199187 | 20153.938299179077 |
| 170208206199187 | 20321.938299179077 |
| 170208206199187 | 20165.137598991394 |
| 170208206199187 | 20376.137598991394 |
| 170208206199187 | 20496.137598991394 |
| 170208206199187 | 20428.81975889206 |
+------------------+---------------------+
20 rows selected (0.978 seconds)
上述查询也可以通过Drill的web界面执行:
http://dc1:8047/query
在该页面中提交同样的sql语句即可获得结果。