beeline -n serving
--hiveconf hive.security.authorization.sqlstd.confwhitelist.append='mapreduce.job.*|dfs.*'
-u jdbc:hive2://133.1333.1333.133:11111
--hiveconf hive.merge.mapredfiles=false
--hiveconf hive.merge.mapfiles=false
--hiveconf hive.stats.autogather=true
--hiveconf hive.auto.convert.join.noconditionaltask=true
--hiveconf dfs.replication=2
--hiveconf hive.auto.convert.join.noconditionaltask.size=100000000
--hiveconf hive.auto.convert.join=true
--hiveconf hive.exec.compress.output=true
--hiveconf mapreduce.job.split.metainfo.maxsize=-1
-f /home/serving/kylin/bin/../tomcat/temp/beeline_1707622077544118607.hql;
ret_code=$?;
rm -f /home/serving/kylin/bin/../tomcat/temp/beeline_1707622077544118607.hql;
exit $ret_code
USE kylin_i18n;
CREATE EXTERNAL TABLE IF NOT EXISTS kylin_intermediate_daily_detail_stat_test3_day_cube_c206165b_30ef_49d3_a3f0_2af0f5f95b9d
STORED AS SEQUENCEFILE
LOCATION 'hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-3f62091e-1e46-47a4-b83e-5b9a8d405a5a/kylin_intermediate_daily_detail_stat_test3_day_cube_c206165b_30ef_49d3_a3f0_2af0f5f95b9d';
INSERT OVERWRITE TABLE kylin_intermediate_daily_detail_stat_test3_day_cube_c206165b_30ef_49d3_a3f0_2af0f5f95b9d
SELECT XXX
FROM XXDB.ADS_SS_HUICHUAN_WILD_DIM_D
INNER JOIN
KYLIN_REPORT_DICT_ONLINE.REPORT_DATE_DIC
ON ADS_XX_XX_WILD_DIM_D.DT = report_date_dic.DT
WHERE (ADS_XX_XX_WILD_DIM_D.DT >= '20180414' AND ADS_XX_XX_WILD_DIM_D.DT < '20180415');
实现类:CreateFlatHiveTableStep、BatchCubingInputSide
解析:在hive上新建一张中间表,将数据写入中间表
beeline -n serving
--hiveconf hive.security.authorization.sqlstd.confwhitelist.append='mapreduce.job.*|dfs.*'
-u jdbc:hive2://11.150.716.151:11101
--hiveconf hive.merge.mapredfiles=false
--hiveconf hive.merge.mapfiles=false
--hiveconf hive.stats.autogather=true
--hiveconf hive.auto.convert.join.noconditionaltask=true
--hiveconf dfs.replication=2
--hiveconf hive.auto.convert.join.noconditionaltask.size=100000000
--hiveconf hive.auto.convert.join=true
--hiveconf hive.exec.compress.output=true
--hiveconf mapreduce.job.split.metainfo.maxsize=-1
-f /home/serving/kylin-cpc/bin/../tomcat/temp/beeline_7092327869408196544.hql;
ret_code=$?;
rm -f /home/serving/kylin-cpc/bin/../tomcat/temp/beeline_7092327869408196544.hql;exit $ret_code
USE kylin_i18n;
set mapreduce.job.reduces=6;
set hive.merge.mapredfiles=false;
INSERT OVERWRITE TABLE kylin_intermediate_daily_detail_stat_test3_day_cube_c206165b_30ef_49d3_a3f0_2af0f5f95b9d
SELECT * FROM kylin_intermediate_daily_detail_stat_test3_day_cube_c206165b_30ef_49d3_a3f0_2af0f5f95b9d DISTRIBUTE BY RAND();
实现类:RedistributeFlatHiveTableStep
解析:通过DISTRIBUTE,将hive表中的数据分成几份,后续并行进行构建
-conf /home/serving/kylin-cpc/conf/kylin_job_conf.xml
-cubename daily_detail_stat_test1_day_cube
-output hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube/fact_distinct_columns -segmentid 5fa35a7a-c1df-4139-833b-e21efadf6602
-statisticsenabled true
-statisticsoutput hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube/fact_distinct_columns/statistics
-statisticssamplingpercent 100
-jobname Kylin_Fact_Distinct_Columns_daily_detail_stat_test1_day_cube_Step
-cubingJobId 1051bc22-1270-449c-ac71-9b6a15dc45de
-cubename daily_detail_stat_test1_day_cube
-segmentid 5fa35a7a-c1df-4139-833b-e21efadf6602
-input hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube/fact_distinct_columns
4-5 Save Cuboid Statistics
Duration: 0.05 mins Waiting: 0 seconds
4-6 Create HTable
Duration: 1.60 mins Waiting: 0 seconds
-cubename daily_detail_stat_test1_day_cube
-segmentid 5fa35a7a-c1df-4139-833b-e21efadf6602
-partitions hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube/rowkey_stats/part-r-00000
-statisticsenabled true
-conf /home/serving/kylin/conf/kylin_job_conf.xml
-cubename daily_detail_stat_test1_day_cube
-segmentid 5fa35a7a-c1df-4139-833b-e21efadf6602
-input FLAT_TABLE
-output hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube/cuboid/level_base_cuboid
-jobname Kylin_Base_Cuboid_Builder_daily_detail_stat_test1_day_cube
-level 0
-cubingJobId 1051bc22-1270-449c-ac71-9b6a15dc45de
-conf /home/serving/kylin-cpc/conf/kylin_job_conf.xml
-cubename daily_detail_stat_test1_day_cube
-segmentid 5fa35a7a-c1df-4139-833b-e21efadf6602
-input hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube/cuboid/level_base_cuboid
-output hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube/cuboid/level_1_cuboid
-jobname Kylin_ND-Cuboid_Builder_daily_detail_stat_test1_day_cube_Step
-level 1
-cubingJobId 1051bc22-1270-449c-ac71-9b6a15dc45de
-conf /home/serving/kylin-cpc/conf/kylin_job_conf_inmem.xml
-cubename daily_detail_stat_test1_day_cube
-segmentid 5fa35a7a-c1df-4139-833b-e21efadf6602
-output hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube/cuboid/
-jobname Kylin_Cube_Builder_daily_detail_stat_test1_day_cube
-cubingJobId 1051bc22-1270-449c-ac71-9b6a15dc45de
-conf /home/serving/kylin-cpc/conf/kylin_job_conf.xml
-cubename daily_detail_stat_test1_day_cube
-partitions hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube/rowkey_stats/part-r-00000_hfile
-input hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube/cuboid/*
-output hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube/hfile
-htablename KYLIN_6HEXXTE3HJ
-jobname Kylin_HFile_Generator_daily_detail_stat_test1_day_cube_Step
Counters: 51
File System Counters
FILE: Number of bytes read=553215047262
FILE: Number of bytes written=841363807856
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=60337214266
HDFS: Number of bytes written=74755693076
HDFS: Number of read operations=3545
HDFS: Number of large read operations=0
HDFS: Number of write operations=336
Job Counters
Launched map tasks=494
Launched reduce tasks=91
Other local map tasks=1
Data-local map tasks=396
Rack-local map tasks=97
Total time spent by all maps in occupied slots (ms)=313852856
Total time spent by all reduces in occupied slots (ms)=177376836
Total time spent by all map tasks (ms)=78463214
Total time spent by all reduce tasks (ms)=44344209
Total vcore-seconds taken by all map tasks=78463214
Total vcore-seconds taken by all reduce tasks=44344209
Total megabyte-seconds taken by all map tasks=321385324544
Total megabyte-seconds taken by all reduce tasks=181633880064
Map-Reduce Framework
Map input records=10152377320
Map output records=10152377320
Map output bytes=2000649779481
Map output materialized bytes=290313706378
Input split bytes=103246
Combine input records=0
Combine output records=0
Reduce input groups=10152377320
Reduce shuffle bytes=290313706378
Reduce input records=10152377320
Reduce output records=10152377320
Spilled Records=30457131960
Shuffled Maps =44954
Failed Shuffles=0
Merged Map outputs=44954
GC time elapsed (ms)=727568
CPU time spent (ms)=147900510
Physical memory (bytes) snapshot=1019898109952
Virtual memory (bytes) snapshot=2591039561728
Total committed heap usage (bytes)=1359976923136
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=60335814270
File Output Format Counters
Bytes Written=74755693076
-input hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube/hfile -htablename KYLIN_6HEXXTE3HJ -cubename daily_detail_stat_test1_day_cube
4-12 Update Cube Info
4-13 Hive Cleanup
Hive table kylin_i18n.kylin_intermediate_daily_detail_stat_test1_day_cube_5fa35a7a_c1df_4139_833b_e21efadf6602 is dropped.
Hive table kylin_i18n.kylin_intermediate_daily_detail_stat_test1_day_cube_5fa35a7a_c1df_4139_833b_e21efadf6602 external data path hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/kylin_intermediate_daily_detail_stat_test1_day_cube_5fa35a7a_c1df_4139_833b_e21efadf6602 is deleted.
$ hadoop fs -ls hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube
hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube/cuboid
hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube/fact_distinct_columns
hdfs://intlsf/i18n_kylin/report/i18n_kylin_meta/kylin-1051bc22-1270-449c-ac71-9b6a15dc45de/daily_detail_stat_test1_day_cube/rowkey_stats
1、add job
(1)CubeController
(2)JobService.submitJobInternal
资源存储: ResourceStore->FileResourceStore
->HBaseResourceStore
->HDFSResourceStore
2、core-job
DefaultScheduler->FetcherRunner->JobRunner
#Backup Metadata
http://kylin.apache.org/docs15/howto/howto_backup_metadata.html
#Cleanup Storage (HDFS & HBase)
http://kylin.apache.org/docs15/howto/howto_cleanup_storage.html
#How to Update HBase Coprocessor
http://kylin.apache.org/docs15/howto/howto_update_coprocessor.html
超大规模数据集上的多维分析问题
数据无限增加,未来查询速度还能很快
low latency
##预计算(空间换时间) VS 实时计算
cpu资源竞争
更少的I/O消耗
##并行计算
##列式存储 VS 行式存储
hive表
(kafka mq)
mapreduce
(spark)
多维查询 group by - view - Materialized View - cuboid
10个维度共有 2^10 = 1024 种维度组合
每一种叫做 cuboid, cube是cuboid的集合
时间换空间,实时计算
缩短数据计算时间
减少存储空间
hbase
接口:ANSI SQL
星型模型
row key=cuboid+dimensions
row value=Measures
Provide table schema from Kylin metadata
Translate the logic operator into Kylin operator
Find right cube
Translate SQL into storage engine API call
Generate physical execute plan by linq4j java implementation
Translate storage engine result into java implementation result.
Add HyperLogLog for distinct count
Implement date time related functions (i.e. Quarter)
full cube VS partial cube
2^(N+M+L) VS 2^N + 2^M + 2^L
merge的逻辑
最近可能变化的数据 比如最近7天,独立的segment。
后面进行merge 天-周-月-年
天-小时-分 构建
https://blog.bcmeng.com/post/kylin-cube.html
链家
https://mp.weixin.qq.com/s/LhtmXXTnYDDCrxdQWCvvUw
http://blog.csdn.net/fz1989/article/details/54312987
https://sdk.cn/news/3566
http://www.cnblogs.com/tgzhu/p/6136939.html
https://www.cnblogs.com/tgzhu/p/6113334.html
http://kylin.apache.org/docs/howto/howto_optimize_cubes.html