鉴于客户需求,有一个12万列的大宽表,那么这样一个大宽表如何实现呢?数据要存储在hadoop的hdfs中,而且要创建索引,并支持多维分析与多维检索。
12万列意味着生成的schema文件就有20多MB,由于太大,读取这个表的配置一是耗费很多内存,二是解析也很慢。
12w列在hive表中(spark 的hive sql模式),也意味着占用太多的元数据库信息,而且建表也容易失败,要知道hadoop的configuration也是有大小限制的。
让我们来一步一步的通过YDB与ya100来搞定这个12万列的大宽表吧,并且数据是通过kafka实时导入进来的。
具体ydb的使用步骤,这里不详细介绍,大家可以到ydb的官网下载YDB,安装包里附有详细的文档 http://ycloud.net.cn/yyydb
这里只重点介绍跟12W列有关的注意事项。
第一、ydb_site.yaml的如下配置参数都要改
# ydb_site.yaml overrides for the 120k-column wide table.
# According to the accompanying instructions, every parameter below must be changed.
ydb.realtime.doclist.buffsize: 8
ydb.directory.blockbuffer.percent: 8
ydb.realtime.buffer.ram.writepartion.each.percent: 1
ydb.directory.fieldvalue.reuse.blocksize: 0
ydb.reader.rawdata.start.delay.secs: 60
ydb.realtime.binlog.usebinlog: "false"
ydb.realtime.index.disk.maxcount: 6
ydb.realtime.buffer.maxcount: 6
# NOTE(review): this key used to appear twice with the same value. Duplicate mapping
# keys are invalid YAML, so a single entry is kept. If a different key was intended
# for the second entry, add it here — confirm against the YDB documentation.
ydb.realtime.ram.maxcount: 6
ydb.realtime.buffer.merger.factor: 2
ydb.realtime.ram.merger.factor: 2
ydb.realtime.index.disk.merger.factor: 2
ydb.realtime.index.merger.final.factor: 2
ydb.realtime.doclist.pending.thread.pral: 2
ydb.realtime.doclist.flush.process.threads: 1
kafka.queue.size.kafka_json: 6
第二,ya100_env.sh 如下配置都要改,要至少分配20G以上内存
# ya100_env.sh settings for the 120k-column table; allocate at least 20 GB of memory.
export YA100_MEMORY=25000m
export YA100_DRIVER_MEMORY=15000m
# Adjust to the number of executor processes that can actually be started
# (check the YARN configuration). This must be a '#' comment on its own line:
# a trailing '--text' is not a shell comment and makes `export` fail.
export YA100_EXECUTORS=2
第三,在YDB中创建一个空的表 (注意这里,没有12万的列哦,后面会使用隐藏的动态列)
-- Minimal YDB table: only `id` is declared; the c*_l columns are not listed here.
CREATE TABLE ydb12w (
    id string
)
第四、通过kafka或者文件导入数据,数据格式类似下面这样
注意:kafka的 message.max.bytes 与 replica.fetch.max.bytes 这两个参数别忘记调大,不然kafka导入不了这么大的单条数据
{"tablename":"ydb12w","ydbpartion":"20151011","list":[{"id":7614190,"c0_l":1,"c1_l":15,"c2_l":13,"c3_l":8,"c4_l":0,"c5_l":15,"c6_l":11,"c7_l":5,"c8_l":8,"c9_l":7,"c10_l":13,"c11_l":14,"c12_l":3,"c13_l":7,"c14_l":11,"c15_l":10,"c16_l":4,"c17_l":7,"c18_l":4,"c19_l":5,"c20_l":1,"c21_l":11,"c22_l":9,"c23_l":15,"c24_l":3,"c25_l":5,"c26_l":13,"c27_l":1,"c28_l":11,"c29_l":2,"c30_l":10,"c31_l":0,"c32_l":12,"c33_l":6,"c34_l":1,"c35_l":4,"c36_l":7,"c37_l":8,"c38_l":13,"c39_l":14,"c40_l":4,"c41_l":8,"c42_l":15,"c43_l":7,"c44_l":9,"c45_l":3,"c46_l":15,"c47_l":0,"c48_l":12,"c49_l":10,"c50_l":5,"c51_l":6,"c52_l":4,"c53_l":8,"c54_l":6,"c55_l":1,"c56_l":7,"c57_l":10,"c58_l":0,"c59_l":15,"c60_l":15,"c61_l":0,"c62_l":8,"c63_l":2,"c64_l":15,"c65_l":1,"c66_l":9,"c67_l":15,"c68_l":0,"c69_l":8,"c70_l":15,"c71_l":11,"c72_l":12,"c73_l":11,"c74_l":5,"c75_l":3,"c76_l":6,"c77_l":9,"c78_l":9,"c79_l":2,"c80_l":4,"c81_l":9,"c82_l":5,"c83_l":0,"c84_l":11,"c85_l":7,"c86_l":14,"c87_l":4,"c88_l":3,"c89_l":3,"c90_l":15,"c91_l":12,"c92_l":5,"c93_l":14,"c94_l":15,"c95_l":7,"c96_l":2,"c97_l":1,"c98_l":11,"c99_l":6,"c100_l":10,"c101_l":3,"c102_l":7,"c103_l":2,"c104_l":3,"c105_l":8,"c106_l":5,"c107_l":14,"c108_l":1,"c109_l":0,"c110_l":11,"c111_l":2,"c112_l":1,"c113_l":3,"c114_l":12,"c115_l":9,"c116_l":14,"c117_l":8,"c118_l":15,"c119_l":5,"c120_l":14,"c121_l":0,"c122_l":3,"c123_l":14,"c124_l":0,"c125_l":0,"c126_l":6,"c127_l":5,"c128_l":12,"c129_l":6,"c130_l":4,"c131_l":15,"c132_l":5,"c133_l":4,"c134_l":2,"c135_l":3,"c136_l":14,"c137_l":6,"c138_l":0,"c139_l":9,"c140_l":4,"c141_l":13,"c142_l":5,"c143_l":0,"c144_l":12,"c145_l":10,"c146_l":7,"c147_l":8,"c148_l":1,"c149_l":1,"c150_l":11,"c151_l":4,"c152_l":2,"c153_l":3,"c154_l":0,"c155_l":8,"c156_l":8,"c157_l":3,"c158_l":15,"c159_l":0,"c160_l":6,"c161_l":2,"c162_l":8,"c163_l":15,"c164_l":13,"c165_l":11,"c166_l":1,"c167_l":11,"c168_l":1,"c169_l":4,"c170_l":5,"c171_l":7,"c172_l":10,"c173_l":13,"c174_l":13,"c175_l":14,"c176_l":8,"c177_l":0,"c178_l":6,"c179_l":1,"c
{"tablename":"ydb12w","ydbpartion":"20151011","list":[{"id":4486455,"c0_l":8,"c1_l":5,"c2_l":14,"c3_l":14,"c4_l":1,"c5_l":3,"c6_l":2,"c7_l":2,"c8_l":3,"c9_l":1,"c10_l":3,"c11_l":0,"c12_l":1,"c13_l":6,"c14_l":14,"c15_l":3,"c16_l":8,"c17_l":14,"c18_l":15,"c19_l":3,"c20_l":4,"c21_l":6,"c22_l":4,"c23_l":12,"c24_l":7,"c25_l":7,"c26_l":14,"c27_l":8,"c28_l":0,"c29_l":3,"c30_l":14,"c31_l":9,"c32_l":11,"c33_l":11,"c34_l":1,"c35_l":9,"c36_l":11,"c37_l":10,"c38_l":9,"c39_l":6,"c40_l":15,"c41_l":15,"c42_l":6,"c43_l":4,"c44_l":5,"c45_l":1,"c46_l":4,"c47_l":0,"c48_l":0,"c49_l":4,"c50_l":5,"c51_l":2,"c52_l":4,"c53_l":4,"c54_l":7,"c55_l":0,"c56_l":8,"c57_l":13,"c58_l":2,"c59_l":8,"c60_l":6,"c61_l":9,"c62_l":3,"c63_l":11,"c64_l":8,"c65_l":6,"c66_l":9,"c67_l":2,"c68_l":14,"c69_l":7,"c70_l":14,"c71_l":15,"c72_l":3,"c73_l":0,"c74_l":2,"c75_l":4,"c76_l":1,"c77_l":3,"c78_l":9,"c79_l":5,"c80_l":5,"c81_l":13,"c82_l":5,"c83_l":8,"c84_l":2,"c85_l":6,"c86_l":11,"c87_l":14,"c88_l":0,"c89_l":2,"c90_l":2,"c91_l":0,"c92_l":13,"c93_l":3,"c94_l":0,"c95_l":13,"c96_l":7,"c97_l":3,"c98_l":11,"c99_l":3,"c100_l":9,"c101_l":5,"c102_l":15,"c103_l":0,"c104_l":5,"c105_l":14,"c106_l":7,"c107_l":3,"c108_l":0,"c109_l":1,"c110_l":1,"c111_l":4,"c112_l":12,"c113_l":7,"c114_l":8,"c115_l":12,"c116_l":10,"c117_l":3,"c118_l":11,"c119_l":4,"c120_l":11,"c121_l":6,"c122_l":9,"c123_l":14,"c124_l":15,"c125_l":4,"c126_l":7,"c127_l":7,"c128_l":10,"c129_l":0,"c130_l":5,"c131_l":2,"c132_l":7,"c133_l":13,"c134_l":3,"c135_l":4,"c136_l":15,"c137_l":4,"c138_l":4,"c139_l":11,"c140_l":5,"c141_l":9,"c142_l":5,"c143_l":2,"c144_l":14,"c145_l":3,"c146_l":6,"c147_l":6,"c148_l":6,"c149_l":3,"c150_l":4,"c151_l":11,"c152_l":8,"c153_l":11,"c154_l":3,"c155_l":6,"c156_l":2,"c157_l":7,"c158_l":13,"c159_l":15,"c160_l":9,"c161_l":8,"c162_l":12,"c163_l":6,"c164_l":8,"c165_l":12,"c166_l":15,"c167_l":10,"c168_l":9,"c169_l":0,"c170_l":4,"c171_l":4,"c172_l":6,"c173_l":7,"c174_l":5,"c175_l":2,"c176_l":6,"c177_l":1,"c178_l":11,"c179_l":3,"c180_l":5,"c1
第五、通过ydb的sql预览页面,看能否预览到数据
-- Preview a handful of the undeclared c*_l columns from partition 20151011.
-- LIMIT 0, 30 is kept as written (presumably offset 0, up to 30 rows — confirm for YDB).
SELECT
    id,
    c1_l,
    c10_l,
    c100_l,
    c1000_l,
    c1100_l,
    c1200_l
FROM ydb12w
WHERE ydbpartion = '20151011'
LIMIT 0, 30
第六、在SPARK中创建ya100与ydb表的动态映射
注意,这里并不是真的创建12万列,而是仅仅创建很少的列,如100列(下面的示例只创建了m1~m9共9个映射列),然后通过这些列动态地与ydb的12万列进行映射而已
-- Recreate the Spark-side external table that is backed by the YDB table ydb12w.
-- IF EXISTS keeps this snippet re-runnable when the table has not been created yet.
DROP TABLE IF EXISTS ydb12w;

-- Only a few placeholder columns (m1..m9) are declared instead of the full column set.
-- "ya100.handler.columns.mapping" lists one YDB column name per declared column, in
-- the same order (id, 9 mapped columns, ydbpartion, ya100_pipe); the mapping appears
-- to be positional — confirm against the ya100 documentation.
CREATE EXTERNAL TABLE ydb12w (
    id          string,
    m1          bigint,
    m2          bigint,
    m3          bigint,
    m4          bigint,
    m5          bigint,
    m6          bigint,
    m7          bigint,
    m8          bigint,
    m9          bigint,
    ydbpartion  string,
    ya100_pipe  string
)
STORED BY 'cn.net.ycloud.ydb.handle.Ya100StorageHandler'
TBLPROPERTIES (
    "ya100.handler.table.name"="ydb12w",
    "ya100.handler.master"="ycloudtest:1210",
    "ya100.handler.columns.mapping"="id,c1_l,c2_l,c3_l,c4_l,c5_l,c6_l,c7_l,c8_l,c9_l,ydbpartion,ya100_pipe"
);
第七,查询的时候通过Ymapping动态映射列
-- Row-level queries. Each one passes a YDB-side filter string to Yfilter and a
-- column list to Ymapping; the list names the YDB columns read through m1..m9
-- for that query (order presumably positional — confirm against the ya100 docs).

-- Partition 20151011, m1..m9 taken from c1_l..c9_l.
SELECT
    id, m1, m2, m3, m4, m5, m6, m7, m8, m9
FROM ydb12w
WHERE Yfilter('ydb12w', 'ydbpartion=\'20151011\' ', ya100_pipe)
  AND Ymapping('ydb12w', 'id,c1_l,c2_l,c3_l,c4_l,c5_l,c6_l,c7_l,c8_l,c9_l,ydbpartion,ya100_pipe', ya100_pipe)
LIMIT 10;

-- Same partition, m1..m9 taken from c11_l..c19_l instead.
SELECT
    id, m1, m2, m3, m4, m5, m6, m7, m8, m9
FROM ydb12w
WHERE Yfilter('ydb12w', 'ydbpartion=\'20151011\' ', ya100_pipe)
  AND Ymapping('ydb12w', 'id,c11_l,c12_l,c13_l,c14_l,c15_l,c16_l,c17_l,c18_l,c19_l,ydbpartion,ya100_pipe', ya100_pipe)
LIMIT 10;

-- Filter additionally on id = '21004410', columns c1_l..c9_l.
SELECT
    id, m1, m2, m3, m4, m5, m6, m7, m8, m9
FROM ydb12w
WHERE Yfilter('ydb12w', 'ydbpartion=\'20151011\' and id=\'21004410\'', ya100_pipe)
  AND Ymapping('ydb12w', 'id,c1_l,c2_l,c3_l,c4_l,c5_l,c6_l,c7_l,c8_l,c9_l,ydbpartion,ya100_pipe', ya100_pipe)
LIMIT 10;

-- Filter additionally on id = '21004410', columns c11_l..c19_l.
SELECT
    id, m1, m2, m3, m4, m5, m6, m7, m8, m9
FROM ydb12w
WHERE Yfilter('ydb12w', 'ydbpartion=\'20151011\' and id=\'21004410\'', ya100_pipe)
  AND Ymapping('ydb12w', 'id,c11_l,c12_l,c13_l,c14_l,c15_l,c16_l,c17_l,c18_l,c19_l,ydbpartion,ya100_pipe', ya100_pipe)
LIMIT 10;

-- Filter string references the mapped name m1 (m1 >= '10'), columns c11_l..c19_l.
SELECT
    id, m1, m2, m3, m4, m5, m6, m7, m8, m9
FROM ydb12w
WHERE Yfilter('ydb12w', 'ydbpartion=\'20151011\' and m1>=\'10\'', ya100_pipe)
  AND Ymapping('ydb12w', 'id,c11_l,c12_l,c13_l,c14_l,c15_l,c16_l,c17_l,c18_l,c19_l,ydbpartion,ya100_pipe', ya100_pipe)
LIMIT 10;
-- Aggregate queries: Ycount/Ysum/Yavg/Ymin over the mapped names m1..m3, with
-- m1..m9 taken from c11_l..c19_l via Ymapping. Ycombine is given the list of
-- aggregated fields ('*,m1,m2,m3'), matching the four aggregate calls.

-- Whole partition 20151011.
SELECT
    Ycount('*', ya100_pipe),
    Ysum('m1', ya100_pipe),
    Yavg('m2', ya100_pipe),
    Ymin('m3', ya100_pipe)
FROM ydb12w
WHERE Yfilter('ydb12w', 'ydbpartion=\'20151011\' ', ya100_pipe)
  AND Ymapping('ydb12w', 'id,c11_l,c12_l,c13_l,c14_l,c15_l,c16_l,c17_l,c18_l,c19_l,ydbpartion,ya100_pipe', ya100_pipe)
  AND Ycombine('ydb12w', '*,m1,m2,m3', ya100_pipe)
LIMIT 10;

-- Same aggregates, with the filter string restricted to m1 >= '10'.
SELECT
    Ycount('*', ya100_pipe),
    Ysum('m1', ya100_pipe),
    Yavg('m2', ya100_pipe),
    Ymin('m3', ya100_pipe)
FROM ydb12w
WHERE Yfilter('ydb12w', 'ydbpartion=\'20151011\' and m1>=\'10\'', ya100_pipe)
  AND Ymapping('ydb12w', 'id,c11_l,c12_l,c13_l,c14_l,c15_l,c16_l,c17_l,c18_l,c19_l,ydbpartion,ya100_pipe', ya100_pipe)
  AND Ycombine('ydb12w', '*,m1,m2,m3', ya100_pipe)
LIMIT 10;