在58遇到的第一个有意思的问题.
用户画像的原始表,有一亿记录,100多个维度(100多个列),比如年龄,性别,爱好,是否有车,是否有房什么的.
测试环境800w数据,大概在5G左右
需要解决的问题一 :在100列中任选N列,过滤查询,执行时间小于一秒。实际上N一般在5到10
即类似
select * from 画像表 where 性别=‘男’ and 年龄 between 20 and 30 and 有车='yes' and 有房='yes' and 已婚='no'
问题二:全体数据的随意聚合,执行时间小于5秒
比如
select 年龄,性别,count(*) from 画像表 group by 年龄,性别
在数据库解决这个问题有一些麻烦,传统建索引优化的方式不起作用了。
100多个列随意选择几列查询,索引不可能提前建出这么多.
先看测试数据
CREATE TABLE `o_huaxiang_big` (
`id` bigint(20) NOT NULL AUTO_INCREMENT,
`user_id` bigint(20) DEFAULT NULL,
`umc_sex` varchar(20) DEFAULT NULL,
`age` varchar(30) DEFAULT NULL,
PRIMARY KEY (`id`)
) ;
处理这个问题,我自然想到模拟一个位图.
一般画像数据有几种类型
1.数值类型
2.日期类型
3.日期时间类型
4.字符串类型
其中 日期和字符串类型可以作为离散值,
日期时间类型也可以转化为日期类型,作为离散值处理。
数值类型比较麻烦,需要人为介入判断是否是离散值,如果不是还需要划分范围。
总之,所有的值都要映射为离散值
然后以上图前5个数据为例,将离散值映射为位图
男 0 0 0 0 1
未知 1 0 0 1 0
女 0 1 1 0 0
一个bigint 是8字节的,为了取整,我存放60个记录的位信息。
然后建位图表如下
CREATE TABLE `bitmap20` (
`table_name` varchar(32) NOT NULL DEFAULT '' comment '位图表记录的原始表名称',
`column_name` varchar(32) NOT NULL DEFAULT '' comment '列名称',
`min_id` int(11) DEFAULT NULL comment '起始ID',
`max_id` int(11) DEFAULT NULL comment '终止ID',
`gid` int(11) NOT NULL DEFAULT '0' comment '分组ID,每组1200记录' ,
`grouped` varchar(32) NOT NULL DEFAULT '' comment '离散值',
`total` bigint(21) NOT NULL DEFAULT '0' comment '总数',
`c20` bigint(20) NOT NULL DEFAULT '0',
`c19` bigint(20) NOT NULL DEFAULT '0',
`c18` bigint(20) NOT NULL DEFAULT '0',
`c17` bigint(20) NOT NULL DEFAULT '0',
`c16` bigint(20) NOT NULL DEFAULT '0',
`c15` bigint(20) NOT NULL DEFAULT '0',
`c14` bigint(20) NOT NULL DEFAULT '0',
`c13` bigint(20) NOT NULL DEFAULT '0',
`c12` bigint(20) NOT NULL DEFAULT '0',
`c11` bigint(20) NOT NULL DEFAULT '0',
`c10` bigint(20) NOT NULL DEFAULT '0',
`c9` bigint(20) NOT NULL DEFAULT '0',
`c8` bigint(20) NOT NULL DEFAULT '0',
`c7` bigint(20) NOT NULL DEFAULT '0',
`c6` bigint(20) NOT NULL DEFAULT '0',
`c5` bigint(20) NOT NULL DEFAULT '0',
`c4` bigint(20) NOT NULL DEFAULT '0',
`c3` bigint(20) NOT NULL DEFAULT '0',
`c2` bigint(20) NOT NULL DEFAULT '0',
`c1` bigint(20) NOT NULL DEFAULT '0',
PRIMARY KEY (`column_name`,`gid`,`grouped`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=COMPRESSED comment '位图表';
c1-c20,一共20个bigint类型的字段,每个bigint记录60个位信息。
也就是位图表每行存储1200个原始记录的位图信息,并且位图表启用了压缩。
测试环境
4C 8G内存(innodb buffer 2G) SSD硬盘
800万原始画像数据,占用硬盘5G
初始化位图表
insert into bitmap20
select
'o_huaxiang_big' table_name,
'umc_sex' column_name,
((g1200-1)*60)*20 min_id,
((g1200-1)*60)*20+1200 max_id,
v2.*
from (
select
g1200,
grouped,
sum(total) total,
ifnull(max(case when abs((g1200-1)*20-g60)=20 then bitmap else null end),0) c20,
ifnull(max(case when abs((g1200-1)*20-g60)=19 then bitmap else null end),0) c19,
ifnull(max(case when abs((g1200-1)*20-g60)=18 then bitmap else null end),0) c18,
ifnull(max(case when abs((g1200-1)*20-g60)=17 then bitmap else null end),0) c17,
ifnull(max(case when abs((g1200-1)*20-g60)=16 then bitmap else null end),0) c16,
ifnull(max(case when abs((g1200-1)*20-g60)=15 then bitmap else null end),0) c15,
ifnull(max(case when abs((g1200-1)*20-g60)=14 then bitmap else null end),0) c14,
ifnull(max(case when abs((g1200-1)*20-g60)=13 then bitmap else null end),0) c13,
ifnull(max(case when abs((g1200-1)*20-g60)=12 then bitmap else null end),0) c12,
ifnull(max(case when abs((g1200-1)*20-g60)=11 then bitmap else null end),0) c11,
ifnull(max(case when abs((g1200-1)*20-g60)=10 then bitmap else null end),0) c10,
ifnull(max(case when abs((g1200-1)*20-g60)=9 then bitmap else null end),0) c9,
ifnull(max(case when abs((g1200-1)*20-g60)=8 then bitmap else null end),0) c8,
ifnull(max(case when abs((g1200-1)*20-g60)=7 then bitmap else null end),0) c7,
ifnull(max(case when abs((g1200-1)*20-g60)=6 then bitmap else null end),0) c6,
ifnull(max(case when abs((g1200-1)*20-g60)=5 then bitmap else null end),0) c5,
ifnull(max(case when abs((g1200-1)*20-g60)=4 then bitmap else null end),0) c4,
ifnull(max(case when abs((g1200-1)*20-g60)=3 then bitmap else null end),0) c3,
ifnull(max(case when abs((g1200-1)*20-g60)=2 then bitmap else null end),0) c2,
ifnull(max(case when abs((g1200-1)*20-g60)=1 then bitmap else null end),0) c1
from (
SELECT
CEIL(id / 60) g60,
CEIL(id / 1200) g1200,
umc_sex grouped,
COUNT(*) total,
BIT_OR(1 << (MOD(id, 60))) bitmap
FROM
o_huaxiang_big o
GROUP BY g1200 , g60 , umc_sex
) v1 group by g1200,grouped
) v2;
insert into bitmap20
select
'o_huaxiang_big' table_name,
'age' column_name,
((g1200-1)*60)*20 min_id,
((g1200-1)*60)*20+1200 max_id,
v2.*
from (
select
g1200,
grouped,
sum(total) total,
ifnull(max(case when abs((g1200-1)*20-g60)=20 then bitmap else null end),0) c20,
ifnull(max(case when abs((g1200-1)*20-g60)=19 then bitmap else null end),0) c19,
ifnull(max(case when abs((g1200-1)*20-g60)=18 then bitmap else null end),0) c18,
ifnull(max(case when abs((g1200-1)*20-g60)=17 then bitmap else null end),0) c17,
ifnull(max(case when abs((g1200-1)*20-g60)=16 then bitmap else null end),0) c16,
ifnull(max(case when abs((g1200-1)*20-g60)=15 then bitmap else null end),0) c15,
ifnull(max(case when abs((g1200-1)*20-g60)=14 then bitmap else null end),0) c14,
ifnull(max(case when abs((g1200-1)*20-g60)=13 then bitmap else null end),0) c13,
ifnull(max(case when abs((g1200-1)*20-g60)=12 then bitmap else null end),0) c12,
ifnull(max(case when abs((g1200-1)*20-g60)=11 then bitmap else null end),0) c11,
ifnull(max(case when abs((g1200-1)*20-g60)=10 then bitmap else null end),0) c10,
ifnull(max(case when abs((g1200-1)*20-g60)=9 then bitmap else null end),0) c9,
ifnull(max(case when abs((g1200-1)*20-g60)=8 then bitmap else null end),0) c8,
ifnull(max(case when abs((g1200-1)*20-g60)=7 then bitmap else null end),0) c7,
ifnull(max(case when abs((g1200-1)*20-g60)=6 then bitmap else null end),0) c6,
ifnull(max(case when abs((g1200-1)*20-g60)=5 then bitmap else null end),0) c5,
ifnull(max(case when abs((g1200-1)*20-g60)=4 then bitmap else null end),0) c4,
ifnull(max(case when abs((g1200-1)*20-g60)=3 then bitmap else null end),0) c3,
ifnull(max(case when abs((g1200-1)*20-g60)=2 then bitmap else null end),0) c2,
ifnull(max(case when abs((g1200-1)*20-g60)=1 then bitmap else null end),0) c1
from (
SELECT
CEIL(id / 60) g60,
CEIL(id / 1200) g1200,
age grouped,
COUNT(*) total,
BIT_OR(1 << (MOD(id, 60))) bitmap
FROM
o_huaxiang_big o
GROUP BY g1200 , g60 , age
) v1 group by g1200,grouped
) v2;
性别和年龄的初始化分别耗时36秒和49秒
两个维度的索引占用磁盘40M
聚合查询,800万数据耗时1.7秒.因为是CPU密集型操作,IO非常小,所以可以通过多线程再优化.
select t1p,t2p,sum(total)
from (
select
t1.grouped t1p,
t2.grouped t2p,
bit_count(t1.c1&t2.c1) +
bit_count(t1.c2&t2.c2) +
bit_count(t1.c3&t2.c3) +
bit_count(t1.c4&t2.c4) +
bit_count(t1.c5&t2.c5) +
bit_count(t1.c6&t2.c6) +
bit_count(t1.c7&t2.c7) +
bit_count(t1.c8&t2.c8) +
bit_count(t1.c9&t2.c9) +
bit_count(t1.c10&t2.c10) +
bit_count(t1.c11&t2.c11) +
bit_count(t1.c12&t2.c12) +
bit_count(t1.c13&t2.c13) +
bit_count(t1.c14&t2.c14) +
bit_count(t1.c15&t2.c15) +
bit_count(t1.c16&t2.c16) +
bit_count(t1.c17&t2.c17) +
bit_count(t1.c18&t2.c18) +
bit_count(t1.c19&t2.c19) +
bit_count(t1.c20&t2.c20) total
from
bitmap20 t1
inner join
bitmap20 t2
on(t1.gid=t2.gid)
where t1.column_name='umc_sex' and t2.column_name='age'
) t3 where total>0 group by t1p,t2p
还有一个问题是过滤
select max_id ,
concat(
concat(right(c20,1),left(c20,59)) ,
concat(right(c19,1),left(c19,59)) ,
concat(right(c18,1),left(c18,59)) ,
concat(right(c17,1),left(c17,59)) ,
concat(right(c16,1),left(c16,59)) ,
concat(right(c15,1),left(c15,59)) ,
concat(right(c14,1),left(c14,59)) ,
concat(right(c13,1),left(c13,59)) ,
concat(right(c12,1),left(c12,59)) ,
concat(right(c11,1),left(c11,59)) ,
concat(right(c10,1),left(c10,59)) ,
concat(right(c9,1),left(c9,59)) ,
concat(right(c8,1),left(c8,59)) ,
concat(right(c7,1),left(c7,59)) ,
concat(right(c6,1),left(c6,59)) ,
concat(right(c5,1),left(c5,59)) ,
concat(right(c4,1),left(c4,59)) ,
concat(right(c3,1),left(c3,59)) ,
concat(right(c2,1),left(c2,59)) ,
concat(right(c1,1),left(c1,59))
)
c
from (
select gid,min_id,max_id,
lpad(conv(bit_and(c20),10,2),60,'0') c20,
lpad(conv(bit_and(c19),10,2),60,'0') c19,
lpad(conv(bit_and(c18),10,2),60,'0') c18,
lpad(conv(bit_and(c17),10,2),60,'0') c17,
lpad(conv(bit_and(c16),10,2),60,'0') c16,
lpad(conv(bit_and(c15),10,2),60,'0') c15,
lpad(conv(bit_and(c14),10,2),60,'0') c14,
lpad(conv(bit_and(c13),10,2),60,'0') c13,
lpad(conv(bit_and(c12),10,2),60,'0') c12,
lpad(conv(bit_and(c11),10,2),60,'0') c11,
lpad(conv(bit_and(c10),10,2),60,'0') c10,
lpad(conv(bit_and(c9),10,2),60,'0') c9,
lpad(conv(bit_and(c8),10,2),60,'0') c8,
lpad(conv(bit_and(c7),10,2),60,'0') c7,
lpad(conv(bit_and(c6),10,2),60,'0') c6,
lpad(conv(bit_and(c5),10,2),60,'0') c5,
lpad(conv(bit_and(c4),10,2),60,'0') c4,
lpad(conv(bit_and(c3),10,2),60,'0') c3,
lpad(conv(bit_and(c2),10,2),60,'0') c2,
lpad(conv(bit_and(c1),10,2),60,'0') c1
from bitmap20
where (
(column_name='umc_sex' and grouped='未知') or
(column_name='age' and grouped='117'))
group by gid,min_id,max_id having count(distinct column_name)=2
) v1
用max_id 减去 ‘1’在c字符串的位置,就是原始的ID
来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/29254281/viewspace-2199179/,如需转载,请注明出处,否则将追究法律责任。
转载于:http://blog.itpub.net/29254281/viewspace-2199179/