一 使用SpagoBI和Hive进行互联网统计分区问题?
1 原来根据年月日进行分区,拼接SQL时会异常复杂,需要在Where条件中判断是否跨年,跨月,跨日等情况。
2 Hive的自定义函数不能在Where条件后返回字符串条件来进行数据筛选,但是可以返回boolean进行数据过滤,
比如:自定义函数date_where(startTime, endTime),根据开始日期和结束日期返回格式为:
year=2015 and month=08 and day >= 1 and day <= 10 的条件字符串,并拼接到Where date_where("2015-08-01", "2015-08-10")中,这种用法Hive不支持。
二 根据上述问题,通过日期(dt)建立分区
Step1 创建数据库
1 创建tvlog_test数据库
-- "if not exists" keeps this step re-runnable, matching the create-table steps below.
create database if not exists tvlog_test;
Step2 创建数据表
1 创建tvlog_tcl数据表
-- tvlog_tcl in the tvlog_test database.
-- Every data column is declared as string; rows are partitioned by one
-- date key (dt) and stored in ORC format.
create table if not exists tvlog_test.tvlog_tcl (
    id           string,
    userid       string,
    channelid    string,
    channelname  string,
    region       string,
    channelcode  string,
    ip           string,
    starttime    string,
    endtime      string,
    fromchannel  string,
    tochannel    string,
    mac          string,
    deviceid     string,
    dnum         string
)
partitioned by (dt string)
stored as orc;
2 创建epg_wiki_info数据表
-- epg_wiki_info in the tvlog_test database.
-- Scalar columns are strings; wikiscreenshots and tags are string arrays,
-- wikicover is a string-to-string map. Rows are partitioned by one date
-- key (dt) and stored in ORC format.
create table if not exists tvlog_test.epg_wiki_info (
    id               string,
    name             string,
    starttime        string,
    endtime          string,
    wikiscreenshots  array<string>,
    wikicover        map<string, string>,
    wikititle        string,
    tags             array<string>,
    wikiid           string,
    channelcode      string,
    channelname      string,
    -- "timestamp" is a reserved keyword in Hive 1.2+; it must be back-quoted
    -- to be used as a column name. The column name itself is unchanged.
    `timestamp`      string
)
partitioned by (dt string)
stored as orc;
Step3 开启Hive动态分区插入
-- Session settings needed by the dynamic-partition inserts in Step4.
-- Turn on dynamic partition inserts.
set hive.exec.dynamic.partition=true;
-- nonstrict: every partition column may be dynamic (strict mode requires at
-- least one static partition, and the inserts below supply none).
set hive.exec.dynamic.partition.mode=nonstrict;
-- Raise the per-node cap on dynamic partitions a single mapper/reducer may create.
set hive.exec.max.dynamic.partitions.pernode=1000;
Step4 插入数据(2015-09-01 ~ 2015-09-05)
1 向tvlog_test.tvlog_tcl表插入数据
-- Copy the rows for 2015-09-01 .. 2015-09-05 from tvlog.tvlog_tcl into the
-- dt-partitioned table. The partition is dynamic: Hive takes the partition
-- value from the last column of the select list (dt).
-- NOTE(review): the filter uses year/month/day while the select list reads
-- dt; this assumes the source table exposes a dt column as well — confirm,
-- or derive dt from year/month/day.
insert overwrite table tvlog_test.tvlog_tcl partition (dt)
select
    id,
    userid,
    channelid,
    channelname,
    region,
    channelcode,
    ip,
    starttime,
    endtime,
    fromchannel,
    tochannel,
    mac,
    deviceid,
    dnum,
    dt
from tvlog.tvlog_tcl
where year = 2015
  and month = 9
  and day between 1 and 5;
2 向tvlog_test.epg_wiki_info表插入数据
-- Copy the rows for 2015-09-01 .. 2015-09-05 (inclusive) from
-- tvlog.epg_wiki_info into the dt-partitioned table. The partition is
-- dynamic: Hive takes the partition value from the last column of the
-- select list (dt).
insert overwrite table tvlog_test.epg_wiki_info partition (dt)
select
    id,
    name,
    starttime,
    endtime,
    wikiscreenshots,
    wikicover,
    wikititle,
    tags,
    wikiid,
    channelcode,
    channelname,
    -- "timestamp" is a reserved keyword in Hive 1.2+; back-quote it so it
    -- parses as a column reference.
    `timestamp`,
    dt
from tvlog.epg_wiki_info
where dt between '2015-09-01' and '2015-09-05';