hive SQL实现占比、同比、环比计算(lag函数,lead函数)

一、数据准备

-- 创建表并插入数据
-- Sample order table queried by every example in this article.
-- The statement is terminated with ';' so it can be run in the same script
-- as the INSERT that follows it.
CREATE TABLE `saleorder` (
    `order_id`   int,   -- order identifier
    `order_time` date,  -- order date; the examples bucket it by month and year
    `order_num`  int    -- per-order amount that the examples aggregate with sum()
);

-- ----------------------------
-- Records of saleorder
-- ----------------------------
-- Seed rows: orders dated January-April 2019 and January-April 2020.
-- The target columns are listed explicitly so the statement does not depend
-- on the physical column order of the table.
INSERT INTO `saleorder` (`order_id`, `order_time`, `order_num`) VALUES
(1, '2020-04-20', 420),
(2, '2020-04-04', 800),
(3, '2020-03-28', 500),
(4, '2020-03-13', 100),
(5, '2020-02-27', 300),
(6, '2020-01-07', 450),
(7, '2019-04-07', 800),
(8, '2019-03-15', 1200),
(9, '2019-02-17', 200),
(10, '2019-02-07', 600),
(11, '2019-01-13', 300);

查看表信息

-- Show every row of the sample table; columns are named explicitly rather
-- than relying on '*'.
SELECT
    order_id,
    order_time,
    order_num
FROM saleorder;

hive SQL实现占比、同比、环比计算(lag函数,lead函数)_第1张图片

二、占比

写法一:

基本思路:用隐式内连接,外加嵌套找出分子分母,相除(最后要分组)

-- Share of each month's order amount within its calendar year,
-- written as an implicit (comma) inner join of two aggregates.
SELECT
    t1.order_month,
    t1.num,
    t2.total,
    round( t1.num / t2.total, 2 ) AS ratio
FROM
(
    -- Monthly totals, keyed by 'yyyy-MM'.
    SELECT
        date_format( order_time, 'yyyy-MM' ) AS order_month,
        sum( order_num ) AS num
    FROM
        saleorder
    GROUP BY
        date_format( order_time, 'yyyy-MM' )
) t1,
(
    -- Yearly totals. The year is cast to STRING so that the join predicate
    -- below compares two strings instead of relying on implicit
    -- string-to-number coercion.
    SELECT
        cast( year( order_time ) AS string ) AS order_year,
        sum( order_num ) AS total
    FROM
        saleorder
    GROUP BY
        cast( year( order_time ) AS string )
) t2
-- Pair each month with the total of the year it belongs to.
WHERE substr( t1.order_month, 1, 4 ) = t2.order_year;

hive SQL实现占比、同比、环比计算(lag函数,lead函数)_第2张图片

写法二:显式内连接

基本思路:显式内连接,先分组、汇总–>按年份关联–>相除
友情提示: 时间处理的时候除了用date_format()也可以用substr()函数来截取年月日格式

-- Share of each month's order amount within its year, using an explicit
-- INNER JOIN between a monthly and a yearly aggregate.
WITH monthly AS (
    -- One row per 'yyyy-MM' prefix of order_time.
    SELECT
        substr( order_time, 1, 7 ) AS order_month,
        sum( order_num ) AS num
    FROM saleorder
    GROUP BY substr( order_time, 1, 7 )
),
yearly AS (
    -- One row per 'yyyy' prefix of order_time.
    SELECT
        substr( order_time, 1, 4 ) AS order_year,
        sum( order_num ) AS total
    FROM saleorder
    GROUP BY substr( order_time, 1, 4 )
)
SELECT
    m.order_month,
    m.num,
    y.total,
    round( m.num / y.total, 2 ) AS ratio
FROM monthly m
INNER JOIN yearly y
    -- The first four characters of the month key are its year.
    ON substr( m.order_month, 1, 4 ) = y.order_year;

hive SQL实现占比、同比、环比计算(lag函数,lead函数)_第3张图片

写法三:开窗函数
-- Share of each month's order amount within its year, using window
-- functions only (no join, no GROUP BY).
WITH per_order AS (
    -- Every order row carries its month total and its year total;
    -- month_row_no numbers the rows inside each month.
    SELECT
        substr( order_time, 1, 7 ) AS order_month,
        sum( order_num ) OVER ( PARTITION BY substr( order_time, 1, 7 ) ) AS num,
        sum( order_num ) OVER ( PARTITION BY substr( order_time, 1, 4 ) ) AS total,
        row_number() OVER ( PARTITION BY substr( order_time, 1, 7 ) ) AS month_row_no
    FROM saleorder
)
SELECT
    order_month,
    num,
    total,
    round( num / total, 2 ) AS ratio
FROM per_order
-- All rows of a month hold the same totals, so keep just one of them.
WHERE month_row_no = 1;

hive SQL实现占比、同比、环比计算(lag函数,lead函数)_第4张图片

注意:
(1) 时间处理的时候除了用date_format()也可以用substr()函数来截取年月日格式
(2)当我们求的占比分子分母没有时间维度只有数量的时候,我们可以采用on 1=1 来进行关联,构造成笛卡尔积

三、环比

与上年度数据对比称"同比",与上月数据对比称"环比"。
相关公式如下:

同比增长率计算公式
(当年值-上年值)/上年值x100% 

环比增长率计算公式
(当月值-上月值)/上月值x100% 

实现:

-- Month-over-month growth: (current month - previous row) / previous row.
WITH monthly AS (
    -- Total order amount per 'yyyy-MM'.
    SELECT
        substr( order_time, 1, 7 ) AS now_month,
        sum( order_num ) AS now_num
    FROM saleorder
    GROUP BY substr( order_time, 1, 7 )
),
with_previous AS (
    -- NOTE(review): lag(…, 1) returns the previous *row* in month order, which
    -- is the previous calendar month only when no month is missing. With the
    -- sample rows, 2020-01 is paired with 2019-04.
    SELECT
        now_month,
        now_num,
        lag( now_num, 1 ) OVER ( ORDER BY now_month ) AS last_num
    FROM monthly
)
SELECT
    now_month,
    now_num,
    last_num,
    round( ( now_num - last_num ) / last_num, 2 ) AS ratio
FROM with_previous;

hive SQL实现占比、同比、环比计算(lag函数,lead函数)_第5张图片
也可以对显示结果稍微优化一下:

-- Month-over-month growth rendered as a percentage string such as '12.5%'.
select
    now_month,
    now_num,
    last_num,
    -- nvl() maps a NULL rate (no previous row) to 0. The column is aliased so
    -- the result does not get an engine-generated column name.
    concat( nvl( round( ( now_num - last_num ) / last_num * 100, 2 ), 0 ), '%' ) as ratio
FROM
(
    -- NOTE(review): lag(…, 1) returns the previous row in month order, which is
    -- the previous calendar month only when no month is missing.
    select
        now_month,
        now_num,
        lag( t1.now_num, 1 ) over ( order by t1.now_month ) as last_num
    from
    (
        -- Total order amount per 'yyyy-MM'.
        select
            substr( order_time, 1, 7 ) as now_month,
            sum( order_num ) as now_num
        from saleorder
        group by
            substr( order_time, 1, 7 )
    ) t1
) t2;

hive SQL实现占比、同比、环比计算(lag函数,lead函数)_第6张图片

四、同比

与上年度数据对比称"同比",与上月数据对比称"环比"。
相关公式如下:

同比增长率计算公式
(当年值-上年值)/上年值x100% 

环比增长率计算公式
(当月值-上月值)/上月值x100% 

同比的话,如果每个月的数据都齐全,用 lag(num, 12) 就可以了。我们的例子中只有 2019 年和 2020 年 1-4 月份的数据,这种特殊情况应该如何处理?

写法一:本案例进行单独处理

有4个月数据,我就lag(num,4)

-- Year-over-year growth for the sample data set only.
WITH monthly AS (
    -- Total order amount per 'yyyy-MM'.
    SELECT
        substr( order_time, 1, 7 ) AS now_month,
        sum( order_num ) AS now_num
    FROM saleorder
    GROUP BY substr( order_time, 1, 7 )
),
with_prior_year AS (
    -- The sample has exactly four months per year, so the row four positions
    -- back is the same month of the previous year. This offset is specific
    -- to that data shape.
    SELECT
        now_month,
        now_num,
        lag( now_num, 4 ) OVER ( ORDER BY now_month ) AS last_num
    FROM monthly
)
SELECT
    now_month,
    now_num,
    last_num,
    round( ( now_num - last_num ) / last_num, 2 ) AS ratio
FROM with_prior_year;

hive SQL实现占比、同比、环比计算(lag函数,lead函数)_第7张图片
优化:
对空值可以做一下优化处理,用到nvl()函数和lag()函数的第三个参数。

-- Year-over-year growth for the sample data set, with NULLs replaced by 0.
WITH monthly AS (
    -- Total order amount per 'yyyy-MM'.
    SELECT
        substr( order_time, 1, 7 ) AS now_month,
        sum( order_num ) AS now_num
    FROM saleorder
    GROUP BY substr( order_time, 1, 7 )
),
with_prior_year AS (
    -- The third lag() argument supplies 0 when there is no row four
    -- positions back, so last_num is never NULL.
    SELECT
        now_month,
        now_num,
        lag( now_num, 4, 0 ) OVER ( ORDER BY now_month ) AS last_num
    FROM monthly
)
SELECT
    now_month,
    now_num,
    last_num,
    -- NOTE(review): presumably a zero last_num makes the division yield NULL
    -- in Hive, which nvl() then turns into 0 - confirm on your engine.
    nvl( round( ( now_num - last_num ) / last_num, 2 ), 0 ) AS ratio
FROM with_prior_year;

hive SQL实现占比、同比、环比计算(lag函数,lead函数)_第8张图片

写法二:通用方法

基本思路:利用日期函数(如 date_add()、add_months())把上一年的日期平移一年,再按月份与当年数据关联

-- Year-over-year growth per month (general approach).
-- t1 holds each month's total. t2 holds the same totals re-keyed to the month
-- one year later, so joining on the month key pairs every month with the
-- total of the same month in the previous year.
SELECT
    t1.now_month,
    CASE WHEN t1.now_num IS NULL OR t1.now_num = 0
        THEN 0 ELSE t1.now_num
        END AS now_num,
    CASE WHEN t2.last_num IS NULL OR t2.last_num = 0
        THEN 0 ELSE t2.last_num
        END AS last_num,
    -- Guard the division: a missing or zero prior-year total reports 0.
    CASE WHEN t2.last_num IS NULL OR t2.last_num = 0
        THEN 0 ELSE round( ( t1.now_num - t2.last_num ) / t2.last_num, 2 )
        END AS ratio
FROM
(
    -- Current totals per 'yyyy-MM'.
    SELECT
        date_format( order_time, 'yyyy-MM' ) AS now_month,
        sum( order_num ) AS now_num
    FROM
        saleorder
    GROUP BY
        date_format( order_time, 'yyyy-MM' )
) t1
LEFT JOIN
(
    -- Totals shifted forward by one calendar year. add_months(…, 12) is used
    -- instead of a fixed 365-day offset: across a leap day, 365 days after
    -- 2020-01-01 is 2020-12-31, which would land in the wrong month bucket.
    SELECT
        date_format( add_months( order_time, 12 ), 'yyyy-MM' ) AS now_month,
        sum( order_num ) AS last_num
    FROM
        saleorder
    GROUP BY
        date_format( add_months( order_time, 12 ), 'yyyy-MM' )
) AS t2 ON t1.now_month = t2.now_month;

hive SQL实现占比、同比、环比计算(lag函数,lead函数)_第9张图片
优化:
用nvl()代替case…when

-- Year-over-year growth per month (general approach), with nvl() supplying
-- the 0 defaults instead of CASE expressions.
SELECT
    t1.now_month,
    nvl( t1.now_num, 0 ) AS now_num,
    nvl( t2.last_num, 0 ) AS last_num,
    -- A month with no prior-year row has a NULL last_num, so the rate is NULL
    -- and nvl() reports 0.
    nvl( round( ( t1.now_num - t2.last_num ) / t2.last_num, 2 ), 0 ) AS ratio
FROM
(
    -- Current totals per 'yyyy-MM'.
    SELECT
        date_format( order_time, 'yyyy-MM' ) AS now_month,
        sum( order_num ) AS now_num
    FROM
        saleorder
    GROUP BY
        date_format( order_time, 'yyyy-MM' )
) t1
LEFT JOIN
(
    -- Totals shifted forward by one calendar year. add_months(…, 12) is used
    -- instead of a fixed 365-day offset: across a leap day, 365 days after
    -- 2020-01-01 is 2020-12-31, which would land in the wrong month bucket.
    SELECT
        date_format( add_months( order_time, 12 ), 'yyyy-MM' ) AS now_month,
        sum( order_num ) AS last_num
    FROM
        saleorder
    GROUP BY
        date_format( add_months( order_time, 12 ), 'yyyy-MM' )
) AS t2 ON t1.now_month = t2.now_month;

效果是一样的
hive SQL实现占比、同比、环比计算(lag函数,lead函数)_第10张图片

你可能感兴趣的:(hive,大数据,hive)