hive电子商务消费行为分析

hive电子商务消费行为分析

1. 掌握Zeppelin的使用

2. 了解数据结构

3.数据清洗

4. 基于Hive的数据分析

1.物料准备

(1)Customer表

customer_details

details

customer_id

Int, 1 - 500

first_name

string

last_name

string

email

string, such as [email protected]

gender

string, Male or female

address

string

country

string

language

string

job

string, job title/position

credit_type

string, credit card type, such as visa

credit_no

string, credit card number

hive电子商务消费行为分析_第1张图片

(2)Store表

store_details

details

store_id

Int, 1 - 5

store_name

string

employee_number

Int, 在store有多少employee

hive电子商务消费行为分析_第2张图片

(3)Review表

store_review

details

stransaction_id

Int, 1 - 8000

store_id

Int, 1 - 5

review_store

Int, 1 - 5

hive电子商务消费行为分析_第3张图片

(4)Transaction表

transaction_details

details

transaction_id

Int, 1 - 1000

customer_id

Int, 1 - 500

store_id

Int, 1 - 5

price

decimal, such as 5.08

product

string, things bought

date

string, when to purchase

time

string, what time to purchase

hive电子商务消费行为分析_第4张图片

2. 实现数据清洗

2.1  找出PII (personal information identification) 或PCI (personal confidential information) 数据进行加密或hash

--数据加密处理
select * from ext_customer_details;
create view if not exists vm_customer_details as
select
    customer_id,
    first_name,
    unbase64(last_name) as last_name,
    unbase64(email) as email,
    gender,
    unbase64(address) as address,
    country,
    language,
    job,
    credit_type,
    unbase64(concat(unbase64(credit_no),'kb23')) as credit_no
from ext_customer_details;

show views ;
show tables ;
select * from vw_customer_details;

2.2  重新组织transaction数据按照日期YYYY-MM做分区,并对transaction_details中的重复数据生成新ID

--交易详情表  按月创建分区表

  create table if not exists transaction_details (

    transaction_id string,

    customer_id string,

    store_id string,

    price decimal(8,2),

    product string,

    purchase_date string,

    purchase_time string

  )

  partitioned by (purchase_month string);


------ext_transaction_details  重复数据生成新的id-------------

--查询重复的数据

  with

    base as ( select row_number() over (partition by transaction_id order by 1) as rn, * from ext_transaction_details )

  select * from base where rn > 1;

  

  --打开动态分区

  set hive.exec.dynamic.partition=true;

  set hive.exec.dynamic.partition.mode=nonstrict;

  

  --重复id处理方式  transaction_id_fix_rn  --> 123456_fix_2

  with

    base as (

        select

            transaction_id,

            customer_id,

            store_id,

            price,

            product,

            purchase_date,

            purchase_time,

            from_unixtime(unix_timestamp(purchase_date, 'yyyy-MM-dd'),'yyyy-MM') as purchase_month,

            row_number() over (partition by transaction_id order by store_id) as rn

        from ext_transaction_details)

  insert overwrite table transaction_details partition (purchase_month)

  select `if`(rn=1,transaction_id, concat(transaction_id, '_fix_', rn)), customer_id, store_id,

            price, product, purchase_date, purchase_time,purchase_month from base;

  

  show partitions transaction_details;

2.3  过滤掉store_review中没有评分的数据

%hive

use shopping;

-- select * from ext_store_review where review_score == '';

create view vw_store_review as(

select transaction_id, store_id, review_score from ext_store_review where review_score <> '');

3. Customer分析(zeppelin和datagrip查询)

3.1 找出顾客最常用的信用卡

%hive

use shopping;

select

    country,

    credit_type,

    count(distinct credit_no) as credit_cnt

from vm_customer_details

group by country,credit_type

order by credit_cnt desc;

hive电子商务消费行为分析_第5张图片

3.2 找出客户资料中排名前五的职位名称

%hive

use shopping;

select job, count(1) as pn

from vw_customer_details

group by job

order by pn desc limit 10;

hive电子商务消费行为分析_第6张图片

3.3 在美国女性最常用的信用卡

%hive

use shopping;

select

    credit_type,

    count(1) as credit_cnt

from vw_customer_details

where country='United States' and gender = 'Female'

group by credit_type

order by credit_cnt desc limit 3;

hive电子商务消费行为分析_第7张图片

3.4 按性别和国家进行客户统计

%hive

use shopping;

select

    country,

    gender,

    count(1) as cn

from vw_customer_details

group by country,gender

order by cn desc limit 3;

4. Transaction分析

4.1计算每月总收入

4.2计算每个季度的总收入

4.3按年计算总收入

4.4按工作日计算总收入

4.5按时间段计算总收入(需要清理数据)

4.6按时间段计算平均消费

4.4按工作日计算平均消费

4.8计算年、月、日的交易总数

4.9找出交易量最大的10个客户

4.10找出消费最多的前10位顾客

-- 1计算每月总收入

  select

    purchase_month, sum(price) as monthSUM

  from transaction_details

  group by purchase_month;
 

  -- 2计算每季度总收入

  with

    base as ( select

    price,

    concat_ws('-',substr(purchase_date,1,4),cast(ceil(month(purchase_date)/3.0) as string))as year_quarter

  from transaction_details)

  select  year_quarter, sum(price) as year_quarterSUM from base group by year_quarter;

  

  -- 3计算每年度总收入

  select

    substr(purchase_date,1,4),

    sum(price) as yearSUM

  from transaction_details

  group by substr(purchase_date,1,4);

  

  



  -- 4计算工作日收入

  select

    `dayofweek`(purchase_date),

    sum(price) as daySUM

  from transaction_details

  group by `dayofweek`(purchase_date)

  having `dayofweek`(purchase_date) in (2,3,4,5,6);

  

  -- 5按时间段计算总收入(需要清理数据)

  with

    base as (

        select price,purchase_time,

        if(purchase_time like '%M',from_unixtime(unix_timestamp(purchase_time,'hh:mm aa'),'HH:mm'),purchase_time) as time_format

        from transaction_details),

    timeformat as (

  select price, purchase_time, time_format, (cast(split(time_format,':')[0] as decimal(4,2))+ cast(split(time_format,':')[1] as decimal(4,2))/60) as purchase_time_in_hr from base),

t1 as (select price, purchase_time, time_format, purchase_time_in_hr,

    `if`(purchase_time_in_hr > 5 and purchase_time_in_hr <= 8, 'early morning',

    `if`(purchase_time_in_hr > 8 and purchase_time_in_hr <= 11, 'morning',

    `if`(purchase_time_in_hr > 11 and purchase_time_in_hr <= 13, 'noon',

    `if`(purchase_time_in_hr > 13 and purchase_time_in_hr <= 18, 'afternoon',

    `if`(purchase_time_in_hr > 18 and purchase_time_in_hr <= 22, 'evening', 'night'))))) as time_bucket

  from timeformat)

  select time_bucket, sum(price) bucketSUM, avg(price) avgprice from t1 group by time_bucket ;

  
 
   

  

  -- 6按时间段计算平均消费

  

  

  -- 7按工作日计算平均消费

  select

    `dayofweek`(purchase_date),

    avg(price) as dayAVG

  from transaction_details

  where `dayofweek`(purchase_date) in (2,3,4,5,6)

  group by `dayofweek`(purchase_date);

  

  -- 8计算年、月、日的交易总数

  select

    year(purchase_date) currentYear,

    count(1) as sumCount

  from transaction_details

  group by year(purchase_date);

  

  select

    substring(purchase_date,1,7) currentMonth,

    count(1) as sumCount

  from transaction_details

  group by substring(purchase_date,1,7);

  

  select

    day(purchase_date) currentDay,

    count(1) as sumCount

  from transaction_details

  group by day(purchase_date);

  

  -- 9找出交易量最大的10个客户

  with

    base as (

    select customer_id,count(transaction_id) as trans_cnt,sum(price) as customerSUM from transaction_details group by customer_id),

    cust_detail as(

    select concat_ws(' ', cd.first_name, '***') as cust_name, base.* ,

       dense_rank() over (order by trans_cnt desc ) rn from base join vw_customer_details cd on base.customer_id=cd.customer_id)

  select * from cust_detail where rn<=10;

  

  

  -- 10找出消费最多的前10位顾客

  with

    base as (

    select customer_id,sum(price) as customerSUM from transaction_details group by customer_id),

    cust_detail as(

    select concat_ws(' ', cd.first_name, '***') as cust_name, base.* ,

       dense_rank() over (order by customerSUM desc ) rn from base join vw_customer_details cd on base.customer_id=cd.customer_id)

  select * from cust_detail where rn<=10;
 

hive电子商务消费行为分析_第8张图片

Mysql中建表查询

7.11统计该期间交易数量最少的用户

7.12计算每个季度的独立客户总数

7.13计算每周的独立客户总数

7.14计算整个活动客户平均花费的最大值

7.15统计每月花费最多的客户

7.16统计每月访问次数最多的客户

7.17按总价找出最受欢迎的5种产品

7.18根据购买频率找出最畅销的5种产品

7.19根据客户数量找出最受欢迎的5种产品

-- 7.11统计该期间交易数量最少的用户

WITH

       t1 AS(SELECT customer_id,COUNT(1) c FROM transaction_details GROUP BY customer_id ORDER BY c),

       t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY t1.c) as rn FROM t1)

SELECT CONCAT(vcd.first_name,'***'), t2.* FROM t2 JOIN vw_customer_details vcd ON t2.customer_id=vcd.customer_id WHERE rn=1;



-- 7.12计算每个季度的独立客户总数

SELECT DISTINCT COUNT(customer_id),

       concat_ws('-',substr(purchase_date,1,4),cast(ceil(month(purchase_date)/3.0) as CHAR(20)))as year_quarter

from transaction_details

GROUP BY year_quarter

ORDER BY (year_quarter+0);



-- 7.13计算每周的独立客户总数

SELECT DISTINCT COUNT(customer_id),

       concat_ws('-',substr(purchase_date,1,4),cast(WEEKOFYEAR(purchase_date) as CHAR(20)))as week_year

from transaction_details

GROUP BY week_year;



-- 7.14计算整个活动客户平均花费的最大值

WITH

       t1 AS(SELECT customer_id, avg(price) as costAVG FROM transaction_details GROUP BY customer_id),

       t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY costAVG DESC) as rn FROM t1)

SELECT * FROM t2 WHERE rn=1;



-- 7.15统计每月花费最多的客户

WITH

       t1 AS(SELECT purchase_month,customer_id, sum(price) as costSUM FROM transaction_details GROUP BY purchase_month,customer_id),

       t2 AS(SELECT t1.*,DENSE_RANK() over(PARTITION by purchase_month ORDER BY costSUM DESC) as rn FROM t1)

SELECT CONCAT(vcd.first_name,'***'), t2.* FROM t2 JOIN vw_customer_details vcd ON t2.customer_id=vcd.customer_id WHERE rn=1;


-- 7.16统计每月访问次数最多的客户

WITH

       t1 AS(SELECT purchase_month,customer_id, count(customer_id) as costCount FROM transaction_details GROUP BY purchase_month,customer_id),

       t2 AS(SELECT t1.*,DENSE_RANK() over(PARTITION by purchase_month ORDER BY costCount DESC) as rn FROM t1)

SELECT CONCAT(vcd.first_name,'***'), t2.* FROM t2 JOIN vw_customer_details vcd ON t2.customer_id=vcd.customer_id WHERE rn=1;



-- 7.17按总价找出最受欢迎的5种产品

WITH

       t1 AS(SELECT product,sum(price) as proSUM FROM transaction_details GROUP BY product),

       t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY proSUM DESC) as rn FROM t1)

SELECT * FROM t2 WHERE rn BETWEEN 1 AND 5;



-- 7.18根据购买频率找出最畅销的5种产品

WITH

       t1 AS(SELECT product,COUNT(product) as proCount FROM transaction_details GROUP BY product),

       t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY proCount DESC) as rn FROM t1)

SELECT * FROM t2 WHERE rn BETWEEN 1 AND 5;



-- 7.19根据客户数量找出最受欢迎的5种产品

WITH

       t1 AS(SELECT product,COUNT(customer_id) as proCount FROM transaction_details GROUP BY product),

       t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY proCount DESC) as rn FROM t1)

SELECT * FROM t2 WHERE rn BETWEEN 1 AND 5;

你可能感兴趣的:(hive,hadoop,数据仓库)