某零售企业的门店最近一年收集的数据
customer_details.csv:客户信息
transaction_details.csv:交易信息
store_details.csv:门店信息
store_review.csv:评价信息
环境:CentOS 7 虚拟机,Hadoop + Hive + Zeppelin
启动Hadoop、Hive、Zeppelin
# Start Hadoop through its bundled start-all.sh script.
./hadoop/sbin/start-all.sh
# Launch HiveServer2 as a background job; nohup detaches it from the terminal's hang-up signal.
nohup hive --service hiveserver2 &
# Start the Zeppelin daemon.
./zeppelin09/bin/zeppelin-daemon.sh start
打开 Zeppelin 页面(URL 中的 hive 是虚拟机的主机名,也可以换成虚拟机的 IP 地址)
http://hive:8000/
进入到存放数据的虚拟机目录并查看文件信息
%sh
# Show the line count of each source CSV, then the first two lines of each,
# in the order: customer, store, review, transaction.
cd /workspace/hive/store/
csv_files="customer_details.csv store_details.csv store_review.csv transaction_details.csv"
for f in $csv_files; do
  wc -l "$f"
done
for f in $csv_files; do
  head -2 "$f"
done
%sh
# Rebuild /data/shopping on HDFS from the local CSV files: one sub-directory
# per data set, each holding exactly one CSV.
# Abort if the local data directory is missing; otherwise the recursive delete
# below would still run and every -put would then fail, leaving HDFS empty.
cd /workspace/hive/store/ || exit 1
# Remove any previous load (-skipTrash deletes immediately instead of moving to trash).
hdfs dfs -rm -r -f -skipTrash /data/shopping/
hdfs dfs -mkdir -p /data/shopping/customer/
hdfs dfs -put customer_details.csv /data/shopping/customer/
hdfs dfs -mkdir -p /data/shopping/transaction/
hdfs dfs -put transaction_details.csv /data/shopping/transaction/
hdfs dfs -mkdir -p /data/shopping/store/
hdfs dfs -put store_details.csv /data/shopping/store/
hdfs dfs -mkdir -p /data/shopping/review/
hdfs dfs -put store_review.csv /data/shopping/review/
# List the result recursively to confirm the upload.
hdfs dfs -ls -R /data/shopping
%hive
-- Create the shopping database and an external table over the customer CSV
-- under /data/shopping/customer. All columns are strings; the first line of
-- each file is skipped via skip.header.line.count.
create database if not exists shopping;
use shopping;
create external table if not exists ext_customer_details (
    customer_id  string,
    first_name   string,
    last_name    string,
    email        string,
    gender       string,
    address      string,
    country      string,
    language     string,
    job          string,
    credit_type  string,
    credit_no    string
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/data/shopping/customer'
tblproperties ("skip.header.line.count" = "1")
%hive
-- External table over the transaction CSV under /data/shopping/transaction;
-- the first line of each file is skipped via skip.header.line.count.
-- OpenCSVSerde exposes every column as string regardless of the declared type,
-- so price and purchase_date are declared as string here to match what is
-- actually read. They are converted to decimal(8,2) / date when the rows are
-- inserted into the managed transaction_details table.
use shopping;
create external table if not exists ext_transaction_details (
    transaction_id  string,
    customer_id     string,
    store_id        string,
    price           string,
    product         string,
    purchase_date   string,
    purchase_time   string
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/data/shopping/transaction'
tblproperties ("skip.header.line.count" = "1")
%hive
-- External table over the store CSV under /data/shopping/store.
-- All columns are strings; the first line of each file is skipped.
use shopping;
create external table if not exists ext_store_details (
    store_id         string,
    store_name       string,
    employee_number  string
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/data/shopping/store'
tblproperties ("skip.header.line.count" = "1")
%hive
-- External table over the review CSV under /data/shopping/review.
-- All columns are strings; the first line of each file is skipped.
use shopping;
create external table if not exists ext_store_review (
    transaction_id  string,
    store_id        string,
    review_score    string
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/data/shopping/review'
tblproperties ("skip.header.line.count" = "1")
%hive
-- View over ext_customer_details that passes last_name, email and address
-- through unbase64() and derives credit_no from the original value with the
-- literal 'seed' appended between two unbase64() calls.
-- The language column of the source table is not exposed by this view.
-- NOTE(review): presumably this is meant to hide personal data - confirm intent.
create view if not exists vw_customer_details as
select
    customer_id,
    first_name,
    unbase64(last_name) as last_name,
    unbase64(email)     as email,
    gender,
    unbase64(address)   as address,
    country,
    job,
    credit_type,
    unbase64(concat(unbase64(credit_no),'seed')) as credit_no
from ext_customer_details
%hive
-- Managed table for the transaction rows, partitioned by purchase_month.
create table if not exists transaction_details (
    transaction_id  string,
    customer_id     string,
    store_id        string,
    price           decimal(8,2),
    product         string,
    purchase_date   date,
    purchase_time   string
)
partitioned by (purchase_month string)
%hive
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- Load transaction_details from the external table using dynamic partitioning
-- on purchase_month (the 'yyyy-MM' part of purchase_date). nonstrict mode is
-- required because no static partition value is given in the insert.
-- Rows sharing a transaction_id are numbered by store_id order: the first keeps
-- its id, the others get '_fix<rn>' appended so ids become unique.
with base as (
    select
        transaction_id,
        customer_id,
        store_id,
        price,
        product,
        purchase_date,
        purchase_time,
        from_unixtime(unix_timestamp(purchase_date,'yyyy-MM-dd'),'yyyy-MM') as purchase_month,
        row_number() over(partition by transaction_id order by store_id) as rn
    from ext_transaction_details
    -- drops any row whose customer_id is the literal column name (presumably a stray header row)
    where customer_id<>'customer_id'
)
from base
insert overwrite table transaction_details partition(purchase_month)
select
    if(rn=1,transaction_id,concat(transaction_id,'_fix',rn)) as transaction_id,
    customer_id,
    store_id,
    price,
    product,
    purchase_date,
    purchase_time,
    purchase_month;
-- Show the rows whose transaction_id was rewritten by the load above.
select
    transaction_id,
    customer_id,
    store_id,
    price,
    product,
    purchase_date,
    purchase_time,
    purchase_month
from transaction_details
where transaction_id like '%fix%';
%hive
-- Number of non-empty reviews whose transaction_id and store_id both match a
-- row in the raw transaction table.
select count(*)
from ext_store_review as r
inner join ext_transaction_details as t
    on  r.transaction_id = t.transaction_id
    and r.store_id = t.store_id
where r.review_score <> ''
%hive
-- Number of review rows that carry a non-empty score.
select count(*)
from ext_store_review
where review_score <> '';
%hive
-- View exposing only the reviews that have a non-empty score
-- (transaction_id and review_score; store_id is not included).
create view if not exists vw_store_review as
select
    transaction_id,
    review_score
from ext_store_review
where review_score <> ''
最受客户欢迎的信用卡
%hive
-- Distinct card numbers per country and card type, largest count first.
-- country is selected as well as grouped on: without it the result contains
-- several rows for the same credit_type that cannot be told apart.
select
    country,
    credit_type,
    count(distinct credit_no) as credit_cnt
from vw_customer_details
group by country, credit_type
order by credit_cnt desc;
%hive
-- The five jobs held by the most customers.
select
    job,
    count(*) as pn
from vw_customer_details
group by job
order by pn desc
limit 5;
%hive
-- The three card types held by the most female customers in the United States.
select
    credit_type,
    count(*) as ct
from vw_customer_details
where country = 'United States'
  and gender = 'Female'
group by credit_type
order by ct desc
limit 3;
%hive
-- Customer count for every (country, gender) combination.
select
    count(*),
    country,
    gender
from vw_customer_details
group by country, gender;
按月统计总收益
%hive
-- Total revenue per purchase_month partition.
select
    sum(price) as revenue_mom,
    purchase_month
from transaction_details
group by purchase_month;
%hive
-- Total revenue per quarter. year_quarter is '<yyyy>-<q>', where q is
-- ceil(month / 3.0), i.e. 1 to 4.
with quarterly as (
    select
        price,
        concat_ws('-',
                  substr(purchase_date,1,4),
                  cast(ceil(month(purchase_date)/3.0) as string)) as year_quarter
    from transaction_details
)
select
    sum(price) as revenue_qoq,
    year_quarter
from quarterly
group by year_quarter;
%hive
-- Total revenue per calendar year (first four characters of purchase_date).
-- The measure is named revenue_yoy to match the revenue_mom / revenue_qoq
-- naming of the monthly and quarterly queries; it was previously mislabelled
-- revenue_mom.
select
    sum(price) as revenue_yoy,
    substr(purchase_date,1,4) as year
from transaction_details
group by substr(purchase_date,1,4);
%hive
-- Total revenue per day of week, using the 'u' date_format pattern.
-- NOTE(review): 'u' is expected to yield the day number of the week - confirm
-- for the Hive version in use.
select
    sum(price) as revenue_wow,
    date_format(purchase_date,'u') as weekday
from transaction_details
group by date_format(purchase_date,'u');
%hive
-- Average spend and total revenue (in thousands) per time-of-day bucket.
with base as (
    -- Rewrite purchase_time as a 24-hour "hour:minute" string: values ending
    -- in PM get 12 added to the hour parsed with 'hh:mm'; all others are
    -- re-formatted as 'HH:mm'.
    select
        price,
        purchase_time,
        if(purchase_time like '%PM',
           concat_ws(':',
                     string(hour(from_unixtime(unix_timestamp(purchase_time,'hh:mm')))+12),
                     string(minute(from_unixtime(unix_timestamp(purchase_time,'hh:mm'))))),
           from_unixtime(unix_timestamp(purchase_time,'hh:mm'),'HH:mm')) as time_format
    from transaction_details
),
timeformat as (
    -- Convert "hour:minute" into fractional hours.
    select
        purchase_time,
        price,
        (cast(split(time_format,':')[0] as decimal(4,2))
         + cast(split(time_format,':')[1] as decimal(4,2))/60) as purchase_time_in_hrs
    from base
),
timebucket as (
    -- Buckets are half-open on the left: (5,8], (8,11], (11,13], (13,18],
    -- (18,22]; everything else, including unparseable times, is 'night'.
    select
        price,
        purchase_time,
        purchase_time_in_hrs,
        case
            when purchase_time_in_hrs > 5  and purchase_time_in_hrs <= 8  then 'early morning'
            when purchase_time_in_hrs > 8  and purchase_time_in_hrs <= 11 then 'morning'
            when purchase_time_in_hrs > 11 and purchase_time_in_hrs <= 13 then 'noon'
            when purchase_time_in_hrs > 13 and purchase_time_in_hrs <= 18 then 'afternoon'
            when purchase_time_in_hrs > 18 and purchase_time_in_hrs <= 22 then 'evening'
            else 'night'
        end as time_bucket
    from timeformat
)
select
    time_bucket,
    avg(price) as avg_spend,
    sum(price)/1000 as revenue_k  -- divided by 1000 so the chart is easier to read
from timebucket
group by time_bucket
%hive
-- Average transaction price per day of week; rows whose purchase_date cannot
-- be formatted (null weekday) are excluded.
select
    avg(price) as avg_price,
    date_format(purchase_date,'u') as weekday
from transaction_details
where date_format(purchase_date,'u') is not null
group by date_format(purchase_date,'u');
%hive
-- Distinct transaction count per weekday, month, quarter and year.
with periods as (
    select
        transaction_id,
        date_format(purchase_date,'u') as weekday,
        purchase_month,
        concat_ws('-',
                  substr(purchase_date,1,4),
                  cast(ceil(month(purchase_date)/3.0) as string)) as year_quarter,
        substr(purchase_date,1,4) as year
    from transaction_details
    where purchase_month is not null
)
select
    count(distinct transaction_id) as total,
    weekday,
    purchase_month,
    year_quarter,
    year
from periods
group by weekday, purchase_month, year_quarter, year
order by year, purchase_month
%hive
-- The ten customers with the most distinct transactions, shown by first name.
with cust_totals as (
    select
        customer_id,
        count(distinct transaction_id) as trans_cnt,
        sum(price) as spend_total
    from transaction_details
    where purchase_month is not null
    group by customer_id
),
cust_detail as (
    select
        ct.customer_id,
        ct.trans_cnt,
        ct.spend_total,
        cd.first_name as cust_name
    from cust_totals as ct
    inner join vw_customer_details as cd
        on ct.customer_id = cd.customer_id
)
select
    trans_cnt,
    cust_name as top10_trans_cust
from cust_detail
order by trans_cnt desc
limit 10;
%hive
-- The ten customers with the highest total spend, shown by first name.
with cust_totals as (
    select
        customer_id,
        count(distinct transaction_id) as trans_cnt,
        sum(price) as spend_total
    from transaction_details
    where purchase_month is not null
    group by customer_id
),
cust_detail as (
    select
        ct.customer_id,
        ct.trans_cnt,
        ct.spend_total,
        cd.first_name as cust_name
    from cust_totals as ct
    inner join vw_customer_details as cd
        on ct.customer_id = cd.customer_id
)
select
    spend_total,
    cust_name as top10_trans_cust
from cust_detail
order by spend_total desc
limit 10;
%hive
-- Ten customers taken in ascending order of distinct transaction count,
-- i.e. the least frequent buyers come first.
with cust_counts as (
    select
        customer_id,
        count(distinct transaction_id) as trans_cnt
    from transaction_details
    where purchase_month is not null
    group by customer_id
)
select
    customer_id,
    trans_cnt
from cust_counts
order by trans_cnt
limit 10;
%hive
-- Distinct customers per quarter ('<yyyy>-<q>') and year.
with quarterly as (
    select
        customer_id,
        concat_ws('-',
                  substr(purchase_date,1,4),
                  cast(ceil(month(purchase_date)/3.0) as string)) as year_quarter,
        substr(purchase_date,1,4) as year
    from transaction_details
    where purchase_month is not null
)
select
    count(distinct customer_id) as total,
    year_quarter,
    year
from quarterly
group by year_quarter, year
order by year_quarter;
%hive
-- Highest per-customer average price. price_max is computed per customer as
-- well but is not used by the final select.
with cust_prices as (
    select
        customer_id,
        avg(price) as price_avg,
        max(price) as price_max
    from transaction_details
    where purchase_month is not null
    group by customer_id
)
select max(price_avg)
from cust_prices;
%hive
-- For every month, the customer(s) ranked first by total spend ('spend') and
-- by transaction count ('visit'). rank() keeps ties, so a month can return
-- several customers for one measure.
with base as (
    select
        customer_id,
        purchase_month,
        sum(price) as price_sum,
        count(transaction_id) as trans_cnt
    from transaction_details
    where purchase_month is not null
    group by purchase_month, customer_id
),
rank_sum as (
    select
        rank() over(partition by purchase_month order by price_sum desc) as rn_sum,
        rank() over(partition by purchase_month order by trans_cnt desc) as rn_cnt,
        purchase_month,
        price_sum,
        trans_cnt,
        customer_id
    from base
)
select
    purchase_month,
    'spend' as measure_name,
    price_sum as measure_value,
    customer_id
from rank_sum
where rn_sum = 1
union all
select
    purchase_month,
    'visit' as measure_name,
    trans_cnt as measure_value,
    customer_id
from rank_sum
where rn_cnt = 1
order by measure_name, purchase_month;
%hive
-- The five products with the highest total revenue.
select
    product,
    sum(price) as price_sum
from transaction_details
where purchase_month is not null
group by product
order by price_sum desc
limit 5;
%hive
-- The five products with the most transactions.
select
    product,
    count(transaction_id) as freq_buy
from transaction_details
where purchase_month is not null
group by product
order by freq_buy desc
limit 5;
%hive
-- The five products with the most customer_id occurrences. The count is not
-- distinct, so a customer buying a product repeatedly is counted each time.
select
    product,
    count(customer_id) as freq_cust
from transaction_details
where purchase_month is not null
group by product
order by freq_cust desc
limit 5;
按客流量统计最受欢迎的门店
%hive
-- The five stores with the most distinct customers.
select
    sd.store_name,
    count(distinct customer_id) as unique_visit
from transaction_details as td
inner join ext_store_details as sd
    on td.store_id = sd.store_id
group by store_name
order by unique_visit desc
limit 5;
%hive
-- The five stores with the highest total revenue.
select
    sd.store_name,
    sum(td.price) as total_revnue
from transaction_details as td
inner join ext_store_details as sd
    on td.store_id = sd.store_id
group by store_name
order by total_revnue desc
limit 5;
%hive
-- The five stores with the most transactions.
select
    sd.store_name,
    count(transaction_id) as unique_purchase
from transaction_details as td
inner join ext_store_details as sd
    on td.store_id = sd.store_id
group by store_name
order by unique_purchase desc
limit 5;
%hive
-- For every store, the product(s) bought by the most distinct customers.
-- rank() keeps ties, so a store can list more than one product.
with store_product as (
    select
        store_id,
        product,
        count(distinct customer_id) as freq_cust
    from transaction_details
    where purchase_month is not null
    group by store_id, product
),
prod_rank as (
    select
        store_id,
        product,
        freq_cust,
        rank() over(partition by store_id order by freq_cust desc) as rn
    from store_product
)
select
    store_name,
    product,
    freq_cust
from prod_rank as td
inner join ext_store_details as sd
    on td.store_id = sd.store_id
where td.rn = 1;
%hive
-- Customer visits per employee for each store. A visit is one distinct
-- (customer_id, purchase_date) pair; employee_number is stored as a string and
-- is converted implicitly by the division.
with store_visits as (
    select
        store_id,
        count(distinct customer_id,purchase_date) as cust_visit
    from transaction_details
    where purchase_month is not null
    group by store_id
)
select
    store_name,
    cust_visit,
    employee_number,
    round(cust_visit/employee_number,2) as cust_per_employee_within_period
from store_visits as td
inner join ext_store_details as sd
    on td.store_id = sd.store_id;
%hive
-- Revenue per store and month.
select
    store_name,
    purchase_month,
    sum(price) as revenue
from transaction_details as td
inner join ext_store_details as sd
    on td.store_id = sd.store_id
where purchase_month is not null
group by store_name, purchase_month;
%hive
-- Total revenue per store.
select
    store_name,
    sum(price) as revenue
from transaction_details as td
inner join ext_store_details as sd
    on td.store_id = sd.store_id
where purchase_month is not null
group by store_name;
%hive
-- Transaction count per store and time-of-day bucket, busiest bucket first
-- within each store.
with base as (
    -- Rewrite purchase_time as a 24-hour "hour:minute" string: values ending
    -- in PM get 12 added to the hour parsed with 'hh:mm'; all others are
    -- re-formatted as 'HH:mm'.
    select
        transaction_id,
        purchase_time,
        if(purchase_time like '%PM',
           concat_ws(':',
                     string(hour(from_unixtime(unix_timestamp(purchase_time,'hh:mm')))+12),
                     string(minute(from_unixtime(unix_timestamp(purchase_time,'hh:mm'))))),
           from_unixtime(unix_timestamp(purchase_time,'hh:mm'),'HH:mm')) as time_format,
        store_id
    from transaction_details
    where purchase_month is not null
),
timeformat as (
    -- Convert "hour:minute" into fractional hours.
    select
        purchase_time,
        transaction_id,
        (cast(split(time_format,':')[0] as decimal(4,2))
         + cast(split(time_format,':')[1] as decimal(4,2))/60) as purchase_time_in_hrs,
        store_id
    from base
),
timebucket as (
    -- Buckets are half-open on the left: (5,8], (8,11], (11,13], (13,18],
    -- (18,22]; everything else, including unparseable times, is 'night'.
    select
        transaction_id,
        purchase_time,
        purchase_time_in_hrs,
        store_id,
        case
            when purchase_time_in_hrs > 5  and purchase_time_in_hrs <= 8  then 'early morning'
            when purchase_time_in_hrs > 8  and purchase_time_in_hrs <= 11 then 'morning'
            when purchase_time_in_hrs > 11 and purchase_time_in_hrs <= 13 then 'noon'
            when purchase_time_in_hrs > 13 and purchase_time_in_hrs <= 18 then 'afternoon'
            when purchase_time_in_hrs > 18 and purchase_time_in_hrs <= 22 then 'evening'
            else 'night'
        end as time_bucket
    from timeformat
)
select
    sd.store_name,
    count(transaction_id) as tran_cnt,
    time_bucket
from timebucket as td
inner join ext_store_details as sd
    on td.store_id = sd.store_id
group by sd.store_name, time_bucket
order by sd.store_name, tran_cnt desc;
%hive
-- The top five customers by total spend in every store. rank() keeps ties, so
-- a store can return more than five rows.
with store_cust as (
    select
        store_name,
        customer_id,
        sum(td.price) as total_cust_purphase
    from transaction_details as td
    inner join ext_store_details as sd
        on td.store_id = sd.store_id
    where purchase_month is not null
    group by store_name, customer_id
),
rk_cust as (
    select
        store_name,
        customer_id,
        total_cust_purphase,
        rank() over(partition by store_name order by total_cust_purphase desc) as rn
    from store_cust
)
select
    store_name,
    customer_id,
    total_cust_purphase,
    rn
from rk_cust
where rn <= 5;
-- Revenue per employee for each store. employee_number is stored as a string
-- and is converted implicitly by the division.
with store_revenue as (
    select
        store_id,
        sum(price) as revenue
    from transaction_details
    where purchase_month is not null
    group by store_id
)
select
    store_name,
    revenue,
    employee_number,
    round(revenue/employee_number,2) as revenue_per_employee_within_period
from store_revenue as td
inner join ext_store_details as sd
    on td.store_id = sd.store_id;
找出客户评价的覆盖范围
%hive
-- Review coverage: total joined rows, rows with no matching review row, and
-- rows with one. Matching is on transaction_id only and uses the raw review
-- table, so review rows with an empty score still count as existing.
select
    count(td.transaction_id) as total_trans,
    sum(if(sr.transaction_id is null, 1, 0)) as total_review_missed,
    sum(if(sr.transaction_id is not null, 1, 0)) as total_review_exist
from transaction_details as td
left join ext_store_review as sr
    on td.transaction_id = sr.transaction_id
where purchase_month is not null;
%hive
-- Per review score: number of distinct customers and number of reviews,
-- ignoring reviews with an empty score.
select
    review_score,
    count(distinct customer_id) as num_customer,
    count(*) as num_reviews
from transaction_details as td
inner join ext_store_review as sr
    on td.transaction_id = sr.transaction_id
where purchase_month is not null
  and review_score <> ''
group by review_score;
%hive
-- Number of score-'5' reviewed transactions per customer and store,
-- highest count first.
select
    count(*) as visit_cnt,
    customer_id,
    td.store_id
from transaction_details as td
inner join ext_store_review as sr
    on td.transaction_id = sr.transaction_id
where purchase_month is not null
  and review_score = '5'
group by customer_id, td.store_id
order by visit_cnt desc;