项目文件获取,提取码: m3d4
需求概述
customer_details | details |
---|---|
customer_id | Int, 1 - 500 |
first_name | string |
last_name | string |
email | string, such as user@example.com |
gender | string, Male or Female |
address | string |
country | string |
language | string |
job | string, job title/position |
credit_type | string, credit card type, such as visa |
credit_no | string, credit card number |
问题:language字段数据存在错误
transaction_details | details |
---|---|
transaction_id | Int, 1 - 1000 |
customer_id | Int, 1 - 500 |
store_id | Int, 1 - 5 |
price | decimal, such as 5.08 |
product | string, things bought |
date | string, when to purchase |
time | string, what time to purchase |
问题:表中transaction_id有重复,但数据有效,需要修复数据
store_details | details |
---|---|
store_id | Int, 1 - 5 |
store_name | string |
employee_number | Int, number of employees in the store |
store_review | details |
---|---|
transaction_id | Int, 1 - 8000 |
store_id | Int, 1 - 5 |
review_score | Int, 1 - 5 |
问题:表中有无效的score数据;表中有将transaction_id映射到错误的store_id的记录
%sh
## /tmp/data/
# Inspect the four source CSV files before loading them.
# Stop if the data directory is missing instead of inspecting the wrong directory.
cd /tmp/data/ || exit 1
# Count the lines of each file
wc -l customer_details.csv
wc -l store_details.csv
wc -l transaction_details.csv
wc -l store_review.csv
# Show the first two lines of each file
head -2 customer_details.csv
head -2 transaction_details.csv
head -2 store_details.csv
head -2 store_review.csv
%sh
# Rebuild the HDFS working directory and upload the four CSV files.
# Stop before deleting anything on HDFS if the local data directory is missing.
cd /tmp/data/ || exit 1
# Start from an empty project directory
hdfs dfs -rm -r -f /tmp/shopping
hdfs dfs -mkdir -p /tmp/shopping/data/customer
hdfs dfs -mkdir -p /tmp/shopping/data/store
hdfs dfs -mkdir -p /tmp/shopping/data/review
hdfs dfs -mkdir -p /tmp/shopping/data/transaction
# Open up only the project directory rather than everything under HDFS /tmp
hdfs dfs -chmod -R 777 /tmp/shopping
# Upload the data files to HDFS
hdfs dfs -put customer_details.csv /tmp/shopping/data/customer/
hdfs dfs -put transaction_details.csv /tmp/shopping/data/transaction/
hdfs dfs -put store_details.csv /tmp/shopping/data/store/
hdfs dfs -put store_review.csv /tmp/shopping/data/review/
4.1 Clear all tables if exists
-- Create the project database if it is missing, then make it the current database.
-- Each statement is terminated so the two are not parsed as one.
create database if not exists shopping;
use shopping;
-- Raw customer data: external table over the CSV files in /tmp/shopping/data/customer.
-- The first line of each file is skipped as a header.
create external table if not exists ext_customer_details (
    customer_id  string,  -- kept as string here; int would work as well
    first_name   string,
    last_name    string,
    email        string,
    gender       string,
    address      string,
    country      string,
    language     string,
    job          string,
    credit_type  string,
    credit_no    string
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/tmp/shopping/data/customer'  -- location has to be written before tblproperties
tblproperties ("skip.header.line.count" = "1");
-- Raw transaction data: external table over the CSV files in /tmp/shopping/data/transaction.
-- The first line of each file is skipped as a header.
-- OpenCSVSerde exposes every column as string whatever type is declared, so price is
-- declared as string to match what queries actually receive; cast it where a number is needed.
create external table if not exists ext_transaction_details (
    transaction_id  string,
    customer_id     string,
    store_id        string,
    price           string,  -- decimal text such as 5.08
    product         string,
    purchase_date   string,
    purchase_time   string
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/tmp/shopping/data/transaction'  -- location has to be written before tblproperties
tblproperties ("skip.header.line.count" = "1");
-- Raw store data: external table over the CSV files in /tmp/shopping/data/store.
-- The first line of each file is skipped as a header.
-- OpenCSVSerde exposes every column as string whatever type is declared, so
-- employee_number is declared as string; cast it where a number is needed.
create external table if not exists ext_store_details (
    store_id         string,
    store_name       string,
    employee_number  string  -- integer text
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/tmp/shopping/data/store'  -- location has to be written before tblproperties
tblproperties ("skip.header.line.count" = "1");
-- Raw review data: external table over the CSV files in /tmp/shopping/data/review.
-- The first line of each file is skipped as a header.
-- OpenCSVSerde exposes every column as string whatever type is declared, so
-- review_score is declared as string (which is why empty scores show up as '').
create external table if not exists ext_store_review (
    transaction_id  string,
    store_id        string,
    review_score    string  -- integer text, may be empty
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/tmp/shopping/data/review'  -- location has to be written before tblproperties
tblproperties ("skip.header.line.count" = "1");
4.2 Verify all Tables are Created
%hive
-- Spot-check the external tables; uncomment one of the queries below to look at another table.
-- select * from ext_customer_details limit 20
-- select * from ext_transaction_details limit 20
-- select * from ext_store_details limit 20
select *
from ext_store_review
limit 20;
解决以下有问题的数据
5.1 Clean and Mask customer_details
%hive
-- Customer view with the sensitive columns masked.
-- last_name, email, address and credit_no are replaced by their SHA-256 hex digest:
-- the values stay comparable (equal inputs give equal digests, so distinct counts still
-- work) but the originals are not exposed. The previous unbase64() call decoded the text
-- as if it were Base64 and returned binary, which is not a masking function.
-- The lastname alias is kept as-is so existing readers of the view keep working.
-- drop view if exists vm_customer_details;
create view if not exists vm_customer_details as
select
    customer_id,
    first_name,
    sha2(last_name, 256) as lastname,
    sha2(email, 256)     as email,
    gender,
    sha2(address, 256)   as address,
    country,
    language,
    job,
    credit_type,
    sha2(credit_no, 256) as credit_no
from ext_customer_details;
5.2 Clean transaction_details into partition table
%hive
-- Cleaned transaction table, partitioned by purchase month.
create table if not exists transaction_details (
    transaction_id  string,
    customer_id     string,
    store_id        string,
    price           decimal(8,2),
    product         string,
    purchase_date   date,
    purchase_time   string
)
partitioned by (purchase_month string);
-- Diagnostics used to find the duplicated transaction ids:
-- select transaction_id,count(1) from ext_transaction_details group by transaction_id having count(1)>1
-- select * from ext_transaction_details where transaction_id=8001
-- Enable dynamic partitioning and allow all partition columns to be dynamic.
-- The comment stays on its own line: text after the value on a `set` line
-- can be taken as part of the value by some clients.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- Rewrite the raw transactions into the partitioned table and repair duplicated ids.
-- Within each transaction_id the first row keeps its id; every further row gets the
-- suffix _fix<k> (k = 1, 2, ...), so ids stay unique even when an id occurs more than
-- twice. The ordering lists several columns so the same row keeps the original id on
-- every run.
with base as (
    select
        transaction_id,
        customer_id,
        store_id,
        price,
        product,
        purchase_date,
        purchase_time,
        -- partition value, e.g. 2018-03
        from_unixtime(unix_timestamp(purchase_date, 'yyyy-MM-dd'), 'yyyy-MM') as purchase_month,
        row_number() over (
            partition by transaction_id
            order by store_id, customer_id, purchase_date, purchase_time, product, price
        ) as rn
    from ext_transaction_details
)
insert overwrite table transaction_details partition (purchase_month)
select
    if(rn = 1, transaction_id, concat(transaction_id, '_fix', cast(rn - 1 as string))),
    customer_id,
    store_id,
    price,
    product,
    purchase_date,
    purchase_time,
    purchase_month
from base;
-- Show the rows whose transaction id was rewritten
select *
from transaction_details
where transaction_id like '%fix%';
5.3 Clean store_review table
-- Cleaned review view: keeps only rows with a non-empty score.
-- store_id is left out; the notes above flag the review file's store_id as unreliable,
-- so readers should take the store from the transaction instead.
create view if not exists vw_store_review as
select
    transaction_id,
    review_score
from ext_store_review
where review_score <> '';
-- List the tables and views in the current database
show tables;
%hive
-- Number of distinct card numbers per card type, largest first
select
    credit_type,
    count(distinct credit_no) as credit_cnt
from vm_customer_details
group by credit_type
order by credit_cnt desc;
%hive
-- The five job titles with the most rows in the customer view
select
    job,
    count(*) as pn
from vm_customer_details
group by job
order by pn desc
limit 5;
%hive
-- The five most common card types among female customers in the United States
select
    credit_type,
    count(*) as ct
from vm_customer_details
where gender = 'Female'
  and country = 'United States'
group by credit_type
order by ct desc
limit 5;
%hive
-- Row count per country and gender
select
    country,
    gender,
    count(*) as cnt
from vm_customer_details
group by
    country,
    gender;
%hive
-- Total revenue per purchase month (the partition column)
select
    sum(price) as revenue_mon,
    purchase_month
from transaction_details
group by purchase_month;
%hive
-- Total revenue per calendar quarter.
-- year_quarter looks like 2018-1 (year, then quarter 1-4). The result is grouped on it;
-- without the grouping the query returned a single grand total.
with quarterly as (
    select
        price,
        concat(year(purchase_date), '-', ceil(month(purchase_date) / 3)) as year_quarter
    from transaction_details
)
select
    year_quarter,
    sum(price) as revenue_quarter
from quarterly
group by year_quarter;
-- Total revenue per calendar year
select
    year(purchase_date),
    sum(price)
from transaction_details
group by year(purchase_date);
%hive
-- Total revenue per working day.
-- dayofweek() numbers Sunday as 1, so 2-6 is Monday-Friday and work_date runs 1-5.
with dated as (
    select
        dayofweek(cast(purchase_date as string)) as dow,
        price
    from transaction_details
)
select
    dow - 1 as work_date,
    sum(price)
from dated
where dow between 2 and 6
group by dow;
-- Total revenue per part of the day.
-- Step 1 extracts the hour from purchase_time with a regular expression.
-- Step 2 converts it to a 24-hour value: with a PM marker 12 stays 12 and 1-11 become
-- 13-23; with an AM marker 12 becomes 0; without a marker the hour is used as-is.
-- (Previously 12 PM was turned into 0 and 12 AM stayed 12, which swapped noon and midnight.)
-- Step 3 assigns the bucket; hours outside 6-22 and unparsable times fall into 'night'.
with hours as (
    select
        price,
        upper(purchase_time) as time_text,
        cast(regexp_extract(purchase_time, '([0-9]{1,2}):([0-9]{2}\\w*)', 1) as int) as hour_raw
    from transaction_details
),
t1 as (
    select
        price,
        case
            when instr(time_text, 'PM') > 0 then hour_raw % 12 + 12
            when instr(time_text, 'AM') > 0 then hour_raw % 12
            else hour_raw
        end as timeTrans
    from hours
),
t2 as (
    select
        price,
        case
            when timeTrans > 5  and timeTrans <= 8  then 'early morning'
            when timeTrans > 8  and timeTrans <= 11 then 'morning'
            when timeTrans > 11 and timeTrans <= 13 then 'noon'
            when timeTrans > 13 and timeTrans <= 18 then 'afternoon'
            when timeTrans > 18 and timeTrans <= 22 then 'evening'
            else 'night'
        end as timeSplit
    from t1
)
select
    t2.timeSplit,
    sum(price)
from t2
group by t2.timeSplit;
%hive
-- Average transaction price per working day.
-- dayofweek() numbers Sunday as 1, so 2-6 is Monday-Friday and work_date runs 1-5.
with dated as (
    select
        dayofweek(cast(purchase_date as string)) as dow,
        price
    from transaction_details
)
select
    dow - 1 as work_date,
    avg(price)
from dated
where dow between 2 and 6
group by dow;
-- Transaction count per day
select
    purchase_date,
    count(*)
from transaction_details
group by purchase_date;
-- Transaction count per year
select
    year(purchase_date),
    count(*)
from transaction_details
group by year(purchase_date);
-- Transaction count per year and month, labelled as <year>-<month>
select
    concat(year(purchase_date), '-', month(purchase_date)),
    count(*)
from transaction_details
group by
    year(purchase_date),
    month(purchase_date);
-- All three levels at once: every transaction row carries the transaction count
-- of its year, of its year-month and of its day (one output row per transaction).
select
    purchase_date,
    count(*) over (partition by year(purchase_date)),
    count(*) over (partition by year(purchase_date), month(purchase_date)),
    count(*) over (partition by year(purchase_date), month(purchase_date), day(purchase_date))
from transaction_details;
-- The ten customers with the most transactions
select
    customer_id,
    count(*) as c
from transaction_details
group by customer_id
order by c desc
limit 10;
-- The ten customers with the highest total spend
select
    customer_id,
    sum(price) as s
from transaction_details
group by customer_id
order by s desc
limit 10;
-- The customer with the fewest transactions (one row; ties are broken arbitrarily)
select
    customer_id,
    count(*) as c
from transaction_details
group by customer_id
order by c asc
limit 1;
-- Distinct customers per calendar quarter
with by_quarter as (
    select
        year(purchase_date)            as y,
        ceil(month(purchase_date) / 3) as q,
        customer_id
    from transaction_details
)
select
    concat(y, '年', q, '季度'),
    count(distinct customer_id)
from by_quarter
group by
    y,
    q;
-- Distinct customers per week of the year.
-- NOTE(review): weekofyear() can assign the first or last days of a year to a week of
-- the neighbouring year, while year() does not follow; check the labels around New Year.
with by_week as (
    select
        year(purchase_date)       as y,
        weekofyear(purchase_date) as w,
        customer_id
    from transaction_details
)
select
    concat(y, '年第', w, '周'),
    count(distinct customer_id)
from by_week
group by
    y,
    w;
-- The customer with the highest average transaction price.
-- The previous form took max() of the per-customer averages but grouped by customer_id
-- again, so the max was a no-op and every customer was returned.
select
    customer_id,
    avg(price) as av
from transaction_details
group by customer_id
order by av desc
limit 1;
-- For every month, the customer with the highest total spend
-- (row_number keeps exactly one customer per month; ties are broken arbitrarily).
with monthly_spend as (
    select
        concat(year(purchase_date), '-', month(purchase_date)) as m,
        customer_id as id,
        sum(price)  as s
    from transaction_details
    group by
        year(purchase_date),
        month(purchase_date),
        customer_id
),
ranked as (
    select
        m,
        id,
        s,
        row_number() over (partition by m order by s desc) as win1
    from monthly_spend
)
select
    m,
    id,
    s
from ranked
where win1 = 1;
-- For every month, the customer with the most transactions
-- (row_number keeps exactly one customer per month; ties are broken arbitrarily).
with monthly_visits as (
    select
        concat(year(purchase_date), '-', month(purchase_date)) as m,
        customer_id as id,
        count(*)    as c
    from transaction_details
    group by
        year(purchase_date),
        month(purchase_date),
        customer_id
),
ranked as (
    select
        m,
        id,
        c,
        row_number() over (partition by m order by c desc) as win1
    from monthly_visits
)
select
    m,
    id,
    c
from ranked
where win1 = 1;
-- The five products with the highest total revenue
select
    product,
    sum(price) as s
from transaction_details
group by product
order by s desc
limit 5;
-- The five products bought most often
select
    product,
    count(*) as c
from transaction_details
group by product
order by c desc
limit 5;
-- Five products ranked by their number of distinct customers, smallest first.
-- NOTE(review): the sibling product rankings sort descending; confirm that the
-- ascending order (fewest customers first) is intended here.
select
    product,
    count(distinct customer_id) as c
from transaction_details
group by product
order by c
limit 5;
-- The store with the most transactions
select
    store_id,
    count(*) as c
from transaction_details
group by store_id
order by c desc
limit 1;
-- The store with the highest total revenue
select
    store_id,
    sum(price) as s
from transaction_details
group by store_id
order by s desc
limit 1;
-- The top store by transaction count, with total revenue as the tie-breaker
select
    store_id,
    count(*)   as c,
    sum(price) as s
from transaction_details
group by store_id
order by
    c desc,
    s desc
limit 1;
-- For every store, the product bought by the most distinct customers
-- (row_number keeps exactly one product per store; ties are broken arbitrarily).
with product_reach as (
    select
        store_id,
        product,
        count(distinct customer_id) as c
    from transaction_details
    group by
        store_id,
        product
),
ranked as (
    select
        store_id,
        product,
        c,
        row_number() over (partition by store_id order by c desc) as win1
    from product_reach
)
select
    store_id,
    product
from ranked
where win1 = 1;
-- Employees per 100 distinct customers for every store, formatted as "<n>:100"
with store_customers as (
    select
        store_id,
        count(distinct customer_id) as c
    from transaction_details
    group by store_id
)
select
    sc.store_id,
    concat_ws(':', cast(ceil(round(sd.employee_number / sc.c * 100)) as string), '100')
from store_customers sc
join ext_store_details sd
    on sc.store_id = sd.store_id;
-- Revenue per store, year and month
select
    store_id,
    year(purchase_date),
    month(purchase_date),
    sum(price)
from transaction_details
group by
    store_id,
    year(purchase_date),
    month(purchase_date);
-- Revenue per store and year
select
    store_id,
    year(purchase_date),
    sum(price)
from transaction_details
group by
    store_id,
    year(purchase_date);
-- Yearly and monthly revenue of every store in one result.
-- Both windows are partitioned by store_id as well; without it the sums covered all
-- stores together and were merely repeated next to each store_id.
-- The outer distinct collapses the per-transaction rows to one row per store and month.
select distinct *
from (
    select
        store_id,
        year(purchase_date)  as revenue_year,
        sum(price) over (partition by store_id, year(purchase_date)) as year_revenue,
        month(purchase_date) as revenue_month,
        sum(price) over (partition by store_id, year(purchase_date), month(purchase_date)) as month_revenue
    from transaction_details
) a;
-- Total revenue per store
select
    store_id,
    sum(price)
from transaction_details
group by store_id;
-- For every store, the part of the day with the most transactions.
-- Hour conversion: with a PM marker 12 stays 12 and 1-11 become 13-23; with an AM
-- marker 12 becomes 0; without a marker the hour is used as-is. (Previously 12 PM was
-- turned into 0 and 12 AM stayed 12, which swapped noon and midnight.)
-- Ranking: periods are ordered by their transaction count; the previous ordering by
-- the period label simply picked the alphabetically last label.
with hours as (
    select
        store_id,
        upper(purchase_time) as time_text,
        cast(regexp_extract(purchase_time, '([0-9]{1,2}):([0-9]{2}\\w*)', 1) as int) as hour_raw
    from transaction_details
),
t1 as (
    select
        store_id,
        case
            when instr(time_text, 'PM') > 0 then hour_raw % 12 + 12
            when instr(time_text, 'AM') > 0 then hour_raw % 12
            else hour_raw
        end as timeTrans
    from hours
),
t2 as (
    -- hours outside 6-22 and unparsable times fall into 'night'
    select
        store_id,
        case
            when timeTrans > 5  and timeTrans <= 8  then 'early morning'
            when timeTrans > 8  and timeTrans <= 11 then 'morning'
            when timeTrans > 11 and timeTrans <= 13 then 'noon'
            when timeTrans > 13 and timeTrans <= 18 then 'afternoon'
            when timeTrans > 18 and timeTrans <= 22 then 'evening'
            else 'night'
        end as timeSplit
    from t1
),
t3 as (
    select
        store_id,
        timeSplit,
        count(*) as c
    from t2
    group by
        store_id,
        timeSplit
),
t4 as (
    select
        store_id,
        timeSplit,
        row_number() over (partition by store_id order by c desc) as win1
    from t3
)
select
    t4.store_id,
    t4.timeSplit
from t4
where t4.win1 = 1;
-- Customers with more than 6 purchases at the same store
with a as (
    select
        store_id,
        customer_id,
        count(*) as c
    from transaction_details
    group by
        store_id,
        customer_id
)
select a.*
from a
where a.c > 6;
-- The store with the highest revenue per employee
with store_revenue as (
    select
        store_id,
        sum(price) as s
    from transaction_details
    group by store_id
)
select
    sr.store_id,
    sr.s / sd.employee_number as ss
from store_revenue sr
join ext_store_details sd
    on sd.store_id = sr.store_id
order by ss desc
limit 1;
-- Transactions that have more than one review in the cleaned view
select transaction_id
from vw_store_review
group by transaction_id
having count(*) > 1;
-- Number of distinct customers who left a review, per store.
-- The store and customer are taken from the transaction, not from the review row.
with reviewed as (
    select
        t.store_id,
        r.transaction_id,
        t.customer_id
    from vw_store_review r
    join transaction_details t
        on r.transaction_id = t.transaction_id
)
select
    store_id,
    count(distinct customer_id)
from reviewed
group by store_id;
-- Number of distinct customers who gave each score, per store.
-- The store and customer are taken from the transaction, not from the review row.
with scored as (
    select
        t.store_id,
        r.review_score,
        t.customer_id
    from vw_store_review r
    join transaction_details t
        on r.transaction_id = t.transaction_id
)
select
    store_id,
    review_score,
    count(distinct customer_id)
from scored
group by
    store_id,
    review_score;
-- Number of orders of every customer at every store
select
    store_id,
    customer_id,
    count(*)
from transaction_details
group by
    store_id,
    customer_id;
-- Best store by reviews: keep only the highest score each customer gave to each store,
-- then count per store how many customers reached 5; the store with the most wins.
-- Reads the cleaned view vw_store_review (no empty scores) and takes the store from the
-- transaction, because the review file's own store_id is flagged as unreliable in the
-- notes above. The score is cast to int so max() compares numbers rather than text.
with best_score as (
    select
        t.store_id,
        t.customer_id,
        max(cast(r.review_score as int)) as m
    from vw_store_review r
    join transaction_details t
        on r.transaction_id = t.transaction_id
    group by
        t.store_id,
        t.customer_id
)
select
    store_id,
    count(*) as c
from best_score
where m = 5
group by store_id
order by c desc
limit 1;