本文最早发表在csdn时间为:2021-12-08
本案例数据链接(数据是本人业余时间模拟数据,需要的自行下载):
数据点我下载
数据来源为工作中接触到的某公司后台数据,在完成工作相关分析后,本人对该部分数据虚拟重建用以复盘整理
大家好,我是Captain,一个从业两年的数据分析师。
公主号:Captain_Data
关注:船长数据分析
![在这里插入图片描述](https://img-blog.csdnimg.cn/17c81933a5de42019813b0218c189075.jpeg
学习是为了不落后,
整理则是为了不忘记。
通过本文您将学习到:
本项目主要复盘下面拆解图中,消费主力人群特征和主要商品贡献等~
某奢侈品电商平台为了合理的投⼊⼈⼒物⼒创造更⼤的销售利润,现需要对已有的销售数据进⾏⽤户分析,提出合理的促销计划。围绕产品和⽤户两⼤⽅⾯展开为电商平台制定策略提供分析及建议。
⽤户分析:
产品分析:
每条记录为⼀个订单
字段名称 | 字段描述 |
---|---|
User_ID | 顾客ID |
Product_ID | 商品ID |
Gender | 顾客性别 |
Age | 顾客年龄 |
Occupation | 顾客从事职业ID |
City_Category | 城市类别 |
Stay_In_Current_City_Years | 在现城市呆的年数 |
Marital_Status | 婚姻状况 |
Product_Category_1 | 商品类别1 |
Product_Category_2 | 商品类别2 |
Product_Category_3 | 商品类别3 |
Purchase | 消费⾦额 |
本数据在本地搭建的Hive 环境下,所以:
第一步:打开虚拟机,打开xshell 链接服务~
下面是本机hive 启动步骤:
启动hive步骤
#切换到hadoop⽤户
[root@node100 ~]$ su - hadoop
#启动集群
[hadoop@node100 hadoop]$ start-all.sh
#进⼊到hive安装⽬录
[hadoop@node100 hadoop]$ cd /opt/module/apache-hive-3.1.1-bin
#启动hive
[hadoop@node100 apache-hive-3.1.1-bin]$ hive
hive(default)>
start-all.sh启动一下相关服务
cd / 跳转根目录 ls 看一下文件夹
由于 hive安装⽬录在‘/opt/module/apache-hive-3.1.1-bin’
下面直接切换 ,并且启动hive
进入hive,
# 看一下数据库
SHOW DATABASES;
下载数据:
本案例数据链接(数据是本人业余时间模拟数据,,需要的自行下载):
数据点我下载
打开xftp
链接后,找到本地数据文件位置,上传到服务器指定数据文件夹:
双击文件,上传
数据准备完毕 数据文件位置为:‘/home/hadoop/data/captain_data_shop_model.txt’
至此,数据准备完毕~
先指定数据库
#
use captain;
再建表
#建表
create table captain_data_shop_model
(User_ID int,
Product_ID string,
Gender string,
Age string,
Occupation int,
City_Category string,
Stay_In_Current_City_Years string,
Marital_Status int,
Product_Category_1 int,
Product_Category_2 int,
Product_Category_3 int,
Purchase double
)
row format delimited fields terminated by '|'
tblproperties(
"skip.header.line.count"="1" -- 跳过首行
);
装载数据:注意数据源在服务器本地,加‘local’
--装载数据
load data local inpath '/home/hadoop/data/captain_data_shop_model.txt' overwrite into table captain_data_shop_model;
select * from captain_data_shop_model limit 5;
5.0. 数据量:
select count(1) from captain_data_shop_model;
数据量:53万条
5.1.订单整体的消费情况(包括:总销售额、⼈均消费、平均每单消费)
select
sum(purchase) total_sales,
sum(purchase)/count(distinct user_id) avg_user_pay,
sum(purchase)/count(purchase) avg_price
from captain_data_shop_model;
tableau 注意一下,人均消费注意使用:详细级别表达式 fixed
运行结果:
总销售额:50 亿
人均消费:85万(不愧是奢侈品电商平台)
平均订单金额:9333(平均一部苹果手机价格)
5.2. ⽤户侧分析(找到⾼质量⼈群)
5.2.1 各性别消费情况
select
*,
CAST(User_num/(sum(User_num) over () ) as DECIMAL (8,2)) as User_num_percent,
cast( sum_pay/(sum(sum_pay) over ()) as DECIMAL(8,2) ) as pay_percent
from (
select
Gender ,
count(distinct User_ID) User_num ,
CAST(sum(Purchase)/count(distinct User_ID) as DECIMAL(8,2)) avg_user,
sum(Purchase) sum_pay
from captain_data_shop_model group by Gender
) as p ORDER BY pay_percent desc;
5.2.2 各年龄消费情况
with p as
(
SELECT
Age ,
count(DISTINCT User_ID) user_num,
sum(Purchase) sum_pay,
cast(sum(Purchase)/count(DISTINCT User_ID) as decimal(10,2)) avg_pay
FROM captain_data_shop_model GROUP BY Age
)
SELECT p.*,
cast(user_num/sum(user_num) over() as DECIMAL(10,2)) user_num_percent,
cast(sum_pay/sum(sum_pay) over() as DECIMAL(10,2)) pay_percent
from p
ORDER BY
pay_percent
desc;
5.2.3 各职业消费情况
with p as
(SELECT
Occupation job,
count(DISTINCT User_ID) user_num,
sum(Purchase) sum_pay,
cast(sum(Purchase)/count(DISTINCT User_ID) as decimal(10,2)) avg_pay
FROM
captain_data_shop_model
GROUP BY
Occupation)
SELECt
p.*,
cast((user_num/sum(user_num) over() )*100 as DECIMAL(10,2))user_num_precent,
cast((sum_pay/sum(sum_pay) over())*100 as DECIMAL(10,2)) sum_pay_precent
FROM p
ORDER BY
sum_pay_precent desc
;
5.2.4 各婚姻状况消费情况
WITH
p
as
(
SELECT
Marital_Status Marital,
count(DISTINCT User_ID) user_num,
sum( Purchase) sum_pay,
cast(sum( Purchase)/count(DISTINCT User_ID) as DECIMAL(10,2)) avg_pay
FROM
captain_data_shop_model
GROUP BY
Marital_Status
)
SELECT
p.*,
cast((user_num/sum(user_num) over() )*100 as DECIMAL(10,2)) user_num_precent,
cast((sum_pay/sum(sum_pay) over() )*100 as DECIMAL(10,2)) sum_pay_precent
FROM p
ORDER BY sum_pay_precent desc;
可以看到,奢侈品顶部用户特征为:性别为M,年龄在26-35, 职业为:4和0的,婚姻状况为0的用户(王思聪很符合啊~)
5.3.1 订单量TOP10的产品
with
p as
(
SELECT
Product_ID,
count(Product_ID) sale_num
FROM
captain_data_shop_model
GROUP BY
Product_ID
)
SELECT * FROM
(
SELECT
RANK() over(ORDER BY sale_num desc) sale_num_rank,
p.*,
cast((sale_num/sum(sale_num) over() )*100 as decimal(10,2)) sale_num_percent
FROM p
) T_1
WHERE sale_num_rank<=10;
5.3.2销售额TOP10
WITH p as
(
SELECT
product_ID,
sum( Purchase) sum_pay
FROM
captain_data_shop_model
GROUP BY
Product_ID
)
SELECT * FROM
(
SELECT
rank() over(ORDER BY sum_pay desc) sum_pay_rank,
p.* ,
cast((sum_pay/sum(sum_pay) over() )*100 as DECIMAL(10,2) ) sum_pay_precent
FROM
p
) t_1
WHERE
sum_pay_rank<=10
order by sum_pay_rank;
5.3.3 各一级产品类目的订单量、销售额、订单量占⽐、销售额占⽐、累计销售额占⽐
with p as
(
SELECT
Product_Category_1,
count(product_ID) sale_num,
sum( Purchase) pay_sum
FROM
captain_data_shop_model
GROUP BY
Product_Category_1
)
SELECT
p.*,
cast((sale_num/sum(sale_num) over() )*100 as DECIMAL(10,2)) sale_num_precent,
cast((pay_sum/sum(pay_sum) over() )*100 as DECIMAL(10,2)) pay_sum_precent,
cast( ((sum(pay_sum) over(ORDER BY pay_sum desc))/( sum(pay_sum) over() ))*100 as DECIMAL(10,2)) add_pay_sum_precent
FROM p
ORDER BY pay_sum_precent desc;
with p as
(
SELECT
Gender,
Product_ID ,
Product_Category_1,
Product_Category_2,
Product_Category_3,
count(Product_ID) sale_num,
sum(Purchase) sum_pay
FROM
captain_data_shop_model
GROUP BY
Gender,
Product_ID ,
Product_Category_1,
Product_Category_2,
Product_Category_3
)
SELECT * FROM
(
SELECT
p.*,
rank() over(PARTITION by Gender ORDER BY sum_pay desc) rank_sum_pay,
cast( (sum_pay/sum(sum_pay) over())*100 as DECIMAL(10,2)) sum_pay_precent
FROM
p
)
T_1
WHERE
rank_sum_pay<=10
order by Gender,rank_sum_pay
;
结果展示:
5.3.5 ⾼质量⽤户群年龄段的订单量TOP10产品
with p as
(
SELECT
Product_ID,
count(Product_ID) sale_num,
sum(Purchase) sum_pay,
Product_Category_1,
Product_Category_2,
Product_Category_3
FROM
captain_data_shop_model
WHERE
Age='26-35'
GROUP BY
Product_ID,
Product_Category_1,
Product_Category_2,
Product_Category_3
)
SELECT * FROM
(
SELECT
p.*,
cast( (sale_num/sum(sale_num) over() ) *100 as DECIMAL(10,2)) sale_num_precent,
cast( (sum_pay/sum(sum_pay) over() ) *100 as DECIMAL(10,2)) sum_pay_precent,
rank() over( order by sale_num desc) rank_sale_num
FROM
p
) t_1
WHERE rank_sale_num<=10;
好了,至此,对该奢侈品平台数据的用户群体特征分析与商品贡献分析基本完成,觉得不错,欢迎转发评论点赞收藏~