(1).用户资料表(mp_user)
字段 字段描述 字段类型
user_id 用户ID String
user_name 用户姓名 String
user_status 用户状态 int
area_id 地区ID int
device_number 手机号码 String
cust_id 客户ID String
cust_sex 客户性别 String
cust_addr 客户地址 String
(2).地区ID对应的地区名称表(mp_area)
字段 字段描述 字段类型
area_id 地区ID int
area_name 地区名称 String
(3)流量详单表(mp_gprs)
字段 字段描述 字段类型
device_number 手机号码 String
lac 位置区编码 String
ci CI号码 String
imei 终端类型 String
service_type 流量类型 String
start_time 开始时间 String
end_time 结束时间 String
duration 时长(秒) Int
send_bytes 上行流量(MB) decimal(20,3)
recv_bytes 下行流量(MB) decimal(20,3)
mp_use.txt 用户资料表
883090|钱七|1|4|13074737671|37867|男|威海
mp_area.txt 地区名称表
0|北京
mp_gprs.txt 流量详单表
13027810045|59146|10742|013717002416820|912|2017-02-12 20:40:33|2017-02-12 23:12:20|1| 74|287
create external table mp_user(
user_id string,
user_name string,
user_status int,
area_id int,
device_number string,
cust_id string,
cust_sex string,
cust_addr string)
row format delimited fields terminated by '|';
create external table mp_area(area_id int,area_name string)
row format delimited fields terminated by '|';
create external table mp_gprs(
device_number string,
lac string,
ci string,
imei string,
service_type string,
start_time string,
end_time string,
duration int,
send_bytes decimal(20,3),
recv_bytes decimal(20,3)
)row format delimited fields terminated by '|';
加载数据
load data local inpath '/usr/local/data/mp_use.txt' into table mp_user;
load data local inpath '/usr/local/data/mp_area.txt' into table mp_area;
load data local inpath '/usr/local/data/mp_gprs.txt' into table mp_gprs;
输出的字段 包含
通话时长计算方式 结束时间-开始时间
字段 字段描述 字段类型
user_name 用户姓名 String
user_status 用户状态 int
cust_sex 客户性别 String
area_name 地区名称 String
device_number 手机号码 String
ci CI号码 String
total_time 通话时长(秒) Int
解答
关于这里的解答,mp_gprs中的字段含义,开始时间和结束时间,时长,我不确定应该用哪个,如果说是时长duration字段,每次通话时长就一两秒就很奇怪。但如果是用开始时间和结束时间来算通话时长,有感觉duration这个字段没什么用了。所以我干脆两种方式的解答都写了,看到这篇博客的朋友可以根据自己的理解来做,我的答案仅作参考
//首先连接user表和area表,并且提取相应字段,结果放入t1表中
select user_name,user_status,cust_sex,area_name,device_number
from mp_user join mp_area on mp_user.area_id=mp_area.area_id;
with t1 as (select user_name,user_status,cust_sex,area_name,device_number
from mp_user join mp_area on mp_user.area_id=mp_area.area_id)
//然后改变一下gprs表,将里面的starttime和endtime改为unix时间戳格式,结果放入t2
select device_number,ci,unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss') as s_time, unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') as e_time
from mp_gprs;
with t2 as (select device_number,ci,unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss') as s_time, unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') as e_time from mp_gprs)
//两表连接求最终结果
select t1.*,ci,(e_time-s_time) as total_time from t1 join t2 on t1.device_number=t2.device_number;
//汇总
with t1 as (select user_name,user_status,cust_sex,area_name,device_number from mp_user join mp_area on mp_user.area_id=mp_area.area_id),
t2 as (select device_number,ci,unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss') as s_time, unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') as e_time from mp_gprs)
select t1.*,ci,(e_time-s_time) as total_time from t1 join t2 on t1.device_number=t2.device_number;
像上面这样求出来的数据,会有大量的重复行,因此还需要一步合并
//合并
with t1 as (select user_name,user_status,cust_sex,area_name,device_number from mp_user join mp_area on mp_user.area_id=mp_area.area_id),
t2 as (select device_number,ci,unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss') as s_time, unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') as e_time from mp_gprs)
select * from (
select t1.*,ci,(e_time-s_time) as total_time from t1 join t2 on t1.device_number=t2.device_number) as t
group by user_name,user_status,cust_sex,area_name,device_number,ci,total_time;
//排序
with t1 as (select user_name,user_status,cust_sex,area_name,device_number from mp_user join mp_area on mp_user.area_id=mp_area.area_id),
t2 as (select device_number,ci,unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss') as s_time, unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') as e_time from mp_gprs)
select * from (
select t1.*,ci,(e_time-s_time) as total_time from t1 join t2 on t1.device_number=t2.device_number) as t
group by user_name,user_status,cust_sex,area_name,device_number,ci,total_time
order by total_time desc;
//首先连接user表和area表,并且提取相应字段,结果放入t1表中
select user_name,user_status,cust_sex,area_name,device_number
from mp_user join mp_area on mp_user.area_id=mp_area.area_id;
with t1 as (select user_name,user_status,cust_sex,area_name,device_number
from mp_user join mp_area on mp_user.area_id=mp_area.area_id)
//对mp_gprs表求出每个手机号码的时长总和,结果放入t2表中
select device_number,ci,sum(duration) as sum from mp_gprs group by device_number,ci;
with t2 as (select device_number,ci,sum(duration) as sum from mp_gprs group by device_number,ci)
//连接t1表和t2表,获取相应的字段信息
select t1.*,ci,sum from t1 join mp_gprs on t1.device_number=mp_gprs.device_number
//汇总
with t1 as (select user_name,user_status,cust_sex,area_name,device_number from mp_user join mp_area on mp_user.area_id=mp_area.area_id),
t2 as (select device_number,ci,sum(duration) as sum from mp_gprs group by device_number,ci)
select t1.*,ci,sum from t1 join t2 on t1.device_number=t2.device_number
order by sum desc;
输出的字段 包含 用户姓名,时间(天),总流量
//首先获取每个手机号码的日期和每天总流量
select *
from (select device_number,substring(start_time,0,10) as sub,(send_bytes+recv_bytes) as sum from mp_gprs) as t
group by device_number,sub,sum;
//考虑到上行流量与下行流量可能为null的情况,替换为0
select *
from (select device_number,substring(start_time,0,10) as use_day,(if(send_bytes is null,0,send_bytes)+if(recv_bytes is null,0,recv_bytes)) as total_bytes from mp_gprs) as t
group by device_number,use_day,total_bytes;
//然后连接user表,获取用户姓名,顺便加上order by排序子句,然后用limit求取前五名
with t1 as (select device_number,use_day,total_bytes from (select device_number,substring(start_time,0,10) as use_day,(if(send_bytes is null,0,send_bytes)+if(recv_bytes is null,0,recv_bytes)) as total_bytes from mp_gprs) as t group by device_number,use_day,total_bytes)
select user_name,use_day,total_bytes
from t1
join mp_user
on t1.device_number=mp_user.device_number
order by total_bytes
desc limit 5;