找出所有科目成绩都大于某一学科平均成绩的学生
数据集如下:
uid subject_id score
1001 01 90
1001 02 90
1001 03 90
1002 01 85
1002 02 85
1002 03 70
1003 01 70
1003 02 70
1003 03 85
create table score(
uid string,
subject_id string,
score int)
row format delimited fields terminated by '\t';
select
uid
from(
select
uid,
if(score>avg_score,0,1) flag
from(
select
uid,
score,
avg(score) over(partition by subject_id) avg_score
from score
) t1
) t2
group by uid
having sum(flag)=0;
我们有如下的用户访问数据
userId visitDate visitCount
u01 2017/1/21 5
u02 2017/1/23 6
u03 2017/1/22 8
u04 2017/1/20 3
u01 2017/1/23 6
u01 2017/2/21 8
U02 2017/1/23 6
U01 2017/2/22 4
要求使用SQL统计出每个用户的累积访问次数,如下表所示:
用户id 月份 小计 累积
u01 2017-01 11 11
u01 2017-02 12 23
u02 2017-01 12 12
u03 2017-01 8 8
u04 2017-01 3 3
CREATE TABLE action(
userId string,
visitDate string,
visitCount int)
ROW format delimited FIELDS TERMINATED BY "\t";
INSERT INTO TABLE action VALUES
('u01', '2017/1/21', 5),
('u02', '2017/1/23', 6),
('u03', '2017/1/22', 8),
('u04', '2017/1/20', 3),
('u01', '2017/1/23', 6),
('u01', '2017/2/21', 8),
('u02', '2017/1/23', 6),
('u01', '2017/2/22', 4);
SELECT
t2.userid,
t2.visitmonth,
subtotal_visit_cnt,
sum(subtotal_visit_cnt) over (partition BY userid ORDER BY visitmonth) as total_visit_cnt
FROM(
SELECT
userid,
visitmonth,
sum(visitcount) as subtotal_visit_cnt
FROM(
SELECT
userid,
date_format(regexp_replace(visitdate,'/','-'),'yyyy-MM') AS visitmonth,
visitcount
FROM action
) t1
GROUP BY userid,visitmonth
) t2
ORDER BY t2.userid,t2.visitmonth;
有50W个京东店铺,每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为visit,访客的用户id为user_id,被访问的店铺名称为shop,请统计:
CREATE TABLE visit(
user_id string,
shop string)
ROW format delimited FIELDS TERMINATED BY '\t';
INSERT INTO TABLE visit VALUES
('u1', 'a'),
('u2', 'b'),
('u1', 'b'),
('u1', 'a'),
('u3', 'c'),
('u4', 'b'),
('u1', 'a'),
('u2', 'c'),
('u5', 'b'),
('u4', 'b'),
('u6', 'c'),
('u2', 'c'),
('u1', 'b'),
('u2', 'a'),
('u2', 'a'),
('u3', 'a'),
('u5', 'a'),
('u5', 'a'),
('u5', 'a');
1、每个店铺的UV(访客数)
方式一:
SELECT
shop,
count(DISTINCT user_id)
FROM visit
GROUP BY shop;
方式二:
SELECT
t.shop,
count(*)
FROM(
SELECT
user_id,
shop
FROM visit
GROUP BY user_id,shop
) t
GROUP BY t.shop;
2、每个店铺访问次数top3的访客信息
SELECT
t2.shop,
t2.user_id,
t2.cnt
FROM(
SELECT
t1.*,
row_number() over(partition BY t1.shop ORDER BY t1.cnt DESC) rank
FROM(
SELECT
user_id,
shop,
count(*) AS cnt
FROM visit
GROUP BY user_id,shop
) t1
) t2
WHERE rank <= 3;
已知一个表STG.ORDER,有如下字段::Date,Order_id,User_id,amount
数据样例:2017-01-01,10029028,1000003251,33.57
请给出sql进行统计:
CREATE TABLE order_tab(
dt string,
order_id string,
user_id string,
amount DECIMAL(10, 2))
ROW format delimited FIELDS TERMINATED BY '\t';
INSERT INTO TABLE order_tab VALUES ('2017-01-01','10029028','1000003251',33.57);
INSERT INTO TABLE order_tab VALUES ('2017-01-01','10029029','1000003251',33.57);
INSERT INTO TABLE order_tab VALUES ('2017-01-01','100290288','1000003252',33.57);
INSERT INTO TABLE order_tab VALUES ('2017-02-02','10029088','1000003251',33.57);
INSERT INTO TABLE order_tab VALUES ('2017-02-02','100290281','1000003251',33.57);
INSERT INTO TABLE order_tab VALUES ('2017-02-02','100290282','1000003253',33.57);
INSERT INTO TABLE order_tab VALUES ('2017-11-02','10290282','100003253',234);
INSERT INTO TABLE order_tab VALUES ('2018-11-02','10290284','100003243',234);
1、给出 2017年每个月的订单数、用户数、总成交金额
SELECT
t1.mon,
count(t1.order_id) AS order_cnt,
count(DISTINCT t1.user_id) AS user_cnt,
sum(amount) AS total_amount
FROM(
SELECT
order_id,
user_id,
amount,
date_format(dt,'yyyy-MM') mon
FROM order_tab
WHERE date_format(dt,'yyyy') = '2017'
) t1
GROUP BY t1.mon;
2、给出2017年11月的新客数(指在11月才有第一笔订单)
SELECT count(user_id)
FROM order_tab
GROUP BY user_id
HAVING date_format(min(dt),'yyyy-MM')='2017-11';
有日志如下,请写出代码求得所有用户和活跃用户的总数及平均年龄(活跃用户指连续两天都有访问记录的用户)
日期 用户 年龄
2019-02-11 test_1 23
2019-02-11 test_2 19
2019-02-11 test_3 39
2019-02-11 test_1 23
2019-02-11 test_3 39
2019-02-11 test_1 23
2019-02-12 test_2 19
2019-02-13 test_1 23
2019-02-15 test_2 19
2019-02-16 test_2 19
create table user_age(
dt string,
user_id string,
age int)
row format delimited fields terminated by ',';
INSERT INTO TABLE user_age VALUES ('2019-02-11','test_1',23);
INSERT INTO TABLE user_age VALUES ('2019-02-11','test_2',19);
INSERT INTO TABLE user_age VALUES ('2019-02-11','test_3',39);
INSERT INTO TABLE user_age VALUES ('2019-02-11','test_1',23);
INSERT INTO TABLE user_age VALUES ('2019-02-11','test_3',39);
INSERT INTO TABLE user_age VALUES ('2019-02-11','test_1',23);
INSERT INTO TABLE user_age VALUES ('2019-02-12','test_2',19);
INSERT INTO TABLE user_age VALUES ('2019-02-13','test_1',23);
INSERT INTO TABLE user_age VALUES ('2019-02-15','test_2',19);
INSERT INTO TABLE user_age VALUES ('2019-02-16','test_2',19);
SELECT
sum(total_user_cnt) total_user_cnt,
sum(total_user_avg_age) total_user_avg_age,
sum(two_days_cnt) two_days_cnt,
sum(avg_age) avg_age
FROM(
SELECT
0 total_user_cnt,
0 total_user_avg_age,
count(*) AS two_days_cnt,
cast(sum(age) / count(*) AS decimal(5,2)) AS avg_age
FROM(
SELECT
user_id,
max(age) age
FROM(
SELECT
user_id,
max(age) age
FROM(
SELECT
user_id,
age,
date_sub(dt,rank) flag
FROM(
SELECT
dt,
user_id,
max(age) age,
row_number() over(PARTITION BY user_id ORDER BY dt) rank
FROM user_age
GROUP BY dt,user_id
) t1
) t2
GROUP BY user_id,flag
HAVING count(*) >= 2
) t3
GROUP BY user_id
) t4
UNION ALL
SELECT
count(*) total_user_cnt,
cast(sum(age) /count(*) AS decimal(5,2)) total_user_avg_age,
0 two_days_cnt,
0 avg_age
FROM(
SELECT
user_id,
max(age) age
FROM user_age
GROUP BY user_id
) t5
) t6;
请用sql写出所有用户中在今年10月份第一次购买商品的金额
表ordertable字段(购买用户:userid,金额:money,购买时间:paymenttime(格式:2017-10-01),订单id:orderid)
CREATE TABLE ordertable(
userid string,
money decimal(10,2),
paymenttime string,
orderid string);
INSERT INTO TABLE ordertable VALUES('001',100,'2017-10-01','123');
INSERT INTO TABLE ordertable VALUES('001',200,'2017-10-02','124');
INSERT INTO TABLE ordertable VALUES('002',500,'2017-10-01','125');
INSERT INTO TABLE ordertable VALUES('001',100,'2017-11-01','126');
SELECT
userid,
paymenttime,
money,
orderid
from(
SELECT
userid,
money,
paymenttime,
orderid,
row_number() over (PARTITION BY userid ORDER BY paymenttime) rank
FROM ordertable
WHERE date_format(paymenttime,'yyyy-MM') = '2017-10'
) t
WHERE rank = 1;
有一个线上服务器访问日志格式如下(用sql答题)
时间 接口 ip地址
2016-11-09 11:22:05 /api/user/login 110.23.5.33
2016-11-09 11:23:10 /api/user/detail 57.3.2.16
2016-11-09 23:59:40 /api/user/login 200.6.5.166
......
求11月9号下午14点(14-15点),访问/api/user/login接口的top10的ip地址
CREATE TABLE ip(
time string,
interface string,
ip string);
INSERT INTO TABLE ip VALUES('2016-11-09 11:22:05','/api/user/login','110.23.5.23');
INSERT INTO TABLE ip VALUES('2016-11-09 11:23:10','/api/user/detail','57.3.2.16');
INSERT INTO TABLE ip VALUES('2016-11-09 23:59:40','/api/user/login','200.6.5.166');
INSERT INTO TABLE ip VALUES('2016-11-09 11:14:23','/api/user/login','136.79.47.70');
INSERT INTO TABLE ip VALUES('2016-11-09 11:15:23','/api/user/detail','94.144.143.141');
INSERT INTO TABLE ip VALUES('2016-11-09 11:16:23','/api/user/login','197.161.8.206');
INSERT INTO TABLE ip VALUES('2016-11-09 12:14:23','/api/user/detail','240.227.107.145');
INSERT INTO TABLE ip VALUES('2016-11-09 13:14:23','/api/user/login','79.130.122.205');
INSERT INTO TABLE ip VALUES('2016-11-09 14:14:23','/api/user/detail','65.228.251.189');
INSERT INTO TABLE ip VALUES('2016-11-09 14:15:23','/api/user/detail','245.23.122.44');
INSERT INTO TABLE ip VALUES('2016-11-09 14:17:23','/api/user/detail','22.74.142.137');
INSERT INTO TABLE ip VALUES('2016-11-09 14:19:23','/api/user/detail','54.93.212.87');
INSERT INTO TABLE ip VALUES('2016-11-09 14:20:23','/api/user/detail','218.15.167.248');
INSERT INTO TABLE ip VALUES('2016-11-09 14:24:23','/api/user/detail','20.117.19.75');
INSERT INTO TABLE ip VALUES('2016-11-09 15:14:23','/api/user/login','183.162.66.97');
INSERT INTO TABLE ip VALUES('2016-11-09 16:14:23','/api/user/login','108.181.245.147');
INSERT INTO TABLE ip VALUES('2016-11-09 14:17:23','/api/user/login','22.74.142.137');
INSERT INTO TABLE ip VALUES('2016-11-09 14:19:23','/api/user/login','22.74.142.137');
SELECT
ip,
count(*) as cnt
FROM ip
WHERE date_format(time,'yyyy-MM-dd HH') >= '2016-11-09 14'
AND date_format(time,'yyyy-MM-dd HH') < '2016-11-09 15'
AND interface='/api/user/login'
GROUP BY ip
ORDER BY cnt desc
LIMIT 10;
有一个账号表如下,请写出SQL语句,查询各自区组的money排名前十的账号(分组取前10)
`dist_id` string '区组id',
`account` string '账号',
`gold` int '金币'
CREATE TABLE account(
dist_id int(11) DEFAULT NULL COMMENT '区组id',
account varchar(100) DEFAULT NULL COMMENT '账号',
gold int(11) DEFAULT 0 COMMENT '金币');
INSERT INTO TABLE account VALUES(1,'77',18);
INSERT INTO TABLE account VALUES(1,'88',106);
INSERT INTO TABLE account VALUES(1,'99',10);
INSERT INTO TABLE account VALUES(1,'12',13);
INSERT INTO TABLE account VALUES(1,'13',14);
INSERT INTO TABLE account VALUES(1,'14',25);
INSERT INTO TABLE account VALUES(1,'15',36);
INSERT INTO TABLE account VALUES(1,'16',12);
INSERT INTO TABLE account VALUES(1,'17',158);
INSERT INTO TABLE account VALUES(2,'18',12);
INSERT INTO TABLE account VALUES(2,'19',44);
INSERT INTO TABLE account VALUES(2,'10',66);
INSERT INTO TABLE account VALUES(2,'45',80);
INSERT INTO TABLE account VALUES(2,'78',98);
SELECT
dist_id,
account,
gold
FROM(
SELECT
dist_id,
account,
gold,
row_number () over (PARTITION BY dist_id ORDER BY gold DESC) rank
FROM test_sql.test10
) t
WHERE rank <= 10;
有一个充值日志表credit_log,字段如下:
`dist_id` int '区组id',
`account` string '账号',
`money` int '充值金额',
`create_time` string '订单时间'
请写出SQL语句,查询充值日志表2019年01月02号每个区组下充值额最大的账号
要求结果:区组id,账号,金额,充值时间
CREATE TABLE credit_log(
dist_id string COMMENT '区组id',
account string COMMENT '账号',
money decimal(10,2) COMMENT '充值金额',
create_time string COMMENT '订单时间');
INSERT INTO TABLE credit_log VALUES ('1','11',100006,'2019-01-02 13:00:01');
INSERT INTO TABLE credit_log VALUES ('1','22',110000,'2019-01-02 13:00:02');
INSERT INTO TABLE credit_log VALUES ('1','33',102000,'2019-01-02 13:00:03');
INSERT INTO TABLE credit_log VALUES ('1','44',100300,'2019-01-02 13:00:04');
INSERT INTO TABLE credit_log VALUES ('1','55',100040,'2019-01-02 13:00:05');
INSERT INTO TABLE credit_log VALUES ('1','66',100005,'2019-01-02 13:00:06');
INSERT INTO TABLE credit_log VALUES ('1','77',180000,'2019-01-02 13:00:07');
INSERT INTO TABLE credit_log VALUES ('1','88',106000,'2019-01-02 13:00:08');
INSERT INTO TABLE credit_log VALUES ('1','99',100400,'2019-01-02 13:00:09');
INSERT INTO TABLE credit_log VALUES ('1','12',100030,'2019-01-02 13:00:10');
INSERT INTO TABLE credit_log VALUES ('1','13',100003,'2019-01-02 13:00:20');
INSERT INTO TABLE credit_log VALUES ('1','14',100020,'2019-01-02 13:00:30');
INSERT INTO TABLE credit_log VALUES ('1','15',100500,'2019-01-02 13:00:40');
INSERT INTO TABLE credit_log VALUES ('1','16',106000,'2019-01-02 13:00:50');
INSERT INTO TABLE credit_log VALUES ('1','17',100800,'2019-01-02 13:00:59');
INSERT INTO TABLE credit_log VALUES ('2','18',100800,'2019-01-02 13:00:11');
INSERT INTO TABLE credit_log VALUES ('2','19',100030,'2019-01-02 13:00:12');
INSERT INTO TABLE credit_log VALUES ('2','10',100000,'2019-01-02 13:00:13');
INSERT INTO TABLE credit_log VALUES ('2','45',100010,'2019-01-02 13:00:14');
INSERT INTO TABLE credit_log VALUES ('2','78',100070,'2019-01-02 13:00:15');
WITH TEMP AS(
SELECT
dist_id,
account,
sum(money) sum_money
FROM credit_log
WHERE date_format(create_time,'yyyy-MM-dd') = '2019-01-02'
GROUP BY dist_id,account)
SELECT
t1.dist_id,
t1.account,
t1.sum_money
FROM(
SELECT
temp.dist_id,
temp.account,
temp.sum_money,
rank() over(partition BY temp.dist_id ORDER BY temp.sum_money DESC) ranks
FROM TEMP
) t1
WHERE ranks = 1
有三张表分别为会员表(member)、销售表(sale)、退货表(regoods)
业务说明:
查询需求:
分组查出销售表中所有会员购买金额,同时分组查出退货表中所有会员的退货金额,把会员id相同的购买金额、退款金额得到的结果更新到表会员表中对应会员的积分字段(credits)
数据集
sale表
memberid MNAccount
1001 50.3
1002 56.5
1003 235
1001 23.6
1005 56.2
25.6
33.5
regoods表
memberid RMNAccount
1001 20.1
1002 23.6
1001 10.1
23.5
10.2
1005 0.8
create table member(
memberid string,
credits double)
row format delimited fields terminated by '\t';
create table sale(
memberid string,
MNAccount double)
row format delimited fields terminated by '\t';
create table regoods(
memberid string,
RMNAccount double)
row format delimited fields terminated by '\t';
insert into table member
select
t1.memberid,
MNAccount-RMNAccount
from(
select
memberid,
sum(MNAccount) MNAccount
from sale
where memberid!=''
group by memberid
) t1
join(
select
memberid,
sum(RMNAccount) RMNAccount
from regoods
where memberid!=''
group by memberid
) t2
on t1.memberid=t2.memberid;
用一条SQL语句查询出每门课都大于80分的学生姓名
name kecheng fenshu
张三 语文 81
张三 数学 75
李四 语文 76
李四 数学 90
王五 语文 81
王五 数学 100
王五 英语 90
方式一:
select distinct name from table where name not in (select distinct name from table where fenshu<=80);
方式二:
select name from table group by name having min(fenshu)>80;
学生表如下:
自动编号 学号 姓名 课程编号 课程名称 分数
1 2005001 张三 0001 数学 69
2 2005002 李四 0001 数学 89
3 2005001 张三 0001 数学 69
删除除了自动编号不同,其他都相同的学生冗余信息
delete tablename where 自动编号 not in(select min(自动编号) from tablename group by 学号, 姓名, 课程编号, 课程名称, 分数);
一个叫team的表,里面只有一个字段name,一共有4条纪录,分别是a,b,c,d,对应四个球队,现在四个球队进行比赛,用一条sql语句显示所有可能的比赛组合
select a.name, b.name from team a, team b where a.name < b.name;
面试题:怎么把这样一个数据
year month amount
1991 1 1.1
1991 2 1.2
1991 3 1.3
1991 4 1.4
1992 1 2.1
1992 2 2.2
1992 3 2.3
1992 4 2.4
查成这样一个结果?
year m1 m2 m3 m4
1991 1.1 1.2 1.3 1.4
1992 2.1 2.2 2.3 2.4
selec
year,
(select amount from aaa m where month=1 and m.year=aaa.year) as m1,
(select amount from aaa m where month=2 and m.year=aaa.year) as m2,
(select amount from aaa m where month=3 and m.year=aaa.year) as m3,
(select amount from aaa m where month=4 and m.year=aaa.year) as m4
from aaa
group by year;
说明:复制表(只复制结构,源表名:a,新表名:b)
select * into b from a where 1<>1; (where1=1,拷贝表结构和数据内容)
<>(不等于)(SQL Server Compact):比较两个表达式;当使用此运算符比较非空表达式时,如果左操作数不等于右操作数,则结果为true;否则,结果为false
原表:
courseid coursename score
1 java 70
2 oracle 90
3 xml 40
4 jsp 30
5 servlet 80
为了便于阅读,查询此表后的结果显式如下(及格分数为60):
courseid coursename score mark
1 java 70 pass
2 oracle 90 pass
3 xml 40 fail
4 jsp 30 fail
5 servlet 80 pass
select courseid,coursename,score,if(score>=60,"pass","fail") as mark from course;
表名:购物信息
购物人 商品名称 数量
A 甲 2
B 乙 4
C 丙 1
A 丁 2
B 丙 5
……
给出所有购入商品为两种或两种以上的购物人记录
select * from 购物信息 where 购物人 in (select 购物人 from 购物信息 group by 购物人 having count(*) >= 2);
info表
date result
2005-05-09 win
2005-05-09 lose
2005-05-09 lose
2005-05-09 lose
2005-05-10 win
2005-05-10 lose
2005-05-10 lose
如果要生成下列结果,该如何写sql语句?
date win lose
2005-05-09 2 2
2005-05-10 1 2
方式一:
select
date,
sum(case when result = "win" then 1 else 0 end) as "win",
sum(case when result = "lose" then 1 else 0 end) as "lose"
from info
group by date;
方式二:
select
a.date,
a.result as win,
b.result as lose
from(
select
date,
count(result) as result
from info
where result = "win"
group by date
) as a
join (
select
date,
count(result) as result
from info
where result = "lose"
group by date
) as b
on a.date = b.date;
有一个订单表order,已知字段有:order_id(订单ID),user_id(用户ID),amount(金额),pay_datetime(付费时间),channel_id(渠道ID),dt(分区字段)
1、在Hive中创建这个表
create external table order(
order_id int,
user_id int,
amount double,
pay_datatime timestamp,
channel_id int
)partitioned by(dt string)
row format delimited fields terminated by '\t';
2、查询dt='2018-09-01'里每个渠道的订单数,下单人数(去重),总金额;查询dt='2018-09-01'里每个渠道的金额最大3笔订单
select
count(order_id),
count(distinct(user_id)),
sum(amount)
from order
where dt="2019-09-01"
select
order_id,
channel_id,
channel_id_amount
from(
select
order_id
channel_id,
amount,
max(amount) over(partition by channel_id),
min(amount) over(partition by channel_id),
row_number() over(partition by channel_id order by amount desc) ranks
from order
where dt="2019-09-01"
) t
where t.ranks<4;
3、有一天发现订单数据重复,请分析原因
订单属于业务数据,在关系型数据库中不会存在数据重复
hive建表时也不会导致数据重复
我推测是在数据迁移时,迁移失败导致重复迁移数据冗余了
有一个订单表t_order,已知字段有:order_id(订单ID),item_id(商品id),create_time(下单时间),amount(下单金额)
有一个商品表t_item,已知字段有:item_id(商品id),item_name(商品名称),category(品类)
有一个商品表t_item,已知字段有:item_id(商品id),item_name(商品名称),category_1(一级品类),category_2(二级品类)
1、最近一个月,销售数量最多的10个商品
select
item_id,
count(order_id) a
from t_order
where dataediff(create_time,current_date)<=30
group by item_id
order by a desc;
2、最近一个月,每个种类里销售数量最多的10个商品
一个订单对应一个商品,一个商品对应一个品类
with(
select
order_id,
item_id,
item_name,
category
from t_order
join t_item
on t_order.item_id = t_item.item_id
) t
select
order_id,
item_id,
item_name,
category,
count(item_id)over(partition by category) item_count
from t
group by category
order by item_count desc
limit 10;
有一个5000万的用户文件(user_id,name,age),一个2亿记录的用户看电影的记录文件(user_id,url),根据年龄段观看电影的次数进行排序
CREATE TABLE movie_user(
user_id string,
name string,
age int);
CREATE TABLE movie_log(
user_id string,
url string);
INSERT INTO TABLE movie_user VALUES('001','u1',10);
INSERT INTO TABLE movie_user VALUES('002','u2',15);
INSERT INTO TABLE movie_user VALUES('003','u3',15);
INSERT INTO TABLE movie_user VALUES('004','u4',20);
INSERT INTO TABLE movie_user VALUES('005','u5',25);
INSERT INTO TABLE movie_user VALUES('006','u6',35);
INSERT INTO TABLE movie_user VALUES('007','u7',40);
INSERT INTO TABLE movie_user VALUES('008','u8',45);
INSERT INTO TABLE movie_user VALUES('009','u9',50);
INSERT INTO TABLE movie_user VALUES('0010','u10',65);
INSERT INTO TABLE movie_log VALUES('001','url1');
INSERT INTO TABLE movie_log VALUES('002','url1');
INSERT INTO TABLE movie_log VALUES('003','url2');
INSERT INTO TABLE movie_log VALUES('004','url3');
INSERT INTO TABLE movie_log VALUES('005','url3');
INSERT INTO TABLE movie_log VALUES('006','url1');
INSERT INTO TABLE movie_log VALUES('007','url5');
INSERT INTO TABLE movie_log VALUES('008','url7');
INSERT INTO TABLE movie_log VALUES('009','url5');
INSERT INTO TABLE movie_log VALUES('0010','url1');
select
age_region, sum(cnt) as total_count
from(
select
user_id,
case
when age between 0 and 20 then "0-20"
when age between 20 and 40 then "20-40"
when age between 40 and 60 then "40-60"
else "other age region"
end as age_region
from movie_user
) as t1
join(
select
user_id, count(*) as cnt
from movie_log
group by user_id
) as t2
on t1.user_id = t2.user_id
group by t1.age_region
order by total_count desc;
现有图书管理数据库的三个数据模型如下:
图书(数据表名:BOOK)
序号 | 字段名称 | 字段描述 | 字段类型 |
---|---|---|---|
1 | BOOK_ID | 总编号 | 文本 |
2 | SORT | 分类号 | 文本 |
3 | BOOK_NAME | 书名 | 文本 |
4 | WRITER | 作者 | 文本 |
5 | OUTPUT | 出版单位 | 文本 |
6 | PRICE | 单价 | 数值(保留小数点后2位) |
读者(数据表名:READER)
序号 | 字段名称 | 字段描述 | 字段类型 |
---|---|---|---|
1 | READER_ID | 借书证号 | 文本 |
2 | COMPANY | 单位 | 文本 |
3 | NAME | 姓名 | 文本 |
4 | SEX | 性别 | 文本 |
5 | GRADE | 职称 | 文本 |
6 | ADDR | 地址 | 文本 |
借阅记录(数据表名:BORROW_LOG)
序号 | 字段名称 | 字段描述 | 字段类型 |
---|---|---|---|
1 | READER_ID | 借书证号 | 文本 |
2 | BOOK_ID | 总编号 | 文本 |
3 | BORROW_DATE | 借书日期 | 日期 |
1、创建图书管理库的图书、读者和借阅三个基本表的表结构
-- 创建图书表book
CREATE TABLE library_book(
book_id string,
SORT string,
book_name string,
writer string,
OUTPUT string,
price decimal(10,2));
INSERT INTO TABLE library_book VALUES ('001','TP391','信息处理','author1','机械工业出版社','20');
INSERT INTO TABLE library_book VALUES ('002','TP392','数据库','author12','科学出版社','15');
INSERT INTO TABLE library_book VALUES ('003','TP393','计算机网络','author3','机械工业出版社','29');
INSERT INTO TABLE library_book VALUES ('004','TP399','微机原理','author4','科学出版社','39');
INSERT INTO TABLE library_book VALUES ('005','C931','管理信息系统','author5','机械工业出版社','40');
INSERT INTO TABLE library_book VALUES ('006','C932','运筹学','author6','科学出版社','55');
-- 创建读者表reader
CREATE TABLE library_reader(
reader_id string,
company string,
name string,
sex string,
grade string,
addr string);
INSERT INTO TABLE library_reader VALUES ('0001','阿里巴巴','jack','男','vp','addr1');
INSERT INTO TABLE library_reader VALUES ('0002','百度','robin','男','vp','addr2');
INSERT INTO TABLE library_reader VALUES ('0003','腾讯','tony','男','vp','addr3');
INSERT INTO TABLE library_reader VALUES ('0004','京东','jasper','男','cfo','addr4');
INSERT INTO TABLE library_reader VALUES ('0005','网易','zhangsan','女','ceo','addr5');
INSERT INTO TABLE library_reader VALUES ('0006','搜狐','lisi','女','ceo','addr6');
-- 创建借阅记录表borrow_log
CREATE TABLE library_borrow_log(
reader_id string,
book_id string,
borrow_date string);
INSERT INTO TABLE library_borrow_log VALUES ('0001','002','2019-10-14');
INSERT INTO TABLE library_borrow_log VALUES ('0002','001','2019-10-13');
INSERT INTO TABLE library_borrow_log VALUES ('0003','005','2019-09-14');
INSERT INTO TABLE library_borrow_log VALUES ('0004','006','2019-08-15');
INSERT INTO TABLE library_borrow_log VALUES ('0005','003','2019-10-10');
INSERT INTO TABLE library_borrow_log VALUES ('0006','004','2019-17-13');
2、找出姓李的读者姓名(NAME)和所在单位(COMPANY)
SELECT
name,
company
FROM library_reader
WHERE name LIKE '李%';
3、查找“高等教育出版社”的所有图书名称(BOOK_NAME)及单价(PRICE),结果按单价降序排序
SELECT
book_name,
price
FROM library_book
WHERE OUTPUT = "高等教育出版社"
ORDER BY price DESC;
4、查找价格介于10元和20元之间的图书种类(SORT)出版单位(OUTPUT)和单价(PRICE),结果按出版单位(OUTPUT)和单价(PRICE)升序排序
SELECT
sort,
output,
price
FROM library_book
WHERE price >= 10 and price <= 20
ORDER BY output,price;
5、查找所有借了书的读者的姓名(NAME)及所在单位(COMPANY)
SELECT
b.name,
b.company
FROM library_borrow_log a
JOIN library_reader b
ON a.reader_id = b.reader_id;
6、求”科学出版社”图书的最高单价、最低单价、平均单价
SELECT
max(price),
min(price),
avg(price)
FROM library_book
WHERE OUTPUT = '科学出版社';
7、找出当前至少借阅了2本图书(大于等于2本)的读者姓名及其所在单位
SELECT
b.name,
b.company
FROM(
SELECT
reader_id
FROM library_borrow_log
GROUP BY reader_id
HAVING count(*) >= 2
) a
JOIN library_reader b
ON a.reader_id = b.reader_id;
8、考虑到数据安全的需要,需定时将“借阅记录”中数据进行备份,请使用一条SQL语句,在备份用户bak下创建与“借阅记录”表结构完全一致的数据表BORROW_LOG_BAK,井且将“借阅记录”中现有数据全部复制到BORROW_LOG_BAK中
CREATE TABLE library_borrow_log_bak as SELECT * FROM library_borrow_log;
9、现在需要将原Oracle数据库中数据迁移至Hive仓库,请写出“图书”在Hive中的建表语句(Hive实现,提示:列分隔符|;数据表数据需要外部导入:分区分别以month_part、day_part命名)
CREATE TABLE book_hive(
book_id string,
SORT string,
book_name string,
writer string,
OUTPUT string,
price DECIMAL(10, 2))
partitioned BY ( month_part string, day_part string )
ROW format delimited FIELDS TERMINATED BY '|' stored AS textfile;
10、Hive中有表A,现在需要将表A的月分区 201505 中 user_id 为20000的user_dinner字段更新为bonc8920,其他用户user_dinner字段数据不变,请列出更新的方法步骤(Hive实现,提示:Hive中无update语法,请通过其他办法进行数据更新)
方式1:配置hive支持事务操作,分桶表,orc存储格式
方式2:第一步找到要更新的数据,将要更改的字段替换为新的值,第二步找到不需要更新的数据,第三步将上两步的数据插入一张新表中
计算平台的每一个用户发过多少日记、获得多少点赞数
已知数据
t1:10万行数据
uid(用户id) log_id(日记id)
uid1 log_id1
uid2 log_id2
uid3 log_id3
......
t2:1000万行数据(注:没有被点赞的日志此表不做记录)
log_id(日记id) like_uid(点赞的用户id)
log_id1 uid2
log_id1 uid3
log_id1 uid4
log_id3 uid2
......
需求,请用sql计算出如下结果
uid(用户id) log_cnt(发过多少日记) liked_cnt(获得多少点赞)
uid1 2 3
uid2 1 1
......
with t3 as(
select
*
from t1
left join t2
on t1.log_id = t2.log_id
)
select
uid,
count(log_id) over(partition by uid) log_cnt,
count(like_uid) over(partition by log_id) liked_cnt
from t3;
处理产品版本号,版本号信息存储在数据表中,每行一个版本号
版本号命名规则:产品版本号由三部分组成,如:v9.11.2
第一部分9为主版本号,为1-99之间的数字;第二部分11为子版本号,为0-99之间的数字;第三部分2为阶段版本号,为0-99之间的数字(可选)
已知t1表有若干个版本号:
version_id(版本号)
v9.9.9
v8.1
v9.9.2
v9.20
v31.0.1
......
找出t1表中最大的版本号
计算出如下格式的所有版本号排序,要求对于相同的版本号,顺序号并列
version_id(版本号) seq(顺序号)
v31.0.1 0
v9.20 1
v9.9.2 2
v9.9.2 2
v9.0.8 4
......
1、找出t1表中最大的版本号
思路:列转行,切割版本号,一列变三列(主版本号 子版本号 阶段版本号)
with t2 as( -- 转换
select
version_id v1, -- 版本号
version_id v2 -- 主
from t1
lateral view explode(v2) tmp as v2
)
select -- 第一层 找出第一个
v1,
max(v2)
from t2;
select
version_id, -- 版本号
max(split(version_id,".")[0]) v1, -- 主版本不会为空
max(if(split(version_id,".")[1]="",0,split(version_id,".")[1]))v2, -- 取出子版本并判断是否为空,并给默认值
max(if(split(version_id,".")[2]="",0,split(version_id,".")[2]))v3 -- 取出阶段版本并判断是否为空,并给默认值
from t1;
2、计算出如下格式的所有版本号排序,要求对于相同的版本号,顺序号并列
select
version_id,
rank() over(partition by version_id order by version_id) seq
from t1;
现在有三个表student(学生表)、course(课程表)、score(成绩单),结构如下:
create table student(
id bigint comment '学号',
name string comment '姓名',
age bigint comment '年龄');
create table course(
cid string comment '课程号,001/002格式',
cname string comment '课程名');
create table score(
id bigint comment '学号',
cid string comment '课程号',
score bigint comment '成绩'
) partitioned by(event_day string)
其中score中的id、cid,分别是student、course中对应的列请根据上面的表结构,回答下面的问题:
insert into table student values(1001,'wsl1',21);
insert into table student values(1002,'wsl2',22);
insert into table student values(1003,'wsl3',23);
insert into table student values(1004,'wsl4',24);
insert into table student values(1005,'wsl5',25);
insert into table course values('001','math');
insert into table course values('002','English');
insert into table course values('003','Chinese');
insert into table course values('004','music');
insert into table score values(1001,'004',10);
insert into table score values(1002,'003',21);
insert into table score values(1003,'002',32);
insert into table score values(1004,'001',43);
insert into table score values(1005,'003',54);
insert into table score values(1001,'002',65);
insert into table score values(1002,'004',76);
insert into table score values(1003,'002',77);
insert into table score values(1001,'004',48);
insert into table score values(1002,'003',39);
1、请将本地文件(/home/users/test/20190301.csv)文件,加载到分区表score的20190301分区中,并覆盖之前的数据
load data local inpath '/home/users/test/20190301.csv' overwrite into table score partition(event_day='20190301');
2、查出平均成绩大于60分的学生的姓名、年龄、平均成绩
select
stu.name,
stu.age,
stu.avg_score
from student stu
join(
select
id,
avg(score) avg_score
from score
group by id
) t1
on stu.id=t1.id
where avg_score>60;
3、查出没有‘001’课程成绩的学生的姓名、年龄
select
stu.name,
t2.age
from student stu
join(
select
id
from score
where cid != 001
group by id
) t2
on stu.id=t2.id;
4、查出有‘001’,’002’这两门课程下,成绩排名前3的学生的姓名、年龄
select
stu.name,
stu.age
from(
select
id,
cid,
score,
rank() over(partition by cid order by score desc) rank
from score
where cid=001 or cid-002
) t1
join student stu
on t1.id=stu.id
where rank<=3;
5、创建新的表score_20190317,并存入score表中20190317分区的数据
create table score_20190317 as select * from score where dt='20190317';
6、如果上面的score表中,uid存在数据倾斜,请进行优化,查出在20190101-20190317中,学生的姓名、年龄、课程、课程的平均成绩
select
stu.name,
stu.age,
cou.cname,
t1.avg_score
from(
select
id,
cid,
avg(score) avg_score
from score
group by id,cid
where dt >= '20190101' and dt <= '20190317'
) t1
left join student stu on t1.id = stu.id
left join course cou on t1.cid = cou.cid;
7、描述一下union和union all的区别,以及在mysql和HQL中用法的不同之处?
union会将联合的结果集去重,效率较union all差
union all不会对结果集去重,所以效率高
HQL中要求union或union all操作时必须保证select集合的结果相同个数的列,并且每个列的类型是一样的
8、简单描述一下lateral view语法在HQL中的应用场景,并写一个HQL实例
比如一个学生表为:
学号 姓名 年龄 成绩(语文|数学|英语)
001 张三 16 90,80,95
需要实现的效果:
学号 成绩
001 90
001 80
001 95
create table student(
id string,
name string,
age int,
scores array<string>)
row format delimited fields terminated by '\t'
collection items terminated by ',';
select
id,
score
from student
lateral view explode(scores) tmp_score as score;
需求:
数据如下
username month salary(消费记录)
A 2015-01 5
A 2015-01 15
B 2015-01 5
A 2015-01 8
B 2015-01 25
A 2015-01 5
A 2015-02 4
A 2015-02 6
B 2015-02 10
B 2015-02 5
create table t_consumer(
username string,
month string,
salary int)
row format delimited fields terminated by ',';
load data local inpath '/home/hadoop/data/emp_salas.txt' into table t_consumer;
1、求出每人每月的消费金额
select
username,
month,
sum(salary) as salary
from t_consumer
group by username,month;
2、求出截止到当月每个人累计消费总额
select
username,
month,
cnt,
sum(cnt) over(partition by username order by month) as total_cnt
from(
select
username,
month,
sum(salary) as cnt
from(
select
*
from t_consumer
) as t1
group by username, month
) as t2
order by username, month;