SQL复习(1)

SQL语句

行数

限制行数:末尾+limit n

查询行数:select count(*) from table

判定是否为空:is not null

聚合函数:count(xx)、sum(xx)、count(distinct xxx)、max/min/avg(xx)

union:纵向拼接

  • union:对结果进行并集操作(不包括重复行)
  • union all:对结果进行并集操作(包括重复行)
select * from OrderItems where quantity = 100
union all
select * from OrderItems where prod_id like 'BNBG%'

case when判断是否有过购买记录,增加新的一列

if(xx, xx, xx):判断

CASE expression
    WHEN condition1 THEN result1
    WHEN condition2 THEN result2 
    ... 
    WHEN conditionN THEN resultN
    ELSE result 
    END
### 判断是否有过购买记录,增加新的一列
select
 customer_id,
 (case 
     when latest_place_order_date is not null then 1
     else 0
     end)
     as if_placed_oredr
from customers_info
# 等价于
select customer_id, if(latest_place_order_date is not null,1,0) if_placed_order
    from customers_info
    
### 两重排序
# 按城市对客户进行排序,如果城市为空,则按国家排序
select * from customers_info
order by if(city is null, country, city)

### case后grouby
select (case
            when age is null then '未填写'
            when age>50 then '50以上'
            when age>20 and age<50 then '20-50'
            else '20以下'
       end) as age_group,
       count(*) user_count
from customers_info
group by age_group

窗口函数

语法结构:函数 OVER ([PARTITION BY 字段名 ORDER BY 字段名 ASC|DESC])

或:函数 OVER 窗口名 … WINDOW 窗口名 AS ([PARTITION BY 字段名 ORDER BY 字段名 ASC|DESC])

  • 序号函数
    • row_number():顺序排序
    • rank():并列排序,跳过重复序号
    • dense_rank():并列排序,不跳过重复序号
  • 分布函数
    • percent_rank():等级值百分比
    • cume_dist():累计分布值
  • 前后函数
    • lag(expr, n):返回当前行的前n行的expr的值
    • lead(expr, n):返回当前行的后n行的expr的值
  • 首尾函数
    • first_value(expr):返回第一个expr的值
    • last_value(expr):返回最后一个expr的值
  • 其他函数
    • nth_value(expr, n):返回第n个expr的值
    • ntile(n):将分区中的有序数据分为n个桶,记录桶编号
# 查询每天刷题通过数最多的前二名用户id和刷题数
# row_number():对每一组数据进行编号
select date, user_id, pass_count
from(select date,user_id,pass_count,
    row_number() over(partition by date order by pass_count desc) m
    from questions_pass_record) a
where m<=2 
order by date

# 查询用户的下一次刷题日期nextdate
# lead(expr, n):返回当前行的后n行的expr的值
select user_id,date,lead(date, 1) 
                    over(partition by user_id 
                    order by date asc) nextdate
from questions_pass_record

having字句

# 输出提交次数大于2次的用户ID且倒序排列
select user_id
from done_questions_record
group by user_id
having count(*)>2
order by user_id desc

# 输出提交且通过次数大于2 的用户ID且升序排列
select user_id
from done_questions_record
group by user_id
having sum(result_info)>2
order by user_id asc

# 计算三个值:
# question_pass_rate 表示每个用户不同题目的通过率(同一用户同一题重复提交通过仅计算一次);
# pass_rate 表示每个用户的提交正确率(只要有提交一次即计算一次);
# question_per_cnt表示平均每道不同的题目被提交的次数(只要有一次提交即计算一次)
select user_id, 
    count(distinct if(result_info=1,question_id,null))/count(distinct question_id) question_pass_rate,
    sum(result_info)/count(question_id) pass_rate,
    count(question_id)/count(distinct question_id) question_per_cnt
from done_questions_record
group by user_id
having question_pass_rate>0.6
order by user_id asc

时间函数

# 求出哪个小时为广告点击的高峰期,以及发生的点击次数
select hour(click_time) click_hour,
       count(click_time) click_cnt
from user_ad_click_time
group by click_hour
order by click_cnt desc
limit 1

# 输出在5min内完成点击购买的用户ID
select click.user_id uid
from user_ad_click_time click, user_payment_time pay
where click.user_id=pay.user_id 
and click.trace_id =pay.trace_id 
and time_to_sec(timediff(pay.pay_time,click.click_time))<=300
order by click.user_id desc

字符函数

# 正则表达式LIKE
select id, comment from comment_detail
where comment like '是%' or comment like '求%'
order by id asc

# 子串substring_index(xx,',',ind)
# 输出所有第二话题为1002的评论对应的第一话题subject_id1的数量cnt
select substring_index(subject_set,',',1) as subject_id1,
count(*) as cnt
from comment_detail
where substring_index(substring_index(subject_set,',',2),',',-1)='1002'
group by substring_index(subject_set,',',1)

# 字符串替换replace(xx, 'str1', 'str2')
select id, replace(comment,',','') comment
from comment_detail
where char_length(comment)>3  # 字符串长度

经典题

  1. order by的字段不在select的字段中,可以使用子查询+union(当一定要写Union的时候,否则可以直接写)

你可能感兴趣的:(数据分析,sql,数据库)