--使用子查询提升count distinct的效率
数据库:38_6543 db_hyrhk
--1.count distinct很有用,但太慢
----Count distinct是SQL分析时的祸根所以拿它来做个例子
----首先,让我们以我们一直使用的一个简单查询开始:查看案件涉及当事人大于200的案件
select
t_aj_all.c_ajbh,
count(distinct t_dsr_allbak.c_bh)
from t_aj_all
join t_dsr_allbak on t_dsr_allbak.c_ajbh = t_aj_all.c_ajbh
group by t_aj_all.c_ajbh having count(distinct t_dsr_allbak.c_bh)>200
order by count desc
耗时:8.573s
----它慢是因为数据库遍历了所有当事人以及所有的案件,然后join它们,再将它们排序,这些都在真正的group和分组和聚合工作之前。
--2.先聚合,然后Join
----group-聚合后的任何工作代价都要低,因为数据量会更小。:
select
t_aj_all.c_ajbh,
dsr.ct
from t_aj_all
join (
select
c_ajbh,
count(distinct c_bh) as ct
from t_dsr_allbak
group by c_ajbh having count(distinct c_bh)>200
) dsr
on dsr.c_ajbh = t_aj_all.c_ajbh
order by dsr.ct desc
耗时:7.822s
----正如设计的,group-聚合在join之前。
--3.先将数据集缩小
----我们可以做的更好。通过在整个当事人表上group-聚合,我们处理了数据库中很多不必要的数据。
----我们可以预先计算差异,而不是处理全部数据,这样只需要一个哈希集合。然后我们在此基础上做一个简单的聚集即可。
select
t_aj_all.c_ajbh,
counts.ct
from t_aj_all
inner join (
select distinct_dsr.c_ajbh,
count(1) as ct
from (
select distinct c_ajbh, c_bh
from t_dsr_allbak
) as distinct_dsr
group by distinct_dsr.c_ajbh having count(1) >200
) as counts
on counts.c_ajbh = t_aj_all.c_ajbh
order by counts.ct desc
耗时:1.948s
----我们采取内部的count-distinct-group,然后将数据拆成两部分分成两块。第一块计算distinct (c_ajbh, c_bh) 。第二块在它们基础上运行一个简单group-count。跟上面一样,最后再join。
----通常,数据大小和类型很重要。上面的例子受益于基数中没多少换算。distinct (c_ajbh, c_bh)相对于数据总量来说数量也很少。不同的对数越多,用来group和计数的唯一数据就越多——代价便会越来越大。
--总结:下一遇到长时间运行的count distinct时,尝试一些子查询吧。
with as 语法:WITH AS短语,也叫做子查询部分(subquery factoring),可以让你做很多事情,定义一个SQL片断,该SQL片断会 被整个SQL语句所用到。如果WITH AS短语所定义的表名被调用两次以上,则优化器会自动将 WITH AS短语所获取的数据放入一个TEMP表里,如果只是被调用一次,则不会。而提示materialize则是强制将WITH AS 短语里的数据放入一个全局临时表里。很多查询通过这种方法都可以提高速度
是公用表表达式,可以理解为创建临时表。比如
WITH A AS(SELECT * FROM TABLE1)
SELECT * FROM A
with as语法
–针对一个别名
with tmp as (select * from tb_name)
–针对多个别名
with
tmp as (select * from tb_name),
tmp2 as (select * from tb_name2),
tmp3 as (select * from tb_name3)
其实就是把一大堆重复用到的sql语句放在with as里面,取一个别名,后面的查询就可以用它,这样对于大批量的sql语句起到一个优化的作用,而且清楚明了
--样例:
--1.
select
t_aj_all.c_ajbh,
log_counts.ct
from t_aj_all
inner join (
select distinct_logs.c_ajbh,
count(1) as ct
from (
select distinct c_ajbh, c_bh
from t_dsr_allbak
) as distinct_logs
group by distinct_logs.c_ajbh having count(1) >200
) as log_counts
on log_counts.c_ajbh = t_aj_all.c_ajbh
order by log_counts.ct desc
with distinct_logs as
( select distinct c_ajbh, c_bh
from t_dsr_allbak
),
log_counts as
(
select distinct_logs.c_ajbh,
count(1) as ct
from distinct_logs group by distinct_logs.c_ajbh having count(1) >200
)
select
t_aj_all.c_ajbh,
log_counts.ct
from t_aj_all
inner join log_counts
on log_counts.c_ajbh = t_aj_all.c_ajbh
order by log_counts.ct desc
--2.
select
t_aj_all.c_ajbh,
dsr.ct
from t_aj_all
join (
select
c_ajbh,
count(distinct c_bh) as ct
from t_dsr_allbak
group by c_ajbh having count(distinct c_bh)>200
) dsr
on dsr.c_ajbh = t_aj_all.c_ajbh
order by dsr.ct desc
with dsr as ( select
c_ajbh,
count(distinct c_bh) as ct
from t_dsr_allbak
group by c_ajbh having count(distinct c_bh)>200)
select t_aj_all.c_ajbh,
dsr.ct
from t_aj_all
join dsr on dsr.c_ajbh = t_aj_all.c_ajbh
order by dsr.ct desc
区别:
EXPLAIN ANALYZE
with solution as(
SELECT
c_id,
c_task_mc,
d_createtime,
n_status,
c_graph_version
FROM
db_solution.t_solution_task
),
temp as(
SELECT
c_id,
MAX (d_createtime) AS d_createtime
FROM
db_solution.t_solution_task_his
GROUP BY
c_id
),
solutionhis as(
SELECT
A .c_id,
A .c_task_mc,
A .d_createtime,
A .n_status,
A .c_graph_version
FROM
db_solution.t_solution_task_his A
left join temp on A.c_id = temp.c_id and A.d_createtime=temp.d_createtime
left join solution on A.c_id= solution.c_id
where temp.c_id is not null and solution.c_id is null)
select
COUNT (DISTINCT task.c_id)
from
db_solution.t_task task
left join solution on task.c_id = solution.c_id
left join solutionhis on task.c_id = solutionhis.c_id
LEFT JOIN db_solution.t_task_process_graph gra ON task.c_id = gra.c_task_id AND
task.c_graph_version = gra.c_version
EXPLAIN ANALYZE
select
COUNT (DISTINCT task.c_id)
from
db_solution.t_task task
left join db_solution.t_solution_task solution on task.c_id = solution.c_id
left join ( SELECT
A .c_id,
A .c_task_mc,
A .d_createtime,
A .n_status,
A .c_graph_version
FROM
db_solution.t_solution_task_his A
left join ( SELECT c_id,MAX (d_createtime) AS d_createtime FROM db_solution.t_solution_task_his GROUP BY c_id)temp on A.c_id = temp.c_id and A.d_createtime=temp.d_createtime
left join db_solution.t_solution_task solution on A.c_id= solution.c_id
where temp.c_id is not null and solution.c_id is null) solutionhis on task.c_id = solutionhis.c_id
LEFT JOIN db_solution.t_task_process_graph gra ON task.c_id = gra.c_task_id AND
task.c_graph_version = gra.c_version