PostgreSQL Limit对索引的影响

服务器CPU排行榜

相关行业的同学如果看不懂,应该好好反思一下自己,思考人生了.

1.创建测试表

-- Benchmark table: a serial primary key, one plain integer column (num) and
-- one integer-array column (ref). Only the primary-key index is created here.
drop table if exists test;

create table test (
    objectid serial    not null,
    num      integer   not null,
    ref      integer[] not null,
    constraint pk_test_objectid primary key (objectid)
)
with (fillfactor = 100);

-- Register the primary-key index as the index used by CLUSTER for this table.
alter table test
    cluster on pk_test_objectid;

为加快插入速度,其它索引在生成数据完成后再创建.

2.创建函数

函数用于控制num和ref的值分布,以便num和ref字段上的索引具有较高的可选择性.

-- Remove earlier versions of the helper objects so the script can be re-run.
-- The functions whose signatures mention tweights are dropped before the type.
drop function if exists saveAsTest(integer, integer[]);
drop function if exists gen_row(integer[], tweights[], tweights[]);
drop function if exists gen_array(integer[], tweights[]);
drop function if exists get_next_index(tweights[]);
drop type if exists tweights;
/****************************************************************************************
    Weight entry used by the smooth weighted round-robin selection
        weight:    the configured weight
        curweight: the running (current) weight; initialise it to 0
****************************************************************************************/
create type tweights as(weight integer,curweight integer);
/****************************************************************************************
    Smooth weighted round-robin balancing: performs ONE selection step.

    Parameter
        $1       array of tweights entries, carrying the state left by the previous call
    Returns exactly one row
        index    1-based position of the selected entry
        weights  the entries with their updated curweight; pass this to the next call

    A step adds weight to every curweight, selects the entry with the largest
    curweight (the lowest position wins a tie) and then subtracts the sum of all
    weights from the selected entry.

    Example: array[((50,0)::tweights),((30,0)::tweights),((15,0)::tweights),((5,0)::tweights)]
        Four weights that add up to 100; over every 100 calls
            position 1 gets a 50% share
            position 2 gets a 30% share
            position 3 gets a 15% share
            position 4 gets a  5% share

    An empty array raises an exception.
****************************************************************************************/
create or replace function get_next_index(tweights[])
  returns table(index integer, weights tweights[])
as $$
    declare
        v_state tweights[] := $1;                 -- working copy of the entries
        v_len   integer := array_length($1, 1);   -- NULL when the array is empty
        v_pick  integer := -1;                    -- position of the best entry so far
        v_best  integer;                          -- curweight of that entry
        v_total integer := 0;                     -- sum of all configured weights
        v_entry tweights;
    begin
        if v_len is null then
            raise exception 'get_next_index: the weights array must not be empty';
        end if;

        -- A single entry is always the one selected and its state never changes.
        if (1 = v_len) then
            return query select 1, $1;
            -- RETURN QUERY only appends to the result set. Without this RETURN the
            -- code below would also run and emit a second row.
            return;
        end if;

        for v_i in 1..v_len loop
            v_entry := v_state[v_i];
            v_entry.curweight := v_entry.curweight + v_entry.weight;
            v_total := v_total + v_entry.weight;
            v_state[v_i] := v_entry;
            -- Strict "<" keeps the first entry among equal curweights.
            if (-1 = v_pick or v_best < v_entry.curweight) then
                v_pick := v_i;
                v_best := v_entry.curweight;
            end if;
        end loop;

        -- Penalise the winner so that the other entries catch up on later calls.
        v_entry := v_state[v_pick];
        v_entry.curweight := v_entry.curweight - v_total;
        v_state[v_pick] := v_entry;
        return query select v_pick, v_state;
    end;
$$ language plpgsql strict;


/****************************************************************************************
    Build an integer array of 1 to 4 elements.

    Parameters
        $1  candidate values
        $2  round-robin state used to pick positions in $1
    Returns one row
        vals     the generated array
        weights  the round-robin state after the last pick

    The element count is drawn once as (random()*(4-1)+1)::integer. The cast rounds,
    so sizes 1 and 4 come up less often than sizes 2 and 3.

    To remove the function:
    drop function if exists gen_array(integer[],tweights[]);
****************************************************************************************/
create or replace function gen_array(integer[],tweights[])
    returns table(vals integer[], weights tweights[])
as $$
    with recursive picks(step, val, weights, wanted) as (
        -- first pick; also fixes how many elements will be produced
        select
            1,
            $1[first_pick.index],
            first_pick.weights,
            (random() * (4 - 1) + 1)::integer
        from get_next_index($2) as first_pick

        union all

        -- each further pick continues from the state left by the previous one
        select
            prev.step + 1,
            $1[next_pick.index],
            next_pick.weights,
            prev.wanted
        from picks as prev
        cross join lateral get_next_index(prev.weights) as next_pick
        where prev.step < prev.wanted
    )
    select
        array_agg(picks.val),
        (
            select last_pick.weights
            from picks as last_pick
            where last_pick.step = last_pick.wanted
        )
    from picks;
$$ language sql strict;
/****************************************************************************************
    Generate the column values for one row.

    $1, $2 and $3 must be arrays of the same size.
        $1  candidate values
        $2  round-robin state used for the integer column
        $3  round-robin state used for the integer[] column
    Returns one row
        num       the picked integer
        weights1  state of $2 after the pick
        ref       the generated integer array
        weights2  state of $3 after the array was generated

    To remove the function:
    drop function if exists gen_row(integer[],tweights[],tweights[]);
****************************************************************************************/
create or replace function gen_row(integer[],tweights[],tweights[])
    returns table(num integer,weights1 tweights[],ref integer[],weights2 tweights[])
as $$
    select
        $1[num_pick.index],
        num_pick.weights,
        ref_pick.vals,
        ref_pick.weights
    from get_next_index($2) as num_pick
    cross join gen_array($1, $3) as ref_pick;
$$ language sql strict;
/****************************************************************************************
    函数测试是否符合预期
****************************************************************************************/
/*
select *
from gen_row(
  array[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],
  array[
    (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
    (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
    (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
    (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights
  ],
  array[
    (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
    (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
    (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
    (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights
  ]);
*/
/****************************************************************************************
    Insert one row into the test table.
        $1  value for num
        $2  value for ref
    Returns the objectid assigned to the new row.

    To remove the function:
    drop function if exists saveAsTest(integer,integer[]);
****************************************************************************************/
create or replace function saveAsTest(integer,integer[])
    returns integer
as $$
    insert into test (num, ref)
    values ($1, $2)
    returning objectid;
$$ language sql strict;

3.生成测试数据

  • num的值范围为1-20,平均分布(各个的值占比为5%).
  • ref的值范围为1-20,数组大小控制在1-4(随机大小),每生成100个数值各个值的占比也为5%.
-- Empty the table on purpose (all rows) and restart objectid numbering at 1.
-- TRUNCATE leaves no dead tuples behind, unlike an unfiltered DELETE, and
-- RESTART IDENTITY resets the sequence owned by the objectid column.
-- Run this once, before the loader sessions are started.
truncate table test restart identity;
/****************************************************************************************
    Load the test data. Run this script concurrently from several terminals
    (the original text suggests 10).
    The author's test machine is a dual-socket system with 16 cores
    (Intel(R) Xeon(R) CPU E5530  @ 2.40GHz, a dated model), so 16 terminals were used.
    The table is simple, so the load writes little to disk
    (about 16MB/s at most, below 2MB/s most of the time).
    The work is CPU bound: with 16 terminals running, CPU usage reached 100%.
****************************************************************************************/
\timing on
do $$
declare
    -- candidate values shared by the num and ref columns
    v_values integer[] := array[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20];
    -- round-robin state for num: 20 entries of weight 5, i.e. 5% each
    v_num_state tweights[] := array[
        (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
        (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
        (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
        (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
        (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights
    ];
    -- round-robin state for ref: starts from the same 20 x (5,0) configuration
    v_ref_state tweights[] := v_num_state;

    v_num integer;
    v_ref integer[];
begin
    for i in 1..1000000 loop
        -- each call hands back the updated states, which feed the next iteration
        select num, weights1, ref, weights2
          into v_num, v_num_state, v_ref, v_ref_state
          from gen_row(v_values, v_num_state, v_ref_state);

        perform saveAsTest(v_num, v_ref);
        -- debugging aid: raise notice  '%  %', v_num,v_ref;

        -- progress: print 1, 2, 3, ... once per 1000 inserted rows
        if i % 1000 = 0 then
            raise notice  '%', i / 1000;
        end if;
    end loop;
end;
$$;
序号 耗时(ms)
1 1491206.016
2 1511390.919
3 1517245.568
4 1509241.432
5 1519552.252
6 1514420.896
7 1520820.174
8 1512984.280
9 1519851.215
10 1514590.502
11 1505463.332
12 1503091.390
13 1503749.024
14 1501670.722
15 1500027.669
16 1503459.150

4.创建索引

插入完成后vacuum表,测试时结果更准确.

-- Vacuum (with freeze) and analyze the freshly loaded table, printing details.
vacuum (freeze, verbose, analyze) test;

-- Check the number of loaded rows; the output recorded by the author follows.
select count(*)
from test;
/*
count   
----------
16000000
(1 row)

Time: 587.956 ms
*/

/* B-tree index on the integer column */
create index idx_test_num
    on test using btree (num);

/* Array index
 gin__int_ops is used: in the author's tests so far it performed best for
 the array lookups needed here.
 gin__int_ops depends on the intarray extension:
 create extension intarray;
*/
create index idx_test_ref
    on test using gin (ref gin__int_ops);
/* Other array index types; each needs its own extension */
--create index idx_test_ref on test using gist(ref gist__int_ops);
--create index idx_test_ref on test using rum(ref rum_anyarray_ops);

/* Show the resulting table definition */
\dS+ test;

5.查询测试

注意不要加order by,order by会影响执行计划,目前只单纯的测试limit和索引之间的关系.

执行查询时多执行几次,直至不读取磁盘(没有Buffers: shared read).

因为数据在表中的占比一样,因此只要查询一个值就可以了.

-- Baseline queries without LIMIT. The trailing "Execution time" comments are
-- the timings recorded in the original write-up.

/* value present in the table, B-tree index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where num=1;
--Execution time: 2568.059 ms

/* value not present in the table, B-tree index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where num=21;
--Execution time: 0.044 ms

/* values present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref@>array[1];
--Execution time: 6589.734 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[1,2];
--Execution time: 9037.726 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[1,2,3];
--Execution time: 11621.418 ms

/* values not present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref@>array[21];
--Execution time: 0.065 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[21,22];
--Execution time: 0.056 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[21,22,23];
--Execution time: 0.060 ms

6.常规limit测试

-- The same predicates as the baseline, now with "limit 50". The trailing
-- "Execution time" comments are the timings recorded in the original write-up.

/* value present in the table, B-tree index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where num=1 limit 50;
--Execution time: 0.535 ms

/* value not present in the table, B-tree index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where num=21 limit 50;
--Execution time: 0.050 ms


/* values present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref@>array[1] limit 50;
--Execution time: 0.585 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[1,2] limit 50;
--Execution time: 0.561 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[1,2,3] limit 50;
--Execution time: 0.537 ms

/* values not present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref@>array[21]  limit 50;
--Execution time: 3572.286 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[21,22] limit 50;
--Execution time: 3944.530 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[21,22,23] limit 50;
--Execution time: 4130.662 ms

通过对比可以看到B树索引添加limit性能更高,只返回limit限定的数据,无论表中是否包含条件值.

数组索引分两种情况,表中包含条件值、表中不包含条件值.

6.1 数组索引和limit

6.1.1 表中包含条件值

不会使用数组索引,使用全表扫描,但是有limit限定,所以速度很快.

6.1.2 表中不包含条件值

不会使用数组索引,使用全表扫描,因为值不包含在表中,所以需要全表扫描,然后过滤所有数据,速度非常慢.

6.1.2.1 解决方案-使用with

with会使用数组索引.

-- Workaround 1: put the filter in a CTE and apply the LIMIT outside of it.
-- Note that these statements use "limit 10", not the "limit 50" of the plain
-- LIMIT tests.
-- NOTE(review): the effect relies on the CTE being planned separately from the
-- outer LIMIT. From PostgreSQL 12 onward a CTE like this may be inlined into
-- the outer query; "with cte as materialized (...)" would presumably be needed
-- there - confirm against the target server version.

/* values present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
with cte as(
  select objectid from test where ref@>array[1]
)select * from cte limit 10;
--Execution time: 293.301 ms

explain (analyze,verbose,costs,buffers,timing)
with cte as(
  select objectid from test where ref&&array[1,2]
)select * from cte limit 10;
--Execution time: 464.427 ms

explain (analyze,verbose,costs,buffers,timing)
with cte as(
  select objectid from test where ref&&array[1,2,3]
)select * from cte limit 10;
--Execution time: 717.172 ms

/* values not present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
with cte as(
  select objectid from test where ref@>array[21]
)select * from cte limit 10;
--Execution time: 0.075 ms

explain (analyze,verbose,costs,buffers,timing)
with cte as(
  select objectid from test where ref&&array[21,22]
)select * from cte limit 10;
--Execution time: 0.078 ms

explain (analyze,verbose,costs,buffers,timing)
with cte as(
  select objectid from test where ref&&array[21,22,23]
)select * from cte limit 10;
--Execution time: 0.079 ms

6.1.2.2 解决方案-禁用全表扫描

禁用全表扫描后,PostgreSQL会自动选择合适的索引,在本例中使用了索引idx_test_ref.类似Oracle的强制索引.

set enable_seqscan只对当前会话有效,注意使用完成后要打开.

-- Workaround 2: disable sequential scans for this session while the LIMIT
-- queries run; the setting is switched back on at the end of the block.
set enable_seqscan = off;
/* values present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref@>array[1] limit 50;
--Execution time: 297.018 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[1,2] limit 50;
--Execution time: 466.661 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[1,2,3] limit 50;
--Execution time: 708.372 ms

/* values not present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref@>array[21]  limit 50;
-- NOTE(review): the figure below is labelled planning time, unlike every other
-- entry, which records execution time - presumably a copy mistake; re-measure.
--Planning time: 0.089 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[21,22] limit 50;
--Execution time: 0.065 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[21,22,23] limit 50;
--Execution time: 0.066 ms
-- Re-enable sequential scans for the rest of the session.
set enable_seqscan = on;

6.1.3 小结

  • 索引扫描的成本较昂贵,但因返回的数据少,所以比较快.
  • limit会对查询行为产生较大的影响,设置了limit后需重新查看执行计划.
  • order by也会对查询行为产生较大的影响,需结合需求和执行计划调整.
  • 如果是单个条件(例如本例),且大多数情况下表包含值,建议使用”6.常规limit测试”,偶尔有表不包含的值时对总体影响不大.
  • 如果是多个条件,建议使用”6.1.2.1 解决方案-使用with”,它和禁用全表扫描效果差不多.具体使用哪种需结合需求和执行计划调整.如下:
-- Multiple conditions: a B-tree predicate (num) combined with an array
-- predicate (ref), filtered inside a CTE with the LIMIT applied outside.
-- The first statement uses array values that exist in the table, the second
-- uses values that do not. No timings were recorded for these two statements.
-- NOTE(review): from PostgreSQL 12 onward a CTE like this may be inlined;
-- "as materialized" would presumably be needed to keep the behaviour - confirm.
explain (analyze,verbose,costs,buffers,timing)
with cte as(
  select objectid from test where num=1 and ref&&array[1,2,3]
)select * from cte limit 10;

explain (analyze,verbose,costs,buffers,timing)
with cte as(
  select objectid from test where num=1 and ref&&array[21,22,23]
)select * from cte limit 10;

你可能感兴趣的:(postgresql,PostgreSQL二次开发)