函数已改进,请使用新版本函数,参看PostgreSQL 黑科技-递规二分法切分汉字
此方法的缺点是比较浪费空间,不过在当前相比较下来以空间换取时间的方法是值得的.
drop function if exists clear_punctuation(text);
create or replace function clear_punctuation(text)
returns text
as $$
select regexp_replace($1,
'[\ |\~|\`|\!|\@|\#|\$|\%|\^|\&|\*|\(|\)|\-|\_|\+|\=|\||\\|\[|\]|\{|\}|\;|\:|\"|\''|\,|\<|\.|\>|\/|\?|\:|\。|\;|\,|\:|\“|\”|\(|\)|\、|\?|\《|\》]'
,'','g');
$$ language sql strict immutable;
drop function if exists str_to_tsvector(text);
create or replace function str_to_tsvector(text)
returns tsvector
as $$
declare
v_count integer;
v_txt text;
v_txts text[];
v_result tsvector;
begin
v_txt := clear_punctuation($1);
--数组大小为字符数量-1
v_count := length(v_txt)-1;
if( v_count < 1 ) then
raise exception '输入参数("%")去除标点符号后至少需要2个字符',$1;
end if;
for i in 1..v_count loop
v_txts := array_append(v_txts, substring(v_txt,i,2));
end loop;
--tsvector类型要求去除重复并排序
with cte1 as(
select f from unnest(v_txts) as f group by f
),cte2 as(
select f from cte1 order by f
)select array_to_tsvector(array_agg(f)) into v_result from cte2;
return v_result;
end;
$$ language plpgsql strict immutable;
drop table if exists test_cond;
drop table if exists test;
create table test(
objectid bigserial not null, --唯一编号
name text not null, --名称
newtime timestamptz default(now()) not null, --首次添加时间
lastime timestamptz, --最后一次修改时间
constraint pk_test_objectid primary key (objectid)
);
--rum需要大量的计算和写入,保存速度非常慢,因此创建表时设置unlogged标记
--unlogged标记不是绝对安全的,因此和主表分开
--虽然浪费了一定的磁盘空间,但可以最大程度保证写入速度
--test_cond表损坏后,通过主表可以很容易的重建,不过机率非常小
--test_cond中的数据通过触发器更新,不需要操作这个表
create unlogged table test_cond(
objectid bigserial not null, --唯一编号
keys tsvector not null, --关键字
constraint pk_test_cond_objectid primary key (objectid),
constraint fk_test_cond_objectid foreign key(objectid) references test(objectid) on delete cascade
);
create index idx_test_cond_keys on test_cond using rum(keys rum_tsvector_ops);
--test_cond表设置了级联删除,所以不需要delete触发
drop trigger if exists tri_test_change on test;
drop function if exists tri_test_trigger();
create or replace function tri_test_trigger()
returns trigger as $$
begin
if (TG_OP = 'INSERT') then
insert into test_cond(objectid,keys) values(NEW.objectid,str_to_tsvector(NEW.name));
return NEW;
elsif (TG_OP = 'UPDATE') then
update test_cond set keys=str_to_tsvector(NEW.name) where objectid=NEW.objectid;
return NEW;
end if;
RETURN NULL;
end;
$$ language 'plpgsql' SECURITY DEFINER;
--test_cond依赖test表,因此只能在test成功后再触好,只能设置为after
create trigger tri_test_change after INSERT or UPDATE on test
for each ROW EXECUTE PROCEDURE tri_test_trigger();
新版本函数,请参看PostgreSQL 黑科技-递规二分法切分汉字
--test_cond表设置了级联删除,所以不需要delete触发
drop trigger if exists tri_test_change on test;
drop function if exists tri_test_trigger();
create or replace function tri_test_trigger()
returns trigger as $$
begin
if (TG_OP = 'INSERT') then
insert into test_cond(objectid,keys) values(NEW.objectid,dichotomy_split_tsv(NEW.name));
return NEW;
elsif (TG_OP = 'UPDATE') then
update test_cond set keys=dichotomy_split_tsv(NEW.name) where objectid=NEW.objectid;
return NEW;
end if;
RETURN NULL;
end;
$$ language 'plpgsql' SECURITY DEFINER;
--test_cond依赖test表,因此只能在test成功后再触好,只能设置为after
create trigger tri_test_change after INSERT or UPDATE on test
for each ROW EXECUTE PROCEDURE tri_test_trigger();
drop function if exists gen_random_zh(int,int);
create or replace function gen_random_zh(int,int)
returns text
as $$
select string_agg(chr((random()*(20901-19968)+19968 )::integer) , '') from generate_series(1,(random()*($2-$1)+$1)::integer);
$$ language sql;
每调一次ins_test插入100万数据,可以同时调用ins_test插入更多数据,以便验证模糊查询性能
drop function if exists ins_test();
create or replace function ins_test()
returns void
as $$
declare
v_index integer;
begin
v_index := 0;
for i in 1..1000 loop
v_index := v_index + 1;
insert into test(name)
select gen_random_zh(8,32) as name from generate_series(1,1000);
raise notice '%', v_index;
end loop;
end;
$$ language plpgsql;
--每调一次ins_test插入100万数据
select ins_test();
update test set name='哈哈,我来验证了' where objectid=10000;
select count(*) from test;
select count(*) from test_cond;
drop function if exists str_to_tsquery(text,boolean);
create or replace function str_to_tsquery(text,boolean default true)
returns tsquery
as $$
declare
v_count integer;
v_txt text;
v_txts text[];
v_result tsquery;
begin
v_txt := clear_punctuation($1);
--数组大小为字符数量-1
v_count := length(v_txt)-1;
if( v_count < 1 ) then
raise exception '输入参数("%")去除标点符号后至少需要2个字符',$1;
end if;
for i in 1..v_count loop
v_txts := array_append(v_txts, substring(v_txt,i,2));
end loop;
--tsquery类型要求去除重复并排序
with cte1 as(
select f from unnest(v_txts) as f group by f
),cte2 as(
select f from cte1 order by f
)select string_agg(f, (case when $2 then '&' else '|' end ) )::tsquery into v_result from cte2;
return v_result;
end;
$$ language plpgsql strict immutable;
关键字字数越多,查询越准确,并且查询速度
--因优先级问题,此语句可能会不走rum索引
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where name @@ (str_to_tsquery('価仴'))
建议采用下面的方式,保证查询使用rum索引
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select str_to_tsquery('哈哈'));
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select str_to_tsquery('哈我'));
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select str_to_tsquery('我来'));
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select str_to_tsquery('来验'));
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select str_to_tsquery('验证'));
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select str_to_tsquery('证了'));
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select str_to_tsquery('哈哈,我来验证了'));
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select str_to_tsquery('侒亩'));
新版本函数,请参看PostgreSQL 黑科技-递规二分法切分汉字
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select dichotomy_split_tsq('哈哈'));
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select dichotomy_split_tsq('哈我'));
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select dichotomy_split_tsq('我来'));
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select dichotomy_split_tsq('来验'));
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select dichotomy_split_tsq('验证'));
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select dichotomy_split_tsq('证了'));
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select dichotomy_split_tsq('哈哈,我来验证了'));
explain (analyze,verbose,costs,buffers,timing)
select f.* from test as f left join test_cond as s
on f.objectid=s.objectid
where s.keys @@ (select dichotomy_split_tsq('侒亩'));
删除后再执行上面的sql查询均查不到数据
delete from test where objectid=10000;
``