根据上次写的博客:
http://blog.csdn.net/jiangshouzhuang/article/details/51792580
下面整理了一个更新、更详细的查看数据倾斜的函数,如下:
-- my_create_func_for_extended_skew()
--
-- Reports per-table data skew across Greenplum segments for the current
-- database, measured from the on-disk size of each table's data files.
--
-- How it works:
--   1. Looks up the OID of the current database.
--   2. (Re)creates the external web table my_db_files_external, which runs
--      "ls -l $GP_SEG_DATADIR/base/<db oid>" on all segments and returns one
--      row per listed file (segment id, file name, full path, size in bytes).
--   3. Joins those files to pg_class / pg_namespace, sums the bytes per
--      (schema, table, segment) and then aggregates per table.
--   4. Drops the external web table again.
--
-- Result columns (one row per relkind = 'r' table outside pg_catalog,
-- information_schema, gp_toolkit and pg_temp* schemas):
--   schema_name, table_name  - the table
--   total_size_GB            - sum of the file sizes over all segments
--   seg_min_size_GB          - smallest per-segment size
--   seg_max_size_GB          - largest per-segment size
--   seg_avg_size_GB          - average per-segment size
--   seg_gap_min_max_percent  - 100 * (max - min) / max
--   seg_gap_min_max_GB       - max - min
--   nb_empty_seg             - number of segments whose files total 0 bytes
--
-- Errors: any failure is re-raised as '(function name:location:message)',
-- where location is the last v_location checkpoint that was reached.
--
-- NOTE(review): the helper table has a fixed, unqualified name, so two
-- sessions running this function at the same time in the same schema will
-- presumably interfere with each other - confirm before scheduling it.
CREATE OR REPLACE FUNCTION my_create_func_for_extended_skew(
    out schema_name varchar,
    out table_name varchar,
    out total_size_GB numeric(15,2),
    out seg_min_size_GB numeric(15,2),
    out seg_max_size_GB numeric(15,2),
    out seg_avg_size_GB numeric(15,2),
    out seg_gap_min_max_percent numeric(6,2),
    out seg_gap_min_max_GB numeric(15,2),
    out nb_empty_seg int)
RETURNS SETOF record AS
$$
DECLARE
    v_function_name text := 'my_create_func_for_extended_skew';
    v_location      int;
    v_sql           text;
    v_db_oid        text;
    v_res           record;
BEGIN
    -- v_location records the current step so that the exception handler
    -- can report where a failure happened.
    v_location := 1000;

    -- OID of the current database; it is the directory name under base/.
    SELECT oid INTO v_db_oid
    FROM pg_database
    WHERE datname = current_database();

    v_location := 2200;
    v_sql := 'DROP EXTERNAL TABLE IF EXISTS my_db_files_external';
    v_location := 2300;
    EXECUTE v_sql;

    -- Build the external web table that lists the database directory on all
    -- segments. awk emits: segment id, file name ($9), full path, size ($5),
    -- separated by tabs.
    -- The full path includes "/base/" so that it matches the directory that
    -- ls actually lists.
    -- NOTE(review): "grep gpadmin" keeps only ls lines containing the string
    -- gpadmin (which also drops the "total" line); presumably gpadmin is the
    -- OS owner of the data files - confirm for your installation.
    v_location := 3000;
    v_sql := 'CREATE EXTERNAL WEB TABLE my_db_files_external ' ||
             '(segment_id int, relfilenode text, filename text, ' ||
             'size numeric) ' ||
             'execute E''ls -l $GP_SEG_DATADIR/base/' || v_db_oid ||
             ' | ' ||
             'grep gpadmin | ' ||
             E'awk {''''print ENVIRON["GP_SEGMENT_ID"] "\\t" $9 "\\t" ' ||
             'ENVIRON["GP_SEG_DATADIR"] "/base/' || v_db_oid ||
             E'/" $9 "\\t" $5''''}'' on all ' || 'format ''text''';
    /*
    Illustrative content of the external table:
      select * from my_db_files_external;
       segment_id | relfilenode | filename                               | size
      ------------+-------------+----------------------------------------+-------
                1 | 10774       | /data1/primary/gpseg1/base/29786/10774 | 65536
                1 | 10776       | /data1/primary/gpseg1/base/29786/10776 |     0
                1 | 10778       | /data1/primary/gpseg1/base/29786/10778 | 32768
      ...
    */
    v_location := 3100;
    EXECUTE v_sql;

    -- Skew query. It is run through EXECUTE rather than as static SQL because
    -- my_db_files_external is dropped and recreated on every call: a cached
    -- plan for a static query would still reference the previous table on
    -- servers that do not invalidate cached plans automatically.
    --
    -- Inner query "sub": bytes per (schema, table, segment). split_part()
    -- strips the ".N" suffix of additional segment files so that they are
    -- attributed to the same relfilenode. Illustrative rows:
    --    vschema_name | vtable_name  | segment_id | size
    --   --------------+--------------+------------+------
    --    os           | ao_queue     |         36 | 1120
    --    os           | ao_variables |         30 |    0
    -- To inspect a single table, run the inner query on its own with an
    -- extra predicate such as: and c.relname = 'my_bigtable'
    --
    -- Outer query: min / max / average segment size per table, the gap
    -- between the smallest and the largest segment (in percent and in GB),
    -- and the number of segments holding 0 bytes. greatest(max, 1) avoids a
    -- division by zero for completely empty tables.
    --
    -- Optional filter (disabled): to extract only tables bigger than 1 GB
    -- with a skew greater than 20%, append to the outer query:
    --   having sum(sub.size)/(1024^3) > 1
    --      and (100*(max(sub.size) - min(sub.size))/greatest(max(sub.size),1))::numeric(6,2) > 20
    --   order by 1,2,3
    --   limit 100
    v_sql := $q$
        SELECT sub.vschema_name,
               sub.vtable_name,
               (sum(sub.size)/(1024^3))::numeric(15,2) AS vtotal_size_GB,
               (min(sub.size)/(1024^3))::numeric(15,2) AS vseg_min_size_GB,
               (max(sub.size)/(1024^3))::numeric(15,2) AS vseg_max_size_GB,
               (avg(sub.size)/(1024^3))::numeric(15,2) AS vseg_avg_size_GB,
               (100*(max(sub.size) - min(sub.size))/greatest(max(sub.size),1))::numeric(6,2) AS vseg_gap_min_max_percent,
               ((max(sub.size) - min(sub.size))/(1024^3))::numeric(15,2) AS vseg_gap_min_max_GB,
               count(sub.size) FILTER (WHERE sub.size = 0) AS vnb_empty_seg
        FROM (
            SELECT n.nspname AS vschema_name,
                   c.relname AS vtable_name,
                   db.segment_id,
                   sum(db.size) AS size
            FROM ONLY my_db_files_external db
            JOIN pg_class c ON split_part(db.relfilenode, '.'::text, 1) = c.relfilenode::text
            JOIN pg_namespace n ON c.relnamespace = n.oid
            WHERE c.relkind = 'r'::"char"
              AND n.nspname NOT IN ('pg_catalog','information_schema','gp_toolkit')
              AND NOT n.nspname LIKE 'pg_temp%'
            GROUP BY n.nspname, c.relname, db.segment_id
        ) sub
        GROUP BY sub.vschema_name, sub.vtable_name
    $q$;

    v_location := 4000;
    FOR v_res IN EXECUTE v_sql LOOP
        -- Copy the row into the OUT parameters and emit it.
        schema_name             := v_res.vschema_name;
        table_name              := v_res.vtable_name;
        total_size_GB           := v_res.vtotal_size_GB;
        seg_min_size_GB         := v_res.vseg_min_size_GB;
        seg_max_size_GB         := v_res.vseg_max_size_GB;
        seg_avg_size_GB         := v_res.vseg_avg_size_GB;
        seg_gap_min_max_percent := v_res.vseg_gap_min_max_percent;
        seg_gap_min_max_GB      := v_res.vseg_gap_min_max_GB;
        nb_empty_seg            := v_res.vnb_empty_seg;
        RETURN NEXT;
    END LOOP;

    -- Remove the helper table again.
    v_location := 4100;
    v_sql := 'DROP EXTERNAL TABLE IF EXISTS my_db_files_external';
    v_location := 4200;
    EXECUTE v_sql;

    RETURN;
EXCEPTION
    WHEN OTHERS THEN
        -- Re-raise with the function name and the last checkpoint reached.
        RAISE EXCEPTION '(%:%:%)', v_function_name, v_location, sqlerrm;
END;
$$
language plpgsql;