pg_bulkload批量载入工具(初探)

安装:

# tar -zxvf pg_bulkload-3.1.6.tar.gz
# cd pg_bulkload-3.1.6
# . /home/postgres/.bash_profile
# make
# make install


引入扩展:

$ psql 
psql (9.3.4)
Type "help" for help.
postgres=# create database test;
CREATE DATABASE
postgres=# \c test
You are now connected to database "test" as user "postgres".
test=# create extension pg_bulkload ;
CREATE EXTENSION
test=# \dx
                                     List of installed extensions
    Name     | Version |   Schema   |                           Description                           
-------------+---------+------------+-----------------------------------------------------------------
 pg_bulkload | 1.0     | public     | pg_bulkload is a high speed data loading utility for PostgreSQL
 plpgsql     | 1.0     | pg_catalog | PL/pgSQL procedural language
(2 rows)
test=# \df
                                                                                                                                 List of functions
 Schema |    Name     | Result data type |                                                                                                       Argum
ent data types                                                                                                       |  Type  
--------+-------------+------------------+------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------+--------
 public | pg_bulkload | record           | options text[], OUT skip bigint, OUT count bigint, OUT parse_errors bigint, OUT duplicate_new bigint, OUT d
uplicate_old bigint, OUT system_time double precision, OUT user_time double precision, OUT duration double precision | normal
(1 row)


创建测试数据:

test=# create table t1(id int,name text);
CREATE TABLE
test=# insert into t1 select generate_series(1,50000000),'HighGo';
INSERT 0 50000000
test=# \d+
                    List of relations
 Schema | Name | Type  |  Owner   |  Size   | Description 
--------+------+-------+----------+---------+-------------
 public | t1   | table | postgres | 2112 MB | 
(1 row)
test=# copy t1 to '/opt/pg93/data/t1.csv' with(format 'csv');
COPY 50000000
$ du -sh /opt/pg93/data/t1.csv 
754M /opt/pg93/data/t1.csv


copy方式载入数据:

(with logged):

test=# truncate t1;
TRUNCATE TABLE
test=# \timing 
Timing is on.
test=# 
test=# copy t1 from '/opt/pg93/data/t1.csv' with(format 'csv');
COPY 50000000
Time: 139038.099 ms

(without logged):

test=# truncate t1;
TRUNCATE TABLE

修改t1表为unlogged(PostgreSQL 9.3 尚不支持 ALTER TABLE ... SET UNLOGGED,这里直接修改系统表 pg_class,仅限测试环境使用,切勿用于生产库)
test=# update pg_class set relpersistence='u' where relname='t1';
UPDATE 1

test=# copy t1 from '/opt/pg93/data/t1.csv' with(format 'csv');
COPY 50000000
Time: 110796.480 ms


pg_bulkload载入数据:

(without logged):

$ pg_bulkload -i /opt/pg93/data/t1.csv -O t1 -l t1_bulkload.log -o "TYPE=CSV" -o "WRITER=PARALLEL" -d test
NOTICE: BULK LOAD START
NOTICE: BULK LOAD END
 0 Rows skipped.
 50000000 Rows successfully loaded.
 0 Rows not loaded due to parse errors.
 0 Rows not loaded due to duplicate errors.
 0 Rows replaced with new rows.
Run began on 2014-06-16 05:53:45.025377+08
Run ended on 2014-06-16 05:55:00.625057+08
CPU 2.80s/33.00u sec elapsed 75.60 sec
$ pg_bulkload -i /opt/pg93/data/t1.csv -O t1 -l t1_bulkload.log -o "TYPE=CSV" -o "WRITER=DIRECT" -d test
NOTICE: BULK LOAD START
NOTICE: BULK LOAD END
 0 Rows skipped.
 50000000 Rows successfully loaded.
 0 Rows not loaded due to parse errors.
 0 Rows not loaded due to duplicate errors.
 0 Rows replaced with new rows.
Run began on 2014-06-16 06:05:40.267198+08
Run ended on 2014-06-16 06:07:05.08921+08
CPU 6.88s/34.25u sec elapsed 84.82 sec


(with logged):

test=# truncate t1;
TRUNCATE TABLE
修改t1表为logged(通过直接修改系统表 pg_class 实现,仅限测试环境使用)
test=# update pg_class set relpersistence='p' where relname='t1';
UPDATE 1
$ pg_bulkload -i /opt/pg93/data/t1.csv -O t1 -l t1_bulkload.log -o "TYPE=CSV" -o "WRITER=PARALLEL" -d test
NOTICE: BULK LOAD START
NOTICE: BULK LOAD END
 0 Rows skipped.
 50000000 Rows successfully loaded.
 0 Rows not loaded due to parse errors.
 0 Rows not loaded due to duplicate errors.
 0 Rows replaced with new rows.
Run began on 2014-06-16 05:57:05.620751+08
Run ended on 2014-06-16 05:58:20.458029+08
CPU 2.80s/33.02u sec elapsed 74.84 sec

  
$ pg_bulkload -i /opt/pg93/data/t1.csv -O t1 -l t1_bulkload.log -o "TYPE=CSV" -o "WRITER=DIRECT" -d test
NOTICE: BULK LOAD START
NOTICE: BULK LOAD END
 0 Rows skipped.
 50000000 Rows successfully loaded.
 0 Rows not loaded due to parse errors.
 0 Rows not loaded due to duplicate errors.
 0 Rows replaced with new rows.
Run began on 2014-06-16 06:02:10.33344+08
Run ended on 2014-06-16 06:03:36.986382+08
CPU 7.15s/34.93u sec elapsed 86.65 sec


pg_bulkload执行过程中查看进程和连接:
(WRITER=PARALLEL)

$ ps -ef | grep post
postgres 24044 22690  0 05:53 pts/2    00:00:00 pg_bulkload -i /opt/pg93/data/t1.csv -O t1 -l t1_bulkload.log -o TYPE=CSV -o WRITER=PARALLEL -d test
postgres 24045  2236 48 05:53 ?        00:00:04 postgres: postgres test [local] SELECT
postgres 24046  2236 20 05:53 ?        00:00:02 postgres: postgres test [local] SELECT
postgres=# select datname,application_name,query from pg_stat_activity;
 datname  | application_name |                                                                                                               query    

                                                                                                           
----------+------------------+------------------------------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------
 postgres | psql             | select datname,application_name,query from pg_stat_activity;
 test     | pg_bulkload      | SELECT * FROM pg_bulkload($1)
 test     |                  | SELECT * FROM pg_bulkload(ARRAY['TYPE=TUPLE','INPUT=' || $1,'WRITER=DIRECT','OUTPUT=' || $2,'ON_DUPLICATE_KEEP=' || $3,
'DUPLICATE_ERRORS=' || $4,'DUPLICATE_BADFILE=' || $5,'LOGFILE=' || $6,'VERBOSE=' || $7,'TRUNCATE=' || $8])
(3 rows)


(WRITER=DIRECT)

$ ps -ef | grep post
postgres 24124 22690  0 06:02 pts/2    00:00:00 pg_bulkload -i /opt/pg93/data/t1.csv -O t1 -l t1_bulkload.log -o TYPE=CSV -o WRITER=DIRECT -d test
postgres 24125  2236 49 06:02 ?        00:00:13 postgres: postgres test [local] SELECT
test=# select datname,application_name,query from pg_stat_activity;
 datname | application_name |                            query                             
---------+------------------+--------------------------------------------------------------
 test    | pg_bulkload      | SELECT * FROM pg_bulkload($1)
 test    | psql             | select datname,application_name,query from pg_stat_activity;
(2 rows)

你可能感兴趣的:(PostgreSQL,bulkload,批量载入,pg_bulkload)