1  ORACLE_HOME      /home/oracle/client_1
2  ORACLE_DSN       dbi:Oracle:host=192.168.11.1;sid=orcl
3  ORACLE_USER      manager
4  ORACLE_PWD       tiger
5  SCHEMA           test
6  TYPE             TABLE VIEW PACKAGE COPY
7  PG_NUMERIC_TYPE  0
8  PG_INTEGER_TYPE  1
9  DEFAULT_NUMERIC  float
10 SKIP             fkeys pkeys ukeys indexes checks
11 NLS_LANG         AMERICAN_AMERICA.UTF8
12 PG_DSN           dbi:Pg:dbname=easyetl;host=127.0.0.1;port=5432
13 PG_USER          easyetl
14 PG_PWD           password
15 OUTPUT           output.sql
1-4   Connection settings for the source Oracle database.
5     The Oracle schema to export.
6     The object types to convert; COPY also exports the table data as COPY statements.
7-9   Control how Oracle NUMBER(p,s) columns are mapped to PG types (see the example after this list):
      7 says whether PG's built-in numeric types are used: 0 means no, 1 means yes.
      8 matters when 7 is set to 0: if 8 is 1, NUMBER(p) is converted to an integer type; if 8 is 0, NUMBER(p) also becomes numeric(p).
      9 matters when 8 is set to 1: plain NUMBER is then converted to float; if 8 is 0, 9 has no effect.
      Put simply, if 7 and 8 are both 0: NUMBER(p) --> numeric(p), NUMBER(p,s) --> numeric(p,s), NUMBER --> numeric.
10    Which constraints and indexes are skipped instead of being created.
11    Language setting (NLS_LANG).
12-14 Connection settings for the target PG (GP works too). It is fine to leave these three lines unset; the Oracle-to-PG conversion script is generated either way.
15    The output file.
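With the configuration saved to a file (here I assume it is called ora2pg.conf; adjust the path to wherever you keep it), the conversion is a matter of pointing ora2pg at it and then replaying the generated output.sql against the target database named in lines 12-14, roughly:

# generate output.sql from the test schema using the settings above
ora2pg -c ora2pg.conf

# with PG_NUMERIC_TYPE=0 and PG_INTEGER_TYPE=1, NUMBER(p) columns come out as
# integer types, plain NUMBER as float, and NUMBER(p,s) stays numeric(p,s)
psql -h 127.0.0.1 -p 5432 -U easyetl -d easyetl -f output.sql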
What came up during the migration:
(1) Tables can be migrated over completely.
(2) Views: if no aliases were given inside the view, aliases have to be added by hand.
(3) Packages need manual rework. Note: with version 13, the perform/decode handling has to be disabled when generating packages, because these two pieces are not done well; the module involved is PLSQL.pm.
Of course package conversion is more than just that; the main points are:
a  Aliases need to be written out explicitly.
b  Implicit type conversions need to be rewritten as explicit casts.
c  Function differences (GP officially provides a set of Oracle function implementations, which is basically enough).
d  Non-standard Oracle syntax, e.g. "a left join b" written as "a, b where a.xx = b.xx(+)" (see the sketch after this list).
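As an illustration of point d, a minimal sketch of the rewrite (tables a and b are hypothetical; the connection values are taken from the gpload control file further below):

# Oracle-only syntax found in the package body:
#   select a.id, b.val from a, b where a.xx = b.xx(+)
# rewritten for GP/PostgreSQL as an explicit outer join:
psql -h gp -p 5432 -U manager -d dw -c "select a.id, b.val from a left join b on a.xx = b.xx"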
2. How the Oracle-to-GP data migration is implemented
#!/bin/bash
# dataload.sh: unload an Oracle table through a named pipe and load it into GP with gpload
if [ $# -lt 3 ];then
    echo "Usage: `basename $0` pipe tablename control"
    exit 1
fi
pipename=$1
tablename=$2
control=$3
condition=$4

# named pipe connecting sqluldr2 (writer) and gpload (reader)
mknod $pipename p

# unload in the background; the presql sleep delays the query slightly before data starts flowing
/root/software/sqluldr2 user=manager/tigerd@orcl query="select * from $tablename where $condition" \
    field=0x7c file=$pipename charset=utf8 text=CSV safe=yes \
    presql="begin dbms_lock.sleep(2); end;" &

gpload -f $control -l gpload.log
rm -rf $pipename
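A manual run looks like this (the pipe path, table name and control file are placeholders; in practice the Python wrapper below generates them):

/root/script/dataload.sh /tmp/pipe1234 my_table /root/template/my_table.ctl "1=1"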
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import yaml
import subprocess
import sys
import os

# Script starts from here
paramnum=len(sys.argv)
datadt=20140820
condition="1=1"
tplpath="/root/template/"
pipepath="/tmp/pipe"
batname="/root/script/dataload.sh"

if (paramnum == 1):
    print 'Usage:'+ sys.argv[0]+' tablename '
    sys.exit()
elif(paramnum == 2):
    tablename=sys.argv[1]
elif(paramnum == 3):
    tablename=sys.argv[1]
    datadt=sys.argv[2]
elif(paramnum == 4):
    tablename=sys.argv[1]
    datadt=sys.argv[2]
    condition=sys.argv[3]
else:
    print 'Usage:'+ sys.argv[0]+' tablename datadt condition. (datadt condition is optional)!'
    sys.exit()

# one pipe per run, named after the current pid
pid=os.getpid()
pipename=pipepath+str(pid)

# load the gpload control template and patch in the pipe, target table and error table
f = open(tplpath+"gp_template_load.ctl")
dataMap = yaml.load(f)
f.close()
dataMap['GPLOAD']['INPUT'][0]['SOURCE']['FILE'][0]=pipename
dataMap['GPLOAD']['OUTPUT'][0]['TABLE']=tablename
dataMap['GPLOAD']['INPUT'][6]['ERROR_TABLE']=tablename+'_err'

filename=tplpath+tablename+'.ctl'
f = open(filename,'w')
yaml.dump(dataMap,f)
f.close()

# hand over to dataload.sh: pipe name, table name, generated control file, filter condition
handle=subprocess.Popen([batname,pipename,tablename,filename,condition])
handle.communicate()
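Invocation, assuming the wrapper above is saved as load_table.py (name hypothetical; the table, date and filter values are made up):

# table only: datadt defaults to 20140820 and condition to "1=1"
python load_table.py my_table

# table + etl date + filter condition pushed into the sqluldr2 query
python load_table.py my_table 20141001 "status = 1"

Note that datadt is parsed here but not passed on to dataload.sh in this version.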
VERSION: 1.0.0.1
DATABASE: dw
USER: manager
HOST: gp
PORT: 5432
GPLOAD:
  INPUT:
    - SOURCE:
        LOCAL_HOSTNAME:
          - gp
        FILE:
          - /tmp/mypipe
        PORT_RANGE: [8001,9000]
    - FORMAT: csv
    - DELIMITER: ','
    - QUOTE: '"'
    - HEADER: true
    - ERROR_LIMIT: 10000
    - ERROR_TABLE: tablename_err
  OUTPUT:
    - TABLE: tablename
    - MODE: INSERT
  PRELOAD:
    - TRUNCATE: true
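To see the moving parts without either script, the control file can be exercised by hand: start gpload reading from the pipe, then push a file into it. This sketch assumes a sourced Greenplum loader environment, a copy of the template (my_table.ctl, hypothetical) in which mypipe/tablename have been replaced with real values, and a sample.csv matching the DELIMITER/QUOTE settings above:

mknod /tmp/mypipe p                        # must match the FILE entry in the control file
gpload -f my_table.ctl -l gpload.log &     # reader end of the pipe
cat sample.csv > /tmp/mypipe               # writer end; gpload sees EOF when the pipe closes
wait
rm -f /tmp/mypipe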
Follow-up work:
The scripts above can serve as a synchronization tool, but using them as-is in production is a bit unsettling.
There are three reasons:
(1) In dataload.sh, sqluldr2 runs in the background. When sqluldr2 fails, gpload may sit there waiting; when gpload fails, sqluldr2 keeps unloading data anyway. And since dataload.sh forks two processes, when one of them goes wrong you still have to find it and kill it by hand.
(2) Day-to-day logging and failure handling are missing.
(3) The Oracle and GP table structures must match exactly.
Because of this, I wrote a version that handles the forked processes in one place, adds retrieval of the GP column list, and adds logging.
Format of the oraconf file:
#CONFNAME:USER^PASS^TNSNAME
Format of the gpconf file:
#host:port:database:user:passwd
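For example, an oraconf entry might look like (credentials made up):

src1:manager^tiger^orcl

and a matching gpconf entry (host/port/database/user as in the control file above, password made up):

gp:5432:dw:manager:password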
For the control file, see the one above and the official documentation.
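The GP column list mentioned above is pulled with psql; stripped of the eval wrapping used in the script below, the idea is roughly this (host/port/database/user as in the gpconf example, table name hypothetical):

# "\d table" in unaligned tuples-only mode prints one "column|type|..." line per column;
# keep the first field and join with commas to build the unload select list
collist=$(psql -h gp -p 5432 -U manager -d dw -tAc '\d my_table' \
          | awk -F'|' '{cmd=cmd$1","} END{print cmd}' | sed 's/,$//')
echo "select $collist from MY_TABLE"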
#!/bin/bash
. greenplum_loaders_path.sh
. setenv

if [ $# -lt 4 ];then
    echo "Usage : `basename $0` confname etl_date mode src_tbname tgt_tbname "
    echo "    confname             : configuration at ${PWD}/conf/oraconf "
    echo "    etl_date             : YYYYMMDD "
    echo "    mode                 : 1 truncate; 2 append"
    echo "    src_tbname           : oracle datasource tablename "
    echo "    tgt_tbname(optional) : greenplum datasource tablename"
    exit 1
fi

#trap the exception quit
trap 'log_info "TERM/INTERRUPT(subprocess) close";close_subproc' INT TERM

declare -a sublist

function log_info()
{
    DATETIME=`date +"%Y%m%d %H:%M:%S"`
    echo -e "S $DATETIME P[$$]: $*" | tee -a "$LOGPATH"/"$ETLDATE"/"$GPTABLE".log
}

# remember the pid of every background child so it can be killed on failure
function collect_subproc()
{
    local index
    if [ ${#sublist[@]} -eq 0 ];then
        index=0
    else
        index=${#sublist[@]}
    fi
    sublist[$index]=$1
}

function close_subproc()
{
    for subid in ${sublist[@]}
    do
        log_info "kill processid: $subid"
        kill $subid
    done
}

# fill the pipe name, error table, target table and etl date into the copied control file
function parse_yaml()
{
    local file=$1
    local tablename=$2
    local pipename=$3
    local etldate=$4
    sed -i -e "s/mypipe/"$pipename"/" \
           -e "s/tablename_err/public."$tablename"_err/" \
           -e "s/\<tablename\>/"$tablename"/" \
           -e "s/etl_date/"$etldate"/" $file
}

if [ $(dirname $0) == '.' ];then
    PRIPATH=${PWD}
else
    PRIPATH=$(dirname $0)
fi

TPLPATH="$PRIPATH"/template
LOGPATH="$PRIPATH"/log
CONFNAME=$1
ETLDATE=$2
MODE=$3
ORATABLE=$4
GPTABLE=$5
[ -z "$GPTABLE" ] && GPTABLE="$ORATABLE"
[ ! -d "$LOGPATH"/"$ETLDATE" ] && mkdir -p "$LOGPATH"/"$ETLDATE"
PIPENAME="P"$$"$GPTABLE"

# read the Oracle credentials for CONFNAME from conf/oraconf (CONFNAME:USER^PASS^TNSNAME)
eval `grep "^$CONFNAME" "$PRIPATH"/conf/oraconf |awk -F':' '{print $2}'|awk -F'^' '{print "ORACLE_USER="$1";ORACLE_PASS="$2";ORACLE_SID="$3}'`

# build the GP column list: run "\d table" through psql and join the column names with commas
eval $(eval `grep ^[^#] "$PRIPATH"/conf/gpconf |awk -F':' -v table=$GPTABLE '{printf("psql -h %s -p %d -U %s %s -tAc \047\\\d %s \047",$1,$2,$4,$3,table)}'`|awk -F"|" '{cmd=cmd$1","}END{print "collist="cmd}')
collist=`echo $collist|sed "s/,$//g"`

echo >> "$LOGPATH"/"$ETLDATE"/"$GPTABLE".log

#create and modify template for gpload use
log_info "create template "$LOGPATH"/"$ETLDATE"/"$GPTABLE".ctl."
cp "$TPLPATH"/gp_template_load_"$MODE".ctl "$LOGPATH"/"$ETLDATE"/"$GPTABLE".ctl
if [ $? -ne 0 ]; then
    log_info "create template "$LOGPATH"/"$ETLDATE"/"$GPTABLE".ctl failed."
    exit 2
fi
parse_yaml "$LOGPATH"/"$ETLDATE"/"$GPTABLE".ctl $GPTABLE $PIPENAME $ETLDATE
if [ $? -ne 0 ]; then
    log_info "modify template "$LOGPATH"/"$ETLDATE"/"$GPTABLE".ctl failed."
    exit 2
fi

#create pipename
log_info "create pipe /tmp/"$PIPENAME"."
mknod /tmp/"$PIPENAME" p
if [ $? -ne 0 ];then
    log_info "create pipe failed!"
    exit 3
fi

# start the loader and the unloader on the two ends of the pipe, tracking both pids
gpload -f "$LOGPATH"/"$ETLDATE"/"$GPTABLE".ctl -l "$LOGPATH"/"$ETLDATE"/"$GPTABLE".log &
collect_subproc $!

log_info "unload sql:select $collist from $ORATABLE"
sqluldr2 user="$ORACLE_USER"/"$ORACLE_PASS"@"$ORACLE_SID" query="select $collist from $ORATABLE" \
    head=Yes field=0x7c file=/tmp/"$PIPENAME" charset=gb18030 text=CSV safe=yes \
    presql="begin dbms_lock.sleep(5); end;" log=+"$LOGPATH"/"$ETLDATE"/"$GPTABLE".log &
collect_subproc $!

wait
if [ $? -ne 0 ];then
    log_info "$GPTABLE load failed!"
else
    log_info "$GPTABLE load succ!"
fi

log_info "rm -rf /tmp/"$PIPENAME""
rm -rf /tmp/"$PIPENAME"
if [ $? -ne 0 ];then
    log_info "rm /tmp/"$PIPENAME" failed."
    exit 4
fi
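A typical run, assuming the script is saved as gpdataload.sh (name hypothetical) alongside its conf/ and template/ directories, using the made-up src1 entry from the oraconf example above:

# mode 1: truncate-and-load GP table ORDERS from Oracle table ORA_ORDERS for 2014-08-20
./gpdataload.sh src1 20140820 1 ORA_ORDERS ORDERS

# mode 2: append, same table name on both sides (tgt_tbname omitted)
./gpdataload.sh src1 20140820 2 ORDERS

On INT/TERM the trap calls close_subproc, so the gpload and sqluldr2 children recorded by collect_subproc are killed together instead of being hunted down by hand, which addresses point (1) above.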