Our team uses Greenplum as the compute database for real-time statistics, but once the data volume grew it began to slow the calculations down. So each day's data is shipped via DataX into a Hive table partitioned by day as a backup, while the rest of the data stays in Greenplum for real-time computation.
Greenplum's kernel is still PostgreSQL, so the DataX job can simply use the PostgreSQL connection.
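Before writing the job file, it is worth confirming that the DataX host can actually reach the Greenplum master over the PostgreSQL protocol. A minimal check with psql, using the same example host, port, database, and credentials as the job config further down:
PGPASSWORD=postgres psql -h localhost -p 5432 -U postgres -d test -c "select version();"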
Create the Hive table with dt as the partition column, e.g. dt='20230619':
create table test(
id bigint comment '主键',
name string comment '姓名',
address string comment '地址',
update_time timestamp comment '日期'
) partitioned by (dt string)
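Each dt value becomes its own directory under the table's HDFS location. After a load you can sanity-check a partition like this (assuming the table lives in the test database under the managed warehouse path used by the scripts below):
su - hive -c "hive -e 'show partitions test.test'"
su - hdfs -c "hadoop fs -ls /warehouse/tablespace/managed/hive/test.db/test"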
On the Greenplum side it is just a regular table:
CREATE TABLE "public"."test" (
"id" int8 NOT NULL,
"name" varchar(255) COLLATE "pg_catalog"."default",
"address" varchar(255) COLLATE "pg_catalog"."default",
"update_time" timestamp(6),
PRIMARY KEY ("id")
)
;
ALTER TABLE "public"."test"
OWNER TO "postgres";
COMMENT ON COLUMN "public"."test"."id" IS '主键';
COMMENT ON COLUMN "public"."test"."name" IS '姓名';
COMMENT ON COLUMN "public"."test"."address" IS '地址';
COMMENT ON COLUMN "public"."test"."update_time" IS '日期';
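The DataX reader below filters the source by day on update_time; the same predicate can be run ahead of time in psql to see how many rows a given day will move (20230619 is just the example date used throughout this post):
PGPASSWORD=postgres psql -h localhost -p 5432 -U postgres -d test -c "select count(*) from public.test where date(update_time) = to_date('20230619','yyyymmdd');"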
{
"job":{
"content":[
{
"reader":{
"name":"postgresqlreader",
"parameter":{
"column":[
"id",
"name",
"address",
"update_time"
],
"where":
"date(update_time) = to_date(${date}::text,'yyyyMMdd')",
"connection":[
{
"jdbcUrl":[
"jdbc:postgresql://localhost:5432/test"
],
"table":["public.test"]
}
],
"password":"postgres",
"username":"postgres"
}
},
"writer":{
"name":"hdfswriter",
"parameter":{
"column":[
{
"name": "id",
"type": "INT"
},
{
"name": "name",
"type": "STRING"
},
{
"name": "address",
"type": "STRING"
},
{
"name": "update_time",
"type": "TIMESTAMP"
}
],
"defaultFS": "hdfs://localhost:8020/",
"fieldDelimiter": "\1",
"fileName": "test_20230619",
"fileType": "orc",
"haveKerberos": false,
"path": "/warehouse/tablespace/managed/hive/test/test/dt=${date}",
"writeMode": "truncate"
}
}
}
],
"setting":{
"speed":{
"channel":"5"
}
}
}
}
Because DataX does not support dynamic partition columns, I pass the partition value into the job as a parameter.
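For a one-off test, the job can be run directly with the partition date passed through DataX's -p option (assuming the config above is saved as test_gp2hive.json, the naming convention the scripts below rely on):
python /opt/datax/bin/datax.py /opt/datax/job/test_gp2hive.json -p "-Ddate=20230619"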
#!/bin/bash
# 1. Get the date parameter
if [ $# -eq 0 ]
then
echo "请添加参数1:tablename(必填) 2:date格式:20220101(非必填)"
exit 1
elif [ $# -eq 1 ]
then
date=$(date -d "-1 day" +%Y%m%d)
else
date=$2
fi
tablename=$1
# 2. Create the HDFS directory for the Hive partition
su - hdfs -c "hadoop fs -mkdir -p /warehouse/tablespace/managed/hive/test.db/${tablename}/dt=${date}"
# 3. Run the DataX job
su - hdfs -c "python /opt/datax/bin/datax.py /opt/datax/job/${tablename}_gp2hive.json -p '-Ddate=${date}' > /opt/datax/log/${tablename}.log"
# 4. Repair the Hive table so the new partition is registered
su - hive -c "hive -e 'msck repair table test.${tablename}'"
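Example invocations of the single-table script (saved as /opt/datax/job/single_job.sh, which is how the batch script below calls it; it falls back to yesterday's date when only the table name is given):
sh /opt/datax/job/single_job.sh test
sh /opt/datax/job/single_job.sh test 20230619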
Batch launch script:
#!/bin/bash
# Only pick up the DataX job files, not the shell scripts sitting in the same directory
dir=$(ls /opt/datax/job/*_gp2hive.json)
for file in ${dir}
do
filename=$(basename ${file})
tablename=$(echo ${filename} | sed 's/_gp2hive.json//')
sh /opt/datax/job/single_job.sh ${tablename}
done
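Run the batch script once by hand and check the DataX summary in the per-table log before handing it over to cron (test is the example table, log path as in single_job.sh):
sh /opt/datax/job/batch_job.sh
tail -n 20 /opt/datax/log/test.log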
# Schedule it with cron
> crontab -e
0 3 * * * /opt/datax/job/batch_job.sh
# Check that the scheduled job exists
> crontab -l
Note: batch_job.sh must be made executable, otherwise it will not run:
chmod +x batch_job.sh
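Finally, a quick spot check that a day's backup is complete: compare the Hive partition's row count against the Greenplum count from the psql query shown earlier (same example table and date):
su - hive -c "hive -e \"select count(*) from test.test where dt='20230619';\""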