前提:分区字段统一为 ds;适用于常规增量抽取,通过创建时间和更新时间字段抽取。(本行为说明文字,不属于脚本内容;脚本从下方 #!/bin/bash 一行开始。)
#!/bin/bash
# 适用于常规增量抽取,通过创建和更新时间抽取
# 手工填写以下参数:
# 分区表达式,hive语法:pt_expr
# 主键字段,联合主键写id1,id2,id3:pk
# 非分区表增量数据量阈值,分区表不用管:inc_cnt
set -e
# Capture the moment the script starts; timediff consumes it at the end.
timer_start=$(date "+%Y-%m-%d %H:%M:%S")
# Print start/end timestamps and the elapsed time, formatted as
# Ns, NmNs or NhNmNs depending on magnitude.
# Arguments: $1 start date, $2 start time, $3 end date, $4 end time.
# Side effect: sets the global `duration` (seconds), as before.
timediff(){
local start_epoch end_epoch
start_epoch=$(date +%s -d "${1} ${2}")
end_epoch=$(date +%s -d "${3} ${4}")
duration=$((end_epoch - start_epoch))
echo "开始时间:${1} ${2}"
echo "结束时间:${3} ${4}"
if ((duration < 60)); then
echo "执行耗时:${duration}s"
elif ((duration < 3600)); then
echo "执行耗时:$((duration / 60))m$((duration % 60))s"
else
echo "执行耗时:$((duration / 3600))h$(((duration % 3600) / 60))m$((duration % 60))s"
fi
}
# Derive all names from this script's own filename:
#   .../ods_xxx.sh -> ODS table ods_xxx, temp table tmp_xxx, DataX job ods_xxx.json
# Quoted $(...) forms replace backticks/unquoted expansions so paths with
# spaces cannot break the derivation.
file_name=$(basename "$0")
table_name=${file_name%.sh}
tmp_name=${table_name/#ods_/tmp_}
json_name=${table_name}.json
# sql_name=${file_name%.sh}.sql
dir_path=$(dirname "$0")
# Partition expression, Hive syntax (edit per table).
pt_expr="substr(issuedate,1,10)"
# Primary-key field; for a composite key write id1,id2,id3.
pk='id'
# Incremental row-count threshold for NON-partitioned tables
# (partitioned tables compute their own threshold; this is ignored).
inc_cnt=0
# Work from the script's directory so the .json job file resolves;
# fail fast if it cannot be entered.
cd "$dir_path" || exit 1
# Step 1: create a temp table that will hold the incremental extract.
echo "判断是否分区,定增量阈值"
# Partitioned tables print a "# Partition Information" section in `desc`;
# counting those lines yields 0 (non-partitioned) or >0 (partitioned).
is_pt=`hive -e "desc bigdata_ods.${table_name}"|grep -e '^# Partition Information'|wc -l`
if [[ $is_pt == 0 ]]; then
# Non-partitioned table: threshold is the hand-configured inc_cnt.
thr_cnt=$inc_cnt
echo "增量阈值:$thr_cnt"
echo "创建临时表:drop table if exists bigdata_tmp.${tmp_name};create table bigdata_tmp.${tmp_name} like bigdata_ods.${table_name};alter table bigdata_tmp.${tmp_name} set SERDEPROPERTIES('field.delim'='\001')"
hive -e "drop table if exists bigdata_tmp.${tmp_name};create table bigdata_tmp.${tmp_name} like bigdata_ods.${table_name};alter table bigdata_tmp.${tmp_name} set SERDEPROPERTIES('field.delim'='\001')"
else
echo "获取表字段"
# Select zero rows with headers enabled so the column names land in
# fields_arr; the last element is the partition column.
fields_arr=(`hive -e "set hive.cli.print.header=true; set hive.resultset.use.unique.column.names=false;select * from bigdata_ods.${table_name} where 1=2 limit 1"`)
fields_num=${#fields_arr[@]}
# Drop the trailing partition column (ds) from the field list.
f_num=$(($fields_num-1))
f_arr=${fields_arr[@]:0:$f_num}
f_str=${f_arr[*]}
# Comma-join the remaining column names for the CTAS below.
fields=${f_str// /,}
# Partitioned-table threshold: 60% of the smallest daily row count over
# the window current_date-10 .. current_date-4.
thr_cnt=`hive -e "select round(min(cnt)*0.6) from (select count(1) as cnt from bigdata_ods.${table_name} where ds>=date_sub(current_date,10) and ds<=date_sub(current_date,4) group by ds) t "`
if [[ $thr_cnt == 'NULL' ]]; then
# No rows in the window -> no meaningful threshold; disable the check.
thr_cnt=0
fi
echo "表字段:$fields"
echo "增量阈值:$thr_cnt"
echo "创建临时表:drop table if exists bigdata_tmp.${tmp_name};create table bigdata_tmp.${tmp_name} as select $fields from bigdata_ods.${table_name} where 1=2"
hive -e "drop table if exists bigdata_tmp.${tmp_name};create table bigdata_tmp.${tmp_name} as select $fields from bigdata_ods.${table_name} where 1=2"
fi
# Rows successfully read by DataX (filled in by datax_etl).
suc_out=0
# Step 2: run the DataX job for the given date and parse row counts
# from its log.
# Arguments: $1 start date (yyyy-mm-dd), $2 DataX json file, $3 temp table.
# Sets globals: suc_out (records read), fai_inout (read/write failures).
# Exits 1 when DataX reports any read/write failures.
datax_etl(){
echo "开始进行增量抽取"
# BUGFIX: single quotes so $LINENO / $BASH_COMMAND expand when the trap
# fires; the original double quotes froze them at install time, so every
# error reported the trap's own line.
trap 'cat datax.log;echo "error line: $LINENO,error cmd: $BASH_COMMAND"' ERR
datax.py -p "-Dparm=$1 -Dtmp_name=$3" "$2" > datax.log
tail -8 datax.log
# "读出记录总数" = total records read; take the first number on the line
# ('[0-9]\+' + head avoids the multiline result '[0-9]*' could produce).
suc_out_str=$(grep '读出记录总数' datax.log)
suc_out=$(echo "$suc_out_str" | grep -o -e '[0-9]\+' | head -n 1)
# "读写失败总数" = total read/write failures; must be zero.
fai_inout_str=$(grep '读写失败总数' datax.log)
fai_inout=$(echo "$fai_inout_str" | grep -o -e '[0-9]\+' | head -n 1)
if [[ $fai_inout != 0 ]]; then
echo "$fai_inout"
exit 1
fi
}
# Temp-table row count (filled in by data_check).
tmp_cnt=0
# Step 3: sanity-check the extracted increment sitting in the temp table.
# Arguments: $1 primary key column(s), $2 partition expression,
#            $3 temp table name, $4 expected row count (from DataX).
# Exits 1 on any failed check; otherwise sets tmp_cnt.
data_check(){
cnt_arr=(`hive -e "select count(1),count(distinct $1),sum(case when coalesce(${2},'')='' then 1 else 0 end) from bigdata_tmp.${3}"`)
cnt1=${cnt_arr[0]}
# BUGFIX: count(distinct pk) is the SECOND result column (index 1);
# the original read index 0, so the duplicate-PK check always passed.
pk_cnt=${cnt_arr[1]}
no_pt=${cnt_arr[2]}
# 3.1 Row count must match what DataX reported reading.
if [[ $cnt1 != $4 ]]; then
echo "增量表数据量与抽取数据量不一致,建议检查错行。"
exit 1
else
echo "增量表数据量与抽取数据量一致。"
fi
# 3.2 Primary key must be unique.
if [[ $cnt1 != $pk_cnt ]]; then
echo "增量表主键有重复,建议检查列分隔符。"
exit 1
else
echo "增量表主键唯一。"
fi
# 3.3 Partition expression / create time must have no NULL/empty values.
if [[ $no_pt != 0 ]]; then
echo "增量表分区字段或者创建时间为空,建议检查列分隔符。"
exit 1
else
echo "增量表分区字段或者创建时间无空值。"
fi
tmp_cnt=$cnt1
}
if [[ $# == 1 ]]; then
# Manual run for an explicit date: extract and check once, no volume retry.
v_dt=$(date -d "$1" "+%Y-%m-%d")
datax_etl "$v_dt" "$json_name" "$tmp_name"
data_check "$pk" "$pt_expr" "$tmp_name" "$suc_out"
else
# Scheduled run: extract everything changed since yesterday.
v_dt=$(date -d "-1 day" "+%Y-%m-%d")
# Number of undersized extraction attempts so far.
fail_etl_cnt=0
# 3.4 If the extracted volume is at or below the threshold, redo step 2;
# give up after three undersized attempts.
while true; do
datax_etl "$v_dt" "$json_name" "$tmp_name"
data_check "$pk" "$pt_expr" "$tmp_name" "$suc_out"
if [[ $tmp_cnt -le $thr_cnt ]]; then
fail_etl_cnt=$(($fail_etl_cnt+1))
if [[ $fail_etl_cnt -ge 3 ]]; then
echo '多次抽取数据量偏小,程序终止。'
exit 1
fi
# BUGFIX: the original interpolated $cnt, which is never set (its
# assignment was commented out); report the actual count instead.
echo "抽取数据量${tmp_cnt}偏小,15分钟后重抽。"
sleep 15m
echo "开始重抽,清空临时表。"
hive -e "truncate table bigdata_tmp.${tmp_name}"
else
break
fi
done
fi
# Step 4 prep: turn the comma-separated PK list into a Hive join predicate
# of the form " o.k1=t.k1 and o.k2=t.k2 " matching old rows against the
# increment; p1 keeps the first key for the anti-join NULL test.
join_arr=(${pk//,/ })
join_str=''
p1=${join_arr[0]}
for key in "${join_arr[@]}"; do
join_str+=" o.${key}=t.${key} and"
done
# Strip the trailing "and" (surrounding spaces are kept on purpose).
join_str=${join_str/%and}
if [[ $is_pt == 0 ]]; then
# Non-partitioned merge: new rows from the temp table, plus old rows whose
# primary key is absent from the increment (left-join anti-join).
sql_str="insert overwrite table bigdata_ods.${table_name} select a.* from bigdata_tmp.${tmp_name} a union all select o.* from bigdata_ods.${table_name} o left join bigdata_tmp.${tmp_name} t on $join_str where t.$p1 is null"
else
# Collect the partition values touched by the increment, '|'-joined so
# they can be used as a regexp alternation below.
pts=`hive -e "select concat_ws('|',collect_set($pt_expr)) from bigdata_tmp.${tmp_name}"`
# Dynamic-partition merge: rewrite only the touched partitions, keeping
# old rows in those partitions whose key is absent from the increment.
sql_str="set hive.exec.dynamic.partition = true;set hive.exec.dynamic.partition.mode = nonstrict;set hive.exec.max.dynamic.partitions = 100000;set hive.exec.max.dynamic.partitions.pernode = 100000;insert overwrite table bigdata_ods.${table_name} partition(ds) select a.*,$pt_expr as ds from bigdata_tmp.${tmp_name} a union all select o.* from bigdata_ods.${table_name} o left join bigdata_tmp.${tmp_name} t on $join_str where o.ds regexp '$pts' and t.$p1 is null"
fi
echo "合并增量数据"
echo "hive -e ${sql_str};"
hive -e "${sql_str}";
echo "***SUCCESS***"
timer_end=`date "+%Y-%m-%d %H:%M:%S"`
# Intentionally unquoted: each "date time" string word-splits into the
# four positional arguments timediff expects.
timediff $timer_start $timer_end
对应的 DataX JSON 配置文件与表名保持一致(命名为 <表名>.json,与脚本同目录),内容示例如下:
{
"job": {
"setting": {
"speed": {
"channel": "16",
"byte": 1048576,
"record": 10000
}
},
"content": [
{
"reader": {
"name": "oraclereader",
"parameter": {
"username": "xxx",
"password": "xxx",
"connection": [
{
"querySql": [
"SELECT id, create_by, create_date, update_by, update_date, remarks, del_flag from SHOP.SYS_DICT where create_date>=to_date('$parm','yyyy-MM-dd') or update_date>=to_date('$parm','yyyy-MM-dd')"
],
"jdbcUrl": [
"jdbc:oracle:thin:@192.168.1.1:1521/smzq"
]
}
]
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"defaultFS": "hdfs://hamaster:9000",
"fileType": "TEXT",
"path": "/user/hive/warehouse/bigdata_tmp.db/$tmp_name/",
"fileName": "$tmp_name",
"column": [
{
"name": "id",
"type": "string"
},
{
"name": "create_by",
"type": "string"
},
{
"name": "create_date",
"type": "string"
},
{
"name": "update_by",
"type": "string"
},
{
"name": "update_date",
"type": "string"
},
{
"name": "remarks",
"type": "string"
},
{
"name": "del_flag",
"type": "string"
}
],
"writeMode": "append",
"fieldDelimiter": "\u0001"
}
}
}
]
}
}