Monitoring incremental extraction into Hive

Prerequisite: the partition column is uniformly named ds. The script below applies to routine incremental extraction driven by create/update timestamps.
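
The script derives the Hive table, temp table, and DataX config names from its own file name, so each source table needs exactly one script/JSON pair. A hypothetical job directory (file names are illustrative):

    ods_sys_dict.sh     # this script, named after table bigdata_ods.ods_sys_dict
    ods_sys_dict.json   # DataX job config with the same base name

    # Scheduled daily run: extract changes since yesterday, with volume monitoring
    ./ods_sys_dict.sh
    # Manual backfill from a given start date: volume monitoring is skipped
    ./ods_sys_dict.sh 2023-01-01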


#!/bin/bash
# For routine incremental extraction driven by create/update timestamps.
# Fill in the following parameters by hand:
#   pt_expr - partition expression, in Hive syntax
#   pk      - primary key column(s); for a composite key write id1,id2,id3
#   inc_cnt - incremental row-count threshold for non-partitioned tables
#             (ignored for partitioned tables)

set -e 
timer_start=`date "+%Y-%m-%d %H:%M:%S"`

timediff(){
    # Args are passed unquoted, so "$timer_start" splits into date ($1) and time ($2).
    duration=$(($(date +%s -d "${3} ${4}") - $(date +%s -d "${1} ${2}")))
    echo "Start time: ${1} ${2}"
    echo "End time:   ${3} ${4}"
    if [[ $duration -lt 60 ]]; then
        echo "Elapsed: ${duration}s"
    elif [[ $duration -lt $((60*60)) ]]; then
        m=$((duration/60))
        s=$((duration%60))
        echo "Elapsed: ${m}m${s}s"
    else
        h=$((duration/3600))
        m=$((duration%3600/60))
        s=$((duration%60))
        echo "Elapsed: ${h}h${m}m${s}s"
    fi
}
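# A quick sanity check of timediff (values are illustrative):
#   timediff 2023-01-01 08:00:00 2023-01-01 09:05:30
# prints "Elapsed: 1h5m30s" (3930 seconds).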
file_name=`basename $0`
table_name=${file_name%.sh}
tmp_name=${table_name/#ods_/tmp_}
json_name=${table_name}.json
# sql_name=${file_name%.sh}.sql
dir_path=`dirname $0`
# Partition expression, in Hive syntax
pt_expr="substr(issuedate,1,10)"
# Primary key column(s); for a composite key write id1,id2,id3
pk='id'
# Incremental row-count threshold for non-partitioned tables (ignored for partitioned tables)
inc_cnt=0
cd $dir_path

# Step 1: create a temp table to hold the incremental data.
echo "Checking whether the table is partitioned and picking the volume threshold"
# DESCRIBE prints a "# Partition Information" section only for partitioned tables.
is_pt=`hive -e "desc bigdata_ods.${table_name}"|grep -e '^# Partition Information'|wc -l`

if [[ $is_pt == 0 ]]; then
    thr_cnt=$inc_cnt
    echo "Incremental volume threshold: $thr_cnt"
    echo "Creating temp table: drop table if exists bigdata_tmp.${tmp_name};create table bigdata_tmp.${tmp_name} like bigdata_ods.${table_name};alter table bigdata_tmp.${tmp_name} set SERDEPROPERTIES('field.delim'='\001')"
    hive -e "drop table if exists bigdata_tmp.${tmp_name};create table bigdata_tmp.${tmp_name} like bigdata_ods.${table_name};alter table bigdata_tmp.${tmp_name} set SERDEPROPERTIES('field.delim'='\001')"

else
    echo "Fetching table columns"
    # The header row of an empty result set gives the column names; the last one
    # is the partition column ds, which the temp table must not carry.
    fields_arr=(`hive -e "set hive.cli.print.header=true; set hive.resultset.use.unique.column.names=false;select * from bigdata_ods.${table_name} where 1=2 limit 1"`)
    fields_num=${#fields_arr[@]}
    f_num=$(($fields_num-1))
    f_arr=${fields_arr[@]:0:$f_num}
    f_str=${f_arr[*]}
    fields=${f_str// /,}
    # Volume threshold for partitioned tables: 60% of the smallest daily count
    # over a recent window (d-10 .. d-4).
    thr_cnt=`hive -e "select round(min(cnt)*0.6) from (select count(1) as cnt from bigdata_ods.${table_name} where ds>=date_sub(current_date,10) and ds<=date_sub(current_date,4) group by ds) t "`
    if [[ $thr_cnt == 'NULL' ]]; then
        thr_cnt=0
    fi
    echo "Table columns: $fields"
    echo "Incremental volume threshold: $thr_cnt"
    echo "Creating temp table: drop table if exists bigdata_tmp.${tmp_name};create table bigdata_tmp.${tmp_name} as select $fields from bigdata_ods.${table_name} where 1=2"
    hive -e "drop table if exists bigdata_tmp.${tmp_name};create table bigdata_tmp.${tmp_name} as select $fields from bigdata_ods.${table_name} where 1=2"
fi
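# Worked example of the threshold: if the daily counts over the window are
# 1000, 1200, and 900, the threshold is round(900*0.6) = 540 rows.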

# Holds the number of records DataX reports it read
suc_out=0
datax_etl(){
# Step 2: extract the increment and parse the record counts from the DataX log.
    echo "Starting incremental extraction"
    # Single quotes so $LINENO/$BASH_COMMAND expand when the trap fires, not here.
    trap 'cat datax.log;echo "error line: $LINENO, error cmd: $BASH_COMMAND"' ERR
    datax.py -p "-Dparm=$1 -Dtmp_name=$3" $2 > datax.log
    tail -8 datax.log
    # The grep patterns below are Chinese on purpose: DataX prints its job summary
    # with these labels (读出记录总数 = total records read, 读写失败总数 = total read/write failures).
    suc_out_str=`grep '读出记录总数' datax.log`
    suc_out=`echo $suc_out_str|grep -e '[0-9]*' -o`
    fai_inout_str=`grep '读写失败总数' datax.log`
    fai_inout=`echo $fai_inout_str|grep -e '[0-9]*' -o`
    if [[ $fai_inout != 0 ]]; then
        echo $fai_inout
        exit 1
    fi
}
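# For reference, the tail of a DataX run looks roughly like this (exact spacing
# varies by DataX version; the numbers here are illustrative):
#   读出记录总数                   :                5230
#   读写失败总数                   :                   0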
# Holds the temp table's row count
tmp_cnt=0
data_check(){
    # One pass over the temp table: total rows, distinct-PK count, and rows
    # whose partition expression / create time is null or empty.
    cnt_arr=(`hive -e "select count(1),count(distinct $1),sum(case when coalesce(${2},'')='' then 1 else 0 end) from bigdata_tmp.${3}"`)
    cnt1=${cnt_arr[0]}
    pk_cnt=${cnt_arr[1]}
    no_pt=${cnt_arr[2]}
# Step 3: monitoring rules.
# 3.1 Does the row count match what DataX read?
    if [[ $cnt1 != $4 ]]; then
        echo "Temp table row count differs from the extracted count; check for broken lines."
        exit 1
    else
        echo "Temp table row count matches the extracted count."
    fi
# 3.2 Are there duplicate primary keys?
    if [[ $cnt1 != $pk_cnt ]]; then
        echo "Duplicate primary keys in the temp table; check the field delimiter."
        exit 1
    else
        echo "Primary keys in the temp table are unique."
    fi
# 3.3 Is the partition column or create time ever null?
    if [[ $no_pt != 0 ]]; then
        echo "Null partition column or create time in the temp table; check the field delimiter."
        exit 1
    else
        echo "No null partition column or create time in the temp table."
    fi
    tmp_cnt=$cnt1
}

if [[ $# == 1 ]]; then
    # Manual run: extract from the given date, no volume monitoring
    v_dt=`date -d "$1" "+%Y-%m-%d"`
    datax_etl "$v_dt" "$json_name" "$tmp_name"
    data_check "$pk" "$pt_expr" "$tmp_name" "$suc_out"
else
    v_dt=`date -d "-1 day" "+%Y-%m-%d"`
    # Number of undersized extractions so far
    fail_etl_cnt=0
# 3.4 Is the volume suspiciously small? If so, redo step 2; give up after three tries.
    while true; do
        datax_etl "$v_dt" "$json_name" "$tmp_name"
        data_check "$pk" "$pt_expr" "$tmp_name" "$suc_out"
        if [[ $tmp_cnt -le $thr_cnt ]]; then
            fail_etl_cnt=$(($fail_etl_cnt+1))
            if [[ $fail_etl_cnt -ge 3 ]]; then
                echo 'Extracted volume was too small several times in a row; aborting.'
                exit 1
            fi
            echo "Extracted volume $tmp_cnt is too small; retrying in 15 minutes."
            sleep 15m
            echo "Retrying: truncating the temp table."
            hive -e "truncate table bigdata_tmp.${tmp_name}"
        else
            break
        fi
    done
fi
# Step 4: merge the increment back into the ODS table.
join_arr=(${pk//,/ })
join_str=''
p1=${join_arr[0]}
for i in ${join_arr[@]}; do
    join_str="$join_str o.$i=t.$i and"
done
# Strip the trailing "and"
join_str=${join_str/%and}
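# Example: with pk='id1,id2' the loop above yields
#   join_str=" o.id1=t.id1 and o.id2=t.id2 "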
if [[ $is_pt == 0 ]]; then
    sql_str="insert overwrite table bigdata_ods.${table_name} select a.* from bigdata_tmp.${tmp_name} a union all select o.* from bigdata_ods.${table_name} o left join bigdata_tmp.${tmp_name} t on $join_str where t.$p1 is null"
else
    # Only rewrite the partitions the increment actually touches
    pts=`hive -e "select concat_ws('|',collect_set($pt_expr)) from bigdata_tmp.${tmp_name}"`
    sql_str="set hive.exec.dynamic.partition = true;set hive.exec.dynamic.partition.mode = nonstrict;set hive.exec.max.dynamic.partitions = 100000;set hive.exec.max.dynamic.partitions.pernode = 100000;insert overwrite table bigdata_ods.${table_name} partition(ds) select a.*,$pt_expr as ds from bigdata_tmp.${tmp_name} a union all select o.* from bigdata_ods.${table_name} o left join bigdata_tmp.${tmp_name} t on $join_str where o.ds regexp '$pts' and t.$p1 is null"
fi
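# For illustration, the non-partitioned branch renders to the following SQL
# when table_name=ods_sys_dict and pk='id':
#   insert overwrite table bigdata_ods.ods_sys_dict
#   select a.* from bigdata_tmp.tmp_sys_dict a
#   union all
#   select o.* from bigdata_ods.ods_sys_dict o
#   left join bigdata_tmp.tmp_sys_dict t on o.id=t.id
#   where t.id is null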
echo "合并增量数据"
echo "hive -e ${sql_str};"
hive -e "${sql_str}";


echo "***SUCCESS***"
timer_end=`date "+%Y-%m-%d %H:%M:%S"`
timediff $timer_start $timer_end


The accompanying JSON config shares its base name with the table:

{
  "job": {
    "setting": {
      "speed": {
        "channel": "16",
        "byte": 1048576,
        "record": 10000
      }
    },
    "content": [
      {
        "reader": {
          "name": "oraclereader",
          "parameter": {
            "username": "xxx",
            "password": "xxx",
            "connection": [
              {
                "querySql": [
                  "SELECT id, create_by, create_date, update_by, update_date, remarks, del_flag from SHOP.SYS_DICT where create_date>=to_date('$parm','yyyy-MM-dd') or update_date>=to_date('$parm','yyyy-MM-dd')"
                ],
                "jdbcUrl": [
                  "jdbc:oracle:thin:@192.168.1.1:1521/smzq"
                ]
              }
            ]
          }
        },
        "writer": {
          "name": "hdfswriter",
          "parameter": {
            "defaultFS": "hdfs://hamaster:9000",
            "fileType": "TEXT",
            "path": "/user/hive/warehouse/bigdata_tmp.db/$tmp_name/",
            "fileName": "$tmp_name",
            "column": [
              {
                "name": "id",
                "type": "string"
              },
              {
                "name": "create_by",
                "type": "string"
              },
              {
                "name": "create_date",
                "type": "string"
              },
              {
                "name": "update_by",
                "type": "string"
              },
              {
                "name": "update_date",
                "type": "string"
              },
              {
                "name": "remarks",
                "type": "string"
              },
              {
                "name": "del_flag",
                "type": "string"
              }
            ],
            "writeMode": "append",
            "fieldDelimiter": "\u0001"
          }
        }
      }
    ]
  }
}
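
For reference, this is how the shell script invokes DataX against the config above; $parm and $tmp_name in the querySql and path are filled in via -D options (the date and temp-table name here are illustrative):

datax.py -p "-Dparm=2023-01-01 -Dtmp_name=tmp_sys_dict" ods_sys_dict.json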

