# Hive -- 增量同步数据脚本

# Read the 12 required positional arguments.
#   $1  HIVE_DATABASE      Hive database holding the target table
#   $2  HIVE_TABLE_NAME    Hive target table
#   $3  DATA_DIR           storage prefix handed to `aws s3 rm` / sqoop
#   $4  PARTITION_KEY      name of the Hive partition column
#   $5  KEY_FIELD_NAME     source field used to derive the partition
#   $6  MYSQL_HOST         MySQL host
#   $7  MYSQL_PORT         MySQL port
#   $8  MYSQL_DATABASE     MySQL database
#   $9  MYSQL_TABLE_NAME   MySQL source table
#   $10 MYSQL_USERNAME     MySQL user
#   $11 MYSQL_PASSWORD     MySQL password
#   $12 PRIMARY_KEY        key column used to find rows not yet imported
# Any other argument count prints the usage line to stderr and exits 127.
if [ "$#" -eq 12 ]; then
	HIVE_DATABASE=$1
	HIVE_TABLE_NAME=$2
	DATA_DIR=$3
	PARTITION_KEY=$4
	KEY_FIELD_NAME=$5
	MYSQL_HOST=$6
	MYSQL_PORT=$7
	MYSQL_DATABASE=$8
	MYSQL_TABLE_NAME=$9
	MYSQL_USERNAME=${10}
	MYSQL_PASSWORD=${11}
	PRIMARY_KEY=${12}
else
	# The usage line names all 12 arguments, in the order they are read above.
	echo 'command : get_sqoop_partitions_sql HIVE_DATABASE HIVE_TABLE_NAME DATA_DIR PARTITION_KEY KEY_FIELD_NAME MYSQL_HOST MYSQL_PORT MYSQL_DATABASE MYSQL_TABLE_NAME MYSQL_USERNAME MYSQL_PASSWORD PRIMARY_KEY' >&2
	exit 127
fi

# Announce the job, then remove whatever is left under the ${PARTITION_KEY}=0
# location of the table (partition value 0 is the one the sqoop import writes to).
echo "开始进行从mysql表${MYSQL_TABLE_NAME}到hive表${HIVE_DATABASE}.${HIVE_TABLE_NAME}的导入"
echo "拉表利用的字段为${PRIMARY_KEY},分区名称为${PARTITION_KEY},分区使用的字段为${KEY_FIELD_NAME}"

echo "aws s3 rm ${DATA_DIR}/${HIVE_TABLE_NAME}/${PARTITION_KEY}=0 --recursive"
# Quoted so the recursive delete always receives exactly one path argument.
aws s3 rm "${DATA_DIR}/${HIVE_TABLE_NAME}/${PARTITION_KEY}=0" --recursive

# Scratch files used by the rest of the script; all live in ./log relative to
# the directory the script is started from.
LOG_DIR="$(pwd)/log"
# Later steps redirect into these files, so the directory has to exist first.
if ! mkdir -p "${LOG_DIR}"; then
	echo "cannot create log directory ${LOG_DIR}" >&2
	exit 1
fi

# Output of `show create table` for the Hive table.
SQL_PATH="${LOG_DIR}/${HIVE_DATABASE}.${HIVE_TABLE_NAME}.sql"
# Column names of the Hive table, one per line.
HIVE_COLUMN_PATH="${LOG_DIR}/hive.${HIVE_DATABASE}.${HIVE_TABLE_NAME}.column.txt"
# Column names of the MySQL table, one per line.
MYSQL_COLUMN_PATH="${LOG_DIR}/mysql.${MYSQL_DATABASE}.${MYSQL_TABLE_NAME}.column.txt"
# "name type" pairs of the MySQL table, one per line.
MYSQL_COLUMN_TYPE_PATH="${LOG_DIR}/mysql.${MYSQL_DATABASE}.${MYSQL_TABLE_NAME}.column_type.txt"
# Columns present in MySQL but not in Hive, one per line.
NEW_COLUMN_PATH="${LOG_DIR}/hive.${HIVE_DATABASE}.${HIVE_TABLE_NAME}.new_column.txt"

# Dump the Hive DDL; later steps parse column names, decimal types and the
# field delimiter out of this file.
hive -e "show create table ${HIVE_DATABASE}.${HIVE_TABLE_NAME};" > "${SQL_PATH}"

# Value of the last partition listed by `show partitions` (text after the
# first '=' on the last output line).
LAST_PARTITION=$(hive -e "show partitions ${HIVE_DATABASE}.${HIVE_TABLE_NAME};" | tail -n 1 | sed 's/=/ /g' | awk '{print $2}')
if [ -z "${LAST_PARTITION}" ]; then
	# Without a partition value the max() query below would be malformed.
	echo "no partition found for ${HIVE_DATABASE}.${HIVE_TABLE_NAME}" >&2
	exit 1
fi

# Largest PRIMARY_KEY already present in that partition; the import resumes
# after this value. Only the last output line is kept.
HIVE_MAX=$(hive -e "select max(${PRIMARY_KEY}) from ${HIVE_DATABASE}.${HIVE_TABLE_NAME} where ${PARTITION_KEY}=${LAST_PARTITION};" | tail -n 1)
echo "${HIVE_DATABASE}.${HIVE_TABLE_NAME}的${PRIMARY_KEY}最大值为${HIVE_MAX}"

# HIVE_MAX feeds shell arithmetic and SQL range predicates further down; stop
# here if Hive returned nothing, NULL or any other non-integer text.
if ! [[ ${HIVE_MAX} =~ ^-?[0-9]+$ ]]; then
	echo "max(${PRIMARY_KEY}) from hive is not an integer: '${HIVE_MAX}'" >&2
	exit 1
fi


# Print a sqoop "--map-column-hive" option listing every decimal column found
# in a `show create table` dump.
# Arguments: $1 - path of the DDL dump
# Outputs:   "--map-column-hive col=DECIMAL(p%2Cs)[,col=DECIMAL(...)...]" on
#            one line, or nothing when the dump has no decimal column.
#            The comma inside the type is written as %2C.
# Returns:   1 if the file is not readable.
_decimal_hive_mapping() {
	[ -r "$1" ] || return 1
	awk '
		{
			# Backticks around identifiers become blanks so $1 is the
			# column name and $2 starts with the column type.
			gsub(/`/, " ")
			# Match only the type itself; this drops a trailing "," or the
			# ")" that closes the column list on the last column line.
			if (match($2, /^decimal\([0-9]+(,[0-9]+)?\)/)) {
				type = substr($2, RSTART, RLENGTH)
				sub(/decimal/, "DECIMAL", type)
				sub(/,/, "%2C", type)
				printf "%s%s=%s", (count++ ? "," : "--map-column-hive "), $1, type
			}
		}
		END { if (count) print "" }
	' "$1"
}

# Build the decimal type mapping only when the DDL mentions "decimal".
# The type comes from awk's second field of each DDL line (previously the
# shell's own $2, i.e. the Hive table name, was interpolated here).
# NOTE(review): DECIMAL_FIELDS is not referenced by the rest of the visible
# script - confirm whether it was meant to be passed to sqoop.
if grep -q decimal "${SQL_PATH}"; then
	DECIMAL_FIELDS=$(_decimal_hive_mapping "${SQL_PATH}")
fi

# Print the column names of a Hive table, one per line, from a
# `show create table` dump.
# Arguments: $1 - path of the DDL dump
# Outputs:   first word (backticks removed) of every line after the first
#            line containing CREATE, up to and including the first line that
#            ends with ")"
# Returns:   1 if the file is not readable.
_hive_column_names() {
	[ -r "$1" ] || return 1
	awk '
		!in_columns { if (/CREATE/) in_columns = 1; next }
		{
			closing = ($0 ~ /\)$/)
			gsub(/`/, "")
			print $1
			if (closing) exit
		}
	' "$1"
}

# One pass over the DDL: the names are saved line by line in HIVE_COLUMN_PATH
# (compared against the MySQL columns later) and joined with commas into
# TABLE_COLUMNS.
TABLE_COLUMNS=$(_hive_column_names "${SQL_PATH}" | tee "${HIVE_COLUMN_PATH}" | paste -sd, -)

# Print the column names listed in file $2 that do not appear in file $1.
# Arguments: $1 - known column names, one per line
#            $2 - candidate column names, one per line
# Outputs:   candidates missing from $1, in the order they appear in $2
# Returns:   1 if either file is not readable.
_columns_missing_from() {
	if ! [ -r "$1" ] || ! [ -r "$2" ]; then
		return 1
	fi
	# Set membership instead of a textual diff, so a different column order
	# alone does not make a column look new.
	awk 'FILENAME == ARGV[1] { known[$1] = 1; next }
	     NF && !($1 in known) { print $1 }' "$1" "$2"
}

# Describe the MySQL table once. -N suppresses the header row, so no filtering
# on the literal word "Field" is needed.
mysql_desc=$(mysql -h"${MYSQL_HOST}" -u"${MYSQL_USERNAME}" -p"${MYSQL_PASSWORD}" -P"${MYSQL_PORT}" -D"${MYSQL_DATABASE}" -N -e "desc ${MYSQL_TABLE_NAME};")

# Column names only.
printf '%s\n' "${mysql_desc}" | awk 'NF {print $1}' > "${MYSQL_COLUMN_PATH}"

# Columns that exist in MySQL but not in the Hive table.
_columns_missing_from "${HIVE_COLUMN_PATH}" "${MYSQL_COLUMN_PATH}" > "${NEW_COLUMN_PATH}"

# "name type" pairs (first two fields of each desc row).
printf '%s\n' "${mysql_desc}" | awk 'NF {print $1" "$2}' > "${MYSQL_COLUMN_TYPE_PATH}"


# Collect the columns listed in NEW_COLUMN_PATH into HIVE_SELECT_NEW_COLUMNS as
# ",col1,col2,..." (leading comma included, so it can be appended directly to
# another comma-separated column list).
# Start from an empty value so nothing inherited from the environment leaks in.
HIVE_SELECT_NEW_COLUMNS=""

if [ -r "${NEW_COLUMN_PATH}" ]; then
	# Only the first word of each line is used; blank lines are skipped.
	while read -r new_column _ || [ -n "${new_column}" ]; do
		[ -n "${new_column}" ] || continue
		# This only prints the statement; nothing is executed against Hive here.
		echo "hive -e alter table ${HIVE_DATABASE}.${HIVE_TABLE_NAME} add column ${new_column};"
		HIVE_SELECT_NEW_COLUMNS="${HIVE_SELECT_NEW_COLUMNS},${new_column}"
	done < "${NEW_COLUMN_PATH}"
fi

# Report the new columns without the leading comma.
echo "mysql中新增字段为${HIVE_SELECT_NEW_COLUMNS#,}"

# Print the value of 'field.delim' from a `show create table` dump.
# Arguments: $1 - path of the DDL dump
# Outputs:   the text between the quotes after 'field.delim'=, taken from the
#            first matching line (so a delimiter containing blanks survives)
# Returns:   1 if the file is not readable.
_hive_field_delimiter() {
	[ -r "$1" ] || return 1
	# Splitting on single quotes: 'field.delim'='X' -> $2=field.delim, $4=X.
	awk -F"'" '/field\.delim/ { print $4; exit }' "$1"
}

# Largest PRIMARY_KEY currently in the MySQL table (last line of the output).
MYSQL_MAX=$(mysql -h"${MYSQL_HOST}" -u"${MYSQL_USERNAME}" -p"${MYSQL_PASSWORD}" -P"${MYSQL_PORT}" -D"${MYSQL_DATABASE}" -e "select max(${PRIMARY_KEY}) from ${MYSQL_TABLE_NAME};" | tail -n 1)
echo "mysql表${MYSQL_TABLE_NAME}的最大${PRIMARY_KEY}为${MYSQL_MAX}"

# Select list used in the sqoop query ("select * ...").
FAILED='*'
# Field delimiter declared in the Hive DDL; handed to sqoop so the imported
# files use the same separator.
TERMINATE=$(_hive_field_delimiter "${SQL_PATH}")

# Import the rows with HIVE_MAX < PRIMARY_KEY (up to MYSQL_MAX) from MySQL into
# the ${PARTITION_KEY}=0 partition of the Hive table, one key range of
# SQOOP_BATCH_SIZE per sqoop run.
SQOOP_BATCH_SIZE=50000000
# Scratch directory handed to sqoop as --target-dir; emptied before every run.
SQOOP_STAGING_DIR="${DATA_DIR}/tmp/${HIVE_DATABASE}/${HIVE_TABLE_NAME}"

for (( i = HIVE_MAX; i < MYSQL_MAX; i += SQOOP_BATCH_SIZE )); do
	batch_upper=$(( i + SQOOP_BATCH_SIZE ))

	echo "aws s3 rm ${SQOOP_STAGING_DIR} --recursive"
	aws s3 rm "${SQOOP_STAGING_DIR}" --recursive

	# A single argument list is both logged and executed, so the log cannot
	# drift from what actually runs. The JDBC URL is quoted because it
	# contains '?', which the shell would otherwise treat as a glob character.
	sqoop_args=(
		import
		--connect "jdbc:mysql://${MYSQL_HOST}:${MYSQL_PORT}/${MYSQL_DATABASE}?tinyInt1isBit=false"
		--username "${MYSQL_USERNAME}" --password "${MYSQL_PASSWORD}"
		--query "select ${FAILED} from ${MYSQL_TABLE_NAME} where ${PRIMARY_KEY} > ${i} and ${PRIMARY_KEY} <= ${batch_upper} and \$CONDITIONS"
		--split-by "${PRIMARY_KEY}"
		--fields-terminated-by "${TERMINATE}"
		--target-dir "${SQOOP_STAGING_DIR}"
		--hive-import
		--hive-partition-key="${PARTITION_KEY}"
		--hive-partition-value=0
		--hive-table "${HIVE_DATABASE}.${HIVE_TABLE_NAME}"
		--columns "${TABLE_COLUMNS}${HIVE_SELECT_NEW_COLUMNS}"
		--num-mappers 8
	)

	# Log the command shell-quoted, with the password value masked.
	sqoop_log=("${sqoop_args[@]}")
	for idx in "${!sqoop_log[@]}"; do
		if [[ ${sqoop_log[idx]} == --password ]]; then
			sqoop_log[idx + 1]='******'
		fi
	done
	printf 'sqoop'
	printf ' %q' "${sqoop_log[@]}"
	printf '\n'

	# Stop on the first failed range: continuing would leave a gap in the
	# imported keys while the later steps still move and drop the staged data.
	if ! sqoop "${sqoop_args[@]}"; then
		echo "sqoop import failed for ${PRIMARY_KEY} in (${i}, ${batch_upper}]" >&2
		exit 1
	fi
done



# Print the first day (YYYY-MM-01, local time zone) of the month containing an
# epoch timestamp given in milliseconds.
# Arguments: $1 - non-negative integer, milliseconds since the epoch
# Returns:   1 (printing nothing) when $1 is empty or not a plain integer.
_month_start_from_millis() {
	[[ $1 =~ ^[0-9]+$ ]] || return 1
	# 10# forces base 10 so a leading zero is not read as octal.
	date -d "@$(( 10#$1 / 1000 ))" +%Y-%m-01
}

# Smallest PRIMARY_KEY in MySQL that is newer than what Hive already holds.
MYSQL_THIS_TIME_MIN_ID=$(mysql -h"${MYSQL_HOST}" -u"${MYSQL_USERNAME}" -p"${MYSQL_PASSWORD}" -P"${MYSQL_PORT}" -D"${MYSQL_DATABASE}" -e "select min(${PRIMARY_KEY}) from ${MYSQL_TABLE_NAME} where ${PRIMARY_KEY}>${HIVE_MAX};" | tail -n 1)
# KEY_FIELD_NAME value of that row (divided by 1000 below, i.e. treated as
# epoch milliseconds).
MYSQL_THIS_TIME_CREATE_TIME=$(mysql -h"${MYSQL_HOST}" -u"${MYSQL_USERNAME}" -p"${MYSQL_PASSWORD}" -P"${MYSQL_PORT}" -D"${MYSQL_DATABASE}" -e "select ${KEY_FIELD_NAME} from ${MYSQL_TABLE_NAME} where ${PRIMARY_KEY}=${MYSQL_THIS_TIME_MIN_ID};" | tail -n 1)
echo "mysql在上次导入后最早的更新时间为${MYSQL_THIS_TIME_CREATE_TIME}"

# First month that received new rows. When there is no new row the lookup
# above yields nothing (or NULL); leave the month empty instead of feeding
# that text into shell arithmetic.
if ! MYSQL_FIRST_CREATE_MONTH=$(_month_start_from_millis "${MYSQL_THIS_TIME_CREATE_TIME}"); then
	echo "no usable ${KEY_FIELD_NAME} value for rows with ${PRIMARY_KEY}>${HIVE_MAX}: '${MYSQL_THIS_TIME_CREATE_TIME}'" >&2
	MYSQL_FIRST_CREATE_MONTH=""
fi
CURRENT_MONTH=$(date +%Y-%m-01)

# Rewrite one Hive partition per calendar month, from the first month that has
# new rows through the current month. Each partition value is the epoch-second
# timestamp of the month start; rows are picked by KEY_FIELD_NAME falling
# inside the month (compared in milliseconds) and PRIMARY_KEY > HIVE_MAX.
# NOTE(review): the SELECT reads from ${HIVE_DATABASE}.${MYSQL_TABLE_NAME}
# (the MySQL table name, not HIVE_TABLE_NAME) and uses "insert overwrite";
# confirm both are intended.
echo "增量导入${MYSQL_FIRST_CREATE_MONTH}到${CURRENT_MONTH}数据"

if [ -z "${MYSQL_FIRST_CREATE_MONTH}" ]; then
	# Nothing to anchor the month range on; skip the per-month inserts.
	echo "first month with new rows is unknown, no partition is rewritten" >&2
else
	current_month_ts=$(date -d "${CURRENT_MONTH}" +%s)
	for (( i = 0; $(date -d "${i} month ${MYSQL_FIRST_CREATE_MONTH}" +%s) <= current_month_ts; i++ )); do
		PROCESS_MONTH_TIMESTAMP=$(date -d "${i} month ${MYSQL_FIRST_CREATE_MONTH}" +%s)
		PROCESS_NEXT_MONTH_TIMESTAMP=$(date -d "$(( i + 1 )) month ${MYSQL_FIRST_CREATE_MONTH}" +%s)

		# The statement is built once and used for both the log line and the
		# hive call, so the logged SQL is exactly what runs.
		partition_sql="
	        insert overwrite table \`${HIVE_DATABASE}\`.\`${HIVE_TABLE_NAME}\` partition (${PARTITION_KEY}=$PROCESS_MONTH_TIMESTAMP)
	        select
	            ${TABLE_COLUMNS}
	        from \`${HIVE_DATABASE}\`.\`${MYSQL_TABLE_NAME}\`
	        where ${KEY_FIELD_NAME} >= ${PROCESS_MONTH_TIMESTAMP}000
			and ${KEY_FIELD_NAME} < ${PROCESS_NEXT_MONTH_TIMESTAMP}000 and ${PRIMARY_KEY}>${HIVE_MAX};
	"
		echo "${partition_sql}"

		# Abort on failure: the code after this loop drops the staged
		# partition, which must not happen when a month was not written.
		if ! hive -e "${partition_sql}"; then
			echo "hive insert failed for ${PARTITION_KEY}=${PROCESS_MONTH_TIMESTAMP}" >&2
			exit 1
		fi
	done
fi
# Clean up: drop the ${PARTITION_KEY}=0 partition from the Hive table and
# delete its files, logging each command before running it.
echo "hive -e  \"alter table \`${HIVE_DATABASE}\`.\`${HIVE_TABLE_NAME}\` drop partition(${PARTITION_KEY}=0);\""
hive -e "alter table \`${HIVE_DATABASE}\`.\`${HIVE_TABLE_NAME}\` drop partition(${PARTITION_KEY}=0);"

echo "aws s3 rm ${DATA_DIR}/${HIVE_TABLE_NAME}/${PARTITION_KEY}=0 --recursive"
# Quoted so the recursive delete always receives exactly one path argument.
aws s3 rm "${DATA_DIR}/${HIVE_TABLE_NAME}/${PARTITION_KEY}=0" --recursive

# 写完脑子已经不够用了

# 你可能感兴趣的:(shell)