if [ $# -eq 12 ]; then
HIVE_DATABASE=$1
HIVE_TABLE_NAME=$2
DATA_DIR=$3
PARTITION_KEY=$4
KEY_FIELD_NAME=$5
MYSQL_HOST=$6
MYSQL_PORT=$7
MYSQL_DATABASE=$8
MYSQL_TABLE_NAME=$9
MYSQL_USERNAME=${10}
MYSQL_PASSWORD=${11}
PRIMARY_KEY=${12}
else
echo 'usage: get_sqoop_partitions_sql HIVE_DATABASE HIVE_TABLE_NAME DATA_DIR PARTITION_KEY KEY_FIELD_NAME MYSQL_HOST MYSQL_PORT MYSQL_DATABASE MYSQL_TABLE_NAME MYSQL_USERNAME MYSQL_PASSWORD PRIMARY_KEY'
exit 127
fi
echo "开始进行从mysql表${MYSQL_TABLE_NAME}到hive表${HIVE_DATABASE}.${HIVE_TABLE_NAME}的导入"
echo "拉表利用的字段为${PRIMARY_KEY},分区名称为${PARTITION_KEY},分区使用的字段为${KEY_FIELD_NAME}"
echo "aws s3 rm ${DATA_DIR}/${HIVE_TABLE_NAME}/${PARTITION_KEY}=0 --recursive"
aws s3 rm ${DATA_DIR}/${HIVE_TABLE_NAME}/${PARTITION_KEY}=0 --recursive
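# Work files for this run are kept under ./log; create the directory if it does not exist yet
mkdir -p "$(pwd)/log"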
SQL_PATH=$(pwd)/log/${HIVE_DATABASE}.${HIVE_TABLE_NAME}.sql
HIVE_COLUMN_PATH=$(pwd)/log/hive.${HIVE_DATABASE}.${HIVE_TABLE_NAME}.column.txt
MYSQL_COLUMN_PATH=$(pwd)/log/mysql.${MYSQL_DATABASE}.${MYSQL_TABLE_NAME}.column.txt
MYSQL_COLUMN_TYPE_PATH=$(pwd)/log/mysql.${MYSQL_DATABASE}.${MYSQL_TABLE_NAME}.column_type.txt
NEW_COLUMN_PATH=$(pwd)/log/hive.${HIVE_DATABASE}.${HIVE_TABLE_NAME}.new_column.txt
hive -e "show create table ${HIVE_DATABASE}.${HIVE_TABLE_NAME};" >$SQL_PATH
LAST_PARTITION=$(hive -e "show partitions ${HIVE_DATABASE}.${HIVE_TABLE_NAME};"|tail -1|sed 's/=/ /g'|awk '{print $2}')
HIVE_MAX=$(hive -e "select max(${PRIMARY_KEY}) from ${HIVE_DATABASE}.${HIVE_TABLE_NAME} where ${PARTITION_KEY}=${LAST_PARTITION};")
echo "${HIVE_DATABASE}.${HIVE_TABLE_NAME}的${PRIMARY_KEY}最大值为${HIVE_MAX}"
if [ "$(grep decimal $SQL_PATH)" != "" ]; then
DECIMAL_FIELDS=$(grep decimal $SQL_PATH|sed 's/decimal/DECIMAL/g'|sed 's/,/%2C/g'|sed 's/`/ /g'|sed 's/)%2C/)/g'|awk '{if (NR==1) field="--map-column-hive "$1"='"$2"'"; print field; if(NR!=1) field=field","$1"='"$2"'"; print field;}' |tail -1)
fi
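# Extract the Hive column list: the column definitions sit between the CREATE line and the first line ending with ")" in the SHOW CREATE TABLE output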
begin=$(grep -n CREATE ${SQL_PATH}|sed 's/:/ /g'|awk '{print $1}')
end=$(grep -n ')$' ${SQL_PATH}|head -n1|sed 's/:/ /g'|awk '{print $1}')
TABLE_COLUMNS=$(head -n${end} ${SQL_PATH}|tail -n$((end-begin))|sed 's/`//g'|awk '{if (NR==1) all_fields=$1; else all_fields=all_fields","$1} END {print all_fields}')
head -n${end} ${SQL_PATH}|tail -n$((end-begin))|sed 's/`//g'|awk '{print $1}'> ${HIVE_COLUMN_PATH}
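# Dump the MySQL column list (and types) and diff against the Hive columns to find columns that exist only in MySQL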
mysql -h${MYSQL_HOST} -u${MYSQL_USERNAME} -p${MYSQL_PASSWORD} -P${MYSQL_PORT} -D${MYSQL_DATABASE} -e "desc ${MYSQL_TABLE_NAME};" |awk '{if($1!="Field") print $1}' > ${MYSQL_COLUMN_PATH}
diff -u ${HIVE_COLUMN_PATH} ${MYSQL_COLUMN_PATH} |grep '^+[^+]'|sed 's/^+//'|awk '{print $1}' > ${NEW_COLUMN_PATH}
mysql -h${MYSQL_HOST} -u${MYSQL_USERNAME} -p${MYSQL_PASSWORD} -P${MYSQL_PORT} -D${MYSQL_DATABASE} -e "desc ${MYSQL_TABLE_NAME};" |awk '{if($1!="Field") print $1" "$2}' > ${MYSQL_COLUMN_TYPE_PATH}
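# For each column that exists only in MySQL, print a reminder ALTER statement (the Hive column type must still be filled in by hand; nothing is executed here) and add the column to the Sqoop --columns list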
NEW_COLUMN=$(awk '{print $1}' ${NEW_COLUMN_PATH})
for i in ${NEW_COLUMN}
do
if [ -n "$i" ]; then
echo "hive -e \"alter table ${HIVE_DATABASE}.${HIVE_TABLE_NAME} add columns ($i <type>);\""
HIVE_SELECT_NEW_COLUMNS="${HIVE_SELECT_NEW_COLUMNS},$i"
fi
done
echo "mysql中新增字段为$(echo ${HIVE_SELECT_NEW_COLUMNS}|sed 's/,//')"
MYSQL_MAX=$(mysql -h${MYSQL_HOST} -u${MYSQL_USERNAME} -p${MYSQL_PASSWORD} -P${MYSQL_PORT} -D${MYSQL_DATABASE} -e "select max(${PRIMARY_KEY}) from ${MYSQL_TABLE_NAME};"|tail -1)
echo "mysql表${MYSQL_TABLE_NAME}的最大${PRIMARY_KEY}为${MYSQL_MAX}"
# Columns pulled from MySQL ("*" = all columns)
SELECT_FIELDS='*'
TERMINATE=$(grep 'field.delim' ${SQL_PATH}|sed "s/'/ /g"|awk '{print $3}')
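# Pull new rows (${PRIMARY_KEY} > HIVE_MAX) from MySQL in chunks of 50,000,000 ids into the staging partition ${PARTITION_KEY}=0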
for (( i=HIVE_MAX; i<MYSQL_MAX; i+=50000000 ))
do
echo "aws s3 rm ${DATA_DIR}/tmp/${HIVE_DATABASE}/${HIVE_TABLE_NAME}"
aws s3 rm ${DATA_DIR}/tmp/${HIVE_DATABASE}/${HIVE_TABLE_NAME} --recursive
echo "sqoop import --connect jdbc:mysql://${MYSQL_HOST}:${MYSQL_PORT}/${MYSQL_DATABASE}?tinyInt1isBit=false \
--username ${MYSQL_USERNAME} --password ${MYSQL_PASSWORD} \
--query \"select ${FAILED} from ${MYSQL_TABLE_NAME} where ${PRIMARY_KEY} > $i and ${PRIMARY_KEY} <= $[$i+50000000] and \$CONDITIONS\" \
--split-by ${PRIMARY_KEY} \
--fields-terminated-by \"${TERMINATE}\" \
--hive-import \
--target_dir ${DATA_DIR}/${HIVE_TABLE_NAME}/${PARTITION_KEY}=0/ \
--hive-partition-key=\"${PARTITION_KEY}\" \
--hive-partition-value=0 \
--columns ${TABLE_COLUMNS}${HIVE_SELECT_NEW_COLUMNS} \
--hive-table ${HIVE_DATABASE}.${HIVE_TABLE_NAME} \
--num-mappers 8
"
sqoop import --connect "jdbc:mysql://${MYSQL_HOST}:${MYSQL_PORT}/${MYSQL_DATABASE}?tinyInt1isBit=false" \
--username ${MYSQL_USERNAME} --password ${MYSQL_PASSWORD} \
--query "select ${SELECT_FIELDS} from ${MYSQL_TABLE_NAME} where ${PRIMARY_KEY} > $i and ${PRIMARY_KEY} <= $((i+50000000)) and \$CONDITIONS" \
--split-by ${PRIMARY_KEY} \
--fields-terminated-by "${TERMINATE}" \
--target-dir "${DATA_DIR}/tmp/${HIVE_DATABASE}/${HIVE_TABLE_NAME}" \
--hive-import \
--hive-partition-key="${PARTITION_KEY}" \
--hive-partition-value=0 \
--hive-table ${HIVE_DATABASE}.${HIVE_TABLE_NAME} \
--columns ${TABLE_COLUMNS}${HIVE_SELECT_NEW_COLUMNS} \
--num-mappers 8
done
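# ${KEY_FIELD_NAME} is a millisecond timestamp; find the earliest one among the newly imported rows to know which monthly partitions need rebuilding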
MYSQL_THIS_TIME_MIN_ID=$(mysql -h${MYSQL_HOST} -u${MYSQL_USERNAME} -p${MYSQL_PASSWORD} -P${MYSQL_PORT} -D${MYSQL_DATABASE} -e "select min(${PRIMARY_KEY}) from ${MYSQL_TABLE_NAME} where ${PRIMARY_KEY}>${HIVE_MAX};"|tail -1)
MYSQL_THIS_TIME_CREATE_TIME=$(mysql -h${MYSQL_HOST} -u${MYSQL_USERNAME} -p${MYSQL_PASSWORD} -P${MYSQL_PORT} -D${MYSQL_DATABASE} -e "select ${KEY_FIELD_NAME} from ${MYSQL_TABLE_NAME} where ${PRIMARY_KEY}=${MYSQL_THIS_TIME_MIN_ID};"|tail -1)
echo "mysql在上次导入后最早的更新时间为${MYSQL_THIS_TIME_CREATE_TIME}"
MYSQL_FIRST_CREATE_MONTH=$(date -d @$[${MYSQL_THIS_TIME_CREATE_TIME}/1000] +%Y-%m-01)
CURRENT_MONTH=$(date +%Y-%m-01)
echo "增量导入${MYSQL_FIRST_CREATE_MONTH}到${CURRENT_MONTH}数据"
for ((i=0;$(date -d "$i month $MYSQL_FIRST_CREATE_MONTH" +%s)<=$(date -d "$CURRENT_MONTH" +%s);i++))
do
PROCESS_MONTH_TIMESTAMP=$(date -d "$i month $MYSQL_FIRST_CREATE_MONTH" +%s)
PROCESS_NEXT_MONTH_TIMESTAMP=$(date -d "$((i+1)) month $MYSQL_FIRST_CREATE_MONTH" +%s)
echo "
insert overwrite table \`${HIVE_DATABASE}\`.\`${HIVE_TABLE_NAME}\` partition (${PARTITION_KEY}=$PROCESS_MONTH_TIMESTAMP)
select
${TABLE_COLUMNS}
from \`${HIVE_DATABASE}\`.\`${MYSQL_TABLE_NAME}\`
where ${KEY_FIELD_NAME} >= ${PROCESS_MONTH_TIMESTAMP}000
and ${KEY_FIELD_NAME} < ${PROCESS_NEXT_MONTH_TIMESTAMP}000 and ${PRIMARY_KEY}>${HIVE_MAX};
"
hive -e "
insert overwrite table \`${HIVE_DATABASE}\`.\`${HIVE_TABLE_NAME}\` partition (${PARTITION_KEY}=$PROCESS_MONTH_TIMESTAMP)
select
${TABLE_COLUMNS}
from \`${HIVE_DATABASE}\`.\`${MYSQL_TABLE_NAME}\`
where ${KEY_FIELD_NAME} >= ${PROCESS_MONTH_TIMESTAMP}000
and ${KEY_FIELD_NAME} < ${PROCESS_NEXT_MONTH_TIMESTAMP}000 and ${PRIMARY_KEY}>${HIVE_MAX};
"
done
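# Drop the staging partition ${PARTITION_KEY}=0 and remove its data from S3 now that the monthly partitions have been rebuilt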
echo "hive -e \"alter table \`${HIVE_DATABASE}\`.\`${HIVE_TABLE_NAME}\` drop partition(${PARTITION_KEY}=0);\""
hive -e "alter table \`${HIVE_DATABASE}\`.\`${HIVE_TABLE_NAME}\` drop partition(${PARTITION_KEY}=0);"
echo "aws s3 rm ${DATA_DIR}/${HIVE_TABLE_NAME}/${PARTITION_KEY}=0 --recursive"
aws s3 rm ${DATA_DIR}/${HIVE_TABLE_NAME}/${PARTITION_KEY}=0 --recursive
# Phew, my brain is fried after writing this.