This post documents a technical solution used at work: importing a Hive table into an HBase table within the same cluster.
1. Prepare the Hive table data
1) DDL of the Hive source table
create table adm.adm_2ndline_user_visit_1h_delta_hourly(
statis_day string,
search_time string,
serv_number string,
prov_id string,
region_id string,
node_id string,
sup_node_id string,
url_detail string,
sup_url_detail string,
client_id string,
chn_id string,
chn_id_source string,
cp_id string,
cp_name string,
node_type string,
net_type string,
term_type string,
gate_ip string,
session_id string,
page_id string,
term_prod_id string,
business_id string,
sub_busi_id string,
virt_busi_id string,
client_code string
)
partitioned by (dt string,hour string)
row format delimited fields terminated by '|';
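The source data is plain '|'-delimited text loaded one hourly partition at a time. A minimal loading sketch (the HDFS input path below is a made-up placeholder):

hive -e "load data inpath '/data/incoming/user_visit/20190101/10' overwrite into table adm.adm_2ndline_user_visit_1h_delta_hourly partition(dt='20190101', hour='10')"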
2) The HBase table layout is simply a rowkey plus a single column info:record holding the whole record.
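If the target HBase table does not exist yet, it only needs one column family named info. A minimal sketch in hbase shell (the table name here is a placeholder; the real name is passed to the load script later as a parameter):

hbase shell <<'EOF'
create 'adm_2ndline_user_visit_1h_delta_hourly', 'info'
EOF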
2. Create the hive2hbase external partitioned table
Its structure is kept consistent with the HBase table layout: one rowkey column and one line column holding the concatenated record.
create table adm.adm_2ndline_user_visit_1h_delta_hourly_2_hbase(
rowkey string,
line string
)
partitioned by (dt string,hour string)
row format delimited fields terminated by '31';
--31 is the decimal character code of the field delimiter, i.e. 0x1f (hex 1f = 1*16 + 15 = 31); it plays the same separator role between rowkey and record that '|' plays between fields in the source table.
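A quick way to double-check that decimal 31 really is the 0x1f (unit separator) byte, without touching the cluster:

echo -e "\x1f" | od -An -tx1   # prints 1f 0a (the delimiter byte plus the trailing newline)
printf '%d\n' 0x1f             # prints 31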
3. Insert data into the Hive external mapping table
set mapreduce.job.name=adm.adm_2ndline_user_visit_1h_delta_hourly_2_hbase_${SRC_FILE_DAY}_${SRC_FILE_HOUR};
set hive.merge.mapfiles=true;
set hive.merge.mapredfiles=true;
set hive.merge.size.per.task=256000000;
set hive.merge.smallfiles.avgsize=256000000;
alter table adm.adm_2ndline_user_visit_1h_delta_hourly_2_hbase drop if exists partition (dt='${SRC_FILE_DAY}',hour='${SRC_FILE_HOUR}');
--rebuild the rowkey to keep every row unique
insert overwrite table adm.adm_2ndline_user_visit_1h_delta_hourly_2_hbase partition(dt = '${SRC_FILE_DAY}',hour = '${SRC_FILE_HOUR}')
select
concat(reverse(serv_number), statis_day, substr(yanfa.mg_md5(reflect("java.util.UUID", "randomUUID")), 0, 6), substr(yanfa.mg_md5(reflect("java.util.UUID", "randomUUID")), -6)) as rowkey,
concat_ws('|', statis_day,search_time,serv_number,prov_id,region_id,node_id,sup_node_id,url_detail,sup_url_detail,client_id,chn_id,chn_id_source,cp_id,cp_name,node_type,net_type,term_type,gate_ip,session_id,page_id,term_prod_id,business_id,sub_busi_id,virt_busi_id,client_code) as line
from adm.adm_2ndline_user_visit_1h_delta_hourly
where dt = '${SRC_FILE_DAY}' and hour = '${SRC_FILE_HOUR}'
;
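Before handing the partition to ImportTsv, it is worth confirming that each output line has exactly one 0x1f byte separating rowkey from record, and that the record itself is '|'-delimited. A hedged check, assuming the staging table sits in the default warehouse location:

STAGING_DIR=/user/hive/warehouse/adm.db/adm_2ndline_user_visit_1h_delta_hourly_2_hbase/dt=${SRC_FILE_DAY}/hour=${SRC_FILE_HOUR}
hdfs dfs -cat ${STAGING_DIR}/0* | head -n 2 | cat -A
# cat -A renders the 0x1f separator as ^_ while the '|' between record fields stays visible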
4. Production execution script
#!/bin/bash
set -e
#load environment variables
. ~/.bashrc
umask 0000
#validate the argument count
if [[ $# -ne 4 ]]; then
echo "Usage: $0 <SRC_FILE_DAY> <SRC_FILE_HOUR> <HDFS_PATH> <HBASE_TABLE_NAME>"
exit 1
fi
#data period (day)
src_file_day=$1
#data period (hour)
src_file_hour=$2
#hdfs_path
hdfs_path=$3
#hbase_table_name
hbase_table=$4
echo "`date`:SRC_FILE_DAY=${src_file_day}"
echo "`date`:SRC_FILE_HOUR=${src_file_hour}"
echo "`date`:HDFS_PATH=${hdfs_path}"
echo "`date`:HBASE_TABLE_NAME=${hbase_table}"
if hdfs dfs -ls ${hdfs_path}/0* 1>/dev/null 2>&1;then
echo "`date`:HDFS file ${hdfs_path}/0* exists"
else
echo "`date`:HDFS file ${hdfs_path}/0* does not exist"
fi
#ImportTsv
echo "`date`: hbase org.apache.hadoop.hbase.mapreduce.ImportTsv -Dimporttsv.separator=\$(echo -e \"\x1f\") -Dimporttsv.columns='HBASE_ROW_KEY,info:record' ${hbase_table} ${hdfs_path}/0*"
#additional optional parameters
# -Dimporttsv.bulk.output=/user/hadoop/profile/output_${tablename_hbase} default:${tablename_hbase} hdfs://ip:8020/user/hadoop/profile/${tablename}.csv
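# Bulk-load variant (a sketch only, not used by this job): have ImportTsv write HFiles first,
# then hand them to HBase; the LoadIncrementalHFiles class name may differ across HBase versions.
#   hbase org.apache.hadoop.hbase.mapreduce.ImportTsv \
#     -Dimporttsv.separator=$(echo -e "\x1f") \
#     -Dimporttsv.columns='HBASE_ROW_KEY,info:record' \
#     -Dimporttsv.bulk.output=${hdfs_path}_hfiles ${hbase_table} ${hdfs_path}/0*
#   hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles ${hdfs_path}_hfiles ${hbase_table}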
hbase org.apache.hadoop.hbase.mapreduce.ImportTsv -Dimporttsv.separator=$(echo -e "\x1f") -Dimporttsv.columns='HBASE_ROW_KEY,info:record' ${hbase_table} ${hdfs_path}/0*
if [ "$?" -ne 0 ]; then
echo "`date` : End running, execution failed!"
exit 1
else
echo "`date` : End running, execution succeeded!"
exit 0
fi
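A hedged example of invoking the script (the script name, date, and paths below are placeholders; the fourth argument is the target HBase table):

sh hive2hbase_importtsv.sh 20190101 10 \
  /user/hive/warehouse/adm.db/adm_2ndline_user_visit_1h_delta_hourly_2_hbase/dt=20190101/hour=10 \
  adm_2ndline_user_visit_1h_delta_hourly
# afterwards, spot-check a few rows in hbase shell:
#   scan 'adm_2ndline_user_visit_1h_delta_hourly', {LIMIT => 3}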