Here is the approach I use to generate HFiles with MapReduce and then bulk load them into HBase. It takes 6 steps:
1. Generate the range partitioning
This step determines the number of reducers used later when generating the HFiles. For example, the SQL below produces 137556197/100/22000 = 62 records, so 63 reducers are used for HFile generation. (row_sequence() comes from hive-contrib's UDFRowSequence; the full script below registers it.)
select list_no,row_sequence() from (
select
list_no,
row_sequence() as row
from (
select
list_no
from user_info tablesample(bucket 1 out of 100 on list_no) s order by list_no
) t order by list_no
) x where (row % 22000)=0 order by list_no ;
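As a quick sanity check on those numbers, here is a minimal shell sketch of the same arithmetic (row count, sample fraction and modulus are the ones from the example above):
# Estimate the number of split records / reducers for the example above
rows=137556197      # rows in user_info
sample=100          # tablesample(bucket 1 out of 100 ...)
step=22000          # keep one sampled row out of every 22000: (row % 22000)=0
splits=$((rows / sample / step))
echo "split records: ${splits}, reducers: $((splits + 1))"   # 62 and 63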
2. Create the table that will hold the HFile data
This table writes its data out through HiveHFileOutputFormat:
CREATE EXTERNAL TABLE IF NOT EXISTS hbase_user_info(
list_no string,
asset_id double,
...
)
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.hbase.HiveHFileOutputFormat'
TBLPROPERTIES('hfile.family.path' = '/tmp-data/user/hbase/hbase_hfiles/user_info/f');
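The hfile.family.path property is where the HFiles end up, with one directory per column family (here f). After step 3 you can confirm that the directory was populated; a sketch, assuming the same path as above:
# List the generated HFiles for column family 'f'
hadoop fs -ls /tmp-data/user/hbase/hbase_hfiles/user_info/f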
3. Generate the HFiles
INSERT OVERWRITE TABLE hbase_user_info
SELECT * FROM user_info CLUSTER BY list_no;
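On its own this INSERT will not produce correctly partitioned HFiles; the session also needs the total-order-partitioner settings shown in the full script below. A minimal sketch, with the split-file location taken from this example and the jar paths left as placeholders (adjust for your environment):
# Settings that must accompany the INSERT in step 3 (see the full script below)
hive -e "
ADD JAR /software/hbase/hbase-0.94.11.jar;
ADD JAR /software/hive/lib/hive-hbase-handler-<version>.jar;
USE user_db;
SET mapred.reduce.tasks=63;
SET total.order.partitioner.path=/tmp-data/user/hbase/hbase_splits_file/table=user_info/000000_0;
SET hive.mapred.partitioner=org.apache.hadoop.mapred.lib.TotalOrderPartitioner;
INSERT OVERWRITE TABLE hbase_user_info
SELECT * FROM user_info CLUSTER BY list_no;
"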
4. Create the HBase table
echo "create 'user_info', {NAME => 'f', COMPRESSION => 'GZ'}" | hbase shell
5. Load the HFiles into HBase
hadoop jar /appcom/hbase/hbase-0.94.11.jar completebulkload /tmp-data/user/hbase/hbase_hfiles/user_info user_info
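completebulkload is backed by HBase's LoadIncrementalHFiles tool, so the same load can usually also be run through the hbase launcher; a sketch, assuming the 0.9x-era class name:
# Alternative bulk-load invocation (assumes the LoadIncrementalHFiles class shipped with HBase 0.9x)
hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles \
    /tmp-data/user/hbase/hbase_hfiles/user_info user_info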
6. Check that the table has data
echo "scan 'user_info', { LIMIT => 1 }" | hbase shell
Below is a shell script that combines the 6 steps above:
#!/bin/bash
hive_table=user_info
rowkey=id
hfile_path=/tmp-data/user/hbase/hbase_hfiles/
hbase_jar=$(ls /software/hbase/*.jar | grep -v test)
hive_contrib_jar=$(ls /software/hive/lib/hive-contrib*.jar | grep -v test)
hive_hbase_handler_jar=$(ls /software/hive/lib/hive-hbase-handler*.jar | grep -v test)
echo "##################################[step 1 generate splites]#####################################"
hive -e "
use user_db;
CREATE EXTERNAL TABLE IF NOT EXISTS hbase_splits(partition STRING, count int)
PARTITIONED BY (table STRING);
add jar ${hive_contrib_jar};
create temporary function row_sequence as 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';
INSERT OVERWRITE TABLE hbase_splits
PARTITION (table='${hive_table}')
select ${rowkey},row_sequence() from (
select
${rowkey},
row_sequence() as row
from (
select
${rowkey}
from ${hive_table} tablesample(bucket 1 out of 100 on ${rowkey}) s order by ${rowkey}
) t order by ${rowkey}
) x where (row % 22000)=0 order by ${rowkey} ;
CREATE EXTERNAL TABLE IF NOT EXISTS hbase_splits_file(partition STRING)
PARTITIONED BY (table STRING)
ROW FORMAT
SERDE 'org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveNullValueSequenceFileOutputFormat'
LOCATION '/tmp-data/user/hbase/hbase_splits_file';
INSERT OVERWRITE TABLE hbase_splits_file
PARTITION (table='${hive_table}')
select partition from hbase_splits where table='${hive_table}';
"
echo "##################################[step 2 create hfile table ]#####################################"
echo "DEBUG: table name is: "${hive_table}
sql_select_col="CREATE EXTERNAL TABLE IF NOT EXISTS hbase_${hive_table}("
desc_table_cols=$(hive -e "
use user_db;
desc ${hive_table};
")
desc_table_cols=$(echo "${desc_table_cols}" | grep -Ev "^$|^#.*$|^col_name.*$")
temp_file=`mktemp -u temp.user.XXXXXX.$$`
echo "$desc_table_cols" > ${temp_file}
while read line
do
lgt=$(expr length "${line}")
if [ ${lgt} -eq 0 ];
then
break
fi;
col_name=$(echo "${line}"|awk -F ' ' '{print $1}')
col_type=$(echo "${line}"|awk -F ' ' '{print $2}')
sql_select_col="${sql_select_col}${col_name} ${col_type},";
done < ${temp_file}
rm -rf ${temp_file}
len=$(expr length "${sql_select_col}")
let "len = len - 1"
sql_select_col=$(echo ${sql_select_col}|cut -c1-$len)
sql_select_col=${sql_select_col}") STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.hbase.HiveHFileOutputFormat' TBLPROPERTIES('hfile.family.path' = '${hfile_path}${hive_table}/f');"
echo "DEBUG: cols:"${sql_select_col}
hive -e "use user_db;
${sql_select_col};
"
echo "##################################[step 3 create hfile ]#####################################"
task_num=$(
hive -e "
use user_db;
select max(count) + 1 from hbase_splits where table='${hive_table}';
"
)
task_num_str=$(echo ${task_num})
num=$(echo "${task_num_str}" | awk '{print $2}')
echo ${num}
hive -e "
ADD JAR ${hbase_jar};
ADD JAR ${hive_hbase_handler_jar};
USE user_db;
SET mapred.reduce.tasks=${num};
SET total.order.partitioner.path=/tmp-data/user/hbase/hbase_splits_file/table=${hive_table}/000000_0;
SET hive.mapred.partitioner=org.apache.hadoop.mapred.lib.TotalOrderPartitioner;
set hive.optimize.sampling.orderby=true;
set hive.optimize.sampling.orderby.number=10000000;
set hive.optimize.sampling.orderby.percent=0.1f;
INSERT OVERWRITE TABLE hbase_${hive_table}
SELECT * FROM ${hive_table} CLUSTER BY ${rowkey};
"
status=$?
echo status=${status}
if [ ${status} -eq 0 ];
then
echo "##################################[step 4 create hbase table ]#####################################"
#create 'testtable', { NAME => 'colfam1', COMPRESSION => 'GZ' }
echo "create '${hive_table}', {NAME => 'f', COMPRESSION => 'GZ'}" | hbase shell
echo "##################################[step 5 move hfile to hbase ]#####################################"
hadoop jar ${hbase_jar} completebulkload ${hfile_path}${hive_table} ${hive_table}
echo "##################################[step 6 test ]#####################################"
echo "scan '${hive_table}', { LIMIT => 1 }" | hbase shell
else
echo "ERROR:@@@@@@ generate hfile error @@@@@@";
exit -1;
fi
In my experience, throughput is best when each reduce handles roughly 1-2 GB of data, which gives the following rule of thumb:
[Taking user_info as an example and assuming 1.5 GB per reduce: (591999730/100) / (63208973969/1024/1024/1024/1.5) = 150846, and this is the value that goes into step 1 as (row % 150846).]
Here 63208973969 is the table size in bytes obtained with hadoop fs -du, and 591999730 is the number of records.
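The same calculation as a small shell sketch (all numbers come from the user_info example above; target_gb is the assumed 1.5 GB per reduce):
# Derive the modulus for step 1 so that each reducer gets ~1.5 GB of data
bytes=63208973969    # table size from: hadoop fs -du
rows=591999730       # record count
sample=100           # tablesample(bucket 1 out of 100 ...)
target_gb=1.5
modulus=$(awk -v b="$bytes" -v r="$rows" -v s="$sample" -v g="$target_gb" \
  'BEGIN { printf "%d\n", (r / s) / (b / 1024 / 1024 / 1024 / g) }')
echo "use (row % ${modulus}) in step 1"   # prints 150846 for these numbers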
Problems you may run into:
a) Can't read partitions file: the partitioner property name has to match your Hadoop version; here SET mapreduce.totalorderpartitioner.path=/tmp/hbase_splits; had to be replaced with SET total.order.partitioner.path=/tmp/hbase_splits;, since the former name is the one Hadoop 2 uses;
b) java.io.IOException: Added a key not lexically larger than previous key: the rowkey probably contains duplicates;
c) "No files found..." in the reduce phase: the region covered by that range partition contains no data;
d) Wrong number of partitions in keyset: the number of entries in the range partitioning does not match the later SET mapred.reduce.tasks=43; the range partitioning should contain mapred.reduce.tasks - 1 entries (see the sketch after this list).
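A sketch of the consistency check behind item d), using the split table and database names from this example:
# The row count in hbase_splits for the table should equal mapred.reduce.tasks - 1
hive -e "
USE user_db;
SELECT count(*) FROM hbase_splits WHERE table='user_info';
"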