1 将async-1.4.0.jar 文件链接到hadoop-mapreduce 路径下面:
ln -s /opt/cloudera/parcels/CDH/jars/async-1.4.0.jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce
2 创建一个.csv 文件:
vi test.csv
1,name_1,21
2,name_2,22
3,name_3,33
3 将test.csv 放到hdfs 路径下面 :
hdfs dfs -ls
hdfs dfs -put test.csv /tmp/tempdata/    # 本地文件路径在前,HDFS目标路径在后(与第6步导入命令使用的 /tmp/tempdata/test.csv 对应)
4 创建 kudu 的库
impala-shell
-- List existing databases, create the tutorial database, and make it current.
SHOW DATABASES;
CREATE DATABASE test;
USE test;
5 创建KUDU表:
-- Kudu-backed table keyed on id. With no explicit HASH column list, Impala
-- hash-partitions on the primary key columns.
-- NOTE(review): if the Impala service has no global Kudu master configured,
-- this statement may also need TBLPROPERTIES ('kudu.master_addresses'=...) — confirm.
CREATE TABLE T_USER (
    id   BIGINT,
    name STRING,
    sex  STRING,
    PRIMARY KEY (id)
)
PARTITION BY HASH PARTITIONS 16
STORED AS KUDU;
-- Confirm the table was created and inspect its (initially empty) contents.
SHOW TABLES;
SELECT * FROM T_USER;
6 导入数据:
hadoop jar /opt/cloudera/parcels/CDH/jars/kudu-client-tools-1.6.0-cdh5.14.0.jar org.apache.kudu.mapreduce.tools.ImportCsv -Dkudu.master.addresses=cdh-02:7051 -Dimportcsv.skip.bad.lines=true '-Dimportcsv.separator=,' 'id,name,sex' impala::test.T_USER hdfs://cdh-01:8020/tmp/tempdata/test.csv
7 验证:
-- Confirm the ImportCsv job populated the table.
SELECT * FROM T_USER;
8 需要注意的:
8.1 test.T_USER 数据库.数据表
8.2 hdfs://cdh-01:8020/tmp/tempdata/test.csv 是以NameNode(cdh-01)为入口的HDFS文件路径
8.3 -Dkudu.master.addresses=cdh-02:7051 指定Kudu master服务的地址(并非提交作业的机器)
8.4 '-Dimportcsv.separator=,' 'id,name,sex' 以“,”为分隔符处理表字段
1 把文本文件(csv/tsv)中数据导入到impala 中,在impala中创建对应表
-- Staging table over delimited text files in the table's HDFS directory.
-- NOTE(review): the field delimiter is '\t' (tab) while the example file is
-- named data.csv — confirm the actual file's separator matches before loading.
CREATE TABLE test_temp (
    DR           STRING,
    I_A_BATCHNO  STRING,
    I_A_DATADATE STRING,
    TS           STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';
2 将文件上传到hdfs中:
hdfs dfs -put data.csv /user/hive/warehouse/caa_doc_duebill_test_temp
3 将hdfs中文件load到test_temp表中:
-- LOAD DATA INPATH moves (not copies) the staged HDFS file into the table.
LOAD DATA INPATH '/user/hive/warehouse/caa_doc_duebill_test_temp' INTO TABLE test_temp;
4 在impala-shell中创建kudu 表
-- Kudu-backed target table; DR is the primary key, so the default hash
-- partitioning applies to DR.
CREATE TABLE test_kudu (
    DR           STRING,
    I_A_BATCHNO  STRING,
    I_A_DATADATE STRING,
    TS           STRING,
    PRIMARY KEY (DR)
)
PARTITION BY HASH PARTITIONS 16
STORED AS KUDU;
5 将impala表中数据插入到kudu表中,在impala-shell中执行:
-- Copy the staged rows into the Kudu table. Columns are listed explicitly on
-- both sides (instead of SELECT *) so the statement keeps matching columns by
-- name and fails loudly — rather than silently misaligning — if either
-- table's schema changes later.
INSERT INTO test_kudu (DR, I_A_BATCHNO, I_A_DATADATE, TS)
SELECT DR, I_A_BATCHNO, I_A_DATADATE, TS
FROM test_temp;