安装注意事项
#Set path to where bin/hadoop is available
export HADOOP_COMMON_HOME=/app/bigdata/hadoop
#Set path to where hadoop-*-core.jar is available
export HADOOP_MAPRED_HOME=/app/bigdata/hadoop
#set the path to where bin/hbase is available
export HBASE_HOME=/app/bigdata/hbase
#Set the path to where bin/hive is available
export HIVE_HOME=/app/bigdata/hive
#Set the path to where the zookeeper config dir is
export ZOOCFGDIR=/app/bigdata/zookeeper
连接mysql,前提是mysql允许远程登录,不允许的话需要授权:
--GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' IDENTIFIED BY 'root' WITH GRANT OPTION;
#查询指定jdbc的所有库
./sqoop-list-databases --connect jdbc:mysql://192.168.10.107:3306 --username root --password root
#查询指定库xlhdw的所有表
./sqoop-list-tables --connect jdbc:mysql://192.168.10.107:3306/xlhdw --username root --password root
注意事项:
sqoop必须运行在yarn上
案例详解
==================IMPORT 导入到hive=================
/** 将mysql的表导入 hdfs文件系统 **/
sqoop import \
--connect jdbc:mysql://192.168.10.107:3306/userdb \
--username root \
--password root \
--target-dir \
/sqooptest/emp \
--fields-terminated-by ',' \
--table emp \
--split-by id \
--m 2
结果数据:
[root@COLBY-NN-101 bigdata]# hdfs dfs -cat /sqooptest/emp/*
1201,gopal,manager,50000,TP
1202,manisha,Proof reader,50000,TP
1203,khalil,php dev,30000,AC
1204,prasanth,php dev,30000,AC
1205,kranthi,admin,20000,TP
/** 将mysql的表导入 hive表,sqoop会自动创建表结构 **/
sqoop import \
--connect jdbc:mysql://192.168.10.107:3306/userdb \
--username root \
--password root \
--hive-import \
--fields-terminated-by ',' \
--table emp \
--split-by id \
--m 2
结果数据:
hive> select * from emp;
OK
1201 gopal manager 50000 TP
1202 manisha Proof reader 50000 TP
1203 khalil php dev 30000 AC
1204 prasanth php dev 30000 AC
1205 kranthi admin 20000 TP
Time taken: 1.586 seconds, Fetched: 5 row(s)
sqoop import \
--connect jdbc:mysql://192.168.10.107:3306/xlhdw \
--username root \
--password root \
--hive-import \
--fields-terminated-by ',' \
--table cdm_ent_dto_business_change_d \
--split-by reg_credit_no \
--m 2
/** 将mysql的表的增量数据导入 hdfs **/
sqoop import \
--connect jdbc:mysql://192.168.10.107:3306/userdb \
--target-dir /sqooptest \
--username root \
--password root \
--table emp \
--m 1 \
--incremental append \
--check-column id \
--last-value 1205
========================条件导入=================
/** 将mysql的表的条件数据导入到hive **/
sqoop import \
--connect jdbc:mysql://192.168.10.107:3306/userdb \
--username root \
--password root \
--where "id='1201'" \
--hive-import \
--fields-terminated-by ',' \
--table emp \
--split-by id \
--m 2
/** 按需求条件导入从mysql到hdfs **/
sqoop import \
--connect jdbc:mysql://192.168.10.107:3306/userdb \
--username root \
--password root \
--target-dir /wherequery2 \
--query 'select id,name,deg from emp WHERE id<1207 and $CONDITIONS' \
--split-by id \
--fields-terminated-by '\t' \
--m 2
===============增量导入======================
增量导入是仅导入新添加的表中的行的技术。
sqoop支持两种增量MySql导入到hive的模式,
一种是append,即通过指定一个递增的列,比如:
--incremental append --check-column num_id --last-value 0
另一种是可以根据时间戳,比如:
--incremental lastmodified --check-column created --last-value '2012-02-01 11:00:00'
就是只导入created 比'2012-02-01 11:00:00'更大的数据
1、append模式
它需要添加‘incremental’, ‘check-column’, 和 ‘last-value’选项来执行增量导入。
下面的语法用于Sqoop导入命令增量选项。
--incremental
--check-column
--last-value 上次导到哪个值了
/* 向emp表增量导入数据,指定emp的目录即可*/
sqoop import \
--connect jdbc:mysql://192.168.10.107:3306/userdb \
--username root \
--password root \
--target-dir /user/hive/warehouse/emp \
--table emp --m 1 \
--incremental append \
--check-column id \
--last-value 1205
=====================EXPORT 导出到mysql==============
/** 将hdfs的文件数据导出到mysql **/
bin/sqoop export \
--connect jdbc:mysql://192.168.10.107:3306/userdb \
--username root \
--password root \
--input-fields-terminated-by ',' \
--table emp \
--export-dir /sqooptest/
/** 将hive的表数据(hdfs的文件)导出到mysql **/
注意指定输入文件的分隔符,而且不能直接指定hive表名,而是指定表名所在的路径
sqoop export \
--connect jdbc:mysql://192.168.10.107:3306/xlhdw \
--username root \
--password root \
--input-fields-terminated-by ',' \
--table cdm_ent_dto_business_change_d \
--export-dir /user/hive/warehouse/cdm_ent_dto_business_change_d/
--TODO -----
将app数据仓库中的 日新用户维度统计报表:dim_user_new_day 导出到mysql的表中去
----------------------------------------
create table dim_user_new_day(os_name string,city string,release_channel string,app_ver_name string,cnts int)
partitioned by (day string, dim string);
-- 2 日新维度统计报表sql开发(利用多重插入语法)
from etl_user_new_day
insert into table dim_user_new_day partition(day='2017-09-21',dim='0000')
select 'all','all','all','all',count(1)
where day='2017-09-21'
insert into table dim_user_new_day partition(day='2017-09-21',dim='0001')
select 'all','all','all',app_ver_name,count(1)
where day='2017-09-21'
group by app_ver_name
insert into table dim_user_new_day partition(day='2017-09-21',dim='0010')
select 'all','all',release_channel,'all',count(1)
where day='2017-09-21'
group by release_channel
insert into table dim_user_new_day partition(day='2017-09-21',dim='0011')
select 'all','all',release_channel,app_ver_name,count(1)
where day='2017-09-21'
group by release_channel,app_ver_name
insert into table dim_user_new_day partition(day='2017-09-21',dim='0100')
select 'all',city,'all','all',count(1)
where day='2017-09-21'
group by city
----------------------------------------
-- 1 在mysql中建库建表
create database app;
create table dim_user_new_day(
os_name varchar(20),city varchar(20),release_channel varchar(20),app_ver_name varchar(20),cnts int,dt varchar(20)
);
--注意:将库和表的编码集改成utf8,命令如下:
修改库的编码:
mysql> alter database db_name character set utf8;
修改表的编码:
mysql> ALTER TABLE table_name CONVERT TO CHARACTER SET utf8 COLLATE utf8_general_ci;
-- 2 用sqoop将hive中的 dim_user_new_day 中的指定日分区的数据导出到mysql 的dim_user_new_day
sqoop导出hive多级分区表会出现异常,如下:
“Error: java.io.IOException: Can't export data, please check failed map task logs”
解决方式是:将数据存储为单分区,再导出到mysql或者其他RDBMS
命令模式:注意分隔符的使用
sqoop export \
--connect "jdbc:mysql://192.168.10.107:3306/app?useUnicode=true&characterEncoding=utf-8" \
--username root \
--password root \
--input-fields-terminated-by '\001' \
--table dim_user_new_day \
--export-dir /user/hive/warehouse/dim_user_new_day/day=2017-09-21/dim*/*
-----脚本模式
#!/bin/bash
# Export one day's partition directory of the Hive table dim_user_new_day_1p
# (under /user/hive/warehouse/app.db) into the MySQL table dim_user_new_day
# of database `app` on 192.168.10.107.

# Yesterday's date as YYYY-MM-DD; it selects the day=<date> partition directory.
day=$(date -d '-1 day' +'%Y-%m-%d')

# '\001' is passed as the input field delimiter (presumably Hive's default
# ^A separator -- confirm against the table's DDL).
# The export path is quoted and written as a single word: a stray space before
# the trailing slash would hand "/" to sqoop as an extra positional argument.
/root/apps/sqoop/bin/sqoop export \
--connect "jdbc:mysql://192.168.10.107:3306/app?useUnicode=true&characterEncoding=utf-8" \
--username root \
--password root \
--input-fields-terminated-by '\001' \
--table dim_user_new_day \
--export-dir "/user/hive/warehouse/app.db/dim_user_new_day_1p/day=${day}/"