############################
# SQOOP 1.4.6 Installation #
############################
#Extract and rename
[root@single01 download]# tar -zxvf /opt/download/sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz -C /opt/software/
[root@single01 download]# cd /opt/software/
[root@single01 software]# mv sqoop-1.4.6.bin__hadoop-2.0.4-alpha/ sqoop-1.4.6
#Configure environment variables
[root@single01 conf]# vim /etc/profile.d/my.sh
#-----------------------------------------------------
#sqoop1.4.6
export SQOOP_HOME=/opt/software/sqoop-1.4.6
export PATH=$PATH:$SQOOP_HOME/bin
#--------------------------------------------
source /etc/profile
#Configure sqoop-env.sh
[root@single01 ~]# cd /opt/software/sqoop-1.4.6/lib/
[root@single01 lib]# pwd =>/opt/software/sqoop-1.4.6/lib
[root@single01 lib]# mv ../conf/sqoop-env-template.sh ../conf/sqoop-env.sh
[root@single01 sqoop-1.4.6]# echo $HADOOP_HOME =>/opt/software/hadoop313
[root@single01 sqoop-1.4.6]# echo $HIVE_HOME =>/opt/software/hive312
[root@single01 lib]# vim ../conf/sqoop-env.sh
#-------------------------------------------------------------
export HADOOP_COMMON_HOME=/opt/software/hadoop313   #set to the $HADOOP_HOME path
export HADOOP_MAPRED_HOME=/opt/software/hadoop313   #set to the $HADOOP_HOME path
#export HBASE_HOME=                                 #$HBASE_HOME path; HBase is not installed, leave commented out
export HIVE_HOME=/opt/software/hive312              #set to the $HIVE_HOME path
#export ZOOCFGDIR=                                  #ZooKeeper conf dir ($ZOOKEEPER_HOME); not installed, leave commented out
export LOGDIR=$SQOOP_HOME/logs                      #log directory
#---------------------------------------------------------------------
#Copy dependency jars
[root@single01 ~]# cd /opt/software/sqoop-1.4.6/lib/
#MySQL JDBC driver jar
cp /opt/software/hive312/lib/mysql-connector-java-5.1.47.jar ./
#three Hadoop jars
cp /opt/software/hadoop313/share/hadoop/common/hadoop-common-3.1.3.jar ./
cp /opt/software/hadoop313/share/hadoop/hdfs/hadoop-hdfs-3.1.3.jar ./
cp /opt/software/hadoop313/share/hadoop/mapreduce/hadoop-mapreduce-client-core-3.1.3.jar ./
#If installing Sqoop 1.4.7 instead:
#Exception: ERROR hive.HiveConfig: Could not load org.apache.hadoop.hive.conf.HiveConf.
#Fix: create a symlink so the jar shows up in sqoop's lib directory
ln -s /opt/software/hive312/lib/hive-exec-3.1.2.jar ./
#Exception: ERROR sqoop.Sqoop: Got exception running Sqoop: java.lang.NullPointerException
#           at org.json.JSONObject ...
#Fix: put java-json.jar into sqoop's lib directory
#Verify the jars are in place
ls | grep mysql  => mysql-connector-java-5.1.47.jar
ls |grep hadoop
#---------------------------------------------------------------
avro-mapred-1.7.5-hadoop2.jar
hadoop-common-3.1.3.jar
hadoop-hdfs-3.1.3.jar
hadoop-mapreduce-client-core-3.1.3.jar
kite-hadoop-compatibility-1.0.0.jar
parquet-hadoop-1.4.1.jar
#-------------------------------------------------------------
#Common commands
#Test the connection: list the MySQL databases
sqoop list-databases --connect jdbc:mysql://single01:3306 --username root --password ok
#hive => mysql
#  Direct export from a Hive table to an RDBMS table.
#  Not recommended in production: when the Hive table is large, or the RDBMS side has several
#  partitioned tables, there is no way to control the export at a fine-grained level.
#hive => hdfs => mysql (Hive data already lives on HDFS)
#  Exporting from Hive to HDFS first lets you select columns, transform fields and filter rows,
#  so the data on HDFS is "close to" or exactly what will end up in the RDBMS table.
#  Loading a small, pre-filtered data set from HDFS into the RDBMS is also faster.
#  (See the sketch after the export command below.)
#Exporting from the data warehouse
#hdfs -> mysql (the MySQL table must be created first, with types matching the data on HDFS)
#JDBC connection options first, then the MapReduce export options
sqoop export \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table score_kb16 \
--columns stu_name,stu_gender,java_score,mysql_score \
--export-dir /test/hive/kb16/kb16_scores.txt \
--fields-terminated-by ',';
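#A rough sketch of the two prerequisite steps described above: staging the Hive data to HDFS with only
#the needed fields, and pre-creating the MySQL target table. The source Hive table kb16.scores, the
#staging directory and the MySQL column types are assumptions for illustration, not taken from these notes.
#1) Hive -> HDFS: write only the fields to be exported, as CSV
hive -e "
insert overwrite directory '/test/hive/kb16_scores_stage'
row format delimited fields terminated by ','
select stu_name,stu_gender,java_score,mysql_score
from kb16.scores"
#2) Pre-create the MySQL target table with matching columns (column types are assumptions)
mysql -u root -pok -e "
create table if not exists test.score_kb16(
  stu_name    varchar(50),
  stu_gender  varchar(10),
  java_score  int,
  mysql_score int
)"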
#mysql ->hdfs
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--columns order_id,order_user_id,order_dt,order_money,order_status \
--where "order_dt between '2019-01-05' and '2019-01-10'" \
-m 1 \
--delete-target-dir \
--target-dir /test/hive/order_info \
--fields-terminated-by ',' ;
#--query runs in parallel mode, so --split-by must be specified; -m 2 means two map tasks
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--query "select order_id,order_user_id,order_dt,order_money,order_status from order_info where order_user_id<=1900 and \$CONDITIONS" \
-m 2 \
--split-by order_user_id \
--delete-target-dir \
--target-dir /test/hive/order_info2 \
--fields-terminated-by ',' ;
#mysql -> hive
#mysql> source /root/order_info.sql     (run the SQL file inside the mysql client to load the test data)
#mysql -> hdfs
#--incremental append only picks up newly inserted rows, it does not handle updates
#--table TABLE_NAME or --query SELECT_COMMAND
#--split-by together with -m controls the number of parallel map tasks
#--check-column plus --last-value form the incremental filter, e.g. --check-column id => where id > last-value
Full import
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--columns order_id,order_user_id,order_dt,order_money,order_status \
--delete-target-dir \
--fields-terminated-by ',' \
-m 2 \
--hive-import \
--create-hive-table \
--hive-database kb16 \
--hive-table full_order_info
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--columns id,order_id,order_user_id,order_dt,order_money,order_status \
--delete-target-dir \
--fields-terminated-by ',' \
-m 1 \
--hive-import \
--create-hive-table \
--hive-database kb16 \
--hive-table full_order_info2
Incremental (partitioned)
#For Hive imports, sqoop-1.4.7 supports --incremental append,
#but sqoop-1.4.6 does not:
#"Append mode for hive imports is not yet supported."
#Workaround: create a partitioned Hive table, add the partition manually, then load the data into that partition.
--incremental append|lastmodified
  append        keyed on a primary or unique key; captures newly inserted rows only (insert, not update)
  lastmodified  supports updates; aimed at date columns (date|datetime|timestamp); combine it with either
                --append (just add new rows) or --merge-key order_id (merge updated rows on order_id)
Incremental by id value
#mysql --> hive: append-mode increment keyed on id
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--columns id,order_id,order_user_id,order_dt,order_money,order_status \
--fields-terminated-by ',' \
-m 1 \
--incremental append \
--check-column id \
--last-value 79979 \
--hive-import \
--hive-database kb16 \
--hive-table full_order_info2
#(could also partition by id range, e.g. partitioned by(id_range int) with ranges such as 10000, 20000)
#mysql --> hdfs: append-mode increment on the numeric id (update --last-value for the next run)
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--columns id,order_id,order_user_id,order_dt,order_money,order_status \
--target-dir /test/hive/order_id_append \
--fields-terminated-by ',' \
-m 2 \
--split-by id \
--incremental append \
--check-column id \
--last-value 0
#799979
Incremental by date (time window)
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--columns order_id,order_user_id,order_dt,order_money,order_status \
--where "order_dt>'2019-06-21'"
--fields-terminated-by ',' \
-m 1 \
--incremental append \
--check-column order_dt \
--last-value '2019-06-21 21:41:22' \
--hive-import \
--hive-database kb16 \
--hive-table full_order_info
#mysql -> hdfs (lastmodified increment on order_dt, merging on id)
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--query "select order_id,order_user_id,cast(order_dt as datetime) as order_dt,order_money,order_status from order_info where order_user_id<=1000 and \$CONDITIONS" \
--fields-terminated-by ',' \
--target-dir /test/hive/order_dt_lastmodified \
-m 1 \
--incremental lastmodified \
--check-column order_dt \
--last-value '2019-01-01 00:00:00'
#With lastmodified, add exactly one of the following to the command:
#  --merge-key id   merge updated rows into the existing files
#  --append         simply append the new rows
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--query "select order_id,order_user_id,cast(order_dt as datetime) as order_dt,order_money,order_status from order_info where order_user_id<=1000 and \$CONDITIONS" \
--fields-terminated-by ',' \
--target-dir /test/hive/order_dt_lastmodified \
-m 1 \
--incremental lastmodified \
--check-column order_dt \
--last-value '2019-01-01 00:00:00' \
--merge-key id
create external table kb16.sqoop_order_info_par_cluster(
id bigint ,
order_id bigint ,
order_user_id bigint ,
order_dt string,
order_money string,
order_status int
)
partitioned by(ym string)
clustered by (id) sorted by (order_dt) into 4 buckets
row format delimited
fields terminated by ','
stored as textfile;
#1. Manually add a partition
alter table kb16.sqoop_order_info_par_cluster add partition (ym='2019-01');
#Drop a partition
alter table kb16.sqoop_order_info_par_cluster drop partition (ym='2019-03');
#Show partitions
show partitions kb16.sqoop_order_info_par_cluster partition(ym='2019-02');
#add_order_par_by_ym_sqoop_data.sh --hive kb16.table -mysql test.order_info -par 2019-03
#(a script that checks whether the month's partition exists, adds it if missing, then imports that month;
# see the sketch after the per-month commands below)
[root@single01 ~]# rst=`hive -e "show partitions kb16.sqoop_order_info_par_cluster partition(ym='2019-03')"`
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--where "date_format(order_dt,'%Y-%m')='2019-01'" \
-m 1 \
--fields-terminated-by ',' \
--delete-target-dir \
--target-dir /hive312/warehouse/kb16.db/sqoop_order_info_par_cluster/ym=2019-01
alter table kb16.sqoop_order_info_par_cluster add partition (ym='2019-02');
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--where "date_format(order_dt,'%Y-%m')='2019-02'" \
-m 1 \
--fields-terminated-by ',' \
--delete-target-dir \
--target-dir /hive312/warehouse/kb16.db/sqoop_order_info_par_cluster/ym=2019-02
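#A minimal sketch of the add_order_par_by_ym_sqoop_data.sh idea: check whether the month's partition
#exists, add it if missing, then import that month from MySQL into the partition directory. Argument
#handling is simplified to a single month parameter; the --hive/-mysql/-par options of the real script
#are only hinted at in the notes.
#--- add_order_par_by_ym_sqoop_data.sh (sketch) ---
#!/bin/bash
ym=${1:-2019-03}                                  #target month, e.g. 2019-03
hive_table=kb16.sqoop_order_info_par_cluster
hdfs_dir=/hive312/warehouse/kb16.db/sqoop_order_info_par_cluster/ym=${ym}
#add the partition only if it does not exist yet
rst=$(hive -e "show partitions ${hive_table} partition(ym='${ym}')")
if [ -z "${rst}" ]; then
  hive -e "alter table ${hive_table} add partition (ym='${ym}')"
fi
#load that month's rows from MySQL straight into the partition directory
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--where "date_format(order_dt,'%Y-%m')='${ym}'" \
-m 1 \
--fields-terminated-by ',' \
--delete-target-dir \
--target-dir ${hdfs_dir}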
serde
Sqoop saved jobs
#List jobs
sqoop job --list
#Delete a job
sqoop job --delete JOB_NAME
#Create a job
sqoop job --create JOB_NAME \
...
#Run a job
sqoop job --exec JOB_NAME
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--query "select order_id,order_user_id,cast(order_dt as datetime) as order_dt,order_money,order_status from order_info where order_user_id<=1000 and \$CONDITIONS" \
--fields-terminated-by ',' \
--target-dir /test/hive/order_dt_lastmodified \
-m 1 \
--incremental lastmodified \
--check-column order_dt \
--last-value '2019-01-01 00:00:00'
#With lastmodified, add exactly one of:
#  --merge-key id   merge updated rows
#  --append         append new rows only
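#Sketch: the import above wrapped in a saved job, so that Sqoop records the last-value after each run
#and reuses it on the next --exec. The job name inc_order_dt_job is an assumption; by default a saved
#job does not store the password and will prompt for it at execution time (--password-file avoids that).
sqoop job --create inc_order_dt_job \
-- import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--query "select order_id,order_user_id,cast(order_dt as datetime) as order_dt,order_money,order_status from order_info where order_user_id<=1000 and \$CONDITIONS" \
--fields-terminated-by ',' \
--target-dir /test/hive/order_dt_lastmodified \
-m 1 \
--incremental lastmodified \
--check-column order_dt \
--last-value '2019-01-01 00:00:00' \
--merge-key id
#run it
sqoop job --exec inc_order_dt_job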
Zipper table (slowly changing data)
Use case
A large amount of historical data + newly inserted data + a small amount of updates within a bounded time window (up to the time the data is pulled)
mysql -> hbase
hive -e "..."                               (run HiveQL non-interactively from the shell)
mysql -u root -pok -e "select count..."     (run SQL non-interactively from the shell)
shell script + crontab as the scheduler (see the sketch below)
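#Sketch of the shell + crontab scheduling idea: a wrapper script that runs the saved job every night and
#prints row counts on both sides as a quick check. The script path, job name and count queries are
#assumptions for illustration.
#--- /root/daily_order_etl.sh (hypothetical) ---
#!/bin/bash
dt=$(date -d '1 day ago' +%F)               #yesterday, e.g. 2019-01-02
sqoop job --exec inc_order_dt_job           #incremental pull (saved job from the sketch above)
mysql -u root -pok -e "select count(*) from test.order_info where date(order_dt)='${dt}'"
hive -e "select count(*) from kb16.hive_order where to_date(order_dt)='${dt}'"
#crontab entry: run every day at 01:00
#0 1 * * * /bin/bash /root/daily_order_etl.sh >> /root/daily_order_etl.log 2>&1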
mysql
drop table if exists mysql_order;
create table mysql_order(
id bigint auto_increment primary key,
order_id bigint not null,
user_id bigint not null,
order_dt datetime not null,
order_money decimal(10,2),
order_status int
);
insert into mysql_order(order_id,user_id,order_dt,order_money,order_status) values
(1,1,'2019-01-01 08:35:44',38.45,0),
(2,2,'2019-01-01 09:12:31',123.45,0),
(3,3,'2019-01-01 11:05:02',49.45,0),
(4,1,'2019-01-01 13:19:12',58.65,0),
(5,3,'2019-01-01 20:01:27',360.38,0),
(6,4,'2019-01-01 22:30:00',99.33,0),
(1,1,'2019-01-01 08:50:30',38.45,2),
(2,2,'2019-01-01 09:35:05',123.45,2),
(3,3,'2019-01-01 11:40:44',49.45,1),
(4,1,'2019-01-01 13:32:11',58.65,0);
insert into mysql_order(order_id,user_id,order_dt,order_money,order_status) values
(5,3,'2019-01-02 08:01:22',360.38,1),
(6,4,'2019-01-02 08:18:20',99.33,2),
(7,2,'2019-01-02 08:52:09',1200.00,0),
(8,4,'2019-01-02 09:35:05',560.00,0),
(1,1,'2019-01-02 12:22:33',38.45,3),
(9,5,'2019-01-02 23:45:10',32.00,0),
(7,2,'2019-01-02 09:20:22',1200.00,2),
(8,4,'2019-01-02 10:02:09',560.00,2);
hive
ods
#Full order-history table: kb16.hive_order
#mysql --> hive: copy the MySQL table structure into Hive (the matching directory is created on HDFS automatically)
sqoop create-hive-table \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table mysql_order \
--fields-terminated-by ',' \
--hive-table kb16.hive_order
#mysql --> hdfs: incremental (lastmodified) import keyed on order_dt
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table mysql_order \
--target-dir /hive312/warehouse/kb16.db/hive_order \
--incremental lastmodified \
--check-column order_dt \
--last-value '2019-01-01 00:00:00' \
--merge-key id \
-m 1
#Sqoop prints the upper bound it used (e.g. --last-value 2022-01-21 11:49:45.0);
#the maximum order_dt imported so far, 2019-01-01 22:30:00, becomes the next run's --last-value:
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table mysql_order \
--target-dir /hive312/warehouse/kb16.db/hive_order \
--incremental lastmodified \
--check-column order_dt \
--last-value '2019-01-01 22:30:00' \
--merge-key id \
-m 1
#Zipper table
#Table requirements
#Hive tables that are updated (UPDATE/MERGE) must meet these requirements:
#  (1) the table must be bucketed (clustered by ... into N buckets)
#  (2) the storage format must support ACID: currently only ORC (AcidOutputFormat); formats such as parquet are not yet supported
#  (3) the table must be created with tblproperties('transactional'='true')
#stored as orc tblproperties("transactional"="true");
Bulk-update syntax
#MERGE INTO
set hive.support.concurrency = true;
set hive.enforce.bucketing = true;
set hive.exec.dynamic.partition.mode = nonstrict;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
set hive.compactor.initiator.on = true;
set hive.compactor.worker.threads = 1;
set hive.auto.convert.join=false;
set hive.merge.cardinality.check=false;
create table kb16.zipper_hive_order(
order_id bigint,
user_id bigint,
order_create_dt timestamp,
order_modify_dt timestamp,
order_money decimal(10,2),
current_status int
)
clustered by(order_create_dt) into 2 buckets
row format delimited
fields terminated by ','
stored as orc tblproperties("transactional"="true");
#First collapse the day's rows into one row per order
select
order_id,user_id,order_money,
min(order_dt) as order_create_dt,
if(max(order_dt)==min(order_dt),'9999-12-31 00:00:00',max(order_dt)) as order_modify_dt,
max(order_status) as current_status
from kb16.hive_order
where to_date(order_dt)='2019-01-01'
group by order_id,user_id,order_money;
merge into kb16.zipper_hive_order as Z using (
select
order_id,user_id,order_money,
min(order_dt) as order_create_dt,
if(max(order_dt)==min(order_dt),'9999-12-31 00:00:00',max(order_dt)) as order_modify_dt,
max(order_status) as current_status
from kb16.hive_order
where to_date(order_dt)='2019-01-01'
group by order_id,user_id,order_money
) as O
on Z.order_id=O.order_id
when matched and O.current_status=1 then delete
when matched and O.current_status!=1 then update set order_modify_dt=O.order_modify_dt,current_status=O.current_status
when not matched then insert values(O.order_id,O.user_id,O.order_create_dt,O.order_modify_dt,O.order_money,O.current_status);
#For the next day, run the same merge with: where to_date(order_dt)='2019-01-02' (see the sketch below)
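#Sketch: re-running the merge for the next day from the shell, with the date as a variable. hive -e
#starts a fresh session, so the transactional set parameters above have to be repeated inside the script.
dt='2019-01-02'
hive -e "
set hive.support.concurrency=true;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.auto.convert.join=false;
set hive.merge.cardinality.check=false;
merge into kb16.zipper_hive_order as Z using (
  select order_id,user_id,order_money,
         min(order_dt) as order_create_dt,
         if(max(order_dt)==min(order_dt),'9999-12-31 00:00:00',max(order_dt)) as order_modify_dt,
         max(order_status) as current_status
  from kb16.hive_order
  where to_date(order_dt)='${dt}'
  group by order_id,user_id,order_money
) as O
on Z.order_id=O.order_id
when matched and O.current_status=1 then delete
when matched and O.current_status!=1 then update set order_modify_dt=O.order_modify_dt,current_status=O.current_status
when not matched then insert values(O.order_id,O.user_id,O.order_create_dt,O.order_modify_dt,O.order_money,O.current_status);"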
dwd
#RDBMS model -> dimension reduction (flatten/denormalize)
#Data warehouse modeling: star schema (most common), snowflake schema, constellation schema
#Fact tables and dimension tables (a small sketch follows)
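#A tiny sketch of the star-schema idea for the dwd layer: a fact table referencing a dimension table by
#key. Table and column names are invented for illustration only.
hive -e "
create table if not exists kb16.dim_user(
  user_id    bigint,
  user_name  string,
  gender     string
) stored as orc;
create table if not exists kb16.fact_order(
  order_id     bigint,
  user_id      bigint,        -- key into kb16.dim_user
  order_dt     timestamp,
  order_money  decimal(10,2),
  order_status int
) stored as orc;"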