Sqoop installation and usage

############################
# SQOOP 1.4.6 installation  #
############################
    #Extract and rename
    [root@single01 download]# tar -zxvf /opt/download/sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz -C /opt/software/
    [root@single01 download]# cd /opt/software/
    [root@single01 software]# mv sqoop-1.4.6.bin__hadoop-2.0.4-alpha/ sqoop-1.4.6

    #Configure environment variables
    [root@single01 conf]# vim /etc/profile.d/my.sh
    #-----------------------------------------------------
    #sqoop1.4.6
    export SQOOP_HOME=/opt/software/sqoop-1.4.6
    export PATH=$PATH:$SQOOP_HOME/bin
    #--------------------------------------------
    source /etc/profile

    #sqoop-env.sh
        [root@single01 ~]# cd /opt/software/sqoop-1.4.6/lib/
        [root@single01 lib]# pwd    =>/opt/software/sqoop-1.4.6/lib
        [root@single01 lib]# mv ../conf/sqoop-env-template.sh ../conf/sqoop-env.sh
        [root@single01 sqoop-1.4.6]# echo $HADOOP_HOME    =>/opt/software/hadoop313
        [root@single01 sqoop-1.4.6]# echo $HIVE_HOME    =>/opt/software/hive312
        [root@single01 lib]# vim ../conf/sqoop-env.sh
        #-------------------------------------------------------------
        export HADOOP_COMMON_HOME=/opt/software/hadoop313    #set to your $HADOOP_HOME path
        export HADOOP_MAPRED_HOME=/opt/software/hadoop313    #set to your $HADOOP_HOME path
        #export HBASE_HOME=                                    #set to your $HBASE_HOME path; not installed, leave unchanged
        export HIVE_HOME=/opt/software/hive312                #set to your $HIVE_HOME path
        #export ZOOCFGDIR=                                    #set to your $ZOOKEEPER_HOME conf path; not installed, leave unchanged
        export LOGDIR=$SQOOP_HOME/logs                        #log directory
        #---------------------------------------------------------------------
    
    #Copy dependency jars
    [root@single01 ~]# cd /opt/software/sqoop-1.4.6/lib/
    #MySQL JDBC driver jar
    cp /opt/software/hive312/lib/mysql-connector-java-5.1.47.jar ./
    #the three Hadoop jars
    cp /opt/software/hadoop313/share/hadoop/common/hadoop-common-3.1.3.jar ./
    cp /opt/software/hadoop313/share/hadoop/hdfs/hadoop-hdfs-3.1.3.jar ./
    cp /opt/software/hadoop313/share/hadoop/mapreduce/hadoop-mapreduce-client-core-3.1.3.jar ./
    #If you install Sqoop 1.4.7 instead:
    #Exception: ERROR hive.HiveConfig: Could not load org.apache.hadoop.hive.conf.HiveConf.
    #Fix: add a symlink so the jar is visible in Sqoop's lib directory
    ln -s /opt/software/hive312/lib/hive-exec-3.1.2.jar ./
    #Exception: ERROR sqoop.Sqoop: Got exception running Sqoop: java.lang.NullPointerException
    #  at org.json.JSONObject.<init>(JSONObject.java:144)
    #Fix: copy java-json.jar into Sqoop's lib directory
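    #a minimal sketch of that copy, assuming java-json.jar was downloaded to /opt/download (adjust the source path to wherever the jar actually lives)
    cp /opt/download/java-json.jar /opt/software/sqoop-1.4.6/lib/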
    #Verify the jars are in place
    ls | grep mysql        =>mysql-connector-java-5.1.47.jar
    ls | grep hadoop
    #---------------------------------------------------------------
    avro-mapred-1.7.5-hadoop2.jar
    hadoop-common-3.1.3.jar
    hadoop-hdfs-3.1.3.jar
    hadoop-mapreduce-client-core-3.1.3.jar
    kite-hadoop-compatibility-1.0.0.jar
    parquet-hadoop-1.4.1.jar
    #-------------------------------------------------------------
    
    #Common commands
        #Test the connection by listing databases
        sqoop list-databases --connect jdbc:mysql://single01:3306 --username root --password ok

#hive=>mysql
    # Direct export from a Hive table to an RDBMS table
# Not recommended in production: when the Hive table is large, or the RDBMS side has several partitioned tables, there is no fine-grained control
#hive=>hdfs=>mysql (Hive data already lives on HDFS)
# Exporting from the Hive table to HDFS first lets you select columns, transform fields, and filter rows,
    # so the data on HDFS is "close to" or exactly the data that will later be loaded into the RDBMS table
# Loading from HDFS into the RDBMS then compares only a small dataset against the target table, which speeds up the export
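#a minimal sketch of the two-step path (hive=>hdfs=>mysql); the Hive table kb16.score_kb16, the staging directory /test/hive/kb16/score_export, and the filter are illustrative assumptions, not from the original notes
#step 1: Hive -> HDFS, shaping/filtering the data first
hive -e "insert overwrite directory '/test/hive/kb16/score_export'
         row format delimited fields terminated by ','
         select stu_name,stu_gender,java_score,mysql_score from kb16.score_kb16 where java_score is not null"
#step 2: HDFS -> MySQL
sqoop export \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table score_kb16 \
--columns stu_name,stu_gender,java_score,mysql_score \
--export-dir /test/hive/kb16/score_export \
--fields-terminated-by ','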

#Export from the data warehouse
    #hdfs->mysql (first create the table in MySQL to match the data types on HDFS)
    #JDBC options (--connect/--username/--password/--table/--columns) followed by the MapReduce side (--export-dir/--fields-terminated-by)
    sqoop export \
    --connect jdbc:mysql://single01:3306/test \
    --username root \
    --password ok \
    --table score_kb16 \
    --columns stu_name,stu_gender,java_score,mysql_score \
    --export-dir /test/hive/kb16/kb16_scores.txt \
    --fields-terminated-by ',';


#mysql ->hdfs
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--columns order_id,order_user_id,order_dt,order_money,order_status \
--where "order_dt between '2019-01-05' and '2019-01-10'" \
-m 1 \
--delete-target-dir \
--target-dir /test/hive/order_info \
--fields-terminated-by ',' ;

#Running with --query uses the parallel execution mode, so --split-by (the split column) must be specified; -m 2 means two map tasks
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--query "select order_id,order_user_id,order_dt,order_money,order_status from order_info where order_user_id<=1900 and \$CONDITIONS" \
-m 2 \
--split-by order_user_id \
--delete-target-dir \
--target-dir /test/hive/order_info2 \
--fields-terminated-by ',' ;

#mysql->hive
#mysql> source /root/order_info.sql    (run the SQL file inside the mysql client)
#mysql -> hdfs
#--incremental append only handles newly inserted rows, not updates
#--table TABLE_NAME or --query SELECT_COMMAND
#--split-by combined with -m sets the number of parallel map tasks
#--check-column and --last-value together form the incremental filter, e.g. --check-column sid: where sid > 5

Full load
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--columns order_id,order_user_id,order_dt,order_money,order_status \
--delete-target-dir \
--fields-terminated-by ',' \
-m 2 \
--hive-import \
--create-hive-table \
--hive-database kb16 \
--hive-table full_order_info 


sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--columns id,order_id,order_user_id,order_dt,order_money,order_status \
--delete-target-dir \
--fields-terminated-by ',' \
-m 1 \
--hive-import \
--create-hive-table \
--hive-database kb16 \
--hive-table full_order_info2 


Incremental (with partitions)
#For Hive imports, sqoop-1.4.7 supports incremental append,
#but sqoop-1.4.6 does not:
#"Append mode for hive imports is not yet supported."
#Workaround: create a partitioned table, add partitions manually, then load the data into those partitions (see the partitioned-table examples below)

--incremental append|lastmodified
    append        keyed on a primary/unique key; captures only newly inserted rows (insert only, no update)
    lastmodified  supports updates; keyed on a date column (date|datetime|timestamp); combine with --append or
    --merge-key order_id (merge updated rows on order_id)

Incremental by value
#mysql --> hive, append increment on id
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--columns id,order_id,order_user_id,order_dt,order_money,order_status \
--fields-terminated-by ',' \
-m 1 \
--incremental append \
--check-column id \
--last-value 79979 \
--hive-import \
--hive-database kb16 \
--hive-table full_order_info2


#partitioned by(id_range int) 10000 20000

#mysql --> hdfs, append increment on the numeric id column (update --last-value before the next run)
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--columns id,order_id,order_user_id,order_dt,order_money,order_status \
--target-dir /test/hive/order_id_append  \
--fields-terminated-by ',' \
-m 2 \
--split-by id \
--incremental append \
--check-column id \
--last-value 0
#799979


Incremental by date (time window)
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--columns order_id,order_user_id,order_dt,order_money,order_status \
--where "order_dt>'2019-06-21'"
--fields-terminated-by ',' \
-m 1 \
--incremental append \
--check-column order_dt \
--last-value '2019-06-21 21:41:22' \
--hive-import \
--hive-database kb16 \
--hive-table full_order_info


mysql->hdfs (lastmodified incremental on order_dt, merged on id)

sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--query "select order_id,order_user_id,cast(order_dt as datetime) as order_dt,order_money,order_status from order_info where order_user_id<=1000 and \$CONDITIONS" \
--fields-terminated-by ',' \
--target-dir /test/hive/order_dt_lastmodified \
-m 1 \
--incremental lastmodified \
--check-column order_dt \
--last-value '2019-01-01 00:00:00' \
--merge-key id
#with lastmodified you must pass either --merge-key <key-column> (merge updated rows) or --append (append only); choose one of the two

sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--query "select order_id,order_user_id,cast(order_dt as datetime) as order_dt,order_money,order_status from order_info where order_user_id<=1000 and \$CONDITIONS" \
--fields-terminated-by ',' \
--target-dir /test/hive/order_dt_lastmodified \
-m 1 \
--incremental lastmodified \
--check-column order_dt \
--last-value '2019-01-01 00:00:00' \
--merge-key id

create external table kb16.sqoop_order_info_par_cluster(
    id bigint ,
    order_id bigint ,
    order_user_id bigint ,
    order_dt string,
    order_money string,
    order_status int
)
partitioned by(ym string)
clustered by (id) sorted by (order_dt) into 4 buckets
row format delimited
fields terminated by ','
stored as textfile;

#1. Manually add a partition
alter table kb16.sqoop_order_info_par_cluster add partition (ym='2019-01');

#Drop a partition
alter table kb16.sqoop_order_info_par_cluster drop partition (ym='2019-03');

#Check whether a partition exists
show partitions kb16.sqoop_order_info_par_cluster partition(ym='2019-02');

#add_order_par_by_ym_sqoop_data.sh --hive kb16.table -mysql test.order_info -par 2019-03    (a sketch of such a wrapper script follows the per-month imports below)

[root@single01 ~]# rst=`hive -e "show partitions kb16.sqoop_order_info_par_cluster partition(ym='2019-03')"`


sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--where "date_format(order_dt,'%Y-%m')='2019-01'" \
-m 1 \
--fields-terminated-by ',' \
--delete-target-dir \
--target-dir /hive312/warehouse/kb16.db/sqoop_order_info_par_cluster/ym=2019-01 


alter table kb16.sqoop_order_info_par_cluster add partition (ym='2019-02');

sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--where "date_format(order_dt,'%Y-%m')='2019-02'" \
-m 1 \
--fields-terminated-by ',' \
--delete-target-dir \
--target-dir /hive312/warehouse/kb16.db/sqoop_order_info_par_cluster/ym=2019-02 
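#a minimal sketch of the wrapper script named above (add_order_par_by_ym_sqoop_data.sh), assuming hardcoded connection settings and a single positional month argument; paths and argument handling are illustrative only
#!/bin/bash
# usage: ./add_order_par_by_ym_sqoop_data.sh 2019-03   (month to load, format YYYY-MM)
ym=$1
tbl=kb16.sqoop_order_info_par_cluster
dir=/hive312/warehouse/kb16.db/sqoop_order_info_par_cluster/ym=$ym

# add the partition only if it does not exist yet
rst=`hive -e "show partitions $tbl partition(ym='$ym')"`
if [ -z "$rst" ]; then
  hive -e "alter table $tbl add partition (ym='$ym')"
fi

# pull that month's rows from MySQL straight into the partition directory
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--where "date_format(order_dt,'%Y-%m')='$ym'" \
-m 1 \
--fields-terminated-by ',' \
--delete-target-dir \
--target-dir $dir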

serde 

Sqoop job encapsulation
#list jobs
sqoop job --list
#delete a job
sqoop job --delete JOB_NAME
#create a job
sqoop job --create JOB_NAME \
...
#run a job
sqoop job --exec JOB_NAME

sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--query "select order_id,order_user_id,cast(order_dt as datetime) as order_dt,order_money,order_status from order_info where order_user_id<=1000 and \$CONDITIONS" \
--fields-terminated-by ',' \
--delete-target-dir \
--target-dir /test/hive/order_dt_lastmodified \
-m 1 \
--incremental lastmodified \
--check-column order_dt \
--last-value '2019-01-01 00:00:00' \
--merge-key id
#alternatively use --append instead of --merge-key for an append-only incremental run
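#a minimal sketch of wrapping an import like the one above into a reusable saved job; the job name lastmodified_order_job is illustrative (note the standalone -- before the tool name)
sqoop job --create lastmodified_order_job \
-- import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--target-dir /test/hive/order_dt_lastmodified \
--fields-terminated-by ',' \
-m 1 \
--incremental lastmodified \
--check-column order_dt \
--last-value '2019-01-01 00:00:00' \
--merge-key id
#then run it; a saved job records the last --last-value between runs
sqoop job --exec lastmodified_order_job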


Zipper table (slowly changing data)
    Use case:
    a large amount of historical data + newly inserted data + a small amount of updated data within a limited time window (up to the data-extraction time)
        
        
mysql->hbase
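#the notes only name this path; a minimal sketch, assuming HBase is installed and that the HBase table name order_info_hbase, column family cf, and row key id are acceptable (all HBase names here are illustrative)
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table order_info \
--columns id,order_id,order_user_id,order_dt,order_money,order_status \
--hbase-table order_info_hbase \
--column-family cf \
--hbase-row-key id \
--hbase-create-table \
-m 1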


hive -e "..."        #run HiveQL directly from the shell
mysql -u root -pok -e "select count..."        #run SQL directly from the shell

shell + crontab as the scheduling tools
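#a minimal sketch of scheduling the monthly load with crontab, assuming the wrapper script sketched earlier is saved at /root/add_order_par_by_ym_sqoop_data.sh; the schedule and log path are illustrative
#run at 01:00 on the 1st of every month, loading the previous month's partition (% must be escaped inside crontab)
0 1 1 * * /root/add_order_par_by_ym_sqoop_data.sh $(date -d "last month" +\%Y-\%m) >> /root/logs/order_par_import.log 2>&1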


mysql
truncate table mysql_order;
    create table mysql_order(
        id bigint auto_increment primary key,
        order_id bigint not null,
        user_id bigint not null,
        order_dt datetime not null,
        order_money decimal(10,2),
        order_status int 
    );

insert into mysql_order(order_id,user_id,order_dt,order_money,order_status) values
(1,1,'2019-01-01 08:35:44',38.45,0),
(2,2,'2019-01-01 09:12:31',123.45,0),
(3,3,'2019-01-01 11:05:02',49.45,0),
(4,1,'2019-01-01 13:19:12',58.65,0),
(5,3,'2019-01-01 20:01:27',360.38,0),
(6,4,'2019-01-01 22:30:00',99.33,0),
(1,1,'2019-01-01 08:50:30',38.45,2),
(2,2,'2019-01-01 09:35:05',123.45,2),
(3,3,'2019-01-01 11:40:44',49.45,1),
(4,1,'2019-01-01 13:32:11',58.65,0);

insert into mysql_order(order_id,user_id,order_dt,order_money,order_status) values
(5,3,'2019-01-02 08:01:22',360.38,1),
(6,4,'2019-01-02 08:18:20',99.33,2),
(7,2,'2019-01-02 08:52:09',1200.00,0),
(8,4,'2019-01-02 09:35:05',560.00,0),
(1,1,'2019-01-02 12:22:33',38.45,3),
(9,5,'2019-01-02 23:45:10',32.00,0),
(7,2,'2019-01-02 09:20:22',1200.00,2),
(8,4,'2019-01-02 10:02:09',560.00,2);

hive
    ods layer
#Full order-history table: kb16.hive_order
#mysql-->hive: copy the MySQL table structure into Hive (the matching directory is created automatically on HDFS)
sqoop create-hive-table \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table mysql_order \
--fields-terminated-by ',' \
--hive-table kb16.hive_order    


#mysql-->hdfs: incremental import on order_dt (lastmodified)
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table mysql_order \
--target-dir /hive312/warehouse/kb16.db/hive_order \
--incremental lastmodified \
--check-column order_dt \
--last-value '2019-01-01 00:00:00' \
--merge-key id \
-m 1
# --last-value 2022-01-21 11:49:45.0   
#2019-01-01 22:30:00


sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password ok \
--table mysql_order \
--target-dir /hive312/warehouse/kb16.db/hive_order \
--incremental lastmodified \
--check-column order_dt \
--last-value '2019-01-01 22:30:00' \
--merge-key id \
-m 1


#Zipper table
#Table requirements
#Hive has specific requirements for tables that use UPDATE:
#(1) the table must be created with buckets (clustered by)
#(2) the storage format must be ORC (ORCFileFormat / AcidOutputFormat); other formats such as Parquet are not yet supported
#(3) the table must be created with tblproperties('transactional'='true')
#stored as orc tblproperties("transactional"="true");

Bulk update (MERGE) syntax

 #MERGE INTO <target table> AS T USING <source table or query> AS S
 #ON <boolean expression1>
 #WHEN MATCHED [AND <boolean expression2>] THEN UPDATE SET <assignments>
 #WHEN MATCHED [AND <boolean expression3>] THEN DELETE
 #WHEN NOT MATCHED [AND <boolean expression4>] THEN INSERT VALUES <values>
 #Then compare the day's data against the history: update rows that already exist, insert rows that do not
drop table if exists kb16.zipper_hive_order;

set hive.support.concurrency = true;
set hive.enforce.bucketing = true;
set hive.exec.dynamic.partition.mode = nonstrict;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
set hive.compactor.initiator.on = true;
set hive.compactor.worker.threads = 1;
set hive.auto.convert.join=false;
set hive.merge.cardinality.check=false;

create table kb16.zipper_hive_order(
order_id bigint,
user_id bigint,
order_create_dt timestamp,
order_modify_dt timestamp,
order_money decimal(10,2),
current_status int 
)
clustered by(order_create_dt) into 2 buckets
row format delimited
fields terminated by ','
stored as orc tblproperties("transactional"="true");

#First merge/aggregate the current day's records
select 
order_id,user_id,order_money,
min(order_dt) as order_create_dt,
if(max(order_dt)==min(order_dt),'9999-12-31 00:00:00',max(order_dt)) as order_modify_dt,
max(order_status) as current_status
from kb16.hive_order
where to_date(order_dt)='2019-01-01'
group by order_id,user_id,order_money;


merge into kb16.zipper_hive_order as Z using (
select 
order_id,user_id,order_money,
min(order_dt) as order_create_dt,
if(max(order_dt)==min(order_dt),'9999-12-31 00:00:00',max(order_dt)) as order_modify_dt,
max(order_status) as current_status
from kb16.hive_order
where to_date(order_dt)='2019-01-01'
group by order_id,user_id,order_money
) as O
on Z.order_id=O.order_id 
when matched and O.current_status=1 then delete 
when matched and O.current_status!=1 then update set order_modify_dt=O.order_modify_dt,current_status=O.current_status
when not matched then insert values(O.order_id,O.user_id,O.order_create_dt,O.order_modify_dt,O.order_money,O.current_status);


#where to_date(order_dt)='2019-01-02'
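#a sketch of the day-2 run hinted at above: the same MERGE, only the date filter in the source subquery changes (assuming the day-1 merge has already been applied)
merge into kb16.zipper_hive_order as Z using (
select 
order_id,user_id,order_money,
min(order_dt) as order_create_dt,
if(max(order_dt)==min(order_dt),'9999-12-31 00:00:00',max(order_dt)) as order_modify_dt,
max(order_status) as current_status
from kb16.hive_order
where to_date(order_dt)='2019-01-02'
group by order_id,user_id,order_money
) as O
on Z.order_id=O.order_id 
when matched and O.current_status=1 then delete 
when matched and O.current_status!=1 then update set order_modify_dt=O.order_modify_dt,current_status=O.current_status
when not matched then insert values(O.order_id,O.user_id,O.order_create_dt,O.order_modify_dt,O.order_money,O.current_status);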

    dwd    
    

#RDBMS model -> denormalize (flatten dimensions)
#DATA WAREHOUSE modeling: [star], snowflake, constellation schemas
#fact tables, dimension tables
