Earlier we became familiar with the load data command, which loads (imports) a large amount of data into a Hive table in one step. Its syntax is:
load data [local] inpath 'filepath' [overwrite] into table tableName [partition (col1=val1, ...)]
Notes:
- With local, 'filepath' refers to the local filesystem and the file is copied into the table's directory; without it, 'filepath' refers to HDFS and the file is moved, so it disappears from its original location.
- With overwrite, existing data in the table (or partition) is replaced; otherwise the new data is appended.
- The optional partition clause loads the data into the specified partition of a partitioned table.
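The examples below assume that a users table already exists. A minimal sketch of its DDL (the table name comes from the transcripts; the column names and the tab delimiter are assumptions based on the sample files):
create table users (
    id int,        -- user id (assumed column name)
    name string    -- user name (assumed column name)
)
row format delimited fields terminated by '\t';  -- assumed delimiter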
【Example 1】 Loading data
(1) Prepare the data
[root@node3 ~]# cat data/user1.txt
1 qiang
2 root
3 test
[root@node3 ~]# cat data/user2.txt
4 aa
5 bb
6 cc
[root@node3 ~]#
Upload user2.txt to HDFS:
[root@node3 ~]# hdfs dfs -put data/user2.txt input
Confirm that user2.txt is now on HDFS:
[root@node3 ~]# hdfs dfs -ls input
Found 2 items
-rw-r--r-- 3 root supergroup 2287354 2017-08-05 05:30 input/mydata.txt
-rw-r--r-- 3 root supergroup 15 2017-08-13 05:25 input/user2.txt
[root@node3 ~]#
(2) Load a local file
hive> load data local inpath '/root/data/user1.txt' into table users;
Loading data to table default.users
OK
Time taken: 1.461 seconds
hive> select * from users;
OK
1 qiang
2 root
3 test
Time taken: 0.271 seconds, Fetched: 3 row(s)
hive>
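To see where the rows went, you can list the table's directory (the path below assumes the default database and the default warehouse location):
hive> dfs -ls /user/hive/warehouse/users;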
(3) Load from HDFS with overwrite, replacing the existing rows
hive> load data inpath 'input/user2.txt' overwrite into table users;
Loading data to table default.users
OK
Time taken: 0.822 seconds
hive> select * from users;
OK
4 aa
5 bb
6 cc
Time taken: 0.212 seconds, Fetched: 3 row(s)
hive>
Check HDFS: the user2.txt file is gone, because a load from HDFS moves the file into the table's directory instead of copying it.
hive> dfs -ls input;
Found 1 items
-rw-r--r-- 3 root supergroup 2287354 2017-08-05 05:30 input/mydata.txt
hive>
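Because a load from HDFS moves the source file, copy it first if you need to keep it; for example (input/user2.bak is a hypothetical destination):
[root@node3 ~]# hdfs dfs -cp input/user2.txt input/user2.bak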
Prepare three data files with the following contents. Note that date values must use the yyyy-MM-dd format.
[root@node1 ~]# cat data/emp10
7782,CLARK,MANAGER,7839,1981-06-09,2450,0,10
7839,KING,PRESIDENT,0,1981-11-17,5000,0,10
7934,MILLER,CLERK,7782,1982-01-23,1300,0,10
[root@node1 ~]# cat data/emp20
7369,SMITH,CLERK,7902,1980-12-17,800,0,20
7566,JONES,MANAGER,7839,1981-04-02,2975,0,20
7902,FORD,ANALYST,7566,1981-12-03,3000,0,20
[root@node1 ~]# cat data/emp30
7499,ALLEN,SALESMAN,7698,1981-02-20,1600,300,30
7521,WARD,SALESMAN,7698,1981-02-22,1250,500,30
7654,MARTIN,SALESMAN,7698,1981-09-28,1250,1400,30
7698,BLAKE,MANAGER,7839,1981-05-01,2850,0,30
7844,TURNER,SALESMAN,7698,1981-09-08,1500,0,30
7900,JAMES,CLERK,7698,1981-12-03,950,0,30
[root@node1 ~]#
Upload them to HDFS:
[root@node1 ~]# hdfs dfs -put data/emp10 input
[root@node1 ~]# hdfs dfs -put data/emp20 input
[root@node1 ~]# hdfs dfs -put data/emp30 input
Within the Hive CLI you sometimes need to inspect HDFS. There is no need to exit Hive; you can run the dfs command directly in the Hive session:
hive> dfs -ls input;
Found 4 items
-rw-r--r-- 3 root supergroup 133 2017-08-12 09:03 input/emp10
-rw-r--r-- 3 root supergroup 133 2017-08-12 09:03 input/emp20
-rw-r--r-- 3 root supergroup 280 2017-08-12 09:04 input/emp30
-rw-r--r-- 3 root supergroup 2287354 2017-08-05 05:30 input/mydata.txt
hive>
Load the data into the corresponding partitions:
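These load commands assume a partitioned table test.emp already exists. Based on the desc output shown later in this section, its DDL would be roughly the following sketch (the comma delimiter is inferred from the data files):
create table test.emp (
    eid int,
    ename string,
    job string,
    mgr int,
    hiredate date,
    sal double,
    comm double
)
partitioned by (did int)
row format delimited fields terminated by ',';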
hive> load data inpath 'input/emp10' into table test.emp partition(did=10);
Loading data to table test.emp partition (did=10)
OK
Time taken: 5.912 seconds
hive> load data inpath 'input/emp20' into table test.emp partition(did=20);
Loading data to table test.emp partition (did=20)
OK
Time taken: 3.536 seconds
hive> load data inpath 'input/emp30' into table test.emp partition(did=30);
Loading data to table test.emp partition (did=30)
OK
Time taken: 4.176 seconds
hive>
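Note that overwrite can also be applied per partition; for example, the following command (not run here) would replace only the did=10 partition:
load data inpath 'input/emp10' overwrite into table test.emp partition(did=10);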
hive> select * from test.emp;
OK
7782 CLARK MANAGER 7839 1981-06-09 2450.0 0.0 10
7839 KING PRESIDENT 0 1981-11-17 5000.0 0.0 10
7934 MILLER CLERK 7782 1982-01-23 1300.0 0.0 10
7369 SMITH CLERK 7902 1980-12-17 800.0 0.0 20
7566 JONES MANAGER 7839 1981-04-02 2975.0 0.0 20
7902 FORD ANALYST 7566 1981-12-03 3000.0 0.0 20
7499 ALLEN SALESMAN 7698 1981-02-20 1600.0 300.0 30
7521 WARD SALESMAN 7698 1981-02-22 1250.0 500.0 30
7654 MARTIN SALESMAN 7698 1981-09-28 1250.0 1400.0 30
7698 BLAKE MANAGER 7839 1981-05-01 2850.0 0.0 30
7844 TURNER SALESMAN 7698 1981-09-08 1500.0 0.0 30
7900 JAMES CLERK 7698 1981-12-03 950.0 0.0 30
Time taken: 1.881 seconds, Fetched: 12 row(s)
hive>
hive> dfs -ls /user/hive/warehouse/test.db;
Found 2 items
drwxr-xr-x - root supergroup 0 2017-08-12 09:08 /user/hive/warehouse/test.db/emp
drwxr-xr-x - root supergroup 0 2017-08-10 09:39 /user/hive/warehouse/test.db/tb1
hive> dfs -ls /user/hive/warehouse/test.db/emp;
Found 3 items
drwxr-xr-x - root supergroup 0 2017-08-12 09:08 /user/hive/warehouse/test.db/emp/did=10
drwxr-xr-x - root supergroup 0 2017-08-12 09:08 /user/hive/warehouse/test.db/emp/did=20
drwxr-xr-x - root supergroup 0 2017-08-12 09:08 /user/hive/warehouse/test.db/emp/did=30
hive>
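These partition directories correspond one-to-one to the table's partitions, which can also be listed directly:
hive> show partitions test.emp;
This should report did=10, did=20, and did=30.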
In addition to load data, a table can be populated from the result of a query.
(1) Copy table data, producing a regular (non-partitioned) table
hive> use default;
OK
Time taken: 0.035 seconds
hive> create table emp as select * from test.emp;
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
Query ID = root_20170816085512_041f0653-7e3c-4fd2-894c-a364f56a7462
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1502887685471_0002, Tracking URL = http://node1:8088/proxy/application_1502887685471_0002/
Kill Command = /opt/hadoop-2.7.3/bin/hadoop job -kill job_1502887685471_0002
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2017-08-16 08:55:28,942 Stage-1 map = 0%, reduce = 0%
2017-08-16 08:55:42,681 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 1.83 sec
MapReduce Total cumulative CPU time: 1 seconds 830 msec
Ended Job = job_1502887685471_0002
Stage-4 is selected by condition resolver.
Stage-3 is filtered out by condition resolver.
Stage-5 is filtered out by condition resolver.
Moving data to directory hdfs://cetc/user/hive/warehouse/.hive-staging_hive_2017-08-16_08-55-12_391_2156611965186432614-1/-ext-10002
Moving data to directory hdfs://cetc/user/hive/warehouse/emp
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Cumulative CPU: 1.83 sec HDFS Read: 5548 HDFS Write: 658 SUCCESS
Total MapReduce CPU Time Spent: 1 seconds 830 msec
OK
Time taken: 32.073 seconds
hive>
Examine the structure of the copied table emp: it is a regular table, not a partitioned one; the partition column did has become an ordinary column.
hive> desc emp;
OK
eid int
ename string
job string
mgr int
hiredate date
sal double
comm double
did int
Time taken: 0.113 seconds, Fetched: 8 row(s)
hive> select * from emp;
OK
7782 CLARK MANAGER 7839 1981-06-09 2450.0 0.0 10
7839 KING PRESIDENT 0 1981-11-17 5000.0 0.0 10
7934 MILLER CLERK 7782 1982-01-23 1300.0 0.0 10
7369 SMITH CLERK 7902 1980-12-17 800.0 0.0 20
7566 JONES MANAGER 7839 1981-04-02 2975.0 0.0 20
7902 FORD ANALYST 7566 1981-12-03 3000.0 0.0 20
7499 ALLEN SALESMAN 7698 1981-02-20 1600.0 300.0 30
7521 WARD SALESMAN 7698 1981-02-22 1250.0 500.0 30
7654 MARTIN SALESMAN 7698 1981-09-28 1250.0 1400.0 30
7698 BLAKE MANAGER 7839 1981-05-01 2850.0 0.0 30
7844 TURNER SALESMAN 7698 1981-09-08 1500.0 0.0 30
7900 JAMES CLERK 7698 1981-12-03 950.0 0.0 30
Time taken: 0.226 seconds, Fetched: 12 row(s)
hive>
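Because CTAS materializes the result of an arbitrary query, a subset can be copied just as easily; a hypothetical example (emp30 is an assumed table name):
create table emp30 as select * from test.emp where did = 30;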
(2) Copy only the table structure (create table ... like copies the schema, including the partition definition, but no data)
hive> create table empBak like test.emp;
OK
Time taken: 0.161 seconds
hive> desc test.emp;
OK
eid int
ename string
job string
mgr int
hiredate date
sal double
comm double
did int
# Partition Information
# col_name data_type comment
did int
Time taken: 0.077 seconds, Fetched: 13 row(s)
empBak has the same structure, including the partition definition, but contains no data:
hive> select * from empBak;
OK
Time taken: 0.203 seconds
hive>
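To fill the empty empBak, one option is a dynamic-partition insert; a minimal sketch, assuming dynamic partitioning is enabled by the two set commands:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table empBak partition(did)
select eid, ename, job, mgr, hiredate, sal, comm, did from test.emp;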
Table data can also be exported with insert overwrite [local] directory.
(1) Export to HDFS
hive> insert overwrite directory 'input/emp.bak'
> select * from test.emp;
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
Query ID = root_20170813093308_2bc4a189-000f-490b-8f9c-2451cfc86355
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1502614100973_0002, Tracking URL = http://node1:8088/proxy/application_1502614100973_0002/
Kill Command = /opt/hadoop-2.7.3/bin/hadoop job -kill job_1502614100973_0002
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2017-08-13 09:33:31,834 Stage-1 map = 0%, reduce = 0%
2017-08-13 09:33:45,724 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 2.14 sec
MapReduce Total cumulative CPU time: 2 seconds 140 msec
Ended Job = job_1502614100973_0002
Stage-3 is selected by condition resolver.
Stage-2 is filtered out by condition resolver.
Stage-4 is filtered out by condition resolver.
Moving data to directory hdfs://cetc/user/root/input/emp.bak/.hive-staging_hive_2017-08-13_09-33-08_728_3968927259689334332-1/-ext-10000
Moving data to directory input/emp.bak
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Cumulative CPU: 2.14 sec HDFS Read: 5515 HDFS Write: 590 SUCCESS
Total MapReduce CPU Time Spent: 2 seconds 140 msec
OK
Time taken: 39.39 seconds
hive>
hive> dfs -ls input;
Found 2 items
drwxr-xr-x - root supergroup 0 2017-08-13 09:33 input/emp.bak
-rw-r--r-- 3 root supergroup 2287354 2017-08-05 05:30 input/mydata.txt
hive> dfs -ls input/emp.bak;
Found 1 items
-rwxr-xr-x 3 root supergroup 590 2017-08-13 09:33 input/emp.bak/000000_0
The fields in the exported file appear run together because insert overwrite directory writes the default field delimiter \001 (Ctrl-A), which is non-printing:
hive> dfs -cat input/emp.bak/000000_0;
7782CLARKMANAGER78391981-06-092450.00.010
7839KINGPRESIDENT01981-11-175000.00.010
7934MILLERCLERK77821982-01-231300.00.010
7369SMITHCLERK79021980-12-17800.00.020
7566JONESMANAGER78391981-04-022975.00.020
7902FORDANALYST75661981-12-033000.00.020
7499ALLENSALESMAN76981981-02-201600.0300.030
7521WARDSALESMAN76981981-02-221250.0500.030
7654MARTINSALESMAN76981981-09-281250.01400.030
7698BLAKEMANAGER78391981-05-012850.00.030
7844TURNERSALESMAN76981981-09-081500.00.030
7900JAMESCLERK76981981-12-03950.00.030
hive>
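Starting with Hive 0.11, insert overwrite directory accepts a row format clause, so a readable delimiter can be specified; a sketch (input/emp.csv is an assumed output path):
insert overwrite directory 'input/emp.csv'
row format delimited fields terminated by ','
select * from test.emp;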
(2) Export to the local filesystem
hive> insert overwrite local directory '/root/data/emp.bak'
> select * from test.emp;
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
Query ID = root_20170813094410_5d7bbe80-9cc3-430a-b1bc-f22d9c8bd990
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1502614100973_0003, Tracking URL = http://node1:8088/proxy/application_1502614100973_0003/
Kill Command = /opt/hadoop-2.7.3/bin/hadoop job -kill job_1502614100973_0003
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2017-08-13 09:44:25,516 Stage-1 map = 0%, reduce = 0%
2017-08-13 09:44:36,939 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 2.01 sec
MapReduce Total cumulative CPU time: 2 seconds 10 msec
Ended Job = job_1502614100973_0003
Moving data to local directory /root/data/emp.bak
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Cumulative CPU: 2.01 sec HDFS Read: 5632 HDFS Write: 590 SUCCESS
Total MapReduce CPU Time Spent: 2 seconds 10 msec
OK
Time taken: 28.015 seconds
hive> quit;
[root@node3 ~]#
[root@node3 ~]# ls -A /root/data/emp.bak/
000000_0 .000000_0.crc
[root@node3 ~]# ls /root/data/emp.bak/
000000_0
[root@node3 ~]# cat /root/data/emp.bak/000000_0
7782CLARKMANAGER78391981-06-092450.00.010
7839KINGPRESIDENT01981-11-175000.00.010
7934MILLERCLERK77821982-01-231300.00.010
7369SMITHCLERK79021980-12-17800.00.020
7566JONESMANAGER78391981-04-022975.00.020
7902FORDANALYST75661981-12-033000.00.020
7499ALLENSALESMAN76981981-02-201600.0300.030
7521WARDSALESMAN76981981-02-221250.0500.030
7654MARTINSALESMAN76981981-09-281250.01400.030
7698BLAKEMANAGER78391981-05-012850.00.030
7844TURNERSALESMAN76981981-09-081500.00.030
7900JAMESCLERK76981981-12-03950.00.030
[root@node3 ~]#
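The local file uses the same non-printing \001 delimiter. To make the separators visible (GNU cat shows them as ^A), you can run, for example:
[root@node3 ~]# cat -A /root/data/emp.bak/000000_0 | head -1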