LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 …)]
加载数据到表中时,Hive不做任何转换。加载操作只是把数据拷贝或移动操作,即移动数据文件到Hive表相应的位置。
加载的目标可以是一个表,也可以是一个分区。如果表是分区的,则必须通过指定所有分区列的值来指定一个表的分区。
filepath可以是一个文件,也可以是一个目录。不管什么情况下,filepath被认为是一个文件集合。
LOCAL:表示输入文件在本地文件系统(Linux),如果没有加LOCAL,hive则会去HDFS上查找该文件。
建表
1. hive> CREATE TABLE emp (
2. > empno int,
3. > ename string,
4. > job string,
5. > mgr int,
6. > hiredate string,
7. > salary double,
8. > comm double,
9. > deptno int
10. > ) ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";
11. OK
12. Time taken: 0.54 seconds
导入本地文本
1. LOAD DATA LOCAL INPATH '/home/hadoop/emp.txt' OVERWRITE INTO TABLE emp;
2. hive> select * from emp;
3. OK
4. 7369 SMITH CLERK 7902 1980/12/17 800.0 NULL 20
5. 7499 ALLEN SALESMAN 7698 1981/2/20 1600.0 300.0 30
6. 7521 WARD SALESMAN 7698 1981/2/22 1250.0 500.0 30
7. 7566 JONES MANAGER 7839 1981/4/2 2975.0 NULL 20
8. 7654 MARTIN SALESMAN 7698 1981/9/28 1250.0 1400.0 30
9. 7698 BLAKE MANAGER 7839 1981/5/1 2850.0 NULL 30
10. 7782 CLARK MANAGER 7839 1981/6/9 2450.0 NULL 10
11. 7788 SCOTT ANALYST 7566 1987/4/19 3000.0 NULL 20
12. 7839 KING PRESIDENT NULL 1981/11/17 5000.0 NULL 10
13. 7844 TURNER SALESMAN 7698 1981/9/8 1500.0 0.0 30
14. 7876 ADAMS CLERK 7788 1987/5/23 1100.0 NULL 20
15. 7900 JAMES CLERK 7698 1981/12/3 950.0 NULL 30
16. 7902 FORD ANALYST 7566 1981/12/3 3000.0 NULL 20
17. 7934 MILLER CLERK 7782 1982/1/23 1300.0 NULL 10
18. Time taken: 0.938 seconds, Fetched: 14 row(s)
导入分区
1. hive> load data local inpath '/home/hadoop/dept.txt' into table dept partition (dt='2018-09-09');
2. Loading data to table default.dept partition (dt=2018-09-09)
3. Partition default.dept{dt=2018-09-09} stats: [numFiles=1, totalSize=84]
4. OK
5. Time taken: 10.631 seconds
6. hive> select * form dept;
7. FAILED: ParseException line 1:9 missing EOF at 'form' near '*'
8. hive> select * from dept;
9. OK
10. 10 ACCOUNTING NEW YORK 2018-08-08
12. 20 RESEARCH DALLAS 2018-08-08
13. 30 SALES CHICAGO 2018-08-08
14. 40 OPERATIONS BOSTON 2018-08-08
15. 10 ACCOUNTING NEW YORK 2018-09-09
16. 20 RESEARCH DALLAS 2018-09-09
17. 30 SALES CHICAGO 2018-09-09
18. 40 OPERATIONS BOSTON 2018-09-09
19. Time taken: 1.385 seconds, Fetched: 8 row(s)
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 …) [IF NOT EXISTS]] select_statement1 FROM from_statement;
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 …)] select_statement1 FROM from_statement;
可以使用PARTITION 关键字,进行分区插入。
OVERWRITE是否选择覆盖。
使用插入语法会跑mr作业。
向emp1表中插入emp表
1. hive> insert overwrite table emp1 select * from emp;
2. Query ID = hadoop_20180109081212_d62e58f3-946c-465e-999d-2ddf0d76d807
3. Total jobs = 3
4. Launching Job 1 out of 3
5. hive> select * from emp1;
6. OK
7. 7369 SMITH CLERK 7902 1980/12/17 800.0 NULL 20
8. 7499 ALLEN SALESMAN 7698 1981/2/20 1600.0 300.0 30
9. 7521 WARD SALESMAN 7698 1981/2/22 1250.0 500.0 30
10. 7566 JONES MANAGER 7839 1981/4/2 2975.0 NULL 20
11. 7654 MARTIN SALESMAN 7698 1981/9/28 1250.0 1400.0 30
12. 7698 BLAKE MANAGER 7839 1981/5/1 2850.0 NULL 30
13. 7782 CLARK MANAGER 7839 1981/6/9 2450.0 NULL 10
14. 7788 SCOTT ANALYST 7566 1987/4/19 3000.0 NULL 20
15. 7839 KING PRESIDENT NULL 1981/11/17 5000.0 NULL 10
16. 7844 TURNER SALESMAN 7698 1981/9/8 1500.0 0.0 30
17. 7876 ADAMS CLERK 7788 1987/5/23 1100.0 NULL 20
18. 7900 JAMES CLERK 7698 1981/12/3 950.0 NULL 30
19. 7902 FORD ANALYST 7566 1981/12/3 3000.0 NULL 20
20. 7934 MILLER CLERK 7782 1982/1/23 1300.0 NULL 10
21. Time taken: 0.211 seconds, Fetched: 14 row(s)
按字段进行插入时,不要把字段顺序写错,否则插入时不会报错,但要查找数据时,数据查询不到
演示:
把job,ename顺序写错进行插入
1. hive> insert overwrite table emp2 select empno,job,ename,mgr,hiredate,salary,comm,deptno from emp;
再插入emp表
1. hive> insert into table emp2 select * from emp;
2. hive> select * from emp2;
3. OK
4. 7369 CLERK SMITH 7902 1980/12/17 800.0 NULL 20
5. 7499 SALESMAN ALLEN 7698 1981/2/20 1600.0 300.0 30
6. 7521 SALESMAN WARD 7698 1981/2/22 1250.0 500.0 30
7. 7566 MANAGER JONES 7839 1981/4/2 2975.0 NULL 20
8. 7654 SALESMAN MARTIN 7698 1981/9/28 1250.0 1400.0 30
9. 7698 MANAGER BLAKE 7839 1981/5/1 2850.0 NULL 30
10. 7782 MANAGER CLARK 7839 1981/6/9 2450.0 NULL 10
11. 7788 ANALYST SCOTT 7566 1987/4/19 3000.0 NULL 20
12. 7839 PRESIDENT KING NULL 1981/11/17 5000.0 NULL 10
13. 7844 SALESMAN TURNER 7698 1981/9/8 1500.0 0.0 30
14. 7876 CLERK ADAMS 7788 1987/5/23 1100.0 NULL 20
15. 7900 CLERK JAMES 7698 1981/12/3 950.0 NULL 30
16. 7902 ANALYST FORD 7566 1981/12/3 3000.0 NULL 20
17. 7934 CLERK MILLER 7782 1982/1/23 1300.0 NULL 10
18. 7369 SMITH CLERK 7902 1980/12/17 800.0 NULL 20
19. 7499 ALLEN SALESMAN 7698 1981/2/20 1600.0 300.0 30
20. 7521 WARD SALESMAN 7698 1981/2/22 1250.0 500.0 30
21. 7566 JONES MANAGER 7839 1981/4/2 2975.0 NULL 20
22. 7654 MARTIN SALESMAN 7698 1981/9/28 1250.0 1400.0 30
23. 7698 BLAKE MANAGER 7839 1981/5/1 2850.0 NULL 30
24. 7782 CLARK MANAGER 7839 1981/6/9 2450.0 NULL 10
25. 7788 SCOTT ANALYST 7566 1987/4/19 3000.0 NULL 20
26. 7839 KING PRESIDENT NULL 1981/11/17 5000.0 NULL 10
27. 7844 TURNER SALESMAN 7698 1981/9/8 1500.0 0.0 30
28. 7876 ADAMS CLERK 7788 1987/5/23 1100.0 NULL 20
29. 7900 JAMES CLERK 7698 1981/12/3 950.0 NULL 30
30. 7902 FORD ANALYST 7566 1981/12/3 3000.0 NULL 20
31. 7934 MILLER CLERK 7782 1982/1/23 1300.0 NULL 10
32. Time taken: 2.363 seconds, Fetched: 28 row(s)
查询结果前14条记录job,ename是反的,可以成功插入,但是在查询相关数据时查询不到结果
插入多行值(INSERT ... VALUES)可以一次插入多条记录,其实质就是简化的标准插入。
INSERT INTO TABLE tablename [PARTITION (partcol1[=val1], partcol2[=val2] …)] VALUES values_row [, values_row …]
1. hive> create table stu(
2. > id int,
3. > name string
4. > )
5. ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
6. OK
7. Time taken: 0.405 seconds
8. hive> select * from stu;
9. OK
10. hive> insert into table stu values(1,'zhangsan'),(2,'lisi');
11. hive> select * from stu;
12. OK
13. 1 zhangsan
14. 2 lisi
标准导出
INSERT OVERWRITE [LOCAL] DIRECTORY directory1
[ROW FORMAT row_format] [STORED AS file_format] (Note: Only available starting with Hive 0.11.0)
SELECT … FROM …
LOCAL:加上LOCAL关键字代表导出到本地文件系统,不加则默认导出到HDFS;
STORED AS:可以指定存储格式。
1. hive> insert overwrite local directory '/home/hadoop/data' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select * from stu;
2. [hadoop@zydatahadoop001 data]$ pwd
3. /home/hadoop/data
4. [hadoop@zydatahadoop001 data]$ cat 000000_0
5. 1 zhangsan
6. 2 lisi
多条导出
FROM from_statement
INSERT OVERWRITE [LOCAL] DIRECTORY directory1 select_statement1
[INSERT OVERWRITE [LOCAL] DIRECTORY directory2 select_statement2] …
1. hive> from emp
2. > INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/tmp/hivetmp1'
3. > ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
4. > select empno, ename
5. > INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/tmp/hivetmp2'
6. > ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
7. > select ename;
8. [hadoop@zydatahadoop001 tmp]$ pwd
9. /home/hadoop/tmp
10. [hadoop@zydatahadoop001 tmp]$ cat hivetmp1/000000_0
11. 7369 SMITH
12. 7499 ALLEN
13. 7521 WARD
14. 7566 JONES
15. 7654 MARTIN
16. 7698 BLAKE
17. 7782 CLARK
18. 7788 SCOTT
19. 7839 KING
20. 7844 TURNER
21. 7876 ADAMS
22. 7900 JAMES
23. 7902 FORD
24. 7934 MILLER
25. [hadoop@zydatahadoop001 tmp]$ cat hivetmp2/000000_0
26. SMITH
27. ALLEN
28. WARD
29. JONES
30. MARTIN
31. BLAKE
32. CLARK
33. SCOTT
34. KING
35. TURNER
36. ADAMS
37. JAMES
38. FORD
39. MILLER
查询员工表deptno=10的员工
1. hive> select * from emp where deptno=10;
2. OK
3. 7782 CLARK MANAGER 7839 1981/6/9 2450.0 NULL 10
4. 7839 KING PRESIDENT NULL 1981/11/17 5000.0 NULL 10
5. 7934 MILLER CLERK 7782 1982/1/23 1300.0 NULL 10
6. Time taken: 1.144 seconds, Fetched: 3 row(s)
查询员工编号小于等于7800的员工
1. hive> select * from emp where empno <= 7800;
查询员工工资在1000到1500之间(含两端)的员工
1. hive> select * from emp where salary between 1000 and 1500;
查询前5条记录
1. hive> select * from emp limit 5;
查询编号为7566或7499的员工
1. hive> select * from emp where empno in(7566,7499);
查询津贴(comm)不为空的员工
1. hive> select * from emp where comm is not null;
来自@若泽大数据