转载请注明出处:http://blog.csdn.net/l1028386804/article/details/78291025
#创建人员信息表 person(String name,int age)
hive> create table person(name STRING,age INT)ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ESCAPED BY '\\' STORED AS TEXTFILE;
OK
Time taken: 0.541 seconds
#创建票价信息表 ticket(int age,float price)
hive> create table ticket(age INT,price FLOAT)ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ESCAPED BY '\\' STORED AS TEXTFILE;
OK
Time taken: 0.154 seconds
#创建本地数据文件
-rw-rw-r-- 1 hadoop hadoop 40 Feb 6 13:28 person.txt
-rw-rw-r-- 1 hadoop hadoop 45 Feb 6 13:28 ticket.txt
#将本地的数据文件load到hive数据仓库中
hive> LOAD DATA LOCAL INPATH '/home/hadoop/hfxdoc/person.txt' OVERWRITE INTO TABLE person;
Copying data from file:/home/hadoop/hfxdoc/person.txt
Copying file: file:/home/hadoop/hfxdoc/person.txt
Loading data to table default.person
Deleted hdfs://10.15.107.155:8000/user/hive/warehouse/person
OK
Time taken: 0.419 seconds
hive> LOAD DATA LOCAL INPATH '/home/hadoop/hfxdoc/ticket.txt' OVERWRITE INTO TABLE ticket;
Copying data from file:/home/hadoop/hfxdoc/ticket.txt
Copying file: file:/home/hadoop/hfxdoc/ticket.txt
Loading data to table default.ticket
Deleted hdfs://10.15.107.155:8000/user/hive/warehouse/ticket
OK
Time taken: 0.25 seconds
#load命令会将数据文件放到配置好的数据路径下:/user/hive/warehouse(使用LOCAL时是把本地文件复制过去;不使用LOCAL时则是移动HDFS上的文件)
hive> show tables;
hive> describe person
hive> select * from person;
OK
huang 26
lili 25
dongdong 13
wangxiao 5
Time taken: 0.092 seconds
hive>
#注意select *语句是不会编译成MapReduce程序的,所以很快。
hive> select * from person join ticket on person.age = ticket.age;
MapReduce Total cumulative CPU time: 5 seconds 510 msec
Ended Job = job_201301211420_0011
MapReduce Jobs Launched:
Job 0: Map: 2 Reduce: 1 Cumulative CPU: 5.51 sec HDFS Read: 519 HDFS Write: 71 SUCCESS
Total MapReduce CPU Time Spent: 5 seconds 510 msec
OK
wangxiao 5 5 10.0
dongdong 13 13 20.0
lili 25 25 30.0
huang 26 26 30.0
Time taken: 32.465 seconds
#这里查询语句被编译成MapReduce程序,在hadoop上执行
[hadoop@localhost hfxdoc]$ hadoop fs -mkdir /tmp/ticket
[hadoop@localhost hfxdoc]$ hadoop fs -put person.txt /tmp/ticket
[hadoop@localhost hfxdoc]$ hadoop fs -put ticket.txt /tmp/ticket
[hadoop@localhost hfxdoc]$ hadoop fs -ls /tmp/ticket
Found 2 items
-rw-r--r-- 1 hadoop supergroup 40 2013-02-06 13:45 /tmp/ticket/person.txt
-rw-r--r-- 1 hadoop supergroup 45 2013-02-06 13:45 /tmp/ticket/ticket.txt
create external table person_ext(name STRING,age INT)ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ESCAPED BY '\\' STORED AS TEXTFILE LOCATION '/tmp/ticket'
#LOCATION只能指定数据目录,而刚才这个目录下同时放了两个表的数据文件,这样创建出来的外部表可以吗?从下面的查询结果可以看到,该目录下所有文件都会被当作这个表的数据读入。
hive> select * from person_ext;
OK
huang 26
lili 25
dongdong 13
wangxiao 5
1 10
2 10
5 10
13 20
14 20
25 30
26 30
31 40
Time taken: 0.088 seconds
hive> drop table person_ext;
#Drop外表的操作不会删除元信息以外的数据,所以hdfs上还是存在数据文件
复杂类型的数据表,这里列之间以'\t'分割,数组元素之间以','分割
1 huangfengxiao beijing,shanghai,tianjin,hangzhou
2 linan changchu,chengdu,wuhan
hive> create table complex(name string,work_locations array<string>)
> ROW FORMAT DELIMITED
> FIELDS TERMINATED BY '\t'
> COLLECTION ITEMS TERMINATED BY ',';
hive> describe complex;
OK
name string
work_locations array<string>
hive> LOAD DATA LOCAL INPATH '/home/hadoop/hfxdoc/complex.txt' OVERWRITE INTO TABLE complex
hive> select * from complex;
OK
huangfengxiao ["beijing","shanghai","tianjin","hangzhou"]
linan ["changchu","chengdu","wuhan"]
Time taken: 0.125 seconds
hive> select name, work_locations[0] from complex;
MapReduce Total cumulative CPU time: 790 msec
Ended Job = job_201301211420_0012
MapReduce Jobs Launched:
Job 0: Map: 1 Cumulative CPU: 0.79 sec HDFS Read: 296 HDFS Write: 37 SUCCESS
Total MapReduce CPU Time Spent: 790 msec
OK
huangfengxiao beijing
linan changchu
Time taken: 20.703 seconds
#如何分区?
Mis li huangfengxiao 20
Mis li lijie 21
Mis li dongdong 21
Mis li liqiang 21
Mis li hemeng 21
Mr xu dingding 19
Mr xu wangqiang 19
Mr xu lidong 19
Mr xu hexing 19
如果我们将这个班级成员的数据按teacher来分区
create table classmem(student string,age int) partitioned by(teacher string)
分区文件
classmem_Misli.txt
huangfengxiao 20
lijie 21
dongdong 21
liqiang 21
hemeng 21
classmem_MrXu.txt
dingding 19
wangqiang 19
lidong 19
hexing 19
LOAD DATA LOCAL INPATH '/home/hadoop/hfxdoc/classmem_Misli.txt' INTO TABLE classmem partition (teacher = 'Mis.li')
LOAD DATA LOCAL INPATH '/home/hadoop/hfxdoc/classmem_MrXu.txt' INTO TABLE classmem partition (teacher = 'Mr.Xu')
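#分区列被默认到最后一列。注意:下面输出中的NULL很可能是因为建表时没有指定 ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t',Hive按默认分隔符(\001)解析,整行内容都进入了student列,age列因此为NULL。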
hive> select * from classmem where teacher = 'Mr.Xu';
OK
dingding 19 NULL Mr.Xu
wangqiang 19 NULL Mr.Xu
lidong 19 NULL Mr.Xu
hexing 19 NULL Mr.Xu
Time taken: 0.196 seconds
#直接从分区检索,加速;如果where子句的条件不是分区列,那么,这个sql将被编译成mapreduce程序,延时很大。
id name age
1 huang 11
2 li 11
3 xu 12
4 zhong 14
5 hu 15
6 liqiang 17
7 zhonghua 19
如果我们想将这个数据表切成3个桶,切片字段为id
create table bucketmem (id int,name string,age int) CLUSTERED BY (id) sorted by (id asc) into 3 buckets
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
LOAD DATA LOCAL INPATH '/home/hadoop/hfxdoc/bucketmem.txt' INTO TABLE bucketmem;
select * from bucketmem tablesample(bucket 1 out of 4)
#其他操作参考,更完整的请参考官网: https://cwiki.apache.org/confluence/display/Hive/Home
1) 创建与已有表结构相同的表 like:
create table test_like_table like test_bucket;
2) 对表进行重命名 rename to:
ALTER TABLE table_name RENAME TO new_table_name
3) 增加分区 Add Partitions:
ALTER TABLE table_name ADD partition_spec [ LOCATION 'location1' ] partition_spec [ LOCATION 'location2' ]
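4) 对表中的某一列进行修改,包括列的名称/列的数据类型/列的位置/列的注释
ALTER TABLE table_name CHANGE [COLUMN] col_old_name col_new_name column_type [COMMENT col_comment] [FIRST|AFTER column_name]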
5) 添加/替换列Add/ReplaceColumns
ALTER TABLE table_name ADD|REPLACE COLUMNS (col_name data_type [COMMENT col_comment], ...)
ADD COLUMNS 允许用户在当前列的末尾增加新的列,但是在分区列之前。
6) 创建表 Create Table:
Create [EXTERNAL] TABLE [IF NOT EXISTS] table_name
[(col_name data_type [COMMENT col_comment], ...)]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[ROW FORMAT row_format]
[STORED AS file_format]
[LOCATION hdfs_path]
7) 在hive中查看hdfs文件
>dfs -ls /user;