版本:cdh5.0.0+hadoop2.3.0+hive0.12
一、原始数据:
1. 本地数据
[root@node33 data]# ll total 12936 -rw-r--r--. 1 root root 13245467 May 1 17:08 hbase-data.csv [root@node33 data]# head -n 3 hbase-data.csv 1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0,0,1 2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0,0,1 3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0,0,1
2. hdfs数据:
[root@node33 data]# hadoop fs -ls /input Found 1 items -rwxrwxrwx 1 hdfs supergroup 13245467 2014-05-01 17:09 /input/hbase-data.csv [root@node33 data]# hadoop fs -cat /input/* | head -n 3 1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0,0,1 2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0,0,1 3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0,0,1
二、创建hive表:
1.hive外部表:
[root@node33 hive]# cat employees_ext.sql create external table if not exists employees_ext( id int, x1 float, x2 float, x3 float, x4 float, x5 float, x6 float, x7 float, x8 float, x9 float, y int) row format delimited fields terminated by ',' location '/input/'
创建表,客户端运行 :hive -f employees_ext.sql
2. hive表
[root@node33 hive]# cat employees.sql create table employees( id int, x1 float, x2 float, x3 float, x4 float, x5 float, x6 float, x7 float, x8 float, x9 float ) partitioned by (y int);
创建表,客户端运行:hive -f employees.sql
3. hive表(orc方式存储)
[root@node33 hive]# cat employees_orc.sql create table employees_orc( id int, x1 float, x2 float, x3 float, x4 float, x5 float, x6 float, x7 float, x8 float, x9 float ) partitioned by (y int) row format serde "org.apache.hadoop.hive.ql.io.orc.OrcSerde" stored as orc;
运行:hive -f employees_orc.sql
三、导入数据:
1. employees_ext 表导入employees表:
[root@node33 hive]# cat employees_ext-to-employees.sql set hive.exec.dynamic.partition=true; set hive.exec.dynamic.partition.mode=nonstrict; set hive.eec.max.dynamic.partitions.pernode=1000; insert overwrite table employees partition(y) select emp_ext.id, emp_ext.x1, emp_ext.x2, emp_ext.x3, emp_ext.x4, emp_ext.x5, emp_ext.x6, emp_ext.x7, emp_ext.x8, emp_ext.x9, emp_ext.y from employees_ext emp_ext;
运行:hive -f employees_ext-to-employees.sql,其部分log如下:
Partition default.employees{y=1} stats: [num_files: 1, num_rows: 0, total_size: 3622, raw_data_size: 0] Partition default.employees{y=2} stats: [num_files: 1, num_rows: 0, total_size: 4060, raw_data_size: 0] Partition default.employees{y=3} stats: [num_files: 1, num_rows: 0, total_size: 910, raw_data_size: 0] Partition default.employees{y=5} stats: [num_files: 1, num_rows: 0, total_size: 699, raw_data_size: 0] Partition default.employees{y=6} stats: [num_files: 1, num_rows: 0, total_size: 473, raw_data_size: 0] Partition default.employees{y=7} stats: [num_files: 1, num_rows: 0, total_size: 13561851, raw_data_size: 0] Table default.employees stats: [num_partitions: 6, num_files: 6, num_rows: 0, total_size: 13571615, raw_data_size: 0] MapReduce Jobs Launched: Job 0: Map: 1 Cumulative CPU: 6.78 sec HDFS Read: 13245660 HDFS Write: 13571615 SUCCESS Total MapReduce CPU Time Spent: 6 seconds 780 msec
OK Time taken: 186.743 seconds
查看hdfs文件大小:
[root@node33 hive]# hadoop fs -count /user/hive/warehouse/employees 7 6 13571615 /user/hive/warehouse/employees
查看hdfs文件内容:
bash-4.1$ hadoop fs -cat /user/hive/warehouse/employees/y=1/* | head -n 1 11.5210113.644.491.171.780.068.750.00.0
(截图的内容为输出,复制到代码块里面有问题)
2. employees_ext 表导入employees_orc表:
[root@node33 hive]# cat employees_ext-to-employees_orc.sql set hive.exec.dynamic.partition=true; set hive.exec.dynamic.partition.mode=nonstrict; set hive.eec.max.dynamic.partitions.pernode=1000; insert overwrite table employees_orc partition(y) select emp_ext.id, emp_ext.x1, emp_ext.x2, emp_ext.x3, emp_ext.x4, emp_ext.x5, emp_ext.x6, emp_ext.x7, emp_ext.x8, emp_ext.x9, emp_ext.y from employees_ext emp_ext;
运行:hive -f employees_ext-to-employees_orc.sql,其部分log如下:
Partition default.employees_orc{y=1} stats: [num_files: 1, num_rows: 0, total_size: 2355, raw_data_size: 0] Partition default.employees_orc{y=2} stats: [num_files: 1, num_rows: 0, total_size: 2539, raw_data_size: 0] Partition default.employees_orc{y=3} stats: [num_files: 1, num_rows: 0, total_size: 1290, raw_data_size: 0] Partition default.employees_orc{y=5} stats: [num_files: 1, num_rows: 0, total_size: 1165, raw_data_size: 0] Partition default.employees_orc{y=6} stats: [num_files: 1, num_rows: 0, total_size: 955, raw_data_size: 0] Partition default.employees_orc{y=7} stats: [num_files: 1, num_rows: 0, total_size: 1424599, raw_data_size: 0] Table default.employees_orc stats: [num_partitions: 6, num_files: 6, num_rows: 0, total_size: 1432903, raw_data_size: 0] MapReduce Jobs Launched: Job 0: Map: 1 Cumulative CPU: 7.84 sec HDFS Read: 13245660 HDFS Write: 1432903 SUCCESS Total MapReduce CPU Time Spent: 7 seconds 840 msec OK Time taken: 53.014 seconds
查看hdfs文件大小:
[root@node33 hive]# hadoop fs -count /user/hive/warehouse/employees_orc 7 6 1432903 /user/hive/warehouse/employees_orc
查看hdfs文件内容:
3. 比较两者性能
时间 | 压缩率 | |
employees表: | 186.7秒 | 13571615/13245660=1.0246 |
employees_orc表: | 53.0秒 | 1432903/13245660=0.108 |
时间上来说,orc的表现方式会好很多,同时压缩率也好很多。不过,这个测试是在本人虚拟机上测试的,而且是单机测试的,所以参考价值不是很大,但是压缩率还是有一定参考价值的。
四、导出数据
1. employees表:
[root@node33 hive]# cat export_employees.sql insert overwrite local directory '/opt/hivedata/employees.dat' row format delimited fields terminated by ',' select emp.id, emp.x1, emp.x2, emp.x3, emp.x4, emp.x5, emp.x6, emp.x7, emp.x8, emp.x9, emp.y from employees emp
运行:hive -f export_employees.sql
部分log:
MapReduce Total cumulative CPU time: 9 seconds 630 msec Ended Job = job_1398958404577_0007 Copying data to local directory /opt/hivedata/employees.dat Copying data to local directory /opt/hivedata/employees.dat MapReduce Jobs Launched: Job 0: Map: 1 Cumulative CPU: 9.63 sec HDFS Read: 13572220 HDFS Write: 13978615 SUCCESS Total MapReduce CPU Time Spent: 9 seconds 630 msec OK Time taken: 183.841 seconds
数据查看:
[root@node33 hive]# ll /opt/hivedata/employees.dat/ total 13652 -rw-r--r--. 1 root root 13978615 May 2 05:15 000000_0 [root@node33 hive]# head -n 1 /opt/hivedata/employees.dat/000000_0 1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
2. employees_orc表:
[root@node33 hive]# cat export_employees_orc.sql insert overwrite local directory '/opt/hivedata/employees_orc.dat' row format delimited fields terminated by ',' select emp.id, emp.x1, emp.x2, emp.x3, emp.x4, emp.x5, emp.x6, emp.x7, emp.x8, emp.x9, emp.y from employees_orc emp
运行 hive -f export_employees_orc.sql
部分log:
MapReduce Total cumulative CPU time: 4 seconds 920 msec Ended Job = job_1398958404577_0008 Copying data to local directory /opt/hivedata/employees_orc.dat Copying data to local directory /opt/hivedata/employees_orc.dat MapReduce Jobs Launched: Job 0: Map: 1 Cumulative CPU: 4.92 sec HDFS Read: 1451352 HDFS Write: 13978615 SUCCESS Total MapReduce CPU Time Spent: 4 seconds 920 msec OK Time taken: 41.686 second
查看数据:
[root@node33 hive]# head -n 1 /opt/hivedata/employees_orc.dat/000000_0 1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1 [root@node33 hive]# ll /opt/hivedata/employees_orc.dat/ total 13652 -rw-r--r--. 1 root root 13978615 May 2 05:18 000000_0
这里的数据和原始数据的大小不一样,原始数据是13245467, 而导出到本地的是13978615 。这是因为数据的精度问题,例如原始数据中的0都被存储为了0.0。
分享,成长,快乐
转载请注明blog地址:http://blog.csdn.net/fansy1990