/*
测试简述:
在oracle中测试100000条数据的等值连接效率;在hive中测试65535条数据的等值连接效率;并进行对比。
其中在hive中通过数据行存储和列存储的数据组织结构进行测试。
测试环境说明:
oracle中:服务器+Oracle Enterprise Linux 6.0+Oracle 11g R2
hive中:一般台式机+VMware 8.0+Oracle Enterprise Linux 6.0+hadoop-0.20.2+hive-0.7.1
*/
--附:具体测试过程如下:
--------------------------oracle下操作--------------------------
--创建测试表1
SQL> create table hive1 (id number,name varchar2(20));
Table created.
SQL> begin
2 for i in 1..100000
3 loop
4 insert into hive1 values(i,'hive1');
5 commit;
6 end loop;
7 end;
8 /
PL/SQL procedure successfully completed.
--创建测试表2
SQL> create table hive2 (id number,name varchar2(20));
Table created.
Elapsed: 00:00:00.05
SQL> begin
2 for i in 1..100000
3 loop
4 insert into hive2 values(i,'hive2');
5 commit;
6 end loop;
7 end;
8 /
PL/SQL procedure successfully completed.
--测试2个表等值连接效率
SQL> select a.*,b.name as name2 from hive1 a,hive2 b where a.id=b.id;
100000 rows selected.
Elapsed: 00:00:00.98
Execution Plan
----------------------------------------------------------
Plan hash value: 2251506592
--------------------------------------------------------------------------------
----
| Id | Operation | Name | Rows | Bytes |TempSpc| Cost (%CPU)| Time
|
--------------------------------------------------------------------------------
----
| 0 | SELECT STATEMENT | | 102K| 5002K| | 456 (1)| 00:00:
06 |
|* 1 | HASH JOIN | | 102K| 5002K| 3704K| 456 (1)| 00:00:
06 |
| 2 | TABLE ACCESS FULL| HIVE2 | 102K| 2501K| | 45 (3)| 00:00:
01 |
| 3 | TABLE ACCESS FULL| HIVE1 | 105K| 2587K| | 45 (3)| 00:00:
01 |
--------------------------------------------------------------------------------
----
Predicate Information (identified by operation id):
---------------------------------------------------
1 - access("A"."ID"="B"."ID")
Note
-----
- dynamic sampling used for this statement (level=2)
Statistics
----------------------------------------------------------
0 recursive calls
0 db block gets
7149 consistent gets
0 physical reads
0 redo size
1836054 bytes sent via SQL*Net to client
73742 bytes received via SQL*Net from client
6668 SQL*Net roundtrips to/from client
0 sorts (memory)
0 sorts (disk)
100000 rows processed
--------------------------hive下操作--------------------------
--创建行存储测试表 test_hive2[测试表 test_hive3 同理]
hive> create table test_hive2 (id int,id2 int,name string)
> row format delimited
> fields terminated by '\t';
OK
Time taken: 0.055 seconds
hive> load data local inpath '/root/tmp001.txt'
> overwrite into table test_hive2;
Copying data from file:/root/tmp001.txt
Copying file: file:/root/tmp001.txt
Loading data to table default.test_hive2
Deleted file:/user/hive/warehouse/test_hive2
OK
Time taken: 0.342 seconds
--测试全表扫描效率
hive> select count(*) from test_hive2;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=
In order to set a constant number of reducers:
set mapred.reduce.tasks=
Execution log at: /tmp/root/root_20120331160606_9f242fd6-e321-4413-92a7-b4d92207f14e.log
Job running in-process (local Hadoop)
2012-03-31 16:06:10,931 null map = 100%, reduce = 100%
Ended Job = job_local_0001
OK
65535
Time taken: 2.36 seconds
--测试2个行存储表的等值连接效率
hive> select test_hive2.*,test_hive3.name from test_hive2 join test_hive3 on ( test_hive2.id=test_hive3.id);
Time taken: 27.749 seconds
--列存储2表等值连接测试如下
[root@oraking ~]# hive
Hive history file=/tmp/root/hive_job_log_root_201203311553_471153629.txt
--创建列存储测试表 test_hive
hive> create table test_hive (id int,id2 int,name string)
> row format serde 'org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe'
> STORED AS RCFILE;
OK
Time taken: 2.944 seconds
--插入数据
hive> insert overwrite table test_hive
> select id,id2,name
> from test_hive2;
Total MapReduce jobs = 2
Launching Job 1 out of 2
Number of reduce tasks is set to 0 since there's no reduce operator
Execution log at: /tmp/root/root_20120331164444_44ac1bdb-f603-45e2-b50a-4681e69d330f.log
Job running in-process (local Hadoop)
2012-03-31 16:44:31,709 null map = 100%, reduce = 0%
Ended Job = job_local_0001
Ended Job = 171641075, job is filtered out (removed at runtime).
Moving data to: file:/tmp/hive-root/hive_2012-03-31_16-44-28_048_8584030323247636747/-ext-10000
Loading data to table default.test_hive
Deleted file:/user/hive/warehouse/test_hive
Table default.test_hive stats: [num_partitions: 0, num_files: 1, num_rows: 65535, total_size: 961048]
OK
Time taken: 3.946 seconds
--创建列存储测试表 test_hive4
hive> create table test_hive4 (id int,id2 int,name string)
> row format serde 'org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe'
> STORED AS RCFILE;
OK
Time taken: 0.019 seconds
--插入数据
hive> insert overwrite table test_hive4
> select id,id2,name
> from test_hive2;
Total MapReduce jobs = 2
Launching Job 1 out of 2
Number of reduce tasks is set to 0 since there's no reduce operator
Execution log at: /tmp/root/root_20120331165555_13476cce-d845-4277-9505-38e6441e8e77.log
Job running in-process (local Hadoop)
2012-03-31 16:55:11,786 null map = 100%, reduce = 0%
Ended Job = job_local_0001
Ended Job = -1613004717, job is filtered out (removed at runtime).
Moving data to: file:/tmp/hive-root/hive_2012-03-31_16-55-09_157_510206187744689585/-ext-10000
Loading data to table default.test_hive4
Deleted file:/user/hive/warehouse/test_hive4
Table default.test_hive4 stats: [num_partitions: 0, num_files: 1, num_rows: 65535, total_size: 961048]
OK
Time taken: 2.863 seconds
--测试列存储全表扫描效率
hive> select count(*) from test_hive;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=
In order to set a constant number of reducers:
set mapred.reduce.tasks=
Execution log at: /tmp/root/root_20120331164848_c82b81f6-ee21-48fa-bbb8-c5a52b5c01bb.log
Job running in-process (local Hadoop)
2012-03-31 16:48:45,111 null map = 100%, reduce = 100%
Ended Job = job_local_0001
OK
65535
Time taken: 2.357 seconds
hive> select count(*) from test_hive4;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=
In order to set a constant number of reducers:
set mapred.reduce.tasks=
Execution log at: /tmp/root/root_20120331165555_0e89d023-1969-4272-8ab5-a68c48db7073.log
Job running in-process (local Hadoop)
2012-03-31 16:55:45,199 null map = 100%, reduce = 100%
Ended Job = job_local_0001
OK
65535
Time taken: 2.377 seconds
--测试列存储2表等值连接效率
hive> select test_hive.*,test_hive4.name from test_hive join test_hive4 on (test_hive.id=test_hive4.id);
Time taken: 28.306 seconds
/*
综述:整体来看,在oracle中的效率明显高于hive中[无论是行存储还是列存储]
注:oracle中是2个100000条数据的等值连接测试;而hive中是2个65535条数据的等值连接测试
初步分析如下:
1、由于hive是本机的虚拟机下测试,而oracle是在真实服务器下测试,故可能存在整体基础环境性能影响;
2、由于hive的工作原理本身的问题,如先map再reduce生成结果集等;
3、本人对新技术的掌握程度有限,灵活运用的能力有待提高;
4、其他不确定因素。
*/
来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/26143577/viewspace-720093/,如需转载,请注明出处,否则将追究法律责任。