#! /bin/bash
# Generate roughly 350 MB of raw test data on stdout.
#
# Emits 1,000,000 rows, one per line, in the form "<id><TAB><text>", with
# ids running from 0 to 999999; the text column is the same sentence on
# every row.
#
# printf is used rather than echo because echo's treatment of "\t" depends
# on the shell: bash's builtin echo prints the backslash and the "t"
# literally (unless -e or xpg_echo is in effect), whereas dash's echo turns
# it into a tab. printf '\t' yields a real tab under both, so the output no
# longer depends on which interpreter ends up running the script.
text="A decade ago, many were predicting that Cooke, a New York City prodigy, would become a basketball shoe pitchman and would flaunt his wares and skills at All-Star weekends like the recent aerial show in Orlando, Fla. There was a time, however fleeting, when he was more heralded, or perhaps merely hyped, than any other high school player in America."
i=0
while [ "$i" -ne 1000000 ]
do
  printf '%s\t%s\n' "$i" "$text"
  i=$((i + 1))
done
$ sh gen-data.sh >dual.txt
drop table table01;
create table table01( id int, name string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';
load data local inpath '/home/allen/Desktop/dual.txt' overwrite into table table01;
hive> dfs -ls /user/hive/warehouse/table01;
Found 1 items
-rw-r--r-- 1 allen supergroup 356888890 2012-03-04 22:22 /user/hive/warehouse/table01/dual.txt
hive> create table table02 as
select id ,name as text from table01;
6个Mapper,0个Reducer
hive> dfs -ls /user/hive/warehouse/table02;
Found 6 items
-rw-r--r-- 1 allen supergroup 67109134 2012-03-04 22:28 /user/hive/warehouse/table02/000000_0
-rw-r--r-- 1 allen supergroup 67108860 2012-03-04 22:28 /user/hive/warehouse/table02/000001_0
-rw-r--r-- 1 allen supergroup 67108860 2012-03-04 22:30 /user/hive/warehouse/table02/000002_0
-rw-r--r-- 1 allen supergroup 67108860 2012-03-04 22:30 /user/hive/warehouse/table02/000003_0
-rw-r--r-- 1 allen supergroup 67108860 2012-03-04 22:32 /user/hive/warehouse/table02/000004_0
-rw-r--r-- 1 allen supergroup 21344316 2012-03-04 22:32 /user/hive/warehouse/table02/000005_0
hive> select * from table02 where id=500000;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_201203042123_0003, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201203042123_0003
Kill Command = /home/allen/Hadoop/hadoop-0.20.2/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9001 -kill job_201203042123_0003
2012-03-04 22:42:09,609 Stage-1 map = 0%, reduce = 0%
2012-03-04 22:42:21,919 Stage-1 map = 19%, reduce = 0%
2012-03-04 22:42:25,020 Stage-1 map = 33%, reduce = 0%
2012-03-04 22:42:37,602 Stage-1 map = 61%, reduce = 0%
2012-03-04 22:42:39,715 Stage-1 map = 67%, reduce = 0%
2012-03-04 22:42:46,168 Stage-1 map = 77%, reduce = 0%
2012-03-04 22:42:49,552 Stage-1 map = 79%, reduce = 0%
2012-03-04 22:42:51,812 Stage-1 map = 80%, reduce = 0%
2012-03-04 22:42:55,015 Stage-1 map = 83%, reduce = 0%
2012-03-04 22:42:58,181 Stage-1 map = 100%, reduce = 0%
2012-03-04 22:43:13,500 Stage-1 map = 100%, reduce = 100%
Ended Job = job_201203042123_0003
OK
500000 A decade ago, many were predicting that Cooke, a New York City prodigy, would become a basketball shoe pitchman and would flaunt his wares and skills at All-Star weekends like the recent aerial show in Orlando, Fla. There was a time, however fleeting, when he was more heralded, or perhaps merely hyped, than any other high school player in America.
Time taken: 70.275 seconds
hive>
hive> dfs -ls /user/hive/warehouse/;
Found 2 items
drwxr-xr-x - allen supergroup 0 2012-03-04 22:22 /user/hive/warehouse/table01
drwxr-xr-x - allen supergroup 0 2012-03-04 22:33 /user/hive/warehouse/table02
hive> show tables;
OK
table01
table02
Time taken: 3.754 seconds
hive> create index table02_index on table table02(id)
> as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
> with deferred rebuild;
OK
Time taken: 0.913 seconds
hive> alter index table02_index on table02 rebuild;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=
In order to set a constant number of reducers:
set mapred.reduce.tasks=
Starting Job = job_201203051029_0001, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201203051029_0001
Kill Command = /home/allen/Hadoop/hadoop-0.20.2/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9001 -kill job_201203051029_0001
2012-03-05 10:39:43,607 Stage-1 map = 0%, reduce = 0%
。。。
2012-03-05 10:43:50,295 Stage-1 map = 100%, reduce = 100%
Ended Job = job_201203051029_0001
Loading data to table default.default__table02_table02_index__
Deleted hdfs://localhost:9000/user/hive/warehouse/default__table02_table02_index__
Table default.default__table02_table02_index__ stats: [num_partitions: 0, num_files: 1, num_rows: 0, total_size: 74701985]
OK
Time taken: 269.383 seconds
hive>
mysql> select * from IDXS;
+----------+-------------+------------------+-------------------------------------------------------------+---------------+--------------+------------------+-------------+-------+
| INDEX_ID | CREATE_TIME | DEFERRED_REBUILD | INDEX_HANDLER_CLASS | INDEX_NAME | INDEX_TBL_ID | LAST_ACCESS_TIME | ORIG_TBL_ID | SD_ID |
+----------+-------------+------------------+-------------------------------------------------------------+---------------+--------------+------------------+-------------+-------+
| 1 | 1330914783 | | org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler | table02_index | 6 | 1330914783 | 2 | 7 |
+----------+-------------+------------------+-------------------------------------------------------------+---------------+--------------+------------------+-------------+-------+
1 row in set (0.00 sec)
mysql> select * from TBLS;
+--------+-------------+-------+------------------+-------+-----------+-------+----------------------------------+---------------+--------------------+--------------------+
| TBL_ID | CREATE_TIME | DB_ID | LAST_ACCESS_TIME | OWNER | RETENTION | SD_ID | TBL_NAME | TBL_TYPE | VIEW_EXPANDED_TEXT | VIEW_ORIGINAL_TEXT |
+--------+-------------+-------+------------------+-------+-----------+-------+----------------------------------+---------------+--------------------+--------------------+
| 1 | 1330870868 | 1 | 0 | allen | 0 | 2 | table01 | MANAGED_TABLE | NULL | NULL |
| 2 | 1330871615 | 1 | 0 | allen | 0 | 3 | table02 | MANAGED_TABLE | NULL | NULL |
| 6 | 1330914783 | 1 | 0 | NULL | 0 | 10 | default__table02_table02_index__ | INDEX_TABLE | NULL | NULL |
+--------+-------------+-------+------------------+-------+-----------+-------+----------------------------------+---------------+--------------------+--------------------+
3 rows in set (0.01 sec)
mysql>
hive> dfs -ls /user/hive/warehouse;
Found 3 items
drwxr-xr-x - allen supergroup 0 2012-03-05 10:43 /user/hive/warehouse/default__table02_table02_index__
drwxr-xr-x - allen supergroup 0 2012-03-04 22:22 /user/hive/warehouse/table01
drwxr-xr-x - allen supergroup 0 2012-03-04 22:33 /user/hive/warehouse/table02
hive> dfs -ls /user/hive/warehouse/default*;
Found 1 items
-rw-r--r-- 1 allen supergroup 74701985 2012-03-05 10:42 /user/hive/warehouse/default__table02_table02_index__/000000_0
hive> dfs -dus /user/hive/warehouse/default*;
hdfs://localhost:9000/user/hive/warehouse/default__table02_table02_index__ 74701985
hive>
可见,索引确实已经建好了:元数据库里有对应的记录,HDFS 上也生成了索引文件。我们来看看索引表里存了些什么:
hive> select * from default__table02_table02_index__ limit 3;
OK
0 hdfs://localhost:9000/user/hive/warehouse/table02/000000_0 [0]
1 hdfs://localhost:9000/user/hive/warehouse/table02/000000_0 [352]
2 hdfs://localhost:9000/user/hive/warehouse/table02/000000_0 [704]
Time taken: 0.44 seconds
hive>
不出意外,索引表的每一行应该是{被索引列的值(id),该值所在的HDFS文件(_bucketname),该值在文件中的偏移量数组(_offsets,可能有多个)}
hive> SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
hive> Insert overwrite directory "/tmp/table02_index_data" select `_bucketname`, `_offsets` from default__table02_table02_index__ where id =500000;
Total MapReduce jobs = 2
Launching Job 1 out of 2
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_201203051029_0009, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201203051029_0009
Kill Command = /home/allen/Hadoop/hadoop-0.20.2/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9001 -kill job_201203051029_0009
2012-03-05 11:32:28,733 Stage-1 map = 0%, reduce = 0%
2012-03-05 11:32:38,022 Stage-1 map = 100%, reduce = 0%
2012-03-05 11:32:41,163 Stage-1 map = 100%, reduce = 100%
Ended Job = job_201203051029_0009
Ended Job = 1423815964, job is filtered out (removed at runtime).
Launching Job 2 out of 2
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_201203051029_0010, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201203051029_0010
Kill Command = /home/allen/Hadoop/hadoop-0.20.2/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9001 -kill job_201203051029_0010
2012-03-05 11:32:47,102 Stage-2 map = 0%, reduce = 0%
2012-03-05 11:32:50,152 Stage-2 map = 100%, reduce = 0%
2012-03-05 11:32:53,265 Stage-2 map = 100%, reduce = 100%
Ended Job = job_201203051029_0010
Moving data to: /tmp/table02_index_data
1 Rows loaded to /tmp/table02_index_data
OK
Time taken: 33.348 seconds
hive> select * from table02 where id =500000;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_201203051029_0011, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201203051029_0011
Kill Command = /home/allen/Hadoop/hadoop-0.20.2/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9001 -kill job_201203051029_0011
2012-03-05 11:34:17,235 Stage-1 map = 0%, reduce = 0%
2012-03-05 11:34:29,421 Stage-1 map = 33%, reduce = 0%
2012-03-05 11:34:41,688 Stage-1 map = 67%, reduce = 0%
2012-03-05 11:34:50,824 Stage-1 map = 83%, reduce = 0%
2012-03-05 11:34:53,878 Stage-1 map = 100%, reduce = 0%
2012-03-05 11:34:56,903 Stage-1 map = 100%, reduce = 100%
Ended Job = job_201203051029_0011
OK
500000 A decade ago, many were predicting that Cooke, a New York City prodigy, would become a basketball shoe pitchman and would flaunt his wares and skills at All-Star weekends like the recent aerial show in Orlando, Fla. There was a time, however fleeting, when he was more heralded, or perhaps merely hyped, than any other high school player in America.
Time taken: 45.991 seconds
hive> Set hive.index.compact.file=/tmp/table02_index_data;
hive> Set hive.optimize.index.filter=false;
hive> Set hive.input.format=org.apache.hadoop.hive.ql.index.compact.HiveCompactIndexInputFormat;
hive> select * from table02 where id =500000;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_201203051029_0012, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201203051029_0012
Kill Command = /home/allen/Hadoop/hadoop-0.20.2/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9001 -kill job_201203051029_0012
2012-03-05 11:35:50,694 Stage-1 map = 0%, reduce = 0%
2012-03-05 11:35:56,797 Stage-1 map = 100%, reduce = 0%
2012-03-05 11:35:59,851 Stage-1 map = 100%, reduce = 100%
Ended Job = job_201203051029_0012
OK
500000 A decade ago, many were predicting that Cooke, a New York City prodigy, would become a basketball shoe pitchman and would flaunt his wares and skills at All-Star weekends like the recent aerial show in Orlando, Fla. There was a time, however fleeting, when he was more heralded, or perhaps merely hyped, than any other high school player in America.
Time taken: 14.367 seconds
hive>
allen@allen-laptop:~$ hadoop dfs -cat /tmp/table02_index_data/000000_0;
hdfs://localhost:9000/user/hive/warehouse/table02/000002_0^A44170896
allen@allen-laptop:~$