#!/bin/bash
# gen-data.sh - generate raw test data (about 350 MB with the default row count).
#
# Usage: sh gen-data.sh [ROWS] > dual.txt
#   ROWS  number of rows to write to stdout (default: 1000000)
#
# Every row has the form "<id><TAB><text>" with ids running from 0 to ROWS-1.
#
# Only POSIX sh features are used, so the script behaves the same whether it
# is started through its shebang (bash) or as "sh gen-data.sh" (e.g. dash).
set -eu

rows=${1:-1000000}

# Reject anything that is not a plain non-negative integer.
case "$rows" in
  '' | *[!0-9]*)
    printf 'usage: %s [ROWS]\n' "${0##*/}" >&2
    exit 2
    ;;
esac

readonly text='A decade ago, many were predicting that Cooke, a New York City prodigy, would become a basketball shoe pitchman and would flaunt his wares and skills at All-Star weekends like the recent aerial show in Orlando, Fla. There was a time, however fleeting, when he was more heralded, or perhaps merely hyped, than any other high school player in America.'

i=0
while [ "$i" -lt "$rows" ]; do
  # printf, not echo: bash's builtin echo prints "\t" literally unless -e is
  # given, while dash's echo expands it. printf emits a real tab everywhere.
  printf '%s\t%s\n' "$i" "$text"
  i=$((i + 1))
done
$ sh gen-data.sh >dual.txt
-- Recreate table01 as a tab-delimited text table and load the generated file.
DROP TABLE table01;

CREATE TABLE table01 (
  id   INT,
  name STRING
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t';

-- OVERWRITE replaces whatever the table held before the load.
LOAD DATA LOCAL INPATH '/home/allen/Desktop/dual.txt'
  OVERWRITE INTO TABLE table01;
hive> dfs -ls /user/hive/warehouse/table01; Found 1 items -rw-r--r-- 1 allen supergroup 356888890 2012-03-04 22:22 /user/hive/warehouse/table01/dual.txt hive> create table table02 as select id, name as text from table01;
6个Mapper,0个Reducer
hive> dfs -ls /user/hive/warehouse/table02; Found 6 items -rw-r--r-- 1 allen supergroup 67109134 2012-03-04 22:28 /user/hive/warehouse/table02/000000_0 -rw-r--r-- 1 allen supergroup 67108860 2012-03-04 22:28 /user/hive/warehouse/table02/000001_0 -rw-r--r-- 1 allen supergroup 67108860 2012-03-04 22:30 /user/hive/warehouse/table02/000002_0 -rw-r--r-- 1 allen supergroup 67108860 2012-03-04 22:30 /user/hive/warehouse/table02/000003_0 -rw-r--r-- 1 allen supergroup 67108860 2012-03-04 22:32 /user/hive/warehouse/table02/000004_0 -rw-r--r-- 1 allen supergroup 21344316 2012-03-04 22:32 /user/hive/warehouse/table02/000005_0
hive> select * from table02 where id=500000; Total MapReduce jobs = 1 Launching Job 1 out of 1 Number of reduce tasks is set to 0 since there's no reduce operator Starting Job = job_201203042123_0003, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201203042123_0003 Kill Command = /home/allen/Hadoop/hadoop-0.20.2/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9001 -kill job_201203042123_0003 2012-03-04 22:42:09,609 Stage-1 map = 0%, reduce = 0% 2012-03-04 22:42:21,919 Stage-1 map = 19%, reduce = 0% 2012-03-04 22:42:25,020 Stage-1 map = 33%, reduce = 0% 2012-03-04 22:42:37,602 Stage-1 map = 61%, reduce = 0% 2012-03-04 22:42:39,715 Stage-1 map = 67%, reduce = 0% 2012-03-04 22:42:46,168 Stage-1 map = 77%, reduce = 0% 2012-03-04 22:42:49,552 Stage-1 map = 79%, reduce = 0% 2012-03-04 22:42:51,812 Stage-1 map = 80%, reduce = 0% 2012-03-04 22:42:55,015 Stage-1 map = 83%, reduce = 0% 2012-03-04 22:42:58,181 Stage-1 map = 100%, reduce = 0% 2012-03-04 22:43:13,500 Stage-1 map = 100%, reduce = 100% Ended Job = job_201203042123_0003 OK 500000 A decade ago, many were predicting that Cooke, a New York City prodigy, would become a basketball shoe pitchman and would flaunt his wares and skills at All-Star weekends like the recent aerial show in Orlando, Fla. There was a time, however fleeting, when he was more heralded, or perhaps merely hyped, than any other high school player in America. Time taken: 70.275 seconds hive>
三。加Index
hive> dfs -ls /user/hive/warehouse/; Found 2 items drwxr-xr-x - allen supergroup 0 2012-03-04 22:22 /user/hive/warehouse/table01 drwxr-xr-x - allen supergroup 0 2012-03-04 22:33 /user/hive/warehouse/table02 hive> show tables; OK table01 table02 Time taken: 3.754 seconds hive> create index table02_index on table table02(id) > as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' > with deferred rebuild; OK Time taken: 0.913 seconds hive> alter index table02_index on table02 rebuild; Total MapReduce jobs = 1 Launching Job 1 out of 1 Number of reduce tasks not specified. Estimated from input data size: 1 In order to change the average load for a reducer (in bytes): set hive.exec.reducers.bytes.per.reducer=<number> In order to limit the maximum number of reducers: set hive.exec.reducers.max=<number> In order to set a constant number of reducers: set mapred.reduce.tasks=<number> Starting Job = job_201203051029_0001, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201203051029_0001 Kill Command = /home/allen/Hadoop/hadoop-0.20.2/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9001 -kill job_201203051029_0001 2012-03-05 10:39:43,607 Stage-1 map = 0%, reduce = 0% 。。。 2012-03-05 10:43:50,295 Stage-1 map = 100%, reduce = 100% Ended Job = job_201203051029_0001 Loading data to table default.default__table02_table02_index__ Deleted hdfs://localhost:9000/user/hive/warehouse/default__table02_table02_index__ Table default.default__table02_table02_index__ stats: [num_partitions: 0, num_files: 1, num_rows: 0, total_size: 74701985] OK Time taken: 269.383 seconds hive>
查询一下,看看效果:6个Mapper,不work。确认索引建立成功没,先看看meta中怎么样:mysql> select * from IDXS; +----------+-------------+------------------+-------------------------------------------------------------+---------------+--------------+------------------+-------------+-------+ | INDEX_ID | CREATE_TIME | DEFERRED_REBUILD | INDEX_HANDLER_CLASS | INDEX_NAME | INDEX_TBL_ID | LAST_ACCESS_TIME | ORIG_TBL_ID | SD_ID | +----------+-------------+------------------+-------------------------------------------------------------+---------------+--------------+------------------+-------------+-------+ | 1 | 1330914783 | | org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler | table02_index | 6 | 1330914783 | 2 | 7 | +----------+-------------+------------------+-------------------------------------------------------------+---------------+--------------+------------------+-------------+-------+ 1 row in set (0.00 sec) mysql> select * from TBLS; +--------+-------------+-------+------------------+-------+-----------+-------+----------------------------------+---------------+--------------------+--------------------+ | TBL_ID | CREATE_TIME | DB_ID | LAST_ACCESS_TIME | OWNER | RETENTION | SD_ID | TBL_NAME | TBL_TYPE | VIEW_EXPANDED_TEXT | VIEW_ORIGINAL_TEXT | +--------+-------------+-------+------------------+-------+-----------+-------+----------------------------------+---------------+--------------------+--------------------+ | 1 | 1330870868 | 1 | 0 | allen | 0 | 2 | table01 | MANAGED_TABLE | NULL | NULL | | 2 | 1330871615 | 1 | 0 | allen | 0 | 3 | table02 | MANAGED_TABLE | NULL | NULL | | 6 | 1330914783 | 1 | 0 | NULL | 0 | 10 | default__table02_table02_index__ | INDEX_TABLE | NULL | NULL | +--------+-------------+-------+------------------+-------+-----------+-------+----------------------------------+---------------+--------------------+--------------------+ 3 rows in set (0.01 sec) mysql>
确实,IDXS表中已经有了这个索引,而TBLS中也有了索引表,看看HDFS中的情况:hive> dfs -ls /user/hive/warehouse; Found 3 items drwxr-xr-x - allen supergroup 0 2012-03-05 10:43 /user/hive/warehouse/default__table02_table02_index__ drwxr-xr-x - allen supergroup 0 2012-03-04 22:22 /user/hive/warehouse/table01 drwxr-xr-x - allen supergroup 0 2012-03-04 22:33 /user/hive/warehouse/table02 hive> dfs -ls /user/hive/warehouse/default*; Found 1 items -rw-r--r-- 1 allen supergroup 74701985 2012-03-05 10:42 /user/hive/warehouse/default__table02_table02_index__/000000_0 hive> dfs -dus /user/hive/warehouse/default*; hdfs://localhost:9000/user/hive/warehouse/default__table02_table02_index__ 74701985 hive>可见,确实是干活了的,东西都在,我们来看看索引表里存了些什么:hive> select * from default__table02_table02_index__ limit 3; OK 0 hdfs://localhost:9000/user/hive/warehouse/table02/000000_0 [0] 1 hdfs://localhost:9000/user/hive/warehouse/table02/000000_0 [352] 2 hdfs://localhost:9000/user/hive/warehouse/table02/000000_0 [704] Time taken: 0.44 seconds hive>不出意外,应该是{值,HDFS文件位置,偏移量的数组(可能有多个)}。下面手动设置索引:hive> SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; hive> Insert overwrite directory "/tmp/table02_index_data" select `_bucketname`, `_offsets` from default__table02_table02_index__ where id =500000; Total MapReduce jobs = 2 Launching Job 1 out of 2 Number of reduce tasks is set to 0 since there's no reduce operator Starting Job = job_201203051029_0009, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201203051029_0009 Kill Command = /home/allen/Hadoop/hadoop-0.20.2/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9001 -kill job_201203051029_0009 2012-03-05 11:32:28,733 Stage-1 map = 0%, reduce = 0% 2012-03-05 11:32:38,022 Stage-1 map = 100%, reduce = 0% 2012-03-05 11:32:41,163 Stage-1 map = 100%, reduce = 100% Ended Job = job_201203051029_0009 Ended Job = 1423815964, job is filtered out (removed at runtime). 
Launching Job 2 out of 2 Number of reduce tasks is set to 0 since there's no reduce operator Starting Job = job_201203051029_0010, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201203051029_0010 Kill Command = /home/allen/Hadoop/hadoop-0.20.2/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9001 -kill job_201203051029_0010 2012-03-05 11:32:47,102 Stage-2 map = 0%, reduce = 0% 2012-03-05 11:32:50,152 Stage-2 map = 100%, reduce = 0% 2012-03-05 11:32:53,265 Stage-2 map = 100%, reduce = 100% Ended Job = job_201203051029_0010 Moving data to: /tmp/table02_index_data 1 Rows loaded to /tmp/table02_index_data OK Time taken: 33.348 seconds hive> select * from table02 where id =500000; Total MapReduce jobs = 1 Launching Job 1 out of 1 Number of reduce tasks is set to 0 since there's no reduce operator Starting Job = job_201203051029_0011, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201203051029_0011 Kill Command = /home/allen/Hadoop/hadoop-0.20.2/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9001 -kill job_201203051029_0011 2012-03-05 11:34:17,235 Stage-1 map = 0%, reduce = 0% 2012-03-05 11:34:29,421 Stage-1 map = 33%, reduce = 0% 2012-03-05 11:34:41,688 Stage-1 map = 67%, reduce = 0% 2012-03-05 11:34:50,824 Stage-1 map = 83%, reduce = 0% 2012-03-05 11:34:53,878 Stage-1 map = 100%, reduce = 0% 2012-03-05 11:34:56,903 Stage-1 map = 100%, reduce = 100% Ended Job = job_201203051029_0011 OK 500000 A decade ago, many were predicting that Cooke, a New York City prodigy, would become a basketball shoe pitchman and would flaunt his wares and skills at All-Star weekends like the recent aerial show in Orlando, Fla. There was a time, however fleeting, when he was more heralded, or perhaps merely hyped, than any other high school player in America. 
Time taken: 45.991 seconds hive> Set hive.index.compact.file=/tmp/table02_index_data; hive> Set hive.optimize.index.filter=false; hive> Set hive.input.format=org.apache.hadoop.hive.ql.index.compact.HiveCompactIndexInputFormat; hive> select * from table02 where id =500000; Total MapReduce jobs = 1 Launching Job 1 out of 1 Number of reduce tasks is set to 0 since there's no reduce operator Starting Job = job_201203051029_0012, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201203051029_0012 Kill Command = /home/allen/Hadoop/hadoop-0.20.2/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9001 -kill job_201203051029_0012 2012-03-05 11:35:50,694 Stage-1 map = 0%, reduce = 0% 2012-03-05 11:35:56,797 Stage-1 map = 100%, reduce = 0% 2012-03-05 11:35:59,851 Stage-1 map = 100%, reduce = 100% Ended Job = job_201203051029_0012 OK 500000 A decade ago, many were predicting that Cooke, a New York City prodigy, would become a basketball shoe pitchman and would flaunt his wares and skills at All-Star weekends like the recent aerial show in Orlando, Fla. There was a time, however fleeting, when he was more heralded, or perhaps merely hyped, than any other high school player in America. Time taken: 14.367 seconds hive>
OK!生效了,变成了一个Mapper!不过这里只手动插入了id=500000的索引:allen@allen-laptop:~$ hadoop dfs -cat /tmp/table02_index_data/000000_0; hdfs://localhost:9000/user/hive/warehouse/table02/000002_044170896 allen@allen-laptop:~$ (注意:输出中文件名000002_0与偏移量44170896之间有一个不可见的分隔符\001,终端上显示不出来,所以两者看起来连在了一起。)
至于Hive怎么做的,下篇文章探索吧