1. 生成测试数据
在TPC-H的官网 http://www.tpc.org/tpch/ 上下载dbgen工具(用于生成测试数据),下载地址为 http://www.tpc.org/tpch/spec/tpch_2_17_0.zip
[root@ip-172-31-10-151 tpch]# wget http://www.tpc.org/tpch/spec/tpch_2_17_0.zip
解压,到dbgen目录下,复制makefile.suite到makefile并作如下修改
[root@ip-172-31-10-151 tpch]# yum install unzip
[root@ip-172-31-10-151 tpch]# unzip tpch_2_17_0.zip
[root@ip-172-31-10-151 tpch]# ls __MACOSX tpch_2_17_0 tpch_2_17_0.zip [root@ip-172-31-10-151 tpch]# cd tpch_2_17_0 [root@ip-172-31-10-151 tpch_2_17_0]# ls dbgen dev-tools ref_data [root@ip-172-31-10-151 tpch_2_17_0]# cd dbgen/ [root@ip-172-31-10-151 dbgen]# ls BUGS README bcd2.h check_answers dbgen.dsp dss.ddl dsstypes.h permute.c qgen.c reference rnd.h shared.h text.c tpch.sln variants HISTORY answers bm_utils.c column_split.sh dists.dss dss.h load_stub.c permute.h qgen.vcproj release.h rng64.c speed_seed.c tpcd.h tpch.vcproj varsub.c PORTING.NOTES bcd2.c build.c config.h driver.c dss.ri makefile.suite print.c queries rnd.c rng64.h tests tpch.dsw update_release.sh [root@ip-172-31-10-151 dbgen]# cp makefile.suite makefile
[root@ip-172-31-10-151 dbgen]# vi makefile
################ ## CHANGE NAME OF ANSI COMPILER HERE ################ CC = gcc # Current values for DATABASE are: INFORMIX, DB2, TDAT (Teradata) # SQLSERVER, SYBASE, ORACLE, VECTORWISE # Current values for MACHINE are: ATT, DOS, HP, IBM, ICL, MVS, # SGI, SUN, U2200, VMS, LINUX, WIN32 # Current values for WORKLOAD are: TPCH DATABASE= ORACLE MACHINE = LINUX WORKLOAD = TPCH
编译代码:
make
编译完成之后会在当前目录下生成dbgen
运行./dbgen -help查看如何使用
jfp4-1:/mnt/disk1/tpch_2_17_0/dbgen # ./dbgen -help TPC-H Population Generator (Version 2.17.0 build 0) Copyright Transaction Processing Performance Council 1994 - 2010 USAGE: dbgen [-{vf}][-T {pcsoPSOL}] [-s <scale>][-C <procs>][-S <step>] dbgen [-v] [-O m] [-s <scale>] [-U <updates>] Basic Options =========================== -C <n> -- separate data set into <n> chunks (requires -S, default: 1) -f -- force. Overwrite existing files -h -- display this message -q -- enable QUIET mode -s <n> -- set Scale Factor (SF) to <n> (default: 1) -S <n> -- build the <n>th step of the data/update set (used with -C or -U) -U <n> -- generate <n> update sets -v -- enable VERBOSE mode Advanced Options =========================== -b <s> -- load distributions for <s> (default: dists.dss) -d <n> -- split deletes between <n> files (requires -U) -i <n> -- split inserts between <n> files (requires -U) -T c -- generate cutomers ONLY -T l -- generate nation/region ONLY -T L -- generate lineitem ONLY -T n -- generate nation ONLY -T o -- generate orders/lineitem ONLY -T O -- generate orders ONLY -T p -- generate parts/partsupp ONLY -T P -- generate parts ONLY -T r -- generate region ONLY -T s -- generate suppliers ONLY -T S -- generate partsupp ONLY To generate the SF=1 (1GB), validation database population, use: dbgen -vf -s 1 To generate updates for a SF=1 (1GB), use: dbgen -v -U 1 -s 1
运行./dbgen -s 1024生成1TB数据
jfp4-1:/mnt/disk1/tpch_2_17_0/dbgen # ll *.tbl -rw-r--r-- 1 root root 25384864295 Jul 3 23:04 customer.tbl -rw-r--r-- 1 root root 833545019752 Jul 3 23:04 lineitem.tbl -rw-r--r-- 1 root root 2224 Jul 3 23:04 nation.tbl -rw-r--r-- 1 root root 185305368911 Jul 3 23:04 orders.tbl -rw-r--r-- 1 root root 25329003396 Jul 3 23:04 part.tbl -rw-r--r-- 1 root root 126691192078 Jul 3 23:04 partsupp.tbl -rw-r--r-- 1 root root 389 Jul 3 23:04 region.tbl -rw-r--r-- 1 root root 1473459356 Jul 3 23:04 supplier.tbl
将数据移动到一个单独的目录
mkdir ../data1024g
mv *.tbl ../data1024g
2. 下载impala版本的TPC-H脚本
建立原始表lineitem,存储为text文件,大小776GB
jfp4-1:/mnt/disk1/tpch_2_17_0/dbgen # hdfs dfs -du /shaochen/tpch 25384864295 /shaochen/tpch/customer 833545019752 /shaochen/tpch/lineitem 2224 /shaochen/tpch/nation 185305368911 /shaochen/tpch/orders 25329003396 /shaochen/tpch/part 126691192078 /shaochen/tpch/partsupp 389 /shaochen/tpch/region 1473459356 /shaochen/tpch/supplier
Create external table lineitem (L_ORDERKEY INT, L_PARTKEY INT, L_SUPPKEY INT, L_LINENUMBER INT, L_QUANTITY DOUBLE, L_EXTENDEDPRICE DOUBLE, L_DISCOUNT DOUBLE, L_TAX DOUBLE, L_RETURNFLAG STRING, L_LINESTATUS STRING, L_SHIPDATE STRING, L_COMMITDATE STRING, L_RECEIPTDATE STRING, L_SHIPINSTRUCT STRING, L_SHIPMODE STRING, L_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' LOCATION '/shaochen/tpch/lineitem';
从原始text表中统计记录条数:
[jfp4-1:21000] > select count(*) from lineitem; Query: select count(*) from lineitem +------------+ | count(*) | +------------+ | 6144008876 | +------------+ Returned 1 row(s) in 856.47s
在脚本运行过程中,观察到Cluster Disk IO速度平均接近1GB/s,原始数据为776GB,由于是IO密集型操作,估算应该在776GB÷1GB/s≈800s左右完成。实际耗时856s,符合预期。
将lineitem表保存为parquet格式:
[jfp4-1:21000] > insert overwrite lineitem_parquet select * from lineitem; Query: insert overwrite lineitem_parquet select * from lineitem Inserted 6144008876 rows in 3780.52s
在脚本运行过程中,该SQL由于涉及到parquet文件的转换和Snappy压缩,属于混合型(IO密集+CPU密集)操作,观察到Cluster Disk IO中读速率均值约为210MB/s,估算在776GB÷0.2GB/s≈3800秒左右完成。实际耗时3780秒,符合预期。
根据写速率约为140MB/s,写入的parquet数据总量约为3800s×0.14GB/s≈532GB,再除以复制因子3,单副本大小约为180GB。
jfp4-1:/mnt/disk1/tpch_2_17_0/dbgen # hdfs dfs -du -h /user/hive/warehouse/tpch.db 200.9 G /user/hive/warehouse/tpch.db/lineitem_parquet 546 /user/hive/warehouse/tpch.db/q1_pricing_summary_report
真实的parquet文件大小为200G,符合预期。
再次统计记录条数:
[jfp4-1:21000] > select count(*) from lineitem_parquet; Query: select count(*) from lineitem_parquet +------------+ | count(*) | +------------+ | 6144008876 | +------------+ Returned 1 row(s) in 18.04s
在text文件格式上运行Q1:
[jfp4-1:21000] > -- the query > INSERT OVERWRITE TABLE q1_pricing_summary_report > SELECT > L_RETURNFLAG, L_LINESTATUS, SUM(L_QUANTITY), SUM(L_EXTENDEDPRICE), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)*(1+L_TAX)), AVG(L_QUANTITY), AVG(L_EXTENDEDPRICE), AVG(L_DISCOUNT), cast(COUNT(1) as int) > FROM > lineitem > WHERE > L_SHIPDATE<='1998-09-02' > GROUP BY L_RETURNFLAG, L_LINESTATUS > ORDER BY L_RETURNFLAG, L_LINESTATUS > LIMIT 2147483647; Query: INSERT OVERWRITE TABLE q1_pricing_summary_report SELECT L_RETURNFLAG, L_LINESTATUS, SUM(L_QUANTITY), SUM(L_EXTENDEDPRICE), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)*(1+L_TAX)), AVG(L_QUANTITY), AVG(L_EXTENDEDPRICE), AVG(L_DISCOUNT), cast(COUNT(1) as int) FROM lineitem WHERE L_SHIPDATE<='1998-09-02' GROUP BY L_RETURNFLAG, L_LINESTATUS ORDER BY L_RETURNFLAG, L_LINESTATUS LIMIT 2147483647 ^C[jfp4-1:21000] > INSERT OVERWRITE TABLE q1_pricing_summary_report > SELECT > L_RETURNFLAG, L_LINESTATUS, SUM(L_QUANTITY), SUM(L_EXTENDEDPRICE), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)*(1+L_TAX)), AVG(L_QUANTITY), AVG(L_EXTENDEDPRICE), AVG(L_DISCOUNT), cast(COUNT(1) as int) > FROM > lineitem > WHERE > L_SHIPDATE<='1998-09-02' > GROUP BY L_RETURNFLAG, L_LINESTATUS > ORDER BY L_RETURNFLAG, L_LINESTATUS > LIMIT 2147483647; Query: insert OVERWRITE TABLE q1_pricing_summary_report SELECT L_RETURNFLAG, L_LINESTATUS, SUM(L_QUANTITY), SUM(L_EXTENDEDPRICE), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)*(1+L_TAX)), AVG(L_QUANTITY), AVG(L_EXTENDEDPRICE), AVG(L_DISCOUNT), cast(COUNT(1) as int) FROM lineitem WHERE L_SHIPDATE<='1998-09-02' GROUP BY L_RETURNFLAG, L_LINESTATUS ORDER BY L_RETURNFLAG, L_LINESTATUS LIMIT 2147483647 Inserted 4 rows in 823.57s
查看查询计划:
[jfp4-1:21000] > explain INSERT OVERWRITE TABLE q1_pricing_summary_report > SELECT > L_RETURNFLAG, L_LINESTATUS, SUM(L_QUANTITY), SUM(L_EXTENDEDPRICE), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)*(1+L_TAX)), AVG(L_QUANTITY), AVG(L_EXTENDEDPRICE), AVG(L_DISCOUNT), cast(COUNT(1) as int) > FROM > lineitem > WHERE > L_SHIPDATE<='1998-09-02' > GROUP BY L_RETURNFLAG, L_LINESTATUS > ORDER BY L_RETURNFLAG, L_LINESTATUS > LIMIT 2147483647; Query: explain INSERT OVERWRITE TABLE q1_pricing_summary_report SELECT L_RETURNFLAG, L_LINESTATUS, SUM(L_QUANTITY), SUM(L_EXTENDEDPRICE), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)*(1+L_TAX)), AVG(L_QUANTITY), AVG(L_EXTENDEDPRICE), AVG(L_DISCOUNT), cast(COUNT(1) as int) FROM lineitem WHERE L_SHIPDATE<='1998-09-02' GROUP BY L_RETURNFLAG, L_LINESTATUS ORDER BY L_RETURNFLAG, L_LINESTATUS LIMIT 2147483647 +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Explain String | +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Estimated Per-Host Requirements: Memory=208.13GB VCores=2 | | WARNING: The following tables are missing relevant table and/or column statistics. 
| | tpch.lineitem | | | | WRITE TO HDFS [tpch.q1_pricing_summary_report, OVERWRITE=true] | | | partitions=1 | | | | | 06:TOP-N [LIMIT=2147483647] | | | order by: L_RETURNFLAG ASC, L_LINESTATUS ASC | | | | | 05:EXCHANGE [PARTITION=UNPARTITIONED] | | | | | 02:TOP-N [LIMIT=2147483647] | | | order by: L_RETURNFLAG ASC, L_LINESTATUS ASC | | | | | 04:AGGREGATE [MERGE FINALIZE] | | | output: sum(sum(L_QUANTITY)), sum(sum(L_EXTENDEDPRICE)), sum(sum(L_EXTENDEDPRICE * (1.0 - L_DISCOUNT))), sum(sum(L_EXTENDEDPRICE * (1.0 - L_DISCOUNT) * (1.0 + L_TAX))), sum(count(L_QUANTITY)), sum(count(L_EXTENDEDPRICE)), sum(sum(L_DISCOUNT)), sum(count(L_DISCOUNT)), sum(count(1)) | | | group by: L_RETURNFLAG, L_LINESTATUS | | | | | 03:EXCHANGE [PARTITION=HASH(L_RETURNFLAG,L_LINESTATUS)] | | | | | 01:AGGREGATE | | | output: sum(L_QUANTITY), sum(L_EXTENDEDPRICE), sum(L_EXTENDEDPRICE * (1.0 - L_DISCOUNT)), sum(L_EXTENDEDPRICE * (1.0 - L_DISCOUNT) * (1.0 + L_TAX)), count(L_QUANTITY), count(L_EXTENDEDPRICE), sum(L_DISCOUNT), count(L_DISCOUNT), count(1) | | | group by: L_RETURNFLAG, L_LINESTATUS | | | | | 00:SCAN HDFS [tpch.lineitem] | | partitions=1/1 size=776.30GB | | predicates: L_SHIPDATE <= '1998-09-02' | +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ Returned 28 row(s) in 0.15s
计算一下表的统计信息:
[jfp4-1:21000] > compute stats lineitem; Query: compute stats lineitem +------------------------------------------+ | summary | +------------------------------------------+ | Updated 1 partition(s) and 16 column(s). | +------------------------------------------+ Returned 1 row(s) in 5894.34s
根据执行结果,发现compute stats原来如此花费时间!观察执行过程,前15分钟的Disk IO非常高,达到900MB/s左右,基本上集群中所有的磁盘都在满负荷地读文件,之后的IO也保持在130MB/s左右。看来compute stats是一个昂贵的操作。
在parquet表上统计一下:
[jfp4-1:21000] > compute stats lineitem_parquet;
Query: compute stats lineitem_parquet
Query aborted.
[jfp4-1:21000] > SET
> NUM_SCANNER_THREADS=2
> ;
NUM_SCANNER_THREADS set to 2
[jfp4-1:21000] > compute stats lineitem_parquet;
Query: compute stats lineitem_parquet
+------------------------------------------+
| summary |
+------------------------------------------+
| Updated 1 partition(s) and 16 column(s). |
+------------------------------------------+
Returned 1 row(s) in 5176.29s
[jfp4-1:21000] >
注意:需要先设置NUM_SCANNER_THREADS(上面设为2),compute stats才能成功
查看snappy压缩对parquet表的压缩和查询效率的影响:
[jfp4-1:21000] > set PARQUET_COMPRESSION_CODEC=snappy; PARQUET_COMPRESSION_CODEC set to snappy [jfp4-1:21000] > create table lineitem_parquet_snappy (L_ORDERKEY INT, L_PARTKEY INT, L_SUPPKEY INT, L_LINENUMBER INT, L_QUANTITY DOUBLE, L_EXTENDEDPRICE DOUBLE, L_DISCOUNT DOUBLE, L_TAX DOUBLE, L_RETURNFLAG STRING, L_LINESTATUS STRING, L_SHIPDATE STRING, L_COMMITDATE STRING, L_RECEIPTDATE STRING, L_SHIPINSTRUCT STRING, L_SHIPMODE STRING, L_COMMENT STRING) STORED AS PARQUET; Query: create table lineitem_parquet_snappy (L_ORDERKEY INT, L_PARTKEY INT, L_SUPPKEY INT, L_LINENUMBER INT, L_QUANTITY DOUBLE, L_EXTENDEDPRICE DOUBLE, L_DISCOUNT DOUBLE, L_TAX DOUBLE, L_RETURNFLAG STRING, L_LINESTATUS STRING, L_SHIPDATE STRING, L_COMMITDATE STRING, L_RECEIPTDATE STRING, L_SHIPINSTRUCT STRING, L_SHIPMODE STRING, L_COMMENT STRING) STORED AS PARQUET Returned 0 row(s) in 0.30s
[jfp4-1:21000] > insert overwrite lineitem_parquet_snappy select * from lineitem;
Query: insert overwrite lineitem_parquet_snappy select * from lineitem
Inserted 6144008876 rows in 3836.99s
查看snappy表的大小:
jfp4-1:~ # hdfs dfs -du -h /user/hive/warehouse/tpch.db 200.9 G /user/hive/warehouse/tpch.db/lineitem_parquet 200.9 G /user/hive/warehouse/tpch.db/lineitem_parquet_snappy 546 /user/hive/warehouse/tpch.db/q1_pricing_summary_report
发现lineitem_parquet_snappy和lineitem_parquet大小是一样的,可见impala的parquet表默认就是用snappy压缩的。
接下来写入一张不压缩的parquet表lineitem_parquet_raw作对比(建表步骤未在此列出):
[jfp4-1:21000] > insert overwrite lineitem_parquet_raw select * from lineitem; Query: insert overwrite lineitem_parquet_raw select * from lineitem Inserted 6144008876 rows in 4063.22s
snappy压缩的parquet在写数据上比不压缩的parquet还是节省了一些时间的(3837s对4063s)!
看看raw parquet的大小:
jfp4-1:~ # hdfs dfs -du -h /user/hive/warehouse/tpch.db 200.9 G /user/hive/warehouse/tpch.db/lineitem_parquet 319.2 G /user/hive/warehouse/tpch.db/lineitem_parquet_raw 200.9 G /user/hive/warehouse/tpch.db/lineitem_parquet_snappy 546 /user/hive/warehouse/tpch.db/q1_pricing_summary_report
看看gzip+parquet的效果:
[jfp4-1:21000] > set PARQUET_COMPRESSION_CODEC=gzip; PARQUET_COMPRESSION_CODEC set to gzip [jfp4-1:21000] > create table lineitem_parquet_gzip (L_ORDERKEY INT, L_PARTKEY INT, L_SUPPKEY INT, L_LINENUMBER INT, L_QUANTITY DOUBLE, L_EXTENDEDPRICE DOUBLE, L_DISCOUNT DOUBLE, L_TAX DOUBLE, L_RETURNFLAG STRING, L_LINESTATUS STRING, L_SHIPDATE STRING, L_COMMITDATE STRING, L_RECEIPTDATE STRING, L_SHIPINSTRUCT STRING, L_SHIPMODE STRING, L_COMMENT STRING) STORED AS PARQUET; Query: create table lineitem_parquet_gzip (L_ORDERKEY INT, L_PARTKEY INT, L_SUPPKEY INT, L_LINENUMBER INT, L_QUANTITY DOUBLE, L_EXTENDEDPRICE DOUBLE, L_DISCOUNT DOUBLE, L_TAX DOUBLE, L_RETURNFLAG STRING, L_LINESTATUS STRING, L_SHIPDATE STRING, L_COMMITDATE STRING, L_RECEIPTDATE STRING, L_SHIPINSTRUCT STRING, L_SHIPMODE STRING, L_COMMENT STRING) STORED AS PARQUET Returned 0 row(s) in 0.26s [jfp4-1:21000] > insert overwrite lineitem_parquet_gzip select * from lineitem; Query: insert overwrite lineitem_parquet_gzip select * from lineitem Inserted 6144008876 rows in 9090.71s
jfp4-1:~ # hdfs dfs -du -h /user/hive/warehouse/tpch.db 200.9 G /user/hive/warehouse/tpch.db/lineitem_parquet 155.1 G /user/hive/warehouse/tpch.db/lineitem_parquet_gzip 319.2 G /user/hive/warehouse/tpch.db/lineitem_parquet_raw 200.9 G /user/hive/warehouse/tpch.db/lineitem_parquet_snappy 546 /user/hive/warehouse/tpch.db/q1_pricing_summary_report
[jfp4-1:21000] > select count(*) from lineitem_parquet_gzip; Query: select count(*) from lineitem_parquet_gzip +------------+ | count(*) | +------------+ | 6144008876 | +------------+ Returned 1 row(s) in 18.54s