1.1 Compressing and decompressing .lzo files requires the lzop tool on the server; Hadoop's native library does not support LZO (hadoop checknative shows no LZO-related entry)
#Check whether the lzop command is available
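To see what the native library does cover, hadoop checknative can be run up front (a quick check; the exact list depends on how Hadoop was built, but lzo is not among the codecs it reports):
# List the compression codecs Hadoop's native library was built with
hadoop checknative -a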
[hadoop@hadoop001 software]$ which lzop
/bin/lzop
#If it is missing, run the following install commands as root
[root@hadoop001 ~]# yum install -y svn ncurses-devel
[root@hadoop001 ~]# yum install -y gcc gcc-c++ make cmake
[root@hadoop001 ~]# yum install -y openssl openssl-devel svn ncurses-devel zlib-devel libtool
[root@hadoop001 ~]# yum install -y lzo lzo-devel lzop autoconf automake cmake
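A quick sanity check that the installation succeeded (a minimal sketch):
# lzop should now resolve on the PATH and report its version
which lzop
lzop --version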
1.2 Compress the test data with the lzop tool
#Original data
[hadoop@hadoop001 log_data]$ ll
total 441152
-rw-r--r--. 1 hadoop hadoop 437156257 Apr 16 10:48 page_views.dat
[hadoop@hadoop001 log_data]$ du -sh *
431M page_views.dat
#lzo compress: lzop -v file; lzo decompress: lzop -dv file
[hadoop@hadoop001 log_data]$ lzop -v page_views.dat
compressing page_views.dat into page_views.dat.lzo
#Size after compression
[hadoop@hadoop001 log_data]$ du -sh *
417M page_views.dat
199M page_views.dat.lzo
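Before moving on, the compressed file can be verified with standard lzop options (a sketch; note that lzop keeps its input file by default, so -f is needed to overwrite the existing page_views.dat on decompression):
# Test the integrity of the compressed file
lzop -t page_views.dat.lzo
# Decompress verbosely, overwriting the existing original
lzop -dvf page_views.dat.lzo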
The hadoop-lzo source is open sourced on GitHub: https://github.com/twitter/hadoop-lzo
2.1 Build the source with mvn
#Extract
[hadoop@hadoop001 software]$ tar -xzvf hadoop-lzo-release-0.4.20.tar.gz -C ../app/source/
#For the mvn build, first update the Hadoop version in pom.xml; 2.6.0 is used locally
[hadoop@hadoop001 source]$ cd hadoop-lzo-release-0.4.20/
[hadoop@hadoop001 hadoop-lzo-release-0.4.20]$ vim pom.xml
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<hadoop.current.version>2.6.0</hadoop.current.version>
<hadoop.old.version>1.0.4</hadoop.old.version>
</properties>
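If liblzo2 is installed under a non-standard prefix, the native part of the build may fail to find the lzo headers and libraries; the hadoop-lzo README suggests pointing the compiler at them through environment variables (a sketch, assuming /usr/local as the prefix):
# Tell the native build where the lzo headers and libraries live
C_INCLUDE_PATH=/usr/local/include \
LIBRARY_PATH=/usr/local/lib \
  mvn clean package -Dmaven.test.skip=true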
#Build
[hadoop@hadoop001 hadoop-lzo-release-0.4.20]$ mvn clean package -Dmaven.test.skip=true
[INFO] Building jar: /home/hadoop/app/source/hadoop-lzo-release-0.4.20/target/hadoop-lzo-0.4.20-javadoc.jar
[INFO] ------------------------------------------------------------------------
[INFO] BUILD SUCCESS
[INFO] ------------------------------------------------------------------------
[INFO] Total time: 01:06 min
[INFO] Finished at: 2019-04-16T11:00:15-04:00
[INFO] Final Memory: 36M/516M
[INFO] ------------------------------------------------------------------------
#Inspect the jars produced by the build; hadoop-lzo-0.4.20.jar is the one we need
[hadoop@hadoop001 hadoop-lzo-release-0.4.20]$ cd target/
[hadoop@hadoop001 target]$ ll
total 424
drwxrwxr-x. 2 hadoop hadoop 4096 Apr 16 10:59 antrun
drwxrwxr-x. 4 hadoop hadoop 4096 Apr 16 11:00 apidocs
drwxrwxr-x. 5 hadoop hadoop 66 Apr 16 10:59 classes
drwxrwxr-x. 3 hadoop hadoop 25 Apr 16 10:59 generated-sources
-rw-rw-r--. 1 hadoop hadoop 188645 Apr 16 11:00 hadoop-lzo-0.4.20.jar
-rw-rw-r--. 1 hadoop hadoop 180128 Apr 16 11:00 hadoop-lzo-0.4.20-javadoc.jar
-rw-rw-r--. 1 hadoop hadoop 51984 Apr 16 11:00 hadoop-lzo-0.4.20-sources.jar
drwxrwxr-x. 2 hadoop hadoop 71 Apr 16 11:00 javadoc-bundle-options
drwxrwxr-x. 2 hadoop hadoop 28 Apr 16 11:00 maven-archiver
drwxrwxr-x. 3 hadoop hadoop 28 Apr 16 10:59 native
drwxrwxr-x. 3 hadoop hadoop 18 Apr 16 10:59 test-classes
#Copy hadoop-lzo-0.4.20.jar to Hadoop's common directory; on a cluster, copy it to every machine
[hadoop@hadoop001 target]$ cp hadoop-lzo-0.4.20.jar ~/app/hadoop/share/hadoop/common/
[hadoop@hadoop001 target]$ ll ~/app/hadoop/share/hadoop/common/hadoop-lzo*
-rw-rw-r--. 1 hadoop hadoop 188645 Apr 16 11:11 /home/hadoop/app/hadoop/share/hadoop/common/hadoop-lzo-0.4.20.jar
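The build also produces native libgplcompression libraries under target/native; depending on the environment, these may need to sit on Hadoop's native library path as well (a hedged sketch; the Linux-amd64-64 directory name assumes a 64-bit Linux build):
# Copy the GPL compression native libraries next to Hadoop's other native libs
cp target/native/Linux-amd64-64/lib/libgplcompression* ~/app/hadoop/lib/native/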
# Stop Hadoop
[hadoop@hadoop001 hadoop-lzo-master]$ stop-all.sh
#Edit core-site.xml, adding or modifying the following
[hadoop@hadoop001 ~]$ vim ~/app/hadoop-2.6.0-cdh5.7.0/etc/hadoop/core-site.xml
#The key entries are the com.hadoop.compression.lzo.LzoCodec and com.hadoop.compression.lzo.LzopCodec compression classes
#io.compression.codec.lzo.class must be set to LzoCodec, not LzopCodec; otherwise the compressed output files will not be splittable
<property>
<name>io.compression.codecs</name>
<value>org.apache.hadoop.io.compress.GzipCodec,
org.apache.hadoop.io.compress.DefaultCodec,
org.apache.hadoop.io.compress.BZip2Codec,
org.apache.hadoop.io.compress.SnappyCodec,
com.hadoop.compression.lzo.LzoCodec,
com.hadoop.compression.lzo.LzopCodec
</value>
</property>
<property>
<name>io.compression.codec.lzo.class</name>
<value>com.hadoop.compression.lzo.LzoCodec</value>
</property>
[hadoop@hadoop001 ~]$ vim ~/app/hadoop-2.6.0-cdh5.7.0/etc/hadoop/mapred-site.xml
#Compression of the intermediate (map output) stage; these are the legacy property names, equivalent to mapreduce.map.output.compress and mapreduce.map.output.compress.codec on Hadoop 2
<property>
<name>mapred.compress.map.output</name>
<value>true</value>
</property>
<property>
<name>mapred.map.output.compression.codec</name>
<value>com.hadoop.compression.lzo.LzoCodec</value>
</property>
#Compression of the final (job output) stage
<property>
<name>mapreduce.output.fileoutputformat.compress</name>
<value>true</value>
</property>
<property>
<name>mapreduce.output.fileoutputformat.compress.codec</name>
<value>org.apache.hadoop.io.compress.BZip2Codec</value>
</property>
If this is a cluster, core-site.xml and mapred-site.xml must likewise be synchronized to every machine; then start the cluster.
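Once the cluster is back up, codec registration can be checked without running a full job: hadoop fs -text decodes files through the configured codecs, so it should print an .lzo file as plain text (a sketch; the path is a placeholder):
# If LzopCodec is registered, -text transparently decompresses the .lzo file
hadoop fs -text /path/to/page_views.dat.lzo | head -3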
1. Hive split testing
--Compression not enabled
--Create a test table for LZO files; if Hadoop's common directory is missing the hadoop-lzo jar, this fails with a ClassNotFoundException for DeprecatedLzoTextInputFormat
create table page_views2_lzo(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
) row format delimited fields terminated by '\t'
STORED AS INPUTFORMAT "com.hadoop.mapred.DeprecatedLzoTextInputFormat"
OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat";
--Load the lzo-format test data
hive> load data local inpath '/home/hadoop/log_data/page_views.dat.lzo' overwrite into table page_views2_lzo;
Loading data to table test.page_views2_lzo
Table test.page_views2_lzo stats: [numFiles=1, numRows=0, totalSize=207749249, rawDataSize=0]
OK
Time taken: 1.009 seconds
#Inspect the data
[hadoop@hadoop001 hadoop]$ hadoop fs -du -s -h /user/hive/warehouse/test.db/page_views2_lzo/*
198.1 M 198.1 M /user/hive/warehouse/test.db/page_views2_lzo/page_views.dat.lzo
--Query test: the Map count is 1, because the un-indexed .lzo file is not splittable and is read as a single split
select count(1) from page_views2_lzo;
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 11.88 sec HDFS Read: 207756318 HDFS Write: 8 SUCCESS
Total MapReduce CPU Time Spent: 11 seconds 880 msec
Compression enabled
--With compression enabled, the generated files must use LzopCodec; LzoCodec output carries the .lzo_deflate suffix, and such files cannot be indexed.
SET hive.exec.compress.output=true;
SET mapreduce.output.fileoutputformat.compress.codec=com.hadoop.compression.lzo.LzopCodec;
--Create the LZO compression test table
create table page_views2_lzo_split
STORED AS INPUTFORMAT "com.hadoop.mapred.DeprecatedLzoTextInputFormat"
OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
as select * from page_views2_lzo;
#Inspect the data; the file suffix is .lzo
[hadoop@hadoop001 hadoop]$ hadoop fs -du -s -h /user/hive/warehouse/test.db/page_views2_lzo_split/*
196.8 M 196.8 M /user/hive/warehouse/test.db/page_views2_lzo_split/000000_0.lzo
#Build the LZO file index, using the tool class from the jar we built earlier
[hadoop@hadoop001 hadoop]$ hadoop jar ~/app/hadoop/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.LzoIndexer /user/hive/warehouse/test.db/page_views2_lzo_split
#Listing the HDFS data directory again shows an .index file alongside the lzo file
[hadoop@hadoop001 hadoop]$ hadoop fs -du -s -h /user/hive/warehouse/test.db/page_views2_lzo_split/*
196.8 M 196.8 M /user/hive/warehouse/test.db/page_views2_lzo_split/000000_0.lzo
13.9 K 13.9 K /user/hive/warehouse/test.db/page_views2_lzo_split/000000_0.lzo.index
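For directories containing many files, hadoop-lzo also ships a MapReduce-based indexer, com.hadoop.compression.lzo.DistributedLzoIndexer, which builds the indexes in parallel rather than one file at a time (a sketch using the same jar):
# Index all .lzo files under the directory with a MapReduce job
hadoop jar ~/app/hadoop/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer /user/hive/warehouse/test.db/page_views2_lzo_split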
--Query test: the Map count is now 2, since the index makes the file splittable
select count(1) from page_views2_lzo_split;
MapReduce Jobs Launched:
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 28.05 sec HDFS Read: 206448787 HDFS Write: 58 SUCCESS
Total MapReduce CPU Time Spent: 28 seconds 50 msec
OK
2298975
Time taken: 28.621 seconds, Fetched: 1 row(s)
So LZO does support splitting once an index has been built.
Of the compression formats commonly used in big data, only bzip2 is splittable out of the box; LZO becomes splittable only after an index is built for the file.