hadoop编译支持LZO压缩格式

hadoop编译支持LZO压缩格式

1.lzo安装

1.1 lzo格式文件的压缩和解压需要用到服务器上的lzop工具,hadoop的native库并不支持(hadoop checknative的输出中没有lzo、zip相关信息)

#检查是否有lzop命令
[hadoop@hadoop001 software]$ which lzop
/bin/lzop
#若没有,执行如下安装命令
[root@hadoop001 ~]# yum install -y svn ncurses-devel
[root@hadoop001 ~]# yum install -y gcc gcc-c++ make cmake
[root@hadoop001 ~]# yum install -y openssl openssl-devel svn ncurses-devel zlib-devel libtool
[root@hadoop001 ~]# yum install -y lzo lzo-devel lzop autoconf automake cmake 

1.2 使用lzop工具压缩测试数据

#原始数据
[hadoop@hadoop001 log_data]$ ll
total 441152
-rw-r--r--. 1 hadoop hadoop 437156257 Apr 16 10:48 page_views.dat
[hadoop@hadoop001 log_data]$ du -sh *
431M    page_views.dat
#lzo压缩:lzop -v file  lzo解压:lzop -dv file
[hadoop@hadoop001 log_data]$ lzop -v page_views.dat 
compressing page_views.dat into page_views.dat.lzo
#压缩后的大小
[hadoop@hadoop001 log_data]$ du -sh *               
417M    page_views.dat
199M    page_views.dat.lzo

2.编译hadoop-lzo

hadoop-lzo的源码在GitHub上是开源的,源码地址:https://github.com/twitter/hadoop-lzo

2.1 mvn编译源码

#解压
[hadoop@hadoop001 software]$ tar -xzvf hadoop-lzo-release-0.4.20.tar.gz -C ../app/source/

#mvn编译,先把pom文件里边的hadoop版本修改一下,本地使用的是2.6.0
[hadoop@hadoop001 source]$ cd hadoop-lzo-release-0.4.20/
[hadoop@hadoop001 hadoop-lzo-release-0.4.20]$ vim pom.xml
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <hadoop.current.version>2.6.0</hadoop.current.version>
    <hadoop.old.version>1.0.4</hadoop.old.version>
</properties>

#编译
[hadoop@hadoop001 hadoop-lzo-release-0.4.20]$ mvn clean package -Dmaven.test.skip=true
[INFO] Building jar: /home/hadoop/app/source/hadoop-lzo-release-0.4.20/target/hadoop-lzo-0.4.20-javadoc.jar
[INFO] ------------------------------------------------------------------------
[INFO] BUILD SUCCESS
[INFO] ------------------------------------------------------------------------
[INFO] Total time: 01:06 min
[INFO] Finished at: 2019-04-16T11:00:15-04:00
[INFO] Final Memory: 36M/516M
[INFO] ------------------------------------------------------------------------

#查看编译之后的jar包,hadoop-lzo-0.4.20.jar是我们需要使用的jar包
[hadoop@hadoop001 hadoop-lzo-release-0.4.20]$ cd target/
[hadoop@hadoop001 target]$ ll
total 424
drwxrwxr-x. 2 hadoop hadoop   4096 Apr 16 10:59 antrun
drwxrwxr-x. 4 hadoop hadoop   4096 Apr 16 11:00 apidocs
drwxrwxr-x. 5 hadoop hadoop     66 Apr 16 10:59 classes
drwxrwxr-x. 3 hadoop hadoop     25 Apr 16 10:59 generated-sources
-rw-rw-r--. 1 hadoop hadoop 188645 Apr 16 11:00 hadoop-lzo-0.4.20.jar
-rw-rw-r--. 1 hadoop hadoop 180128 Apr 16 11:00 hadoop-lzo-0.4.20-javadoc.jar
-rw-rw-r--. 1 hadoop hadoop  51984 Apr 16 11:00 hadoop-lzo-0.4.20-sources.jar
drwxrwxr-x. 2 hadoop hadoop     71 Apr 16 11:00 javadoc-bundle-options
drwxrwxr-x. 2 hadoop hadoop     28 Apr 16 11:00 maven-archiver
drwxrwxr-x. 3 hadoop hadoop     28 Apr 16 10:59 native
drwxrwxr-x. 3 hadoop hadoop     18 Apr 16 10:59 test-classes

3.配置hadoop

3.1上传hadoop-lzo.jar

#将hadoop-lzo-0.4.20.jar 复制到hadoop的common目录,如果是集群,复制到每台机器上
[hadoop@hadoop001 target]$ cp hadoop-lzo-0.4.20.jar ~/app/hadoop/share/hadoop/common/
[hadoop@hadoop001 target]$ ll  ~/app/hadoop/share/hadoop/common/hadoop-lzo*
-rw-rw-r--. 1 hadoop hadoop 188645 Apr 16 11:11 /home/hadoop/app/hadoop/share/hadoop/common/hadoop-lzo-0.4.20.jar

3.2配置core-site.xml和mapred-site.xml

# 停止hadoop
[hadoop@hadoop001 hadoop-lzo-master]$ stop-all.sh 

#编辑core-site.xml添加或修改如下内容
[hadoop@hadoop001 ~]$ vim ~/app/hadoop-2.6.0-cdh5.7.0/etc/hadoop/core-site.xml 
#主要是配置com.hadoop.compression.lzo.LzoCodec、com.hadoop.compression.lzo.LzopCodec压缩类
#io.compression.codec.lzo.class必须指定为LzoCodec非LzopCodec,不然压缩后的文件不会支持分片的
<property>
	<name>io.compression.codecs</name>
    <value>org.apache.hadoop.io.compress.GzipCodec,
		org.apache.hadoop.io.compress.DefaultCodec,
		org.apache.hadoop.io.compress.BZip2Codec,
		org.apache.hadoop.io.compress.SnappyCodec,
		com.hadoop.compression.lzo.LzoCodec,
		com.hadoop.compression.lzo.LzopCodec
    </value>
</property>
<property>
<name>io.compression.codec.lzo.class</name>
<value>com.hadoop.compression.lzo.LzoCodec</value>
</property>

[hadoop@hadoop001 ~]$ vim ~/app/hadoop-2.6.0-cdh5.7.0/etc/hadoop/mapred-site.xml
#中间阶段的压缩
<property>    
    <name>mapred.compress.map.output</name>    
    <value>true</value>    
</property>
<property>    
    <name>mapred.map.output.compression.codec</name>    
    <value>com.hadoop.compression.lzo.LzoCodec</value>    
</property>

#最终阶段的压缩
<property>
   <name>mapreduce.output.fileoutputformat.compress</name>
   <value>true</value>
</property>

<property>
   <name>mapreduce.output.fileoutputformat.compress.codec</name>
   <value>org.apache.hadoop.io.compress.BZip2Codec</value>
</property>	

如果是集群,每台机器上的 core-site.xml 和 mapred-site.xml 这两个文件都要同步修改,然后启动集群。

4.LZO文件测试

4.1 hive测试分片

--不开启压缩
--创建LZO压缩文件测试表,若hadoop的common目录下没有hadoop-lzo的jar包,就会报找不到类DeprecatedLzoTextInputFormat的异常
create table page_views2_lzo(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
) row format delimited fields terminated by '\t'
STORED AS INPUTFORMAT "com.hadoop.mapred.DeprecatedLzoTextInputFormat"
OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat";

--加载lzo格式的测试数据
hive> load data local inpath '/home/hadoop/log_data/page_views.dat.lzo' overwrite into table page_views2_lzo;
Loading data to table test.page_views2_lzo
Table test.page_views2_lzo stats: [numFiles=1, numRows=0, totalSize=207749249, rawDataSize=0]
OK
Time taken: 1.009 seconds
#查看数据
[hadoop@hadoop001 hadoop]$ hadoop fs -du -s -h /user/hive/warehouse/test.db/page_views2_lzo/*
198.1 M  198.1 M  /user/hive/warehouse/test.db/page_views2_lzo/page_views.dat.lzo
--查询测试,可以看到Map的数量为1
select count(1) from page_views2_lzo;
MapReduce Jobs Launched: 
Stage-Stage-1: Map: 1  Reduce: 1   Cumulative CPU: 11.88 sec   HDFS Read: 207756318 HDFS Write: 8 SUCCESS
Total MapReduce CPU Time Spent: 11 seconds 880 msec

开启压缩

--开启压缩,生成的压缩文件格式必须设置为LzopCodec;LzoCodec生成的压缩文件后缀为.lzo_deflate,是无法创建索引的。
SET hive.exec.compress.output=true;
SET mapreduce.output.fileoutputformat.compress.codec=com.hadoop.compression.lzo.LzopCodec;

--创建LZO压缩文件测试表
create table page_views2_lzo_split
STORED AS INPUTFORMAT "com.hadoop.mapred.DeprecatedLzoTextInputFormat"
OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
as select *  from page_views2_lzo;
#查看数据,文件后缀为.lzo
[hadoop@hadoop001 hadoop]$ hadoop fs -du -s -h /user/hive/warehouse/test.db/page_views2_lzo_split/*
196.8 M  196.8 M  /user/hive/warehouse/test.db/page_views2_lzo_split/000000_0.lzo

#构建LZO文件索引,使用我们之前打的jar包中的工具类
[hadoop@hadoop001 hadoop]$ hadoop jar ~/app/hadoop/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.LzoIndexer /user/hive/warehouse/test.db/page_views2_lzo_split
#查询hdfs数据目录,可知在lzo文件同级目录有个.index索引文件
[hadoop@hadoop001 hadoop]$ hadoop fs -du -s -h /user/hive/warehouse/test.db/page_views2_lzo_split/*
196.8 M  196.8 M  /user/hive/warehouse/test.db/page_views2_lzo_split/000000_0.lzo
13.9 K  13.9 K  /user/hive/warehouse/test.db/page_views2_lzo_split/000000_0.lzo.index
--查询测试,可以看到Map的数量为2
select count(1) from page_views2_lzo_split;
MapReduce Jobs Launched: 
Stage-Stage-1: Map: 2  Reduce: 1   Cumulative CPU: 28.05 sec   HDFS Read: 206448787 HDFS Write: 58 SUCCESS
Total MapReduce CPU Time Spent: 28 seconds 50 msec
OK
2298975
Time taken: 28.621 seconds, Fetched: 1 row(s)

所以构建索引之后lzo是支持数据分片的。

大数据中常见的压缩格式只有bzip2是支持数据分片的,lzo在文件构建索引后才会支持数据分片

你可能感兴趣的:(hadoop,hadoop,lzo,压缩)