
Hadoop 2.2.0 Basic Programming Examples

1. Default MapReduce

import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
                      
public class DefaultMapReduce extends Configured
   implements Tool {
     @Override
     public int run(String[] args) throws Exception {
         Configuration conf = getConf();
         Job job = Job.getInstance(conf);
         job.setJarByClass(getClass());
         job.setNumReduceTasks(0);
         FileInputFormat.addInputPath(job, new Path(args[0]));
         FileOutputFormat.setOutputPath(job, new Path(args[1]));

         return job.waitForCompletion(true) ? 0 : 1;
     }
     public static void main(String[] args) throws Exception {
         ToolRunner.run(new DefaultMapReduce(), args);
     }
}

 

Note:

No map function is defined here, yet the program still runs. This is because Hadoop's new API replaces the mapred classes (interface-based) with the mapreduce classes (abstract-class-based), and the new Mapper abstract class already provides a runnable map method. Part of its source is shown below:

public void run(Context context) throws IOException, InterruptedException {
   setup(context);
   try {
     while (context.nextKeyValue()) {
       map(context.getCurrentKey(), context.getCurrentValue(), context);
     }
   } finally {
     cleanup(context);
   }
}
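For reference, the map() method that this run() loop invokes is, by default, essentially an identity function, which is why the job above works without defining a Mapper. A sketch of the default behavior (paraphrased, not copied verbatim from the Hadoop source):

// Default map(): pass the input key/value straight through to the output.
// (KEYIN/VALUEIN must therefore be compatible with KEYOUT/VALUEOUT.)
protected void map(KEYIN key, VALUEIN value, Context context)
    throws IOException, InterruptedException {
  context.write((KEYOUT) key, (VALUEOUT) value);
}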

2. SequenceFileWritDemo

import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IOUtils;
   
public class SequenceFileWritDemo {
   private static final String[] DATA = {
     "one, two, buckle my shoe" ,
     "Three, four, shut the door"
   };
   public static void main(String[] args) throws IOException {
     String uri = args[ 0 ];
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(URI.create(uri), conf);
     Path path = new Path(uri);
   
     IntWritable key = new IntWritable();
     Text value = new Text();
     SequenceFile.Writer writer = null ;
     try {
       writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(path),
           SequenceFile.Writer.keyClass(key.getClass()), SequenceFile.Writer.valueClass(value.getClass()));
   
       for ( int i = 0 ; i < 100 ; i++ ) {
         key.set( 100 - i);
   
         value.set(DATA[i % DATA.length]);
   
         System.out.printf( "[%s]\t%s\t%s\n" , writer.getLength(), key, value);
         writer.append(key, value);
       }
     } finally {
       IOUtils.closeStream(writer);
     }
   }
}

 

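The block above only writes a SequenceFile. For completeness, here is a minimal read-side sketch using the same Hadoop 2.x option-based API (SequenceFile.Reader.file()); the class name SequenceFileReadDemo is illustrative and not part of the original post:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileReadDemo {
   public static void main(String[] args) throws Exception {
     String uri = args[0];
     Configuration conf = new Configuration();
     Path path = new Path(uri);

     SequenceFile.Reader reader = null;
     try {
       // Open the file written by SequenceFileWritDemo via the Reader.Option API.
       reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
       IntWritable key = new IntWritable();
       Text value = new Text();
       // Iterate over all key/value pairs until next() returns false.
       while (reader.next(key, value)) {
         System.out.printf("%s\t%s\n", key, value);
       }
     } finally {
       IOUtils.closeStream(reader);
     }
   }
}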

3. Write & read a MapFile

import java.io.IOException;
                   
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.MapFile.Writer;
import org.apache.hadoop.io.MapFile.Reader;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
                   
public class MyMapFile {
                   
   static private final String[] DATA =  {
       "this is the first" ,
       "this is the second" ,
       "this is the third" ,
       "this is the forth"
     }; 
                   
   public static void main(String[] args) throws IOException {
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
     String uri = args[ 0 ];
                   
     IntWritable key = new IntWritable();
     Text val = new Text();
                   
     MapFile.Writer writer = new MapFile.Writer(conf, new Path(uri),
Writer.keyClass(key.getClass()),Writer.valueClass(val.getClass()));
                   
     for ( int i = 0 ; i < 10 ; i++ ) {
       key.set( i + 1 );
       val.set(DATA[ i % DATA.length ]);
       writer.append(key, val);
     }
     writer.close();
                   
     MapFile.Reader reader = new MapFile.Reader( new Path(uri), conf);
                   
     while ( reader.next(key, val) ){
       System.out.println( key + "\t" + val );
     }
     reader.close();
   }
}

 

4. Configuration Printer

import java.util.Map.Entry;
                  
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.conf.Configured;
                  
public class ConfigurationPrinter extends Configured implements Tool {
                  
   @Override
   public int run(String[] args) throws Exception {
     Configuration.addDefaultResource(args[ 0 ]);
     Configuration.addDefaultResource(args[ 1 ]);
                  
     Configuration conf = getConf();
     for (Entry<String, String> entry : conf) {
       System.out.printf( "%s=%s\n" , entry.getKey(), entry.getValue());
     }  
     return 0 ;
   }
                  
   public static void main(String[] args) throws Exception {
     ToolRunner.run( new ConfigurationPrinter(), args);
   }
}

 

5. Computing the maximum temperature (using the new 2.2.0 API)

Deprecated: all constructors of the Job class; the new API uses the static method getInstance(conf) to obtain a Job instance.

import java.io.IOException;
import java.util.Iterator;
                 
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
                 
public class getMaxTemperature 
   extends Configured implements Tool {
                 
   static class MaxTemperatureMapper
     extends Mapper<LongWritable, Text, Text, IntWritable> {
     @Override
     public void map(LongWritable key, Text val, Context context)
       throws IOException, InterruptedException {
       String line = val.toString();
       String year = line.substring( 15 , 19 );
                 
       int Temperature;
       if (!hasPlus(line)){
         Temperature = Integer.parseInt(line.substring( 87 , 92 ));
       } else {
         Temperature = Integer.parseInt(line.substring( 88 , 92 ));
       }  
       String qual = line.substring( 92 , 93 );
       if (!matched(qual)) {
         context.write( new Text(year), new IntWritable(Temperature));
    }
                 
     }
                 
     private boolean hasPlus(String line) {
       return line.charAt(87) == '+';
     }

     private boolean matched(String line) {
         return line.matches("[01459]");
     }
                 
   }
                 
   static class MaxTemperatureReducer
     extends Reducer<Text, IntWritable, Text, IntWritable> {
     @Override
     public void reduce(Text key, Iterable<IntWritable> vals, Context context)
       throws IOException, InterruptedException {
       int maxValue = Integer.MIN_VALUE;
       for ( IntWritable value : vals ) {
         maxValue = Math.max(maxValue, value.get());
       }
       context.write(key, new IntWritable(maxValue));
     }
   }
                 
     @Override
     public int run(String[] args)
       throws Exception {
       Configuration conf = getConf();
       Job job = Job.getInstance(conf);
       job.setJobName( "helloRuby" );
       job.setJarByClass(getClass());
       FileInputFormat.addInputPath(job, new Path(args[ 0 ]));
       FileOutputFormat.setOutputPath(job, new Path(args[ 1 ]));
                 
       job.setMapperClass(MaxTemperatureMapper. class );
       job.setCombinerClass(MaxTemperatureReducer. class );
       job.setReducerClass(MaxTemperatureReducer. class );
                 
       job.setOutputKeyClass(Text. class );
       job.setOutputValueClass(IntWritable. class );
                 
       return job.waitForCompletion( true ) ? 0 : 1 ;
     }
                 
                 
   public static void main(String[] args)
     throws Exception {
     ToolRunner.run( new getMaxTemperature() , args);
   }
}

 

6. File Read

import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class FileRead {
     public static void main(String[] args)
       throws Exception {
         Configuration conf = new Configuration();
         FileSystem fs = FileSystem.get(conf);
         InputStream in = fs.open(new Path(args[0]));
         IOUtils.copyBytes(in, System.out, 4096, false);
         IOUtils.closeStream(in);
     }
}

Process analysis:

(1) open

When the client reads a file, it calls the open() method on a FileSystem object, the FileSystem here being an HDFS instance.

In the program above:

the call FileSystem.get(conf) obtains the FileSystem instance;

the call fs.open(new Path(args[0])) invokes open() on that FileSystem.

(2) get block locations

HDFS then calls the namenode via RPC (Remote Procedure Call) to obtain the locations of the first few blocks of the file. For each block the namenode returns the addresses of the datanodes holding a replica, and those datanodes are sorted by network topology.

Once the locations are known, the DFS returns an FSDataInputStream to the client for reading data. The FSDataInputStream in turn wraps a DFSInputStream, which manages the datanode and namenode I/O and also stores the addresses of the first few datanodes.

(3) read

The client calls read() on the FSDataInputStream; the DFSInputStream then connects to the closest datanode holding the first block, and read() is called repeatedly to stream data from that datanode to the client. When the end of the block is reached, the DFSInputStream closes the connection to that datanode, finds the datanode holding the next block, and so on.

(4) DFSInputStream

The DFSInputStream reads the blocks in order; each block requires establishing a new connection to a datanode.

The DFSInputStream also keeps talking to the namenode to retrieve the datanode locations of the next batch of blocks as needed.

(5) FSDataInputStream

The FSDataInputStream is the intermediary between the client and the datanodes: every read() the client makes goes through the FSDataInputStream, which delegates to the DFSInputStream (a small usage sketch follows below).
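As a small illustration, a sketch assuming fs and uri as in the FileRead example above: FSDataInputStream also implements Seekable, so the client can jump around in the file rather than only streaming it once.

// fs.open() actually returns an FSDataInputStream, not a plain InputStream.
FSDataInputStream in = fs.open(new Path(uri));
IOUtils.copyBytes(in, System.out, 4096, false);
in.seek(0);   // jump back to the beginning of the file
IOUtils.copyBytes(in, System.out, 4096, false);   // print the content a second time
IOUtils.closeStream(in);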

(6) Fault tolerance

Two main kinds of errors can occur while reading data:

1. If the DFSInputStream's communication with a datanode fails, it tries the next-closest datanode holding that block, and it remembers the failed datanode so that later blocks are not read from it again.

2. If the DFSInputStream detects a corrupted block while checksumming data from a datanode, it reports this to the namenode before attempting to read a replica of that block from another datanode.

7. File Write

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
               
public class CreateDir {
   public static void main(String[] args)
    throws Exception {
     Configuration conf = new Configuration();
     String dst = args[ 0 ];
     FileSystem fs = FileSystem.get(conf);
     fs.create( new Path(dst));
   }
}

 

Process analysis:

(1) create

When the client writes a file, it calls the create() method on the FileSystem object, the FileSystem again being an HDFS instance.

(2) create new file in filesystem's namespace

The DFS calls the namenode via a Remote Procedure Call, and the namenode creates a new file in the filesystem's namespace. Before creating the file, the namenode runs a series of checks, including whether the client has permission to create the file and whether the file already exists; if a check fails, an IOException is thrown.

After the file is created, the DFS returns an FSDataOutputStream to the client for writing. The FSDataOutputStream wraps a DFSOutputStream, which communicates with the namenode and the datanodes.

(3) client writes data

When the client starts writing data to the file, the DFSOutputStream splits the data into packets, which are written to an internal queue, the data queue, maintained by the DFSOutputStream.

The data queue is consumed by the DataStreamer, whose job is to ask the namenode to allocate suitable datanodes for the replicas of each new block; the namenode picks the datanodes that will store those blocks.

The datanodes storing the block replicas form a pipeline; with a replication level of 3, the pipeline consists of 3 datanodes.

The DataStreamer streams each packet to the first datanode in the pipeline, which stores the packet and forwards it to the second datanode; the second datanode likewise stores it and forwards it to the third.

(4) Fault tolerance

From (3) we know the DFSOutputStream maintains a data queue; in addition,

the DFSOutputStream also maintains an ack queue (of acknowledgements). A packet is removed from the ack queue only after it has been acknowledged by every datanode in the pipeline.

If a datanode fails while data is being written, the following happens:

First, the pipeline is closed and all packets in the ack queue are put back at the front of the data queue, so that datanodes downstream of the failed node do not miss any packets. The block currently being written on the good datanodes is given a new identity, and the partial block written to the failed datanode is deleted once that datanode recovers.

The failed datanode is removed from the pipeline; the namenode notices this and allocates another datanode to form a new pipeline. Blocks of subsequent packets are unaffected.

If multiple datanodes fail while a block is being written, the write still succeeds as long as dfs.replication.min replicas (default 1) are written; the block is then replicated asynchronously across the cluster until it reaches the dfs.replication target (default 3).

(5) Finishing the write

When the client has finished writing data, it calls close() on the FSDataOutputStream. This flushes all remaining packets into the datanode pipeline and waits for their acknowledgements before signalling to the namenode that the file is complete.

Because the DataStreamer has already asked the namenode for block locations for all the packets, the namenode already knows which blocks make up the file.
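To make the write path concrete, here is a minimal sketch (hypothetical class name WriteFileDemo, destination path taken from args[0]) that creates a file, writes a few bytes, and closes the stream, which triggers the final flush-and-acknowledge step described above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WriteFileDemo {
   public static void main(String[] args) throws Exception {
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
     FSDataOutputStream out = fs.create(new Path(args[0]));
     out.write("hello hdfs\n".getBytes("UTF-8"));
     out.hflush();   // optionally push buffered packets into the datanode pipeline now
     out.close();    // flushes remaining packets and waits for acknowledgements
   }
}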

8. Using MRUnit

Commonly used classes

implements java.lang.annotation.Annotation:
org.junit.Test
org.junit.Rule
org.junit.Ignore
org.junit.ClassRule
org.junit.BeforeClass
org.junit.Before
org.junit.AfterClass
org.junit.After

Code

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.junit.Test;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
             
public class MaxTemperatureMapperTest {
   @Test
   public void processesValidRecord() throws IOException {
     Text value = new Text( "0043011990999991950051518004+68750+023550FM-12+0382" +
                                   // Year ^^^^
         "99999V0203201N00261220001CN9999999N9-00111+99999999999" );
                               // Temperature ^^^^^
     new MapDriver<LongWritable, Text, Text, IntWritable>()
     .withMapper( new MaxTemperatureMapper())
     .withInput( new LongWritable( 1 ), value)
     .withOutput( new Text( "1950" ), new IntWritable(- 11 ))
     .runTest();
   }
}

 

Note some deprecated classes and methods:

It makes sense that org.apache.hadoop.mrunit.MapDriver<K1,V1,K2,V2> is deprecated: that class was written for the old MapReduce API (org.apache.hadoop.mapred). For example, one of its methods is

MapDriver<K1,V1,K2,V2> withMapper(org.apache.hadoop.mapred.Mapper<K1,V1,K2,V2> m)

The new MapReduce API lives in org.apache.hadoop.mapreduce.*; the corresponding MRUnit MapDriver (and ReduceDriver) is

org.apache.hadoop.mrunit.mapreduce.MapDriver<K1,V1,K2,V2>, and likewise the method above becomes:

MapDriver<K1,V1,K2,V2> withMapper(org.apache.hadoop.mapreduce.Mapper<K1,V1,K2,V2> m)

T withInputValue(V1 val) in the MapDriverBase class is deprecated in favor of T withInput(K1 key, V1 val); there are many more, not listed here.
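For the reduce side, the new-API ReduceDriver (org.apache.hadoop.mrunit.mapreduce.ReduceDriver) works the same way. A sketch, assuming a standalone MaxTemperatureReducer class with the Text/IntWritable signature used in example 5 (not shown in the original post):

import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

public class MaxTemperatureReducerTest {
   @Test
   public void returnsMaximumIntegerInValues() throws IOException {
     new ReduceDriver<Text, IntWritable, Text, IntWritable>()
       .withReducer(new MaxTemperatureReducer())
       // One key with two values; the reducer should emit the maximum.
       .withInput(new Text("1950"),
                  Arrays.asList(new IntWritable(10), new IntWritable(5)))
       .withOutput(new Text("1950"), new IntWritable(10))
       .runTest();
   }
}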

Build and run steps:

Note: you need to download and build MRUnit, set an MRUnit_HOME variable in /home/user/.bashrc, then edit $HADOOP_HOME/libexec/hadoop-config.sh to add $MRUnit_HOME/lib/*.jar to the classpath, source $HADOOP_HOME/libexec/hadoop-config.sh, and then run the following:

javac -d class/ MaxTemperatureMapper.java MaxTemperatureMapperTest.java
jar -cvf test.jar -C class ./
java -cp test.jar:$CLASSPATH org.junit.runner.JUnitCore MaxTemperatureMapperTest

9. Tool, ToolRunner, GenericOptionsParser, Configuration

Notes

   1. The run() method of Tool can be viewed as the driver of a MapReduce program. We usually implement the Tool interface to set the job's startup properties, then in main() call ToolRunner.run(new MainClass(), args) statically, which indirectly invokes run().

   2. It wraps the GenericOptionsParser class to parse the generic hadoop command line arguments (see the other article: "hadoop FS-shell commands").

Code 1 (the resource added to the Configuration is a String):

import java.util.Map.Entry;
            
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.fs.Path;
            
public class ConfigurationPrinter extends Configured implements Tool {
   static {
     Configuration.addDefaultResource( "config.xml" );
   }
            
   @Override
     public int run(String[] args) throws Exception {
       Configuration conf = getConf();
       for (Entry<String, String> hash: conf) {
         System.out.printf( "%s=%s\n" , hash.getKey(), hash.getValue());
       }  
       return 0 ;
     }
            
   public static void main(String[] args) throws Exception {
     int exitCode = ToolRunner.run( new ConfigurationPrinter(), args);
     System.exit(exitCode);
   }
}

Note: the Configuration class provides the static method addDefaultResource(String name). As in the code above, when the resource "config.xml" is added as a String, Hadoop looks the file up on the classpath; when the resource is a Path, Hadoop looks it up on the local filesystem: Configuration conf = new Configuration(); conf.addResource(new Path("config.xml"));

Run steps:

# put the custom config file config.xml into Hadoop's $HADOOP_CONF_DIR
% mv config.xml $HADOOP_HOME/etc/hadoop/

Suppose the resource we add looks like this:

<!-- cat $HADOOP_HOME/etc/hadoop/config.xml -->
<configuration>
  <property>
    <name>color</name>
    <value>yellow</value>
  </property>

  <property>
    <name>size</name>
    <value>10</value>
  </property>

  <property>
    <name>weight</name>
    <value>heavy</value>
    <final>true</final>
  </property>
</configuration>

Build and run:

mkdir class
source $HADOOP_HOME/libexec/hadoop-config.sh
javac -d class ConfigurationPrinter.java
jar -cvf ConfigurationPrinter.jar -C class ./
export HADOOP_CLASSPATH=ConfigurationPrinter.jar:$CLASSPATH
# check whether the resource we just added is picked up
# we added <name>color</name> in config.xml, so run
yarn ConfigurationPrinter | grep "color"
color=yellow
# so the code works as expected

Alternatively, specify the configuration file on the command line, for example:

% $YARN_HOME/bin/yarn ConfigurationPrinter --conf config.xml | grep color
color=yellow

This works as well.

Code 2 (the resource added to the Configuration is a Path):

import java.util.Map.Entry;
                                                                                                     
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.fs.Path;
                                                                                                     
public class ConfigurationPrinter extends Configured implements Tool {
   @Override
   public int run(String[] args) throws Exception {
     Configuration conf = new Configuration();
     conf.addResource( new Path( "config.xml" ));
     for (Entry<String, String> hash: conf) {
       System.out.printf( "%s=%s\n" , hash.getKey(), hash.getValue());
     }  
     return 0 ;
   }
                                                                                                     
   public static void main(String[] args) throws Exception {
     int exitCode = ToolRunner.run( new ConfigurationPrinter(), args);
     System.exit(exitCode);
   }
}

This time the resource is added as a Path, so Hadoop looks for config.xml on the local filesystem. There is no need to put config.xml under conf/; just point the code at its path on the local filesystem, e.g. new Path("../others/config.xml").

Run steps:

% mkdir class
% source $HADOOP_HOME/libexec/hadoop-config.sh
% javac -d class ConfigurationPrinter.java
% jar -cvf ConfigurationPrinter.jar -C class ./
% export HADOOP_CLASSPATH=ConfigurationPrinter.jar:$CLASSPATH
# check whether the resource we just added is picked up
# we added <name>color</name> in config.xml, so run
% yarn ConfigurationPrinter | grep "color"
color=yellow
# so the code works as expected

Note: GenericOptionsParser also supports setting individual properties:

Generic Options
The supported generic options are:

-conf <configuration file>     specify a configuration file
-D <property=value>            use value for given property
-fs <local|namenode:port>      specify a namenode
-jt <local|jobtracker:port>    specify a job tracker
-files <comma separated list of files>    specify comma separated
                       files to be copied to the map reduce cluster
-libjars <comma separated list of jars>   specify comma separated
                       jar files to include in the classpath.
-archives <comma separated list of archives>    specify comma
        separated archives to be unarchived on the compute machines.

You can try:

% $YARN_HOME/bin/yarn ConfigurationPrinter -D myproperty=myvalue | grep myproperty
# output:
myproperty=myvalue

One more reminder:

ToolRunner can be used to run classes implementing the Tool interface. It works in conjunction with GenericOptionsParser to parse the generic hadoop command line arguments and modifies the Configuration of the Tool. The application-specific options are passed along without being modified.

In other words, ToolRunner and GenericOptionsParser together parse the generic hadoop command line arguments and apply them to the Configuration. (What are generic hadoop command line arguments? For example: yarn command [genericOptions] [commandOptions].)

10. HDFS API operations

Code 1. Reading data from a hadoop URL

Notes: for Java to read data from Hadoop's DFS via a URL, Java must be able to recognize the hadoop hdfs URL scheme, so we register HDFS's FsUrlStreamHandlerFactory with Java; Java's URL.setURLStreamHandlerFactory method does exactly this.

Caveat: this approach has a drawback. In Java, setURLStreamHandlerFactory can be called only once per JVM, so if a third-party component has already set a URLStreamHandlerFactory, the Hadoop user can no longer use setURLStreamHandlerFactory to read data from Hadoop.

import java.io.InputStream;
import java.net.URL;
import org.apache.hadoop.fs.FsUrlStreamHandlerFactory;
import org.apache.hadoop.io.IOUtils;

public class URLCat {
   static {
     URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());
   }
   public static void main(String[] args) throws Exception {
     InputStream in = null;
     try {
       in = new URL(args[0]).openStream();
       IOUtils.copyBytes(in, System.out, 4096, false);
     } finally {
       IOUtils.closeStream(in);
     }
   }
}

 

Run steps:

$ source $YARN_HOME/libexec/hadoop-config.sh
$ mkdir myclass
$ javac -cp $CLASSPATH URLCat.java -d myclass
$ jar -cvf urlcat.jar -C myclass ./
# assume we have a file bar.txt in hdfs: /user/grid/bar.txt
# then we need to run yarn with this command
$ yarn jar urlcat.jar URLCat hdfs:///user/grid/bar.txt

Code 2. Reading data using the HDFS FileSystem API

Notes: using Hadoop's FileSystem API avoids the once-per-JVM setURLStreamHandlerFactory limitation described above.

import java.net.URI;
import java.io.InputStream;

import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;

public class URICat {
   public static void main(String[] args) throws Exception {
     String uri = args[0];
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(URI.create(uri), conf);
     InputStream in = null;
     try {
       in = fs.open(new Path(uri));
       IOUtils.copyBytes(in, System.out, 4096, false);
     } finally {
       IOUtils.closeStream(in);
     }
   }
}

Run steps:

$ source $YARN_HOME/libexec/hadoop-config.sh
$ mkdir myclass
$ javac -cp $CLASSPATH URICat.java -d myclass
$ jar -cvf uricat.jar -C myclass ./
$ yarn jar uricat.jar URICat /user/grid/bar.txt

Note 1: because we go through the FileSystem API, the input file path may omit the full hdfs:// URI prefix, as in the run steps above.

Note 2: FileSystem is an abstract class, so you cannot obtain an instance with new FileSystem(); you must call its static get() method.

Note 3: note the upcasting in Java, reflected in the inheritance relationships of the various Stream classes in the hints below.

Note 4: Configuration conf = new Configuration();

  • A Configuration is populated from <name>x</name> key/value pairs in XML files; the lookup rules are:

    if the resource is named by a String, the file is looked up on the classpath;

    if the resource is named by a Path, it is looked up directly on the local filesystem, not the classpath;

  • if the user specifies nothing, two resources are loaded by default: core-default.xml and core-site.xml;

  • users can specify their own XML files to add custom configuration:

    conf.addResource("my_configuration.xml");

Code 3. Writing data

Copying a file from the local filesystem to HDFS

Version 1: FileCopy with the copyBytes() method

Quick hints:

1. The core is a single call that copies an InputStream to an OutputStream as raw bytes:

static void copyBytes(InputStream in , OutputStream out, int buffSize, boolean close)

2. We create a FileInputStream(localsrc) instance, wrap it in a BufferedInputStream, and upcast it to an InputStream:

FileInputStream(String name )

3. We call the FileSystem to produce the OutputStream:

FSDataOutputStream create(Path f, Progressable progress)

Code:

import java.net.URI;
import java.io.InputStream;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.OutputStream;
                                                                                 
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
                                                                                 
public class FileCopyWithProgress {
   public static void main(String[] args) throws Exception {
     String localsrc = args[ 0 ];
     String dst = args[ 1 ];
     InputStream in = new BufferedInputStream( new FileInputStream(localsrc));
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(URI.create(dst), conf);
     OutputStream out = fs.create( new Path(dst), new Progressable() {
         public void progress() { System.out.print( "." );} }
                                 );
     IOUtils.copyBytes(in, out, 4096 , true );
   }
}

Run steps:

% source $YARN_HOME/libexec/hadoop-config.sh
% javac -cp $CLASSPATH -d my_class FileCopyWithProgress.java
% jar -cvf filecopywithprogress.jar -C my_class/ .
# assume we have a local file foo.out in directory /home/grid/foo.out,
# then we should run yarn like below
% yarn jar filecopywithprogress.jar FileCopyWithProgress \
/home/grid/foo.out hdfs:///user/grid/copied_foo.out
# we can do a check for the copied file
% hadoop fs -ls -R /user/grid/

Note: from here on a different way of compiling and running the code is used.

Version 2: using FileSystem's copyFromLocalFile() method

The code is as follows:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
                                                                               
public class FileCopyFromLocal {
   public static void main(String[] args) throws Exception {
     String localSrc = args[ 0 ];
     String dst = args[ 1 ];
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
     fs.copyFromLocalFile( new Path(localSrc), new Path(dst));
   }
}

Run steps:

$ source $YARN_HOME/libexec/hadoop-config.sh
$ javac FileCopyFromLocal.java -d class/
$ jar -cvf filecopyfromlocal.jar -C class ./
$ export HADOOP_CLASSPATH=$CLASSPATH:filecopyfromlocal.jar
# suppose we have a file bar.txt on the local disk,
# then we use the following command line to copy it to hdfs
$ yarn FileCopyFromLocal bar.txt hdfs:///user/grid/kissyou
# we can check the copied file on hdfs
$ hadoop fs -ls /user/grid/
-rw-r--r--   3 grid supergroup   899 2013-11-17 01:33 /user/grid/kissyou

Code 4. Creating a directory / a file

Creating a directory: FileSystem.mkdirs()

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
                                                                        
public class CreateDir {
   public static void main(String[] args) throws Exception {
     Configuration conf = new Configuration();
     String dst = args[ 0 ];
     FileSystem fs = FileSystem.get(conf);
     fs.mkdirs( new Path(dst));
   }
}

Run steps:

$ source $YARN_HOME/libexec/hadoop-config.sh
$ javac CreateDir.java -d class/
$ jar -cvf createdir.jar -C class ./
$ export HADOOP_CLASSPATH=$CLASSPATH:createdir.jar
$ yarn CreateDir hdfs:///user/grid/kissyou
# we can check the created directory on hdfs
$ hadoop fs -ls /user/grid/
-rw-r--r--   3 grid supergroup        899 2013-11-17 01:33 /user/grid/kissyou

 

Creating a file: FileSystem.create()

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
                                                                  
public class CreateFile {
   public static void main(String[] args)
    throws Exception {
     Configuration conf = new Configuration();
     String dst = args[ 0 ];
     FileSystem fs = FileSystem.get(conf);
     fs.create( new Path(dst));
   }
}

Run steps:

$ source $YARN_HOME/libexec/hadoop-config.sh
$ javac CreateFile.java -d class/
$ jar -cvf createfile.jar -C class ./
$ export HADOOP_CLASSPATH=$CLASSPATH:createfile.jar
$ yarn CreateFile hdfs:///user/grid/kissyou.txt
# we can check the created file on hdfs
$ hadoop fs -ls /user/grid/
-rw-r--r--   3 grid supergroup        899 2013-11-17 01:33 /user/grid/kissyou.txt

Three things to note

1. You cannot create a file foo and a directory foo/ with the same name under the same path; otherwise fs.FileAlreadyExistsException is thrown at runtime (see the Java sketch after the shell examples below).

2. mkdirs() is invoked automatically when we copy or write files, so we rarely need to call it by hand to create directories.

3. The official API documentation describes mkdirs() as "Make the given file and all non-existent parents into directories", so directory creation in Hadoop is recursive, equivalent to the Linux command:

% mkdir -p foo/bar/qzx

and likewise equivalent to the hdfs-shell command:

% $YARN_HOME/bin/hadoop fs -mkdir -p hdfs:///foo/bar/qzx
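As a small illustration of points 1 and 3 above, here is a sketch (hypothetical class name SafeMkdir, not part of the original post) that checks for an existing path before calling mkdirs(), which itself creates any missing parent directories:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SafeMkdir {
   public static void main(String[] args) throws Exception {
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
     Path p = new Path(args[0]);
     if (fs.exists(p)) {
       // A file or directory with this name already exists; creating the other
       // kind at the same path would fail with FileAlreadyExistsException.
       System.out.println(p + " already exists");
     } else if (fs.mkdirs(p)) {   // behaves like mkdir -p
       System.out.println("created " + p);
     }
   }
}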

Code 5. Testing files and getting FileStatus

Hint: some APIs are deprecated in Hadoop 2.2; the code below uses only the new constructors and methods.

import java.net.URI;
                                                            
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
                                                            
public class TestFileStatus {
   public static void main(String[] args) throws Exception {
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
     FileStatus stat = fs.getFileStatus( new Path(args[ 0 ]));
     if (stat.isDirectory()) {
       System.out.println(stat.getPath().toUri().getPath() +
       " is a directory." );
     } else if (stat.isFile()) {
       System.out.println(stat.getPath().toUri().getPath() +
       " is a file." );
       System.out.println(stat.getPath().toUri().getPath() +
       " getBlockSize: " + stat.getBlockSize());
       System.out.println(stat.getPath().toUri().getPath() +
       " getLen(): " + stat.getLen());
       System.out.println(stat.getPath().toUri().getPath() +
       " getOwner(): " + stat.getOwner());
       System.out.println(stat.getPath().toUri().getPath() +
       " getGroup(): " + stat.getGroup());
       System.out.println(stat.getPath().toUri().getPath() +
       " getAccessTime(): " + stat.getAccessTime());
       System.out.println(stat.getPath().toUri().getPath() +
       " getModificationTime(): " + stat.getModificationTime());
       System.out.println(stat.getPath().toUri().getPath() +
       " getPermission(): " + stat.getPermission());
       System.out.println(stat.getPath().toUri().getPath() +
       " hashcode(): " + stat.hashCode());
       System.out.println(stat.getPath().toUri().getPath() +
       " getPath(): " + stat.getPath());
     }
   }
}

The small script below (which I wrote) makes it easy to compile the code and build the jar:

#!/usr/bin/env sh
CWD=$(pwd)
export CLASSPATH=''
. $YARN_HOME/libexec/hadoop-config.sh

if [ -d class ]; then
   rm -rf class/*
else
   mkdir $CWD/class
fi

for f in $@
   do
     srcs="$srcs $CWD/$f"
   done

javac $srcs -d class

if [ $? -ne 0 ] ; then
   echo Error found when compiling the code!
   exit 1
fi

class=$(cat $1 | grep 'package' | sed -e "s/\(package\s\)\|\(;\)//g" \
        ).$(echo $1 | sed -r 's/(.*).java/echo \1/ge')
jarfile=$(echo $1 | sed -r 's/(.*)\.java/echo \L\1\.jar/ge')

jar -cvf $CWD/$jarfile -C $CWD/class . > /dev/null 2>&1
#echo jar -cvf $jarfile -C class .
echo -----------------CMD Lines-----------------------
echo source $YARN_HOME/libexec/hadoop-config.sh >sourceIt.sh
echo export HADOOP_CLASSPATH=$jarfile:'$CLASSPATH' >>sourceIt.sh
echo source $CWD/sourceIt.sh
echo yarn $class [command args]...

Run steps:

Note: for simplicity, the script assumes:

$ ./compack.sh args1 args2 args3...   # args1 must be the source file containing the main class
$ chmod 500 compack.sh
$ ./compack.sh TestFileStatus.java
# then the script will remind you with the following message:
-----------------CMD Lines------------------
source sourceIt.sh
yarn TestFileStatus [command args]...
$ source sourceIt.sh
# suppose we have a file "part-m-00000" in hdfs, run yarn like below
$ yarn TestFileStatus /user/hive/warehouse/footbl/part-m-00000

Output:

#output
/user/hive/warehouse/footbl/part-m-00000 is a file.
/user/hive/warehouse/footbl/part-m-00000 getBlockSize: 134217728
/user/hive/warehouse/footbl/part-m-00000 getLen(): 1275
/user/hive/warehouse/footbl/part-m-00000 getOwner(): grid
/user/hive/warehouse/footbl/part-m-00000 getGroup(): supergroup
/user/hive/warehouse/footbl/part-m-00000 getAccessTime(): 1384675957784
/user/hive/warehouse/footbl/part-m-00000 getModificationTime(): 1384675958368
/user/hive/warehouse/footbl/part-m-00000 getPermission(): rw-r--r--
/user/hive/warehouse/footbl/part-m-00000 hashcode(): 1096001837
/user/hive/warehouse/footbl/part-m-00000 getPath(): \
hdfs://cluster1:9000/user/hive/warehouse/footbl/part-m-00000

Code 6. Listing and globbing files

Listing files

import java.net.URI;
                                        
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
                                        
public class ListFiles {
   public static void main(String[] args) throws Exception {
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
                                        
     Path[] paths = new Path[args.length];
     for ( int i = 0 ; i < args.length; i++) {
       paths[i] = new Path(args[i]);
     }
                                        
     FileStatus[] status = fs.listStatus(paths);
     Path[] pathList = FileUtil.stat2Paths(status);
     for (Path p : pathList) {
       System.out.println(p);
     }
   }
}

Run steps:

% ./compack.sh ListFiles.java
% source sourceIt.sh
% yarn ListFiles /user/hive/warehouse/footbl /user/grid/

output:

hdfs://cluster1:9000/user/hive/warehouse/footbl/_SUCCESS
hdfs://cluster1:9000/user/hive/warehouse/footbl/part-m-00000
hdfs://cluster1:9000/user/grid/kiss
hdfs://cluster1:9000/user/grid/kissyou
hdfs://cluster1:9000/user/grid/missyou

Filter files

package org.apache.hadoop.MyCode;
           
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.Path;
           
public class MyFilter implements PathFilter {
   private final String regex;
   public MyFilter(String regex) {
     this .regex = regex;
   }
   public boolean accept(Path path) {
     return path.toString().matches(regex);
   }
}

 

package org.apache.hadoop.MyCode;
          
import org.apache.hadoop.MyCode.MyFilter;
          
import java.net.URI;
          
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.conf.Configuration;
          
public class ListStatusWithPattern {
   public static void main(String[] args) throws Exception {
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
          
     FileStatus[] status = fs.globStatus( new Path(args[ 0 ]), new MyFilter(args[ 1 ]));
     Path[] pathList = FileUtil.stat2Paths(status);
          
     for ( Path p : pathList ) {
       System.out.println(p);
     }
   }
}

Run steps:

% source $YARN_HOME/libexec/hadoop-config.sh
% mkdir class
% javac ListStatusWithPattern.java MyFilter.java -d class
% jar -cvf liststatuswithpattern.jar -C class ./
% export HADOOP_CLASSPATH=liststatuswithpattern.jar:$CLASSPATH
# suppose we have four files in hdfs like below
% hadoop fs -ls /user/grid/
Found 4 items
drwxr-xr-x   - grid supergroup          0 2013-11-17 01:06 /user/grid/kiss
-rw-r--r--   3 grid supergroup          0 2013-11-17 06:05 /user/grid/kissyou
drwxr-xr-x   - grid supergroup          0 2013-11-17 19:33 /user/grid/miss
-rw-r--r--   3 grid supergroup        899 2013-11-17 01:33 /user/grid/missyou
# then we can run the command to filter the matched files
% yarn jar liststatuswithpattern.jar org.apache.hadoop.MyCode.ListStatusWithPattern "hdfs:///user/grid/*ss*" "^.*grid/[k].*$"

Alternatively, use the script given earlier to compile, package, and print the yarn command to run:

$ ./compack.sh ListStatusWithPattern.java MyFilter.java   # note: the script assumes the first source file contains the main class
$ source /home/grid/hadoop-2.2.0-src/hadoop-dist/target/hadoop-2.2.0/task/DFSAPIProgramming/sourceIt.sh
-----------------CMD Lines-----------------------
source /home/grid/hadoop-2.2.0-src/hadoop-dist/target/hadoop-2.2.0/task/DFSAPIProgramming/sourceIt.sh
yarn org.apache.hadoop.MyCode.MyFilter [command args]...
$ yarn org.apache.hadoop.MyCode.ListStatusWithPattern "hdfs:///user/grid/*ss*" "^.*grid/[k].*$"
output:

hdfs://cluster1:9000/user/grid/kiss
hdfs://cluster1:9000/user/grid/kissyou

 

11. Overriding comparators

Key points

  1. Type comparison is very important in Hadoop MapReduce; it is mainly used to compare keys.

  2. Hadoop's RawComparator<T> interface extends Java's Comparator and is mainly used to compare serialized objects.

  3. Hadoop's WritableComparator class is more complete: it provides two main ways of comparing, one comparing objects directly, the other comparing serialized representations.

    For example, comparing objects:

compare(new IntWritable(21), new IntWritable(998));

    Comparing serialized representations:

compare(serialize(new IntWritable(21)), serialize(new IntWritable(998)))

Hint: inheritance relationships

//1. org.apache.hadoop.io
Interface RawComparator<T>
  //description
  public interface RawComparator<T>
  extends Comparator<T>
  //method
  int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2)
//2. org.apache.hadoop.io
Interface WritableComparable<T>
  //description
  public interface WritableComparable<T>
  extends Writable, Comparable<T>
  //methods inherited from interface org.apache.hadoop.io.Writable
  readFields, write
//3. java.lang.Object
//      |__ org.apache.hadoop.io.WritableComparator
  //description
  public class WritableComparator
  extends Object
  implements RawComparator
  //methods
  int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2)
  int compare(Object a, Object b)
  int compare(WritableComparable a, WritableComparable b)
  static int compareBytes(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2)
//4. java.util
Interface Comparator<T>
  //description
  public interface Comparator<T>
  //methods
  int compare(T o1, T o2)
  boolean equals(Object obj)


Code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import java.lang.Byte;
import java.io.DataOutputStream;
import java.io.ByteArrayOutputStream;
                                                                            
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.RawComparator;
                                                                            
public class MyIntWritableComparactor {
                                                                            
   public static byte [] serialize(IntWritable writable) throws Exception {
     ByteArrayOutputStream out = new ByteArrayOutputStream();
     DataOutputStream dataOut = new DataOutputStream(out);
     writable.write(dataOut);
     dataOut.close();
     return out.toByteArray();
   }
                                                                            
   @SuppressWarnings ( "unchecked" )
   public static void main(String[] args) throws Exception {
     RawComparator<IntWritable> comparator = WritableComparator.get(IntWritable. class );
     IntWritable w1 = new IntWritable( 13 );
     IntWritable w2 = new IntWritable( 12 );
     System.out.println( "w1: " + w1 + " w2: " + w2);
     System.out.println( "w1 compare w2 : " + comparator.compare(w1,w2));
                                                                            
     byte [] b1 = serialize(w1);
     byte [] b2 = serialize(w2);
     System.out.println( "b1.length: " + b1.length);
     System.out.println( "b2.length: " + b2.length);
     System.out.println( "b1.length compare b2.length: " +
  comparator.compare(b1, 0 , b1.length, b2, 0 , b2.length));
                                                                            
   }
}

Compile and run:

// note: I am using hadoop 2.2
% source $YARN_HOME/libexec/hadoop-config.sh
% mkdir myclass
% javac -d myclass MyIntWritableComparactor.java
% jar -cvf mycompare.jar -C myclass ./
% export HADOOP_CLASSPATH=$CLASSPATH:mycompare.jar
% yarn MyIntWritableComparactor

Output:

% yarn MyIntWritableComparactor
w1: 13 w2: 12
w1 compare w2 : 1
b1.length: 4
b2.length: 4
b1.length compare b2.length: 1

12. Data compression

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;
                                                                    
public class StreamCompressor {
   public static void main(String[] args) throws Exception {
     String codeClassname = args[ 0 ];
     Class<?> codeClass = Class.forName(codeClassname);
     Configuration conf = new Configuration();
     CompressionCodec codec =
(CompressionCodec)ReflectionUtils.newInstance(codeClass,conf);
                                                                    
     CompressionOutputStream out = codec.createOutputStream(System.out);
     IOUtils.copyBytes(System.in,out, 4096 , false );
     out.finish();
   }
}
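For the reverse direction, here is a decompression sketch (hypothetical class name FileDecompressor, not part of the original post) that uses CompressionCodecFactory to infer the codec from the file extension; it assumes the compressed file path is passed as args[0]:

import java.io.InputStream;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class FileDecompressor {
   public static void main(String[] args) throws Exception {
     String uri = args[0];
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
     Path inputPath = new Path(uri);

     // Pick the codec from the file name suffix, e.g. ".gz" -> GzipCodec.
     CompressionCodecFactory factory = new CompressionCodecFactory(conf);
     CompressionCodec codec = factory.getCodec(inputPath);
     if (codec == null) {
       System.err.println("No codec found for " + uri);
       System.exit(1);
     }

     // Strip the codec suffix to build the output file name.
     String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
     InputStream in = null;
     OutputStream out = null;
     try {
       in = codec.createInputStream(fs.open(inputPath));
       out = fs.create(new Path(outputUri));
       IOUtils.copyBytes(in, out, conf);
     } finally {
       IOUtils.closeStream(in);
       IOUtils.closeStream(out);
     }
   }
}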
