1. Default MapReduce
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DefaultMapReduce extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf);
        job.setJarByClass(getClass());
        job.setNumReduceTasks(0);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new DefaultMapReduce(), args);
    }
}
Note:
No map method is defined here, yet the program still runs. This is because Hadoop's new API replaced the mapred classes (interface-based) with the mapreduce classes (abstract-class-based), and the new Mapper abstract class itself already provides a map method that can run as-is. Part of its source is shown below:
public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
        while (context.nextKeyValue()) {
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
    } finally {
        cleanup(context);
    }
}
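For reference, the inherited map() called by run() is a simple identity function, which is why the pass-through job above works without defining one. A paraphrased sketch of that inherited method (the actual Hadoop source may differ slightly in formatting):

// Identity map(): write each input record to the output unchanged.
protected void map(KEYIN key, VALUEIN value, Context context)
        throws IOException, InterruptedException {
    context.write((KEYOUT) key, (VALUEOUT) value);
}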
2. SequenceFileWritDemo
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IOUtils;

public class SequenceFileWritDemo {
    private static final String[] DATA = {
        "one, two, buckle my shoe",
        "Three, four, shut the door"
    };

    public static void main(String[] args) throws IOException {
        String uri = args[0];
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path path = new Path(uri);
        IntWritable key = new IntWritable();
        Text value = new Text();
        SequenceFile.Writer writer = null;
        try {
            // Use the static Writer.Option factory methods (new API);
            // they must not be called on the still-null writer reference.
            writer = SequenceFile.createWriter(conf,
                    SequenceFile.Writer.file(path),
                    SequenceFile.Writer.keyClass(key.getClass()),
                    SequenceFile.Writer.valueClass(value.getClass()));
            for (int i = 0; i < 100; i++) {
                key.set(100 - i);
                value.set(DATA[i % DATA.length]);
                System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value);
                writer.append(key, value);
            }
        } finally {
            IOUtils.closeStream(writer);
        }
    }
}
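For completeness, here is a minimal read-side sketch (not part of the original post) that mirrors the writer above using the Hadoop 2.x SequenceFile.Reader.Option API; the class name SequenceFileReadDemo is an illustrative choice:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class SequenceFileReadDemo {
    public static void main(String[] args) throws Exception {
        String uri = args[0];
        Configuration conf = new Configuration();
        Path path = new Path(uri);
        SequenceFile.Reader reader = null;
        try {
            // Reader.Option-based constructor, the read-side counterpart of Writer.file(path)
            reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
            Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
            long position = reader.getPosition();
            while (reader.next(key, value)) {
                System.out.printf("[%s]\t%s\t%s\n", position, key, value);
                position = reader.getPosition(); // start of the next record
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}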
3. Write & read a MapFile
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.MapFile.Writer;
import org.apache.hadoop.io.MapFile.Reader;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MyMapFile {
    static private final String[] DATA = {
        "this is the first",
        "this is the second",
        "this is the third",
        "this is the forth"
    };

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        String uri = args[0];
        IntWritable key = new IntWritable();
        Text val = new Text();
        MapFile.Writer writer = new MapFile.Writer(conf, new Path(uri),
                Writer.keyClass(key.getClass()), Writer.valueClass(val.getClass()));
        for (int i = 0; i < 10; i++) {
            key.set(i + 1);
            val.set(DATA[i % DATA.length]);
            writer.append(key, val);
        }
        writer.close();
        MapFile.Reader reader = new MapFile.Reader(new Path(uri), conf);
        while (reader.next(key, val)) {
            System.out.println(key + "\t" + val);
        }
        reader.close();
    }
}
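The reader loop above scans the MapFile sequentially with next(); the point of a MapFile, though, is random access through its index. A small sketch of a keyed lookup (the class name MyMapFileLookup is illustrative, not from the original post):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;

public class MyMapFileLookup {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        MapFile.Reader reader = new MapFile.Reader(new Path(args[0]), conf);
        IntWritable key = new IntWritable(Integer.parseInt(args[1]));
        Text val = new Text();
        // get() uses the in-memory index to seek to the key; it returns null if the key is absent
        Text found = (Text) reader.get(key, val);
        System.out.println(key + "\t" + (found == null ? "<not found>" : val));
        reader.close();
    }
}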
4. Configuration Printer
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.conf.Configured;

public class ConfigurationPrinter extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Configuration.addDefaultResource(args[0]);
        Configuration.addDefaultResource(args[1]);
        Configuration conf = getConf();
        for (Entry<String, String> entry : conf) {
            System.out.printf("%s=%s\n", entry.getKey(), entry.getValue());
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new ConfigurationPrinter(), args);
    }
}
5. Computing the maximum temperature (with the brand-new 2.2.0 API)
Deprecated: all constructors of the Job class; the new API obtains a Job instance through the static method Job.getInstance(conf), as the code below does.
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class getMaxTemperature extends Configured implements Tool {

    // The Mapper and Reducer must be static nested classes so the framework can instantiate them.
    static class MaxTemperatureMapper
            extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        public void map(LongWritable key, Text val, Context context)
                throws IOException, InterruptedException {
            String line = val.toString();
            String year = line.substring(15, 19);
            int temperature;
            if (!hasPlus(line)) {
                temperature = Integer.parseInt(line.substring(87, 92));
            } else {
                // parseInt does not accept a leading '+'
                temperature = Integer.parseInt(line.substring(88, 92));
            }
            String qual = line.substring(92, 93);
            if (matched(qual)) {   // emit only readings with a good quality code
                context.write(new Text(year), new IntWritable(temperature));
            }
        }

        private boolean hasPlus(String line) {
            return line.charAt(87) == '+';
        }

        private boolean matched(String qual) {
            return qual.matches("[01459]");
        }
    }

    static class MaxTemperatureReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> vals, Context context)
                throws IOException, InterruptedException {
            int maxValue = Integer.MIN_VALUE;
            for (IntWritable value : vals) {
                maxValue = Math.max(maxValue, value.get());
            }
            context.write(key, new IntWritable(maxValue));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf);
        job.setJobName("helloRuby");
        job.setJarByClass(getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(MaxTemperatureMapper.class);
        job.setCombinerClass(MaxTemperatureReducer.class);
        job.setReducerClass(MaxTemperatureReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new getMaxTemperature(), args);
    }
}
6. File Read
import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class FileRead {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // InputStream is abstract and cannot be instantiated directly;
        // fs.open() returns a concrete FSDataInputStream.
        InputStream in = fs.open(new Path(args[0]));
        IOUtils.copyBytes(in, System.out, 4096, false);
        IOUtils.closeStream(in);
    }
}
Process analysis:
(1) open
When a client reads a file, it calls open() on a FileSystem object, where the FileSystem is an instance for HDFS.
In the program above, `FileSystem fs = FileSystem.get(conf)` obtains the FileSystem instance, and `fs.open(new Path(args[0]))` calls open() on it.
(2) get block locations
HDFS then calls the namenode over RPC (Remote Procedure Call) to obtain the locations of the blocks for the first few blocks of the file. For each block the namenode returns the addresses of the datanodes holding a copy, sorted by network topology.
With the locations in hand, DFS returns an FSDataInputStream to the client for reading. The FSDataInputStream in turn wraps a DFSInputStream, which manages the datanode and namenode I/O and also stores the addresses of the datanodes for those first blocks.
(3) read
The client calls read() on the FSDataInputStream. The DFSInputStream connects to the closest datanode holding the first block and keeps calling read() to stream data from that datanode to the client. When the end of the block is reached, DFSInputStream closes the connection to that datanode, finds the datanode holding the next block, and repeats.
(4) DFSInputStream
DFSInputStream reads each block in order from the closest datanode holding it, re-establishing a connection for each new block. It also keeps in touch with the namenode to retrieve the datanode locations for the next batch of blocks.
(5) FSDataInputStream
FSDataInputStream is the intermediary between the client and the datanodes; the client's read() calls go through FSDataInputStream and are delegated to DFSInputStream.
(6) Fault tolerance
Two main kinds of errors can occur while reading:
1. Communication between DFSInputStream and a datanode fails. DFSInputStream then tries the next closest datanode holding that block, and it remembers the failed datanode so it is not retried for later blocks.
2. DFSInputStream checksums the data received from a datanode and detects a corrupted block. It reports the corrupt block to the namenode before attempting to read a replica of that block from another datanode.
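As an illustrative aside (not part of the original analysis), the FSDataInputStream returned by open() also implements Seekable, so the client can seek back and re-read data, which a plain java.io.InputStream cannot do:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class FileSeekRead {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FSDataInputStream in = null;
        try {
            in = fs.open(new Path(args[0]));
            IOUtils.copyBytes(in, System.out, 4096, false); // first pass over the file
            in.seek(0);                                      // go back to the beginning
            IOUtils.copyBytes(in, System.out, 4096, false); // read it again
        } finally {
            IOUtils.closeStream(in);
        }
    }
}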
7. File Write
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;

public class CreateDir {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String dst = args[0];
        FileSystem fs = FileSystem.get(conf);
        fs.create(new Path(dst));
    }
}
Process analysis:
(1) create
When a client writes a file, it calls create() on a FileSystem object, where the FileSystem is an instance for HDFS.
(2) create the new file in the filesystem's namespace
DFS calls the namenode via a Remote Procedure Call, and the namenode creates a new file in the filesystem namespace. Before creating it, the namenode runs a series of checks, including whether the client has permission to create the file and whether the file already exists; if a check fails, an IOException is thrown.
Once the file is created, DFS returns an FSDataOutputStream to the client for writing. The FSDataOutputStream wraps a DFSOutputStream, which talks to the namenode and the datanodes.
(3) the client writes data
As the client writes data into the file, DFSOutputStream splits the data into packets and appends them to an internal queue, the data queue, which DFSOutputStream maintains.
The data queue is consumed by the DataStreamer, whose job is to ask the namenode to allocate suitable datanodes for the block replicas; the namenode picks the datanodes that will store the data blocks.
The datanodes storing a block form a pipeline; with a replication level of 3, the pipeline consists of 3 datanodes.
The DataStreamer streams each packet to the first datanode in the pipeline, which stores it and forwards it to the second datanode; the second datanode likewise stores it and forwards it to the third.
(4) fault tolerance
From (3) we know that DFSOutputStream maintains a data queue. It also maintains an ack queue (of acknowledged packets): a packet is removed from the ack queue only once it has been acknowledged by every datanode in the pipeline.
If a datanode fails while data is being written, the following happens:
First, the pipeline is closed and all packets in the ack queue are moved back to the front of the data queue, so that datanodes downstream of the failed node do not miss any packets. The current block on the good datanodes is given a new identity, and the partial block on the failed datanode is deleted when that datanode recovers later.
The failed datanode is removed from the pipeline; the namenode notices this and allocates another datanode to form a new pipeline. Subsequent blocks are not affected.
If several datanodes fail while a block is being written, the write still succeeds as long as dfs.replication.min replicas (default 1) are written; the block is then replicated across the cluster asynchronously until it reaches the dfs.replication target (default 3).
(5) finishing the write
When the client finishes writing, it calls close() on the FSDataOutputStream. This flushes all remaining packets to the datanode pipeline and waits for the acknowledgements before signalling the namenode that the file is complete.
Because the DataStreamer has already asked the namenode for block locations for all the packets, the namenode already knows which blocks make up the file.
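A small illustrative sketch of the write path just described (the class name WriteAndFlush is hypothetical, not from the original post): write a few bytes, push the buffered packets to the pipeline with hflush(), then let close() wait for the final acknowledgements.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WriteAndFlush {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FSDataOutputStream out = fs.create(new Path(args[0]));
        out.write("hello hdfs\n".getBytes("UTF-8"));
        out.hflush(); // flush buffered packets to the datanode pipeline, making them visible to readers
        out.close();  // flush any remaining packets and wait for the pipeline acknowledgements
    }
}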
8. Using MRUnit
Commonly used classes
implements java.lang.annotation.Annotation:
org.junit.Test
org.junit.Rule
org.junit.Ignore
org.junit.ClassRule
org.junit.BeforeClass
org.junit.Before
org.junit.AfterClass
org.junit.After
Code
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.junit.Test;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;

public class MaxTemperatureMapperTest {
    @Test
    public void processesValidRecord() throws IOException {
        Text value = new Text(
            "0043011990999991950051518004+68750+023550FM-12+0382"
            // Year ^^^^
            + "99999V0203201N00261220001CN9999999N9-00111+99999999999");
            // Temperature ^^^^^
        new MapDriver<LongWritable, Text, Text, IntWritable>()
            .withMapper(new MaxTemperatureMapper())
            .withInput(new LongWritable(1), value)
            .withOutput(new Text("1950"), new IntWritable(-11))
            .runTest();
    }
}
Note some deprecated classes and methods:
It is understandable that org.apache.hadoop.mrunit.MapDriver<K1,V1,K2,V2> is deprecated: that class targets the old MapReduce API (org.apache.hadoop.mapred). For example, one of its methods is
MapDriver<K1,V1,K2,V2> withMapper(org.apache.hadoop.mapred.Mapper<K1,V1,K2,V2> m)
The new MapReduce API lives in org.apache.hadoop.mapreduce.*; the corresponding MRUnit MapDriver (and ReduceDriver) is org.apache.hadoop.mrunit.mapreduce.MapDriver<K1,V1,K2,V2>, whose methods take the new-API types instead, e.g. withMapper(org.apache.hadoop.mapreduce.Mapper<K1,V1,K2,V2> m) and withCounters(org.apache.hadoop.mapreduce.Counters ctrs).
T withInputValue(V1 val) in the MapDriverBase class is also deprecated in favour of T withInput(K1 key, V1 val); there are many more, not listed here.
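By analogy, a reducer can be tested with the new-API org.apache.hadoop.mrunit.mapreduce.ReduceDriver. A minimal sketch (not from the original post), assuming the MaxTemperatureReducer from section 5 is available as a top-level class:

import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

public class MaxTemperatureReducerTest {
    @Test
    public void returnsMaximumIntegerInValues() throws IOException {
        new ReduceDriver<Text, IntWritable, Text, IntWritable>()
            .withReducer(new MaxTemperatureReducer())
            // the reducer receives a key and the list of its values
            .withInput(new Text("1950"),
                       Arrays.asList(new IntWritable(10), new IntWritable(5)))
            .withOutput(new Text("1950"), new IntWritable(10))
            .runTest();
    }
}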
Execution steps:
Note: you need to download and build MRUnit, set an MRUnit_HOME variable in /home/user/.bashrc, then edit $HADOOP_HOME/libexec/hadoop-config.sh to add $MRUnit_HOME/lib/*.jar to the classpath, source $HADOOP_HOME/libexec/hadoop-config.sh, and then run:
javac -d class/ MaxTemperatureMapper.java MaxTemperatureMapperTest.java
jar -cvf test.jar -C class ./
java -cp test.jar:$CLASSPATH org.junit.runner.JUnitCore MaxTemperatureMapperTest
9. Test Tool, ToolRunner, GenericOptionsParser, Configuration
Notes:
1. The run() method of Tool can be seen as the driver of a MapReduce program. We usually implement the Tool interface to set the job's launch properties, and then in main() call the static ToolRunner.run(new MainClass(), args), which indirectly invokes run().
2. It wraps the GenericOptionsParser class to parse the generic hadoop command line arguments (see the other post "hadoopFS-shell commands").
Code 1 (the resource added to the Configuration is a String):
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.fs.Path;

public class ConfigurationPrinter extends Configured implements Tool {
    static {
        Configuration.addDefaultResource("config.xml");
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        for (Entry<String, String> hash : conf) {
            System.out.printf("%s=%s\n", hash.getKey(), hash.getValue());
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new ConfigurationPrinter(), args);
        System.exit(exitCode);
    }
}
Note: the Configuration class provides the static method addDefaultResource(String name). As in the code above, when the resource "config.xml" is added as a String, hadoop looks for that file on the classpath; when the resource is a Path, hadoop looks for it on the local filesystem: Configuration conf = new Configuration(); conf.addResource(new Path("config.xml"));
Execution steps:
# put the custom config file config.xml into hadoop's $HADOOP_CONF_DIR
% mv config.xml $HADOOP_HOME/etc/hadoop/
Suppose the resource we add looks like this:
<!--cat $HADOOP_HOME/etc/hadoop/config.xml-->
<configuration>
  <property>
    <name>color</name>
    <value>yellow</value>
  </property>
  <property>
    <name>size</name>
    <value>10</value>
  </property>
  <property>
    <name>weight</name>
    <value>heavy</value>
    <final>true</final>
  </property>
</configuration>
Run the code:
mkdir class
source $HADOOP_HOME/libexec/hadoop-config.sh
javac -d class ConfigurationPrinter.java
jar -cvf ConfigurationPrinter.jar -C class ./
export HADOOP_CLASSPATH=ConfigurationPrinter.jar:$CLASSPATH
# check whether the resource we just added is picked up;
# config.xml defines <name>color</name>, so run:
yarn ConfigurationPrinter | grep "color"
color=yellow
# so the code works as expected
Alternatively, specify the configuration on the command line; for example, the following also works:
% $YARN_HOME/bin/yarn ConfigurationPrinter --conf config.xml | grep color
color=yellow
Code 2 (the resource added to the Configuration is a Path):
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.fs.Path;

public class ConfigurationPrinter extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.addResource(new Path("config.xml"));
        for (Entry<String, String> hash : conf) {
            System.out.printf("%s=%s\n", hash.getKey(), hash.getValue());
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new ConfigurationPrinter(), args);
        System.exit(exitCode);
    }
}
Since the resource added here is a Path, hadoop looks for config.xml on the local filesystem. There is no need to put config.xml under conf/; just point the code at its location on the local filesystem (e.g. new Path("../others/config.xml")).
Run steps:
% mkdir class
% source $HADOOP_HOME/libexec/hadoop-config.sh
% javac -d class ConfigurationPrinter.java
% jar -cvf ConfigurationPrinter.jar -C class ./
% export HADOOP_CLASSPATH=ConfigurationPrinter.jar:$CLASSPATH
# check whether the resource we just added is picked up;
# config.xml defines <name>color</name>, so run:
% yarn ConfigurationPrinter | grep "color"
color=yellow
# so the code works as expected
Note: GenericOptionsParser also supports setting individual properties. The supported generic options are:
-conf <configuration file>                      specify a configuration file
-D <property=value>                             use value for given property
-fs <local|namenode:port>                       specify a namenode
-jt <local|jobtracker:port>                     specify a job tracker
-files <comma separated list of files>          specify comma separated files to be copied to the map reduce cluster
-libjars <comma separated list of jars>         specify comma separated jar files to include in the classpath
-archives <comma separated list of archives>    specify comma separated archives to be unarchived on the compute machines
You can try:
% $YARN_HOME/bin/yarn ConfigurationPrinter -D myproperty=myvalue | grep myproperty
# output:
myproperty=myvalue
One more reminder:
ToolRunner can be used to run classes implementing the Tool interface. It works in conjunction with GenericOptionsParser to parse the generic hadoop command line arguments and modifies the Configuration of the Tool. The application-specific options are passed along without being modified.
In other words, ToolRunner and GenericOptionsParser together parse (and apply) the generic hadoop command line arguments. (What are generic hadoop command line arguments? For example: yarn command [genericOptions] [commandOptions].)
10. DFS API operations
Code 1. Reading data from a hadoop URL
Notes: for Java to read data from Hadoop's DFS, it must be able to recognise the hdfs URL scheme, so we hand HDFS's FsUrlStreamHandlerFactory to Java through URL.setURLStreamHandlerFactory.
Caveat: this approach has a drawback. setURLStreamHandlerFactory can be called at most once per JVM, so if a third-party component has already set a URLStreamHandlerFactory, a hadoop user cannot use this method to read data from hadoop.
import java.io.InputStream;
import java.net.URL;
import org.apache.hadoop.fs.FsUrlStreamHandlerFactory;
import org.apache.hadoop.io.IOUtils;

public class URLCat {
    static {
        URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());
    }

    public static void main(String[] args) throws Exception {
        InputStream in = null;
        try {
            in = new URL(args[0]).openStream();
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
        }
    }
}
Execution steps:
$ source $YARN_HOME/libexec/hadoop-config.sh
$ mkdir myclass
$ javac -cp $CLASSPATH URLCat.java -d myclass
$ jar -cvf urlcat.jar -C myclass ./
# assume we have a file bar.txt in hdfs: /user/grid/bar.txt
# then we need to run yarn with this command
$ yarn jar urlcat.jar URLCat hdfs:///user/grid/bar.txt
Code 2. Reading data using the HDFS API
Notes: using Hadoop's FileSystem API avoids the limitation described above, where setURLStreamHandlerFactory can only be called once per JVM.
import java.net.URI;
import java.io.InputStream;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;

public class URICat {
    public static void main(String[] args) throws Exception {
        String uri = args[0];
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        InputStream in = null;
        try {
            in = fs.open(new Path(uri));
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
        }
    }
}
Execution steps:
$ source $YARN_HOME/libexec/hadoop-config.sh
$ mkdir myclass
$ javac -cp $CLASSPATH URICat.java -d myclass
$ jar -cvf uricat.jar -C myclass ./
$ yarn jar uricat.jar URICat /user/grid/bar.txt
Note 1: because we use the FileSystem API, the input file path may omit the full hdfs:// URI, as in the execution steps above.
Note 2: FileSystem is an abstract class, so you cannot obtain an instance with new FileSystem(); call its static get() method instead.
Note 3: note Java upcasting, which shows up in the inheritance relationships between the various Stream classes in the brief hints below.
Note 4: Configuration conf = new Configuration();
Configurations are driven by <name>/<value> pairs in XML resource files, with the following rules:
if a resource is named by a String, hadoop looks for a file of that name on the classpath;
if a resource is named by a Path, hadoop looks it up directly on the local filesystem, without checking the classpath;
if the user specifies nothing, the two default resources core-site.xml and core-default.xml are loaded;
users can add their own XML file for custom configurations: conf.addResource("my_configuration.xml");
Code 3. Writing data
Copy a file from the local filesystem to HDFS.
Version 1: FileCopy with the copyBytes() method
Brief hints:
1. The core is a single call that copies an InputStream to an OutputStream as raw bytes:
static void copyBytes(InputStream in, OutputStream out, int buffSize, boolean close)
2. We create a FileInputStream(localsrc) instance, wrap it in a BufferedInputStream, and upcast it to InputStream:
FileInputStream(String name)
3. Call FileSystem to produce the OutputStream:
FSDataOutputStream create(Path f, Progressable progress)
Code:
import java.net.URI;
import java.io.InputStream;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.OutputStream;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;

public class FileCopyWithProgress {
    public static void main(String[] args) throws Exception {
        String localsrc = args[0];
        String dst = args[1];
        InputStream in = new BufferedInputStream(new FileInputStream(localsrc));
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(dst), conf);
        OutputStream out = fs.create(new Path(dst),
                new Progressable() {
                    public void progress() { System.out.print("."); }
                });
        IOUtils.copyBytes(in, out, 4096, true);
    }
}
Execution steps:
% source $YARN_HOME/libexec/hadoop-config.sh
% javac -cp $CLASSPATH -d my_class FileCopyWithProgress.java
% jar -cvf filecopywithprogress.jar -C my_class/ .
# assume we have a local file foo.out in directory: /home/grid/foo.out,
# then we should run yarn like below
% yarn jar filecopywithprogress.jar FileCopyWithProgress \
    /home/grid/foo.out hdfs:///user/grid/copied_foo.out
# we can do a check for the copied file
% hadoop fs -ls -R /user/grid/
Note: from here on a different way of compiling and running the code is used.
Version 2: using FileSystem's copyFromLocalFile() method
The code is as follows:
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;

public class FileCopyFromLocal {
    public static void main(String[] args) throws Exception {
        String localSrc = args[0];
        String dst = args[1];
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        fs.copyFromLocalFile(new Path(localSrc), new Path(dst));
    }
}
Execution steps:
$ source $YARN_HOME/libexec/hadoop-config.sh
$ javac FileCopyFromLocal.java -d class/
$ jar -cvf filecopyfromlocal.jar -C class ./
$ export HADOOP_CLASSPATH=$CLASSPATH:filecopyfromlocal.jar
# suppose we have a file bar.txt in local disk,
# then we use the following command line to copy it to hdfs
$ yarn FileCopyFromLocal bar.txt hdfs:///user/grid/kissyou
# we can check the copied file on hdfs
$ hadoop fs -ls /user/grid/
-rw-r--r--   3 grid supergroup        899 2013-11-17 01:33 /user/grid/kissyou
Code 4. Creating directories and files
Create a directory: FileSystem.mkdirs()
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class CreateDir {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String dst = args[0];
        FileSystem fs = FileSystem.get(conf);
        fs.mkdirs(new Path(dst));
    }
}
Execution steps:
$ source $YARN_HOME/libexec/hadoop-config.sh
$ javac CreateDir.java -d class/
$ jar -cvf createdir.jar -C class ./
$ export HADOOP_CLASSPATH=$CLASSPATH:createdir.jar
$ yarn CreateDir hdfs:///user/grid/kissyou
# we can check the created directory on hdfs
$ hadoop fs -ls /user/grid/
drwxr-xr-x   - grid supergroup          0 2013-11-17 01:33 /user/grid/kissyou
Create a file: FileSystem.create()
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class CreateFile {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String dst = args[0];
        FileSystem fs = FileSystem.get(conf);
        fs.create(new Path(dst));
    }
}
Execution steps:
$ source $YARN_HOME/libexec/hadoop-config.sh
$ javac CreateFile.java -d class/
$ jar -cvf createfile.jar -C class ./
$ export HADOOP_CLASSPATH=$CLASSPATH:createfile.jar
$ yarn CreateFile hdfs:///user/grid/kissyou.txt
# we can check the created file on hdfs
$ hadoop fs -ls /user/grid/
-rw-r--r--   3 grid supergroup        899 2013-11-17 01:33 /user/grid/kissyou.txt
Three things to note:
1. You cannot create a file foo and a directory foo/ with the same name under the same path; doing so throws fs.FileAlreadyExistsException at runtime.
2. mkdirs() is called automatically during copy and write operations, so you rarely need to call it by hand to create directories.
3. The official API documentation describes mkdirs() as "Make the given file and all non-existent parents into directories", so directory creation in hadoop is recursive, equivalent to the Linux command:
% mkdir -p foo/bar/qzx
and likewise equivalent to the hdfs shell command:
% $YARN_HOME/bin/hadoop fs -mkdir -p hdfs:///foo/bar/qzx
Code 5. Testing a file and getting its FileStatus
Hint: some APIs are deprecated in hadoop 2.2; the code below uses only the new constructors and methods.
import java.net.URI;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;

public class TestFileStatus {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FileStatus stat = fs.getFileStatus(new Path(args[0]));
        if (stat.isDirectory()) {
            System.out.println(stat.getPath().toUri().getPath() + " is a directory.");
        } else if (stat.isFile()) {
            System.out.println(stat.getPath().toUri().getPath() + " is a file.");
            System.out.println(stat.getPath().toUri().getPath() + " getBlockSize: " + stat.getBlockSize());
            System.out.println(stat.getPath().toUri().getPath() + " getLen(): " + stat.getLen());
            System.out.println(stat.getPath().toUri().getPath() + " getOwner(): " + stat.getOwner());
            System.out.println(stat.getPath().toUri().getPath() + " getGroup(): " + stat.getGroup());
            System.out.println(stat.getPath().toUri().getPath() + " getAccessTime(): " + stat.getAccessTime());
            System.out.println(stat.getPath().toUri().getPath() + " getModificationTime(): " + stat.getModificationTime());
            System.out.println(stat.getPath().toUri().getPath() + " getPermission(): " + stat.getPermission());
            System.out.println(stat.getPath().toUri().getPath() + " hashcode(): " + stat.hashCode());
            System.out.println(stat.getPath().toUri().getPath() + " getPath(): " + stat.getPath());
        }
    }
}
The small script below, which I wrote, makes it convenient to compile the code and build the jar:
#!/usr/bin/env sh
CWD=$(pwd)
export CLASSPATH=''
. $YARN_HOME/libexec/hadoop-config.sh
if [ -d class ]; then
    rm -rf class/*
else
    mkdir $CWD/class
fi
for f in $@
do
    srcs="$srcs $CWD/$f"
done
javac $srcs -d class
if [ $? -ne 0 ] ; then
    echo Error found when compiling the code!
    exit 1
fi
class=$(cat $1 | grep 'package' | sed -e "s/\(package\s\)\|\(;\)//g" \
    ).$(echo $1 | sed -r 's/(.*).java/echo \1/ge')
jarfile=$(echo $1 | sed -r 's/(.*)\.java/echo \L\1\.jar/ge')
jar -cvf $CWD/$jarfile -C $CWD/class . > /dev/null 2>&1
#echo jar -cvf $jarfile -C class .
echo -----------------CMD Lines-----------------------
echo source $YARN_HOME/libexec/hadoop-config.sh > sourceIt.sh
echo export HADOOP_CLASSPATH=$jarfile:'$CLASSPATH' >> sourceIt.sh
echo source $CWD/sourceIt.sh
echo yarn $class [command args]...
Execution steps:
Note: for simplicity, the script requires that the first argument is the source file containing the main class:
$ ./compack.sh args1 args2 args3...    # args1 contains the main class
$ chmod 500 compack.sh
$ ./compack.sh TestFileStatus.java
# then the script will remind you with the following message:
-----------------CMD Lines------------------
source sourceIt.sh
yarn TestFileStatus [command args]...
$ source sourceIt.sh
# suppose we have a file "part-m-00000" in hdfs, run yarn like below
$ yarn TestFileStatus /user/hive/warehouse/footbl/part-m-00000
Output:
# output
/user/hive/warehouse/footbl/part-m-00000 is a file.
/user/hive/warehouse/footbl/part-m-00000 getBlockSize: 134217728
/user/hive/warehouse/footbl/part-m-00000 getLen(): 1275
/user/hive/warehouse/footbl/part-m-00000 getOwner(): grid
/user/hive/warehouse/footbl/part-m-00000 getGroup(): supergroup
/user/hive/warehouse/footbl/part-m-00000 getAccessTime(): 1384675957784
/user/hive/warehouse/footbl/part-m-00000 getModificationTime(): 1384675958368
/user/hive/warehouse/footbl/part-m-00000 getPermission(): rw-r--r--
/user/hive/warehouse/footbl/part-m-00000 hashcode(): 1096001837
/user/hive/warehouse/footbl/part-m-00000 getPath(): hdfs://cluster1:9000/user/hive/warehouse/footbl/part-m-00000
Code6. Listing files & glob files
Listing files
import java.net.URI;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;

public class ListFiles {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path[] paths = new Path[args.length];
        for (int i = 0; i < args.length; i++) {
            paths[i] = new Path(args[i]);
        }
        FileStatus[] status = fs.listStatus(paths);
        Path[] pathList = FileUtil.stat2Paths(status);
        for (Path p : pathList) {
            System.out.println(p);
        }
    }
}
Execution steps:
% ./compack.sh ListFiles.java
% source sourceIt.sh
% yarn ListFiles /user/hive/warehouse/footbl /user/grid/
output:
hdfs://cluster1:9000/user/hive/warehouse/footbl/_SUCCESS
hdfs://cluster1:9000/user/hive/warehouse/footbl/part-m-00000
hdfs://cluster1:9000/user/grid/kiss
hdfs://cluster1:9000/user/grid/kissyou
hdfs://cluster1:9000/user/grid/missyou
Filter files
package org.apache.hadoop.MyCode;

import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.Path;

public class MyFilter implements PathFilter {
    private final String regex;

    public MyFilter(String regex) {
        this.regex = regex;
    }

    public boolean accept(Path path) {
        return path.toString().matches(regex);
    }
}
package org.apache.hadoop.MyCode;

import org.apache.hadoop.MyCode.MyFilter;
import java.net.URI;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.conf.Configuration;

public class ListStatusWithPattern {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FileStatus[] status = fs.globStatus(new Path(args[0]), new MyFilter(args[1]));
        Path[] pathList = FileUtil.stat2Paths(status);
        for (Path p : pathList) {
            System.out.println(p);
        }
    }
}
Execution steps:
% source $YARN_HOME/libexec/hadoop-config.sh
% mkdir class
% javac ListStatusWithPattern.java MyFilter.java -d class
% jar -cvf liststatuswithpattern.jar -C class ./
% export HADOOP_CLASSPATH=liststatuswithpattern.jar:$CLASSPATH
# suppose we have four files in hdfs like below
% hadoop fs -ls /user/grid/
Found 4 items
drwxr-xr-x   - grid supergroup          0 2013-11-17 01:06 /user/grid/kiss
-rw-r--r--   3 grid supergroup          0 2013-11-17 06:05 /user/grid/kissyou
drwxr-xr-x   - grid supergroup          0 2013-11-17 19:33 /user/grid/miss
-rw-r--r--   3 grid supergroup        899 2013-11-17 01:33 /user/grid/missyou
# then we can run the command to filter the matched files
% yarn jar liststatuswithpattern.jar org.apache.hadoop.MyCode.ListStatusWithPattern "hdfs:///user/grid/*ss*" "^.*grid/[k].*$"
Or use the script given earlier to compile, package, and print the main yarn command line:
$ ./compack.sh ListStatusWithPattern.java MyFilter.java
# note: the script assumes the first source file passed in contains the main class
-----------------CMD Lines-----------------------
source /home/grid/hadoop-2.2.0-src/hadoop-dist/target/hadoop-2.2.0/task/DFSAPIProgramming/sourceIt.sh
yarn org.apache.hadoop.MyCode.MyFilter [command args]...
$ source /home/grid/hadoop-2.2.0-src/hadoop-dist/target/hadoop-2.2.0/task/DFSAPIProgramming/sourceIt.sh
output:
hdfs://cluster1:9000/user/grid/kiss
hdfs://cluster1:9000/user/grid/kissyou
11. Overriding the comparator
Key points:
Type comparison is very important in hadoop MapReduce; it is mainly used to compare keys.
Hadoop's RawComparator<T> interface extends Java's Comparator and is mainly used to compare serialized objects.
Hadoop's WritableComparator class is more complete: it provides two main kinds of comparison, one comparing objects directly and the other comparing serialized representations.
For example, comparing objects:
compare(new IntWritable(21), new IntWritable(998));
versus comparing serialized representations:
compare(serialize(new IntWritable(21)), serialize(new IntWritable(998)))
Hint: inheritance relationships
// 1. org.apache.hadoop.io
// Interface RawComparator<T>
public interface RawComparator<T> extends Comparator<T>
// method
int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2)

// 2. org.apache.hadoop.io
// Interface WritableComparable<T>
public interface WritableComparable<T> extends Writable, Comparable<T>
// methods inherited from interface org.apache.hadoop.io.Writable:
readFields, write

// 3. java.lang.Object
//      |__ org.apache.hadoop.io.WritableComparator
public class WritableComparator extends Object implements RawComparator
// methods
int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2)
int compare(Object a, Object b)
int compare(WritableComparable a, WritableComparable b)
static int compareBytes(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2)

// 4. java.util
// Interface Comparator<T>
public interface Comparator<T>
// methods
int compare(T o1, T o2)
boolean equals(Object obj)
Code
import java.io.DataOutputStream;
import java.io.ByteArrayOutputStream;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.RawComparator;

public class MyIntWritableComparactor {
    public static byte[] serialize(IntWritable writable) throws Exception {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(out);
        writable.write(dataOut);
        dataOut.close();
        return out.toByteArray();
    }

    @SuppressWarnings("unchecked")
    public static void main(String[] args) throws Exception {
        RawComparator<IntWritable> comparator = WritableComparator.get(IntWritable.class);
        IntWritable w1 = new IntWritable(13);
        IntWritable w2 = new IntWritable(12);
        System.out.println("w1: " + w1 + " w2: " + w2);
        System.out.println("w1 compare w2 : " + comparator.compare(w1, w2));
        byte[] b1 = serialize(w1);
        byte[] b2 = serialize(w2);
        System.out.println("b1.length: " + b1.length);
        System.out.println("b2.length: " + b2.length);
        System.out.println("b1.length compare b2.length: "
                + comparator.compare(b1, 0, b1.length, b2, 0, b2.length));
    }
}
Compile and run:
# note: I am using hadoop 2.2
% source $YARN_HOME/libexec/hadoop-config.sh
% mkdir myclass
% javac -d myclass MyIntWritableComparactor.java
% jar -cvf mycompare.jar -C myclass ./
% export HADOOP_CLASSPATH=$CLASSPATH:mycompare.jar
% yarn MyIntWritableComparactor
Output:
w1: 13 w2: 12
w1 compare w2 : 1
b1.length: 4
b2.length: 4
b1.length compare b2.length: 1
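The code above only retrieves the registered comparator; actually overriding a comparator usually means subclassing WritableComparator and comparing the serialized bytes directly. A minimal sketch (not from the original post) that sorts IntWritable keys in descending order; it could be plugged into a job with job.setSortComparatorClass(DescendingIntComparator.class):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparator;

public class DescendingIntComparator extends WritableComparator {
    public DescendingIntComparator() {
        super(IntWritable.class, true); // create key instances so object compare() also works
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // IntWritable serializes as a 4-byte big-endian int
        int a = readInt(b1, s1);
        int b = readInt(b2, s2);
        return (a == b) ? 0 : (a < b ? 1 : -1); // reversed natural order
    }
}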
12. Data compression
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;

public class StreamCompressor {
    public static void main(String[] args) throws Exception {
        String codecClassname = args[0];
        Class<?> codecClass = Class.forName(codecClassname);
        Configuration conf = new Configuration();
        CompressionCodec codec =
                (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        CompressionOutputStream out = codec.createOutputStream(System.out);
        IOUtils.copyBytes(System.in, out, 4096, false);
        out.finish();
    }
}
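A decompression counterpart can infer the codec from the file extension using CompressionCodecFactory. The following is a minimal sketch (the class name FileDecompressor and the output-path convention are illustrative, not from the original post):

import java.io.InputStream;
import java.io.OutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class FileDecompressor {
    public static void main(String[] args) throws Exception {
        String uri = args[0];
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path inputPath = new Path(uri);
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(inputPath); // e.g. GzipCodec for *.gz
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            System.exit(1);
        }
        // strip the codec's extension to build the output name, e.g. foo.gz -> foo
        String outputUri =
                CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
        InputStream in = null;
        OutputStream out = null;
        try {
            in = codec.createInputStream(fs.open(inputPath));
            out = fs.create(new Path(outputUri));
            IOUtils.copyBytes(in, out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
            IOUtils.closeStream(out);
        }
    }
}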