SequenceFileInputFormat can only process files in the SequenceFile format.
Code:
package inputformat;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

// Uses the SequenceFile generated earlier (by the for-loop writer program below) as input data;
// that file was written with the key/value types <LongWritable, Text>.
// SequenceFileInputFormat can only process SequenceFile data.
public class SequenceFileInputFormatTest {

    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        final Text k2 = new Text();
        final LongWritable v2 = new LongWritable();

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            final String line = value.toString();
            final String[] splited = line.split("\\s");
            for (String word : splited) {
                k2.set(word);
                v2.set(1);
                context.write(k2, v2);
            }
        }
    }

    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        final LongWritable v3 = new LongWritable();

        @Override
        protected void reduce(Text k2, Iterable<LongWritable> v2s,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            long count = 0L;
            for (LongWritable v2 : v2s) {
                count += v2.get();
            }
            v3.set(count);
            context.write(k2, v3);
        }
    }

    public static void main(String[] args) throws Exception {
        final Configuration conf = new Configuration();
        final Job job = Job.getInstance(conf, SequenceFileInputFormatTest.class.getSimpleName());

        // 1.1 input path and input format
        FileInputFormat.setInputPaths(job, "hdfs://192.168.1.10:9000/sf1");
        // Changed here: TextInputFormat is replaced by SequenceFileInputFormat
        job.setInputFormatClass(SequenceFileInputFormat.class);

        // 1.2 mapper and its output types
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // 1.3 by default there is only one partition
        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(1);

        // 1.4 omitted
        // 1.5 omitted

        // 2.2 reducer and its output types
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // 2.3 output path and output format
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.1.10:9000/out1"));
        job.setOutputFormatClass(TextOutputFormat.class);

        // Required when the program is run as a packaged jar
        job.setJarByClass(SequenceFileInputFormatTest.class);
        job.waitForCompletion(true);
    }
}
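The Mapper's input types <LongWritable, Text> must match the key/value classes recorded in the SequenceFile header. As a minimal sketch (the class name is illustrative, and the HDFS address and /sf1 path are the same ones used throughout this post), the header can be checked like this:

package inputformat;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;

// Hypothetical helper: print the key/value classes stored in the SequenceFile header,
// which must match the Mapper's input types <LongWritable, Text>.
public class CheckSequenceFileTypes {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(new URI("hdfs://192.168.1.10:9000/"), conf);
        @SuppressWarnings("deprecation")
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("/sf1"), conf);
        System.out.println(reader.getKeyClassName());   // expected: org.apache.hadoop.io.LongWritable
        System.out.println(reader.getValueClassName()); // expected: org.apache.hadoop.io.Text
        reader.close();
    }
}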
Generate a SequenceFile to serve as input data for the SequenceFileInputFormat job above:
package sequenceFile;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
// ZooKeeper's IOUtils is used here as in the original; Hadoop's own
// org.apache.hadoop.io.IOUtils.closeStream works the same way.
import org.apache.zookeeper.common.IOUtils;

// Demo: write and then read back a SequenceFile in a for loop
public class Forduxie {
    public static void main(String[] args) throws Exception {
        final Path path = new Path("/sf1");
        Configuration conf = new Configuration();
        final FileSystem fs = FileSystem.get(new URI("hdfs://192.168.1.10:9000/"), conf);

        // Write ten <LongWritable, Text> records
        @SuppressWarnings("deprecation")
        final SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
                LongWritable.class, Text.class);
        for (int i = 0; i < 10; i++) {
            writer.append(new LongWritable(i), new Text(i + "=_="));
        }
        IOUtils.closeStream(writer);

        // Read the records back and print them
        @SuppressWarnings("deprecation")
        final SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        LongWritable key = new LongWritable();
        Text val = new Text();
        while (reader.next(key, val)) {
            System.out.println(key.get() + "\t" + val.toString());
        }
        IOUtils.closeStream(reader);
    }
}
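The Writer and Reader constructors used above are deprecated in Hadoop 2.x. A minimal sketch of the same write/read cycle using the option-based API (the class name is illustrative, and the fully qualified path assumes the same HDFS address as above):

package sequenceFile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

// Hypothetical variant of Forduxie using the non-deprecated Hadoop 2.x API
public class ForduxieV2 {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Fully qualified path, since no FileSystem instance is passed in
        Path path = new Path("hdfs://192.168.1.10:9000/sf1");

        // Writer built from options instead of the deprecated constructor
        SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(path),
                SequenceFile.Writer.keyClass(LongWritable.class),
                SequenceFile.Writer.valueClass(Text.class));
        for (int i = 0; i < 10; i++) {
            writer.append(new LongWritable(i), new Text(i + "=_="));
        }
        IOUtils.closeStream(writer);

        // Reader built the same way
        SequenceFile.Reader reader = new SequenceFile.Reader(conf,
                SequenceFile.Reader.file(path));
        LongWritable key = new LongWritable();
        Text val = new Text();
        while (reader.next(key, val)) {
            System.out.println(key.get() + "\t" + val);
        }
        IOUtils.closeStream(reader);
    }
}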
If you build this as a Maven project, add the following to your pom.xml:
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>
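If the project does not already pull in the Hadoop libraries, a Hadoop client dependency is also needed. A minimal sketch, assuming a Hadoop 2.x cluster (pick the version that matches your cluster):

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.6.0</version>
</dependency>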