The input data uses a space as the delimiter:
[root@baolibin hadoop]# hadoop fs -text /input/hehe
Warning: $HADOOP_HOME is deprecated.

hello you
hello me
hello you
hello me
The code is as follows:
package hadoop_2_6_0;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueLineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class KeyValueTextInputFormatTest {

    public static class MyMapper extends Mapper<Text, Text, Text, LongWritable> {
        final Text k2 = new Text();
        final LongWritable v2 = new LongWritable();

        @Override
        protected void map(Text key, Text value,
                Mapper<Text, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // The line has already been split into key and value by the input format,
            // so there is no need to split it manually as with TextInputFormat:
            // final String line = value.toString();
            // final String[] splited = line.split("o");
            // for (String word : splited) {
            //     k2.set(word);
            k2.set(key);
            v2.set(1);
            context.write(k2, v2);
            // }
        }
    }

    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        LongWritable v3 = new LongWritable();

        @Override
        protected void reduce(Text k2, Iterable<LongWritable> v2s,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            long count = 0L;
            for (LongWritable v2 : v2s) {
                count += v2.get();
            }
            v3.set(count);
            context.write(k2, v3);
        }
    }

    public static void main(String[] args) throws Exception {
        final Configuration conf = new Configuration();
        // Key and value within each line are separated by a space.
        conf.set(KeyValueLineRecordReader.KEY_VALUE_SEPERATOR, " ");
        final Job job = Job.getInstance(conf, KeyValueTextInputFormatTest.class.getSimpleName());
        // 1.1 input path and input format
        FileInputFormat.setInputPaths(job, "hdfs://192.168.1.100:9000/input/hehe");
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        // 1.2 mapper and map output types
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // 1.3 partitioner and number of reduce tasks
        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(1);
        // 1.4
        // 1.5
        // 2.2 reducer and final output types
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // 2.3 output path and output format
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.1.100:9000/out5"));
        job.setOutputFormatClass(TextOutputFormat.class);
        // required when the job is submitted as a jar
        job.setJarByClass(KeyValueTextInputFormatTest.class);
        job.waitForCompletion(true);
    }
}
The result is as follows:
[root@baolibin hadoop]# hadoop fs -text /out5/part-r*
Warning: $HADOOP_HOME is deprecated.

hello   4
Explanation:
The following line tells the job that each line of input is split at the space: the text before the space becomes the key and the text after it becomes the value.
conf.set(KeyValueLineRecordReader.KEY_VALUE_SEPERATOR, " ");
If no separator is specified, the default separator is the tab character; for any line that does not contain the separator at all, the entire line becomes the key and the value is empty.
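To make the splitting rule concrete, here is a small plain-Java sketch of that behaviour (this is not the actual Hadoop record reader; the helper splitLine is made up purely for illustration): the line is cut at the first occurrence of the separator, and a line without the separator keeps everything in the key.

public class SeparatorDemo {
    // Hypothetical helper that mimics how a line is split into (key, value).
    static String[] splitLine(String line, String separator) {
        int pos = line.indexOf(separator);
        if (pos == -1) {
            // No separator in the line: whole line is the key, value is empty.
            return new String[] { line, "" };
        }
        return new String[] { line.substring(0, pos), line.substring(pos + separator.length()) };
    }

    public static void main(String[] args) {
        String[] kv = splitLine("hello you", " ");
        System.out.println("key=" + kv[0] + ", value=" + kv[1]); // key=hello, value=you
    }
}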
This line specifies that KeyValueTextInputFormat is used as the input format:
job.setInputFormatClass(KeyValueTextInputFormat.class);
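For reference, KEY_VALUE_SEPERATOR is just a constant holding a configuration property name, so the same setting can also be made by property name. The small sketch below assumes Hadoop 2.x, where the constant is expected to resolve to the property shown in the comment:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.input.KeyValueLineRecordReader;

public class SeparatorConfigDemo {
    public static void main(String[] args) {
        // Print the property name behind the constant; in Hadoop 2.x this is
        // expected to be "mapreduce.input.keyvaluelinerecordreader.key.value.separator".
        System.out.println(KeyValueLineRecordReader.KEY_VALUE_SEPERATOR);

        Configuration conf = new Configuration();
        // Setting the separator via the constant should therefore be the same
        // as setting that property name directly in the configuration.
        conf.set(KeyValueLineRecordReader.KEY_VALUE_SEPERATOR, " ");
        System.out.println("separator = '" + conf.get(KeyValueLineRecordReader.KEY_VALUE_SEPERATOR) + "'");
    }
}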
Inside the map function, set can be called directly on the reusable Writable objects:
k2.set(key);
v2.set(1);
context.write(k2, v2);

Here the key and the value have already been split apart by the input format; this version uses only the key and ignores the value.
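To see why the job prints hello 4, the counting logic can be simulated with ordinary Java (this is only an illustration of what the mapper and reducer compute, not how Hadoop actually runs the job):

import java.util.LinkedHashMap;
import java.util.Map;

public class LocalCountDemo {
    public static void main(String[] args) {
        // The four input lines, as KeyValueTextInputFormat hands them to the mapper:
        // key = text before the first space, value = text after it.
        String[][] records = {
            {"hello", "you"}, {"hello", "me"}, {"hello", "you"}, {"hello", "me"}
        };

        // Map phase: the mapper emits (key, 1) for every record and ignores the value.
        // Shuffle + reduce phase: the reducer sums the 1s for each key.
        Map<String, Long> counts = new LinkedHashMap<String, Long>();
        for (String[] record : records) {
            Long old = counts.get(record[0]);
            counts.put(record[0], old == null ? 1L : old + 1L);
        }

        // Prints {hello=4}, which matches the /out5 output above.
        System.out.println(counts);
    }
}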
If I instead write the map body like this, using the value:
k2.set(value);
v2.set(1);
context.write(k2, v2);
then each record contributes its value ("you" or "me", two occurrences each) as the output key, and the result (with the output path changed to /out6 for this run) is:

[root@baolibin hadoop]# hadoop fs -text /out6/part-r*
Warning: $HADOOP_HOME is deprecated.

me   2
you  2