A First Look at Hadoop MapReduce: Writing My Own MapReduce Program

My own Hadoop MapReduce WordCount program

Below is the WordCount program I wrote. A detailed explanation of the code is already given in the comments, so I won't repeat it here. When I was learning, I wrote the WordCount program following the standard MapReduce boilerplate pattern, which is very handy: first write out the skeleton, then add the flesh to it.
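To make the "skeleton first" idea concrete, here is a minimal sketch of that boilerplate (the names SkeletonJob, SkeletonMapper and SkeletonReducer are placeholders I made up for illustration; only the structure matters):

```java
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

//Bare-bones skeleton: a Mapper, a Reducer, and a driver (class names are placeholders)
public class SkeletonJob {
    //map step: (input key, input value) -> (intermediate key, intermediate value)
    static class SkeletonMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            //parse value and call context.write(...)
        }
    }

    //reduce step: (intermediate key, all values for that key) -> (output key, output value)
    static class SkeletonReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            //aggregate the values and call context.write(...)
        }
    }

    //driver: configure the job, wire up the classes, submit it and wait
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "SkeletonJob");
        job.setJarByClass(SkeletonJob.class);
        job.setMapperClass(SkeletonMapper.class);
        job.setReducerClass(SkeletonReducer.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```

The full WordCount program that follows is exactly this skeleton with the map and reduce bodies filled in: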

```java
package org.hadoop.MapReduce;


import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * My own WordCount class, written as the standard MapReduce boilerplate.
 * @author Troy
 *
 */
public class MyWordCount {
    //Mapper class
    /**
     * The input is plain text, so the key is the byte offset of each line
     * and the value is the content of that line.
     * map() is called once per line, so the value inside map represents a single line.
     *
     */
    static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
        //Represents one occurrence of a word
        private final static IntWritable one = new IntWritable(1);
        //word is declared as a field so that a single Text instance is created once and
        //reused for every token, instead of allocating a new Text object on each call
        private static Text word = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            //Get the content of this line as a String
            String line = value.toString();
            //Split the line into words
            StringTokenizer stringTokenizer = new StringTokenizer(line);
            //Emit a count of 1 for every word
            while(stringTokenizer.hasMoreTokens()){
                String wordValue  = stringTokenizer.nextToken();
                word.set(wordValue);
                //Write out the intermediate (word, 1) pair
                context.write(word, one);
            }

        }

    }

    //Reducer class
    /**
     * The reduce function adds up all the values that belong to one key.
     * Since every value here is 1, the sum is that word's total count;
     * reduce() is called once for each distinct key.
     * @author Troy
     *
     */
    static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
        //As with word in the mapper, count is a field so that a single IntWritable
        //instance is reused across reduce() calls instead of being re-created every time
        private static IntWritable count = new IntWritable();
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            //Add up all the values for this key
            for(IntWritable value:values){
                sum += value.get();
            }
            //Write out the final (word, total count) pair
            count.set(sum);
            context.write(key, count);

        }

    }
    //Driver (client)
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException{
        //Load the configuration
        Configuration configuration = new Configuration();
        //Create the job
        Job job = new Job(configuration, "MyWordCount");
        //1. Set the class used to locate the job jar
        job.setJarByClass(MyWordCount.class);
        //2. Set the Mapper and Reducer classes
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        //3. Set the input and output paths
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        //4. Set the output key/value types (map output and final output are the same here)
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        //5. Submit the job, wait for it to finish, and print progress on the client
        boolean isSuccess = job.waitForCompletion(true);
        //Exit with the job status
        System.exit(isSuccess?0:1);

    }
}
```

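To make the data flow concrete: for an input file containing the single line `hello hadoop hello`, the mapper emits (hello, 1), (hadoop, 1), (hello, 1); the shuffle then groups the pairs by key, so the reducer receives (hadoop, [1]) and (hello, [1, 1]) and writes hadoop 1 and hello 2 to the output directory. Once the class is packaged into a jar (the jar name here is just a placeholder), the job can be submitted with something like `hadoop jar wordcount.jar org.hadoop.MapReduce.MyWordCount <HDFS input path> <HDFS output path>`; the output path must not already exist, otherwise FileOutputFormat will refuse to launch the job.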