First MR Program: WordsCount

Writing the MR Program
Writing the Mapper
package com.yuangh.hadoop.mr.demo;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Map phase.
 * Mapper is a generic class; its four type parameters are the input key,
 * input value, output key, and output value types.
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable();
        //Split the line on spaces and emit a (word, 1) pair for each token
        String[] arr = value.toString().split(" ");
        for (String s : arr) {
            keyOut.set(s);
            valueOut.set(1);
            context.write(keyOut, valueOut);
        }

    }

}
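
To make the mapper's contract concrete, here is a quick standalone illustration (not part of the job; the sample line is hypothetical) that replicates the split-and-emit logic on one input line:

public class MapLogicDemo {
    public static void main(String[] args) {
        //Hypothetical sample line, as delivered to map() by TextInputFormat
        String line = "hello world hello";
        for (String word : line.split(" ")) {
            //WCMapper would call context.write(word, 1) here
            System.out.println("(" + word + ", 1)");
        }
    }
}

Running it prints (hello, 1), (world, 1), (hello, 1); the shuffle then groups these pairs by key before they reach the reducer.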

Writing the Reducer

package com.yuangh.hadoop.mr.demo;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * The output of Map is the input of Reduce: the framework groups the map
 * output by key, so each reduce() call sees one word with all of its counts.
 */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * reduce: sum the counts collected for one word
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        //Emit the (word, total count) pair
        context.write(key, new IntWritable(count));
    }

}
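
Again as a standalone illustration (not part of the job), this mimics a single reduce() call for the key "hello" with the grouped values the shuffle would produce from the sample line above:

import java.util.Arrays;
import java.util.List;

public class ReduceLogicDemo {
    public static void main(String[] args) {
        //Grouped values for the key "hello", as the shuffle would deliver them
        List<Integer> values = Arrays.asList(1, 1);
        int count = 0;
        for (int v : values) {
            count += v;   //same summing loop as WCReducer.reduce()
        }
        //WCReducer would call context.write(key, new IntWritable(count)) here
        System.out.println("(hello, " + count + ")");
    }
}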

Writing the Driver (main method)
package com.yuangh.hadoop.mr.demo;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class WCApp {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);


        //Configure the job's properties
        job.setJobName("WCApp");           //job name
        job.setJarByClass(WCApp.class);    //class used to locate the job's jar
        job.setInputFormatClass(TextInputFormat.class);  //input format

        //Input path
        FileInputFormat.addInputPath(job, new Path(args[0]));
        //Output path (must not exist yet; the job fails on startup if it does)
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(WCMapper.class);   //Mapper class
        job.setReducerClass(WCReducer.class); //Reducer class

        job.setNumReduceTasks(1);             //number of reduce tasks

        //Types of the map output key and value
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        //Types of the final (reduce) output key and value
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        //Block until the job finishes; exit non-zero if it failed
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}
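
One common WordCount optimization the driver above omits: because the reducer just sums integers (an associative, commutative operation), WCReducer can double as a combiner that pre-aggregates map output before the shuffle. A minimal sketch, one extra line in the driver:

        //Optional: pre-aggregate (word, 1) pairs on the map side to cut shuffle traffic
        job.setCombinerClass(WCReducer.class);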
Detailed walkthrough (running MR in Local mode; a configuration sketch follows the list)
  • The client creates the external Job (mapreduce.Job) and sets its configuration
  • JobSubmitter writes job.xml, the split files, etc. into a staging directory
  • JobSubmitter submits the job to LocalJobRunner
  • LocalJobRunner converts the external Job into an internal Job
  • The internal Job thread spawns a separate thread to execute the job
  • The job-execution thread computes the map and reduce task information and spawns new threads from a thread pool to run the MR tasks
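
To force a job down this LocalJobRunner path (handy when debugging in an IDE), two standard Hadoop properties can be set before creating the Job; a minimal sketch, reusing the driver's existing imports and assuming input and output live on the local filesystem:

        Configuration conf = new Configuration();
        //Run map and reduce tasks in-process via LocalJobRunner instead of a cluster
        conf.set("mapreduce.framework.name", "local");
        //Resolve unqualified paths against the local filesystem
        conf.set("fs.defaultFS", "file:///");
        Job job = Job.getInstance(conf);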
Running the MR program on a Hadoop cluster
  • Package the project into a jar file with Maven

  • Copy the jar onto the cluster (any shared directory will do)

  • Run the hadoop jar command
hadoop jar MyHadoop.jar com.yuangh.hadoop.mr.demo.WCApp hdfs://s201/user/centosmin0/wc/data hdfs://s201/user/centosmin0/wc/out

Check the output at http://s201:50070 (the HDFS NameNode web UI):

[Figure 1: screenshot of the HDFS web UI]

Check the job at http://s201:8088 (the YARN ResourceManager web UI):

[Figure 2: screenshot of the YARN web UI]
