Hadoop官方案例WordCount简单实现

Hadoop官方案例WordCount简单实现

前提准备

创建Maven工程并导入以下依赖,注意将依赖的版本号修改为与集群的Hadoop版本一致

<dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-core</artifactId>
        <version>2.9.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.7.2</version>
    </dependency>

自定义Mapper类——MyMapper

package mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Word-count mapper: breaks each input record into whitespace-separated words
 * and emits a {@code (word, 1)} pair for every word found.
 *
 * <p>The input key is ignored; only the {@link Text} value is tokenized.
 */
public class MyMapper extends Mapper<LongWritable,Text,Text,LongWritable> {
    /** Constant count of one, written for every word occurrence. */
    private static final LongWritable ONE = new LongWritable(1);
    /** Output key instance, reused across {@code map} calls. */
    private final Text word = new Text();

    /**
     * Emits {@code (word, 1)} for each word in {@code value}.
     *
     * @param key     input key supplied by the framework (unused)
     * @param value   one input record to tokenize
     * @param context sink for the emitted pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // StringTokenizer splits on runs of whitespace (space, tab, newline, ...) and never
        // yields empty tokens, so blank lines and repeated spaces do not produce a bogus
        // "" word the way String.split(" ") does. Fully qualified to avoid a new import.
        java.util.StringTokenizer tokens = new java.util.StringTokenizer(value.toString());
        while (tokens.hasMoreTokens()) {
            word.set(tokens.nextToken());
            context.write(word, ONE);
        }
    }
}

自定义Reduce类——MyReduce

package mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Word-count reducer: adds up every count received for a word and emits
 * {@code (word, total)}.
 */
public class MyReduce extends Reducer<Text,LongWritable,Text,LongWritable>{
    /** Output value instance, reused across {@code reduce} calls. */
    private final LongWritable total = new LongWritable();

    /**
     * Sums the counts in {@code values} and writes the total under {@code key}.
     *
     * @param key     the word being reduced
     * @param values  the counts collected for {@code key}
     * @param context sink for the emitted pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // The parameter must be Iterable<LongWritable>: with a raw Iterable the elements
        // are typed as Object and the enhanced for loop below does not compile.
        long sum = 0;
        for (LongWritable count : values) {
            sum += count.get();
        }
        total.set(sum);
        context.write(key, total);
    }
}

自定义Runner类——MyRunner

package mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * 运行主类
 */
public class MyRunner implements Tool{
    private Configuration conf = null;
    public int run(String[] args) throws Exception {
        //设置配置类和任务名称
        Job job = Job.getInstance(conf,"myJob");

        //设置运行主类
        job.setJarByClass(MyRunner.class);

        //设置Mapper类
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        //设置Reducer类
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        //设置数据的输入和输出地址
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        //表示任务运行状态
        return job.waitForCompletion(true)?0:1;
    }

    public void setConf(Configuration conf) {
        this.conf=conf;
    }

    public Configuration getConf() {
        return this.conf;
    }

    public static void main(String[] args) throws Exception {
        int state = ToolRunner.run(new MyRunner(), args);
        System.exit(state);
    }
}

运行准备——打jar包

mvn clean package

运行

#在Hadoop的安装根目录下运行,并且把打好的jar包也放到该目录下。
$ bin/yarn jar hadoop-hdfs-1.0-SNAPSHOT.jar mapreduce.MyRunner /input /output

你可能感兴趣的:(Hadoop)