MR——wordcount案例

springboot程序

1、导入依赖

 
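<dependencies>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>RELEASE</version>
    </dependency>

    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-core</artifactId>
        <version>2.8.2</version>
    </dependency>

    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.6.0</version>
    </dependency>

    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.6.0</version>
    </dependency>

    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.6.0</version>
    </dependency>
</dependencies>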
    

2、配置日志文件

log4j.rootLogger=INFO,stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n

3、编写Mapper

package com.taikang.bd;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Word-count mapper: splits every input line on commas and emits {@code (word, 1)}
 * for each non-empty token.
 *
 * <p>Type arguments of {@link Mapper}:
 * <ul>
 *   <li>KEYIN ({@link LongWritable}) - by default, the starting offset of the line of text
 *       read by the MR framework</li>
 *   <li>VALUEIN ({@link Text}) - by default, the content of that line of text</li>
 *   <li>KEYOUT ({@link Text}) - key of the user-defined output; here, the word</li>
 *   <li>VALUEOUT ({@link IntWritable}) - value of the user-defined output; here, the
 *       occurrence count of the word</li>
 * </ul>
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Output key, reused across calls instead of allocating one object per token. */
    private final Text word = new Text();

    /** Constant output value: every emitted token counts once. */
    private final IntWritable one = new IntWritable(1);

    /**
     * Called by the map task once for every line of text it reads.
     *
     * @param key     offset of the line of text
     * @param value   content of the line of text
     * @param context task context used to write the output pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // 1. Get the line as a String.
        String line = value.toString();
        // 2. Split the line on commas.
        String[] tokens = line.split(",");
        // 3. Emit (token, 1) for every token.
        for (String token : tokens) {
            // A blank line or consecutive commas yield "" tokens; skip them so the
            // empty string is not counted as a word.
            if (token.isEmpty()) {
                continue;
            }
            word.set(token);
            context.write(word, one);
        }
    }
}

4、编写reduce

package com.taikang.bd;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Word-count reducer: sums the counts received for one word and writes {@code (word, sum)}.
 *
 * <p>Type arguments of {@link Reducer}:
 * <ul>
 *   <li>KEYIN ({@link Text}) - the word (the mapper's output key)</li>
 *   <li>VALUEIN ({@link IntWritable}) - a single count (the mapper's output value)</li>
 *   <li>KEYOUT ({@link Text}) - the word</li>
 *   <li>VALUEOUT ({@link IntWritable}) - the summed count for the word</li>
 * </ul>
 *
 * <p>Example: the map outputs {@code (wangfei, 1)} and {@code (wangfei, 1)} arrive here
 * grouped as {@code wangfei -> <1, 1>}.
 */
public class WCReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    /** Output value, reused across calls in the same way the mapper reuses its Writables. */
    private final IntWritable total = new IntWritable();

    /**
     * Sums all values of one key and writes the result.
     *
     * @param key     the word
     * @param values  the counts collected for this word
     * @param context task context used to write the output pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        // 1. Accumulate the counts.
        for (IntWritable value : values) {
            sum += value.get();
        }
        total.set(sum);
        // 2. Write (word, sum).
        context.write(key, total);
    }
}

5、编写driver

package com.taikang.bd;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver that configures and submits the word-count job.
 *
 * <p>Usage: {@code WCDriver <input path> <output path>}
 */
public class WCDriver {

    /**
     * Builds the job from {@link WCMapper} and {@link WCReduce}, submits it and waits for
     * it to finish. The JVM exits with 0 when the job succeeds, 1 when it fails, and 2 when
     * the required arguments are missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws IOException            propagated from job setup or submission
     * @throws ClassNotFoundException propagated from {@code waitForCompletion}
     * @throws InterruptedException   propagated from {@code waitForCompletion}
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException
        // when the paths are not supplied.
        if (args.length < 2) {
            System.err.println("Usage: WCDriver <input path> <output path>");
            System.exit(2);
            return;
        }

        Configuration conf = new Configuration();
        // 1. Obtain the job object.
        Job job = Job.getInstance(conf);
        // 2. Locate the jar through the driver class.
        job.setJarByClass(WCDriver.class);
        // 3. Wire up the mapper and reducer classes.
        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReduce.class);
        // 4. Key/value types of the mapper output.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Key/value types of the final output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Input and output paths.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 7. Submit the job and wait for completion.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}

6、打包成 jar(wc.jar)并上传到服务器运行

hadoop jar wc.jar com.taikang.bd.WCDriver /user/in /user/out

or

yarn jar wc.jar com.taikang.bd.WCDriver /user/in /user/out

注意:输出路径必须事先不存在(若已存在,作业会报错退出);输入路径和输出路径都是 HDFS 上的路径

 

 

 

 

你可能感兴趣的:(hadoop,java)