Hadoop in Action: MapReduce Application Examples

Reference: Hadoop in Action (Hadoop实战)

I. WordCount

  1. Task: count how often each word appears in the input files.
  2. Code:
package mapreduce;


import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.*;

public class NewWordCount extends Configured implements Tool{

    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);

        private Text word = new Text();

        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            String line = value.toString();

            StringTokenizer tokenizer = new StringTokenizer(line);

            while(tokenizer.hasMoreTokens()){
                word.set(tokenizer.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

            int sum = 0;

            // Sum the counts emitted by the mappers for this word.
            for (IntWritable val : values) {
                sum += val.get();
            }

            context.write(key, new IntWritable(sum));
        }
    }

    public int run(String[] args) throws Exception {

        Job job = new Job(getConf());

        job.setJarByClass(NewWordCount.class);
        job.setJobName("wordcount");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }
    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new NewWordCount(), args);
        System.exit(ret);
    }
}
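
Since word counting is associative and commutative, the Reduce class can also serve as a combiner to pre-aggregate counts on the map side and shrink the shuffle. A minimal optional addition (not part of the original listing) inside run(), right after setReducerClass:

        // Optional: reuse the reducer as a combiner so each mapper emits
        // partial word counts instead of one (word, 1) pair per occurrence.
        job.setCombinerClass(Reduce.class);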

II. Data Deduplication

  1. Task: remove duplicate records so that each distinct line appears only once in the output.
  2. Code:
package mapreduce;


import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.*;


/**
 * Data deduplication MapReduce example
 * @author hadoop
 *
 */
public class Dedup extends Configured implements Tool{


    public static class Map extends Mapper<Object, Text, Text, Text>{
        private static Text line = new Text();

        // Emit every input line as the key with an empty value; the shuffle
        // groups identical lines together, so the reducer sees each line once.
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            line = value;
            context.write(line, new Text(""));
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text>{
        // Writing only the key discards the duplicate values, so each distinct line is emitted once.
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            context.write(key, new Text(""));
        }
    }

    public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException{

//      Job job = new Job(getConf());
//      
//      job.setJobName("dedup");
//      
//      job .setOutputKeyClass(Text.class);
//      job.setOutputValueClass(Text.class);
//      
//      job.setMapperClass(Map.class);
//      job.setReducerClass(Reduce.class);
//      
//      job.setInputFormatClass(TextInputFormat.class);
//      job.setOutputFormatClass(TextOutputFormat.class);
//      
//      FileInputFormat.setInputPaths(job, new Path(args[0]));
//      FileOutputFormat.setOutputPath(job, new Path(args[1]));
//      
//      boolean success = job.waitForCompletion(true);
//      return success ? 0 : 1;


        // The commented-out code above is functionally equivalent to the code below.

        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if( otherArgs.length != 2){
            System.err.println("Usage: dedup  ");
            System.exit(2);
        }

        Job job = new Job(conf, "Data Deduplication");
        job.setJarByClass(Dedup.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setCombinerClass(Reduce.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        return job.waitForCompletion(true) ? 0 : 1;

    }

    public static void main(String[] args) throws Exception{
        int ret = ToolRunner.run(new Dedup(), args);
        System.exit(ret);
    }
}
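
Because Dedup already implements Tool, a slightly leaner run() (a sketch of an alternative, not the listing above) could reuse the Configuration that ToolRunner populates via getConf(), instead of building a new one and re-parsing the generic options:

    // Alternative run(): rely on ToolRunner/getConf() for configuration.
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: dedup <in> <out>");
            return 2;
        }
        Job job = new Job(getConf(), "Data Deduplication");
        job.setJarByClass(Dedup.class);
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }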

III. Data Sorting

  1. Task:

    Sort the input numbers in ascending order and prefix each with its rank. The map emits every number as a key, the shuffle sorts the keys, and the reduce writes a running line number followed by the value; a custom Partitioner (see the code below) range-partitions the keys so the output stays globally ordered even with several reducers.

  2. Example:

    file1:
    43
    32
    5
    6
    file2:
    98
    1234
    356
    2
    output:
    1 2
    2 5
    3 6
    4 32
    5 43
    6 98
    7 356
    8 1234

  3. Code:
package mapreduce;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.*;

public class Sort {

    public static class Map extends Mapper<Object, Text, IntWritable, IntWritable>{


        private static IntWritable data = new IntWritable();
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            data.set(Integer.parseInt(line));

            context.write(data, new IntWritable(1));
        }
    }

    public static class Reduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

        private static IntWritable linenum = new IntWritable(1);
        // Every occurrence of the key gets its own output line, so repeated numbers keep their multiplicity.
        public void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException{

            for(IntWritable val : values){
                context.write(linenum, key);
                linenum = new IntWritable(linenum.get() + 1);
            }
        }
    }
    public static class Partition extends Partitioner<IntWritable, IntWritable> {
        @Override
        public int getPartition(IntWritable key, IntWritable value, int numPartitions){
            // Assumed upper bound on the input values; the keys are split into
            // numPartitions ranges of equal width, so reducer i only receives
            // keys smaller than those of reducer i+1, giving a globally sorted output.
            int Maxnumber = 65223;

            int bound = Maxnumber / numPartitions + 1;
            int keynumber = key.get();

            for(int i = 1; i <= numPartitions; i++){

                if(keynumber < bound * i && keynumber >= bound * (i - 1)){
                    return i-1;
                }

            }
            return -1;
        }

    }


    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if( otherArgs.length != 2){
            System.err.println("Usage: sort  ");
            System.exit(2);
        }

        Job job = new Job(conf, "Data sorting");
        job.setJarByClass(Sort.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setPartitionerClass(Partition.class);

        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);

    }

}
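
As a quick sanity check of the partitioner's ranges (a hypothetical standalone check, assuming three reduce tasks), the bucket width works out to 65223 / 3 + 1 = 21742:

        // partition 0: [0, 21742)   partition 1: [21742, 43484)   partition 2: [43484, 65226)
        Sort.Partition p = new Sort.Partition();
        p.getPartition(new IntWritable(5),     new IntWritable(1), 3);   // -> 0
        p.getPartition(new IntWritable(30000), new IntWritable(1), 3);   // -> 1
        p.getPartition(new IntWritable(65000), new IntWritable(1), 3);   // -> 2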

IV. Single-Table Join

  1. Task:

    Join the child-parent table with itself to find every grandchild-grandparent pair. The map emits each record twice: once keyed by the parent with tag "1" (its value carries that person's child) and once keyed by the child with tag "2" (its value carries that person's parent). For each key, the reduce then pairs every collected child (a grandchild) with every collected parent (a grandparent).

  2. Example:

    Input:
    child parent
    Tom Lucy
    Tom Jack
    Lucy Mary
    Lucy Ben
    Jone Alma
    Output:
    grandchild grandparent
    Tom Mary
    Tom Ben 

  3. Code:
package mapreduce;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.*;

public class STjoin {

    public static int time = 0;
    public static class Map extends Mapper<Object, Text, Text, Text>{

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String childname = new String();
            String parentname = new String();

            String relationtype = new String();

            String line = value.toString();
            System.out.println(line);
            String[] values = line.trim().split(" ");

            if(values[0].compareTo("child")  != 0){

                childname = values[0];
                parentname = values[1];
                relationtype = "1";
                context.write(new Text(values[1]), new Text(relationtype + "+" + childname + "+" + parentname));

                relationtype = "2";
                context.write(new Text(values[0]), new Text(relationtype + "+" + childname + "+" + parentname));

            }
        }


    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {

        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException{

            if(time == 0){
                context.write(new Text("grandchild"), new Text("grandparent"));
                System.out.println("grandchild" + "\t" + "grandparent");
                time++;
            }

            int grandchildnum = 0;
            String grandchild[] = new String[10];
            int grandparentnum = 0;
            String grandparent[]  = new String[10];

            Iterator<Text> ite = values.iterator();

            while(ite.hasNext()){

                String record = ite.next().toString();

                int len = record.length();
                int i = 2;
                if(len == 0) continue;

                char relationtype = record.charAt(0);
                String childname = new String();
                String parentname = new String();

                // Extract the child name from the value ("<tag>+<child>+<parent>").
                while(record.charAt(i) != '+'){
                    childname = childname + record.charAt(i);
                    i++;
                }

                i = i + 1;
                while(i < len){

                    parentname = parentname + record.charAt(i);
                    i++;
                }


                if(relationtype == '1'){
                    grandchild[grandchildnum] = childname;
                    grandchildnum++;
                }

                else{
                    grandparent[grandparentnum] = parentname;
                    grandparentnum++;
                }
            }

            if(grandparentnum != 0 && grandchildnum != 0){
                for(int m = 0; m < grandchildnum; m++){
                    for(int n = 0; n < grandparentnum; n++){
                        context.write(new Text(grandchild[m]), new Text(grandparent[n]));
                        System.out.println(grandchild[m] + "\t" + grandparent[n]);
                    }
                }
            }

        }
    }


    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if( otherArgs.length != 2){
            System.err.println("Usage: stjoin  ");
            System.exit(2);
        }

        Job job = new Job(conf, "single table join");
        job.setJarByClass(STjoin.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);

    }

}
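
For reference, the character-by-character parsing inside reduce() could be written more compactly with String.split (a sketch of an equivalent alternative, assuming each record is non-empty and well formed; the listing above does the same thing by hand):

                // Each value looks like "1+Tom+Lucy": tag, child name, parent name.
                String[] parts = record.split("\\+");
                char relationtype = parts[0].charAt(0);   // '1' or '2'
                String childname  = parts[1];
                String parentname = parts[2];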

Test data:

child parent
Tom Lucy
Tom Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma

V. Multi-Table Join
