黑猴子的家:MapReduce 倒排索引(多job串联)

1、数据

https://www.jianshu.com/p/e33ff7e367db

2、输出结果

(1)第一次预期输出结果

victor--a.txt   3
victor--b.txt   2
victor--c.txt   2
pingping--a.txt 1
pingping--b.txt 3
pingping--c.txt 1
ss--a.txt   2
ss--b.txt   1
ss--c.txt   1

(2)第二次预期输出结果

victor  c.txt-->2   b.txt-->2   a.txt-->3   
pingping    c.txt-->1   b.txt-->3   a.txt-->1   
ss  c.txt-->1   b.txt-->1   a.txt-->2   

3、第一次处理

OneIndexMapper

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * First-pass mapper of the inverted-index pipeline.
 *
 * <p>Output key: {@code word--fileName} (for example {@code victor--a.txt}).
 * Output value: the constant count 1.
 */
public class OneIndexMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Name of the file behind the current input split; assigned in setup().
    String name;
    // Writable instances reused for every emitted record.
    Text k = new Text();
    IntWritable v = new IntWritable(1);

    /**
     * Records the name of the file this task is reading.
     *
     * @param context task context providing the input split
     */
    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {

        // The split is cast to FileSplit to reach its path, as in the original code.
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        name = inputSplit.getPath().getName();
    }

    /**
     * Emits {@code (word--fileName, 1)} for every word on the line.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one line of input text
     * @param context task context used to emit records
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {

        // 1. Read the line, e.g. "victor pingping".
        String line = value.toString();

        // 2. Split on any run of whitespace so that tabs never end up inside
        //    a key (the next job relies on tab-separated key/value lines).
        String[] words = line.split("\\s+");

        // 3. Emit one record per word.
        for (String word : words) {
            // Blank lines and leading whitespace produce an empty token;
            // emitting it would create a bogus "--fileName" key.
            if (word.isEmpty()) {
                continue;
            }

            // e.g. "victor--a.txt"
            k.set(word + "--" + name);
            context.write(k, v);
        }
    }
}

OneIndexReducer

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * First-pass reducer of the inverted-index pipeline: sums the counts emitted
 * for each {@code word--fileName} key and writes {@code (key, total)}.
 */
public class OneIndexReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused output value, so no new object is allocated per key.
    private final IntWritable total = new IntWritable();

    /**
     * Adds up all counts for one key and writes the sum.
     *
     * @param key     the {@code word--fileName} key
     * @param values  counts emitted for this key
     * @param context task context used to emit the result
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {

        // Accumulate the sum.
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }

        // Write the result.
        total.set(count);
        context.write(key, total);
    }
}

OneIndexDriver

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the first job: configures and runs OneIndexMapper and
 * OneIndexReducer.
 */
public class OneIndexDriver {

    /**
     * Configures the job, runs it, and exits with 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path and {@code args[1]} the
     *             output path; when fewer than two arguments are given, the
     *             original hard-coded local paths are used
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {

        // Fall back to the hard-coded paths only when none were supplied,
        // instead of always discarding the command-line arguments.
        if (args.length < 2) {
            args = new String[] { "e:/inputoneindex", "e:/output5" };
        }

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(OneIndexDriver.class);

        job.setMapperClass(OneIndexMapper.class);
        job.setReducerClass(OneIndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Report the job result through the exit code, as TwoIndexDriver does.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

第一次输出结果

victor--a.txt   3
victor--b.txt   2
victor--c.txt   2
pingping--a.txt 1
pingping--b.txt 3
pingping--c.txt 1
ss--a.txt   2
ss--b.txt   1
ss--c.txt   1

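4、第二次处理

说明:第二个 job 的输入就是第一个 job 的输出(上面的"第一次输出结果")。按本文 Driver 中的路径运行时,需要先把 e:/output5 下的结果文件(如 part-r-00000)放到 e:/inputtwoindex 中。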

TwoIndexMapper

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Second-pass mapper of the inverted-index pipeline.
 *
 * <p>Splits each input line on {@code "--"} and emits the first part as the
 * key and the second part as the value. For a line such as
 * {@code victor--a.txt<TAB>3} that yields key {@code victor} and value
 * {@code a.txt<TAB>3}.
 */
public class TwoIndexMapper extends Mapper<LongWritable, Text, Text, Text> {
    // Writable instances reused for every emitted record.
    Text k = new Text();
    Text v = new Text();

    /**
     * Emits {@code (word, "fileName<TAB>count")} for one input line.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one input line, e.g. {@code victor--a.txt<TAB>3}
     * @param context task context used to emit records
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {

        // 1. Read the line.
        String line = value.toString();

        // 2. Split into the word and the "fileName<TAB>count" remainder.
        String[] fields = line.split("--");

        // Skip lines without a "--" separator (e.g. blank lines) rather than
        // failing the task with ArrayIndexOutOfBoundsException.
        if (fields.length < 2) {
            return;
        }

        // 3. Populate key and value.
        k.set(fields[0]);
        v.set(fields[1]);

        // 4. Emit.
        context.write(k, v);
    }

}

TwoIndexReducer

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Second-pass reducer of the inverted-index pipeline: concatenates all values
 * of a key into one line, replacing each tab inside a value with {@code "-->"}
 * and appending a tab after each value.
 */
public class TwoIndexReducer extends Reducer<Text, Text, Text, Text> {

    /**
     * Builds and writes the combined value for one key.
     *
     * <p>Example: values {@code a.txt<TAB>3}, {@code b.txt<TAB>2} become
     * {@code a.txt-->3<TAB>b.txt-->2<TAB>}.
     *
     * @param key     the key emitted by the mapper
     * @param values  the values grouped under this key
     * @param context task context used to emit the result
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {

        // Local to this call, so the unsynchronized StringBuilder is enough.
        StringBuilder sb = new StringBuilder();

        for (Text value : values) {
            // The tab after each entry (including the last) is kept, matching
            // the original output format.
            sb.append(value.toString().replace("\t", "-->")).append('\t');
        }

        context.write(key, new Text(sb.toString()));
    }
}

TwoIndexDriver

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the second job: configures and runs TwoIndexMapper and
 * TwoIndexReducer.
 */
public class TwoIndexDriver {

    /**
     * Configures the job, runs it, and exits with 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path and {@code args[1]} the
     *             output path; when fewer than two arguments are given, the
     *             original hard-coded local paths are used
     * @throws Exception if the job cannot be configured or submitted
     *                   (this already covers IOException)
     */
    public static void main(String[] args) throws Exception {
        // Fall back to the hard-coded paths only when none were supplied,
        // instead of always discarding the command-line arguments.
        if (args.length < 2) {
            args = new String[] { "e:/inputtwoindex", "e:/output6" };
        }

        Configuration config = new Configuration();
        Job job = Job.getInstance(config);
        job.setJarByClass(TwoIndexDriver.class);

        job.setMapperClass(TwoIndexMapper.class);
        job.setReducerClass(TwoIndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

第二次输出结果

victor  c.txt-->2   b.txt-->2   a.txt-->3   
pingping    c.txt-->1   b.txt-->3   a.txt-->1   
ss  c.txt-->1   b.txt-->1   a.txt-->2   

5、Code -> GitHub

https://github.com/liufengji/hadoop_mapreduce.git

你可能感兴趣的:(黑猴子的家:MapReduce 倒排索引(多job串联))