MapReduce

The eight steps of MapReduce

HDFS input file ------> read the file ------> InputFormat: TextInputFormat ------> what is read out is <k1, v1> ------>
------> custom Map logic converts <k1, v1> into <k2, v2>: extend Mapper and override map() ------>
------> Shuffle stage (partition, sort, combine, group) ------> new <k2, v2> ------> custom Reduce logic converts the new <k2, v2> into <k3, v3>: extend Reducer and override reduce() ------> write <k3, v3> to the HDFS output file

Take WordCount as an example.


Map

package cn.itclass.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;


/*
Four generic type parameters:
Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
KEYIN,  VALUEIN  -> types of <k1, v1>
KEYOUT, VALUEOUT -> types of <k2, v2>
 */
public class WordCountMapper extends Mapper<LongWritable, Text,Text,LongWritable> {

    //the map method converts <k1, v1> into <k2, v2>
    /*
    Parameters:
    key:     k1, the byte offset of the line (rarely useful)
    value:   v1, the text of the line
    context: the context object that connects the stages of the job
     */
    /*
    How <k1, v1> becomes <k2, v2>:
    k1     v1
    0   hello,world,hadoop
    15  hdfs,hive,hello
    -----------------------
    k2     v2
    hello   1
    world   1
    hadoop  1
    hdfs    1
    hive    1
    hello   1
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
         //1: split the line of text on commas into an array
        String[] split=value.toString().split(",");
        //2: iterate over the array and assemble <k2, v2>
        Text text=new Text();
        LongWritable longWritable=new LongWritable();
        for (String word : split) {
            //3: write each <k2, v2> to the context via context.write()
            text.set(word);
            longWritable.set(1);
            context.write(text,longWritable);
        }
    }

}

Reduce

package cn.itclass.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/*
Four generic type parameters:
Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
KEYIN    : type of K2
VALUEIN  : type of V2

KEYOUT   : type of K3
VALUEOUT : type of V3
 */

public class WordCountReducer extends Reducer<Text, LongWritable,Text,LongWritable>{
    //the reduce method converts the new K2 and V2 into K3 and V3 and writes K3 and V3 to the context
    /*
    Parameters:
    key:     the new K2
    values:  the collection of new V2 values
    context: the context object
    ----------------------
    How the new K2 and V2 become K3 and V3:
    new K2         V2
      hello      <1,1,1>
      world      <1,1>
      hadoop     <1>
    ----------------------
       K3        V3
       hello     3
       world     2
       hadoop    1
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        long count=0;
         //1: iterate over the collection and sum its values to get V3
        for (LongWritable value : values) {
            count+=value.get();
        }
        //2: write K3 and V3 to the context via context.write()
        context.write(key,new LongWritable(count));
    }
}

main

package cn.itclass.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.net.URI;

public class JobMain extends Configured implements Tool {
    //the run() method defines and configures a single job
    @Override
    public int run(String[] strings) throws Exception {
        //1: create the Job object
        Job job = Job.getInstance(super.getConf(),"wordcount");
        job.setJarByClass(JobMain.class);//prevents class-not-found errors when the job is packaged as a jar and run on the cluster
        //super.getConf() returns the Configuration object created in main()
        //2: configure the job (the eight steps)
        //step 1: set how the source file is read and where it is read from
        job.setInputFormatClass(TextInputFormat.class);//the input format class
        //the input path only needs to point at the directory; every file inside it is read
        TextInputFormat.addInputPath(job,new Path("hdfs://192.168.75.9:8020/wordcount"));
        //step 2: set the Map-phase class and its output data types
        job.setMapperClass(WordCountMapper.class);//the custom Map-phase logic
        //set the type of K2
        job.setMapOutputKeyClass(Text.class);
        //set the type of V2
        job.setMapOutputValueClass(LongWritable.class);
        //steps 3, 4, 5 and 6 (partition, sort, combine, group) use the defaults
        //step 7: set the Reduce-phase class and its output data types
        job.setReducerClass(WordCountReducer.class);
        //set the type of K3
        job.setOutputKeyClass(Text.class);
        //set the type of V3
        job.setOutputValueClass(LongWritable.class);
        //step 8: set the output format
        job.setOutputFormatClass(TextOutputFormat.class);
        //set the output path
        Path path =new Path("hdfs://192.168.75.9:8020/wordcount_out");
        TextOutputFormat.setOutputPath(job,path);
        //get the FileSystem for the cluster
        FileSystem fileSystem=FileSystem.get(new URI("hdfs://192.168.75.9:8020"),new Configuration());
        //check whether the output directory already exists
        boolean bl2= fileSystem.exists(path);
        if(bl2){
            //delete it so the job does not fail on an existing output directory
            fileSystem.delete(path,true);
        }
        //wait for the job to finish
        boolean bl=job.waitForCompletion(true);
        //return 0 if the job succeeded, otherwise a non-zero value
        return bl?0:1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration=new Configuration();
        //launch the job
        int run = ToolRunner.run(configuration,new JobMain(),args);
        //the return value records the job status: 0 means success, anything else means failure
        System.exit(run);//exit with that status
    }
}
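In the driver above, steps 3 to 6 (partition, sort, combine, group) are left at their defaults. For WordCount the combine step is a cheap optimization: summing counts is associative and commutative, so the existing WordCountReducer can double as a combiner and pre-aggregate each map task's output before the shuffle. A minimal sketch, assuming the line is added inside run() anywhere before waitForCompletion():

        //step 5 (combine): reuse the reducer to pre-aggregate map output locally
        job.setCombinerClass(WordCountReducer.class);

With the combiner in place, a map task that emits hello three times ships a single (hello, 3) record across the network instead of three (hello, 1) records.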

JobMain template

public class JobMain extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        //1: create the Job object
        Job job = Job.getInstance(super.getConf(), "job name");
        //2: configure the job (the eight steps)
        //step 1: set the input format and input path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://"));
        //step 2: set the Mapper class and its output types
        job.setMapperClass(xxx.class);//the class that extends Mapper
        job.setMapOutputKeyClass(xxx.class);
        job.setMapOutputValueClass(xxx.class);
        //step 3: partition
        //step 4: sort
        //step 5: combine
        //step 6: group
        //step 7: set the Reducer class and its output types
        job.setReducerClass(xxx.class);
        job.setOutputKeyClass(xxx.class);
        job.setOutputValueClass(xxx.class);
        //step 8: set the output format and output path
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("hdfs://"));
        //3: wait for the job to finish
        boolean bl = job.waitForCompletion(true);
        return bl ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        //launch the job
        int run = ToolRunner.run(configuration, new JobMain(), args);
        System.exit(run);
    }
}
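The placeholders for steps 3 to 6 can also be filled in explicitly when the defaults are not enough. As an illustration only (the class name and the routing rule below are invented for this example, not part of the original code), a custom partitioner for the WordCount key/value types could send words to different reducers by their first letter:

package cn.itclass.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

//example partitioner for step 3: words starting with a-m go to reducer 0, the rest to reducer 1
public class WordCountPartitioner extends Partitioner<Text, LongWritable> {
    @Override
    public int getPartition(Text key, LongWritable value, int numPartitions) {
        String word = key.toString();
        int partition = (!word.isEmpty()
                && Character.toLowerCase(word.charAt(0)) <= 'm') ? 0 : 1;
        return partition % numPartitions;//never exceed the configured number of reducers
    }
}

It would be registered in run() together with a matching number of reduce tasks:

        //step 3 (partition): route <k2, v2> pairs to reducers by the first letter of the word
        job.setPartitionerClass(WordCountPartitioner.class);
        job.setNumReduceTasks(2);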

Boss: experiment

String Similarity Join
Input: S, R, θ (edit-distance threshold, ed)
Output: all pairs (s, r) with ed(s, r) ≤ θ

<k1, v1>: LongWritable, Text
<k2, v2>: Text, LongWritable
<k3, v3>: Text, Text
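The solution itself is not worked out here. Purely as a starting point, below is a brute-force sketch built on assumptions that are not part of the assignment: each input line is tagged with the set it belongs to ("S" or "R", a tab, then the string), the threshold θ is hard-coded, the shuffle types are Text/Text rather than the type plan above, and every record is sent to a single reducer, which compares each s with each r using the classic dynamic-programming edit distance.

package cn.itclass.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

//Brute-force similarity-join sketch: every record goes to a single reducer,
//which compares each s in S with each r in R and keeps pairs with ed(s, r) <= THETA.
public class SimilarityJoin {

    //assumed input line format: "S<TAB>string" or "R<TAB>string"
    public static class JoinMapper extends Mapper<LongWritable, Text, Text, Text> {
        private static final Text GROUP_KEY = new Text("all");

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            //the value already carries its "S"/"R" tag, so just forward it under one key
            context.write(GROUP_KEY, value);
        }
    }

    public static class JoinReducer extends Reducer<Text, Text, Text, Text> {
        private static final int THETA = 2;//assumed threshold; could be read from the Configuration instead

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            List<String> s = new ArrayList<>();
            List<String> r = new ArrayList<>();
            //1: separate the records of S and R by their tag
            for (Text value : values) {
                String[] parts = value.toString().split("\t", 2);
                if (parts.length < 2) {
                    continue;//skip malformed lines
                }
                if ("S".equals(parts[0])) {
                    s.add(parts[1]);
                } else {
                    r.add(parts[1]);
                }
            }
            //2: compare every pair and emit the ones within the threshold
            for (String a : s) {
                for (String b : r) {
                    if (editDistance(a, b) <= THETA) {
                        context.write(new Text(a), new Text(b));
                    }
                }
            }
        }

        //classic dynamic-programming edit distance (insert, delete, substitute each cost 1)
        private static int editDistance(String a, String b) {
            int[][] dp = new int[a.length() + 1][b.length() + 1];
            for (int i = 0; i <= a.length(); i++) {
                dp[i][0] = i;
            }
            for (int j = 0; j <= b.length(); j++) {
                dp[0][j] = j;
            }
            for (int i = 1; i <= a.length(); i++) {
                for (int j = 1; j <= b.length(); j++) {
                    int cost = a.charAt(i - 1) == b.charAt(j - 1) ? 0 : 1;
                    dp[i][j] = Math.min(Math.min(dp[i - 1][j] + 1, dp[i][j - 1] + 1),
                            dp[i - 1][j - 1] + cost);
                }
            }
            return dp[a.length()][b.length()];
        }
    }
}

This obviously does not scale: a single reduce call holds both sets in memory and performs |S|·|R| comparisons. A real implementation would replace GROUP_KEY with a signature scheme (length buckets, prefixes or q-grams) so that only candidate pairs meet in the same reduce call, and would pass θ through the Configuration.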
