Hadoop: An Illustrated Look at MapReduce with a WordCount Example

Implementing WordCount with MapReduce

[Figure 1: MapReduce data flow for WordCount — splitting, map, combine, shuffle, and reduce]
1. Splitting: the input documents are divided into splits according to the configured split rules.
2. Map phase: the map tasks read the splits in parallel, break each line into words, and emit every word as a key-value pair. For example, reading "Hello World Hello Java" produces the pairs (Hello, 1), (World, 1), (Hello, 1), (Java, 1); a full walkthrough of this example is shown right after this list.
3. Combine phase: an optional phase (the combiner is essentially a local reduce) that aggregates the counts of identical words within each split before the data leaves the map side.
4. Shuffle phase: the process of handing map output to the reducers. This is the most time-consuming phase and the primary target for tuning; it pulls the map output, groups values by key, and the reducer that receives each word is chosen by hashing the key.
5. Reduce phase: performs the final aggregation; the result is written to HDFS.
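
To make the stages concrete, here is the path a single input line takes through the pipeline. The combine line assumes a combiner is actually registered on the job (see the note after the driver code), and the partitioning rule shown is Hadoop's default HashPartitioner:

Input split:     "Hello World Hello Java"
Map output:      (Hello, 1) (World, 1) (Hello, 1) (Java, 1)
Combine output:  (Hello, 2) (World, 1) (Java, 1)
Shuffle/sort:    (Hello, [2]) (Java, [1]) (World, [1]), each key routed to the reducer given by (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks
Reduce output:   Hello 2, Java 1, World 1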

Project structure

[Figure 2: project structure in the IDE]

Java code:

pom.xml

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>mymr</groupId>
  <artifactId>mymr</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>mymr</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.6.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.6.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.6.0</version>
    </dependency>


  </dependencies>
  <build>
    <plugins>
      <plugin>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.3.2</version>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
        </configuration>
      </plugin>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
          <archive>
            <manifest>
              <mainClass>mymr.service.wc.MyDriver</mainClass>
            </manifest>
          </archive>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
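
With the assembly plugin configured this way, running mvn package should produce a fat jar (roughly target/mymr-1.0-SNAPSHOT-jar-with-dependencies.jar) whose manifest main class is mymr.service.wc.MyDriver, so the job can be submitted with hadoop jar on a cluster, or simply run from the IDE, in which case the paths used in MyDriver below refer to the local filesystem rather than HDFS.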

MyMapper

package mymr.service.wc;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
// Mapper: input key/value are the byte offset and the line of text,
// output key/value are the word and its count (always 1 here)
public class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    private final LongWritable one = new LongWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line on spaces and emit (word, 1) for each token
        String[] wds = value.toString().split(" ");
        for (String wd : wds) {
            word.set(wd); // reuse the Text instance instead of allocating a new one per token
            context.write(word, one);
        }
    }
}

MyReduce

package mymr.service.wc;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class MyReduce  extends Reducer<Text, LongWritable,Text,LongWritable> {

    private LongWritable res = new LongWritable();


    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // Sum all the counts emitted for this word (1s from the mappers,
        // or partial sums if a combiner ran)
        long ressum = 0;
        for (LongWritable one : values) {
            ressum += one.get();
        }
        res.set(ressum);
        context.write(key, res);
    }
}
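
Because this reducer only sums its input values, and summation is associative and commutative, the same class can also serve as the combiner mentioned in the Combine phase above; a sketch of wiring it into the job follows the driver code.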

MyDriver

package mymr.service.wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class MyDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Create an empty job and give it a name
        Job job = Job.getInstance(conf, "wc");
        // Set the main class of the job jar
        job.setJarByClass(MyDriver.class);
        // Set the input data source of the job
        FileInputFormat.addInputPath(job, new Path("f://abc.txt"));
        // Set the mapper class
        job.setMapperClass(MyMapper.class);
        // Set the output key/value types of the mapper
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Set the reducer class
        job.setReducerClass(MyReduce.class);
        // Set the output key/value types of the reducer
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Set the output directory of the job (it must not already exist)
        FileOutputFormat.setOutputPath(job, new Path("f://abc1"));
        // Submit the job and wait for it to finish
        job.waitForCompletion(true);
    }
}
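
The driver above never registers a combiner, so the Combine phase described at the top of the article will not actually run. As a minimal sketch of enabling it (safe here because MyReduce only sums values), add one line alongside the other job settings:

// Optional: run MyReduce as a map-side combiner to pre-aggregate counts
// before the shuffle and cut down the data sent to the reducers
job.setCombinerClass(MyReduce.class);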
