MapReduce word count (WordCount) example code

pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.lihaoze</groupId>
  <artifactId>hadoop</artifactId>
  <version>1.0.0</version>
  <packaging>jar</packaging>

  <name>hadoop</name>
  <url>http://maven.apache.org</url>

  <properties>
    <jdk.version>1.8</jdk.version>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    <maven.test.failure.ignore>true</maven.test.failure.ignore>
    <maven.test.skip>true</maven.test.skip>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.junit.jupiter</groupId>
      <artifactId>junit-jupiter-api</artifactId>
      <version>5.9.2</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.junit.jupiter</groupId>
      <artifactId>junit-jupiter-engine</artifactId>
      <version>5.9.2</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.projectlombok</groupId>
      <artifactId>lombok</artifactId>
      <version>1.18.26</version>
    </dependency>
    <dependency>
      <groupId>org.apache.logging.log4j</groupId>
      <artifactId>log4j-slf4j-impl</artifactId>
      <version>2.20.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>3.3.5</version>
    </dependency>
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <version>31.1-jre</version>
    </dependency>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-pool2</artifactId>
      <version>2.11.1</version>
    </dependency>
    <dependency>
      <groupId>com.janeluo</groupId>
      <artifactId>ikanalyzer</artifactId>
      <version>2012_u6</version>
    </dependency>
    <dependency>
      <groupId>com.github.binarywang</groupId>
      <artifactId>java-testdata-generator</artifactId>
      <version>1.1.2</version>
    </dependency>
  </dependencies>

  <build>
    <finalName>${project.artifactId}</finalName>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.11.0</version>
        <configuration>
          <encoding>UTF-8</encoding>
          <source>${jdk.version}</source>
          <target>${jdk.version}</target>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-clean-plugin</artifactId>
        <version>3.2.0</version>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-resources-plugin</artifactId>
        <version>3.3.1</version>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-war-plugin</artifactId>
        <version>3.3.2</version>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-surefire-plugin</artifactId>
        <version>2.22.2</version>
        <configuration>
          <skip>true</skip>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
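
With these dependencies in place, a quick sanity check (a minimal sketch, not part of the original post; the class name CheckHadoop is made up) is to print the Hadoop client version found on the classpath. org.apache.hadoop.util.VersionInfo is part of the Hadoop common libraries pulled in by hadoop-client.

package com.lihaozhe.wordcount;

import org.apache.hadoop.util.VersionInfo;

// Illustrative sanity check: verify that the Hadoop client libraries resolved.
public class CheckHadoop {
    public static void main(String[] args) {
        // Should print 3.3.5 with the dependency declared above.
        System.out.println("Hadoop version: " + VersionInfo.getVersion());
    }
}

Because finalName is set to ${project.artifactId} and the packaging is jar, mvn package produces target/hadoop.jar, which is the jar path referenced later in the driver's commented-out job.setJar(...) line.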

Mapper class WordCountMapper

package com.lihaozhe.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @author 李昊哲
 * @version 1.0
 * @Description
 * @createTime 2023/4/12 9:35 AM
 *
 * A custom Mapper class must extend Hadoop's Mapper and declare the input and
 * output data types required by the specific business logic.
 *
 * Input types:
 *   KEYIN   - the offset of the current line in the input file, a number (LongWritable)
 *   VALUEIN - one line read from the input file, text (Text)
 *
 * Output types:
 *   KEYOUT   - the output key, a single word (Text)
 *   VALUEOUT - the output value, the marker 1 assigned to each word (IntWritable)
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // Split the line on spaces and emit (word, 1) for every token.
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
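
A common refinement (a minimal sketch, not part of the original post) is to reuse the output Text and IntWritable objects instead of allocating new ones for every word. The framework serializes the key/value pair as soon as context.write returns, so mutating the same objects on the next iteration is safe and avoids unnecessary garbage collection on large inputs.

package com.lihaozhe.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

// Variant of WordCountMapper that reuses the output objects.
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reused output key and the constant count 1.
    private final Text outKey = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Hadoop serializes the pair on write, so the same objects can be
        // mutated and written again for the next word.
        for (String word : value.toString().split(" ")) {
            outKey.set(word);
            context.write(outKey, one);
        }
    }
}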

Reducer class WordCountReduce

package com.lihaozhe.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @author 李昊哲
 * @version 1.0
 * @Description
 * @createTime 2023/4/12 9:46 AM
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable value : values) {
            // add up the counts for this key, then write out the total
            total += value.get();
        }
        context.write(new Text(key), new IntWritable(total));
    }
}
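
Because the reduce operation here is plain addition, which is associative and commutative, the same class can also be registered as a combiner to pre-aggregate counts on the map side before the shuffle; the driver below hints at this with its commented-out job.setCombinerClass(WordCountReduce.class) line. As a plain-Java illustration of what a single reduce call does (a sketch, not part of the job; the class name ReduceLogicDemo is made up): for the key "hello" with the grouped values [1, 1, 1], the reducer emits ("hello", 3).

package com.lihaozhe.wordcount;

import org.apache.hadoop.io.IntWritable;

import java.util.Arrays;
import java.util.List;

// Illustrative only: mirrors the summation inside WordCountReduce.reduce().
public class ReduceLogicDemo {
    public static void main(String[] args) {
        // The shuffle groups all counts emitted for one word.
        List<IntWritable> values = Arrays.asList(
                new IntWritable(1), new IntWritable(1), new IntWritable(1));
        int total = 0;
        for (IntWritable value : values) {
            total += value.get();
        }
        System.out.println("hello\t" + total); // prints: hello	3
    }
}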

Driver class WordCountDriver

package com.lihaozhe.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @author 李昊哲
 * @version 1.0
 * @Description
 * @createTime 2023/4/12 9:52 AM
 */
public class WordCountDriver {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration conf = new Configuration();
        // enable cross-platform submission (needed when submitting from Windows)
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // conf.addResource("core-site.xml");
        // conf.addResource("mapred-site.xml");
        // conf.addResource("yarn-site.xml");
        // conf.addResource("hdfs-site.xml");
        // input file and output directory on HDFS
        Path src = new Path("/wordcount/input/wcdata.txt");
        Path dst = new Path("/wordcount/result");
        FileSystem fs = FileSystem.get(conf);
        // delete the output directory first if it already exists
        if (fs.exists(dst)) {
            fs.delete(dst, true);
        }
        // create the Job object
        Job job = Job.getInstance(conf, "词频统计");
        // set the driver class for this job
        // comment out this line when submitting from the local machine
        job.setJarByClass(WordCountDriver.class);
        // enable this line when submitting from the local machine
        // job.setJar("D:\\dev\\java\\code\\hadoop\\target\\hadoop.jar");
        // set the Mapper and Reducer for this job
        job.setMapperClass(WordCountMapper.class);
        // job.setCombinerClass(WordCountReduce.class);
        job.setReducerClass(WordCountReduce.class);
        // set the key and value types of the map-side output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // FileInputFormat.addInputPath(job, new Path(args[0]));
        // FileOutputFormat.setOutputPath(job, new Path(args[1]));
        FileInputFormat.setInputPaths(job, src);
        FileOutputFormat.setOutputPath(job, dst);
        // submit the job and exit with its completion status
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
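
The commented-out args[0] / args[1] lines suggest a more flexible variant that reads the input and output paths from the command line. Below is a hedged sketch using Hadoop's standard Tool / ToolRunner pattern (the class name WordCountTool is illustrative, not part of the original project); it also sets the final reducer output key/value classes explicitly, which the original driver leaves at their defaults.

package com.lihaozhe.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Illustrative driver variant: the paths come from the command line.
public class WordCountTool extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Path src = new Path(args[0]);
        Path dst = new Path(args[1]);

        // Remove a stale output directory so the job can be rerun.
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(dst)) {
            fs.delete(dst, true);
        }

        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountTool.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReduce.class);

        // Map-side output types (needed for the shuffle)...
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // ...and the final reducer output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, src);
        FileOutputFormat.setOutputPath(job, dst);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner parses generic options (-D, -conf, ...) before calling run().
        System.exit(ToolRunner.run(new Configuration(), new WordCountTool(), args));
    }
}

After mvn package, this variant could be submitted with hadoop jar target/hadoop.jar com.lihaozhe.wordcount.WordCountTool followed by the input path and output path.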
