<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.lihaoze</groupId>
    <artifactId>hadoop</artifactId>
    <version>1.0.0</version>
    <packaging>jar</packaging>
    <name>hadoop</name>
    <url>http://maven.apache.org</url>
    <properties>
        <jdk.version>1.8</jdk.version>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <maven.test.failure.ignore>true</maven.test.failure.ignore>
        <maven.test.skip>true</maven.test.skip>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter-api</artifactId>
            <version>5.9.2</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter-engine</artifactId>
            <version>5.9.2</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.26</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j-impl</artifactId>
            <version>2.20.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.3.5</version>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>31.1-jre</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-pool2</artifactId>
            <version>2.11.1</version>
        </dependency>
        <dependency>
            <groupId>com.janeluo</groupId>
            <artifactId>ikanalyzer</artifactId>
            <version>2012_u6</version>
        </dependency>
        <dependency>
            <groupId>com.github.binarywang</groupId>
            <artifactId>java-testdata-generator</artifactId>
            <version>1.1.2</version>
        </dependency>
    </dependencies>
    <build>
        <finalName>${project.artifactId}</finalName>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.11.0</version>
                <configuration>
                    <encoding>UTF-8</encoding>
                    <source>${jdk.version}</source>
                    <target>${jdk.version}</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-clean-plugin</artifactId>
                <version>3.2.0</version>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-resources-plugin</artifactId>
                <version>3.3.1</version>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-war-plugin</artifactId>
                <version>3.3.2</version>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.22.2</version>
                <configuration>
                    <skip>true</skip>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
package com.lihaozhe.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @author 李昊哲
 * @version 1.0
 * @Description
 * @createTime 2023/4/12 9:35 AM
 *
 * A custom Mapper must extend Hadoop's Mapper class and declare the input and
 * output data types required by the specific job.
 *
 * Input types:
 * KEYIN   - byte offset of the current line within the file, a number (LongWritable)
 * VALUEIN - one line of the input file, text (Text)
 *
 * Output types:
 * KEYOUT   - the output key, a single word (Text)
 * VALUEOUT - the output value, the marker 1 attached to each word, a number (IntWritable)
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // Read one line of input and split it into words on spaces
        String line = value.toString();
        String[] words = line.split(" ");
        // Emit (word, 1) for every word in the line
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
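To check the map logic without starting a Hadoop job, the following standalone sketch simulates the same split-and-emit behaviour on a made-up sample line; the class name and the input string are illustrative only and are not part of the project above.

package com.lihaozhe.wordcount;

import java.util.Arrays;

/**
 * Hypothetical helper: simulates WordCountMapper's split-and-emit logic locally.
 */
public class MapLogicSketch {
    public static void main(String[] args) {
        // Stand-in for one Text value passed to map()
        String line = "hello world hello hadoop";
        // Same tokenization as WordCountMapper: split on single spaces
        Arrays.stream(line.split(" "))
                // Each word is paired with the marker 1, mirroring context.write(new Text(word), new IntWritable(1))
                .forEach(word -> System.out.println("(" + word + ", 1)"));
        // Prints: (hello, 1) (world, 1) (hello, 1) (hadoop, 1), one pair per line
    }
}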
package com.lihaozhe.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @author 李昊哲
 * @version 1.0
 * @Description
 * @createTime 2023/4/12 9:46 AM
 *
 * Custom Reducer: receives a word together with all of its 1 markers and sums
 * them into that word's total count.
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        int total = 0;
        // Accumulate every 1 attached to this word
        for (IntWritable value : values) {
            total += value.get();
        }
        // Emit the word together with its total count
        context.write(key, new IntWritable(total));
    }
}
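The reduce side can be simulated the same way: the sketch below groups the hypothetical (word, 1) pairs from the previous example by word and sums the markers, which is exactly what WordCountReduce does for each key it receives after the shuffle. Again, the class name and data are illustrative only.

package com.lihaozhe.wordcount;

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/**
 * Hypothetical helper: simulates WordCountReduce's per-key summation locally.
 */
public class ReduceLogicSketch {
    public static void main(String[] args) {
        // (word, 1) pairs as the map phase would emit them
        List<String> mapped = Arrays.asList("hello", "world", "hello", "hadoop");
        Map<String, Integer> totals = new TreeMap<>();
        for (String word : mapped) {
            // Summing the markers per word mirrors the total += value.get() loop above
            totals.merge(word, 1, Integer::sum);
        }
        // Prints: {hadoop=1, hello=2, world=1}
        System.out.println(totals);
    }
}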
package com.lihaozhe.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @author 李昊哲
 * @version 1.0
 * @Description
 * @createTime 2023/4/12 9:52 AM
 */
public class WordCountDriver {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration conf = new Configuration();
        // Cross-platform submission from Windows
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // conf.addResource("core-site.xml");
        // conf.addResource("mapred-site.xml");
        // conf.addResource("yarn-site.xml");
        // conf.addResource("hdfs-site.xml");
        // Input file and result directory on HDFS
        Path src = new Path("/wordcount/input/wcdata.txt");
        Path dst = new Path("/wordcount/result");
        FileSystem fs = FileSystem.get(conf);
        // Delete the output directory first if it already exists
        if (fs.exists(dst)) {
            fs.delete(dst, true);
        }
        // Create the Job object
        Job job = Job.getInstance(conf, "词频统计");
        // Set the driver class for this Job
        // (comment this line out when submitting from the local IDE)
        job.setJarByClass(WordCountDriver.class);
        // (enable this line when submitting from the local IDE)
        // job.setJar("D:\\dev\\java\\code\\hadoop\\target\\hadoop.jar");
        // Set the Mapper and Reducer for this Job
        job.setMapperClass(WordCountMapper.class);
        // job.setCombinerClass(WordCountReduce.class);
        job.setReducerClass(WordCountReduce.class);
        // Set the key and value types of the map-side output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // FileInputFormat.addInputPath(job, new Path(args[0]));
        // FileOutputFormat.setOutputPath(job, new Path(args[1]));
        FileInputFormat.setInputPaths(job, src);
        FileOutputFormat.setOutputPath(job, dst);
        // Submit the Job and wait for it to finish
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
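Once the job finishes, the counts land in the output directory the driver cleared and set, typically as a single part-r-00000 file because the default reducer count is 1. A minimal sketch for reading that result back with the same FileSystem API follows; the class name is hypothetical and the paths assume the configuration used by WordCountDriver.

package com.lihaozhe.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

import java.io.IOException;

/**
 * Hypothetical helper: prints the word-count result written by WordCountDriver.
 */
public class WordCountResultReader {
    public static void main(String[] args) throws IOException {
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration conf = new Configuration();
        try (FileSystem fs = FileSystem.get(conf)) {
            // Single output file produced with the default of one reducer
            Path result = new Path("/wordcount/result/part-r-00000");
            // Stream the tab-separated "word<TAB>count" lines to stdout
            IOUtils.copyBytes(fs.open(result), System.out, 4096, false);
        }
    }
}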