Table of Contents
Preface:
Add the Hadoop and HDFS dependencies to the Spring Boot project. You can use Apache Hadoop's Java API directly, or use Spring for Apache Hadoop to simplify the work.
Configure the connection information for Hadoop and HDFS, including the Hadoop configuration files and the HDFS connection address.
Use the Hadoop API to query and add data. For example, use the HDFS FileSystem API to read and write files, and use MapReduce to process data.
Processing data with MapReduce:
Create a MapReduce job that counts word occurrences in text files stored on HDFS
Expose the MapReduce job through an API
Test via the main method
API request:
This is just a note.
Pick the dependencies that fit your situation.
<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>3.3.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>3.3.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>3.3.1</version>
    </dependency>
    <dependency>
        <groupId>org.springframework.data</groupId>
        <artifactId>spring-data-hadoop</artifactId>
        <version>2.5.0.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>3.3.1</version>
    </dependency>
</dependencies>
# Hadoop configuration
spring.hadoop.config.fs.defaultFS=hdfs://localhost:9000
spring.hadoop.config.dfs.replication=1
spring.hadoop.config.dfs.blocksize=128m
spring.hadoop.config.dfs.client.use.datanode.hostname=true
spring.hadoop.config.dfs.client.read.shortcircuit=true
spring.hadoop.config.dfs.domain.socket.path=/var/run/hadoop-hdfs/dn._PORT
# HDFS configuration
spring.hadoop.fsUri=hdfs://localhost:9000
spring.hadoop.fsUser=root
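Note that plain Hadoop only reads core-site.xml / hdfs-site.xml from the classpath; the spring.hadoop.* entries above are applied automatically only if Spring for Apache Hadoop is on the classpath. If you skip that library, one option is to copy the values into Hadoop's Configuration yourself. Below is a minimal sketch of that idea; HadoopConfig is a name chosen for this note, and this bean would replace the hard-coded configuration() bean shown in the Application class further down.

import org.springframework.context.annotation.Bean;
import org.springframework.core.env.Environment;

// Sketch: copy selected spring.hadoop.config.* entries from application.properties
// into Hadoop's own Configuration object.
@org.springframework.context.annotation.Configuration
public class HadoopConfig {

    @Bean
    public org.apache.hadoop.conf.Configuration hadoopConfiguration(Environment env) {
        org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
        conf.set("fs.defaultFS",
                env.getProperty("spring.hadoop.config.fs.defaultFS", "hdfs://localhost:9000"));
        conf.set("dfs.replication",
                env.getProperty("spring.hadoop.config.dfs.replication", "1"));
        conf.set("dfs.blocksize",
                env.getProperty("spring.hadoop.config.dfs.blocksize", "128m"));
        return conf;
    }
}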
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.commons.io.IOUtils; // Apache Commons IO (commons-io)
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;

@RestController
@RequestMapping("/hdfs")
public class HdfsController {

    @Autowired
    private FileSystem fileSystem;

    // Read a file from HDFS and return its content as the response body.
    @GetMapping("/read/{path}")
    public ResponseEntity<String> read(@PathVariable String path) throws IOException {
        Path filePath = new Path(path);
        if (!fileSystem.exists(filePath)) {
            return ResponseEntity.notFound().build();
        }
        // try-with-resources closes the stream even if reading fails
        try (FSDataInputStream inputStream = fileSystem.open(filePath)) {
            String content = IOUtils.toString(inputStream, StandardCharsets.UTF_8);
            return ResponseEntity.ok(content);
        }
    }

    // Write the request body to a new file on HDFS; existing files are not overwritten.
    @PostMapping("/write/{path}")
    public ResponseEntity<Void> write(@PathVariable String path, @RequestBody String content) throws IOException {
        Path filePath = new Path(path);
        if (fileSystem.exists(filePath)) {
            return ResponseEntity.badRequest().build();
        }
        try (FSDataOutputStream outputStream = fileSystem.create(filePath)) {
            IOUtils.write(content, outputStream, StandardCharsets.UTF_8);
        }
        return ResponseEntity.ok().build();
    }
}
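One caveat: a {path} path variable cannot contain "/" segments, so nested HDFS paths such as /user/data/test.txt will not match these mappings. A hedged alternative (the readByParam method below is an illustrative variant, not part of the original controller) is to take the HDFS path as a query parameter instead:

    // Sketch: accept nested HDFS paths via a query parameter,
    // since a {path} variable cannot contain '/' segments.
    @GetMapping("/read")
    public ResponseEntity<String> readByParam(@RequestParam String path) throws IOException {
        Path filePath = new Path(path);
        if (!fileSystem.exists(filePath)) {
            return ResponseEntity.notFound().build();
        }
        try (FSDataInputStream in = fileSystem.open(filePath)) {
            return ResponseEntity.ok(IOUtils.toString(in, StandardCharsets.UTF_8));
        }
    }

It would be called as: curl "http://localhost:8080/hdfs/read?path=/user/data/test.txt"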
Use curl or another HTTP client (Postman also works) to send GET and POST requests to test the API:
# Read a file
curl http://localhost:8080/hdfs/read/test.txt
# Write a file
curl -X POST -H "Content-Type: text/plain" -d "Hello, HDFS!" http://localhost:8080/hdfs/write/test.txt
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCount {

    // Mapper: emits (word, 1) for every token in each input line.
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, one);
            }
        }
    }

    // Reducer (also used as combiner): sums the counts for each word.
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
}
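The same job can also be driven from a plain main method, which is handy for a quick test without starting the Spring Boot application. A minimal sketch, assuming a local single-node HDFS and the input/output paths passed as program arguments (WordCountDriver is a name chosen here):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Sketch: standalone driver for the WordCount job.
public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000"); // assumption: local single-node HDFS
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCount.WordCountMapper.class);
        job.setCombinerClass(WordCount.WordCountReducer.class);
        job.setReducerClass(WordCount.WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input directory
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output directory (must not exist yet)
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}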
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;

// Same HdfsController as above, extended with a wordcount endpoint.
@RestController
@RequestMapping("/hdfs")
public class HdfsController {

    @Autowired
    private FileSystem fileSystem;

    @Autowired
    private Configuration configuration;

    // Submit the WordCount job and block until it finishes.
    @PostMapping("/wordcount")
    public ResponseEntity<Void> wordCount(@RequestParam String inputPath, @RequestParam String outputPath) throws Exception {
        Job job = Job.getInstance(configuration, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCount.WordCountMapper.class);
        job.setCombinerClass(WordCount.WordCountReducer.class);
        job.setReducerClass(WordCount.WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        boolean success = job.waitForCompletion(true);
        return success ? ResponseEntity.ok().build() : ResponseEntity.badRequest().build();
    }
}
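Keep in mind that MapReduce refuses to start if the output directory already exists. For a throwaway test endpoint it can be convenient (but destructive) to delete it first; a hedged sketch of a helper you could add to this controller and call as deleteIfExists(fileSystem, outputPath) before FileOutputFormat.setOutputPath(...):

    // Sketch: clear an existing output directory before submitting the job.
    // Only for experiments - this permanently removes whatever is at outputPath.
    private void deleteIfExists(FileSystem fs, String outputPath) throws IOException {
        Path output = new Path(outputPath);
        if (fs.exists(output)) {
            fs.delete(output, true); // true = recursive delete
        }
    }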
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.annotation.Bean;

@SpringBootApplication
public class Application {

    public static void main(String[] args) {
        SpringApplication.run(Application.class, args);
    }

    // A single Hadoop Configuration shared by the FileSystem bean and the
    // MapReduce job, so both point at the same NameNode.
    @Bean
    public Configuration configuration() {
        Configuration configuration = new Configuration();
        configuration.set("fs.defaultFS", "hdfs://localhost:9000");
        return configuration;
    }

    @Bean
    public FileSystem fileSystem(Configuration configuration) throws IOException {
        return FileSystem.get(configuration);
    }
}
API request (quote the URL so the shell does not interpret the &):
curl -X POST "http://localhost:8080/hdfs/wordcount?inputPath=/user/input&outputPath=/user/output"
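The reducer output lands in part-r-* files under the output directory; assuming the default single reducer, it can be inspected with the HDFS CLI:
hdfs dfs -cat /user/output/part-r-00000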
Finally, pay attention to Hadoop and HDFS security and performance: for example data encryption and compression, and how data is split and processed in parallel. Hadoop's built-in security and performance-tuning features can be used to improve the stability and efficiency of the system.
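As one concrete example of the compression point above, both the map output (shuffle traffic) and the final job output can be compressed through the job configuration. A minimal sketch using the built-in Gzip codec; whether it actually helps depends on your data and cluster, and the helper class name is chosen for this note:

import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Sketch: enable compression for a MapReduce job such as WordCount.
public class CompressionSettings {
    public static void apply(Job job) {
        // Compress map output to cut shuffle traffic between mappers and reducers.
        job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
        job.getConfiguration().setClass("mapreduce.map.output.compress.codec",
                GzipCodec.class, CompressionCodec.class);
        // Compress the final reducer output files written to HDFS.
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
}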