需求:统计给定文本文件中每一个单词出现的次数,并输出结果。
输入 Map阶段 中间结果 Reduce阶段 输出
Java Java Java Java ... > Assembly 2
PHP PHP PHP PHP PHP ... > Java 4
Python Python Python ... > PHP 5
Assembly Assembly ... > Python 3
SQL SQL SQL SQL ... > SQL 4
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
//KEYIN 输入数据的key类型
//VALUEIN 输入数据的value类型
//KEYOUT 输出数据的key类型
//VALUEOUT 输出数据的value类型
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
Text k = new Text();
IntWritable v = new IntWritable(1);
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
String string = value.toString(); //获取一行数据
String[] words = string.split(" "); //获取一行的每个单词
for (String word : words) {
k.set(word);
context.write(k, v);
}
}
}
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
IntWritable result = new IntWritable();
protected void reduce(Text key, Iterable<IntWritable> values,
Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable value : values) {
sum += value.get();
}
result.set(sum);
context.write(key, result);
}
}
Driver分为7个步骤:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Entry point that configures and submits the word-count MapReduce job.
 *
 * <p>Usage: {@code WordCountDriver <input path> <output path>}
 */
public class WordCountDriver {
    /**
     * Builds the job from {@link WordCountMapper} and {@link WordCountReducer},
     * runs it, and exits with status 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output
     *             path; any further arguments are ignored
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a class required by the job cannot be found
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Fail with a clear message instead of an ArrayIndexOutOfBoundsException
        // when the paths are missing.
        if (args.length < 2) {
            System.err.println("Usage: WordCountDriver <input path> <output path>");
            System.exit(2);
            return;
        }

        Configuration conf = new Configuration();
        // 1. Obtain the job object.
        Job job = Job.getInstance(conf);
        // 2. Set the jar location via the driver class.
        job.setJarByClass(WordCountDriver.class);
        // 3. Associate the Mapper and Reducer classes.
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // 4. Set the key and value types of the Mapper output.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Set the key and value types of the final output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the input and output paths.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 7. Submit the job and wait for it to finish.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
序列化指把内存中的对象转换成字节序列以便存储到磁盘(持久化)和网络传输。
反序列化指把收到的字节序列或磁盘上的持久化数据转换为内存中的对象。
Java的序列化(Serializable)是一个重量级序列化框架,一个对象被序列化后,会附带很多额外的信息(校验信息、Header、继承体系等),不便于在网络中高效传输。所以,Hadoop自己开发了一套序列化机制(Writable)。
常用数据类型对应的Hadoop数据序列化类型
Java类型 | Hadoop Writable类型 |
---|---|
boolean | BooleanWritable |
byte | ByteWritable |
int | IntWritable |
float | FloatWritable |
long | LongWritable |
double | DoubleWritable |
String | Text |
Map | MapWritable |
array | ArrayWritable |
package beanwritable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class Student implements Writable, Comparable<Student> {
private int chinese;
private int math;
private int english;
private int sum;
public Student() {}
public Student(int chinese, int math, int english) {
this.chinese = chinese;
this.math = math;
this.english = english;
this.sum = chinese + math + english;
}
public int getChinese() { return chinese; }
public void setChinese(int chinese) { this.chinese = chinese; }
public int getMath() { return math; }
public void setMath(int math) { this.math = math; }
public int getEnglish() { return english; }
public void setEnglish(int english) { this.english = english; }
public int getSum() { return sum; }
public void setSum(int sum) { this.sum = sum; }
public void setGrade(int chinese, int math, int english) {
this.chinese = chinese;
this.math = math;
this.english = english;
this.sum = chinese + math + english;
}
public String toString() {
return "chinese=" + chinese + "\tmath=" + math + "\tenglish=" + english;
}
//序列化方法
public void write(DataOutput out) throws IOException {
out.writeInt(chinese);
out.writeInt(math);
out.writeInt(english);
out.writeInt(sum);
}
//反序列化方法:必须与序列化方法顺序一致
public void readFields(DataInput in) throws IOException {
chinese = in.readInt();
math = in.readInt();
english = in.readInt();
sum = in.readInt();
}
public int compareTo(Student o) {
return this.sum - o.sum;
}
}