MapReduce:
**
----MapReduce常用数据类型----------------
ByteWritable:单字节数值
IntWritable:整型数
LongWritable:长整型数
FloatWritable:浮点数
DoubleWritable:双精度浮点数
BooleanWritable:标准布尔型数值
Text:使用UTF8格式存储的文本
NullWritable:当<key,value>中的key或value为空时使用
** IntWritable LongWritable Text NullWritable DoubleWritable较为常用
====MapReduce自定义数据类型===================================
** key实现WritableComparable
** value通常只需要实现Writable
案例1:手机上网流量统计
数据源:
HTTP_20130313143750
每个电话号码每上一次网络,就会生成一条日志数据
需求目标:
手机号码 上行数据包总数 下行数据包总数 上行总流量 下行总流量
13123412431 1238 17823 3424 2342
13622311238 17223 34214 23421 1231
13123412431 12380 1828 34 23
......
MapReduce设计:
** key是什么?
--电话号码
** value是什么?
--后面四个字段
key value list
13123412431 (123,2423,3453,234),(789,1231,4353,234),(1231,231,342,23) ...
13622311238 (1233,23,342353,23234),(78239,123231,42353,2234) ...
...... ......
最终结果:
13123412431 (1232313,241231323,23133453,1231234)
13622311238 (6666,62,889,99999999)
...... ......
----DataTotalWritable------
package com.myblue.myhdfs;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
 * Hadoop value type carrying the four traffic figures tracked per phone number:
 * upstream packet count, downstream packet count, upstream payload and
 * downstream payload.
 *
 * <p>The fields are serialized as four consecutive {@code long} values in the
 * order upPackNum, downPackNum, upPayLoad, downPayLoad; {@link #readFields}
 * reads them back in the same order.
 */
public class DataTotalWritable implements Writable {

    /** Total number of upstream packets. */
    private long upPackNum;

    /** Total number of downstream packets. */
    private long downPackNum;

    /** Total upstream traffic. */
    private long upPayLoad;

    /** Total downstream traffic. */
    private long downPayLoad;

    /** No-argument constructor; leaves every field at zero. */
    public DataTotalWritable() {
    }

    /** Creates an instance with all four figures populated via {@link #set}. */
    public DataTotalWritable(long upPackNum, long downPackNum, long upPayLoad, long downPayLoad) {
        this.set(upPackNum, downPackNum, upPayLoad, downPayLoad);
    }

    /** Overwrites all four figures in a single call. */
    public void set(long upPackNum, long downPackNum, long upPayLoad, long downPayLoad) {
        this.upPackNum = upPackNum;
        this.downPackNum = downPackNum;
        this.upPayLoad = upPayLoad;
        this.downPayLoad = downPayLoad;
    }

    public long getUpPackNum() {
        return this.upPackNum;
    }

    public void setUpPackNum(long upPackNum) {
        this.upPackNum = upPackNum;
    }

    public long getDownPackNum() {
        return this.downPackNum;
    }

    public void setDownPackNum(long downPackNum) {
        this.downPackNum = downPackNum;
    }

    public long getUpPayLoad() {
        return this.upPayLoad;
    }

    public void setUpPayLoad(long upPayLoad) {
        this.upPayLoad = upPayLoad;
    }

    public long getDownPayLoad() {
        return this.downPayLoad;
    }

    public void setDownPayLoad(long downPayLoad) {
        this.downPayLoad = downPayLoad;
    }

    /**
     * Combines the four fields with the usual 31-multiplier scheme.
     *
     * <p>Each {@code long} is folded to an {@code int} by XOR-ing ({@code ^}) its
     * low 32 bits with its high 32 bits, obtained through the unsigned right
     * shift {@code >>>}. The fields are mixed in the order downPackNum,
     * downPayLoad, upPackNum, upPayLoad.
     */
    @Override
    public int hashCode() {
        long[] parts = {downPackNum, downPayLoad, upPackNum, upPayLoad};
        int hash = 1;
        for (long part : parts) {
            hash = 31 * hash + (int) (part ^ (part >>> 32));
        }
        return hash;
    }

    /** Two instances are equal when they have the same class and all four figures match. */
    @Override
    public boolean equals(Object obj) {
        if (obj == this) {
            return true;
        }
        if (obj == null || obj.getClass() != getClass()) {
            return false;
        }
        DataTotalWritable that = (DataTotalWritable) obj;
        return downPackNum == that.downPackNum
                && downPayLoad == that.downPayLoad
                && upPackNum == that.upPackNum
                && upPayLoad == that.upPayLoad;
    }

    /** Renders the four figures separated by tabs, in declaration order. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append(upPackNum).append("\t");
        text.append(downPackNum).append("\t");
        text.append(upPayLoad).append("\t");
        text.append(downPayLoad);
        return text.toString();
    }

    /** Serializes the four figures as consecutive longs. */
    public void write(DataOutput out) throws IOException {
        out.writeLong(this.upPackNum);
        out.writeLong(this.downPackNum);
        out.writeLong(this.upPayLoad);
        out.writeLong(this.downPayLoad);
    }

    /** Restores the four figures in the same order {@link #write} emitted them. */
    public void readFields(DataInput in) throws IOException {
        long upPacks = in.readLong();
        long downPacks = in.readLong();
        long upLoad = in.readLong();
        long downLoad = in.readLong();
        this.upPackNum = upPacks;
        this.downPackNum = downPacks;
        this.upPayLoad = upLoad;
        this.downPayLoad = downLoad;
    }
}
----DataTotalMapReduce-----------
package com.myblue.myhdfs;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.myblue.myhdfs.DataTotalWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * MapReduce job that sums, for every phone number, the upstream/downstream
 * packet counts and upstream/downstream traffic found in tab-separated log
 * lines.
 *
 * <p>Usage: {@code DataTotalMapReduce <input path> <output path>}. When
 * {@link #main} is started without any arguments it falls back to the
 * built-in default paths, which keeps the "run from the IDE" workflow working.
 */
public class DataTotalMapReduce extends Configured implements Tool {

    /** Input path used by {@link #main} when no arguments are supplied. */
    private static final String DEFAULT_INPUT = "hdfs://blue01.mydomain:8020/input2";

    /** Output path used by {@link #main} when no arguments are supplied. */
    private static final String DEFAULT_OUTPUT = "hdfs://blue01.mydomain:8020/output2";

    /** Counter group used for records this job skips. */
    private static final String COUNTER_GROUP = "DataTotalMapReduce";

    /**
     * Emits {@code <phone number, DataTotalWritable>} for each log line.
     *
     * <p>The type arguments are required: with a raw {@code Mapper} the
     * {@code map(LongWritable, Text, Context)} method does not override
     * anything and the {@code @Override} annotation fails to compile.
     */
    public static class DataTotalMapper extends
            Mapper<LongWritable, Text, Text, DataTotalWritable> {

        /** Fields 1 and 6..9 are read, so a usable line has at least 10 fields. */
        private static final int MIN_FIELDS = 10;

        // Reused across map() calls; context.write() serializes them immediately,
        // so there is no need to allocate new objects for every record.
        private final Text mapOutputKey = new Text();
        private final DataTotalWritable mapOutputValue = new DataTotalWritable();

        /**
         * Parses one log line and emits the phone number with its four figures.
         *
         * @param key     offset of the line in the input (unused)
         * @param value   one tab-separated log line
         * @param context task context used to emit output and update counters
         * @throws NumberFormatException if one of fields 6..9 is not a valid long
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // split by '\t'
            String[] splits = value.toString().split("\t");

            // A short or blank line would otherwise abort the task with an
            // ArrayIndexOutOfBoundsException; skip it and make the skip visible.
            if (splits.length < MIN_FIELDS) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_LINES").increment(1);
                return;
            }

            // the phone number is the output key
            mapOutputKey.set(splits[1]);

            // set map output value
            long upPackNum = Long.parseLong(splits[6]);
            long downPackNum = Long.parseLong(splits[7]);
            long upPayLoad = Long.parseLong(splits[8]);
            long downPayLoad = Long.parseLong(splits[9]);
            mapOutputValue.set(upPackNum, downPackNum, upPayLoad, downPayLoad);

            // map output
            context.write(mapOutputKey, mapOutputValue);
        }
    }

    /**
     * Sums all {@link DataTotalWritable} values that share a phone number.
     *
     * <p>The type arguments are required so that {@code values} can be iterated
     * as {@code DataTotalWritable}.
     */
    public static class DataTotalReducer extends
            Reducer<Text, DataTotalWritable, Text, DataTotalWritable> {

        /**
         * Adds up the four figures of every value and emits one total per key.
         *
         * @param key     phone number
         * @param values  all per-line figures recorded for that phone number
         * @param context task context used to emit the total
         */
        @Override
        protected void reduce(Text key, Iterable<DataTotalWritable> values,
                Context context) throws IOException, InterruptedException {
            long upPackNumSum = 0;
            long downPackNumSum = 0;
            long upPayLoadSum = 0;
            long downPayLoadSum = 0;

            // accumulate each figure independently
            for (DataTotalWritable value : values) {
                upPackNumSum += value.getUpPackNum();
                downPackNumSum += value.getDownPackNum();
                upPayLoadSum += value.getUpPayLoad();
                downPayLoadSum += value.getDownPayLoad();
            }

            // set output value
            DataTotalWritable outputValue = new DataTotalWritable();
            outputValue.set(upPackNumSum, downPackNumSum, upPayLoadSum, downPayLoadSum);

            // output
            context.write(key, outputValue);
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output
     *             path; an existing output path is deleted recursively first
     * @return 0 on success, 1 if the job failed, 2 if the arguments are missing
     */
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println("Usage: DataTotalMapReduce <input path> <output path>");
            return 2;
        }

        // Job
        Configuration conf = super.getConf();
        Job job = Job.getInstance(conf);
        job.setJarByClass(getClass());

        // Mapper
        job.setMapperClass(DataTotalMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(DataTotalWritable.class);

        // Reducer
        job.setReducerClass(DataTotalReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DataTotalWritable.class);

        // input path
        Path inPath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inPath);

        // output path
        Path outPath = new Path(args[1]);
        // Resolve the file system from the path itself: FileSystem.get(conf)
        // returns the default file system, which rejects a path with a different
        // scheme/authority (for example an hdfs:// path while fs.defaultFS is local).
        FileSystem dfs = outPath.getFileSystem(conf);
        if (dfs.exists(outPath)) {
            dfs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // Submit Job
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    /**
     * Entry point. Uses the command-line arguments when present and only falls
     * back to the default input/output paths when none are given.
     */
    public static void main(String[] args) throws Exception {
        if (args == null || args.length == 0) {
            args = new String[] {DEFAULT_INPUT, DEFAULT_OUTPUT};
        }

        // run job
        Configuration conf = new Configuration();
        int status = ToolRunner.run(conf, new DataTotalMapReduce(), args);
        System.exit(status);
    }
}
====案例2:Join===================================================================
数据文件:
customer文件
1,Stephanie Leung,555-555-5555
2,Edward Kim,123-456-7890
3,Jose Madriz,281-330-8004
4,David Stork,408-555-0000
order文件
3,A,12.95,02-Jun-2008
1,B,88.25,20-May-2008
2,C,32.00,30-Nov-2007
3,D,25.02,22-Jan-2009
目标:
1,B,88.25,20-May-2008,Stephanie Leung,555-555-5555
2,C,32.00,30-Nov-2007,Edward Kim,123-456-7890
3,D,25.02,22-Jan-2009,Jose Madriz,281-330-8004
3,A,12.95,02-Jun-2008,Jose Madriz,281-330-8004
思路:
选用: Join
map阶段:
** map task依次读取两个文件,切割,并设置key和value,取cid为key,同时给来自不同的文件的value打一个标签
value == flag + value
reduce阶段:
** Join
Map读入
<偏移量,这一行值>
Map()输出
--customer文件输出
<1,customer_[Stephanie Leung,555-555-5555]>
<2,customer_[Edward Kim,123-456-7890]>
...
--order文件输出
<3,order_[A,12.95,02-Jun-2008]>
<1,order_[B,88.25,20-May-2008]>
<3,order_[D,25.02,22-Jan-2009]>
...
Reduce()输入
<1,(customer_[Stephanie Leung,555-555-5555],order_[B,88.25,20-May-2008])>
<2,(customer_[Edward Kim,123-456-7890],order_[C,32.00,30-Nov-2007])>
<3,(customer_[Jose Madriz,281-330-8004],order_[A,12.95,02-Jun-2008],order_[D,25.02,22-Jan-2009])>
<4,(customer_[David Stork,408-555-0000])>
----DataJoinWritable-------------
package com.myblue.myhdfs;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
 * Hadoop value type used by the reduce-side join: a {@code flag} telling which
 * input a record came from, plus the record's remaining {@code data}.
 *
 * <p>Both strings are serialized with {@link DataOutput#writeUTF}, which
 * limits each encoded string to 65535 bytes. A {@code null} flag or data is
 * written as an empty string, so it is read back as {@code ""} rather than
 * {@code null}.
 */
public class DataJoinWritable implements Writable {

    /** Tag identifying the source of the record. */
    private String flag;

    /** Payload of the record, without the join key. */
    private String data;

    /** No-argument constructor; leaves both fields {@code null}. */
    public DataJoinWritable() {
    }

    /** Creates an instance with both fields populated via {@link #set}. */
    public DataJoinWritable(String flag, String data) {
        this.set(flag, data);
    }

    /** Overwrites both fields in a single call. */
    public void set(String flag, String data) {
        this.flag = flag;
        this.data = data;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    public String getData() {
        return data;
    }

    public void setData(String data) {
        this.data = data;
    }

    /** Null-safe hash over {@code data} then {@code flag}, using the 31-multiplier scheme. */
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((data == null) ? 0 : data.hashCode());
        result = prime * result + ((flag == null) ? 0 : flag.hashCode());
        return result;
    }

    /** Two instances are equal when they have the same class and equal (or both null) fields. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null) {
            return false;
        }
        if (getClass() != obj.getClass()) {
            return false;
        }
        DataJoinWritable other = (DataJoinWritable) obj;
        if (data == null) {
            if (other.data != null) {
                return false;
            }
        } else if (!data.equals(other.data)) {
            return false;
        }
        if (flag == null) {
            if (other.flag != null) {
                return false;
            }
        } else if (!flag.equals(other.flag)) {
            return false;
        }
        return true;
    }

    /** Renders the record as {@code flag,data}. */
    @Override
    public String toString() {
        return flag + "," + data;
    }

    /**
     * Writes flag and data as two modified-UTF-8 strings, a platform-independent
     * encoding.
     *
     * <p>{@code writeUTF(null)} throws a {@code NullPointerException}, which
     * used to abort the task whenever an instance with unset fields was
     * emitted; unset fields are therefore written as empty strings.
     */
    public void write(DataOutput out) throws IOException {
        String flagValue = getFlag();
        String dataValue = getData();
        out.writeUTF(flagValue == null ? "" : flagValue);
        out.writeUTF(dataValue == null ? "" : dataValue);
    }

    /** Reads flag and data back in the order {@link #write} emitted them. */
    public void readFields(DataInput in) throws IOException {
        this.flag = in.readUTF();
        this.data = in.readUTF();
    }
}
----DataJoinMapReduce--------------
package com.myblue.myhdfs;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.myblue.myhdfs.DataJoinWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Job;
/**
 * Reduce-side join of comma-separated customer records (3 fields) and order
 * records (4 fields) on the customer id stored in the first field.
 *
 * <p>Usage: {@code DataJoinMapReduce <input path> <output path>}. When
 * {@link #main} is started without any arguments it falls back to the
 * built-in default paths.
 */
public class DataJoinMapReduce extends Configured implements Tool {

    /** Input path used by {@link #main} when no arguments are supplied. */
    private static final String DEFAULT_INPUT = "hdfs://blue01.mydomain:8020/input2";

    /** Output path used by {@link #main} when no arguments are supplied. */
    private static final String DEFAULT_OUTPUT = "hdfs://blue01.mydomain:8020/output2";

    /** Counter group used for records this job skips. */
    private static final String COUNTER_GROUP = "DataJoinMapReduce";

    /**
     * Tags every input line with its origin and emits it under the customer id.
     *
     * <p>The type arguments are required: with a raw {@code Mapper} the
     * {@code map(LongWritable, Text, Context)} method does not override
     * anything and the {@code @Override} annotation fails to compile.
     */
    public static class DataJoinMapper extends
            Mapper<LongWritable, Text, Text, DataJoinWritable> {

        // Reused across map() calls; context.write() serializes them immediately.
        private final Text mapOutputKey = new Text();
        private final DataJoinWritable mapOutputValue = new DataJoinWritable();

        /**
         * Emits {@code <cid, ("customer", name,phone)>} for 3-field lines and
         * {@code <cid, ("order", name,price,date)>} for 4-field lines.
         *
         * @param key     offset of the line in the input (unused)
         * @param value   one comma-separated line
         * @param context task context used to emit output and update counters
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] splits = value.toString().split(",");

            if (splits.length == 3) {
                // length == 3 ==> customer
                String name = splits[1];
                String phoneNum = splits[2];
                mapOutputValue.set("customer", name + "," + phoneNum);
            } else if (splits.length == 4) {
                // length == 4 ==> order
                String name = splits[1];
                String price = splits[2];
                String date = splits[3];
                mapOutputValue.set("order", name + "," + price + "," + date);
            } else {
                // Neither a customer nor an order line (for example a blank line).
                // Emitting it would write a value whose flag and data were never
                // set, so skip it and make the skip visible through a counter.
                context.getCounter(COUNTER_GROUP, "MALFORMED_LINES").increment(1);
                return;
            }

            // output key: the join value (customer id)
            mapOutputKey.set(splits[0]);
            context.write(mapOutputKey, mapOutputValue);
        }
    }

    /**
     * Joins the orders of one customer id with that customer's record.
     *
     * <p>The type arguments are required so that {@code values} can be iterated
     * as {@code DataJoinWritable}.
     */
    public static class DataJoinReducer extends
            Reducer<Text, DataJoinWritable, NullWritable, Text> {

        /**
         * Emits one line {@code cid,order,customer} per order of the key.
         *
         * <p>This is an inner join: a customer without orders produces nothing,
         * and orders whose customer id has no customer record are skipped
         * (counted as {@code ORDERS_WITHOUT_CUSTOMER}) instead of being written
         * with the literal text {@code null} in place of the customer data.
         *
         * @param key     customer id
         * @param values  tagged customer and order records sharing that id
         * @param context task context used to emit output and update counters
         */
        @Override
        protected void reduce(Text key, Iterable<DataJoinWritable> values, Context context)
                throws IOException, InterruptedException {
            String customerInfo = null;
            List<String> orderList = new ArrayList<String>();

            // getData() returns an immutable String, so keeping it is safe even
            // though the framework may reuse the value object while iterating.
            for (DataJoinWritable value : values) {
                if ("customer".equals(value.getFlag())) {
                    customerInfo = value.getData();
                } else if ("order".equals(value.getFlag())) {
                    orderList.add(value.getData());
                }
            }

            if (customerInfo == null) {
                if (!orderList.isEmpty()) {
                    context.getCounter(COUNTER_GROUP, "ORDERS_WITHOUT_CUSTOMER")
                            .increment(orderList.size());
                }
                return;
            }

            Text outputValue = new Text();
            for (String order : orderList) {
                outputValue.set(key.toString() + "," + order + "," + customerInfo);
                context.write(NullWritable.get(), outputValue);
            }
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output
     *             path; an existing output path is deleted recursively first
     * @return 0 on success, 1 if the job failed, 2 if the arguments are missing
     */
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println("Usage: DataJoinMapReduce <input path> <output path>");
            return 2;
        }

        Configuration conf = super.getConf();
        Job job = Job.getInstance(conf);
        job.setJarByClass(getClass());

        // Mapper
        job.setMapperClass(DataJoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(DataJoinWritable.class);

        // Reducer
        job.setReducerClass(DataJoinReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        // input path
        Path inPath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inPath);

        // output path
        Path outPath = new Path(args[1]);
        // Resolve the file system from the path itself: FileSystem.get(conf)
        // returns the default file system, which rejects a path with a different
        // scheme/authority (for example an hdfs:// path while fs.defaultFS is local).
        FileSystem dfs = outPath.getFileSystem(conf);
        if (dfs.exists(outPath)) {
            dfs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // Submit Job
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    /**
     * Entry point. Uses the command-line arguments when present and only falls
     * back to the default input/output paths when none are given.
     */
    public static void main(String[] args) throws Exception {
        if (args == null || args.length == 0) {
            args = new String[] {DEFAULT_INPUT, DEFAULT_OUTPUT};
        }

        Configuration conf = new Configuration();
        int status = ToolRunner.run(conf, new DataJoinMapReduce(), args);
        System.exit(status);
    }
}
-----------------------------------
mapreduce中的Join
** Map端Join
** 一个表比较小,另外一个表非常大
会把小表数据完整地加载到处理大表的每台nodemanager节点的内存中,
[map task]依次匹配内存中小表数据
** reduce端Join
** 两张表通常都是大文件
** Join的操作是在Reduce端执行
** semi Join
** map端Join和reduce端Join结合
** mapper在读取block数据处理的时候,如果有相关字段则保留,否则过滤掉。然后把需要的数据传递给reduce端进行join。
====Shuffle==============================================================
MapReduce程序
** 离线数据分析、数据清洗(过滤脏数据)
** 执行命令:bin/yarn jar jar包路径 包名.类名 参数
MapReduce Shuffle
数据从map task输出到reduce task输入的这段过程
** map的输入: split
** 默认情况下,一个block就是一个split,一个split对应一个map
** 尽量保证每个map的输入数据是来自同一个block
** 如果设计多个block为一个split,可能会造成大量额外流量
** 合理控制map个数
>>Input
<0,hadoop mapreduce>
<14,hbase hadoop>
>>map()
** map() --> value.toString().split("\t")
** output: <hadoop,1> <mapreduce,1> <hbase,1> <hadoop,1>
----Shuffle---------------
>>>>>>>> map shuffle
>>> 环形缓存区
默认大小100M mapreduce.task.io.sort.mb
>>> partition分区
** HashPartitioner
** 决定数据交给哪个reduce处理
1 hadoop hbase --> reduce1
2 mapreduce --> reduce2
3 ... --> reduce3
>>> sort
** 按照key进行字典顺序排序
>>> combine (可选,并非所有的情况都可以使用combine)
** 默认情况下,相当于map阶段局部reduce
>>> spill
** 当环形缓存区的使用量达到阈值时触发溢写,默认阈值为0.8(即100M中的80M),参数 mapreduce.map.sort.spill.percent
** 会将缓存区的数据写入本地磁盘临时目录(不是HDFS)
>>> merge
** 把很多小文件合并成一个大文件
>>> compress (可选)
** 减轻网络IO的压力
>>>>>>>> reduce shuffle (application master)
每个reduce会去map的输出结果中拉取自己对应的分区数据
merge合并
** 按照key进行文件合并
group分组
** 将相同key的value值放到一起,形成list
** ...
----reduce-------------
>>reduce
>>output
** 数据汇总
** hadoop 2,mapreduce 1,hbase 1 ...
设置reduce个数:
** 当前job任务
job.setNumReduceTasks(n);
** 永久生效
配置文件 mapred-site.xml
安装文件上传/下载工具(lrzsz,提供rz/sz命令)
yum install lrzsz