import ......
public class WordCount
{
//设置Mapper接口,设置map的输入类型是<objetc, Text>: object是通用封装,封装多个类型
//RPC的返回和参数用该类型
//输出类型<Text,IntWritabl>: Text是针对UTF-8的封装
public static class TokenizerMapper extend Mapper<Object, Text, Text, IntWritable>
public Text word=new Text();
//IO流异常
public void map(Object key, Text value, Context context) throws IOException,
InterruptedException
{
private final static IntWiratble one= new IntWritable(1);
//StringTokenizer是字符串分隔解析类型
StringTokenizer itr=new StringTokenizer(value.toString()); //对输入的词切分
while(itr.hasMoreTokens()) //返回是否还有分隔符。
{
word.set(itr.nextToken()); //切入的单词存入word
//返回从当前位置到下一个分隔符的字符串。
context.wirte(word, one)
}
public static class IntSumReduer extends reducer<Text,IntWritable,Text,
IntWirtable>
//result 记录单词的频数
private IntWiratable result= new IntWiratable();
public void reduce(Text key, Iterable<IntWritable> values, Context context)throws IOExcaption,InterruptedException
{
int sum=0;
//对获取的<key, value-list> 计算value的和
for(IntWritable val:values)
{
sum+=val.get();
}
//频数设置到result中
result.set(sum);
//结果
context.wirte(key,result);
}
}
}
// 样例
file_1:
2006-6-9 a
2006-6-10 b
2006-6-11 c
2006-6-12 d
2006-6-13 a
2006-6-14 b
2006-6-15 c
2006-6-11 c
file_2:
2006-6-9 b
2006-6-10 a
2006-6-11 b
2006-6-12 d
2006-6-13 a
2006-6-14 c
2006-6-15 d
2006-6-11 c
输出:
2006-6-10 a
2006-6-10 b
2006-6-11 b
2006-6-11 c
2006-6-12 d
2006-6-13 a
2006-6-14 b
2006-6-14 c
2006-6-15 c
2006-6-15 d
2006-6-9 a
2006-6-9 b
import ......
public class Dedup {
//map 将输入中的value 复制到输出数据key上, 并直接输出
public static class Map extends Mapper<Object,Text, Text, Text> public static Text line= new Text(); public void map(Object key, Text value, Context context)throws IOException ,InterruptedException {
line=value;
context.wirte(line, next Text(""));
}
//reduce 将输入的key复制到输出数据key上, 并直接输出
public static class Reduce extends Reducer<Text, Text, Text, Text> {
public void reduce (Text key, Iterable<Text> values, Context context )
throws IOException, InterruptedException
{
context.wirtable(key, new Text(""))
}
}
}
file_1:
2
32
654
32
15
756
65223
file_2:
5956
22
650
92
file_3:
26
54
6
输出:
1 2
2 6
3 15
4 22
5 26
6 32
7 32
8 54
9 92
10 650
11 654
12 756
13 5956
14 65223
算法:
key封装成IntWritable类型; reduce的自动排序只作用于发送到自己所在节点的数据, 默认排序无法满足全局的顺序; 需要构建自己的partition, 把不同区间的key分到不同的reduce, 再对每个reduce上的局部数据进行默认排序
public class Sort
{
//map 将输入的value转换为IntWratable类型, 作为输出的key
public static class Map extends Mapper<Object,Text, IntWriatable, IntWriatable>
{
private static IntWritable data= new IntWritable();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException
{
String line=value.tostring();
data.set(Interger.parseInt(line));
context.write(data,new IntWritable(1));
}
}
//reduce 将输入的key复制到输出的value上,然后根据输入的
//value-list中的元素的个数决定key的输出次数
//用全局linenum来代表key的位次
public static class Redcue exteneds Reducer<IntWritable, Int Writable, IntWriatble, Intwriatble>
{
private static IntWritable linenum=new IntWritable(1);
public void redcue(IntWritable key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException
{
for(IntWritable val : values)
{
context.wirte(linenum, key);
linenum=new IntWritable(linenum.get()+1);
}
}
}
//自定义Partitions,函数根据输入的数据的最大值和MapReducer框架中
//Partition的数量获取
//返回对于的partition的ID
public static class Partitions extends Partitioner<IntWritable,IntWritable>
{
public int getPartition(IntWriatable key, IntWriatable value, int numPartitions)
{
int Maxnumber=65523;
int bound=Maxnumber/numPartitions+1;
int keynumber=key.get();
for(int i=0;i< numPartitions;i++)
{
if(keynumber<bound*i && keynumber>=bound*(i-1))
return i-1;
}
return -1;
}
}
}
// main: register the custom partitioner with the job.
{
    job.setPartitionerClass(Sort.Partitions.class);
}
import ......
// TotalSort: total-order sort using sampled partition boundaries, so the
// concatenated reducer outputs are globally sorted.
public class TotalSort
{
    public static void main(String[] args) throws Exception
    {
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);
        // Path of the partition file produced by the sampler.
        Path partitionFile = new Path(args[2]);
        int reducerNumber = Integer.parseInt(args[3]);

        // RandomSampler(freq, numSamples, maxSplitsSampled):
        //   1st arg - probability that a key is selected,
        //   2nd arg - number of samples to take,
        //   3rd arg - maximum number of input splits to read.
        RandomSampler<Text, Text> sampler =
                new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10);

        Configuration conf = new Configuration();
        // Tell TotalOrderPartitioner where the partition file lives.
        TotalOrderPartitioner.setPartitionFile(conf, partitionFile);

        Job job = new Job(conf);
        job.setJobName("TotalSort");
        job.setJarByClass(TotalSort.class);
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setNumReduceTasks(reducerNumber);
        // Route each key range to its reducer via the sampled boundaries.
        job.setPartitionerClass(TotalOrderPartitioner.class);
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        // Remove any stale output directory from a previous run.
        outputPath.getFileSystem(conf).delete(outputPath, true);
        // Sample the input and write the partition boundary file.
        InputSampler.writePartitionFile(job, sampler);
        System.out.println(job.waitForCompletion(true) ? 0 : 1);
    }
}
http://www.linuxidc.com/Linux/2014-03/98498.htm
sort1 1
sort2 3
sort2 77
sort2 54
sort1 2
sort6 22
sort6 221
sort6 20
输出:
sort1 1,2
sort2 3,54,77
sort6 20,22,221
流程
Map端处理:生成如下格式
{[sort1,1],1}
{[sort2,3],3}
{[sort2,77],77}
{[sort2,54],54}
{[sort1,2],2}
{[sort6,22],22}
{[sort6,221],221}
{[sort6,20],20}
操作后,得到的数据流如下:
Partition1:{[sort1,1],1}、{[sort1,2],2}
Partition2:{[sort2,3],3}、{[sort2,77],77}、{[sort2,54],54}
Partition3:{[sort6,22],22}、{[sort6,221],221}、{[sort6,20],20}
调用自己的自定义排序器对新的Key值进行排序。
{[sort1,1],1}
{[sort1,2],2}
{[sort2,3],3}
{[sort2,54],54}
{[sort2,77],77}
{[sort6,20],20}
{[sort6,22],22}
{[sort6,221],221}
Reduce端处理:
{sort1,[1,2]}
{sort2,[3,54,77]}
{sort6,[20,22,221]}
// map code for the secondary sort: emit the whole input line as the key
// (a composite "field1 field2" string) with a NullWritable value, so the
// shuffle sorts on the full line via the custom comparator.
public class SecondaryMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    protected void map(LongWritable key, Text value, Context context)
            throws java.io.IOException, InterruptedException
    {
        // Emit the value as the key; value is one entire input line.
        context.write(value, NullWritable.get());
    }
}
// Partitioner code: during partitioning, distribute records by the first
// sort field only (the first token of the new composite key), so records
// sharing a first field reach the same reducer.
public class KeyPartitioner extends HashPartitioner<Text, NullWritable> {
    public int getPartition(Text key, NullWritable value, int numReduceTasks)
    {
        // Mask with Integer.MAX_VALUE to force a non-negative hash.
        return (key.toString().split(" ")[0].hashCode() & Integer.MAX_VALUE)
                % numReduceTasks;
    }
}
//自定义的排序,对新的value值排序
public class SortComparator extends WritableComparator {
protected SortComparator()
{
super(Text.class,true);
}
public int compare(WritableComparable key1, WirtableComparable key2)
{
//如果第一个排序字段相同,则需要比较第二个排序字段
if(Integer.parseInt(Key1.toString().split(" ")[0]==Integer.parseInt(key2.toString().split(" ")[0])))
{
if(Integer.parseInt(Key1.toString().split(" ")[1]>Integer.parseInt(key2.toString().split(" ")[1]))
return 1;
else if Integer.parseInt(Key1.toString().split(" ")[1]<Integer.parseInt(key2.toString().split(" ")[1])
{return -1;}
else if (Integer.parseInt(Key1.toString().split(" ")[1]==Integer.parseInt(key2.toString().split(" ")[1]))
{return 0;}
}
//如果第一个排序字段不同,则比较第二个排序字段
else
{
if(Integer.parseInt(Key1.toString().split(" ")[0]>Integer.parseInt(key2.toString().split(" ")[0]))
{return 1;}
else if(Integer.parseInt(Key1.toString().split(" ")[0]<Integer.parseInt(key2.toString().split(" ")[0]))
{return -1;}
}
return 0;
}
}
// reduce: run with a single reducer (numReduceTasks = 1); keys arrive already
// sorted by the custom comparator, so emitting them in order yields the final
// output. The mapper's value type is NullWritable, so the reducer consumes
// <Text, NullWritable> and writes the key once per occurrence.
public class SecondaryReducer extends Reducer<Text, NullWritable, NullWritable, Text> {
    protected void reduce(Text key, java.lang.Iterable<NullWritable> values, Context context)
            throws java.io.IOException, InterruptedException
    {
        for (NullWritable value : values)
        {
            context.write(NullWritable.get(), key);
        }
    }
}
J 0001
H 0002
B 0003
0001 chinese
0001 math
0002 music
0002 math
0003 physic
// join后的输出
J chinese
J math
H music
H math
B physic
//map函数
public class JoinMapper extends Mapper<LongWritable key,Text, Text, Text> {
public static final String LEFT_FILENAME="student_info.txt";
public static final String RIGHT_FILENAME="student_class_info.txt";
public static final String LEFT_FILENAME_FLAG="l";
public static final String RIGHT_FILENAME_FLAG="r";
protected void map(LongWritable key, Text value, Context context) throws
IOException, InterruptedException
{
String filePath=((FileSplit) context.getInputSplit()).getPath().toString();
String fileFlag=null;
String joinKey=null;
String joinVlaue=null;
//判断来自那个文件
if(filePath.contains(LEFT_FILENAME))
{
FILEfLAG=left_filename_flag;
joinKey=value.toString().split("\t")[1];
jionValu=value.toString().split("\t")[0];
}else if(filePath.contains(RIGHT_FILENAME))
{
fileFlag=RIGHT_FILENAME_FLAG;
joinKey=value.toString().split("\t")[0];
jionValu=value.toString().split("\t")[1];
}
//输出键值并标示该结果来自那个文件
context.write(new Text(joinKey), new Text(joinValue+"\t"+fileFlag));
};
}
//reduce函数
public class JoinReduce extends Reducer<Text,Text,Text,Text> {
public static final String LEFT_FILENAME="student_info.txt";
public static final String RIGHT_FILENAME="student_class_info.txt";
public static final String LEFT_FILENAME_FLAG="l";
public static final String RIGHT_FILENAME_FLAG="r";
protected void reduce(Text key,Iterable<Text> values,Context context) throws
IOException, InterruptedException
{
Iterable<Text> iterator=values.iterator();
List<string> studentClssNames=new ArrayList<String>();
String studentName="";
while(iterator.hasNext())
{
String[] infos=oteratr.next().tostring.split("\t");
//判断该记录来自那个文件
if(infos[1].equals(LEFT_FILENAME_FLAG))
{studentName=infos[0];}
else if(infos[1].equals(RIGHT_FILENAME_FLAG))
{studentClassNames.add(info[0]);}
}
}
//求笛卡尔积
for(int i=0;i<studentClassNames.size();i++)
{
context.write(new Text(studentName),new Text(studentClassName.get(i)));
}
}