hadoop_7 : MapReduce Code

HelloWorld (WordCount)

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCount
{
  // Mapper: the input types are <Object, Text>. Object is a generic wrapper
  // that can hold several types (the same type is used for RPC parameters
  // and return values). The output types are <Text, IntWritable>;
  // Text is a wrapper for UTF-8 strings.

  public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable>
  {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // declared to throw IOException for stream errors
        public void map(Object key, Text value, Context context) throws IOException,
        InterruptedException
        {
           // StringTokenizer splits a string at delimiters
           StringTokenizer itr = new StringTokenizer(value.toString()); // tokenize the input line
           while (itr.hasMoreTokens()) // true while tokens remain
           {
              word.set(itr.nextToken()); // store the current token in word
              // nextToken returns the substring up to the next delimiter
              context.write(word, one);
           }
        }
  }

  public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable>
  {
     // result holds the frequency of each word
     private IntWritable result = new IntWritable();

     public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
     {
        int sum = 0;

        // sum the values of the received <key, value-list>
        for (IntWritable val : values)
        {
            sum += val.get();
        }
        // store the frequency in result
        result.set(sum);

        // emit the result
        context.write(key, result);
     }
  }
}
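
The original notes stop at the two classes. A minimal driver sketch, not part of the original post (the job name and argument handling are assumed; it needs org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.Path, org.apache.hadoop.mapreduce.Job and the lib.input/lib.output FileInputFormat/FileOutputFormat), would wire them into a runnable job:

// Hypothetical driver; assumes input/output paths in args[0]/args[1]
public static void main(String[] args) throws Exception
{
   Configuration conf = new Configuration();
   Job job = new Job(conf, "word count");
   job.setJarByClass(WordCount.class);
   job.setMapperClass(TokenizerMapper.class);
   job.setCombinerClass(IntSumReducer.class); // the reducer doubles as a combiner
   job.setReducerClass(IntSumReducer.class);
   job.setOutputKeyClass(Text.class);
   job.setOutputValueClass(IntWritable.class);
   FileInputFormat.addInputPath(job, new Path(args[0]));
   FileOutputFormat.setOutputPath(job, new Path(args[1]));
   System.exit(job.waitForCompletion(true) ? 0 : 1);
}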

Data Deduplication

// Sample

file_1:

2006-6-9  a
2006-6-10 b
2006-6-11 c
2006-6-12 d
2006-6-13 a
2006-6-14 b
2006-6-15 c
2006-6-11 c

file_2:

2006-6-9  b
2006-6-10 a
2006-6-11 b
2006-6-12 d
2006-6-13 a
2006-6-14 c
2006-6-15 d
2006-6-11 c

Output:

2006-6-10 a
2006-6-10 b
2006-6-11 b
2006-6-11 c
2006-6-12 d
2006-6-13 a
2006-6-14 b
2006-6-14 c
2006-6-15 c
2006-6-15 d
2006-6-9  a
2006-6-9  b
  • Algorithm
    1. Hand all results to a single reduce task, so that identical records meet on the same key
    2. Set the value of each <key, value> pair to the empty string
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class Dedup {
// map: copy the input value to the output key and emit it directly
   public static class Map extends Mapper<Object, Text, Text, Text> {
      public static Text line = new Text();
      public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
         line = value;
         context.write(line, new Text(""));
      }
   }

 // reduce: copy the input key to the output key and emit it directly
   public static class Reduce extends Reducer<Text, Text, Text, Text> {
      public void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException
      {
          context.write(key, new Text(""));
      }
   }
}

Sorting

file_1:

2
32
654
32
15
756
65223

file_2:

5956
22
650
92

file_3:

26
54
6

Output:

1 2
2 6
3 15
4 22
5 26
6 32
7 32
8 54
9 92
10 650
11 654
12 756
13 5956
14 65223
  • Algorithm:
    Wrap each key as an IntWritable. The automatic sort in reduce only covers the data sent to that reducer's own node, so the default sort cannot guarantee a global order. Build a custom partitioner first, then let the default sort order the local data on each reducer.

    1. Wrap each input int in an IntWritable
    2. Override the partitioner to keep the whole output ordered: the partition bound is the maximum input value divided by the number of partitions
    3. Each reducer sorts the range it receives; the output key is a global line counter
    4. The default HashPartitioner produces balanced partitions, but not ordered ones, which is why a custom one is needed
public class Sort
{
// map: convert the input value to IntWritable and use it as the output key
public static class Map extends Mapper<Object, Text, IntWritable, IntWritable>
{
   private static IntWritable data = new IntWritable();
   public void map(Object key, Text value, Context context) throws IOException, InterruptedException
   {
      String line = value.toString();
      data.set(Integer.parseInt(line));
      context.write(data, new IntWritable(1));
   }
}

// reduce: copy the input key to the output value; the number of elements
// in the input value-list decides how many times the key is emitted.
// The global counter linenum holds the key's rank.
public static class Reduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>
{
   private static IntWritable linenum = new IntWritable(1);

   public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
   throws IOException, InterruptedException
   {
      for (IntWritable val : values)
      {
         context.write(linenum, key);
         linenum = new IntWritable(linenum.get() + 1);
      }
   }
}

// Custom partitioner: derives the partition ID from the maximum input
// value and the number of partitions in the MapReduce job

public static class Partition extends Partitioner<IntWritable, IntWritable>
{
   public int getPartition(IntWritable key, IntWritable value, int numPartitions)
   {
      int maxNumber = 65223; // largest value in the input data
      int bound = maxNumber / numPartitions + 1;
      int keyNumber = key.get();
      for (int i = 1; i <= numPartitions; i++)
      {
         if (keyNumber < bound * i && keyNumber >= bound * (i - 1))
            return i - 1;
      }
      return -1;
   }
}
}

//main
{
   job.setPartitionerClass(Partition.class);
}
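
For context, a fuller driver sketch (hypothetical; it assumes the Sort class above with input/output paths passed in args[0]/args[1]):

public static void main(String[] args) throws Exception
{
   Configuration conf = new Configuration();
   Job job = new Job(conf, "sort");
   job.setJarByClass(Sort.class);
   job.setMapperClass(Map.class);
   job.setReducerClass(Reduce.class);
   job.setPartitionerClass(Partition.class); // the custom range partitioner above
   job.setOutputKeyClass(IntWritable.class);
   job.setOutputValueClass(IntWritable.class);
   FileInputFormat.addInputPath(job, new Path(args[0]));
   FileOutputFormat.setOutputPath(job, new Path(args[1]));
   System.exit(job.waitForCompletion(true) ? 0 : 1);
}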

Total Order Sort

  • Partitioner implementations: HashPartitioner; TotalOrderPartitioner
  • Sampling is used to pick the partition bounds, which guards against data skew
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalSort
{
   public static void main(String[] args) throws Exception
   {
       Configuration conf = new Configuration();
       Path inputPath = new Path(args[0]);
       Path outputPath = new Path(args[1]);

       // path of the partition file
       Path partitionFile = new Path(args[2]);
       int reducerNumber = Integer.parseInt(args[3]);

       // RandomSampler: the first argument is the probability that a key is picked,
       // the second is the number of samples to take,
       // the third is the maximum number of input splits to read

       InputSampler.RandomSampler<Text,Text> sampler = new InputSampler.RandomSampler<Text,Text>(0.1, 10000, 10);

       TotalOrderPartitioner.setPartitionFile(conf, partitionFile);

       Job job = new Job(conf);
       job.setJobName("TotalSort");
       job.setJarByClass(TotalSort.class);
       job.setInputFormatClass(KeyValueTextInputFormat.class);
       job.setMapOutputKeyClass(Text.class);
       job.setMapOutputValueClass(Text.class);
       job.setNumReduceTasks(reducerNumber);

       // set the partitioner
       job.setPartitionerClass(TotalOrderPartitioner.class);
       FileInputFormat.setInputPaths(job, inputPath);
       FileOutputFormat.setOutputPath(job, outputPath);
       outputPath.getFileSystem(conf).delete(outputPath, true);

       // write the partition file
       InputSampler.writePartitionFile(job, sampler);

       System.out.println(job.waitForCompletion(true) ? 0 : 1);
   }
}

Secondary Sort

http://www.linuxidc.com/Linux/2014-03/98498.htm

sort1    1
sort2    3
sort2    77
sort2    54
sort1    2
sort6    22
sort6    221
sort6    20

Output:

sort1 1,2
sort2 3,54,77
sort6 20,22,221
  • Flow

  • Map side: emit records in the following format

{[sort1,1],1}
{[sort2,3],3}
{[sort2,77],77}
{[sort2,54],54}
{[sort1,2],2}
{[sort6,22],22}
{[sort6,221],221}
{[sort6,20],20}
  • Partitioner: only records whose new key shares the same first field go to the same reduce task for grouping and merging

After this step, the data streams look like this:
Partition1: {[sort1,1],1}, {[sort1,2],2}
Partition2: {[sort2,3],3}, {[sort2,77],77}, {[sort2,54],54}
Partition3: {[sort6,22],22}, {[sort6,221],221}, {[sort6,20],20}

  • A custom comparator then sorts the new key values:
    {[sort1,1],1}
    {[sort1,2],2}
    {[sort2,3],3}
    {[sort2,54],54}
    {[sort2,77],77}
    {[sort6,20],20}
    {[sort6,22],22}
    {[sort6,221],221}

  • Reduce side:
    {sort1,[1,2]}
    {sort2,[3,54,77]}
    {sort6,[20,22,221]}

//map code

public class SecondaryMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    protected void map(LongWritable key, Text value, Context context) throws
    java.io.IOException, InterruptedException
    {
       // emit the value (the whole input line) as the key
       context.write(value, NullWritable.get());
    }
}

//partitioner code
//during partitioning, distribute records by the first sort field
//(the first field of the new key)

public class KeyPartitioner extends HashPartitioner<Text, NullWritable> {
    public int getPartition(Text key, NullWritable value, int numReduceTasks)
    {
        return (key.toString().split(" ")[0].hashCode() & Integer.MAX_VALUE) %
    numReduceTasks;
    }
}


//custom comparator, sorts the new composite keys

public class SortComparator extends WritableComparator {
    protected SortComparator()
    {
       super(Text.class, true);
    }

    public int compare(WritableComparable key1, WritableComparable key2)
    {
        String[] fields1 = key1.toString().split(" ");
        String[] fields2 = key2.toString().split(" ");

        // if the first sort field is the same, compare the second sort
        // field numerically
        if (fields1[0].equals(fields2[0]))
        {
            return Integer.parseInt(fields1[1]) - Integer.parseInt(fields2[1]);
        }

        // if the first sort field differs, order by the first field
        return fields1[0].compareTo(fields2[0]);
    }
}

//reduce
//the number of reduce tasks is set to 1; records must be grouped again
//before reduce, using the default key comparison

public class SecondaryReducer extends Reducer<Text, NullWritable, NullWritable, Text> {
   protected void reduce(Text key, Iterable<NullWritable> values, Context context)
   throws java.io.IOException, InterruptedException
   {
       // key advances to each record's composite key as values are iterated
       for (NullWritable value : values)
       {
           context.write(NullWritable.get(), key);
       }
   }
}
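
The flow above relies on grouping records by the first field of the composite key before they reach reduce, but the post does not include that comparator. A minimal sketch under that assumption (the class name GroupingComparator is mine):

// Hypothetical grouping comparator: composite keys that share the same
// first field are grouped into a single reduce() call
public class GroupingComparator extends WritableComparator {
    protected GroupingComparator()
    {
       super(Text.class, true);
    }

    public int compare(WritableComparable key1, WritableComparable key2)
    {
       // compare only the first field of the composite key
       return key1.toString().split(" ")[0]
               .compareTo(key2.toString().split(" ")[0]);
    }
}

In the driver, the three pieces would be registered roughly as job.setPartitionerClass(KeyPartitioner.class), job.setSortComparatorClass(SortComparator.class), and job.setGroupingComparatorClass(GroupingComparator.class).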

join

J 0001
H 0002
B 0003

0001 chinese
0001 math
0002 music
0002 math
0003 physic

// output after the join

J chinese
J math
H music
H math
B physic
  • Algorithm
    1. map phase: read each record and tag it with the name of the file it came from
    2. reduce phase: take the Cartesian product of the two sides
//map function
public class JoinMapper extends Mapper<LongWritable, Text, Text, Text> {
   public static final String LEFT_FILENAME="student_info.txt";
   public static final String RIGHT_FILENAME="student_class_info.txt";
   public static final String LEFT_FILENAME_FLAG="l";
   public static final String RIGHT_FILENAME_FLAG="r";

   protected void map(LongWritable key, Text value, Context context) throws
   IOException, InterruptedException
   {
       String filePath=((FileSplit) context.getInputSplit()).getPath().toString();
       String fileFlag=null;
       String joinKey=null;
       String joinValue=null;

       // determine which file the record comes from
       if(filePath.contains(LEFT_FILENAME))
       {
           fileFlag=LEFT_FILENAME_FLAG;
           joinKey=value.toString().split("\t")[1];
           joinValue=value.toString().split("\t")[0];
       }else if(filePath.contains(RIGHT_FILENAME))
       {
           fileFlag=RIGHT_FILENAME_FLAG;
           joinKey=value.toString().split("\t")[0];
           joinValue=value.toString().split("\t")[1];
       }
       // emit the pair, tagged with the file it came from

       context.write(new Text(joinKey), new Text(joinValue+"\t"+fileFlag));
   }
}

//reduce function

public class JoinReduce extends Reducer<Text,Text,Text,Text> {
   public static final String LEFT_FILENAME="student_info.txt";
   public static final String RIGHT_FILENAME="student_class_info.txt";
   public static final String LEFT_FILENAME_FLAG="l";
   public static final String RIGHT_FILENAME_FLAG="r";

   protected void reduce(Text key, Iterable<Text> values, Context context) throws
   IOException, InterruptedException
   {
      Iterator<Text> iterator=values.iterator();

      List<String> studentClassNames=new ArrayList<String>();
      String studentName="";

      while(iterator.hasNext())
      {
         String[] infos=iterator.next().toString().split("\t");
         // determine which file this record came from
         if(infos[1].equals(LEFT_FILENAME_FLAG))
         {studentName=infos[0];}
         else if(infos[1].equals(RIGHT_FILENAME_FLAG))
         {studentClassNames.add(infos[0]);}
      }

      // Cartesian product of the two sides
      for(int i=0;i<studentClassNames.size();i++)
      {
          context.write(new Text(studentName), new Text(studentClassNames.get(i)));
      }
   }
}
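
The post omits the join driver. Because JoinMapper tells the two files apart by their paths, both inputs can share one mapper; a hypothetical sketch (class name JoinDriver and argument handling are mine):

// Hypothetical driver; assumes args[0] is a directory containing both
// student_info.txt and student_class_info.txt, and args[1] is the output path
public class JoinDriver {
   public static void main(String[] args) throws Exception {
      Configuration conf = new Configuration();
      Job job = new Job(conf, "reduce side join");
      job.setJarByClass(JoinDriver.class);
      job.setMapperClass(JoinMapper.class);
      job.setReducerClass(JoinReduce.class);
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(Text.class);
      FileInputFormat.addInputPath(job, new Path(args[0]));
      FileOutputFormat.setOutputPath(job, new Path(args[1]));
      System.exit(job.waitForCompletion(true) ? 0 : 1);
   }
}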
