Solving "Initialization of all the collectors failed" in MapReduce

Requirement

Produce a listing of each website user's login records.
The backend keeps three log files:

  1. The full user list file user-log

    u_id name
    1 小红
    2 小行
    3 小通
    4 小闪
    5 小镇
    6 小振
    7 小秀
    8 小微
    9 小懂
    10 小明
    11 小刚
    12 小举
    13 小黑
    14 小白
    15 小鹏
    16 小习
  2. The user gender file sex-log

    sex_id sex
    0 不知道
    1 男
    2 女
  3. The user login time file login-log

    u_id sex_id login_time
    1 1 2017-04-17 08:16:20
    2 2 2017-04-15 06:18:20
    3 1 2017-04-16 05:16:24
    4 2 2017-04-14 03:18:20
    5 1 2017-04-13 02:16:25
    6 2 2017-04-13 01:15:20
    7 1 2017-04-12 08:16:34
    8 2 2017-04-11 09:16:20
    9 0 2017-04-10 05:16:50

The final result produced by the MapReduce job should look like this:

1 小红 男 2017-04-17 08:16:20
2 小行 女 2017-04-15 06:18:20
...

The requirement is straightforward: read login-log line by line, split each line on the space character " ", use u_id to look up the name in user-log and sex_id to look up the sex text in sex-log, then wrap the split fields together with the looked-up values in a Writable implementation class and emit one such object per line; that yields the final result.
My Writable implementation class is Person:

package com.ljy.demo15;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;


public class Person implements Writable{

    private int id;
    private String sex;
    private String name;
    private String loginfo;



    public Person() {
    }

    public Person(int id, String sex, String name, String loginfo) {
        this.id = id;
        this.sex = sex;
        this.name = name;
        this.loginfo = loginfo;
    }


    public int getId() {
        return id;
    }


    public void setId(int id) {
        this.id = id;
    }


    public String getSex() {
        return sex;
    }


    public void setSex(String sex) {
        this.sex = sex;
    }


    public String getName() {
        return name;
    }


    public void setName(String name) {
        this.name = name;
    }




    public String getLoginfo() {
        return loginfo;
    }


    public void setLoginfo(String loginfo) {
        this.loginfo = loginfo;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
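        // Fields must be read back in exactly the same order and types as write() emits them.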
        this.id = in.readInt();
        this.sex = in.readUTF();
        this.name = in.readUTF();
        this.loginfo = in.readUTF();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(id);
        out.writeUTF(sex);
        out.writeUTF(name);
        out.writeUTF(loginfo);
    }

    @Override
    public String toString() {
        return id + " " + name + " " + sex + " " + loginfo;
    }

}

My custom Mapper class:

static class DemoMapper extends Mapper<LongWritable, Text, Person, NullWritable> {
        // holds the sex-log data: sex_id -> sex text
        private Map<Integer, String> sex_map = new HashMap<>();
        // holds the user-log data: u_id -> name
        private Map<Integer, String> uer_map = new HashMap<>();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] split = line.split(" ");
            String id = split[0];
            String sexid = split[1];
            String loginfo = split[2] + " " + split[3]; // the login time itself contains a space (date + time)
            // look up the name in the user-log map by u_id
            String name = uer_map.get(Integer.parseInt(id));
            // look up the sex text in the sex-log map by sex_id
            String sex = sex_map.get(Integer.parseInt(sexid));
            // wrap the parsed and looked-up fields in the Writable implementation
            Person p = new Person(Integer.parseInt(id), sex, name, loginfo);
            context.write(p, NullWritable.get());
        }

        /**
         * Load the small lookup tables (sex-log and user-log) into memory.
         * 
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            for (Path path : paths) {
                String name = path.getName();
                // read the sex-log data and store it in the map
                if (name.equals("sex-log")) {
                    BufferedReader br = new BufferedReader(new FileReader(path.toString()));
                    String line = null;
                    while ((line = br.readLine()) != null) {
                        String[] sexs = line.split(" ");
                        sex_map.put(Integer.parseInt(sexs[0]), sexs[1]);
                    }
                    br.close();
                }
                // read the user-log data and store it in the map
                if (name.equals("user-log")) {
                    BufferedReader br = new BufferedReader(new FileReader(path.toString()));
                    String line = null;
                    while ((line = br.readLine()) != null) {
                        String[] users = line.split(" ");
                        uer_map.put(Integer.parseInt(users[0]), users[1]);
                    }
                    br.close();
                }
            }
        }

    }
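
The DistributedCache API used in setup() above is deprecated in newer Hadoop releases, which is why the complete class at the end of this post carries @SuppressWarnings("deprecation"). As a side note, a minimal sketch of the non-deprecated equivalent (not the code I actually ran) could look like the following; it assumes Hadoop 2.x behaviour, where each cache file is symlinked into the task's working directory under its base name, and it reuses the sex_map and uer_map fields from above:

// Driver side: register the lookup files (replaces job.setCacheFiles(...))
job.addCacheFile(new URI("/test/demo15/cache/sex-log"));
job.addCacheFile(new URI("/test/demo15/cache/user-log"));

// Mapper side: read the localized copies (replaces DistributedCache.getLocalCacheFiles(...))
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    URI[] cacheFiles = context.getCacheFiles();
    if (cacheFiles == null) {
        return;
    }
    for (URI uri : cacheFiles) {
        String fileName = new Path(uri.getPath()).getName();
        if (fileName.equals("sex-log") || fileName.equals("user-log")) {
            // the cache file is available as a local symlink named after the file
            try (BufferedReader br = new BufferedReader(new FileReader(fileName))) {
                Map<Integer, String> target = fileName.equals("sex-log") ? sex_map : uer_map;
                String line;
                while ((line = br.readLine()) != null) {
                    String[] parts = line.split(" ");
                    target.put(Integer.parseInt(parts[0]), parts[1]);
                }
            }
        }
    }
}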

The main method of my Driver:

public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "demo15");
            job.setJarByClass(DemoDriver.class);

            job.setMapperClass(DemoMapper.class);
            job.setMapOutputKeyClass(Person.class);
            job.setMapOutputValueClass(NullWritable.class);

            job.setOutputKeyClass(Person.class);
            job.setOutputValueClass(NullWritable.class);
            // HDFS locations of the sex-log and user-log lookup files
            job.setCacheFiles(new URI[] { new URI("/test/demo15/cache/sex-log"), new URI("/test/demo15/cache/user-log") });

            FileInputFormat.addInputPath(job, new Path("/test/demo15/input"));
            Path path = new Path("/test/demo15/output");
            FileOutputFormat.setOutputPath(job, path);
            FileSystem fs = FileSystem.get(new URI("hdfs://192.168.93.111:9000"), conf);
            if (fs.exists(path)) {
                fs.delete(path, true);
            }

            int res = job.waitForCompletion(true) ? 0 : 1;
            System.exit(res);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (URISyntaxException e) {
            e.printStackTrace();
        }
    }

I assumed this would produce the final result perfectly, but when I put the jar on Linux and ran it, the following error appeared:

/09 23:25:09 INFO mapreduce.Job:  map 0% reduce 0%
18/05/09 23:25:24 INFO mapreduce.Job: Task Id : attempt_1525876857912_0001_m_000000_0, Status : FAILED
Error: java.io.IOException: Initialization of all the collectors failed. Error in last collector was :class com.ljy.demo15.Person
    at org.apache.hadoop.mapred.MapTask.createSortingCollector(MapTask.java:415)
    at org.apache.hadoop.mapred.MapTask.access$100(MapTask.java:81)
    at org.apache.hadoop.mapred.MapTask$NewOutputCollector.<init>(MapTask.java:698)
    at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:770)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
    at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
    at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: java.lang.ClassCastException: class com.ljy.demo15.Person
    at java.lang.Class.asSubclass(Class.java:3208)
    at org.apache.hadoop.mapred.JobConf.getOutputKeyComparator(JobConf.java:887)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.init(MapTask.java:1004)
    at org.apache.hadoop.mapred.MapTask.createSortingCollector(MapTask.java:402)
    ... 9 more

Being new to this, and with the error message not pointing to any particular line of my code, I searched online. Every result I found said the problem was importing the wrong Text class - it should be the Text from org.apache.hadoop.io, but some other package's Text had been imported. I assumed mine was the same mistake, yet after checking several times, my import really was org.apache.hadoop.io.Text.
After going through many pages of Google results without an answer, I decided to set the problem aside for the moment - other work couldn't wait. As it happened, I had just bought the book Data Algorithms that day. Its first chapter covers secondary sort and points out that MapReduce sorts map output keys by default. Suddenly it clicked: back when I learned TreeMap and TreeSet, a custom class had no sorting behaviour unless it defined how to compare two of its objects. The Person class I use as the map output key needs exactly that comparison capability. Writable only covers Hadoop's serialization and deserialization; for the key to be comparable, the class must implement the WritableComparable interface and override its compareTo method. Sure enough, after changing the code I got the correct result.
The final Person class:

package com.ljy.demo15;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;


public class Person implements WritableComparable<Person> {

    private int id;
    private String sex;
    private String name;
    private String loginfo;

    public Person() {
    }

    public Person(int id, String sex, String name, String loginfo) {
        this.id = id;
        this.sex = sex;
        this.name = name;
        this.loginfo = loginfo;
    }

    public int getId() {
        return id;
    }
    public void setId(int id) {
        this.id = id;
    }
    public String getSex() {
        return sex;
    }
    public void setSex(String sex) {
        this.sex = sex;
    }
    public String getName() {
        return name;
    }
    public void setName(String name) {
        this.name = name;
    }

    public String getLoginfo() {
        return loginfo;
    }

    public void setLoginfo(String loginfo) {
        this.loginfo = loginfo;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.id = in.readInt();
        this.sex = in.readUTF();
        this.name = in.readUTF();
        this.loginfo = in.readUTF();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(id);
        out.writeUTF(sex);
        out.writeUTF(name);
        out.writeUTF(loginfo);
    }

    @Override
    public String toString() {
        return id + " " + name + " " + sex + " " + loginfo;
    }

    @Override
    public int compareTo(Person o) {
        int res = Integer.compare(this.id, o.getId()); // sort ascending by id, matching the expected output
        // Returning 0 would mark the two keys as equal, but here keys must never compare as equal,
        // so equal ids simply fall back to arrival order.
        return res == 0 ? 1 : res;
    }

}

The final Driver class:

package com.ljy.demo15;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

@SuppressWarnings("deprecation")
public class DemoDriver {
    static class DemoMapper extends Mapper<LongWritable, Text, Person, NullWritable> {

        private Map<Integer, String> sex_map = new HashMap<>();
        private Map<Integer, String> uer_map = new HashMap<>();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] split = line.split(" ");
            String id = split[0];
            String sexid = split[1];
            String loginfo = split[2] + " " + split[3]; // the login time itself contains a space (date + time)
            String name = uer_map.get(Integer.parseInt(id));
            String sex = sex_map.get(Integer.parseInt(sexid));
            Person p = new Person(Integer.parseInt(id), sex, name, loginfo);
            context.write(p, NullWritable.get());
        }

        /**
         * Load the small lookup tables into memory.
         * 
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            for (Path path : paths) {
                String name = path.getName();
                if (name.equals("t_sex")) {
                    BufferedReader br = new BufferedReader(new FileReader(path.toString()));
                    String line = null;
                    while ((line = br.readLine()) != null) {
                        String[] sexs = line.split(" ");
                        sex_map.put(Integer.parseInt(sexs[0]), sexs[1]);
                    }
                    br.close();
                }

                if (name.equals("t_user")) {
                    BufferedReader br = new BufferedReader(new FileReader(path.toString()));
                    String line = null;
                    while ((line = br.readLine()) != null) {
                        String[] users = line.split(" ");
                        uer_map.put(Integer.parseInt(users[0]), users[1]);
                    }
                    br.close();
                }
            }
        }

    }


    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "demo15");
            job.setJarByClass(DemoDriver.class);

            job.setMapperClass(DemoMapper.class);
            job.setMapOutputKeyClass(Person.class);
            job.setMapOutputValueClass(NullWritable.class);

            job.setOutputKeyClass(Person.class);
            job.setOutputValueClass(NullWritable.class);

            job.setCacheFiles(new URI[] { new URI("/test/demo15/cache/t_sex"), new URI("/test/demo15/cache/t_user") });

            FileInputFormat.addInputPath(job, new Path("/test/demo15/input"));
            Path path = new Path("/test/demo15/output");
            FileOutputFormat.setOutputPath(job, path);
            FileSystem fs = FileSystem.get(new URI("hdfs://192.168.93.111:9000"), conf);
            if (fs.exists(path)) {
                fs.delete(path, true);
            }

            int res = job.waitForCompletion(true) ? 0 : 1;
            System.exit(res);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (URISyntaxException e) {
            e.printStackTrace();
        }
    }

}

Summary: when a custom serializable object is emitted as a MapReduce key, it must implement the WritableComparable interface and override the compareTo method.
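
A related detail: the stack trace above comes from JobConf.getOutputKeyComparator(), which first looks for an explicitly registered sort comparator (job.setSortComparatorClass(...)) and only falls back to casting the map output key class to WritableComparable when none is set. Once Person implements WritableComparable, you can optionally also register a raw comparator that orders keys directly on their serialized bytes - the id is the first 4-byte int that write() emits - so the sort phase does not have to deserialize every key. The following is only a sketch of that optimization (it is not part of the fix above, and it does not reproduce the "never return 0" trick from compareTo):

import org.apache.hadoop.io.WritableComparator;

public class PersonRawComparator extends WritableComparator {

    public PersonRawComparator() {
        super(Person.class);
    }

    // Compare the leading 4-byte id field of the two serialized Person keys.
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        int id1 = readInt(b1, s1);
        int id2 = readInt(b2, s2);
        return Integer.compare(id1, id2);
    }
}

// registered in the driver with:
//     job.setSortComparatorClass(PersonRawComparator.class);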

The above only records my learning process; it is not production code. In a real production setting with a very large user base, loading the whole user-log into memory as my code does would be a poor choice: the job loses Hadoop's scalability and can easily run into out-of-memory errors. The usual scalable alternative is a reduce-side join, sketched below.
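
In a reduce-side join, each mapper tags every record with the table it came from and emits it keyed by u_id; the reducer then combines the records that share a u_id. The sketch below shows only the reducer side, under assumptions of my own (the tiny sex-log can remain an in-memory lookup, user-log records arrive tagged "U|name" and login-log records tagged "L|sex login_time" from hypothetical mappers that are not shown):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class JoinReducer extends Reducer<IntWritable, Text, Text, NullWritable> {

    @Override
    protected void reduce(IntWritable uid, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        String name = null;
        List<String> logins = new ArrayList<>();

        // separate the tagged records coming from the two inputs
        for (Text value : values) {
            String v = value.toString();
            if (v.startsWith("U|")) {          // from user-log: "U|name"
                name = v.substring(2);
            } else if (v.startsWith("L|")) {   // from login-log: "L|sex login_time"
                logins.add(v.substring(2));
            }
        }

        // emit one joined line per login record of this user
        for (String login : logins) {
            context.write(new Text(uid.get() + " " + name + " " + login), NullWritable.get());
        }
    }
}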
