Spark: org.apache.spark.SparkException: Task not serializable

While debugging a Spark SQL job written in Java that reads an HBase table, I ran into the exception in the title. Below I first describe how the problem appeared, and then how it was solved.

First, when the code was run in a standalone class, it successfully returned the HBase data and displayed it:

public class testSQLFinal {
    public static void main(String[] args) throws IOException {
        // Set Spark properties
        System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        SparkSession spark = SparkSession.builder()
                .appName("wu_java_read_hbase_register_to_table")
                .master("local[4]")
                .getOrCreate();
        JavaSparkContext context = new JavaSparkContext(spark.sparkContext());
        // Set HBase connection parameters
        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", "192.168.80.182,192.168.80.183,192.168.80.184");
        configuration.set("hbase.zookeeper.property.clientPort", "2181");
        configuration.set("hbase.master", "192.168.80.181:60000");
        Scan scan = new Scan();
        String tableName = "t1";
        configuration.set(TableInputFormat.INPUT_TABLE, tableName);

        // HBase helper class: creates the HBase instance and exposes HBase CRUD methods
        IHBaseOperation ihBaseOperation = HBaseOperationImpl.getInstance(configuration);

        // Column family and qualifier (qualifier -> family), e.g. info:name
        final Map<String, String> cfq = ihBaseOperation.getFamiliesAndQualifiersAByTable("default", "t1");

        // Query the data
        ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
        String ScanToString = Base64.encodeBytes(proto.toByteArray());
        configuration.set(TableInputFormat.SCAN, ScanToString);
        JavaPairRDD<ImmutableBytesWritable, Result> myRDD = context.newAPIHadoopRDD(configuration,
                TableInputFormat.class, ImmutableBytesWritable.class, Result.class);

        JavaRDD<Row> personsRDD = myRDD.map(new Function<Tuple2<ImmutableBytesWritable, Result>, Row>() {
            @Override
            public Row call(Tuple2<ImmutableBytesWritable, Result> tuple) throws Exception {
                System.out.println("====tuple==========" + tuple);
                Result result = tuple._2();
                String rowkey = Bytes.toString(result.getRow());
                List<String> list = new ArrayList<>();
                list.add(rowkey);
                for (Map.Entry<String, String> entry : cfq.entrySet()) {
                    String cf = entry.getValue();
                    String col = entry.getKey();
                    String s = Bytes.toString(result.getValue(Bytes.toBytes(cf), Bytes.toBytes(col)));
                    list.add(s);
                }
                // Convert the tuple data into a Row for the row RDD
                String[] fields = list.toArray(new String[list.size()]);
                return RowFactory.create(fields);
            }
        });

        List<StructField> structFields = new ArrayList<>();
        structFields.add(DataTypes.createStructField("rowkey", DataTypes.StringType, true));
        List<String> fields = new ArrayList<>(cfq.keySet());
        for (int i = 0; i < fields.size(); i++) {
            structFields.add(DataTypes.createStructField(fields.get(i), DataTypes.StringType, true));
        }
        StructType schema = DataTypes.createStructType(structFields);
        Dataset<Row> df = spark.createDataFrame(personsRDD, schema);
        df.createOrReplaceTempView("c1");
        Dataset<Row> nameDf = spark.sql("select * from c1");
        nameDf.show();
    }
}
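
As a side note, the spark.serializer property can also be passed straight to the SparkSession builder instead of going through System.setProperty; a minimal, equivalent sketch:

        // Equivalent way to enable Kryo: set the property on the builder itself
        SparkSession spark = SparkSession.builder()
                .appName("wu_java_read_hbase_register_to_table")
                .master("local[4]")
                .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                .getOrCreate();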

Second, for reusability, the above logic was wrapped into a method of the HBaseOperationImpl implementation class, as follows:

    public Dataset<Row> getAllDatas(String nameSpace, String tableName) throws IOException {
        Scan scan = new Scan();
        final Map<String, String> columnFamiliesQualier = getFamiliesAndQualifiersAByTable(nameSpace, tableName);
        ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
        String ScanToString = Base64.encodeBytes(proto.toByteArray());
        conf.set(TableInputFormat.SCAN, ScanToString);
//        JavaSparkContext context = new JavaSparkContext(spark.sparkContext());
        JavaSparkContext context = sparkAppConf.javaSparkContext();
        JavaPairRDD<ImmutableBytesWritable, Result> myRDD = context.newAPIHadoopRDD(conf,
                TableInputFormat.class, ImmutableBytesWritable.class, Result.class);
        JavaRDD<Row> rowRDD = myRDD.map(new Function<Tuple2<ImmutableBytesWritable, Result>, Row>() {
            // After the code is wrapped into this method, the anonymous Function below is
            // where the "Task not serializable" error is raised
            @Override
            public Row call(Tuple2<ImmutableBytesWritable, Result> tuple) throws Exception {
                System.out.println("====tuple==========" + tuple);
                Result result = tuple._2();
                String rowkey = Bytes.toString(result.getRow());
                List<String> list = new ArrayList<>();
                list.add(rowkey);
                for (Map.Entry<String, String> entry : columnFamiliesQualier.entrySet()) {
                    String cf = entry.getValue();
                    String col = entry.getKey();
                    String s = Bytes.toString(result.getValue(Bytes.toBytes(cf), Bytes.toBytes(col)));
                    list.add(s);
                }
                // Convert the tuple data into a Row for the row RDD
                String[] fields = list.toArray(new String[list.size()]);
                return RowFactory.create(fields);
            }
        });
        List<StructField> structFields = new ArrayList<>();
        structFields.add(DataTypes.createStructField("rowkey", DataTypes.StringType, true));
        List<String> fields = new ArrayList<>(columnFamiliesQualier.keySet());
        for (int i = 0; i < fields.size(); i++) {
            structFields.add(DataTypes.createStructField(fields.get(i), DataTypes.StringType, true));
        }
        StructType schema = DataTypes.createStructType(structFields);
        Dataset<Row> df = spark.createDataFrame(rowRDD, schema);
        df.createOrReplaceTempView("temp");
        Dataset<Row> nameDf = spark.sql("select * from temp");
        return nameDf;
    }

Solution:

An anonymous inner class declared in an instance method implicitly holds a reference to its enclosing object, so when Spark serializes the map function it also tries to serialize the non-serializable HBaseOperationImpl instance; in the standalone main method there was no enclosing instance to capture, which is why the first version worked. The fix is to extract the anonymous inner class used in map into a separate serializable class and call that class from this method.

public class Mapper implements Function<Tuple2<ImmutableBytesWritable, Result>, Row>, Serializable {
    private static final long serialVersionUID = 42L;

    private Map<String, String> columnFamiliesQualier;

    public Mapper(Map<String, String> columnFamiliesQualier) {
        this.columnFamiliesQualier = columnFamiliesQualier;
    }

    @Override
    public Row call(Tuple2<ImmutableBytesWritable, Result> tuple) throws IOException {
//        System.out.println("====tuple==========" + tuple);
        Result result = tuple._2();
        String rowkey = Bytes.toString(result.getRow());
        List<String> list = new ArrayList<>();
        list.add(rowkey);
        for (Map.Entry<String, String> entry : columnFamiliesQualier.entrySet()) {
            String cf = entry.getValue();
            String col = entry.getKey();
            String s = Bytes.toString(result.getValue(Bytes.toBytes(cf), Bytes.toBytes(col)));
            list.add(s);
        }
        String[] fields = list.toArray(new String[list.size()]);
        return RowFactory.create(fields);
    }
}
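
With the Mapper class in place, the anonymous inner class in getAllDatas is replaced by a Mapper instance, so Spark only has to serialize the Mapper and the Map it carries, not the enclosing HBaseOperationImpl. A minimal sketch (variable names follow the snippets above):

        // Inside getAllDatas: use the serializable Mapper instead of the anonymous Function,
        // so the enclosing HBaseOperationImpl instance is no longer captured in the closure
        JavaRDD<Row> rowRDD = myRDD.map(new Mapper(columnFamiliesQualier));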

The solution was worked out with reference to the following posts:

https://stackoverflow.com/questions/30828595/spark-notserializableexception

http://bighow.org/questions/30828595/spark-notserializableexception

https://blog.csdn.net/javastart/article/details/50845767

http://mangocool.com/detail_1_1439971291385.html

 
