HBase: import and export code for TXT, CSV, ORC and Parquet data

The code below is entirely original. It focuses on getting the functionality working; performance on large-scale data still leaves room for improvement. Straight to the code:

Import:

 public ImportDataResponse importData(String connectionID, String dataBase, String tableName, String sourceFileType, String sourceFilePath) throws Exception {
        ImportDataResponse importDataResponse = new ImportDataResponse();
        orc orc = new orc();

        File file = new File(sourceFilePath);
        BufferedReader reader = null;
        String temp = null;
        String rowkey = null;
        String tmpvalue = null;
        try {
            if (ConnectionImpl.mapConnPool.get(connectionID) == null) {
                importDataResponse.setTaskStatus(0);
                importDataResponse.setErrorMsg("Please establish a connection first");
            } else {
                Configuration conf = (Configuration) ConnectionImpl.mapConnPool.get(connectionID);
                Connection conn = ConnectionFactory.createConnection(conf);
                Table table = conn.getTable(TableName.valueOf("default", tableName));
                List<Put> listPut = new ArrayList<>();
                if (sourceFileType.equals("TXT") || sourceFileType.equals("CSV")) {
                    reader = new BufferedReader(new FileReader(file));
                    while ((temp = reader.readLine()) != null) {
                        // Each line is expected to look like "rowkey,value"; everything after
                        // the first comma becomes the cell value.
                        rowkey = temp.split(",")[0];
                        Put put = new Put(rowkey.getBytes());
                        tmpvalue = temp.substring(rowkey.length() + 1);

                        put.addColumn("family".getBytes(), ("field").getBytes(), tmpvalue.getBytes());
                        listPut.add(put);
                    }
                } else if (sourceFileType.equals("ORC")) {
                    String reader1 = orc.readerOrc(sourceFilePath);
                    String[] split = reader1.substring(0, reader1.length() - 1).split("\t");
                    for (int i = 0; i < split.length; i++) {
                        rowkey = split[i].split("&")[0];
                        Put put = new Put(rowkey.getBytes());
                        tmpvalue = split[i].split("&")[1];
                        put.addColumn("family".getBytes(), ("field").getBytes(), tmpvalue.getBytes());
                        listPut.add(put);
                    }

                }else if(sourceFileType.equals("Parquet")){
                    Path file1 = new Path(sourceFilePath);
                    Builder builder = ParquetReader.builder(new GroupReadSupport(), file1);
                    ParquetReader reader1 = builder.build();
                    Group line=null;
                    while((line=reader1.read())!=null){
                        rowkey = line.getString("rowkey", 0);
//                        line.getString("family",0);
//                        line.getString("field",0);
                        tmpvalue = line.getString("value", 0);
                        Put put = new Put(rowkey.getBytes());
                        put.addColumn("family".getBytes(), ("field").getBytes(), tmpvalue.getBytes());
                        listPut.add(put);
                        System.out.println("success!");
                    }

                }else {
                    System.out.println("please input right Document Format!");
                }
                table.put(listPut);
                table.close();
                conn.close();
                importDataResponse.setTaskStatus(1);
                importDataResponse.setTable(tableName);
            }
        } catch (Exception e) {
            e.printStackTrace();
            importDataResponse.setTaskStatus(0);
            importDataResponse.setErrorCode(11);
            importDataResponse.setErrorMsg(e.getMessage());
        } finally {
            // Release the file handle even if the import fails part-way through.
            if (reader != null) {
                reader.close();
            }
        }
        return importDataResponse;
    }
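
A quick usage note: the TXT/CSV branch expects one record per line in the form "rowkey,value", the ORC branch expects the four-column layout ("rowkey,family,field,value") produced by the orc helper below, and the Parquet branch expects records with "rowkey" and "value" string fields. A minimal caller might look like the hypothetical sketch below (placed in a method that declares throws Exception); the connection ID, table name and file path are placeholders, and the getters on ImportDataResponse are assumed to mirror the setters used above:

// Hypothetical usage sketch, not from the original code. The connection ID must already
// be registered in ConnectionImpl.mapConnPool and the table must have a "family" column family.
ImportDataResponse resp = importData(
        "conn-1",               // placeholder connection ID
        "default",              // namespace / database
        "demo_table",           // placeholder table name
        "CSV",                  // one of TXT, CSV, ORC, Parquet
        "/tmp/demo_table.csv"); // each line: rowkey,value
if (resp.getTaskStatus() == 1) {
    System.out.println("import finished for table " + resp.getTable());
} else {
    System.out.println("import failed: " + resp.getErrorMsg());
}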

The orc class:

package com.Distributedcolumndatabase.Management;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.List;
import java.util.Properties;


public class orc {
    String path;

    /**
     * Read an ORC file and return its rows as tab-separated "rowkey&value" pairs.
     *
     * @param path path of the ORC file to read
     * @throws IOException
     * @throws SerDeException
     */
    public String readerOrc(String path) throws IOException, SerDeException {
        StringBuilder str = new StringBuilder();

        JobConf conf = new JobConf();
        Path testFilePath = new Path(path);
        Properties p = new Properties();
        OrcSerde serde = new OrcSerde();
        p.setProperty("columns", "rowkey,family,field,value");
        p.setProperty("columns.types", "string:string:string:string");
        serde.initialize(conf, p);
        StructObjectInspector inspector = (StructObjectInspector) serde.getObjectInspector();
        InputFormat in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, testFilePath.toString());
        InputSplit[] splits = in.getSplits(conf, 1);
        System.out.println("splits.length==" + splits.length);

        conf.set("hive.io.file.readcolumn.ids", "1");
        org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        Object key = reader.createKey();
        Object value = reader.createValue();
        List<? extends StructField> fields = inspector.getAllStructFieldRefs();
        while (reader.next(key, value)) {
            // Only the rowkey (column 0) and value (column 3) are carried back to the caller.
            Object rowkey = inspector.getStructFieldData(value, fields.get(0));
            Object fieldValue = inspector.getStructFieldData(value, fields.get(3));
            str.append(rowkey).append("&").append(fieldValue).append("\t");
        }
        reader.close();
        return str.toString();
    }

    /**
     * Write a single row to an ORC file.
     *
     * @param path  output path of the ORC file
     * @param myrow row to write
     * @throws IOException
     */
    public void writerOrc(Path path, MyRow myrow) throws IOException {
        JobConf conf = new JobConf();
        FileSystem fs = FileSystem.get(conf);
        StructObjectInspector inspector =
                (StructObjectInspector) ObjectInspectorFactory
                        .getReflectionObjectInspector(MyRow.class,
                                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
        OrcSerde serde = new OrcSerde();
        OutputFormat outFormat = new OrcOutputFormat();
        RecordWriter writer = outFormat.getRecordWriter(fs, conf,
                path.toString(), Reporter.NULL);
        writer.write(NullWritable.get(),
                serde.serialize(myrow, inspector));
        writer.close(Reporter.NULL);
        fs.close();
        System.out.println("write success .");
    }

    static class MyRow implements Writable {
        String rowkey;
        String family;
        String field;
        String value;

        public MyRow(String rowkey, String family, String field, String value) {
            this.rowkey = rowkey;
            this.family = family;
            this.field = field;
            this.value = value;
        }

        public String getValue() {
            return value;
        }

        public void setValue(String value) {
            this.value = value;
        }

        public String getRowkey() {
            return rowkey;
        }

        public void setRowkey(String rowkey) {
            this.rowkey = rowkey;
        }

        public String getFamily() {
            return family;
        }

        public void setFamily(String family) {
            this.family = family;
        }

        public String getField() {
            return field;
        }

        public void setField(String field) {
            this.field = field;
        }

        @Override
        public void write(DataOutput dataOutput) throws IOException {
            throw new UnsupportedOperationException("write not supported");
        }

        @Override
        public void readFields(DataInput dataInput) throws IOException {
            throw new UnsupportedOperationException("readFields not supported");
        }
    }

}
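
To sanity-check the helper on its own, one can write a single row with writerOrc and read it back with readerOrc. The class below is a hypothetical round-trip check, not part of the original code; it assumes it lives in the same package (MyRow is package-private), that the default Hadoop configuration points at a writable file system, and that the output path does not exist yet:

package com.Distributedcolumndatabase.Management;

import org.apache.hadoop.fs.Path;

// Hypothetical round-trip check for the orc helper.
public class OrcDemo {
    public static void main(String[] args) throws Exception {
        orc helper = new orc();
        Path out = new Path("/tmp/orc_demo.orc");   // placeholder path; must not exist yet
        helper.writerOrc(out, new orc.MyRow("row1", "family", "field", "value1"));
        // readerOrc returns tab-separated "rowkey&value" pairs, e.g. "row1&value1\t"
        System.out.println(helper.readerOrc("/tmp/orc_demo.orc"));
    }
}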

Export:

public ExportDataResponse exportData(String connectionID, String dataBase, String tableName, String destFileType, String destFilePath) {
        ExportDataResponse exportDataResponse = new ExportDataResponse();

        if (ConnectionImpl.mapConnPool.get(connectionID) == null) {
            exportDataResponse.setTaskStatus(0);
            exportDataResponse.setErrorMsg("Please establish a connection first");
        } else {
            Configuration conf = (Configuration) ConnectionImpl.mapConnPool.get(connectionID);
            Connection conn = null;
            try {
                conn = ConnectionFactory.createConnection(conf);
                Table table = conn.getTable(TableName.valueOf("default", tableName));
                // The table must stay open while the scanner is being consumed, so it is
                // closed after the export loops below rather than here.
                Scan scan = new Scan();
                ResultScanner resultScanner = table.getScanner(scan);
                if (destFileType.equals("TXT") || destFileType.equals("CSV")) {
                    File file = new File(destFilePath);
                    FileWriter writer = new FileWriter(file);
                    for (Result result : resultScanner) {
                        List<Cell> keyValueList = result.listCells();
                        for (Cell cell : keyValueList) {
                            System.out.println("row=" + new String(CellUtil.cloneRow(cell)) + " " + "columnfamily=" + new String(CellUtil.cloneFamily(cell)) + " " + "column=" + new String(CellUtil.cloneQualifier(cell)) + " " + "value=" + new String(CellUtil.cloneValue(cell)));
                            String str = new String(CellUtil.cloneRow(cell)) + "," + new String(CellUtil.cloneFamily(cell)) + "," + new String(CellUtil.cloneQualifier(cell)) + "," + new String(CellUtil.cloneValue(cell)) + "\n";
                            writer.write(str);
                        }
                    }
                    writer.flush();
                    writer.close();
                } else if (destFileType.equals("ORC")) {
                    //Path outputPath = new Path(destFilePath);
                    JobConf conf1 = new JobConf();
                    FileSystem fs = FileSystem.get(conf1);
                    Path outputPath = new Path(destFilePath);
                    StructObjectInspector inspector =
                            (StructObjectInspector) ObjectInspectorFactory
                                    .getReflectionObjectInspector(com.paas.Distributedcolumndatabase.Management.orc.MyRow.class,
                                            ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
                    OrcSerde serde = new OrcSerde();
                    OutputFormat outFormat = new OrcOutputFormat();
                    RecordWriter writer = outFormat.getRecordWriter(fs, conf1,
                            outputPath.toString(), Reporter.NULL);
                    for (Result result : resultScanner) {
                        List keyValueList = result.listCells();
                        for (Cell cell : keyValueList) {
                            // System.out.println("row=" + new String(CellUtil.cloneRow(cell)) + " " + "columefamily=" + new String(CellUtil.cloneFamily(cell)) + " " + "colume = " + new String(CellUtil.cloneFamily(cell) + " " + "value=" + new String(CellUtil.cloneValue(cell)));
                            String str = new String(CellUtil.cloneRow(cell)) + "," + new String(CellUtil.cloneFamily(cell)) + "," + new String(CellUtil.cloneQualifier(cell)) + "," + new String(CellUtil.cloneValue(cell)) + "\n";
                            //orc.writerOrc(outputPath, new orc.MyRow(new String(CellUtil.cloneRow(cell)), new String(CellUtil.cloneFamily(cell)), new String(CellUtil.cloneFamily(cell)), new String(CellUtil.cloneValue(cell))));
                            writer.write(NullWritable.get(),
                                    serde.serialize(new orc.MyRow(new String(CellUtil.cloneRow(cell)), new String(CellUtil.cloneFamily(cell)), new String(CellUtil.cloneQualifier(cell)), new String(CellUtil.cloneValue(cell))), inspector));
                        }
                    }
                    writer.close(Reporter.NULL);
                    fs.close();
                    System.out.println("write success .");
                }else if(destFileType.equals("Parquet")){
                    //构建schema
                     String schemaStr = "message schema {" + "repeated binary rowkey;"
                            +"repeated binary family;"+ "repeated binary colume;"+"repeated binary value;}";
                     MessageType schema = MessageTypeParser.parseMessageType(schemaStr);

                    Path file = new Path(destFilePath);
                    ExampleParquetWriter.Builder builder = ExampleParquetWriter
                            .builder(file).withWriteMode(ParquetFileWriter.Mode.CREATE)
                            .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
                            .withCompressionCodec(CompressionCodecName.SNAPPY)
                            //.withConf(configuration)
                            .withType(schema);
                    ParquetWriter<Group> writer = builder.build();
                    SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
                    for (Result result : resultScanner) {
                        List<Cell> keyValueList = result.listCells();
                        for (Cell cell : keyValueList) {
                            System.out.println("row=" + new String(CellUtil.cloneRow(cell)) + " " + "columnfamily=" + new String(CellUtil.cloneFamily(cell)) + " " + "column=" + new String(CellUtil.cloneQualifier(cell)) + " " + "value=" + new String(CellUtil.cloneValue(cell)));
                            String[] str = {new String(CellUtil.cloneRow(cell)), new String(CellUtil.cloneFamily(cell)), new String(CellUtil.cloneQualifier(cell)), new String(CellUtil.cloneValue(cell))};
                            writer.write(groupFactory.newGroup()
                                    .append("rowkey", str[0])
                                    .append("family", str[1])
                                    .append("column", str[2])
                                    .append("value", str[3]));
                            System.out.println("=======================");
                        }
                    }
                    writer.close();
                } else {
                    System.out.println("Please specify a valid file format: TXT, CSV, ORC or Parquet.");
                }
                resultScanner.close();
                table.close();
                conn.close();
                exportDataResponse.setTaskStatus(1);
                exportDataResponse.setTable(tableName);
                exportDataResponse.setErrorCode(0);
            } catch (IOException e) {
                exportDataResponse.setTaskStatus(0);
                exportDataResponse.setTable(tableName);
                exportDataResponse.setErrorMsg(e.getMessage());
                exportDataResponse.setErrorCode(11);
            }
        }
        return exportDataResponse;
    }
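
On the performance caveat from the introduction: importData buffers every Put in one in-memory list and writes it with a single table.put(), and exportData pulls the whole table through one client-side scan, so neither will scale to very large tables. One incremental improvement is to stream the writes through HBase's BufferedMutator instead of a List. The sketch below is only an illustration under the same "rowkey,value" line format, with conn, tableName and sourceFilePath assumed to come from the surrounding code:

// Sketch only: stream Puts through a BufferedMutator instead of buffering them all in a List.
// Assumes an open org.apache.hadoop.hbase.client.Connection named conn, an existing table
// with a "family" column family, and "rowkey,value" input lines.
try (BufferedMutator mutator = conn.getBufferedMutator(TableName.valueOf("default", tableName));
     BufferedReader in = new BufferedReader(new FileReader(sourceFilePath))) {
    String line;
    while ((line = in.readLine()) != null) {
        String rowkey = line.split(",")[0];
        Put put = new Put(rowkey.getBytes());
        put.addColumn("family".getBytes(), "field".getBytes(),
                line.substring(rowkey.length() + 1).getBytes());
        mutator.mutate(put);  // buffered client-side and flushed in batches automatically
    }
    mutator.flush();          // push any mutations still sitting in the buffer
}

For truly large datasets, HBase's bulk-load tooling (writing HFiles and loading them directly) is the usual approach.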

 
