A quick note on how to write and read an Apache ORC file with Java.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.sql.Timestamp;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
/**
 * Minimal example that writes a small Apache ORC file with the ORC core (vectorized) Java API
 * and then reads it back, printing every row to standard output.
 */
public class ORCReader {

    /** Pattern used to parse the sample timestamp on write and to print timestamps on read. */
    private static final String TIMESTAMP_PATTERN = "yyyy-MM-dd'T'HH:mm:ss.SSSZ";

    /** Formatter for the {@code b-date} column; built once because it never changes. */
    private static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd");

    /** Number of sample rows written by {@link #testWrite()}. */
    private static final int ROW_COUNT = 500;

    /**
     * Writes {@link #ROW_COUNT} sample rows (string, date, double, timestamp, string) to a new
     * ORC file in the directory named by the {@code java.io.tmpdir} system property.
     *
     * @return the path of the ORC file that was written
     * @throws IOException if the file cannot be created or written
     * @throws ParseException if the hard-coded sample timestamp cannot be parsed
     */
    private static String testWrite() throws IOException, ParseException {
        Configuration conf = new Configuration();
        // An equivalent schema can also be parsed from its string form with
        // TypeDescription.fromString(...).
        TypeDescription schema = TypeDescription.createStruct()
                .addField("a-string", TypeDescription.createString())
                .addField("b-date", TypeDescription.createDate())
                .addField("c-double", TypeDescription.createDouble())
                .addField("d-time", TypeDescription.createTimestamp())
                .addField("e-string", TypeDescription.createString());

        String orcFile = System.getProperty("java.io.tmpdir") + File.separator
                + "orc-test-" + System.currentTimeMillis() + ".orc";
        // Single call instead of exists() followed by delete(): no check-then-act gap.
        Files.deleteIfExists(Paths.get(orcFile));

        // Loop-invariant values, computed once instead of once per row.
        long firstEpochDay = LocalDate.parse("2019-07-22").toEpochDay();
        Timestamp sampleTime = new Timestamp(
                new SimpleDateFormat(TIMESTAMP_PATTERN).parse("2019-07-22T01:12:37.758-0500").getTime());

        Writer writer = OrcFile.createWriter(new Path(orcFile),
                OrcFile.writerOptions(conf).setSchema(schema));
        try {
            VectorizedRowBatch batch = schema.createRowBatch();
            BytesColumnVector a = (BytesColumnVector) batch.cols[0];
            LongColumnVector b = (LongColumnVector) batch.cols[1];
            DoubleColumnVector c = (DoubleColumnVector) batch.cols[2];
            TimestampColumnVector d = (TimestampColumnVector) batch.cols[3];
            BytesColumnVector e = (BytesColumnVector) batch.cols[4];

            for (int r = 0; r < ROW_COUNT; ++r) {
                int row = batch.size++;
                // Encode explicitly as UTF-8 rather than with the platform default charset.
                a.setVal(row, ("a-" + r).getBytes(StandardCharsets.UTF_8));
                // The date column holds days since the epoch; each row is one day earlier.
                b.vector[row] = firstEpochDay - r;
                c.vector[row] = r;
                d.set(row, sampleTime);
                e.setVal(row, ("e-" + r).getBytes(StandardCharsets.UTF_8));
                // If the batch is full, write it out and start over.
                if (batch.size == batch.getMaxSize()) {
                    writer.addRowBatch(batch);
                    batch.reset();
                }
            }
            // Flush the final, partially filled batch.
            if (batch.size != 0) {
                writer.addRowBatch(batch);
                batch.reset();
            }
        } finally {
            // Close even when a write fails so the underlying stream is not leaked.
            writer.close();
        }
        return orcFile;
    }

    /**
     * Reads the ORC file produced by {@link #testWrite()} and prints the schema, the column
     * count, each column's vector type and every row to standard output.
     *
     * <p>The five columns are cast to the vector types of the schema written by
     * {@link #testWrite()}; a file with a different layout fails with a
     * {@link ClassCastException} or {@link ArrayIndexOutOfBoundsException}.
     *
     * @param filePath path of the ORC file to read
     * @throws IOException if the file cannot be opened or read
     */
    private static void readTest(String filePath) throws IOException {
        Configuration conf = new Configuration();
        conf.setAllowNullValueProperties(true);
        Reader reader = OrcFile.createReader(new Path(filePath), OrcFile.readerOptions(conf));
        RecordReader rows = reader.rows();
        try {
            VectorizedRowBatch batch = reader.getSchema().createRowBatch();
            System.out.println("schema:" + reader.getSchema());
            System.out.println("numCols:" + batch.numCols);
            // Column vector types are fixed when the batch is created, so print them once
            // rather than once per batch.
            for (int col = 0; col < batch.numCols; col++) {
                System.out.println("args = [" + batch.cols[col].type + "]");
            }

            SimpleDateFormat timestampFormat = new SimpleDateFormat(TIMESTAMP_PATTERN);
            while (rows.nextBatch(batch)) {
                BytesColumnVector cols0 = (BytesColumnVector) batch.cols[0];
                LongColumnVector cols1 = (LongColumnVector) batch.cols[1];
                DoubleColumnVector cols2 = (DoubleColumnVector) batch.cols[2];
                TimestampColumnVector cols3 = (TimestampColumnVector) batch.cols[3];
                BytesColumnVector cols4 = (BytesColumnVector) batch.cols[4];

                // NOTE(review): null entries (noNulls / isNull) are not checked for the
                // date, double and timestamp columns; the file written above contains none.
                for (int r = 0; r < batch.size; r++) {
                    String a = cols0.toString(r);
                    String b = LocalDate.ofEpochDay(cols1.vector[valueIndex(cols1, r)]).format(DATE_FORMAT);
                    double c = cols2.vector[valueIndex(cols2, r)];
                    // The scratch timestamp is reused by the vector, so format it right away.
                    Timestamp d = cols3.asScratchTimestamp(valueIndex(cols3, r));
                    String e = cols4.toString(r);
                    System.out.println(a + ", " + b + ", " + c + ", " + timestampFormat.format(d) + ", " + e);
                }
            }
        } finally {
            // Close even when reading fails so the record reader is not leaked.
            rows.close();
        }
    }

    /**
     * Maps a logical row to the slot that holds its value: slot 0 when the vector is flagged
     * {@code isRepeating}, otherwise the row itself.
     */
    private static int valueIndex(ColumnVector vector, int row) {
        return vector.isRepeating ? 0 : row;
    }

    /**
     * Writes the sample ORC file and then reads it back.
     *
     * @param args ignored
     * @throws IOException if writing or reading the file fails
     * @throws ParseException if the sample timestamp cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        String filePath = testWrite();
        readTest(filePath);
    }
}
Inspect the ORC file with orc-tools.
- orc-tools can convert CSV/JSON to ORC, but that conversion did not support the date type.
We can use orc-tools to check a file's data or metadata.
- Look up the data:
java -jar ~/software/orc-tools-1.5.6-uber.jar data /var/folders/ch/9l8p3shx5rdbzyh6dr17ljnm0000gp/T/orc-test-1564154755670.orc
log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
Processing data file /var/folders/ch/9l8p3shx5rdbzyh6dr17ljnm0000gp/T/orc-test-1564154755670.orc [length: 3118]
{"a-string":"a-0","b-date":"2019-07-22","c-double":0,"d-time":"2019-07-22 01:12:37.758","e-string":"e-0"}
{"a-string":"a-1","b-date":"2019-07-21","c-double":1,"d-time":"2019-07-22 01:12:37.758","e-string":"e-1"}
{"a-string":"a-2","b-date":"2019-07-20","c-double":2,"d-time":"2019-07-22 01:12:37.758","e-string":"e-2"}
{"a-string":"a-3","b-date":"2019-07-19","c-double":3,"d-time":"2019-07-22 01:12:37.758","e-string":"e-3"}
{"a-string":"a-4","b-date":"2019-07-18","c-double":4,"d-time":"2019-07-22 01:12:37.758","e-string":"e-4"}
{"a-string":"a-5","b-date":"2019-07-17","c-double":5,"d-time":"2019-07-22 01:12:37.758","e-string":"e-5"}
{"a-string":"a-6","b-date":"2019-07-16","c-double":6,"d-time":"2019-07-22 01:12:37.758","e-string":"e-6"}
{"a-string":"a-7","b-date":"2019-07-15","c-double":7,"d-time":"2019-07-22 01:12:37.758","e-string":"e-7"}
{"a-string":"a-8","b-date":"2019-07-14","c-double":8,"d-time":"2019-07-22 01:12:37.758","e-string":"e-8"}
{"a-string":"a-9","b-date":"2019-07-13","c-double":9,"d-time":"2019-07-22 01:12:37.758","e-string":"e-9"}
- Look up the metadata:
java -jar ~/software/orc-tools-1.5.6-uber.jar meta /var/folders/ch/9l8p3shx5rdbzyh6dr17ljnm0000gp/T/orc-test-1564154755670.orc
log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
Processing data file /var/folders/ch/9l8p3shx5rdbzyh6dr17ljnm0000gp/T/orc-test-1564154755670.orc [length: 3118]
Structure for /var/folders/ch/9l8p3shx5rdbzyh6dr17ljnm0000gp/T/orc-test-1564154755670.orc
File Version: 0.12 with ORC_517
Rows: 500
Compression: ZLIB
Compression size: 262144
Type: struct<`a-string`:string,`b-date`:date,`c-double`:double,`d-time`:timestamp,`e-string`:string>
Stripe Statistics:
Stripe 1:
Column 0: count: 500 hasNull: false
Column 1: count: 500 hasNull: false bytesOnDisk: 904 min: a-0 max: a-99 sum: 2390
Column 2: count: 500 hasNull: false bytesOnDisk: 9 min: 2018-03-10 max: 2019-07-22
Column 3: count: 500 hasNull: false bytesOnDisk: 667 min: 0.0 max: 499.0 sum: 124750.0
Column 4: count: 500 hasNull: false bytesOnDisk: 19 min: 2019-07-22 01:12:37.758 max: 2019-07-22 01:12:37.758
Column 5: count: 500 hasNull: false bytesOnDisk: 904 min: e-0 max: e-99 sum: 2390
File Statistics:
Column 0: count: 500 hasNull: false
Column 1: count: 500 hasNull: false bytesOnDisk: 904 min: a-0 max: a-99 sum: 2390
Column 2: count: 500 hasNull: false bytesOnDisk: 9 min: 2018-03-10 max: 2019-07-22
Column 3: count: 500 hasNull: false bytesOnDisk: 667 min: 0.0 max: 499.0 sum: 124750.0
Column 4: count: 500 hasNull: false bytesOnDisk: 19 min: 2019-07-22 01:12:37.758 max: 2019-07-22 01:12:37.758
Column 5: count: 500 hasNull: false bytesOnDisk: 904 min: e-0 max: e-99 sum: 2390
Stripes:
Stripe: offset: 3 data: 2503 rows: 500 tail: 104 index: 173
Stream: column 0 section ROW_INDEX start: 3 length 12
Stream: column 1 section ROW_INDEX start: 15 length 34
Stream: column 2 section ROW_INDEX start: 49 length 27
Stream: column 3 section ROW_INDEX start: 76 length 36
Stream: column 4 section ROW_INDEX start: 112 length 30
Stream: column 5 section ROW_INDEX start: 142 length 34
Stream: column 1 section DATA start: 176 length 891
Stream: column 1 section LENGTH start: 1067 length 13
Stream: column 2 section DATA start: 1080 length 9
Stream: column 3 section DATA start: 1089 length 667
Stream: column 4 section DATA start: 1756 length 11
Stream: column 4 section SECONDARY start: 1767 length 8
Stream: column 5 section DATA start: 1775 length 891
Stream: column 5 section LENGTH start: 2666 length 13
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
Encoding column 3: DIRECT
Encoding column 4: DIRECT_V2
Encoding column 5: DIRECT_V2
File length: 3118 bytes
Padding length: 0 bytes
Padding ratio: 0%
References:
- ORC Tools https://orc.apache.org/docs/java-tools.html
- ORC Tools Source Code https://github.com/apache/orc/tree/master/java/tools