Java 读取 Hudi Parquet 文件

本文演示如何使用 Java 直接读取 Hudi 生成的 Parquet 文件，并逐条打印其中的记录。

Hudi 版本：0.10.1（以下示例基于该版本）

import org.apache.avro.Schema;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

import java.io.IOException;

/**
 * Command-line utility that reads a Parquet file written by Hudi and dumps
 * every record to standard output, one {@code name: value} line per field.
 *
 * <p>The file is decoded through the Avro binding of Parquet, so each row is
 * surfaced as an Avro {@code GenericRecord} whose schema is taken from the
 * record itself.
 */
public class HudiParquetParser {

    /** Sample file that is read when no path is supplied on the command line. */
    private static final String DEFAULT_PARQUET_FILE_PATH =
            "/Users/lxq/Desktop/depark/test/parquet/user/45b734d2-bcff-406f-819f-2ba097356f10_0-2-0_20230703193743155.parquet";

    /** Line printed after each record to visually separate it from the next one. */
    private static final String RECORD_SEPARATOR = "======================";

    /**
     * Reads the Parquet file and prints all of its records.
     *
     * @param args optional; {@code args[0]} is the path of the Parquet file to
     *             read. When absent, {@link #DEFAULT_PARQUET_FILE_PATH} is used.
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        // Path of the Parquet file generated by Hudi; overridable from the command line.
        String parquetFilePath = (args != null && args.length > 0)
                ? args[0]
                : DEFAULT_PARQUET_FILE_PATH;

        // A single Configuration is shared by the input file and the reader so
        // both are guaranteed to see the same settings.
        org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();

        // Read the Parquet file; try-with-resources closes the reader on exit.
        try (ParquetReader<org.apache.avro.generic.GenericRecord> reader = AvroParquetReader
                .<org.apache.avro.generic.GenericRecord>builder(
                        HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(parquetFilePath), conf))
                .withConf(conf)
                .build()) {
            org.apache.avro.generic.GenericRecord record;
            // read() returns null once the end of the file has been reached.
            while ((record = reader.read()) != null) {
                printRecord(record);
            }
        }
    }

    /** Prints every field of {@code record} as {@code name: value}, followed by a separator line. */
    private static void printRecord(org.apache.avro.generic.GenericRecord record) {
        Schema schema = record.getSchema();
        for (Schema.Field field : schema.getFields()) {
            // The field comes from the record's own schema, so its position
            // addresses the same value as a lookup by name would.
            Object value = record.get(field.pos());
            System.out.println(field.name() + ": " + value);
        }
        System.out.println(RECORD_SEPARATOR);
    }
}

你可能感兴趣的:(java,python,开发语言)