Reading and writing n-column ORC files with MapReduce

Create a 300-column ORC table. For test data, you can use Excel to quickly build a 300-column, 10000-row sheet and save it as a tab-delimited txt file (named 300.txt here so it matches the input path the writer job below expects), then upload it to HDFS:
hdfs dfs -put 300.txt hdfs://hadoop:9000/tmp/input/
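
If Excel is not handy, the same tab-delimited file can be generated directly. A minimal sketch (the 300-column, 10000-row shape comes from above; the rXcY cell values are made up purely for illustration):

import java.io.BufferedWriter;
import java.io.FileWriter;

public class GenerateTestData {
    public static void main(String[] args) throws Exception {
        try (BufferedWriter out = new BufferedWriter(new FileWriter("300.txt"))) {
            for (int row = 0; row < 10000; row++) {
                StringBuilder line = new StringBuilder();
                for (int col = 1; col <= 300; col++) {
                    if (col > 1) line.append('\t'); // tab-delimited, as the writer job expects
                    line.append('r').append(row).append('c').append(col);
                }
                out.write(line.toString());
                out.newLine();
            }
        }
    }
}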

create table test_orc_300(
c1  string      , 
c2  string      , 
c3  string      , 
c4  string      , 
c5  string      , 
c6  string      , 
c7  string      , 
c8  string      , 
c9  string      , 
c10 string      , 
c11 string      , 
c12 string      , 
c13 string      , 
c14 string      , 
c15 string      , 
c16 string      , 
c17 string      , 
c18 string      , 
c19 string      , 
c20 string      , 
c21 string      , 
c22 string      , 
c23 string      , 
c24 string      , 
c25 string      , 
c26 string      , 
c27 string      , 
c28 string      , 
c29 string      , 
c30 string      , 
c31 string      , 
c32 string      , 
c33 string      , 
c34 string      , 
c35 string      , 
c36 string      , 
c37 string      , 
c38 string      , 
c39 string      , 
c40 string      , 
c41 string      , 
c42 string      , 
c43 string      , 
c44 string      , 
c45 string      , 
c46 string      , 
c47 string      , 
c48 string      , 
c49 string      , 
c50 string      , 
c51 string      , 
c52 string      , 
c53 string      , 
c54 string      , 
c55 string      , 
c56 string      , 
c57 string      , 
c58 string      , 
c59 string      , 
c60 string      , 
c61 string      , 
c62 string      , 
c63 string      , 
c64 string      , 
c65 string      , 
c66 string      , 
c67 string      , 
c68 string      , 
c69 string      , 
c70 string      , 
c71 string      , 
c72 string      , 
c73 string      , 
c74 string      , 
c75 string      , 
c76 string      , 
c77 string      , 
c78 string      , 
c79 string      , 
c80 string      , 
c81 string      , 
c82 string      , 
c83 string      , 
c84 string      , 
c85 string      , 
c86 string      , 
c87 string      , 
c88 string      , 
c89 string      , 
c90 string      , 
c91 string      , 
c92 string      , 
c93 string      , 
c94 string      , 
c95 string      , 
c96 string      , 
c97 string      , 
c98 string      , 
c99 string      , 
c100    string    , 
c101    string    , 
c102    string    , 
c103    string    , 
c104    string    , 
c105    string    , 
c106    string    , 
c107    string    , 
c108    string    , 
c109    string    , 
c110    string    , 
c111    string    , 
c112    string    , 
c113    string    , 
c114    string    , 
c115    string    , 
c116    string    , 
c117    string    , 
c118    string    , 
c119    string    , 
c120    string    , 
c121    string    , 
c122    string    , 
c123    string    , 
c124    string    , 
c125    string    , 
c126    string    , 
c127    string    , 
c128    string    , 
c129    string    , 
c130    string    , 
c131    string    , 
c132    string    , 
c133    string    , 
c134    string    , 
c135    string    , 
c136    string    , 
c137    string    , 
c138    string    , 
c139    string    , 
c140    string    , 
c141    string    , 
c142    string    , 
c143    string    , 
c144    string    , 
c145    string    , 
c146    string    , 
c147    string    , 
c148    string    , 
c149    string    , 
c150    string    , 
c151    string    , 
c152    string    , 
c153    string    , 
c154    string    , 
c155    string    , 
c156    string    , 
c157    string    , 
c158    string    , 
c159    string    , 
c160    string    , 
c161    string    , 
c162    string    , 
c163    string    , 
c164    string    , 
c165    string    , 
c166    string    , 
c167    string    , 
c168    string    , 
c169    string    , 
c170    string    , 
c171    string    , 
c172    string    , 
c173    string    , 
c174    string    , 
c175    string    , 
c176    string    , 
c177    string    , 
c178    string    , 
c179    string    , 
c180    string    , 
c181    string    , 
c182    string    , 
c183    string    , 
c184    string    , 
c185    string    , 
c186    string    , 
c187    string    , 
c188    string    , 
c189    string    , 
c190    string    , 
c191    string    , 
c192    string    , 
c193    string    , 
c194    string    , 
c195    string    , 
c196    string    , 
c197    string    , 
c198    string    , 
c199    string    , 
c200    string    , 
c201    string    , 
c202    string    , 
c203    string    , 
c204    string    , 
c205    string    , 
c206    string    , 
c207    string    , 
c208    string    , 
c209    string    , 
c210    string    , 
c211    string    , 
c212    string    , 
c213    string    , 
c214    string    , 
c215    string    , 
c216    string    , 
c217    string    , 
c218    string    , 
c219    string    , 
c220    string    , 
c221    string    , 
c222    string    , 
c223    string    , 
c224    string    , 
c225    string    , 
c226    string    , 
c227    string    , 
c228    string    , 
c229    string    , 
c230    string    , 
c231    string    , 
c232    string    , 
c233    string    , 
c234    string    , 
c235    string    , 
c236    string    , 
c237    string    , 
c238    string    , 
c239    string    , 
c240    string    , 
c241    string    , 
c242    string    , 
c243    string    , 
c244    string    , 
c245    string    , 
c246    string    , 
c247    string    , 
c248    string    , 
c249    string    , 
c250    string    , 
c251    string    , 
c252    string    , 
c253    string    , 
c254    string    , 
c255    string    , 
c256    string    , 
c257    string    , 
c258    string    , 
c259    string    , 
c260    string    , 
c261    string    , 
c262    string    , 
c263    string    , 
c264    string    , 
c265    string    , 
c266    string    , 
c267    string    , 
c268    string    , 
c269    string    , 
c270    string    , 
c271    string    , 
c272    string    , 
c273    string    , 
c274    string    , 
c275    string    , 
c276    string    , 
c277    string    , 
c278    string    , 
c279    string    , 
c280    string    , 
c281    string    , 
c282    string    , 
c283    string    , 
c284    string    , 
c285    string    , 
c286    string    , 
c287    string    , 
c288    string    , 
c289    string    , 
c290    string    , 
c291    string    , 
c292    string    , 
c293    string    , 
c294    string    , 
c295    string    , 
c296    string    , 
c297    string    , 
c298    string    , 
c299    string    , 
c300    string      
) stored as orc;
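
Typing 300 near-identical column definitions by hand is error-prone; a small sketch that prints the same DDL as above:

public class GenerateDdl {
    public static void main(String[] args) {
        StringBuilder sb = new StringBuilder("create table test_orc_300(\n");
        for (int i = 1; i <= 300; i++) {
            sb.append("c").append(i).append(" string").append(i < 300 ? ",\n" : "\n");
        }
        sb.append(") stored as orc;");
        System.out.println(sb);
    }
}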

Reading ORC files

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcInputFormat;
import java.io.IOException;


public class OrcReaderMR {
    public static class OrcMap extends Mapper<NullWritable, OrcStruct, NullWritable, Text> {
        private Text text = new Text();

        @Override
        public void map(NullWritable key, OrcStruct value,
                        Context output) throws IOException, InterruptedException {
            // Concatenate every field of the row into one tab-delimited line.
            StringBuilder bf = new StringBuilder();
            for (int i = 0; i < value.getNumFields(); i++) {
                WritableComparable fieldValue = value.getFieldValue(i);
                // null-safe: a null column becomes an empty string
                bf.append(fieldValue == null ? "" : fieldValue.toString()).append("\t");
            }
            text.set(bf.toString());
            output.write(NullWritable.get(), text);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop:9000");
        conf.set("mapreduce.application.classpath", System.getProperty("user.dir"));
        System.setProperty("HADOOP_USER_NAME", "root");
        // local dev-environment setting: point at the Hadoop installation
        System.setProperty("hadoop.home.dir", "/opt/hadoop-2.7.3/");

        Job job = Job.getInstance(conf);
        job.setJarByClass(OrcReaderMR.class);
        job.setJobName("OrcReaderMR");
        job.setMapperClass(OrcMap.class);
        job.setInputFormatClass(OrcInputFormat.class);
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(TextOutputFormat.class);

        // input path: the Hive table's warehouse directory
        Path inputPath = new Path("/user/hive/warehouse/test_orc_300");

        // output path for the text dump (deleted first if it already exists)
        Path outputPath = new Path("/user/hive/warehouse/test_orc_300_out");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        boolean waitForCompletion = job.waitForCompletion(true);
        System.exit(waitForCompletion ? 0 : 1);
    }
}
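
Because OrcInputFormat hands the mapper an OrcStruct whose schema comes from the file's own footer, the loop over getNumFields() adapts to any column count; nothing in the mapper is specific to 300 columns.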

After the job finishes, inspect the output file:

hdfs dfs -cat hdfs://hadoop:9000/user/hive/warehouse/test_orc_300_out/part-m-00000
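
The ORC files in the table directory can also be inspected locally, without launching a job, through the core ORC Reader API. A minimal sketch, assuming orc-core is on the classpath; the part file name below is a guess, so list the directory for the real one:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

public class OrcSpotCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop:9000");
        // assumed file name; check the actual contents of the table directory
        Path path = new Path("/user/hive/warehouse/test_orc_300/part-m-00000.orc");
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
        try (RecordReader rows = reader.rows()) {
            VectorizedRowBatch batch = reader.getSchema().createRowBatch();
            while (rows.nextBatch(batch)) {
                // all columns are strings, so each column vector is a BytesColumnVector
                BytesColumnVector c1 = (BytesColumnVector) batch.cols[0];
                for (int r = 0; r < batch.size; r++) {
                    System.out.println(c1.toString(r)); // print column c1 as a sanity check
                }
            }
        }
    }
}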

Writing ORC files

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.orc.OrcConf;
import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcOutputFormat;

import java.io.IOException;

public class OrcWriterMR {

    // The 300-column schema literal ("struct<c1:string,c2:string,...,c300:string>")
    // is unwieldy to spell out, so build the equivalent string programmatically.
    static String buildSchema(int cols) {
        StringBuilder sb = new StringBuilder("struct<");
        for (int i = 1; i <= cols; i++) {
            if (i > 1) sb.append(',');
            sb.append('c').append(i).append(":string");
        }
        return sb.append('>').toString();
    }

    public static class OrcWriterMapper
            extends Mapper<LongWritable, Text, NullWritable, OrcStruct> {

        private TypeDescription schema =
                TypeDescription.fromString(buildSchema(300));

        private OrcStruct pair = (OrcStruct) OrcStruct.createValue(schema);

        private final NullWritable nada = NullWritable.get();

        @Override
        public void map(LongWritable key, Text value,
                        Context output) throws IOException, InterruptedException {
            if (!"".equals(value.toString())) {
                // split with limit -1 so trailing empty columns are preserved
                String[] arr = value.toString().split("\t", -1);
                for (int n = 0; n < 300; n++) {
                    pair.setFieldValue(n, new Text(arr[n]));
                }
                output.write(nada, pair);
            }
        }
    }



    public static void main(String[] args) throws Exception {
        // HDFS-related configuration for the job
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop:9000");
        conf.set("mapreduce.application.classpath", System.getProperty("user.dir"));
        OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf, buildSchema(300)); // must match the mapper's TypeDescription

        // settings for submitting to a real cluster
        // conf.set("mapred.jar", System.getProperty("user.dir")+"/WordCount.jar");
        System.setProperty("HADOOP_USER_NAME", "root");
        // local dev-environment setting: point at the Hadoop installation
        System.setProperty("hadoop.home.dir", "/opt/hadoop-2.7.3/");

        // set the MapReduce framework; yarn is the default anyway
        // conf.set("mapreduce.framework.name", "yarn");
        // conf.set("yarn.resourcemanager.hostname", "hadoop");

        // create the job
        Job job = Job.getInstance(conf);

        // jar containing the job classes
        job.setJarByClass(OrcWriterMR.class);
        job.setJobName("OrcWriterMR");
        job.setNumReduceTasks(0);
        // set the mapper class (map-only job, so no reducer)
        job.setMapperClass(OrcWriterMapper.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(OrcOutputFormat.class);
        // input path: the tab-delimited test file uploaded earlier
        Path inputPath = new Path("/tmp/input/300.txt");
        // output path: the Hive table's warehouse directory (deleted first if it already exists)
        Path outputPath = new Path("/user/hive/warehouse/test_orc_300");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean waitForCompletion = job.waitForCompletion(true);
        System.exit(waitForCompletion ? 0 : 1);
    }
}
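
One thing to watch: the schema passed to OrcConf.MAPRED_OUTPUT_SCHEMA and the TypeDescription inside the mapper must describe the same struct, which is why both are produced by the same buildSchema(300) call; if they drift apart you can expect runtime errors when the fields are written.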

After the job finishes, check the table contents:
select * from test_orc_300;
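
If Hive is not available, the ORC footer alone answers the basic questions. A minimal sketch that prints the schema and row count (again, the part file name is an assumption; list the directory for the real one):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;

public class OrcFooterCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop:9000");
        Path path = new Path("/user/hive/warehouse/test_orc_300/part-m-00000.orc");
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
        System.out.println("schema: " + reader.getSchema());       // struct<c1:string,...,c300:string>
        System.out.println("rows:   " + reader.getNumberOfRows()); // 10000 if the whole file was written
    }
}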
