Developing and Using a Custom UDF in Pig

This article uses Pig to process application logs stored in Hadoop. The sample log lines to be parsed look like the following:

2019-11-11 12:24:10.472 INFO [resin-port-9001-48][ContentOperationController.java:367] - Collection events:eventsType=operationPage;mac=08674E4296AF;sn=12033500205FF008674E4296AF;userId=12488488;userType=vod;parentColumnId=6029;columnId=6831;nowSpm=6029.PAGE_SER01100604.6831.0.1573446250464;afterSpm=6029.PAGE_SER01100602.6833.0.1573446249270.4;pos=POS_LIST;posName=列表;createTime=2019-11-11 12:24:10:END
2019-11-11 12:24:12.433 INFO [resin-port-9003-1315][CommonAuthService.java:162] - Collection events:eventsType=auth_product;mac=74FF4CDA27DA;sn=1202216140576074FF4CDA27DA;userId=13497550;userType=VOD;contentId=13567;contentType=1;parentColumnId=6029;code=S100000;message=鉴权通过;operateType=auth_product;createTime=2019-11-11 12:24:12:END
2019-11-11 12:24:18.946 INFO [resin-port-9002-42][ResumePointController.java:161] - Collection events:eventsType=operateResumePoint;mac=FCE19240AA9E;sn=12034100105010FCE19240AA9E;userId=11393439;userType=vod;parentColumnId=6029;columnId=0;contentId=12415;contentType=1;operateType=get;createTime=2019-11-11 12:24:18:END
2019-11-18 13:55:10.355 INFO [resin-port-9002-45][BookMarkController.java:97] - Collection events:eventsType=operateBookMark;mac=FCE192398070;sn=12094100205010FCE192398070;userId=13293012;userType=vod;parentColumnId=6029;columnId=0;contentId=12993;contentType=1;operateType=add;createTime=2019-11-18 13:55:10:END
2019-11-18 13:59:20.814 INFO [resin-port-9002-1626][SearchContentController.java:122] - Collection events:eventsType=operationPage;mac=A089E4B645FF;sn=12032100705760A089E4B645FF;userId=11411625;userType=vod;parentColumnId=6029;columnId=6035;nowSpm=6029.PAGE_SEARCH.6035.0.1574056760814;afterSpm=6029.PAGE_ALBUM_DETAILS.6035.14437.1574056752311.4;pos=POS_SEARCH;posName=搜索;createTime=2019-11-18 13:59:20:END

From the logs above we need to extract the key/value pairs that sit between the ".* Collection events:" prefix and the trailing ":END" marker.
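
As a warm-up, here is a minimal standalone sketch of that extraction (the class name ExtractDemo and the shortened sample line are illustrative only): strip everything up to "Collection events:" and the trailing ":END", then split the remainder on ";" and on the first "=" of each token. The custom Storage developed below applies the same logic to every input line.

import java.util.HashMap;
import java.util.Map;

// Illustrative standalone demo; not part of the UDF itself.
public class ExtractDemo {
    public static void main(String[] args) {
        String line = "2019-11-18 13:55:10.355 INFO [resin-port-9002-45][BookMarkController.java:97]"
                + " - Collection events:eventsType=operateBookMark;mac=FCE192398070;userId=13293012;"
                + "operateType=add;createTime=2019-11-18 13:55:10:END";
        // Drop everything up to "Collection events:" and the trailing ":END"
        String body = line.replaceFirst("^.*Collection events:", "")
                          .replaceFirst(":END(\n){0,1}$", "");
        Map<String, String> fields = new HashMap<>();
        for (String pair : body.split(";")) {
            String[] kv = pair.split("=", 2); // split only on the first '=' so values may contain '='
            if (kv.length == 2) {
                fields.put(kv[0], kv[1]);
            }
        }
        System.out.println(fields.get("eventsType"));  // operateBookMark
        System.out.println(fields.get("createTime"));  // 2019-11-18 13:55:10
    }
}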

The detailed steps are as follows.

Step 1: Create a Java Maven project and add the Pig development dependencies to pom.xml, as shown below:

 
    <dependency>
      <groupId>org.apache.pig</groupId>
      <artifactId>pig</artifactId>
      <version>0.17.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-core</artifactId>
      <version>1.2.1</version>
    </dependency>

 

Step 2: Develop the custom Storage class

package com.ppfuns;

import org.apache.commons.cli.*;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.pig.*;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextInputFormat;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextOutputFormat;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore;
import org.apache.pig.builtin.JsonMetadata;
import org.apache.pig.bzip2r.Bzip2TextInputFormat;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.util.*;

import java.io.IOException;
import java.util.*;

/**
 * A custom Pig load/store function, modeled on PigStorage, that strips the
 * "Collection events:" prefix and the ":END" suffix from each log line, splits
 * the remainder into key=value pairs and maps the values onto a fixed schema
 * (see listSchema()).
 */
public class PpfunsStorage extends FileInputLoadFunc implements StoreFuncInterface, LoadPushDown, LoadMetadata, StoreMetadata, OverwritableStoreFunc {
    protected RecordReader in;
    protected RecordWriter writer;
    protected final Log mLog;
    protected String udfcSignature;
    private static final String PPFUNS_SIGNATURE = "pig.ppfunsloader.schema";
    private ArrayList mProtoTuple;
    private TupleFactory mTupleFactory;
    private String loadLocation;
    boolean isSchemaOn;
    boolean dontLoadSchema;
    boolean overwriteOutput;
    private byte fieldDel;
    protected ResourceSchema schema;
    protected LoadCaster caster;
    protected boolean[] mRequiredColumns;
    private boolean mRequiredColumnsInitialized;
    private boolean tagFile;
    private static final String TAG_SOURCE_FILE = "tagFile";
    private boolean tagPath;
    private static final String TAG_SOURCE_PATH = "tagPath";
    private Path sourcePath;
    private boolean bzipinput_usehadoops;

    // Regex/delimiters used to carve the key=value payload out of each raw log line
    private final static String PPFUNS_REPLACE_PREFIX = "^.*Collection events:";
    private final static String PPFUNS_REPLACE_SUFFIX = ":END(\n){0,1}$";
    private final static String PPFUNS_PARAM_SPLIT_SIGN = ";"; // delimiter between key=value pairs
    private final static String PPFUNS_KEY_VALUE_SPLIT_SIGN = "="; // delimiter between key and value
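    // Constructors follow the PigStorage convention: an optional field delimiter
    // (used on the store side, default ',') plus an options string such as
    // '-schema', '-noschema', '-tagFile', '-tagPath' or '-overwrite true'.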
    public PpfunsStorage(){
        this(",");
    }
    public PpfunsStorage(String delimiter) {
        this(delimiter, "");
    }
    public PpfunsStorage(String delimiter,String options){
        this.in = null;
        this.writer = null;
        this.mLog = LogFactory.getLog(this.getClass());
        this.fieldDel = 9;
        this.mProtoTuple = null;
        this.mTupleFactory = TupleFactory.getInstance();
        this.isSchemaOn = false;
        this.dontLoadSchema = false;
        this.overwriteOutput = false;
        this.mRequiredColumns = null;
        this.mRequiredColumnsInitialized = false;
        this.tagFile = false;
        this.tagPath = false;
        this.sourcePath = null;
        this.fieldDel = StorageUtil.parseFieldDel(delimiter);
        Options validOptions = this.populateValidOptions();
        String[] optsArr = options.split(" ");

        try {
            GnuParser e = new GnuParser();
            CommandLine formatter1 = e.parse(validOptions, optsArr);
            this.isSchemaOn = formatter1.hasOption("schema");
            if(formatter1.hasOption("overwrite")) {
                String value = formatter1.getOptionValue("overwrite");
                if("true".equalsIgnoreCase(value)) {
                    this.overwriteOutput = true;
                }
            }

            this.dontLoadSchema = formatter1.hasOption("noschema");
            this.tagFile = formatter1.hasOption("tagFile");
            this.tagPath = formatter1.hasOption("tagPath");
            if(formatter1.hasOption("tagsource")) {
                this.mLog.warn("\'-tagsource\' is deprecated. Use \'-tagFile\' instead.");
                this.tagFile = true;
            }

        } catch (ParseException var8) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("PpfunsStorage(\',\', \'[options]\')", validOptions);
            throw new RuntimeException(var8);
        }
    }


    public void setLocation(String location, Job job) throws IOException {
        this.loadLocation = location;
        FileInputFormat.setInputPaths(job, location);
        this.bzipinput_usehadoops = job.getConfiguration().getBoolean("pig.bzip.use.hadoop.inputformat", true);
    }
    public InputFormat getInputFormat() {
        if((this.loadLocation.endsWith(".bz2") || this.loadLocation.endsWith(".bz")) && !this.bzipinput_usehadoops) {
            this.mLog.info("Using Bzip2TextInputFormat");
            return new Bzip2TextInputFormat();
        } else {
            this.mLog.info("Using PigTextInputFormat");
            return new PigTextInputFormat();
        }
    }
    public void prepareToRead(RecordReader reader, PigSplit split) {
        this.in = reader;
        if(this.tagFile || this.tagPath) {
            this.sourcePath = ((FileSplit)split.getWrappedSplit()).getPath();
        }

    }

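    /**
     * Reads one log line, strips the "Collection events:" prefix and the ":END"
     * suffix, splits the remainder into key=value pairs and builds the output
     * tuple in the order of the fixed schema (see listSchema()); keys that are
     * missing from the line become null fields.
     */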
    @Override
    public Tuple getNext() throws IOException {
        this.mProtoTuple = new ArrayList();
        if(!this.mRequiredColumnsInitialized) {
            if(this.udfcSignature != null) {
                // Read from the same UDFContext properties that pushProjection() writes to,
                // so the serialized column mask (if any) can actually be found here.
                Properties e = UDFContext.getUDFContext().getUDFProperties(this.getClass());
                this.mRequiredColumns = (boolean[]) ObjectSerializer.deserialize(e.getProperty(PPFUNS_SIGNATURE));
            }

            this.mRequiredColumnsInitialized = true;
        }

        if(this.tagFile) {
            this.mProtoTuple.add(new DataByteArray(this.sourcePath.getName()));
        } else if(this.tagPath) {
            this.mProtoTuple.add(new DataByteArray(this.sourcePath.toString()));
        }

        try {
            boolean var9 = this.in.nextKeyValue();
            if(!var9) {
                return null;
            } else {
                Text var10 = (Text)this.in.getCurrentValue();
                String vart = var10.toString();
                if(vart != null){
                    String t = vart.replaceFirst(PPFUNS_REPLACE_PREFIX,"").replaceFirst(PPFUNS_REPLACE_SUFFIX,""); // strip the "Collection events:" prefix and ":END" suffix
                    String[] arr = t.split(PPFUNS_PARAM_SPLIT_SIGN);
                    Map<String, String> map = new HashMap<>();
                    for(String a : arr){
                        String[] keyValue = a.split(PPFUNS_KEY_VALUE_SPLIT_SIGN, 2);    // split into key and value on the first '='
                        if(keyValue.length == 2) {    // skip malformed tokens that have no '='
                            map.put(keyValue[0], keyValue[1]);
                        }
                    }
                    addTupleValue2(map);
                }
                Tuple var12 = this.mTupleFactory.newTupleNoCopy(this.mProtoTuple);
                return var12;
            }
        } catch (InterruptedException var8) {
            short errCode = 6018;
            String errMsg = "Error while reading input";
            throw new ExecException(errMsg, errCode, (byte)16, var8);
        }
    }
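    /**
     * Turns the parsed key/value map into tuple fields: for every field declared
     * in the schema, look up its value by name, cast it to the declared type via
     * the LoadCaster, and append it to mProtoTuple (null when the key is absent
     * or empty).
     */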
    private void addTupleValue2(Map<String, String> map) throws IOException {
        if(this.caster == null) {
            this.caster = this.getLoadCaster();
        }
        if(this.schema == null) listSchema();
        if(this.schema != null) {
            ResourceSchema.ResourceFieldSchema[] var8 = this.schema.getFields();
            int index;
            for (index = 0; index < var8.length; ++index) {
                String value = map.get(var8[index].getName());
                if(!StringUtils.isEmpty(value)){
                    Object converted = CastUtils.convertToType(this.caster, value.getBytes(), var8[index], var8[index].getType());
                    mLog.info("Converted value: key=" + var8[index].getName() + ";value=" + converted);
                    mProtoTuple.add(converted);
                }else {
                    mProtoTuple.add(null);
                }

            }
        }
    }
    public void putNext(Tuple f) throws IOException {
        try {
            this.writer.write((Object)null, f);
        } catch (InterruptedException var3) {
            throw new IOException(var3);
        }
    }

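    /**
     * Column-pruning hook (LoadPushDown): records which columns the script really
     * uses and serializes that boolean mask into the UDFContext. Note that getNext()
     * currently builds the full tuple regardless of this mask.
     */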
    public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList) throws FrontendException {
        if(requiredFieldList == null) {
            mLog.info("pushProjection: requiredFieldList is null");
            return null;
        } else {
            if(requiredFieldList.getFields() != null) {
                int lastColumn = -1;
                Iterator p = requiredFieldList.getFields().iterator();

                RequiredField e;
                while(p.hasNext()) {
                    e = (RequiredField)p.next();
                    mLog.info("pushProjection (first pass): alias=" + e.getAlias() + ";index=" + e.getIndex() + ";type=" + e.getType());
                    if(e.getIndex() > lastColumn) {
                        lastColumn = e.getIndex();
                    }
                }

                this.mRequiredColumns = new boolean[lastColumn + 1];
                p = requiredFieldList.getFields().iterator();

                while(p.hasNext()) {
                    e = (RequiredField)p.next();
                    mLog.info("pushProjection (second pass): alias=" + e.getAlias() + ";index=" + e.getIndex() + ";type=" + e.getType());
                    if(e.getIndex() != -1) {
                        this.mRequiredColumns[e.getIndex()] = true;
                    }
                }

                Properties p1 = UDFContext.getUDFContext().getUDFProperties(this.getClass());

                try {
                    p1.setProperty(this.PPFUNS_SIGNATURE, ObjectSerializer.serialize(this.mRequiredColumns));
                } catch (Exception var5) {
                    throw new RuntimeException("Cannot serialize mRequiredColumns");
                }
            }

            return new RequiredFieldResponse(true);
        }
    }
    public boolean equals(Object obj) {
        return obj instanceof PpfunsStorage && this.equals((PpfunsStorage) obj);
    }

    public boolean equals(PpfunsStorage other) {
        return this.fieldDel == other.fieldDel;
    }
    private Options populateValidOptions() {
        Options validOptions = new Options();
        validOptions.addOption("schema", false, "Loads / Stores the schema of the relation using a hidden JSON file.");
        validOptions.addOption("noschema", false, "Disable attempting to load data schema from the filesystem.");
        validOptions.addOption("tagFile", false, "Appends input source file name to beginning of each tuple.");
        validOptions.addOption("tagPath", false, "Appends input source file path to beginning of each tuple.");
        validOptions.addOption("tagsource", false, "Appends input source file name to beginning of each tuple.");
        Option overwrite = new Option("overwrite", "Overwrites the destination.");
        overwrite.setLongOpt("overwrite");
        overwrite.setOptionalArg(true);
        overwrite.setArgs(1);
        overwrite.setArgName("overwrite");
        validOptions.addOption(overwrite);
        return validOptions;
    }
    public OutputFormat getOutputFormat() {
        return new PigTextOutputFormat(this.fieldDel);
    }

    public void prepareToWrite(RecordWriter writer) {
        this.writer = writer;
    }

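    // Store-side configuration: honours Pig's output.compression.* settings when present,
    // otherwise picks a codec from the output file extension (.bz2/.bz -> BZip2, .gz -> Gzip).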
    public void setStoreLocation(String location, Job job) throws IOException {
        job.getConfiguration().set("mapred.textoutputformat.separator", "");
        FileOutputFormat.setOutputPath(job, new Path(location));
        if("true".equals(job.getConfiguration().get("output.compression.enabled"))) {
            FileOutputFormat.setCompressOutput(job, true);
            String codec = job.getConfiguration().get("output.compression.codec");

            try {
                FileOutputFormat.setOutputCompressorClass(job, (Class) Class.forName(codec));
            } catch (ClassNotFoundException var5) {
                throw new RuntimeException("Class not found: " + codec);
            }
        } else {
            this.setCompression(new Path(location), job);
        }

    }

    private void setCompression(Path path, Job job) {
        String location = path.getName();
        if(!location.endsWith(".bz2") && !location.endsWith(".bz")) {
            if(location.endsWith(".gz")) {
                FileOutputFormat.setCompressOutput(job, true);
                FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
            } else {
                FileOutputFormat.setCompressOutput(job, false);
            }
        } else {
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        }

    }

    public void checkSchema(ResourceSchema s) throws IOException {
    }

    public String relToAbsPathForStoreLocation(String location, Path curDir) throws IOException {
        return LoadFunc.getAbsolutePath(location, curDir);
    }

    public int hashCode() {
        return this.fieldDel;
    }

    public void setUDFContextSignature(String signature) {
        this.udfcSignature = signature;
    }

    public List getFeatures() {
        return Arrays.asList(new OperatorSet[]{OperatorSet.PROJECTION});
    }

    public void setStoreFuncUDFContextSignature(String signature) {
    }

    public void cleanupOnFailure(String location, Job job) throws IOException {
        StoreFunc.cleanupOnFailureImpl(location, job);
    }

    public void cleanupOnSuccess(String location, Job job) throws IOException {
    }
    private ResourceSchema.ResourceFieldSchema addFieldSchema(String name){
        Schema.FieldSchema c = new Schema.FieldSchema(name, DataType.CHARARRAY);
        ResourceSchema.ResourceFieldSchema fs =new ResourceSchema.ResourceFieldSchema(c);
        return fs;
    }
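    /**
     * Builds the fixed schema of the parsed log fields (all chararray except
     * createTime, which is datetime). The field names must match the keys that
     * appear in the "Collection events:" payload.
     */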
    private void listSchema(){
        String f = "eventsType:chararray,mac:chararray,sn:chararray,userId:chararray,userType:chararray,parentColumnId:chararray,columnId:chararray,nowSpm:chararray,afterSpm:chararray,posName:chararray,contentId:chararray,contentType:chararray,operateType:chararray,keyWord:chararray,createTime:datetime";
        String[] params =  f.split(",");
        ResourceSchema.ResourceFieldSchema[] list =  new ResourceSchema.ResourceFieldSchema[params.length];
        int i=0;
        for(String s : params){
            String[] kv = s.split(":");
            Schema.FieldSchema c = new Schema.FieldSchema(kv[0],  DataType.findTypeByName(kv[1]));
            ResourceSchema.ResourceFieldSchema fs =new ResourceSchema.ResourceFieldSchema(c);
            list[i++]=fs;
        }
        schema = new ResourceSchema();
        schema.setFields(list);
    }
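    /**
     * Returns the schema to the frontend. Because listSchema() always populates
     * the hard-coded schema, the JsonMetadata-based fallback below is effectively
     * never reached.
     */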
    public ResourceSchema getSchema(String location, Job job) throws IOException {
        listSchema();
        if(schema != null) return schema;
        mLog.info("getSchema: dontLoadSchema=" + dontLoadSchema);
        if(!this.dontLoadSchema) {
            this.schema = (new JsonMetadata()).getSchema(location, job, this.isSchemaOn);
            mLog.info("getSchema: signature=" + udfcSignature + ";isSchemaOn=" + isSchemaOn);
            if(this.schema !=null){
                for(String t : this.schema.fieldNames()){
                    mLog.info("getSchema: field=" + t);
                }
            }
            if(this.udfcSignature != null && this.schema != null) {
                if(this.tagFile) {
                    this.schema = Utils.getSchemaWithInputSourceTag(this.schema, "INPUT_FILE_NAME");
                } else if(this.tagPath) {
                    this.schema = Utils.getSchemaWithInputSourceTag(this.schema, "INPUT_FILE_PATH");
                }

                Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[]{this.udfcSignature});
                p.setProperty(this.udfcSignature + ".schema", this.schema.toString());
            }
        }

        return this.schema;
    }

    public ResourceStatistics getStatistics(String location, Job job) throws IOException {
        return null;
    }

    public void setPartitionFilter(Expression partitionFilter) throws IOException {
    }

    public String[] getPartitionKeys(String location, Job job) throws IOException {
        return null;
    }

    public void storeSchema(ResourceSchema schema, String location, Job job) throws IOException {
        if(this.isSchemaOn) {
            JsonMetadata metadataWriter = new JsonMetadata();
            byte recordDel = 10;
            metadataWriter.setFieldDel(this.fieldDel);
            metadataWriter.setRecordDel(recordDel);
            metadataWriter.storeSchema(schema, location, job);
        }

    }

    public void storeStatistics(ResourceStatistics stats, String location, Job job) throws IOException {
    }

    public boolean shouldOverwrite() {
        return this.overwriteOutput;
    }

    public void cleanupOutput(POStore store, Job job) throws IOException {
        Configuration conf = job.getConfiguration();
        String output = conf.get("mapred.output.dir");
        Path outputPath = null;
        if(output != null) {
            outputPath = new Path(output);
        }

        FileSystem fs = outputPath.getFileSystem(conf);

        try {
            fs.delete(outputPath, true);
        } catch (Exception var8) {
            this.mLog.warn("Could not delete output " + output);
        }

    }

    public static void main(String[] args) {
        System.out.println(new PpfunsStorage().getClass());
    }
}
 
  

 

Step 3: Package PpfunsStorage.java into a JAR. Two recommended approaches:

  1. Export it directly from Eclipse (select PpfunsStorage.java, right-click, Export -> Java (JAR file)); this is the simplest way.
  2. Build the JAR with IntelliJ IDEA; see the video under https://github.com/sunnyJam/code2/tree/master/pig_udf/doc for reference.

 

Step 4: Register and run

  1. Start Pig in MapReduce mode: pig -x mapreduce
  2. Register the custom UDF: REGISTER pig_udf.jar;
  3. Define an alias for the class: DEFINE PpfunsStorage  com.ppfuns.PpfunsStorage;
  4. Load the data and declare the field schema:
    dataLogs = LOAD '/data/logs/2019/11/{11,18}'
       USING PpfunsStorage()
       as (eventsType:chararray,mac:chararray, sn:chararray, userId:chararray, userType:chararray,
       parentColumnId:chararray,columnId:chararray,nowSpm:chararray,afterSpm:chararray,
       posName:chararray,contentId:chararray,contentType:chararray,
       operateType:chararray,keyWord:chararray,createTime:datetime);
  5. Dump the data: dump dataLogs;

The console session for the steps above looks like this:

[root@hadoop-01 app]# pig -x mapreduce
2020-03-02 10:21:22,648 INFO pig.ExecTypeProvider: Trying ExecType : LOCAL
2020-03-02 10:21:22,649 INFO pig.ExecTypeProvider: Trying ExecType : MAPREDUCE
2020-03-02 10:21:22,649 INFO pig.ExecTypeProvider: Picked MAPREDUCE as the ExecType
2020-03-02 10:21:22,685 [main] INFO  org.apache.pig.Main - Apache Pig version 0.17.0 (r1797386) compiled Jun 02 2017, 15:41:58
2020-03-02 10:21:22,685 [main] INFO  org.apache.pig.Main - Logging error messages to: /home/app/pig_1583115682682.log
2020-03-02 10:21:22,700 [main] INFO  org.apache.pig.impl.util.Utils - Default bootup file /root/.pigbootup not found
2020-03-02 10:21:22,902 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - mapred.job.tracker is deprecated. Instead, use mapreduce.jobtracker.address
2020-03-02 10:21:22,902 [main] INFO  org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: hdfs://hadoop-cluster
2020-03-02 10:21:23,330 [main] INFO  org.apache.pig.PigServer - Pig Script ID for the session: PIG-default-873b155f-7ae5-49c7-abf3-6ee29ae29ff7
2020-03-02 10:21:23,330 [main] WARN  org.apache.pig.PigServer - ATS is disabled since yarn.timeline-service.enabled set to false
grunt> REGISTER pig_udf.jar;
grunt> DEFINE PpfunsStorage  com.ppfuns.PpfunsStorage;
grunt> dataLogs = LOAD '/data/logs/2019/11/{11,18}'
>>    USING PpfunsStorage()
>>    as (eventsType:chararray,mac:chararray, sn:chararray, userId:chararray, userType:chararray,
>>    parentColumnId:chararray,columnId:chararray,nowSpm:chararray,afterSpm:chararray,
>>    posName:chararray,contentId:chararray,contentType:chararray,
>>    operateType:chararray,keyWord:chararray,createTime:datetime);
grunt> dump dataLogs;

 
