6. A Custom Hadoop Writable Data Type

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.yinbodotcc</groupId>
  <artifactId>countwords</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>countwords</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <hadoop.version>3.0.3</hadoop.version>
    <hive.version>0.13.1</hive.version>
    <hbase.version>0.98.6-hadoop2</hbase.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.mrunit</groupId>
      <artifactId>mrunit</artifactId>
      <version>1.1.0</version>
      <classifier>hadoop2</classifier>
      <scope>test</scope>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client-minicluster</artifactId>
      <version>3.0.3</version>
      <scope>test</scope>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>3.0.3</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.5.1</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.5.0</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.1</version>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
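With this pom.xml, running mvn clean package builds the job jar as target/countwords-0.0.1-SNAPSHOT.jar. The mrunit and hadoop-client-minicluster dependencies are test-scoped, so they are only on the classpath for local tests and are not packaged into the job's runtime classpath.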

1. The custom Writable data type: LogWritable

package chapter4;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

/**
 * HTTP server log processing sample for the Chapter 4 of Hadoop MapReduce
 * Cookbook. 
 * 
 * @author Thilina Gunarathne
 */
public class LogWritable implements WritableComparable<LogWritable> {

    private Text userIP, timestamp, request;    
    private IntWritable responseSize, status;   

    public LogWritable() {
        this.userIP = new Text();
        this.timestamp =  new Text();
        this.request = new Text();
        this.responseSize = new IntWritable();
        this.status = new IntWritable();        
    }
    
    public void set(String userIP, String timestamp, String request, int bytes, int status) {
        this.userIP.set(userIP);
        this.timestamp.set(timestamp);
        this.request.set(request);
        this.responseSize.set(bytes);
        this.status.set(status);    
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        userIP.readFields(in);
        timestamp.readFields(in);
        request.readFields(in);
        responseSize.readFields(in);
        status.readFields(in);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        userIP.write(out);
        timestamp.write(out);
        request.write(out);
        responseSize.write(out);
        status.write(out);
    }
    
    /* Sort by userIP first; break ties with the timestamp. */
    @Override
    public int compareTo(LogWritable o) {
        if (userIP.compareTo(o.userIP) == 0) {
            return timestamp.compareTo(o.timestamp);
        }
        return userIP.compareTo(o.userIP);
    }

    /* Hash on userIP only, so HashPartitioner keeps a user's records together
       whenever LogWritable is used as a key. */
    @Override
    public int hashCode() {
        return userIP.hashCode();
    }

    public Text getUserIP() {
        return userIP;
    }

    public Text getTimestamp() {
        return timestamp;
    }

    public Text getRequest() {
        return request;
    }

    public IntWritable getResponseSize() {
        return responseSize;
    }

    public IntWritable getStatus() {
        return status;
    }
}
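Because Writable serialization carries no field names, write() and readFields() must handle the fields in exactly the same order. A minimal round-trip sketch (a hypothetical check, not part of the original recipe; the sample log values are illustrative):

package chapter4;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class LogWritableRoundTrip {
    public static void main(String[] args) throws Exception {
        LogWritable original = new LogWritable();
        original.set("199.72.81.55", "01/Jul/1995:00:00:01 -0400",
                "GET /history/apollo/ HTTP/1.0", 6245, 200);

        // Serialize the record the same way the framework does between map and reduce.
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));

        // Read the bytes back into a fresh instance; fields return in write order.
        LogWritable copy = new LogWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));

        System.out.println(copy.getUserIP() + " " + copy.getResponseSize()); // 199.72.81.55 6245
    }
}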

2. The Mapper implementation: LogProcessorMap

package chapter4;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * HTTP server log processing sample for the Chapter 4 of Hadoop MapReduce
 * Cookbook. 
 * 
 * @author Thilina Gunarathne
 */
public class LogProcessorMap extends Mapper<LongWritable, Text, Text, LogWritable> {

    // Compile the Apache Common Log Format pattern once, not on every map() call.
    private static final Pattern LOG_ENTRY_PATTERN = Pattern.compile(
            "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+)");

    LogWritable outValue = new LogWritable();
    Text outKey = new Text();

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        Matcher matcher = LOG_ENTRY_PATTERN.matcher(value.toString());
        if (!matcher.matches()) {
            System.err.println("Bad Record : "+value);
            return;
        }
        
        String userIP = matcher.group(1);
        String timestamp = matcher.group(4);
        String request = matcher.group(5);
        int status = Integer.parseInt(matcher.group(6));
        int bytes = Integer.parseInt(matcher.group(7));
        
        outKey.set(userIP);
        outValue.set(userIP, timestamp, request,
                bytes,status);
        context.write(outKey,outValue);
    }
    
}
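The regular expression expects Apache Common Log Format. An illustrative line from the NASA trace referenced at the end of this post:

199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245

Against this line, group(1) is the client IP, group(4) the timestamp, group(5) the request, group(6) the HTTP status (200), and group(7) the response size in bytes (6245).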

3. The Reducer implementation: LogProcessorReduce

package chapter4;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * HTTP server log processing sample for the Chapter 4 of Hadoop MapReduce
 * Cookbook. 
 * 
 * @author Thilina Gunarathne
 */
public class LogProcessorReduce extends
        Reducer<Text, LogWritable, Text, IntWritable> {
   private IntWritable result = new IntWritable();

   @Override
   public void reduce(Text key, Iterable<LogWritable> values,
                      Context context) throws IOException, InterruptedException {
     int sum = 0;
     for (LogWritable val : values) {
       sum += val.getResponseSize().get();
     }
     result.set(sum);
     context.write(key, result);
   }
}
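The mrunit dependency declared in the pom.xml makes it possible to exercise this reducer without a cluster. A minimal sketch, assuming MRUnit 1.1.0's mapreduce-API driver and JUnit 4 on the test classpath (the test class and sample values are hypothetical):

package chapter4;

import java.util.Arrays;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

public class LogProcessorReduceTest {

    @Test
    public void sumsResponseSizesPerIP() throws Exception {
        LogWritable first = new LogWritable();
        first.set("199.72.81.55", "01/Jul/1995:00:00:01 -0400",
                "GET /history/apollo/ HTTP/1.0", 6245, 200);
        LogWritable second = new LogWritable();
        second.set("199.72.81.55", "01/Jul/1995:00:00:09 -0400",
                "GET /images/ksclogo-medium.gif HTTP/1.0", 5866, 200);

        // For one key, the reducer should emit the total bytes served: 6245 + 5866.
        ReduceDriver.<Text, LogWritable, Text, IntWritable>newReduceDriver(new LogProcessorReduce())
                .withInput(new Text("199.72.81.55"), Arrays.asList(first, second))
                .withOutput(new Text("199.72.81.55"), new IntWritable(12111))
                .runTest();
    }
}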

4. The class containing the main method: LogProcessor

package chapter4;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class LogProcessor extends Configured implements Tool {
    
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new LogProcessor(), args);
        System.exit(res);
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 3) {
            System.err.println("Usage: LogProcessor <input_path> <output_path> <num_reduce_tasks>");
            return -1;
        }

        /* input parameters */
        String inputPath = args[0];
        String outputPath = args[1];
        int numReduce = Integer.parseInt(args[2]);

        Job job = Job.getInstance(getConf(), "log-analysis");

        job.setJarByClass(LogProcessor.class);
        job.setMapperClass(LogProcessorMap.class);
        job.setReducerClass(LogProcessorReduce.class);
        job.setNumReduceTasks(numReduce);
    
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LogWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        
        return job.waitForCompletion(true) ? 0 : 1;
    }
}

Input data: download ftp://ita.ee.lbl.gov/traces/NASA_access_log_Jul95.gz and extract the contained file into the input directory.
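With the data in place, build and submit the job (the paths and reducer count below are illustrative):

mvn clean package
hadoop jar target/countwords-0.0.1-SNAPSHOT.jar chapter4.LogProcessor input output 1

The three arguments map to the input path, the output path, and the number of reduce tasks checked in run().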
