Parsing HDFS audit logs with MapReduce: an example


Consider the following sample file:

2016-04-03 22:53:19,912 INFO FSNamesystem.audit: allowed=true   ugi=hdfs (auth:SIMPLE)  ip=/192.168.0.4 cmd=getfileinfo src=/tmp    dst=null    perm=null   proto=rpc
2016-04-03 22:53:26,141 INFO FSNamesystem.audit: allowed=true   ugi=hdfs (auth:SIMPLE)  ip=/192.168.0.4 cmd=getfileinfo src=/tmp    dst=null    perm=null   proto=rpc
2016-04-06 23:57:00,632 INFO FSNamesystem.audit: allowed=true   ugi=hue (auth:SIMPLE)   ip=/192.168.203.129 cmd=getfileinfo src=/user/hue/.cloudera_manager_hive_metastore_canary/hive_HIVEMETASTORE_08096d8f1a6060afe22c0e7653753346/cm_test_table/p1=p1   dst=null    perm=null   proto=rpc
2016-04-06 23:57:00,635 INFO FSNamesystem.audit: allowed=true   ugi=hue (auth:SIMPLE)   ip=/192.168.203.129 cmd=listStatus  src=/user/hue/.cloudera_manager_hive_metastore_canary/hive_HIVEMETASTORE_08096d8f1a6060afe22c0e7653753346/cm_test_table/p1=p1   dst=null    perm=null   proto=rpc
2016-04-16 05:07:39,883 INFO FSNamesystem.audit: allowed=true   ugi=hue (auth:SIMPLE)   ip=/192.168.1.5 cmd=getfileinfo src=/user/hue/.cloudera_manager_hive_metastore_canary/hive_HIVEMETASTORE_08096d8f1a6060afe22c0e7653753346/cm_test_table/p1=p0/p2=420    dst=null    perm=null   proto=rpc
2016-04-16 05:07:39,885 INFO FSNamesystem.audit: allowed=true   ugi=hue (auth:SIMPLE)   ip=/192.168.1.5 cmd=getfileinfo src=/user/hue/.cloudera_manager_hive_metastore_canary/hive_HIVEMETASTORE_08096d8f1a6060afe22c0e7653753346/cm_test_table/p1=p0   dst=null    perm=null   proto=rpc
2016-04-16 04:02:31,601 INFO FSNamesystem.audit: allowed=true   ugi=hue (auth:SIMPLE)   ip=/192.168.1.5 cmd=getfileinfo src=/user/hue/.cloudera_manager_hive_metastore_canary/hive_HIVEMETASTORE_08096d8f1a6060afe22c0e7653753346/cm_test_table dst=null    perm=null   proto=rpc
2016-04-16 04:02:31,606 INFO FSNamesystem.audit: allowed=true   ugi=hue (auth:SIMPLE)   ip=/192.168.1.5 cmd=getfileinfo src=/user/hue/.cloudera_manager_hive_metastore_canary/hive_HIVEMETASTORE_08096d8f1a6060afe22c0e7653753346/cm_test_table/p1=p1   dst=null    perm=null   proto=rpc
2016-04-16 04:02:31,608 INFO FSNamesystem.audit: allowed=true   ugi=hue (auth:SIMPLE)   ip=/192.168.1.5 cmd=listStatus  src=/user/hue/.cloudera_manager_hive_metastore_canary/hive_HIVEMETASTORE_08096d8f1a6060afe22c0e7653753346/cm_test_table/p1=p1   dst=null    perm=null   proto=rpc
2016-04-16 04:02:31,610 INFO FSNamesystem.audit: allowed=true   ugi=hue (auth:SIMPLE)   ip=/192.168.1.5 cmd=listStatus  src=/user/hue/.cloudera_manager_hive_metastore_canary/hive_HIVEMETASTORE_08096d8f1a6060afe22c0e7653753346/cm_test_table/p1=p1/p2=421    dst=null    perm=null   proto=rpc
2016-04-16 04:02:31,614 INFO FSNamesystem.audit: allowed=true   ugi=hue (auth:SIMPLE)   ip=/192.168.1.5 cmd=getfileinfo src=/user/hue/.cloudera_manager_hive_metastore_canary/hive_HIVEMETASTORE_08096d8f1a6060afe22c0e7653753346/cm_test_table/p1=p1   dst=null    perm=null   proto=rpc

This file is an HDFS audit log that records filesystem operations. The requirement is as follows:

use MapReduce to aggregate the file and produce, per output row:

IP, command, the number of times that IP issued that command, and the total number of operations from that IP.
The requirement corresponds to the following pseudo-SQL:

SELECT IP, CMD, COUNT(*) CMDTOTAL
, SUM(COUNT(*)) OVER(PARTITION BY IP) IPTOTAL
FROM xxx GROUP BY IP, CMD
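
For instance, applied to just the eleven sample lines above, the job should produce (the order of commands within an IP is not guaranteed):

192.168.0.4 getfileinfo,count(*)=2,sum() over(by ip)=2
192.168.1.5 getfileinfo,count(*)=5,sum() over(by ip)=7
192.168.1.5 listStatus,count(*)=2,sum() over(by ip)=7
192.168.203.129 getfileinfo,count(*)=1,sum() over(by ip)=2
192.168.203.129 listStatus,count(*)=1,sum() over(by ip)=2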

The code below is written against the new MapReduce API (MR2).

The Mapper class:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ComplexMRMapper extends Mapper<LongWritable, Text, Text, Text> {
  @Override
  protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
     // Audit log fields are tab-separated:
     // [0] timestamp/level/allowed, [1] ugi=..., [2] ip=/..., [3] cmd=..., ...
     String[] line = value.toString().trim().split("\t");
     if (line.length < 4) {
        return; // skip malformed lines
     }
     String ip = line[2].substring(4).trim();   // strip the "ip=/" prefix
     String cmd = line[3].substring(4).trim();  // strip the "cmd=" prefix
     context.write(new Text(ip), new Text(cmd));
   }
}
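
The field offsets are easy to get wrong, so here is a standalone sanity check of the parsing logic outside Hadoop (a sketch; the class name ParseCheck is mine, and the sample line is the first audit record with literal tabs):

public class ParseCheck {
    public static void main(String[] args) {
        String line = "2016-04-03 22:53:19,912 INFO FSNamesystem.audit: allowed=true\t"
                + "ugi=hdfs (auth:SIMPLE)\tip=/192.168.0.4\tcmd=getfileinfo\t"
                + "src=/tmp\tdst=null\tperm=null\tproto=rpc";
        String[] fields = line.split("\t");
        // fields[2] = "ip=/192.168.0.4" -> substring(4) strips "ip=/"
        // fields[3] = "cmd=getfileinfo" -> substring(4) strips "cmd="
        System.out.println(fields[2].substring(4).trim()); // 192.168.0.4
        System.out.println(fields[3].substring(4).trim()); // getfileinfo
    }
}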

The Reducer class. Because all records for a given IP are shuffled to the same reduce call, both the per-command count and the per-IP total can be computed in a single pass:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ComplexMRReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {

        int total = 0; // sum(count(*)) over (partition by ip)
        Map<String, Integer> counts = new HashMap<>(); // count(*) per command
        for (Text v : values) {
            total++;
            String cmd = v.toString();
            Integer n = counts.get(cmd);
            counts.put(cmd, n == null ? 1 : n + 1);
        }
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
            String str = e.getKey() + ",count(*)=" + e.getValue()
                    + ",sum() over(by ip)=" + total; // output value
            context.write(key, new Text(str));
        }
    }
}
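
Stripped of the Hadoop types, the aggregation inside reduce() is just a frequency count plus a running total. A minimal plain-Java sketch of the same logic (class and variable names are mine):

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ReduceLogicCheck {
    public static void main(String[] args) {
        // Simulated values for one IP key, as the reducer would receive them.
        List<String> cmds = Arrays.asList("getfileinfo", "getfileinfo", "listStatus");
        Map<String, Integer> counts = new HashMap<>();
        int ipTotal = 0; // sum(count(*)) over (partition by ip)
        for (String c : cmds) {
            ipTotal++;
            counts.merge(c, 1, Integer::sum); // count(*) per command
        }
        System.out.println(counts + ", ipTotal=" + ipTotal);
        // e.g. {getfileinfo=2, listStatus=1}, ipTotal=3 (map order may vary)
    }
}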

The driver class:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class ComplexMR {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "ComplexMR job");
        job.setJarByClass(ComplexMR.class);
        job.setInputFormatClass(TextInputFormat.class);   // input format
        job.setOutputFormatClass(TextOutputFormat.class); // output format
        job.setMapperClass(ComplexMRMapper.class);
        job.setReducerClass(ComplexMRReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);   // final output key type
        job.setOutputValueClass(Text.class); // final output value type
        FileInputFormat.setInputPaths(job, new Path("D:/hdfs-audit"));      // input path
        FileOutputFormat.setOutputPath(job, new Path("D:/out/hdfsoutput")); // output path
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
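
The input and output paths are hardcoded to local Windows paths, which is convenient for IDE runs; to submit the same jar to a cluster you would typically take them from the command line instead. A minimal variant (not in the original), swapping the two hardcoded path lines for:

FileInputFormat.setInputPaths(job, new Path(args[0]));  // e.g. the audit log directory
FileOutputFormat.setOutputPath(job, new Path(args[1])); // must not already exist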

Add the following Maven dependencies:

<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.7.2</version>
    </dependency>
</dependencies>
Run it from an IDE such as Eclipse or IntelliJ IDEA. Against a full audit log, the output looks like this:

192.168.0.4  getEZForPath,count(*)=2,sum() over(by ip)=495  
192.168.0.4 listCachePools,count(*)=24,sum() over(by ip)=495  
192.168.1.100   getfileinfo,count(*)=160,sum() over(by ip)=291  
192.168.1.100   listStatus,count(*)=131,sum() over(by ip)=291  
192.168.1.101   listCacheDirectives,count(*)=29,sum() over(by ip)=747  
192.168.1.101   getfileinfo,count(*)=352,sum() over(by ip)=747  
192.168.1.101   rename,count(*)=5,sum() over(by ip)=747  
192.168.1.101   listCachePools,count(*)=83,sum() over(by ip)=747  
192.168.1.101   mkdirs,count(*)=8,sum() over(by ip)=747  
192.168.1.101   listStatus,count(*)=219,sum() over(by ip)=747  
192.168.1.103   open,count(*)=1,sum() over(by ip)=8  
192.168.1.5 listSnapshottableDirectory,count(*)=238,sum() over(by ip)=106880  
192.168.1.5 getEZForPath,count(*)=722,sum() over(by ip)=106880  
192.168.1.5 listCachePools,count(*)=6035,sum() over(by ip)=106880  
192.168.1.5 listStatus,count(*)=29930,sum() over(by ip)=106880  
192.168.1.5 open,count(*)=4450,sum() over(by ip)=106880  
192.168.1.5 contentSummary,count(*)=13,sum() over(by ip)=106880  
192.168.1.6 getfileinfo,count(*)=386,sum() over(by ip)=823  
192.168.1.6 delete,count(*)=25,sum() over(by ip)=823  
192.168.1.6 open,count(*)=142,sum() over(by ip)=823  
192.168.203.129 listCacheDirectives,count(*)=83,sum() over(by ip)=2614  
192.168.203.129 getfileinfo,count(*)=969,sum() over(by ip)=2614  
192.168.203.129 getEZForPath,count(*)=16,sum() over(by ip)=2614  
192.168.203.129 rename,count(*)=51,sum() over(by ip)=2614  
192.168.203.129 delete,count(*)=101,sum() over(by ip)=2614  
192.168.203.129 open,count(*)=111,sum() over(by ip)=2614

This is our stop, everyone; thanks for riding, and welcome aboard next time.
