MR清洗IP数据

pom文件


<!-- Maven build descriptor for the IP ETL MapReduce job. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.ccj.pxj</groupId>
    <artifactId>IPETL</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <hadoop.version>2.6.0-cdh5.16.2</hadoop.version>
        <hive.version>1.1.0-cdh5.16.2</hive.version>
    </properties>

    <!-- The CDH-versioned Hadoop artifacts are resolved from the Cloudera repository. -->
    <repositories>
        <repository>
            <id>cloudera</id>
            <url>http://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>

    <dependencies>
        <!-- Offline IP-to-region lookup library. -->
        <dependency>
            <groupId>org.lionsoul</groupId>
            <artifactId>ip2region</artifactId>
            <version>1.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>

    <build>
        <!-- Pins the versions of the standard lifecycle plugins. -->
        <pluginManagement>
            <plugins>
                <plugin>
                    <artifactId>maven-clean-plugin</artifactId>
                    <version>3.1.0</version>
                </plugin>
                <plugin>
                    <artifactId>maven-resources-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.8.0</version>
                </plugin>
                <plugin>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <version>2.22.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-jar-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-install-plugin</artifactId>
                    <version>2.5.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-deploy-plugin</artifactId>
                    <version>2.8.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-site-plugin</artifactId>
                    <version>3.7.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-project-info-reports-plugin</artifactId>
                    <version>3.0.0</version>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>
</project>

IP解析工具类

package com.ccj.pxj.pk;



import org.lionsoul.ip2region.DataBlock;
import org.lionsoul.ip2region.DbConfig;
import org.lionsoul.ip2region.DbSearcher;
import org.lionsoul.ip2region.Util;

import java.io.File;
import java.lang.reflect.Method;

/**
 * Resolves an IP address to its province, city and ISP using the
 * {@code ip2region.db} database found on the classpath.
 */
public class IPUtils {

    /** Classpath location of the ip2region database file. */
    private static final String DB_RESOURCE = "/ip2region.db";

    /**
     * Looks up {@code ip} with the ip2region B-tree search and reduces the
     * resulting region string to its last three parts.
     *
     * <p>Example usage: {@code IPUtils.parseIP("182.82.3.148")}.
     *
     * @param ip the IP address to resolve
     * @return {@code "province|city|isp"}, or {@code null} when the address is
     *         invalid, the database file cannot be found, the lookup fails, or
     *         the region string has fewer than five {@code |}-separated fields
     */
    public static String parseIP(String ip) {
        // Reject bad input up front instead of reporting it and searching anyway.
        if (ip == null || !Util.isIpAddress(ip)) {
            System.out.println("Error: Invalid ip address");
            return null;
        }

        java.net.URL dbUrl = IPUtils.class.getResource(DB_RESOURCE);
        if (dbUrl == null || !new File(dbUrl.getPath()).exists()) {
            System.out.println("Error: Invalid ip2region.db file");
            return null;
        }

        DbSearcher searcher = null;
        try {
            searcher = new DbSearcher(new DbConfig(), dbUrl.getPath());
            // Direct call replaces the former reflective lookup of "btreeSearch".
            DataBlock dataBlock = searcher.btreeSearch(ip);
            if (dataBlock == null) {
                return null;
            }
            return formatRegion(dataBlock.getRegion());
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            // The searcher holds the database file open; release it on every path.
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (Exception closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
    }

    /**
     * Drops the first two fields of a {@code |}-separated region string and
     * returns the remainder as {@code "province|city|isp"}.
     */
    private static String formatRegion(String region) {
        if (region == null) {
            return null;
        }
        // Limit 5 keeps everything after the fourth separator in the last element.
        String[] fields = region.split("\\|", 5);
        if (fields.length < 5) {
            return null;
        }
        return fields[2] + "|" + fields[3] + "|" + fields[4];
    }
}

javabean

package com.ccj.pxj.pk;

public class Access {
//[01/02/2019:06:44:46 +0800]	121.77.248.104	-	651	-	https://www.bilibili.com/video/av80522857	404	297	1204	MISS
private String ip; // 原始日志的字段 ==> 国家 省份 城市 运营商
    private String proxyIp;
    private long reponseTime;
    private String referer;
    private String method;
    private String url; // 原始日志的字段 ==> http,domain,path
    private String httpCode;
    private long requestSize;
    private long responseSize;
    private String cache;
    private String year;
    private String month;
    private String day;
    private String province;
    private String city;
    private String isp;
    private String http;
    private String domain;
    private String path; //==> params

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getProxyIp() {
        return proxyIp;
    }

    public void setProxyIp(String proxyIp) {
        this.proxyIp = proxyIp;
    }

    public long getReponseTime() {
        return reponseTime;
    }

    public void setReponseTime(long reponseTime) {
        this.reponseTime = reponseTime;
    }

    public String getReferer() {
        return referer;
    }

    public void setReferer(String referer) {
        this.referer = referer;
    }

    public String getMethod() {
        return method;
    }

    public void setMethod(String method) {
        this.method = method;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getHttpCode() {
        return httpCode;
    }

    public void setHttpCode(String httpCode) {
        this.httpCode = httpCode;
    }

    public long getRequestSize() {
        return requestSize;
    }

    public void setRequestSize(long requestSize) {
        this.requestSize = requestSize;
    }

    public long getResponseSize() {
        return responseSize;
    }

    public void setResponseSize(long responseSize) {
        this.responseSize = responseSize;
    }

    public String getCache() {
        return cache;
    }

    public void setCache(String cache) {
        this.cache = cache;
    }

    public String getYear() {
        return year;
    }

    public void setYear(String year) {
        this.year = year;
    }

    public String getMonth() {
        return month;
    }

    public void setMonth(String month) {
        this.month = month;
    }

    public String getDay() {
        return day;
    }

    public void setDay(String day) {
        this.day = day;
    }

    public String getProvince() {
        return province;
    }

    public void setProvince(String province) {
        this.province = province;
    }

    public String getCity() {
        return city;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public String getIsp() {
        return isp;
    }

    public void setIsp(String isp) {
        this.isp = isp;
    }

    public String getHttp() {
        return http;
    }

    public void setHttp(String http) {
        this.http = http;
    }

    public String getDomain() {
        return domain;
    }

    public void setDomain(String domain) {
        this.domain = domain;
    }

    public String getPath() {
        return path;
    }

    public void setPath(String path) {
        this.path = path;
    }

    public Access() {
    }

    public Access(String ip, String proxyIp, long reponseTime, String referer, String method, String url, String httpCode, long requestSize, long responseSize, String cache, String year, String month, String day, String province, String city, String isp, String http, String domain, String path) {
        this.ip = ip;
        this.proxyIp = proxyIp;
        this.reponseTime = reponseTime;
        this.referer = referer;
        this.method = method;
        this.url = url;
        this.httpCode = httpCode;
        this.requestSize = requestSize;
        this.responseSize = responseSize;
        this.cache = cache;
        this.year = year;
        this.month = month;
        this.day = day;
        this.province = province;
        this.city = city;
        this.isp = isp;
        this.http = http;
        this.domain = domain;
        this.path = path;
    }
    @Override
    public String toString() {
        return ip + "\t" +
                proxyIp + "\t" +
                reponseTime + "\t" +
                referer + "\t" +
                method + "\t" +
                url + "\t" +
                httpCode + "\t" +
                requestSize + "\t" +
                responseSize +"\t" +
                cache + "\t" +
                province + "\t" +
                city + "\t" +
                isp + "\t" +
                http + "\t" +
                domain + "\t" +
                path + "\t" +
                year + "\t" +
                month + "\t" +
                day;
    }
}

路径工具

package com.ccj.pxj.pk;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/** Filesystem helpers for the MapReduce drivers. */
public class FileUtils {

    /**
     * Recursively deletes {@code out} if it exists, so a job can be re-run
     * against the same output directory.
     *
     * @param configuration Hadoop configuration used to resolve the filesystem
     * @param out           output path; may be relative or fully qualified
     * @throws Exception if the filesystem cannot be reached or the delete fails
     */
    public static void deleteOutput(Configuration configuration, String out) throws Exception {
        Path path = new Path(out);
        // Resolve the filesystem from the path itself so a fully qualified URI
        // works even when it is not on the configured default filesystem.
        FileSystem fileSystem = path.getFileSystem(configuration);
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
    }
}

Mapper

package com.ccj.pxj.pk;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

/**
 * Map-only cleaning step for tab-separated access logs.
 *
 * <p>Each input line is expected to carry eleven columns: time, ip, proxy ip,
 * response time, referer, method, url, http code, request size, response size
 * and cache status. A line that parses is emitted as the {@link Access}
 * tab-separated text; a line that does not parse is counted and dropped.
 *
 * <p>Counters in group {@code etl}: {@code access_total} (lines seen),
 * {@code access_format} (lines emitted), {@code access_error} (lines dropped).
 */
public class LogMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    /** Number of tab-separated columns read from every log line. */
    private static final int EXPECTED_FIELDS = 11;

    // Created once per mapper instance instead of once per record.
    private final SimpleDateFormat timeFormat = new SimpleDateFormat("[dd/MM/yyyy:HH:mm:ss ZZZ]");
    private final Text outputKey = new Text();

    /** Reused for every record; all of its fields are overwritten before each write. */
    private Access access;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        access = new Access();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        context.getCounter("etl", "access_total").increment(1);

        try {
            fillAccess(value.toString());
        } catch (Exception e) {
            // Any parse failure marks the line as dirty; it is counted and skipped.
            context.getCounter("etl", "access_error").increment(1);
            e.printStackTrace();
            return;
        }

        // Counted only once the whole record is valid, so a line is never
        // reported as both formatted and erroneous.
        context.getCounter("etl", "access_format").increment(1);
        outputKey.set(access.toString());
        // Outside the try block: a failed write is a job error, not a dirty record.
        context.write(outputKey, NullWritable.get());
    }

    /**
     * Parses one raw log line into the reusable {@link #access} bean.
     *
     * @param line raw tab-separated log line
     * @throws Exception if the line has too few columns, a numeric column is
     *                   not a number (for example {@code -}), the timestamp or
     *                   URL is malformed, or the IP cannot be resolved
     */
    private void fillAccess(String line) throws Exception {
        String[] logs = line.split("\t");
        if (logs.length < EXPECTED_FIELDS) {
            throw new IllegalArgumentException(
                    "expected " + EXPECTED_FIELDS + " tab-separated fields but got " + logs.length);
        }

        String rawIp = logs[1];
        String url = logs[6];

        // NOTE(review): Calendar.getInstance() uses the JVM default time zone,
        // so the date parts follow the task's zone rather than the logged offset.
        Date date = timeFormat.parse(logs[0]);
        Calendar calendar = Calendar.getInstance();
        calendar.setTime(date);
        int year = calendar.get(Calendar.YEAR);
        int month = calendar.get(Calendar.MONTH) + 1; // Calendar months are zero-based
        int day = calendar.get(Calendar.DATE);

        // Cheap numeric checks run before the database lookup.
        long responseTime = Long.parseLong(logs[3]);
        long requestSize = Long.parseLong(logs[8]);
        long responseSize = Long.parseLong(logs[9]);

        URL parsedUrl = new URL(url);

        String region = IPUtils.parseIP(rawIp);
        if (region == null) {
            throw new IllegalArgumentException("cannot resolve ip: " + rawIp);
        }
        String[] regionParts = region.split("\\|");
        if (regionParts.length < 3) {
            throw new IllegalArgumentException("unexpected region format: " + region);
        }

        // Store the raw client address; the resolved region has its own columns.
        access.setIp(rawIp);
        access.setProxyIp(logs[2]);
        access.setReponseTime(responseTime);
        access.setReferer(logs[4]);
        access.setMethod(logs[5]);
        access.setUrl(url);
        access.setHttpCode(logs[7]);
        access.setRequestSize(requestSize);
        access.setResponseSize(responseSize);
        access.setCache(logs[10]);
        access.setYear(String.valueOf(year));
        access.setMonth(twoDigits(month));
        access.setDay(twoDigits(day));
        access.setProvince(regionParts[0]);
        access.setCity(regionParts[1]);
        access.setIsp(regionParts[2]);
        access.setHttp(parsedUrl.getProtocol());
        access.setDomain(parsedUrl.getAuthority());
        access.setPath(parsedUrl.getPath());
    }

    /** Left-pads a value below ten with a single zero. */
    private static String twoDigits(int value) {
        return value < 10 ? "0" + value : String.valueOf(value);
    }
}

驱动类

package com.ccj.pxj.pk;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Iterator;

/**
 * Driver that submits the access-log cleaning job and reports the {@code etl}
 * counters once it finishes.
 *
 * <p>Arguments: {@code <input path> <output path>}. An existing output path is
 * deleted before the job is submitted.
 */
public class LogDriver2 extends Configured implements Tool {
    private static final Logger logger = LoggerFactory.getLogger("LogDriver2");

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(), new LogDriver2(), args);
        System.exit(exitCode);
    }

    /**
     * Configures and runs the job.
     *
     * @param args input path followed by output path
     * @return 0 when the job succeeds, 1 when it fails, 2 on bad arguments
     * @throws Exception if the output cleanup or job submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println("Usage: LogDriver2 <input path> <output path>");
            return 2;
        }
        String inputPath = args[0];
        String outputPath = args[1];

        // 1) Obtain the Job object
        Configuration configuration = super.getConf();
        Job job = Job.getInstance(configuration);

        // Remove any previous output so the job can be re-run.
        FileUtils.deleteOutput(configuration, outputPath);

        // 2) Main class used to locate the job jar
        job.setJarByClass(LogDriver2.class);

        // 3) Mapper; no reducer class is configured
        job.setMapperClass(LogMapper.class);

        // 4) Types emitted by the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // 5) Input and output paths
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // 6) Submit and wait
        boolean succeeded = job.waitForCompletion(true);

        // Counters may be unavailable once a job has been retired.
        if (job.getCounters() != null) {
            CounterGroup etl = job.getCounters().getGroup("etl");
            for (Counter counter : etl) {
                System.out.println(counter.getName() + "-->" + counter.getValue());
                logger.info("{}-->{}", counter.getName(), counter.getValue());
            }
        }

        // Propagate the job outcome so callers and shells can detect failure.
        return succeeded ? 0 : 1;
    }
}

测试数据

[15/01/2020:22:40:42 +0800]	182.82.3.148	-	1425	-	GET	https://www.bilibili.com/video/av73376233	202	198	-	HIT
[22/12/2019:02:10:37 +0800]	139.215.218.49	-	44	-	GET	https://www.bilibili.com/video/av76542615	200	276	1917	MISS
[27/01/2020:22:14:03 +0800]	210.33.139.98	-	2889	-	POST	https://www.bilibili.com/video/av76542615	200	167	438	MISS
[03/02/2020:02:41:04 +0800]	36.63.106.187	-	1482	-	GET	https://www.bilibili.com/video/av76542615	200	150	2618	MISS
[20/01/2018:21:41:06 +0800]	222.88.27.61	-	2334	-	GET	https://www.bilibili.com/video/av30031910	200	260	-	MISS
[15/02/2020:19:40:23 +0800]	121.77.147.133	-	618	-	POST	https://www.bilibili.com/video/av52167219	404	226	4682	HIT
[26/01/2018:18:55:35 +0800]	171.14.50.145	-	987	-	GET	https://www.bilibili.com/video/av80522857	200	42	-	MISS
[16/01/2020:02:41:04 +0800]	139.214.37.55	-	572	-	POST	https://ruoze.ke.qq.com	404	139	-	MISS
[16/02/2019:02:26:03 +0800]	61.232.28.171	-	183	-	POST	https://www.bilibili.com/video/av80522857	500	3	-	MISS
[16/02/2020:01:52:27 +0800]	36.62.235.207	-	465	-	GET	https://www.bilibili.com/video/av52167219	506	67	-	MISS
[18/01/2018:02:41:04 +0800]	171.13.167.211	-	120	-	POST	https://www.bilibili.com/video/av34829124	202	188	3131	MISS
[16/02/2018:02:40:29 +0800]	171.11.12.169	-	2067	-	GET	https://www.bilibili.com/video/av34829124	200	294	-	MISS
[09/02/2020:02:40:19 +0800]	36.59.81.58	-	366	-	POST	https://www.bilibili.com/video/av73376233	500	259	-	HIT
[06/02/2019:20:04:06 +0800]	121.76.66.41	-	429	-	GET	https://www.bilibili.com/video/av73376233	506	213	-	HIT
[04/01/2018:22:41:05 +0800]	36.61.136.75	-	2573	-	GET	https://www.bilibili.com/video/av80522857	500	114	1258	MISS
[15/02/2020:21:53:48 +0800]	171.15.138.128	-	1000	-	POST	https://ruoze.ke.qq.com	506	243	2470	MISS
[02/02/2019:02:40:49 +0800]	222.79.99.1	-	2239	-	GET	https://ruoze.ke.qq.com	200	230	2240	MISS
[29/01/2020:02:38:06 +0800]	121.77.237.110	-	517	-	GET	https://www.bilibili.com/video/av52167219	506	138	1136	MISS
[06/02/2018:02:17:03 +0800]	123.234.170.91	-	603	-	GET	https://ruoze.ke.qq.com	202	107	-	HIT
[28/01/2020:02:41:03 +0800]	182.83.239.168	-	2160	-	POST	https://www.bilibili.com/video/av34829124	404	35	4113	MISS
[04/02/2020:00:00:43 +0800]	121.77.104.20	-	2450	-	POST	https://ruoze.ke.qq.com	202	194	4828	MISS
[16/02/2020:02:41:03 +0800]	171.14.120.192	-	1318	-	POST	https://www.bilibili.com/video/av34829124	500	224	1549	MISS
[19/12/2017:22:51:44 +0800]	36.61.143.111	-	725	-	GET	https://ruoze.ke.qq.com	200	174	1361	HIT
[18/01/2020:23:41:06 +0800]	139.214.152.182	-	53	-	GET	https://www.bilibili.com/video/av76542615	404	128	-	HIT
[03/01/2020:00:41:07 +0800]	210.27.223.68	-	1065	-	POST	https://www.bilibili.com/video/av76542615	506	157	-	HIT
[16/02/2019:02:41:04 +0800]	171.8.149.129	-	2081	-	POST	https://www.bilibili.com/video/av30031910	506	8	2961	HIT
[13/02/2020:02:41:07 +0800]	121.76.103.138	-	887	-	POST	https://www.bilibili.com/video/av30031910	500	4	-	HIT
[05/01/2019:02:41:05 +0800]	182.90.180.94	-	838	-	GET	https://www.bilibili.com/video/av76542615	202	34	1170	MISS
[16/02/2020:02:34:25 +0800]	123.235.173.110	-	1668	-	GET	https://www.bilibili.com/video/av73376233	200	237	-	HIT
[12/02/2020:02:03:00 +0800]	222.77.179.225	-	1295	-	POST	https://www.bilibili.com/video/av52167219	202	1	1345	HIT
[07/02/2018:19:21:22 +0800]	61.233.188.93	-	2652	-	POST	https://www.bilibili.com/video/av30031910	506	98	3506	HIT
[11/02/2019:01:40:38 +0800]	182.91.225.89	-	2619	-	GET	https://www.bilibili.com/video/av73376233	506	108	-	MISS
[16/02/2020:02:41:05 +0800]	222.94.117.246	-	2082	-	GET	https://www.bilibili.com/video/av73376233	202	213	3334	HIT
[16/02/2018:02:40:57 +0800]	106.89.69.81	-	2380	-	GET	https://www.bilibili.com/video/av52167219	506	140	1719	MISS
[16/02/2020:02:41:05 +0800]	139.196.86.93	-	1784	-	POST	http://www.ruozedata.com	500	65	-	MISS
[15/01/2020:22:29:06 +0800]	210.38.109.213	-	2791	-	GET	https://www.bilibili.com/video/av30031910	506	143	3052	MISS
[16/02/2020:02:41:06 +0800]	123.234.46.110	-	2315	-	GET	https://www.bilibili.com/video/av76542615	200	136	3662	HIT
[22/01/2020:00:02:16 +0800]	121.76.226.122	-	1356	-	GET	http://www.ruozedata.com	202	17	4435	MISS
[24/01/2020:02:41:06 +0800]	123.235.227.183	-	382	-	GET	https://www.bilibili.com/video/av34829124	506	177	-	MISS
[15/02/2019:22:41:04 +0800]	106.92.141.175	-	153	-	POST	https://www.bilibili.com/video/av80522857	404	1	3425	MISS
[16/02/2019:02:41:05 +0800]	171.10.3.71	-	1697	-	POST	https://www.bilibili.com/video/av52167219	404	255	-	MISS
[15/02/2020:22:20:04 +0800]	139.208.181.68	-	64	-	POST	https://www.bilibili.com/video/av52167219	500	62	-	MISS
[13/02/2020:20:40:29 +0800]	61.236.83.33	-	567	-	POST	https://www.bilibili.com/video/av34829124	506	91	-	HIT
[28/01/2020:02:40:25 +0800]	123.234.219.81	-	331	-	GET	https://www.bilibili.com/video/av52167219	506	78	-	MISS
[16/02/2019:02:19:46 +0800]	171.10.57.64	-	2478	-	GET	https://www.bilibili.com/video/av30031910	200	129	-	HIT
[30/12/2019:02:33:50 +0800]	171.13.243.236	-	2730	-	POST	https://ruoze.ke.qq.com	500	34	-	MISS
[16/02/2020:02:41:03 +0800]	182.81.91.30	-	518	-	GET	https://www.bilibili.com/video/av30031910	200	178	-	HIT
[16/02/2020:02:41:06 +0800]	61.236.246.172	-	246	-	GET	https://www.bilibili.com/video/av52167219	202	234	2256	HIT
[16/02/2020:02:40:23 +0800]	36.58.17.244	-	1872	-	GET	https://ruoze.ke.qq.com	202	244	-	MISS
[16/02/2020:00:17:04 +0800]	121.76.207.213	-	2839	-	POST	https://www.bilibili.com/video/av76542615	200	113	-	HIT

服务器运行

[pxj@pxj /home/pxj/app/offline-dw/lib]$export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:ip2region-1.7.2.jar:ip2region.db 
[pxj@pxj /home/pxj/app/offline-dw/lib]$hadoop jar IPETL-1.0-SNAPSHOT.jar com.ccj.pxj.pk.LogDriver2 -libjars ip2region-1.7.2.jar,ip2region.db /user/pxj/pxj/app/offline-dw/data/access.log /user/pxj/pxj/app/offline-dw/outpath 






20/02/17 12:40:36 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
20/02/17 12:40:38 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
20/02/17 12:40:40 INFO input.FileInputFormat: Total input paths to process : 1
20/02/17 12:40:41 INFO mapreduce.JobSubmitter: number of splits:1
20/02/17 12:40:41 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1581901965461_0003
20/02/17 12:40:42 INFO impl.YarnClientImpl: Submitted application application_1581901965461_0003
20/02/17 12:40:42 INFO mapreduce.Job: The url to track the job: http://pxj:38088/proxy/application_1581901965461_0003/
20/02/17 12:40:42 INFO mapreduce.Job: Running job: job_1581901965461_0003
20/02/17 12:40:55 INFO mapreduce.Job: Job job_1581901965461_0003 running in uber mode : false
20/02/17 12:40:55 INFO mapreduce.Job:  map 0% reduce 0%
20/02/17 12:41:08 INFO mapreduce.Job:  map 100% reduce 0%
20/02/17 12:41:19 INFO mapreduce.Job:  map 100% reduce 100%
20/02/17 12:41:20 INFO mapreduce.Job: Job job_1581901965461_0003 completed successfully
20/02/17 12:41:20 INFO mapreduce.Job: Counters: 52
	File System Counters
		FILE: Number of bytes read=4233
		FILE: Number of bytes written=297683
		FILE: Number of read operations=0
		FILE: Number of large read operations=0
		FILE: Number of write operations=0
		HDFS: Number of bytes read=5584
		HDFS: Number of bytes written=4131
		HDFS: Number of read operations=6
		HDFS: Number of large read operations=0
		HDFS: Number of write operations=2
	Job Counters 
		Launched map tasks=1
		Launched reduce tasks=1
		Data-local map tasks=1
		Total time spent by all maps in occupied slots (ms)=11821
		Total time spent by all reduces in occupied slots (ms)=6834
		Total time spent by all map tasks (ms)=11821
		Total time spent by all reduce tasks (ms)=6834
		Total vcore-milliseconds taken by all map tasks=11821
		Total vcore-milliseconds taken by all reduce tasks=6834
		Total megabyte-milliseconds taken by all map tasks=12104704
		Total megabyte-milliseconds taken by all reduce tasks=6998016
	Map-Reduce Framework
		Map input records=50
		Map output records=24
		Map output bytes=4155
		Map output materialized bytes=4233
		Input split bytes=124
		Combine input records=0
		Combine output records=0
		Reduce input groups=24
		Reduce shuffle bytes=4233
		Reduce input records=24
		Reduce output records=24
		Spilled Records=48
		Shuffled Maps =1
		Failed Shuffles=0
		Merged Map outputs=1
		GC time elapsed (ms)=883
		CPU time spent (ms)=6370
		Physical memory (bytes) snapshot=460800000
		Virtual memory (bytes) snapshot=5547888640
		Total committed heap usage (bytes)=399507456
	Shuffle Errors
		BAD_ID=0
		CONNECTION=0
		IO_ERROR=0
		WRONG_LENGTH=0
		WRONG_MAP=0
		WRONG_REDUCE=0
	etl
		access_error=26
		access_format=24
		access_total=50
	File Input Format Counters 
		Bytes Read=5460
	File Output Format Counters 
		Bytes Written=4131
access_error-->26
20/02/17 12:41:20 INFO LogDriver2: access_error-->26
access_format-->24
20/02/17 12:41:20 INFO LogDriver2: access_format-->24
access_total-->50
20/02/17 12:41:20 INFO LogDriver2: access_total-->50

运行结果

[pxj@pxj /home/pxj/app/offline-dw/lib]$hadoop fs  -text /user/pxj/pxj/app/offline-dw/outpath/part*
20/02/17 12:53:01 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
上海|上海市|有线通	-	1356	-	GET	http://www.ruozedata.com	202	17	4435	MISS	上海	上海市	有线通	http	www.ruozedata.com		2020	01	22
上海|上海市|有线通	-	2450	-	POST	https://ruoze.ke.qq.com	202	194	4828	MISS	上海	上海市	有线通	https	ruoze.ke.qq.com		2020	02	04
上海|上海市|有线通	-	517	-	GET	https://www.bilibili.com/video/av52167219	506	138	1136	MISS	上海	上海市	有线通	https	www.bilibili.com	/video/av52167219	2020	01	29
上海|上海市|有线通	-	618	-	POST	https://www.bilibili.com/video/av52167219	404	226	4682	HIT	上海	上海市	有线通	https	www.bilibili.com	/video/av52167219	2020	02	15
吉林省|长春市|联通	-	44	-	GET	https://www.bilibili.com/video/av76542615	200	276	1917	MISS	吉林省	长春市	联通https	www.bilibili.com	/video/av76542615	2019	12	22
安徽省|合肥市|电信	-	2573	-	GET	https://www.bilibili.com/video/av80522857	500	114	1258	MISS	安徽省	合肥市	电信https	www.bilibili.com	/video/av80522857	2018	01	04
安徽省|合肥市|电信	-	725	-	GET	https://ruoze.ke.qq.com	200	174	1361	HIT	安徽省	合肥市	电信	https	ruoze.ke.qq.com		2017	12	19
安徽省|安庆市|电信	-	1482	-	GET	https://www.bilibili.com/video/av76542615	200	150	2618	MISS	安徽省	安庆市	电信https	www.bilibili.com	/video/av76542615	2020	02	03
山东省|青岛市|联通	-	2315	-	GET	https://www.bilibili.com/video/av76542615	200	136	3662	HIT	山东省	青岛市	联通https	www.bilibili.com	/video/av76542615	2020	02	16
广东省|广州市|教育网	-	2791	-	GET	https://www.bilibili.com/video/av30031910	506	143	3052	MISS	广东省	广州市	教育网	https	www.bilibili.com	/video/av30031910	2020	01	15
广西|梧州市|联通	-	838	-	GET	https://www.bilibili.com/video/av76542615	202	34	1170	MISS	广西	梧州市	联通https	www.bilibili.com	/video/av76542615	2019	01	05
江苏省|南京市|电信	-	2082	-	GET	https://www.bilibili.com/video/av73376233	202	213	3334	HIT	江苏省	南京市	电信https	www.bilibili.com	/video/av73376233	2020	02	16
河南省|信阳市|电信	-	1318	-	POST	https://www.bilibili.com/video/av34829124	500	224	1549	MISS	河南省	信阳市	电信https	www.bilibili.com	/video/av34829124	2020	02	16
河南省|郑州市|电信	-	2081	-	POST	https://www.bilibili.com/video/av30031910	506	8	2961	HIT	河南省	郑州市	电信https	www.bilibili.com	/video/av30031910	2019	02	16
河南省|驻马店市|电信	-	1000	-	POST	https://ruoze.ke.qq.com	506	243	2470	MISS	河南省	驻马店市	电信	https	ruoze.ke.qq.com		2020	02	15
河南省|驻马店市|电信	-	120	-	POST	https://www.bilibili.com/video/av34829124	202	188	3131	MISS	河南省	驻马店市	电信	https	www.bilibili.com	/video/av34829124	2018	01	18
浙江省|舟山市|教育网	-	2889	-	POST	https://www.bilibili.com/video/av76542615	200	167	438	MISS	浙江省	舟山市	教育网	https	www.bilibili.com	/video/av76542615	2020	01	27
福建省|厦门市|电信	-	2239	-	GET	https://ruoze.ke.qq.com	200	230	2240	MISS	福建省	厦门市	电信	https	ruoze.ke.qq.com		2019	02	02
福建省|福州市|电信	-	1295	-	POST	https://www.bilibili.com/video/av52167219	202	1	1345	HIT	福建省	福州市	电信https	www.bilibili.com	/video/av52167219	2020	02	12
重庆|重庆市|电信	-	153	-	POST	https://www.bilibili.com/video/av80522857	404	1	3425	MISS	重庆	重庆市	电信https	www.bilibili.com	/video/av80522857	2019	02	15
重庆|重庆市|电信	-	2380	-	GET	https://www.bilibili.com/video/av52167219	506	140	1719	MISS	重庆	重庆市	电信https	www.bilibili.com	/video/av52167219	2018	02	16
陕西省|商洛市|陕西广电	-	2160	-	POST	https://www.bilibili.com/video/av34829124	404	35	4113	MISS	陕西省	商洛市	陕西广电	https	www.bilibili.com	/video/av34829124	2020	01	28
陕西省|汉中市|铁通	-	246	-	GET	https://www.bilibili.com/video/av52167219	202	234	2256	HIT	陕西省	汉中市	铁通https	www.bilibili.com	/video/av52167219	2020	02	16
黑龙江省|哈尔滨市|铁通	-	2652	-	POST	https://www.bilibili.com/video/av30031910	506	98	3506	HIT	黑龙江省	哈尔滨市	铁通	https	www.bilibili.com	/video/av30031910	2018	02	07

作者:pxj(潘陈)
日期:2020-02-17 下午12:54:32
你若安好便是晴天,愿汝一切安好!

你可能感兴趣的:(MR清洗IP数据)