Big Data Series: Real-Time Hourly Top 10 HTTP Requests with Flink

Table of Contents

      • 1 Generating test data with the Maven project bigdata-analysis-collect
      • 2 Reading the file with the Maven project bigdata-analysis-flink
      • 3 Test approach: reading from a file
        • 3.1 Output of the file-based test
      • 4 Test approach: reading from Kafka and writing results to ElasticSearch
        • 4.1 Creating the topic in Kafka Eagle
        • 4.2 Viewing the topic meta in Kafka Eagle
        • 4.3 Viewing the results in Kibana (integrated with ElasticSearch)
      • 5 Submitting the job to Flink in standalone mode
        • 5.1 Flink dashboard overview
        • 5.2 Submitting the job and viewing the execution plan
        • 5.3 Flink job details
      • 6 Source code

1 Generating test data with the Maven project bigdata-analysis-collect

(Screenshot: the bigdata-analysis-collect project)

The collector project produces mock Nginx access logs, writing them either to a local file or to a Kafka topic:

/**
 * Generate file-based test data.
 */
private static void genFileData() {
    // try-with-resources ensures the stream is closed even if a write fails
    try (FileOutputStream fos = new FileOutputStream("./nginxlog.txt", true)) {
        for (int i = 1; i <= 10000; i++) {
            String msg = "127.0.0.1" + " " + new Date().getTime() + " " + InfoUtils.getURL() + "\n";
            fos.write(msg.getBytes());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
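
Each generated record is a space-separated line of client IP, epoch-millisecond timestamp, and request path, for example (the timestamp value here is illustrative):

127.0.0.1 1596805200000 /g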

/**
 * Generate Kafka test data.
 */
private static void genKafkaData() {
    try {
        // KafkaProducer here is the project's own singleton wrapper, not the raw Kafka client
        KafkaProducer producer = KafkaProducer.getInstance();
        String topic = "test-top10";
        while (true) {
            String msg = "127.0.0.1" + " " + new Date().getTime() + " " + InfoUtils.getURL();
            System.out.println(msg);
            producer.sendMessgae(topic, msg);
            Thread.sleep(1000);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
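
The InfoUtils helper is not shown in the post; it lives in the repository linked in section 6. Judging from the sample output in section 3.1, getURL() returns one of a dozen short paths at random. A minimal hypothetical sketch:

import java.util.Random;

/**
 * Hypothetical sketch of InfoUtils; the real class is in the GitHub repo.
 * The sample output suggests getURL() picks one of a small set of paths.
 */
public class InfoUtils {
    private static final String[] URLS =
            {"/a", "/b", "/c", "/d", "/e", "/f", "/g", "/h", "/i", "/j", "/k", "/l"};
    private static final Random RANDOM = new Random();

    public static String getURL() {
        return URLS[RANDOM.nextInt(URLS.length)];
    }
}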

2 Reading the file with the Maven project bigdata-analysis-flink

(Screenshot: the bigdata-analysis-flink project)

3 Test approach: reading from a file

This job parses each log line into an ApacheLogEvent, assigns event-time timestamps with a 5-second bounded out-of-orderness watermark, counts requests per URL in one-hour tumbling windows, and ranks the top ten URLs per window in a timer-driven KeyedProcessFunction:

package com.pusidun.flink.streaming.urltopn;

import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

/**
 * Computes the hourly top ten most-visited URLs from the nginx log.
 */
public class NginxLogUrlTopN {

    /**
     * Program entry point.
     * @param args command-line arguments: the path of the log file to read
     * @throws Exception on execution failure
     */
    public static void main(String[] args) throws Exception {
        //Fall back to the locally generated test file when no path is supplied
        if (args.length != 1) {
            args = new String[]{"./nginxlog.txt"};
        }
        String filePath = args[0];

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        env.enableCheckpointing(60000);
        SingleOutputStreamOperator<String> process =
            env.readTextFile(filePath)
            .process(new ProcessFunction<String, ApacheLogEvent>() {
                @Override
                public void processElement(String line, Context ctx, Collector<ApacheLogEvent> out) throws Exception {
                    //Fields are space-separated: ip timestamp url
                    String[] dataArray = line.split(" ");
                    ApacheLogEvent logEvent = new ApacheLogEvent(Long.parseLong(dataArray[1]), dataArray[2]);
                    out.collect(logEvent);
                }
            })
            .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<ApacheLogEvent>(Time.seconds(5)) {
                @Override
                public long extractTimestamp(ApacheLogEvent element) {
                    return element.getEventTime();
                }
            })
            .keyBy("url")
            .timeWindow(Time.hours(1))
            .aggregate(new UrlCountAgg(), new WindowResult())
            .keyBy("windowEnd")
            .process(new KeyedProcessFunction<Tuple, ApacheUrlCount, String>() {
                //State buffering all per-URL counts for the current window
                private transient ValueState<List<ApacheUrlCount>> valueState;
                @Override
                public void open(Configuration parameters) throws Exception {
                    ValueStateDescriptor<List<ApacheUrlCount>> valueStateDescriptor = new ValueStateDescriptor<>(
                            "list-state", TypeInformation.of(new TypeHint<List<ApacheUrlCount>>() {}));
                    valueState = getRuntimeContext().getState(valueStateDescriptor);
                }
                //Process each element
                @Override
                public void processElement(ApacheUrlCount value, Context ctx, Collector<String> out) throws Exception {
                    //Read the buffered list from state
                    List<ApacheUrlCount> list = valueState.value();
                    if(null == list){
                        list = new ArrayList<ApacheUrlCount>();
                    }
                    list.add(value);
                    valueState.update(list);
                    //Register an event-time timer at windowEnd + 1 ms; it fires once the
                    //watermark passes the end of the window, i.e. all results have arrived
                    ctx.timerService().registerEventTimeTimer(value.getWindowEnd() + 1);
                }

                //When the timer fires, sort the buffered counts and emit the top N
                @Override
                public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
                    List<ApacheUrlCount> list = valueState.value();
                    list.sort(new Comparator<ApacheUrlCount>() {
                        public int compare(ApacheUrlCount o1, ApacheUrlCount o2) {
                            //Descending by count; Long.compare avoids int-cast overflow
                            return Long.compare(o2.getCount(), o1.getCount());
                        }
                    });

                    StringBuffer result = new StringBuffer();
                    result.append("Time: ").append(new Timestamp(timestamp - 1)).append("\n");
                    //Guard against windows with fewer than ten distinct URLs
                    for (int i = 0; i < Math.min(10, list.size()); i++) {
                        result.append("NO").append(i + 1).append(":")
                                .append(" URL=").append(list.get(i).getUrl())
                                .append(" count=").append(list.get(i).getCount()).append("\n");
                    }
                    result.append("=============================");
                    valueState.clear();
                    out.collect(result.toString());
            });
        process.print();
        env.execute();
    }

    /**
     * Custom aggregate function: AggregateFunction<input type, accumulator type, output type>
     */
    public static class UrlCountAgg implements AggregateFunction<ApacheLogEvent, Long, Long> {
        public Long createAccumulator() {
            return 0L;
        }
        public Long add(ApacheLogEvent apacheLogEvent, Long aLong) {
            return aLong + 1;
        }
        public Long getResult(Long aLong) {
            return aLong;
        }
        public Long merge(Long aLong, Long acc1) {
            return aLong + acc1;
        }
    }

    /**
     * Window result wrapper: WindowFunction<aggregate output type, output type, key type, window type>
     */
    public static class WindowResult implements WindowFunction<Long, ApacheUrlCount, Tuple, TimeWindow> {
        public void apply(Tuple tuple, TimeWindow window, Iterable<Long> input, Collector<ApacheUrlCount> out) throws Exception {
            String url = tuple.getField(0);
            Long windowEnd = window.getEnd();
            Long count = input.iterator().next();
            out.collect(new ApacheUrlCount(url, windowEnd, count));
        }
    }

}
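
The ApacheLogEvent and ApacheUrlCount POJOs referenced above are not reproduced in the post (they are in the repository linked in section 6). A minimal sketch consistent with how the code uses them, with field names inferred from the constructor calls and the keyBy expressions (each class would go in its own file):

/** Sketch of the log-event POJO; field names inferred from keyBy("url"). */
public class ApacheLogEvent {
    private long eventTime; //epoch millis
    private String url;

    public ApacheLogEvent() {} //no-arg constructor required for Flink POJO types
    public ApacheLogEvent(long eventTime, String url) {
        this.eventTime = eventTime;
        this.url = url;
    }
    public long getEventTime() { return eventTime; }
    public void setEventTime(long eventTime) { this.eventTime = eventTime; }
    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }
}

/** Sketch of the per-window count POJO; field names inferred from keyBy("windowEnd"). */
public class ApacheUrlCount {
    private String url;
    private long windowEnd; //epoch millis of the window end
    private long count;

    public ApacheUrlCount() {}
    public ApacheUrlCount(String url, long windowEnd, long count) {
        this.url = url;
        this.windowEnd = windowEnd;
        this.count = count;
    }
    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }
    public long getWindowEnd() { return windowEnd; }
    public void setWindowEnd(long windowEnd) { this.windowEnd = windowEnd; }
    public long getCount() { return count; }
    public void setCount(long count) { this.count = count; }
}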

3.1 Output of the file-based test

7> Time: 2020-08-07 21:00:00.0
NO1: URL=/g count=1678
NO2: URL=/c count=896
NO3: URL=/f count=861
NO4: URL=/a count=848
NO5: URL=/k count=843
NO6: URL=/h count=839
NO7: URL=/d count=821
NO8: URL=/b count=819
NO9: URL=/i count=807
NO10: URL=/l count=798

4 Test approach: reading from Kafka and writing results to ElasticSearch

Compared with section 3, this version swaps the file source for a FlinkKafkaConsumer, hardens the job with exactly-once checkpointing and a fixed-delay restart strategy, and writes each window's ranking to ElasticSearch in addition to printing it:

package com.pusidun.flink.streaming.urltopn;

import com.pusidun.utils.EsClientUtils;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import java.sql.Timestamp;
import java.util.*;

/**
 * Computes the hourly top ten most-visited URLs from the nginx log.
 */
public class NginxLogUrlTopN {

    /**
     * Program entry point.
     * @param args command-line arguments (unused)
     * @throws Exception on execution failure
     */
    public static void main(String[] args) throws Exception {
        //1. Build the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        //A restart strategy only takes effect when checkpointing is enabled
        env.enableCheckpointing(5000);
        //Restart strategy: on failure, restart up to 3 times, 10 seconds apart
        env.getConfig().setRestartStrategy(RestartStrategies.fixedDelayRestart(3, 10000));
        //Retain checkpoint data when the job fails or is cancelled manually
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
        //Checkpointing mode; when integrating with Kafka, be sure to use EXACTLY_ONCE
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        String topic = "test-top10";
        Properties properties = new Properties();
        properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "node1.com:9092,node2.com:9092,node3.com:9092");
        properties.put(ConsumerConfig.GROUP_ID_CONFIG, "consumer-group");
        properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
        properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
        //Where to start when no committed offset exists: earliest (from the beginning) or latest (new messages only)
        properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest");
        //Do not let the Kafka consumer auto-commit offsets; Flink manages them via its checkpoints
        properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");

        SingleOutputStreamOperator<String> process =
            //The file source from section 3 could be used here instead: env.readTextFile(filePath)
            env.addSource(new FlinkKafkaConsumer<String>(topic, new SimpleStringSchema(), properties))
            .process(new ProcessFunction<String, ApacheLogEvent>() {
                @Override
                public void processElement(String line, Context ctx, Collector<ApacheLogEvent> out) throws Exception {
                    String[] dataArray = line.split(" ");
                    ApacheLogEvent logEvent = new ApacheLogEvent(Long.parseLong(dataArray[1]), dataArray[2]);
                    out.collect(logEvent);
                }
            })
            .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<ApacheLogEvent>(Time.seconds(5)) {
                @Override
                public long extractTimestamp(ApacheLogEvent element) {
                    return element.getEventTime();
                }
            })
            .keyBy("url")
            //For quicker testing this can be switched to a sliding window that fires every five minutes:
            //.timeWindow(Time.hours(1), Time.minutes(5))
            .timeWindow(Time.hours(1))
            .aggregate(new UrlCountAgg(), new WindowResult())
            .keyBy("windowEnd")
            .process(new KeyedProcessFunction<Tuple, ApacheUrlCount, String>() {
                //State buffering all per-URL counts for the current window
                private transient ValueState<List<ApacheUrlCount>> valueState;
                @Override
                public void open(Configuration parameters) throws Exception {
                    ValueStateDescriptor<List<ApacheUrlCount>> valueStateDescriptor = new ValueStateDescriptor<>(
                            "list-state", TypeInformation.of(new TypeHint<List<ApacheUrlCount>>() {}));
                    valueState = getRuntimeContext().getState(valueStateDescriptor);
                }
                //Process each element
                @Override
                public void processElement(ApacheUrlCount value, Context ctx, Collector<String> out) throws Exception {
                    //Read the buffered list from state
                    List<ApacheUrlCount> list = valueState.value();
                    if(null == list){
                        list = new ArrayList<ApacheUrlCount>();
                    }
                    list.add(value);
                    valueState.update(list);
                    //Register an event-time timer at windowEnd + 1 ms; it fires once the
                    //watermark passes the end of the window, i.e. all results have arrived
                    ctx.timerService().registerEventTimeTimer(value.getWindowEnd() + 1);
                }

                //When the timer fires, sort the buffered counts and emit the top N
                @Override
                public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
                    List<ApacheUrlCount> list = valueState.value();
                    list.sort(new Comparator<ApacheUrlCount>() {
                        public int compare(ApacheUrlCount o1, ApacheUrlCount o2) {
                            //Descending by count; Long.compare avoids int-cast overflow
                            return Long.compare(o2.getCount(), o1.getCount());
                        }
                    });

                    StringBuffer result = new StringBuffer();
                    result.append("Time: ").append(new Timestamp(timestamp - 1)).append("\n");
                    //Guard against windows with fewer than ten distinct URLs
                    for (int i = 0; i < Math.min(10, list.size()); i++) {
                        result.append("NO").append(i + 1).append(":")
                                .append(" URL=").append(list.get(i).getUrl())
                                .append(" count=").append(list.get(i).getCount()).append("\n");
                    }
                    valueState.clear();
                    out.collect(result.toString());

                    //Write the window's ranking into ElasticSearch
                    Map<String, Object> resultMap = new HashMap<>();
                    resultMap.put("data", result.toString());
                    resultMap.put("create_time", new Date(timestamp - 1));
                    EsClientUtils.getInstance().addIndexMap("url_hours_count", "url", resultMap);
                }
            });
        process.print();
        env.execute();
    }

    /**
     * Custom aggregate function: AggregateFunction<input type, accumulator type, output type>
     */
    public static class UrlCountAgg implements AggregateFunction<ApacheLogEvent, Long, Long> {
        public Long createAccumulator() {
            return 0L;
        }
        public Long add(ApacheLogEvent apacheLogEvent, Long aLong) {
            return aLong + 1;
        }
        public Long getResult(Long aLong) {
            return aLong;
        }
        public Long merge(Long aLong, Long acc1) {
            return aLong + acc1;
        }
    }

    /**
     * Window result wrapper: WindowFunction<aggregate output type, output type, key type, window type>
     */
    public static class WindowResult implements WindowFunction<Long, ApacheUrlCount, Tuple, TimeWindow> {
        public void apply(Tuple tuple, TimeWindow window, Iterable<Long> input, Collector<ApacheUrlCount> out) throws Exception {
            String url = tuple.getField(0);
            Long windowEnd = window.getEnd();
            Long count = input.iterator().next();
            out.collect(new ApacheUrlCount(url, windowEnd, count));
        }
    }


}
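
The EsClientUtils wrapper called in onTimer is likewise part of the repository rather than the post. A hypothetical sketch of what addIndexMap might look like on top of the Elasticsearch high-level REST client (the host, port, and singleton style are assumptions):

import org.apache.http.HttpHost;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import java.util.Map;

/**
 * Hypothetical sketch of EsClientUtils; the real class is in the GitHub repo.
 * Wraps a singleton RestHighLevelClient and indexes a Map as a JSON document.
 */
public class EsClientUtils {
    private static final EsClientUtils INSTANCE = new EsClientUtils();
    private final RestHighLevelClient client = new RestHighLevelClient(
            RestClient.builder(new HttpHost("node1.com", 9200, "http"))); //assumed ES host

    public static EsClientUtils getInstance() {
        return INSTANCE;
    }

    public void addIndexMap(String index, String type, Map<String, Object> source) throws Exception {
        //Index the map as a document; Elasticsearch generates the document id
        IndexRequest request = new IndexRequest(index, type).source(source);
        client.index(request, RequestOptions.DEFAULT);
    }
}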

4.1 Creating the topic in Kafka Eagle

(Screenshot: creating the test-top10 topic in Kafka Eagle)

4.2 Viewing the topic meta in Kafka Eagle

(Screenshot: topic meta in Kafka Eagle)

4.3 Viewing the results in Kibana (integrated with ElasticSearch)

(Screenshots: querying the url_hours_count index through Kibana)

5 Submitting the job to Flink in standalone mode

5.1 Flink dashboard overview

(Screenshot: Flink web dashboard)

5.2 Submitting the job and viewing the execution plan

(Screenshot: job submission and execution plan)
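
Besides the web UI, the packaged job can also be submitted with Flink's standard command-line client, for example `flink run -c com.pusidun.flink.streaming.urltopn.NginxLogUrlTopN <path-to-job-jar>` (the exact jar name depends on your build configuration).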

5.3 Flink job details

(Screenshot: running job details)

6 Source code

https://github.com/wang-xue-qiang/bigdata-analysis

