Kafka Exercise

Requirement: write a producer that continuously generates user behavior data and writes it to a Kafka topic.

Format of the generated (mock) data:

{"guid":1,"eventId":"pageview","timestamp":1637868346789}  isNew = 1

{"guid":1,"eventId":"addcard","timestamp":1637868347625}  isNew = 0

{"guid":2,"eventId":"collect","timestamp":1637868346321}

{"guid":3,"eventId":"paid","timestamp":1637868346782}

......

Then write a consumer that continuously consumes this user behavior data from Kafka and computes two statistics:

1. Every 5 seconds, output how many distinct users have been seen so far (UV, deduplicated).

2. Add a field to each record flagging whether the user is new: 1 if this guid appears for the first time, 0 otherwise.
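
All of the examples below assume a simple EventLog javabean matching the JSON format above (the original listing omits it); a minimal sketch with the getters/setters that fastjson and the consumers rely on:

package com.doit.kafaka;

// Minimal sketch of the EventLog bean assumed by every example below;
// fastjson serializes and deserializes it through these getters/setters.
public class EventLog {
    private int guid;         // user id
    private String eventId;   // event type, e.g. "pageview"
    private long timestamp;   // event time in epoch millis
    private int isNew;        // 1 if this guid appears for the first time, else 0

    public int getGuid() { return guid; }
    public void setGuid(int guid) { this.guid = guid; }
    public String getEventId() { return eventId; }
    public void setEventId(String eventId) { this.eventId = eventId; }
    public long getTimestamp() { return timestamp; }
    public void setTimestamp(long timestamp) { this.timestamp = timestamp; }
    public int getIsNew() { return isNew; }
    public void setIsNew(int isNew) { this.isNew = isNew; }
}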

Producer code example:

package com.doit.kafaka;

import com.alibaba.fastjson.JSON;
import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.RandomUtils;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;

import java.util.Properties;

//Requirement: a producer that continuously generates user behavior data and writes it to a Kafka topic
//  ==> build EventLog javabeans, format them to JSON strings with fastjson, and keep writing them to Kafka.
// Sample records:
//   {"guid":1,"eventId":"pageview","timestamp":1637868346789}  isNew = 1
//   {"guid":1,"eventId":"addcard","timestamp":1637868347625}   isNew = 0
// A consumer (below) then polls these records in a loop and:
//   1. every 5s prints the number of distinct users seen so far (UV) ==> scheduled with a Timer
//   2. flags each record with isNew = 1 the first time a guid appears, else 0 ==> check whether the guid occurred before
public class _Producer_uv {
    public static void main(String[] args) throws InterruptedException {
        Properties props = new Properties();
        props.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG,"linux01:9092,linux02:9092,linux03:9092");
        props.setProperty(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        props.setProperty(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,StringSerializer.class.getName());
        props.setProperty("value.serializer",StringSerializer.class.getName());

        KafkaProducer<String, String> producer = new KafkaProducer<>(props);
        // one bean is reused and re-serialized on every iteration
        EventLog eventLog = new EventLog();

        while (true) {
            // fill the bean with random data and serialize it to a JSON string
            eventLog.setEventId(RandomStringUtils.randomAlphabetic(10));
            eventLog.setGuid(RandomUtils.nextInt(10000, 100000));
            eventLog.setTimestamp(System.currentTimeMillis());
            String jsonString = JSON.toJSONString(eventLog);
            ProducerRecord<String, String> record = new ProducerRecord<>("event-log", jsonString);
            producer.send(record);
            producer.flush();   // forces every record out immediately; fine for a demo, costly in production
            Thread.sleep(RandomUtils.nextInt(10, 200));
        }
    }
}
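
Calling flush() after every single send() defeats the producer's batching. A hedged alternative for the loop body above (same producer and record as in the listing): pass a callback to send() so failures surface asynchronously, and flush only on shutdown.

            // replaces the producer.send(record); producer.flush(); pair in the loop above
            producer.send(record, (metadata, exception) -> {
                if (exception != null) {
                    exception.printStackTrace();   // in real code: log and/or retry
                }
            });
            // ... and only when shutting down:
            // producer.flush();
            // producer.close();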

Consumer code example, HashSet-based implementation:

package com.doit.kafaka;

import com.alibaba.fastjson.JSON;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;

import java.time.Duration;
import java.util.*;

//Consumer: continuously poll the user behavior data from Kafka and compute:
//  1. every 5s, print how many distinct users have been seen so far (UV, deduplicated) ==> scheduled with a Timer
//  2. flag each record with isNew = 1 if its guid appears for the first time, else 0 ==> check whether the guid occurred before
public class _Consumer_uv {
    public static void main(String[] args) {
        // a HashSet wrapped to be thread-safe: the consumer thread adds guids
        // while the Timer thread reads the size every 5 seconds
        Set<Integer> set = Collections.synchronizedSet(new HashSet<>());
        new Thread(new SetTask(set)).start();

        Timer timer = new Timer();
        timer.schedule(new TimerTask() {
            @Override
            public void run() {
                System.out.println("UV so far: " + set.size() + ", current time: " + System.currentTimeMillis());
            }
        }, 1000, 5000);
    }
}
class SetTask implements Runnable {
    private final Set<Integer> set;
    private final KafkaConsumer<String, String> consumer;

    public SetTask(Set<Integer> set) {
        Properties props = new Properties();
        props.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "linux01:9092");
        props.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        props.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "group02");
        props.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        props.setProperty(ConsumerConfig.ALLOW_AUTO_CREATE_TOPICS_CONFIG, "true");
        // auto-commit is disabled and the demo never commits manually,
        // so offsets are not persisted (the interval setting below is therefore inert)
        props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        props.setProperty(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "10000");

        consumer = new KafkaConsumer<>(props);
        this.set = set;
    }

    @Override
    public void run() {
        consumer.subscribe(Arrays.asList("event-log"));
        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(Integer.MAX_VALUE));
            for (ConsumerRecord<String, String> record : records) {
                // parse the JSON back into an EventLog and collect its guid for deduplication
                EventLog eventLog = JSON.parseObject(record.value(), EventLog.class);
                set.add(eventLog.getGuid());
            }
        }
    }
}

Implementing this with a HashSet clearly breaks down at scale: as distinct users keep arriving, the set grows without bound, tying up more and more memory and eventually causing an OutOfMemoryError.

Option 2: replace the HashSet with a bitmap for counting, which solves the problem nicely. The overall logic stays the same; the only local change is swapping the HashSet for a RoaringBitmap.
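
As a quick reference before the full consumer, a minimal standalone sketch of the RoaringBitmap operations it relies on (a compressed bitmap over int values, so memory stays small even with millions of guids):

package com.doit.kafaka;

import org.roaringbitmap.RoaringBitmap;

public class BitmapDemo {
    public static void main(String[] args) {
        RoaringBitmap bitmap = new RoaringBitmap();
        bitmap.add(10001);   // mark guid 10001 as seen
        bitmap.add(10001);   // adding the same value again is a no-op
        bitmap.add(99999);
        System.out.println(bitmap.contains(10001));    // true  -> seen before
        System.out.println(bitmap.getCardinality());   // 2     -> deduplicated count (UV)
    }
}

The full consumer with this change: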

package com.doit.kafaka;

import com.alibaba.fastjson.JSON;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.roaringbitmap.RoaringBitmap;

import java.time.Duration;
import java.util.*;

//Consumer: continuously poll the user behavior data from Kafka and compute:
//  1. every 5s, print how many distinct users have been seen so far (UV, deduplicated) ==> scheduled with a Timer
//  2. flag each record with isNew = 1 if its guid appears for the first time, else 0 ==> check whether the guid occurred before
public class _Consumer_uv2 {

    public static void main(String[] args) {
        // RoaringBitmap replaces the HashSet: a compressed bitmap whose memory stays
        // bounded no matter how many int guids are added. Note that RoaringBitmap is
        // not thread-safe; the racy read from the Timer thread is tolerable for a demo.
        RoaringBitmap bitmap = new RoaringBitmap();
        new Thread(new BitMapTask(bitmap)).start();

        Timer timer = new Timer();
        timer.schedule(new TimerTask() {
            @Override
            public void run() {
                // getCardinality() = number of distinct guids seen so far
                System.out.println("UV so far: " + bitmap.getCardinality() + ", current time: " + System.currentTimeMillis());
            }
        }, 1000, 5000);
    }
}
class BitMapTask implements Runnable {
    private final RoaringBitmap bitmap;
    private final KafkaConsumer<String, String> consumer;

    public BitMapTask(RoaringBitmap bitmap) {
        Properties props = new Properties();
        props.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "linux01:9092");
        props.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        props.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "group02");
        props.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        props.setProperty(ConsumerConfig.ALLOW_AUTO_CREATE_TOPICS_CONFIG, "true");
        // auto-commit disabled and never committed manually: offsets are not persisted in this demo
        props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        props.setProperty(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "10000");

        consumer = new KafkaConsumer<>(props);
        this.bitmap = bitmap;
    }

    @Override
    public void run() {
        consumer.subscribe(Arrays.asList("event-log"));
        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(Integer.MAX_VALUE));
            for (ConsumerRecord<String, String> record : records) {
                EventLog eventLog = JSON.parseObject(record.value(), EventLog.class);
                bitmap.add(eventLog.getGuid());   // set the bit for this guid; duplicates are no-ops
            }
        }
    }
}
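
One caveat: RoaringBitmap stores int values, which fits the int guid in this exercise. If the ids were long (or strings hashed to longs), Roaring64NavigableMap from the same library is the usual substitute; a minimal sketch, assuming the roaringbitmap dependency is already on the classpath:

package com.doit.kafaka;

import org.roaringbitmap.longlong.Roaring64NavigableMap;

public class Bitmap64Demo {
    public static void main(String[] args) {
        Roaring64NavigableMap bitmap64 = new Roaring64NavigableMap();
        bitmap64.addLong(9_000_000_001L);   // an id larger than Integer.MAX_VALUE
        System.out.println(bitmap64.contains(9_000_000_001L));   // true
        System.out.println(bitmap64.getLongCardinality());       // 1
    }
}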

Requirement 2, flagging whether a user has been seen before, can be handled with the bitmap as well, or alternatively with a Bloom filter.
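
A Bloom filter answers "possibly seen before" or "definitely never seen" in constant memory, at the cost of occasional false positives. A minimal standalone sketch of the Guava API (the consumer below uses the two-argument create(), which defaults to a ~3% false-positive rate; the three-argument overload makes the rate explicit):

package com.doit.kafaka;

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;

public class BloomDemo {
    public static void main(String[] args) {
        // 1,000,000 expected insertions, ~1% target false-positive rate
        BloomFilter<Long> bloom = BloomFilter.create(Funnels.longFunnel(), 1_000_000, 0.01);
        bloom.put(42L);
        System.out.println(bloom.mightContain(42L));   // true
        System.out.println(bloom.mightContain(43L));   // false (almost always)
    }
}

The full consumer using the Bloom filter: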


package com.doit.kafaka;

import com.alibaba.fastjson.JSON;
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;

import java.time.Duration;
import java.util.Arrays;
import java.util.Properties;

//Consumer: continuously poll the user behavior data from Kafka and compute:
//  1. every 5s, print how many distinct users have been seen so far (UV, deduplicated) ==> scheduled with a Timer
//  2. flag each record with isNew = 1 if its guid appears for the first time, else 0 ==> check whether the guid occurred before
public class _Consumer_uv4 {
    public static void main(String[] args) {
        // 1,000,000 expected insertions; this two-argument overload uses Guava's default ~3% false-positive rate
        BloomFilter<Long> bloom = BloomFilter.create(Funnels.longFunnel(), 1000000);
        Properties props = new Properties();
        props.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG,"linux01:9092");
        props.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        props.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,StringDeserializer.class.getName());
        props.setProperty(ConsumerConfig.GROUP_ID_CONFIG,"group02");
        props.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,"earliest");
        props.setProperty(ConsumerConfig.ALLOW_AUTO_CREATE_TOPICS_CONFIG,"true");
        props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG,"false");
        props.setProperty(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG,"10000");

        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        consumer.subscribe(Arrays.asList("event-log"));
        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(Integer.MAX_VALUE));
            for (ConsumerRecord<String, String> record : records) {
                EventLog eventLog = JSON.parseObject(record.value(), EventLog.class);
                // mightContain == false means the guid has definitely not been seen -> new user;
                // a false positive would wrongly mark a genuinely new user as old (isNew = 0)
                if (!bloom.mightContain((long) eventLog.getGuid())) {
                    eventLog.setIsNew(1);
                    bloom.put((long) eventLog.getGuid());
                } else {
                    eventLog.setIsNew(0);
                }
                System.out.println(JSON.toJSONString(eventLog));
            }
        }
    }
}
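
Note the trade-off: the Bloom filter runs in constant memory but can return false positives, so an occasional genuinely new user will be flagged with isNew = 0. It also cannot report a cardinality, so it only covers requirement 2; for the exact UV count of requirement 1, the bitmap (or another exact structure) is still needed.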


