Requirement: write a producer that continuously generates user-behavior events and writes them to a Kafka topic.
Format of the generated (fabricated) data:
{"guid":1,"eventId":"pageview","timestamp":1637868346789} isNew = 1
{"guid":1,"eventId":"addcard","timestamp":1637868347625} isNew = 0
{"guid":2,"eventId":"collect","timestamp":1637868346321}
{"guid":3,"eventId":"paid","timestamp":1637868346782}
......
Then write a consumer that continuously reads these events from Kafka and computes two statistics:
1. Every 5 s, output the number of distinct users seen so far (UV).
2. Add a field to each record marking whether the user id appears for the first time: 1 if it is the first occurrence, otherwise 0.
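All the examples below reference an EventLog bean that is never shown in the original; here is a minimal sketch, with field names matching the JSON format above plus the isNew flag (fastjson serializes it through these getters/setters):

package com.doit.kafaka;

// Minimal sketch of the EventLog bean assumed by the examples below
public class EventLog {
    private int guid;          // user id
    private String eventId;    // event type, e.g. "pageview"
    private long timestamp;    // event time in epoch millis
    private int isNew;         // 1 = first time this guid is seen, 0 = returning

    public int getGuid() { return guid; }
    public void setGuid(int guid) { this.guid = guid; }
    public String getEventId() { return eventId; }
    public void setEventId(String eventId) { this.eventId = eventId; }
    public long getTimestamp() { return timestamp; }
    public void setTimestamp(long timestamp) { this.timestamp = timestamp; }
    public int getIsNew() { return isNew; }
    public void setIsNew(int isNew) { this.isNew = isNew; }
}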
Producer example:
package com.doit.kafaka;
import com.alibaba.fastjson.JSON;
import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.RandomUtils;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;
import java.util.Properties;
// Requirement: a producer that keeps generating user-behavior events and writes them to a Kafka topic
//   ==> fabricate some data, then keep writing it to Kafka (producer)
// Data format (fabricated):
//   {"guid":1,"eventId":"pageview","timestamp":1637868346789}  isNew = 1   ==> fastjson ==> javabean: create an object, format it as a JSON string
//   {"guid":1,"eventId":"addcard","timestamp":1637868347625}   isNew = 0
//   {"guid":2,"eventId":"collect","timestamp":1637868346321}
//   {"guid":3,"eventId":"paid","timestamp":1637868346782}
//   ......
// Then a consumer that keeps reading these events from Kafka and computes statistics ==> poll + for loop (the actual logic)
//   1. Every 5 s, output the number of distinct users seen so far (UV)  ==> every 5 s ==> scheduler: Timer
//   2. Mark each record: 1 if the guid appears for the first time, otherwise 0  ==> check whether the guid has been seen before
public class _Producer_uv {
    public static void main(String[] args) throws InterruptedException {
        Properties props = new Properties();
        props.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "linux01:9092,linux02:9092,linux03:9092");
        props.setProperty(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        props.setProperty(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        KafkaProducer<String, String> producer = new KafkaProducer<>(props);
        EventLog eventLog = new EventLog();
        while (true) {
            // fabricate one event: random eventId, random guid, current time
            eventLog.setEventId(RandomStringUtils.randomAlphabetic(10));
            eventLog.setGuid(RandomUtils.nextInt(10000, 100000));
            eventLog.setTimestamp(System.currentTimeMillis());
            String jsonString = JSON.toJSONString(eventLog);
            ProducerRecord<String, String> record = new ProducerRecord<>("event-log", jsonString);
            producer.send(record);
            producer.flush();  // per-message flush keeps the demo simple; see the note below
            Thread.sleep(RandomUtils.nextInt(10, 200));
        }
    }
}
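A note on the producer above: send() is asynchronous, and calling flush() after every record blocks until delivery, which defeats the producer's batching; that is fine in a demo but not for throughput. If delivery results matter, a callback avoids the blocking flush (a sketch, not part of the original code):

producer.send(record, (metadata, exception) -> {
    if (exception != null) {
        exception.printStackTrace();  // delivery failed; log or retry as appropriate
    }
});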
Consumer example, implemented with a HashSet:
package com.doit.kafaka;
import com.alibaba.fastjson.JSON;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import java.time.Duration;
import java.util.*;
// A consumer that keeps reading the user-behavior events from Kafka and computes statistics ==> poll + for loop (the actual logic)
//   1. Every 5 s, output the number of distinct users seen so far (UV)  ==> every 5 s ==> scheduler: Timer
//   2. Mark each record: 1 if the guid appears for the first time, otherwise 0  ==> check whether the guid has been seen before
public class _Consumer_uv {
    public static void main(String[] args) {
        // shared between the consumer thread and the Timer thread;
        // HashSet itself is not thread-safe -- see the note after this example
        HashSet<Integer> set = new HashSet<>();
        new Thread(new SetTask(set)).start();
        Timer timer = new Timer();
        timer.schedule(new TimerTask() {
            @Override
            public void run() {
                System.out.println("UV so far: " + set.size() + ", current time: " + System.currentTimeMillis());
            }
        }, 1000, 5000);
    }
}
class SetTask implements Runnable {
    private HashSet<Integer> set;
    private KafkaConsumer<String, String> consumer;

    public SetTask(HashSet<Integer> set) {
        Properties props = new Properties();
        props.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "linux01:9092");
        props.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        props.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "group02");
        props.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        props.setProperty(ConsumerConfig.ALLOW_AUTO_CREATE_TOPICS_CONFIG, "true");
        props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        props.setProperty(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "10000"); // no effect while auto-commit is disabled
        consumer = new KafkaConsumer<>(props);
        this.set = set;
    }

    @Override
    public void run() {
        consumer.subscribe(Arrays.asList("event-log"));
        while (true) {
            // block until records arrive
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(Integer.MAX_VALUE));
            for (ConsumerRecord<String, String> record : records) {
                String value = record.value();
                EventLog eventLog = JSON.parseObject(value, EventLog.class);
                int guid = eventLog.getGuid();
                set.add(guid);  // the set deduplicates guids
            }
        }
    }
}
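Besides the memory problem discussed next, note that the HashSet is written by the consumer thread while the Timer thread reads size(); HashSet is not thread-safe. A minimal fix (not in the original code) is to wrap it:

Set<Integer> set = Collections.synchronizedSet(new HashSet<>());  // java.util.Collections, covered by the java.util.* import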
Implementing this with a HashSet obviously runs into trouble: as the data volume keeps growing it will eventually cause an OOM, and it consumes more and more resources, hurting machine performance!
Solution 2: replace the HashSet with a bitmap for counting. The overall logic stays the same; the only local change is swapping the HashSet for a bitmap.
package com.doit.kafaka;
import com.alibaba.fastjson.JSON;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.roaringbitmap.RoaringBitmap;
import java.time.Duration;
import java.util.*;
// A consumer that keeps reading the user-behavior events from Kafka and computes statistics ==> poll + for loop (the actual logic)
//   1. Every 5 s, output the number of distinct users seen so far (UV)  ==> every 5 s ==> scheduler: Timer
//   2. Mark each record: 1 if the guid appears for the first time, otherwise 0  ==> check whether the guid has been seen before
public class _Consumer_uv2 {
    public static void main(String[] args) {
        // HashSet<Integer> set = new HashSet<>();  // replaced by a bitmap
        // note: like HashSet, RoaringBitmap is not thread-safe, so the same caveat applies
        RoaringBitmap bitmap = new RoaringBitmap();
        new Thread(new BitMapTask(bitmap)).start();
        Timer timer = new Timer();
        timer.schedule(new TimerTask() {
            @Override
            public void run() {
                System.out.println("UV so far: " + bitmap.getCardinality() + ", current time: " + System.currentTimeMillis());
            }
        }, 1000, 5000);
    }
}
class BitMapTask implements Runnable {
    private RoaringBitmap bitmap;
    private KafkaConsumer<String, String> consumer;

    public BitMapTask(RoaringBitmap bitmap) {
        Properties props = new Properties();
        props.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "linux01:9092");
        props.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        props.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "group02");
        props.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        props.setProperty(ConsumerConfig.ALLOW_AUTO_CREATE_TOPICS_CONFIG, "true");
        props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        props.setProperty(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "10000"); // no effect while auto-commit is disabled
        consumer = new KafkaConsumer<>(props);
        this.bitmap = bitmap;
    }

    @Override
    public void run() {
        consumer.subscribe(Arrays.asList("event-log"));
        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(Integer.MAX_VALUE));
            for (ConsumerRecord<String, String> record : records) {
                String value = record.value();
                EventLog eventLog = JSON.parseObject(value, EventLog.class);
                int guid = eventLog.getGuid();
                bitmap.add(guid);  // one bit per guid, far smaller than a HashSet of boxed Integers
            }
        }
    }
}
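As a side note (this variant is not in the original code): RoaringBitmap's checkedAdd returns whether the value was actually new, so requirement 2 can be answered by the very call that deduplicates. Inside the poll loop above:

int guid = eventLog.getGuid();
boolean firstTime = bitmap.checkedAdd(guid);  // true only if guid was not already present
eventLog.setIsNew(firstTime ? 1 : 0);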
Requirement 2, deciding whether a user has been seen before, can be handled with a bitmap (see the checkedAdd sketch above), or with a Bloom filter:
package com.doit.kafaka;
import com.alibaba.fastjson.JSON;
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import java.time.Duration;
import java.util.Arrays;
import java.util.Properties;
// A consumer that keeps reading the user-behavior events from Kafka ==> poll + for loop (the actual logic)
//   1. Every 5 s, output the number of distinct users seen so far (UV)  ==> scheduler: Timer
//   2. Mark each record: 1 if the guid appears for the first time, otherwise 0  ==> check whether the guid has been seen before
public class _Consumer_uv4 {
    public static void main(String[] args) {
        // Bloom filter sized for 1,000,000 expected insertions (this two-argument
        // factory method uses Guava's default false-positive probability of about 3%)
        BloomFilter<Long> bloom = BloomFilter.create(Funnels.longFunnel(), 1000000);
        Properties props = new Properties();
        props.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "linux01:9092");
        props.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        props.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "group02");
        props.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        props.setProperty(ConsumerConfig.ALLOW_AUTO_CREATE_TOPICS_CONFIG, "true");
        props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        props.setProperty(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "10000"); // no effect while auto-commit is disabled
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        consumer.subscribe(Arrays.asList("event-log"));
        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(Integer.MAX_VALUE));
            for (ConsumerRecord<String, String> record : records) {
                String value = record.value();
                EventLog eventLog = JSON.parseObject(value, EventLog.class);
                boolean flag = bloom.mightContain((long) eventLog.getGuid());
                if (!flag) {
                    // definitely not seen before: mark as new and record it
                    eventLog.setIsNew(1);
                    bloom.put((long) eventLog.getGuid());
                } else {
                    // probably seen before (Bloom filters can report false positives)
                    eventLog.setIsNew(0);
                }
                System.out.println(JSON.toJSONString(eventLog));
            }
        }
    }
}
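Worth remembering when choosing this approach: a Bloom filter can return false positives but never false negatives, so with the logic above a genuinely new user may occasionally be mislabeled isNew = 0, while a returning user is never mislabeled as new. Guava also lets you trade memory for accuracy by passing an explicit false-positive probability (a sketch; the two-argument create above defaults to roughly 3%):

BloomFilter<Long> bloom = BloomFilter.create(Funnels.longFunnel(), 1000000, 0.01);  // ~1% false positives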