Creating a topic from the shell
/usr/bin/kafka-topics --create --zookeeper cdh1:2181/kafka --replication-factor 2 --partitions 4 --topic test-gpf-topic
--replication-factor sets the number of replicas, --partitions the number of partitions.
Listing topics:
/usr/bin/kafka-topics --zookeeper cdh3:2181/kafka --list
Console producer and consumer for a quick test:
/usr/bin/kafka-console-producer --broker-list cdh1:9092,cdh3:9092 --topic test-gpf-topic
/usr/bin/kafka-console-consumer --zookeeper cdh1:2181/kafka --topic test-gpf-topic
The commands above cover the basic shell operations.
Using the Java API
Dependency version
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.12</artifactId>
<version>1.0.1</version>
</dependency>
Producing data into a specified topic
Use new ProducerRecord with three arguments:
First argument: the topic name
Second argument: the key; records with the same key are routed to the same partition
Third argument: the value (the data itself)
package cn.huimin.sparkstreaming.producer;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.util.Properties;
public class Demo{
public static void main(String[] args) {
Properties props = new Properties();
props.put("bootstrap.servers", "cdh1:9092");
//1. acks=0: the producer considers a message written as soon as it goes out over the network; data will be lost if the broker never receives it
//2. acks=1: the leader returns a confirmation or error once it has received the message and written it to the partition's data file; data can still be lost if the leader fails before followers replicate it
//3. acks=all: the leader waits until all in-sync replicas have the message before returning a confirmation or error. Combined with min.insync.replicas this decides how many replicas must have the message before the write is acknowledged. It is the safest but slowest option; asynchronous sends and larger batches can offset some of the cost, at the price of higher latency
props.put("acks", "all");
//min.insync.replicas: the minimum number of in-sync replicas that must acknowledge a write; if it cannot be met the write is rejected and the producer gets an exception.
//Note: this is a broker/topic-level setting; placing it in the producer properties has no effect and is shown here only for reference.
props.put("min.insync.replicas", "2");
//batch size: 32 KB (32768); the default is 16 KB (16384)
props.put("batch.size", "32768");
//linger.ms: wait up to 200 ms so records can be batched before sending (default 0 ms)
props.put("linger.ms", "200");
//group.id is a consumer-side setting and is ignored by the producer
props.put("group.id", "test-1");
props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
Producer<String, String> producer = new KafkaProducer<String, String>(props);
try {
for (int i = 0; i < 1000; i++){
Thread.sleep(10);
producer.send(new ProducerRecord<String, String>("test-gpf-topic","bi2019",Integer.toString(i+1)));
}
}catch (Exception e){
e.printStackTrace();
} finally {
producer.close();
System.out.println("product send over");
}
}
}
Consuming from a specified topic
package cn.huimin.sparkstreaming.comsumer;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import java.util.Arrays;
import java.util.Properties;
public class Demo {
public static void main(String[] args) throws Exception {
String topic = "test-gpf-topic";
Properties props = new Properties();
props.setProperty("bootstrap.servers", "cdh1:9092");
props.setProperty("group.id", "gpf");
props.setProperty("enable.auto.commit", "true");
props.setProperty("auto.commit.interval.ms", "1000");
props.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
props.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
//consumer.subscribe(Arrays.asList("foo", "bar"));
consumer.subscribe(Arrays.asList("test-gpf-topic"));
while (true) {
ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(100));
for (ConsumerRecord<String, String> record : records)
System.out.printf("offset = %d, key = %s, value = %s, partition = %s%n", record.offset(), record.key(), record.value(),record.partition());
}
}
}
The code above covers basic Java API usage.
Consuming Kafka data with Spark Streaming
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.10</artifactId>
<version>1.6.0</version>
<!--<scope>provided</scope>-->
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka_2.10</artifactId>
<version>1.6.0</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.10.4</version>
</dependency>
We write the program in Scala:
package cn.huimin.spark.comsumer
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
object Demo2 {
def main(args: Array[String]) {
val brokers = "cdh3:9092"
val topics = "test-gpf-topic"
//batch interval in seconds (5 here)
val split_rdd_time = 5
// create the context
val sparkConf = new SparkConf()
.setAppName("SendSampleKafkaDataToApple").setMaster("local[*]")
.set("spark.app.id", "streaming_kafka")
val sc = new SparkContext(sparkConf)
sc.setLogLevel("WARN")
val ssc = new StreamingContext(sc, Seconds(split_rdd_time))
// ssc.checkpoint(checkpointDirectory)
// create a direct Kafka stream from the given brokers and topics
val topicsSet: Set[String] = topics.split(",").toSet
//Kafka configuration parameters
val kafkaParams: Map[String, String] = Map[String, String](
"metadata.broker.list" -> brokers,
"group.id" -> "apple_sample",
"serializer.class" -> "kafka.serializer.StringEncoder",
"auto.offset.reset" -> "largest" //自动将偏移重置为最新偏移(默认)
//,"auto.offset.reset" -> "earliest" //自动将偏移重置为最早的偏移
//,"auto.offset.reset" -> "none" //如果没有为消费者组找到以前的偏移,则向消费者抛出异常
)
/**
* Read Kafka data starting from the specified offsets.
* Note: the direct stream can give exactly-once semantics only if offsets are tracked (checkpoint or an external store)
* and the output is idempotent. The fromOffsets map below decides where reading starts, so to resume where the
* previous run stopped you must persist the last processed offsets and rebuild this map from them.
*/
val offsetList = List((topics, 0, 0L), (topics, 1, 0L), (topics, 2, 0L), (topics, 3, 0L)) //(topic, partition number, starting offset)
val fromOffsets = setFromOffsets(offsetList) //build the Map[TopicAndPartition, Long] parameter
val messageHandler = (mam: MessageAndMetadata[String, String]) => (String.valueOf(mam.topic+" $ "+mam.partition +" $ " +mam.offset +" $ "+mam.key()), mam.message()) //build a (metadata string, message) pair from each MessageAndMetadata
//consume from the specified offsets using the direct (simple consumer) API; for details see
//"http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.streaming.kafka.KafkaUtils$"
val messages: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
//process the data
messages.foreachRDD(rdd => {
rdd.foreachPartition(lines => {
lines.foreach(line => {
println("++++++++++++++++++++++++++++++此处记录offset+++++++++++++++++++++++++++++++++++++++")
println("The kafka line is :" + "当前metaData:"+line._1+" 当前数据:"+line._2)
println("+++++++++++++++++++++++++++++++此处消费数据操作++++++++++++++++++++++++++++++++++++++")
})
})
})
ssc.start()
ssc.awaitTermination()
}
//build the Map[TopicAndPartition, Long]
def setFromOffsets(list: List[(String, Int, Long)]): Map[TopicAndPartition, Long] = {
var fromOffsets: Map[TopicAndPartition, Long] = Map()
for (offset <- list) {
val tp = TopicAndPartition(offset._1, offset._2) //topic and partition number
fromOffsets += (tp -> offset._3) //starting offset for that partition
}
fromOffsets
}
}
Spark Streaming provides two ways to read a Kafka data stream: KafkaUtils.createStream (receiver-based) and KafkaUtils.createDirectStream.
The differences are as follows:
KafkaUtils.createStream (receiver-based)
Its form is KafkaUtils.createStream(ssc, [zk quorum], [consumer group id], [per-topic number of consumer threads]).
It uses receivers and Kafka's high-level consumer API. Everything the receivers pull is stored on the Spark executors, and Spark Streaming then launches jobs to process that data. Received data can be lost if an executor fails before the data is processed, unless the write-ahead log (WAL) is enabled; the WAL is stored on HDFS.
A. A receiver is created to pull data from Kafka. The RDD partitions of the resulting DStream are not the same thing as Kafka topic partitions, so increasing the number of partitions of a particular topic only increases the number of threads consuming that topic inside a single receiver; it does not increase Spark's processing parallelism.
B. For different groups and topics you can create multiple DStreams, each backed by its own receiver.
C. If the WAL is enabled, set an explicit storage level, e.g. KafkaUtils.createStream(..., StorageLevel.MEMORY_AND_DISK_SER). A minimal sketch follows below.
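A minimal sketch of the receiver-based approach against the same cluster as above. The group id "receiver_sample", the checkpoint path and the choice of two consumer threads are illustrative assumptions, not taken from the original program:

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object ReceiverDemo {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("receiver_kafka").setMaster("local[*]")
    // enable the write-ahead log so received data survives failures (point C above)
    sparkConf.set("spark.streaming.receiver.writeAheadLog.enable", "true")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    // the WAL requires a checkpoint directory (use an HDFS path in production)
    ssc.checkpoint("/tmp/receiver-checkpoint")

    // topic -> number of consumer threads inside the single receiver
    val topicMap = Map("test-gpf-topic" -> 2)
    val stream = KafkaUtils.createStream(
      ssc,
      "cdh1:2181/kafka",                 // ZooKeeper quorum used by the high-level consumer
      "receiver_sample",                 // consumer group id
      topicMap,
      StorageLevel.MEMORY_AND_DISK_SER)  // explicit storage level, as noted in point C

    stream.map(_._2).print()             // the stream yields (key, value) pairs; print the values

    ssc.start()
    ssc.awaitTermination()
  }
}

Note that with this approach parallelism is governed by the number of receivers, not by the number of Kafka partitions, which is exactly the limitation described in point A.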
KafkaUtils.createDirectStream
Unlike the receiver-based approach, this method periodically queries Kafka for the latest offsets of each topic+partition and processes each batch according to the resulting offset ranges, using Kafka's simple consumer API.
Advantages:
A. Simplified parallelism: there is no need for multiple Kafka input streams; it creates as many RDD partitions as there are Kafka partitions and reads from Kafka in parallel.
B. Efficiency: no WAL is required. The WAL approach writes the data twice: once when Kafka replicates it and once more into the WAL.
C. Exactly-once semantics: the traditional approach commits offsets to ZooKeeper through the high-level consumer API, and data can be lost or consumed more than once when the offsets in ZooKeeper and those tracked by the StreamingContext disagree. The direct approach uses the low-level API and keeps offsets only in the Spark checkpoint, removing that inconsistency. The drawback is that ZooKeeper-based Kafka monitoring tools can no longer see the consumer's progress.
We recommend KafkaUtils.createDirectStream here.
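If you do not need to start from hand-picked offsets, the overload that takes only a topic set is simpler; where reading starts is then decided by auto.offset.reset. The sketch below also shows how each batch's offset ranges can be read back through HasOffsetRanges so they could be stored externally. The app name, the group id "direct_sample" and the println-based "processing" are placeholder assumptions, not part of the original program:

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DirectDemo {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("direct_kafka").setMaster("local[*]")
    val ssc = new StreamingContext(sparkConf, Seconds(5))

    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> "cdh3:9092",
      "group.id" -> "direct_sample",
      "auto.offset.reset" -> "largest")   // or "smallest" to read from the beginning
    val topicsSet = Set("test-gpf-topic")

    // one RDD partition per Kafka partition; no explicit fromOffsets
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    messages.foreachRDD { rdd =>
      // the RDDs produced by the direct stream carry their Kafka offset ranges;
      // this cast must be done on the RDD before any transformation
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // process the batch first, then record where it ended (ZooKeeper, a database, ...)
      rdd.foreachPartition(_.foreach { case (key, value) => println(s"key=$key value=$value") })
      offsetRanges.foreach { o =>
        println(s"topic=${o.topic} partition=${o.partition} from=${o.fromOffset} until=${o.untilOffset}")
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}

With ssc.checkpoint(...) enabled the direct stream's offsets are also recovered automatically after a driver restart; the printing of offset ranges above only marks the spot where an external offset store would plug in.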