Spark-Streaming处理Kafka数据——封装成对象方式
1、Spark Streaming从Kafka接收到的数据基本上都是字符串类型,我们如何将其封装成对象来处理呢?
首先准备Kafka中的数据:每条消息是一条Person记录,包含id、name、age三个字段,字段之间以空格分隔:
1 billy 14 2 lily 16 3 lilei 17 4 lucy 13 5 green 11 6 jim 17 7 tom 15
2、通过Spark Streaming来处理Kafka传过来的Person信息
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import kafka.serializer.StringDecoder
import scala.collection.immutable.HashMap
import scala.util.Try
import org.apache.log4j.Level
import org.apache.log4j.Logger

/**
 * Consumes space-separated "id name age" text records from the Kafka topic
 * "MyTopic", converts each record into a [[KafkaDataTest2.Person]], and for
 * every 3-second batch prints the people whose age is at least 15 using a
 * Spark SQL query over a temporary table.
 *
 * @author Administrator
 */
object KafkaDataTest2 {

  // The case class is declared at object level, ahead of main (the original
  // author notes this placement is required for the toDF() conversion below).
  case class Person(id: Long, name: String, age: Int)

  /**
   * Parses one raw Kafka message into a Person.
   *
   * The line is trimmed and split on runs of whitespace; the first three
   * tokens are read as id, name and age, and any further tokens are ignored.
   *
   * @param line raw message value received from Kafka
   * @return Some(person) when the line has at least three tokens and the id
   *         and age tokens are valid numbers; None otherwise, so that a single
   *         malformed record is dropped instead of failing the whole batch
   */
  private def parsePerson(line: String): Option[Person] =
    line.trim.split("\\s+") match {
      case Array(id, name, age, _*) => Try(Person(id.toLong, name, age.toInt)).toOption
      case _                        => None
    }

  /**
   * Entry point: builds the streaming pipeline and blocks until it terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Reduce log noise from Spark and the embedded Jetty server.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.ERROR)

    val conf = new SparkConf().setAppName("KafkaDataTest2").setMaster("local[3]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(3))

    val topics = Set("MyTopic")
    val brokers = "spark1:9092,spark2:9092,spark3:9092"
    // Only the broker list is passed here; the producer-side
    // "serializer.class" entry was dropped because this program only consumes.
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)

    // Receive the data from Kafka; each element is a (key, value) pair.
    val kafkaStream =
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
    val lines = kafkaStream.map(_._2)

    // Map each message to a Person, silently skipping records that do not parse.
    val personDStream = lines.flatMap(line => parsePerson(line).toList)

    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    // foreachRDD is the output operation that triggers processing of each batch.
    personDStream.foreachRDD { rdd =>
      import sqlContext.implicits._

      // Register the batch as a temporary table; the table name may match the
      // case class name but does not have to.
      rdd.toDF().registerTempTable("Person")

      // Filtering/aggregation can be expressed in SQL; the column names are the
      // field names of the Person case class.
      val selected = sqlContext.sql("select id,name,age from Person where age>=15")

      // Walk the query result partition by partition and print every row.
      selected.foreachPartition { rows =>
        rows.foreach { row =>
          val id = row.get(0)
          val name = row.get(1)
          val age = row.get(2)
          println(s"Person.id=$id, Person.name=$name, Person.age=$age")
        }
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Person.id=2, Person.name=lily, Person.age=16 Person.id=3, Person.name=lilei, Person.age=17 Person.id=6, Person.name=jim, Person.age=17 Person.id=7, Person.name=tom, Person.age=15