Spark Streaming: Reading JSON-Formatted Data from Kafka
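
All five methods below operate on a `DStream[ConsumerRecord[String, String]]` named `stream`, which the snippets assume already exists. A minimal sketch of how such a stream might be created with the spark-streaming-kafka-0-10 integration (the broker address, group id, and topic name here are placeholders):

    import org.apache.kafka.common.serialization.StringDeserializer
    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.kafka010.KafkaUtils
    import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
    import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

    val conf = new SparkConf().setAppName("KafkaJsonDemo")
    val ssc = new StreamingContext(conf, Seconds(5))

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",   // placeholder broker list
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "demo-group",                // placeholder consumer group
      "auto.offset.reset" -> "latest"
    )

    // stream is the DStream[ConsumerRecord[String, String]] used by every example below
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc, PreferConsistent, Subscribe[String, String](Array("demo-topic"), kafkaParams))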

  1. Method 1: parse the JSON string into a case class, producing an RDD[KafkaMessage], then convert it directly to a DataFrame

    import com.google.gson.Gson
    import org.apache.spark.sql.SparkSession

    // Parse each record's value into a KafkaMessage, then build a DataFrame per micro-batch.
    stream.map(record => handleMessage2CaseClass(record.value())).foreachRDD(rdd => {
      val spark = SparkSession.builder().config(rdd.sparkContext.getConf).getOrCreate()
      val df = spark.createDataFrame(rdd)
      df.show()
    })

    def handleMessage2CaseClass(jsonStr: String): KafkaMessage = {
      val gson = new Gson()
      gson.fromJson(jsonStr, classOf[KafkaMessage])
    }

    case class KafkaMessage(time: String, namespace: String, id: String, region: String, value: String, valueType: String)
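
  For illustration, a hypothetical message matching the `KafkaMessage` fields (the values are made up) and what parsing it yields:

    val sampleJson =
      """{"time":"2019-01-01 00:00:00","namespace":"ns1","id":"1","region":"cn-north","value":"42","valueType":"long"}"""
    println(handleMessage2CaseClass(sampleJson))
    // KafkaMessage(2019-01-01 00:00:00,ns1,1,cn-north,42,long)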
    
  2. Method 2: parse the JSON string into a tuple, producing an RDD[Tuple6], then convert it to a DataFrame

    import com.alibaba.fastjson.JSON
    import java.util.{LinkedHashMap => JLinkedHashMap}

    stream.map(record => handleMessage2Tuples(record.value())).foreachRDD(rdd => {
      val spark = SparkSession.builder().config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._
      val df = rdd.toDF("id", "value", "time", "valueType", "region", "namespace")
      df.show()
    })

    // Relies on LinkedHashMap preserving the JSON field order, so the column names
    // passed to toDF above must match the field order of the incoming messages.
    def handleMessage2Tuples(jsonStr: String): (String, String, String, String, String, String) = {
      import scala.collection.JavaConverters._
      val list = JSON.parseObject(jsonStr, classOf[JLinkedHashMap[String, Object]])
        .asScala.values.map(x => String.valueOf(x)).toList
      list match {
        case List(v1, v2, v3, v4, v5, v6) => (v1, v2, v3, v4, v5, v6)
      }
    }
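
  The tuple extraction above silently depends on the map's value order matching the columns handed to `toDF`. A hypothetical order-independent variant (`handleMessage2TuplesByKey` is not part of the original) looks each field up by key instead:

    // Hypothetical variant: fetch fields by key, so the JSON field order no longer matters.
    def handleMessage2TuplesByKey(jsonStr: String): (String, String, String, String, String, String) = {
      val obj = JSON.parseObject(jsonStr)
      (obj.getString("id"), obj.getString("value"), obj.getString("time"),
        obj.getString("valueType"), obj.getString("region"), obj.getString("namespace"))
    }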
    
  3. Method 3: parse the JSON string into a Row, producing an RDD[Row], then build a DataFrame from it with an explicit schema

    import org.apache.spark.sql.Row
    import org.apache.spark.sql.types.{StringType, StructField, StructType}

    val schema = StructType(List(
      StructField("id", StringType),
      StructField("value", StringType),
      StructField("time", StringType),
      StructField("valueType", StringType),
      StructField("region", StringType),
      StructField("namespace", StringType)))

    stream.map(record => handlerMessage2Row(record.value())).foreachRDD(rdd => {
      val spark = SparkSession.builder().config(rdd.sparkContext.getConf).getOrCreate()
      val df = spark.createDataFrame(rdd, schema)
      df.show()
    })

    def handlerMessage2Row(jsonStr: String): Row = {
      import scala.collection.JavaConverters._
      val array = JSON.parseObject(jsonStr, classOf[JLinkedHashMap[String, Object]])
        .asScala.values.map(x => String.valueOf(x)).toArray
      Row(array: _*)
    }
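
  As in Method 2, `handlerMessage2Row` relies on the parsed map's value order lining up with the schema's field order. A hypothetical schema-driven variant (`handlerMessage2RowByKey` is not in the original) assembles the Row by key lookup instead:

    // Hypothetical variant: build the Row in schema field order via key lookup.
    def handlerMessage2RowByKey(jsonStr: String): Row = {
      val obj = JSON.parseObject(jsonStr)
      Row(schema.fieldNames.map(name => String.valueOf(obj.get(name))): _*)
    }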
    
  4. Method 4: turn the RDD[String] into a Dataset directly, then apply a schema with from_json

    import org.apache.spark.sql.functions.from_json

    val schema = StructType(List(
      StructField("namespace", StringType),
      StructField("id", StringType),
      StructField("region", StringType),
      StructField("time", StringType),
      StructField("value", StringType),
      StructField("valueType", StringType)))

    stream.map(record => record.value()).foreachRDD(rdd => {
      val spark = SparkSession.builder().config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._
      val ds = spark.createDataset(rdd)
      // The Dataset[String] column is named "value" and is already a string, so the cast is
      // a no-op; from_json parses it against the schema and "value.*" flattens the struct.
      ds.select(from_json('value.cast("string"), schema) as "value").select($"value.*").show()
    })
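
  Records that fail to parse do not throw here; `from_json` yields nulls for them instead (a null struct on older Spark releases, a struct of null fields on newer ones). A sketch of dropping such rows before flattening, assuming well-formed messages always carry an `id`:

    ds.select(from_json('value, schema) as "value")
      .filter($"value.id".isNotNull)   // assumption: "id" is always present in valid messages
      .select($"value.*")
      .show()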
    
  5. Method 5: turn the RDD[String] into a Dataset directly, then convert it to a DataFrame with read.json

    stream.map(record => record.value()).foreachRDD(rdd => {
      val spark = SparkSession.builder().config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._
      // read.json parses the Dataset[String] of JSON records and infers the schema.
      val df = spark.read.json(spark.createDataset(rdd))
      df.show()
    })
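
  `read.json` infers the schema by scanning the records of every batch. When the structure is known up front, passing a schema explicitly (reusing the `StructType` from Method 4) skips that inference pass:

    stream.map(record => record.value()).foreachRDD(rdd => {
      val spark = SparkSession.builder().config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._
      // An explicit schema avoids re-inferring the structure on every micro-batch.
      val df = spark.read.schema(schema).json(spark.createDataset(rdd))
      df.show()
    })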
    
