1. Data preparation
Write Avro records into Kafka through the Kafka REST Proxy (kafka-rest).
import java.util.Random;

public class Test {
    public static void main(String[] args) {
        String url = "http://node9:8082/topics/ztwo";
        int x = 1;
        Random random = new Random();
        while (true) {
            int i = random.nextInt();
            String json;
            // Each request body carries the Avro value schema plus one record;
            // the "name" field alternates between "one" and "two".
            if (i % 2 == 0) {
                json = "{\"value_schema\": \"{\\\"type\\\": \\\"record\\\", \\\"name\\\": \\\"news_doc\\\", \\\"fields\\\": [{\\\"name\\\": \\\"name\\\", \\\"type\\\": \\\"string\\\"},{\\\"name\\\": \\\"time\\\", \\\"type\\\": \\\"long\\\"}]}\", \"records\": [{\"value\": {\"name\": \"one\",\"time\":1553069910680}}]}";
            } else {
                json = "{\"value_schema\": \"{\\\"type\\\": \\\"record\\\", \\\"name\\\": \\\"news_doc\\\", \\\"fields\\\": [{\\\"name\\\": \\\"name\\\", \\\"type\\\": \\\"string\\\"},{\\\"name\\\": \\\"time\\\", \\\"type\\\": \\\"long\\\"}]}\", \"records\": [{\"value\": {\"name\": \"two\",\"time\":1553069910680}}]}";
            }
            x++;
            System.out.println(x);
            HttpRequest.sendPost(url, json);
            try {
                Thread.sleep(3000);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}
The HttpRequest.sendPost helper used above (Apache HttpClient 4.x):

import java.io.IOException;
import java.nio.charset.Charset;

import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;

public class HttpRequest {
    public static String sendPost(String url, String data) {
        HttpClient httpClient = new DefaultHttpClient();
        HttpPost post = new HttpPost(url);
        // The REST Proxy expects the Avro-embedded JSON content type.
        post.setHeader("Content-type", "application/vnd.kafka.avro.v1+json");
        StringEntity entity = new StringEntity(data, Charset.forName("UTF-8"));
        entity.setContentEncoding("UTF-8");
        post.setEntity(entity);
        try {
            HttpResponse response = httpClient.execute(post);
            return EntityUtils.toString(response.getEntity(), "utf-8");
        } catch (IOException e) {
            e.printStackTrace();
            return e.getMessage();
        }
    }
}
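Unescaped, the request body posted by sendPost has the following shape; the application/vnd.kafka.avro.v1+json format embeds the Avro value schema (as a JSON string) next to the records:

{
  "value_schema": "{\"type\": \"record\", \"name\": \"news_doc\", \"fields\": [{\"name\": \"name\", \"type\": \"string\"},{\"name\": \"time\", \"type\": \"long\"}]}",
  "records": [
    { "value": { "name": "one", "time": 1553069910680 } }
  ]
}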
2. Reading and parsing the data with Spark
package cn

import io.confluent.kafka.schemaregistry.client.{CachedSchemaRegistryClient, SchemaRegistryClient}
import io.confluent.kafka.serializers.AbstractKafkaAvroDeserializer
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.avro._

/**
 * Consumes Confluent-Avro records from Kafka with Structured Streaming,
 * deserializing them against the Schema Registry.
 */
object App {
  private var schemaRegistryClient: SchemaRegistryClient = _
  private var kafkaAvroDeserializer: AvroDeserializer = _

  // The Schema Registry stores the value schema under the subject "<topic>-value".
  def getTopicSchema(topic: String) = {
    schemaRegistryClient.getLatestSchemaMetadata(topic + "-value").getSchema
  }

  // Convert the Avro schema string into a Spark SQL schema.
  def avroSchemaToSparkSchema(avroSchema: String) = {
    SchemaConverters.toSqlType(new Schema.Parser().parse(avroSchema))
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("kafka-structured").set("spark.testing.memory", "2147480000")
      .setMaster("local[*]")
    val spark = SparkSession.builder()
      .config(conf)
      .getOrCreate()
    val bootstrapServers = "node9:9092"
    val topic = "ztwo"
    val schemaRegistryUrl = "http://node9:8081"
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.WARN)
    Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.WARN)
    consumeAvro(spark, bootstrapServers, topic, schemaRegistryUrl)
    spark.stop()
  }
  private def consumeAvro(spark: SparkSession, bootstrapServers: String, topic: String, schemaRegistryUrl: String): Unit = {
    import spark.implicits._
    schemaRegistryClient = new CachedSchemaRegistryClient(schemaRegistryUrl, 128)
    kafkaAvroDeserializer = new AvroDeserializer(schemaRegistryClient)
    // Turn each Confluent-Avro message into its JSON string form.
    spark.udf.register("deserialize", (bytes: Array[Byte]) =>
      kafkaAvroDeserializer.deserialize(bytes)
    )
    val rawDf = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServers)
      .option("subscribe", topic)
      .option("startingOffsets", "earliest")
      .option("group.id", "1")
      .load()

    import org.apache.spark.sql.functions._
    val jsonDf = rawDf.select(callUDF("deserialize", 'value).as("value"))
    // Fetch the latest value schema from the Schema Registry and map it to a Spark schema.
    val dfValueSchema = {
      val rawSchema = getTopicSchema(topic)
      avroSchemaToSparkSchema(rawSchema)
    }
    val parsedDf = jsonDf.select(
      from_json('value, dfValueSchema.dataType).alias("value")
    ).select($"value.*")

    parsedDf.createTempView(topic)
    val output = spark.sql("select name, count(*) from " + topic + " group by name")
    output.writeStream
      .format("console")
      .outputMode("complete")
      //.outputMode("append")
      .start()
      .awaitTermination()
  }
  class AvroDeserializer extends AbstractKafkaAvroDeserializer {
    def this(client: SchemaRegistryClient) {
      this()
      this.schemaRegistry = client
    }

    // Decode the Confluent wire format (magic byte + schema id + Avro body)
    // and return the record as a JSON string.
    override def deserialize(bytes: Array[Byte]): String = {
      val value = super.deserialize(bytes)
      value match {
        case str: String =>
          str
        case _ =>
          val genericRecord = value.asInstanceOf[GenericRecord]
          if (genericRecord == null) {
            // No record (e.g. a null message): return null.
            null
          } else {
            genericRecord.toString
          }
      }
    }
  }
}
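The custom deserializer is needed because values produced through the Schema Registry are not plain Avro: each message starts with a magic byte and a 4-byte schema id before the Avro body. As a rough alternative (a sketch, not part of the original code, assuming every message was written with the topic's latest, unchanged schema), spark-avro 2.4's from_avro could decode the value inside consumeAvro once that 5-byte header is stripped:

    // Sketch: decode the Confluent-framed value with spark-avro's from_avro instead of the UDF.
    import org.apache.spark.sql.avro.from_avro

    val valueSchemaJson = getTopicSchema(topic) // Avro schema as a JSON string
    val parsedAlt = rawDf
      // skip the 1-byte magic + 4-byte schema id prepended by the Confluent serializer
      .selectExpr("substring(value, 6, length(value) - 5) as payload")
      .select(from_avro($"payload", valueSchemaJson).as("value"))
      .select($"value.*")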
3. pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0">
  <modelVersion>4.0.0</modelVersion>
  <groupId>cn.golaxy</groupId>
  <artifactId>kafka-spark-connector</artifactId>
  <version>1.0-SNAPSHOT</version>
  <inceptionYear>2008</inceptionYear>
  <properties>
    <scala.version>2.12.7</scala.version>
  </properties>
  <repositories>
    <repository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </repository>
    <repository>
      <id>confluent</id>
      <url>http://packages.confluent.io/maven/</url>
    </repository>
  </repositories>
  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>
  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.12</artifactId>
      <version>2.4.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.12</artifactId>
      <version>2.4.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.12</artifactId>
      <version>2.4.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka_2.12</artifactId>
      <version>2.1.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
      <version>2.4.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql-kafka-0-10_2.12</artifactId>
      <version>2.4.0</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.38</version>
    </dependency>
    <dependency>
      <groupId>com.thoughtworks.paranamer</groupId>
      <artifactId>paranamer</artifactId>
      <version>2.8</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-avro_2.12</artifactId>
      <version>2.4.0</version>
    </dependency>
    <dependency>
      <groupId>com.databricks</groupId>
      <artifactId>spark-avro_2.11</artifactId>
      <version>4.0.0</version>
    </dependency>
    <dependency>
      <groupId>io.confluent</groupId>
      <artifactId>kafka-avro-serializer</artifactId>
      <version>3.2.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.avro</groupId>
      <artifactId>avro</artifactId>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.4</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.specs</groupId>
      <artifactId>specs</artifactId>
      <version>1.2.5</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
          <args>
            <arg>-target:jvm-1.8</arg>
          </args>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>