The pom file is as follows:
<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>2.3.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
        <version>2.3.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>2.3.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
        <version>2.3.0</version>
    </dependency>
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <version>1.2.17</version>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.5</version>
    </dependency>
</dependencies>
The following code runs fine when the Kafka topics have no ACL authentication enabled:
package com.unistack.calc.structstream;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.OutputMode;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.apache.spark.sql.types.MetadataBuilder;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import static org.apache.spark.sql.functions.*;
import static org.apache.spark.sql.types.DataTypes.IntegerType;
public class StructedStreamTest2 {

    // Kafka broker list: 192.168.1.110:9093,192.168.1.111:9093,192.168.1.112:9093
    private static final String ips = "192.168.1.110:9093,192.168.1.111:9093,192.168.1.112:9093";

    public static void main(String[] args) {
        // JAAS configuration for SASL, only needed once authentication is enabled on the cluster
        // System.setProperty("java.security.auth.login.config","/Users/frank/Desktop/shell/lyh.conf");

        SparkSession spark = SparkSession
                .builder()
                .appName("app")
                .master("local[6]")
                .getOrCreate();
        // Stream 1: read records from topic yh1
        Dataset<Row> df1 = spark
                .readStream()
                .format("kafka")
                .option("kafka.bootstrap.servers", ips)
                .option("startingOffsets", "earliest")
                // .option("security.protocol","SASL_PLAINTEXT")
                // .option("sasl.mechanism","PLAIN")
                .option("subscribe", "yh1")
                .load();

        MetadataBuilder b = new MetadataBuilder();

        // Schema of the JSON payload in topic yh1
        StructField[] fields = {
                new StructField("id", IntegerType, true, b.build()),
                new StructField("age", IntegerType, true, b.build()),
                new StructField("height", IntegerType, true, b.build())
        };
        StructType type = new StructType(fields);

        // Parse the Kafka value column as JSON and flatten it into id/age/height
        Dataset<Row> d1 = df1
                .withWatermark("timestamp", "1 hours")
                .selectExpr("CAST(value AS STRING)")
                .select(from_json(col("value"), type).as("v"))
                .selectExpr("v.id", "v.age", "v.height");
        // Stream 2: read records from topic yh2
        Dataset<Row> df2 = spark
                .readStream()
                .format("kafka")
                .option("kafka.bootstrap.servers", ips)
                .option("startingOffsets", "earliest")
                // .option("security.protocol","SASL_PLAINTEXT")
                // .option("sasl.mechanism","PLAIN")
                .option("subscribe", "yh2")
                .load();

        // Schema of the JSON payload in topic yh2
        StructField[] fields2 = {
                new StructField("yh2_id", IntegerType, true, b.build()),
                new StructField("yh2_age", IntegerType, true, b.build()),
                new StructField("yh2_height", IntegerType, true, b.build())
        };
        StructType type2 = new StructType(fields2);

        Dataset<Row> d2 = df2
                .withWatermark("timestamp", "1 hours")
                .selectExpr("CAST(value AS STRING)")
                .select(from_json(col("value"), type2).as("v"))
                .selectExpr("v.yh2_id", "v.yh2_age", "v.yh2_height");
        // Stream-stream inner join on id = yh2_id, written to the console in append mode
        StreamingQuery query = d1.join(d2, expr("id = yh2_id"))
                .writeStream()
                .format("console")
                .outputMode(OutputMode.Append())
                .start();

        try {
            query.awaitTermination();
        } catch (StreamingQueryException e) {
            e.printStackTrace();
        }
        // Reference snippet (PySpark syntax) for writing the result back to a Kafka topic instead of the console:
        // query = df \
        //     .selectExpr("CAST(userId AS STRING) AS key", "to_json(struct(*)) AS value") \
        //     .writeStream \
        //     .format("kafka") \
        //     .option("kafka.bootstrap.servers", "host1:port1,host2:port2") \
        //     .option("topic", "topic1") \
        //     .option("checkpointLocation", "/path/to/HDFS/dir") \
        //     .start()
    }
}
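Once ACLs and SASL are enabled on the cluster, the commented-out options above are not sufficient: the Structured Streaming Kafka source only forwards settings that carry the "kafka." prefix to the underlying consumer. Below is a minimal sketch (not part of the original test) of how the yh1 source could be configured for SASL/PLAIN, reusing the spark session and the ips constant from the code above; the JAAS file path and the credentials in the sample KafkaClient section are placeholders.

// Minimal SASL/PLAIN sketch; Kafka client settings must carry the "kafka." prefix
// so that the Structured Streaming source passes them through to the consumer.
System.setProperty("java.security.auth.login.config", "/Users/frank/Desktop/shell/lyh.conf");

Dataset<Row> securedDf1 = spark
        .readStream()
        .format("kafka")
        .option("kafka.bootstrap.servers", ips)
        .option("kafka.security.protocol", "SASL_PLAINTEXT")
        .option("kafka.sasl.mechanism", "PLAIN")
        .option("startingOffsets", "earliest")
        .option("subscribe", "yh1")
        .load();

// The JAAS file referenced above would contain a KafkaClient section along these lines
// (username and password are placeholders):
// KafkaClient {
//     org.apache.kafka.common.security.plain.PlainLoginModule required
//     username="client"
//     password="client-secret";
// };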
References:
https://sonra.io/2017/11/27/advanced-spark-structured-streaming-aggregations-joins-checkpointing/
https://databricks.com/blog/2017/04/26/processing-data-in-apache-kafka-with-structured-streaming-in-apache-spark-2-2.html
The data in topic yh1 is as follows:
{"id":1,"age":1,"height":1}
The data in topic yh2 is as follows:
{"yh2_id":1,"yh2_age":1,"yh2_height":1}