aggDF
    .writeStream()
    .outputMode("complete")
    .format("console")
    .start();
writeStream
    .format("parquet") // can be "orc", "json", "csv", etc.
    .option("path", "path/to/destination/dir")
    .start()
writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("topic", "updates")
    .start()
writeStream
    .format("console")
    .start()
Memory sink: the output is stored in the driver as an in-memory table. Both Append and Complete output modes are supported. Because the entire output is collected and held in the driver's memory, this sink is intended for debugging on low data volumes; use it with caution.
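A minimal sketch of the memory sink, assuming the aggregated streaming DataFrame aggDF from the first snippet and a SparkSession named spark; the query name "aggregates" is arbitrary and becomes the table name:
aggDF
    .writeStream()
    .queryName("aggregates") // the in-memory table will be named "aggregates"
    .outputMode("complete")
    .format("memory")
    .start();
// the table can then be queried on the driver while the stream runs
spark.sql("select * from aggregates").show();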
// Foreach
streamingDatasetOfString.writeStream().foreach(
    new ForeachWriter<String>() {
        @Override public boolean open(long partitionId, long version) {
            // Open connection
            return true;
        }
        @Override public void process(String record) {
            // Write string to connection
        }
        @Override public void close(Throwable errorOrNull) {
            // Close the connection
        }
    }
).start();
// ForeachBatch
streamingDatasetOfString.writeStream().foreachBatch(
    new VoidFunction2<Dataset<String>, Long>() {
        public void call(Dataset<String> dataset, Long batchId) {
            // Transform and write batchDF
        }
    }
).start();
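As a concrete illustration of the foreachBatch skeleton above, a minimal sketch that reuses an ordinary batch writer inside each micro-batch; the Parquet output path /tmp/foreachBatch-output is a placeholder, not part of the original example:
streamingDatasetOfString.writeStream().foreachBatch(
    new VoidFunction2<Dataset<String>, Long>() {
        public void call(Dataset<String> dataset, Long batchId) {
            // each micro-batch is a normal Dataset, so the batch write APIs apply
            dataset.write()
                .mode("append")
                .parquet("/tmp/foreachBatch-output"); // placeholder path
        }
    }
).start();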
In business scenarios, aggregating data by time period is a common requirement. Spark supports event-time aggregation over sliding windows: each window acts as a group, and every row falling into that window is aggregated within its group.
Windowed aggregations are expressed with the groupBy() and window() operations.
Dataset<Row> words = ... // streaming DataFrame of schema { timestamp: Timestamp, word: String }
// Group the data by window and word and compute the count of each group
Dataset<Row> windowedCounts = words.groupBy(
    functions.window(words.col("timestamp"), "10 minutes", "5 minutes"),
    words.col("word")
).count();
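A sketch of how such a windowed query might be started; without a watermark an aggregation cannot use append output mode, so complete mode is used here, and the console sink is only an assumption for illustration:
windowedCounts.writeStream()
    .outputMode("complete") // append mode is not allowed for aggregations without a watermark
    .format("console")
    .option("truncate", false)
    .start()
    .awaitTermination();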
Watermarking addresses how to handle late-arriving data: it defines how late an event may be before it is dropped, which also allows the system to discard aggregation state that has expired.
Dataset<Row> words = ... // streaming DataFrame of schema { timestamp: Timestamp, word: String }
// Group the data by window and word and compute the count of each group
Dataset<Row> windowedCounts = words
    .withWatermark("timestamp", "10 minutes") // data arriving more than 10 minutes late may be dropped
    .groupBy(
        window(col("timestamp"), "10 minutes", "5 minutes"),
        col("word"))
    .count();
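With the watermark in place, the same aggregation can also run in append output mode, emitting each window's result once the watermark passes the end of the window. A minimal sketch, again assuming a console sink:
windowedCounts.writeStream()
    .outputMode("append") // results are emitted only after the watermark moves past the window end
    .format("console")
    .start()
    .awaitTermination();
The complete runnable example below combines a socket source and a Kafka source with this kind of watermarked window aggregation.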
package com.penngo.spark;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.io.Serializable;
import java.sql.Timestamp;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.window;
public class SparkStructStream {
private static final DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
public static class DataTxt implements Serializable {
private String text;
private Timestamp time;
public DataTxt(String text, LocalDateTime time) {
this.text = text;
this.time = Timestamp.valueOf(time);
}
public String getText() {
return text;
}
public void setText(String text) {
this.text = text;
}
public Timestamp getTime() {
return time;
}
public void setTime(Timestamp time) {
this.time = time;
}
}
public static void socket(SparkSession spark) throws Exception{
// Run: nc -lk 9999
Dataset<Row> lines = spark
.readStream()
.format("socket")
.option("host", "localhost")
.option("port", 9999)
.load();
Dataset<DataTxt> words = lines
.as(Encoders.STRING())
.map((MapFunction<String, DataTxt>) x -> {
String[] strs = x.split(",");
LocalDateTime date = LocalDateTime.parse(strs[1], formatter);
DataTxt data = new DataTxt(strs[0], date);
return data;
}, Encoders.bean(DataTxt.class));
Dataset<Row> wordCounts = words.toDF()
.withWatermark("time", "10 minutes") // 延迟10分钟后到达的数据将会被丢弃
.groupBy(
window(col("time"), "10 minutes", "5 minutes"),
col("text"))
.count();
wordCounts.writeStream().outputMode("append")
.foreach(new ForeachWriter<Row>() {
@Override public boolean open(long partitionId, long version) {
// System.out.println("open==========partitionId:" + partitionId + ",version:" + version);
return true;
}
@Override public void process(Row record) {
// Write string to connection
System.out.println("recordxxxxxxxxxxxxxxxxxx:======" + record);
}
@Override public void close(Throwable errorOrNull) {
// Close the connection
// System.out.println("close==========errorOrNull:" + errorOrNull);
}
})
// .format("console")
.start().awaitTermination();
}
public static void kafka(SparkSession spark) throws Exception{
// Subscribe to 1 topic
Dataset<Row> df = spark
.readStream()
.format("kafka")
.option("kafka.bootstrap.servers", "192.168.245.1:9092")
.option("subscribe", "topic-news")
.option("startingOffsets","latest")
.option("maxOffsetsPerTrigger",1000)
.load();
df = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)");
df.printSchema();
df.writeStream().outputMode("append")
.format("console")
.start().awaitTermination();
}
public static void main(String[] args) throws Exception{
Logger.getLogger("org.apache.spark").setLevel(Level.WARN);
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF);
Logger.getLogger("org.apache.kafka").setLevel(Level.WARN);
System.setProperty("hadoop.home.dir", "/usr/local/hadoop-3.3.6");
System.setProperty("HADOOP_USER_NAME", "root");
SparkSession spark = SparkSession
.builder()
.appName("SparkStructStream")
.master("local[*]")
.getOrCreate();
// socket(spark);
kafka(spark);
}
}
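For the socket() method above, each line typed into nc -lk 9999 is expected to be a text,timestamp pair whose timestamp matches the yyyy-MM-dd HH:mm:ss pattern, for example (illustrative values):
hello,2024-05-20 10:01:30
world,2024-05-20 10:02:10
hello,2024-05-20 10:03:45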
Adapted from the official documentation: https://spark.apache.org/docs/3.1.2/structured-streaming-programming-guide.html