Flink reference
DataStream is the API that Flink exposes to users for both stream and batch computation; it wraps the underlying streaming execution model so that users can program against it conveniently.
The general workflow is: obtain a StreamExecutionEnvironment, attach one or more sources, apply transformations, attach sinks, and call execute(), as sketched below. The required Maven dependencies and a complete Kafka-to-Kafka example follow.
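A minimal sketch of that workflow, using a hypothetical in-memory source and a print sink (neither appears in the Kafka example later in this reference):

import org.apache.flink.streaming.api.scala._

object WorkflowSketch {
  def main(args: Array[String]): Unit = {
    // 1. obtain the streaming execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // 2. attach a source (here a small in-memory collection)
    val source = env.fromElements("a", "b", "c")
    // 3. apply transformations
    val upper = source.map(_.toUpperCase)
    // 4. attach a sink
    upper.print()
    // 5. launch the job
    env.execute("workflow-sketch")
  }
}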
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-scala_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-core</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-shaded-hadoop-2-uber</artifactId>
    <version>2.4.1-9.0</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>${kafka.version}</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.62</version>
</dependency>
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka_2.11</artifactId>
    <version>${kafka.version}</version>
</dependency>
The complete example below consumes the event_attendees topic, flattens each record into (event, user, status) triples, and writes the result back to Kafka.

import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer, FlinkKafkaProducer}
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.producer.ProducerConfig

object FlinkReadWriteKafka {
  def main(args: Array[String]): Unit = {
    // obtain the Flink streaming execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Kafka consumer properties
    val prop = new Properties()
    // Kafka broker address
    prop.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "single:9092")
    // consumer group id
    prop.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "md")
    // key/value deserializers
    prop.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer")
    prop.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer")
    // if no offset has been committed yet, start consuming from the earliest record
    prop.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest")
    val ds = env.addSource(
      new FlinkKafkaConsumer[String](
        "event_attendees",
        // schema that (de)serializes records as plain strings
        new SimpleStringSchema(),
        prop
      ).setStartFromEarliest() // reset the consumer to the earliest offset
    )
    // transformation: split each CSV record and flatten it into (event, user, status) tuples
    val dataStream = ds.map(x => {
      val info = x.split(",", -1)
      Array(
        (info(0), info(1).split(" "), "yes"),
        (info(0), info(2).split(" "), "maybe"),
        (info(0), info(3).split(" "), "invited"),
        (info(0), info(4).split(" "), "no")
      )
    }).flatMap(x => x).flatMap(x => x._2.map(y => (x._1, y, x._3))).filter(_._2 != "")
      .map(_.productIterator.mkString(","))
    // Kafka producer properties
    val prop2 = new Properties()
    prop2.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "single:9092")
    prop2.setProperty(ProducerConfig.RETRIES_CONFIG, "0")
    prop2.setProperty(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    prop2.setProperty(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    // write the flattened records to the output topic using the producer properties above
    dataStream.addSink(new FlinkKafkaProducer[String](
      "event_attendees_ff",
      new SimpleStringSchema(),
      prop2))
    // launch the streaming job
    env.execute("event_attendees_xf")
  }
}
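To make the flatMap chain concrete, here is a small, Flink-free sketch that applies the same splitting logic to a single hypothetical event_attendees record (the sample line is invented for illustration):

object TransformSketch {
  def main(args: Array[String]): Unit = {
    // hypothetical record: event id, "yes" users, "maybe" users, "invited" users, "no" users
    val line = "e1,u1 u2,u3,,u4"
    val info = line.split(",", -1)
    val flattened = Array(
      (info(0), info(1).split(" "), "yes"),
      (info(0), info(2).split(" "), "maybe"),
      (info(0), info(3).split(" "), "invited"),
      (info(0), info(4).split(" "), "no")
    ).flatMap(x => x._2.map(y => (x._1, y, x._3)))
      .filter(_._2 != "")
      .map(_.productIterator.mkString(","))
    // prints e1,u1,yes / e1,u2,yes / e1,u3,maybe / e1,u4,no; the empty "invited" list is dropped
    flattened.foreach(println)
  }
}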
// Generic pipeline abstractions: a source, a sink, and a transformation step.
trait Read[T] {
  def read(prop: Properties, tableName: String): DataStream[T]
}
trait Write[T] {
  def write(localhost: String, tableName: String, dataStream: DataStream[T]): Unit
}
trait Transform[T, V] {
  def transform(in: DataStream[T]): DataStream[V]
}
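These traits decouple the pipeline from any particular storage system, so sources other than Kafka can be plugged in. As a hypothetical illustration (not part of the original code, and assuming the same imports as above), a source backed by an in-memory collection could implement Read[String] like this:

// Hypothetical source that reads from a local collection instead of Kafka;
// the tableName parameter is ignored and exists only to satisfy the Read trait.
class CollectionSource(env: StreamExecutionEnvironment, data: Seq[String]) extends Read[String] {
  override def read(prop: Properties, tableName: String): DataStream[String] =
    env.fromCollection(data)
}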
// Kafka implementation of Read: builds a FlinkKafkaConsumer for the given topic.
class KafkaSource(env: StreamExecutionEnvironment) extends Read[String] {
  override def read(prop: Properties, tableName: String): DataStream[String] = {
    env.addSource(
      new FlinkKafkaConsumer[String](
        tableName,
        new SimpleStringSchema(),
        prop
      )
    )
  }
}
object KafkaSource {
  def apply(env: StreamExecutionEnvironment): KafkaSource = new KafkaSource(env)
}
// Kafka implementation of Write: the "localhost" parameter is the broker list handed to the producer.
class KafkaSink extends Write[String] {
  override def write(localhost: String, tableName: String, dataStream: DataStream[String]): Unit = {
    dataStream.addSink(new FlinkKafkaProducer[String](
      localhost,
      tableName,
      new SimpleStringSchema()
    ))
  }
}
object KafkaSink {
  def apply(): KafkaSink = new KafkaSink()
}
// Transform implementation with the same flattening logic as in FlinkReadWriteKafka:
// each CSV record becomes one (event, user, status) line per non-empty user entry.
trait FlikTransform extends Transform[String, String] {
  override def transform(in: DataStream[String]): DataStream[String] = {
    in.map(x => {
      val info = x.split(",", -1)
      Array(
        (info(0), info(1).split(" "), "yes"),
        (info(0), info(2).split(" "), "maybe"),
        (info(0), info(3).split(" "), "invited"),
        (info(0), info(4).split(" "), "no")
      )
    }).flatMap(x => x).flatMap(x => x._2.map(y => (x._1, y, x._3))).filter(_._2 != "")
      .map(_.productIterator.mkString(","))
  }
}
// Wires source, transform, and sink together. The self-type annotation below requires that a
// FlikTransform be mixed in when the class is instantiated (see EAtest).
class KTExcutor(readConf: Properties, writelocalhost: String) {
  tran: FlikTransform =>
  def worker(intopic: String, outputtopic: String) = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val kr = new KafkaSource(env).read(readConf, intopic)
    val ds = tran.transform(kr)
    KafkaSink().write(writelocalhost, outputtopic, ds)
    env.execute()
  }
}
object EAtest {
  def main(args: Array[String]): Unit = {
    // consumer configuration for the source topic
    val prop = new Properties()
    prop.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "single:9092")
    prop.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "md")
    prop.setProperty(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, "1000")
    prop.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer")
    prop.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer")
    prop.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest")
    // broker list for the sink
    val localhost = "single:9092"
    // mix the transform into the executor and run the pipeline
    (new KTExcutor(prop, localhost) with FlikTransform)
      .worker("event_attendees", "attendees_AA")
  }
}
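Once the job is running, the flattened records should appear in the attendees_AA topic; they can be inspected with the standard Kafka console consumer, for example kafka-console-consumer.sh --bootstrap-server single:9092 --topic attendees_AA --from-beginning (adjust paths and addresses to your installation).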