Besides the built-in receivers that Spark Streaming ships with (for Flume, Kafka, Kinesis, files, sockets and so on), you can also write a custom receiver to ingest data from any other kind of stream. To do this, developers implement the org.apache.spark.streaming.receiver.Receiver class. This article shows how to implement a custom receiver and use it in a Spark Streaming application. Custom receivers can be written in Scala or Java.
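A custom receiver implements two lifecycle methods: onStart(), which starts the threads or connections that receive data and push it into Spark with store(), and onStop(), which releases those resources. onStart() must not block, so the actual receiving loop runs on its own thread. The skeleton below is a minimal sketch; the class name and constructor parameters are only illustrative:

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

class MyReceiver(host: String, port: Int)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) {

  def onStart(): Unit = {
    // Start a background thread that connects to the source,
    // reads records and hands them to Spark via store(...).
    // This method must return immediately; do not block here.
  }

  def onStop(): Unit = {
    // Stop and clean up whatever onStart() started. The receiving
    // thread should also check isStopped() and exit on its own.
  }
}

Inside the receiving thread, store() pushes records into Spark, restart() asks Spark Streaming to restart the receiver after a delay, and stop() or reportError() can be used for unrecoverable errors. The socket-based receiver below fills in this skeleton: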
<span style="font-family: Arial, Helvetica, sans-serif;">class CustomReceiver(host: String, port: Int)</span>
extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging { def onStart() { // Start the thread that receives data over a connection new Thread("Socket Receiver") { override def run() { receive() } }.start() } def onStop() { // There is nothing much to do as the thread calling receive() // is designed to stop by itself if isStopped() returns false } /** Create a socket connection and receive data until receiver is stopped */ private def receive() { var socket: Socket = null var userInput: String = null try { // Connect to host:port socket = new Socket(host, port) // Until stopped or connection broken continue reading val reader = new BufferedReader(new InputStreamReader(socket.getInputStream(), "UTF-8")) userInput = reader.readLine() while(!isStopped && userInput != null) { store(userInput) userInput = reader.readLine() } reader.close() socket.close() // Restart in an attempt to connect again when server is active again restart("Trying to connect again") } catch { case e: java.net.ConnectException => // restart if could not connect to server restart("Error connecting to " + host + ":" + port, e) case t: Throwable => // restart if there is any other error restart("Error receiving data", t) } } }
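The custom receiver is wired into a streaming application with ssc.receiverStream(). A minimal sketch, assuming ssc is an already created StreamingContext and host/port point at the text source:

// Assuming ssc is an existing StreamingContext
val customReceiverStream = ssc.receiverStream(new CustomReceiver(host, port))
val words = customReceiverStream.flatMap(_.split(" "))
words.map(x => (x, 1)).reduceByKey(_ + _).print()

The complete, runnable example below puts the receiver class and such a driver program together: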
package org.apache.spark.examples.streaming

import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.net.Socket

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver

/**
 * Custom Receiver that receives data over a socket. Received bytes are interpreted as
 * text and \n delimited lines are considered as records. They are then counted and printed.
 *
 * To run this on your local machine, you need to first run a Netcat server
 *    `$ nc -lk 9999`
 * and then run the example
 *    `$ bin/run-example org.apache.spark.examples.streaming.CustomReceiver localhost 9999`
 */
object CustomReceiver {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: CustomReceiver <hostname> <port>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    // Create the context with a 1 second batch size
    val sparkConf = new SparkConf().setAppName("CustomReceiver")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create an input stream with the custom receiver on target ip:port and count the
    // words in the input stream of \n delimited text (eg. generated by 'nc')
    val lines = ssc.receiverStream(new CustomReceiver(args(0), args(1).toInt))
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}

class CustomReceiver(host: String, port: Int)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging {

  def onStart() {
    // Start the thread that receives data over a connection
    new Thread("Socket Receiver") {
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // There is nothing much to do as the thread calling receive()
    // is designed to stop by itself if isStopped() returns false
  }

  /** Create a socket connection and receive data until receiver is stopped */
  private def receive() {
    var socket: Socket = null
    var userInput: String = null
    try {
      logInfo("Connecting to " + host + ":" + port)
      socket = new Socket(host, port)
      logInfo("Connected to " + host + ":" + port)
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream(), "UTF-8"))
      userInput = reader.readLine()
      while(!isStopped && userInput != null) {
        store(userInput)
        userInput = reader.readLine()
      }
      reader.close()
      socket.close()
      logInfo("Stopped receiving")
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}
Spark Streaming can also receive data through an Akka actor. An actor that extends ActorReceiver (from the spark-streaming-akka module used in the example below) only needs to call store() on the messages it receives:

class CustomActor extends ActorReceiver {
  def receive = {
    case data: String => store(data)
  }
}

This custom actor can then be used to create an input stream, as shown below.
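A minimal sketch, assuming ssc is an already created StreamingContext, Props comes from akka.actor, and AkkaUtils is the helper from the spark-streaming-akka module used in the full example:

// Create an input DStream backed by the custom actor
// (the actor name "CustomReceiver" is only illustrative)
val lines = AkkaUtils.createStream[String](ssc, Props[CustomActor](), "CustomReceiver")

The complete ActorWordCount example below shows this pattern end to end: a FeederActor publishes random words over Akka remoting, and a SampleActorReceiver subscribes to it and stores whatever it receives: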
package org.apache.spark.examples.streaming

import scala.collection.mutable.LinkedHashSet
import scala.reflect.ClassTag
import scala.util.Random

import akka.actor._
import com.typesafe.config.ConfigFactory

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.akka.{ActorReceiver, AkkaUtils}

case class SubscribeReceiver(receiverActor: ActorRef)
case class UnsubscribeReceiver(receiverActor: ActorRef)

/**
 * Sends the random content to every receiver subscribed with 1/2
 * second delay.
 */
class FeederActor extends Actor {

  val rand = new Random()
  val receivers = new LinkedHashSet[ActorRef]()

  val strings: Array[String] = Array("words ", "may ", "count ")

  def makeMessage(): String = {
    val x = rand.nextInt(3)
    strings(x) + strings(2 - x)
  }

  /*
   * A thread to generate random messages
   */
  new Thread() {
    override def run() {
      while (true) {
        Thread.sleep(500)
        receivers.foreach(_ ! makeMessage)
      }
    }
  }.start()

  def receive: Receive = {
    case SubscribeReceiver(receiverActor: ActorRef) =>
      println("received subscribe from %s".format(receiverActor.toString))
      receivers += receiverActor

    case UnsubscribeReceiver(receiverActor: ActorRef) =>
      println("received unsubscribe from %s".format(receiverActor.toString))
      receivers -= receiverActor
  }
}

/**
 * A sample actor as receiver, is also simplest. This receiver actor
 * goes and subscribe to a typical publisher/feeder actor and receives
 * data.
 *
 * @see [[org.apache.spark.examples.streaming.FeederActor]]
 */
class SampleActorReceiver[T](urlOfPublisher: String) extends ActorReceiver {

  lazy private val remotePublisher = context.actorSelection(urlOfPublisher)

  override def preStart(): Unit = remotePublisher ! SubscribeReceiver(context.self)

  def receive: PartialFunction[Any, Unit] = {
    case msg => store(msg.asInstanceOf[T])
  }

  override def postStop(): Unit = remotePublisher ! UnsubscribeReceiver(context.self)
}

/**
 * A sample feeder actor
 *
 * Usage: FeederActor <hostname> <port>
 * <hostname> and <port> describe the AkkaSystem that Spark Sample feeder would start on.
 */
object FeederActor {
  def main(args: Array[String]) {
    if (args.length < 2){
      System.err.println("Usage: FeederActor <hostname> <port>\n")
      System.exit(1)
    }
    val Seq(host, port) = args.toSeq

    val akkaConf = ConfigFactory.parseString(
      s"""akka.actor.provider = "akka.remote.RemoteActorRefProvider"
         |akka.remote.enabled-transports = ["akka.remote.netty.tcp"]
         |akka.remote.netty.tcp.hostname = "$host"
         |akka.remote.netty.tcp.port = $port
         |""".stripMargin)
    val actorSystem = ActorSystem("test", akkaConf)
    val feeder = actorSystem.actorOf(Props[FeederActor], "FeederActor")

    println("Feeder started as:" + feeder)

    actorSystem.awaitTermination()
  }
}

/**
 * A sample word count program demonstrating the use of plugging in
 * Actor as Receiver
 *
 * Usage: ActorWordCount <hostname> <port>
 *   <hostname> and <port> describe the AkkaSystem that Spark Sample feeder is running on.
 *
 * To run this example locally, you may run Feeder Actor as
 *    `$ bin/run-example org.apache.spark.examples.streaming.FeederActor localhost 9999`
 * and then run the example
 *    `$ bin/run-example org.apache.spark.examples.streaming.ActorWordCount localhost 9999`
 */
object ActorWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println(
        "Usage: ActorWordCount <hostname> <port>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    val Seq(host, port) = args.toSeq
    val sparkConf = new SparkConf().setAppName("ActorWordCount")
    // Create the context and set the batch size
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    /*
     * Following is the use of AkkaUtils.createStream to plug in custom actor as receiver
     *
     * An important point to note:
     * Since Actor may exist outside the spark framework, It is thus user's responsibility
     * to ensure the type safety, i.e type of data received and InputDStream
     * should be same.
     *
     * For example: Both AkkaUtils.createStream and SampleActorReceiver are parameterized
     * to same type to ensure type safety.
     */
    val lines = AkkaUtils.createStream[String](
      ssc,
      Props(classOf[SampleActorReceiver[String]],
        "akka.tcp://test@%s:%s/user/FeederActor".format(host, port.toInt)),
      "SampleReceiver")

    // compute wordcount
    lines.flatMap(_.split("\\s+")).map(x => (x, 1)).reduceByKey(_ + _).print()

    ssc.start()
    ssc.awaitTermination()
  }
}