/**
 * Create an input stream from TCP source hostname:port. Data is received using
 * a TCP socket and the received bytes are interpreted as UTF8 encoded `\n` delimited
 * lines.
 * @param hostname Hostname to connect to for receiving data
 * @param port Port to connect to for receiving data
 * @param storageLevel Storage level to use for storing the received objects
 *                     (default: StorageLevel.MEMORY_AND_DISK_SER_2)
 */
hostname: String,
port: Int,
// Default storage level: memory and disk, serialized, with 2 replicas
storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2
): ReceiverInputDStream[String] = withNamedScope("socket text stream") {
// Pass the storageLevel variable on to SocketInputDStream
socketStream[String](hostname, port, SocketReceiver.bytesToLines, storageLevel)
/**
 * Input stream whose receiver is a `SocketReceiver` built from this stream's
 * constructor arguments.
 *
 * @param ssc_ streaming context handed to the parent `ReceiverInputDStream`
 * @param host host passed to the receiver
 * @param port port passed to the receiver
 * @param bytesToObjects function turning the raw `InputStream` into an iterator of records
 * @param storageLevel storage level passed to the receiver
 * @tparam T type of the records produced by `bytesToObjects`
 */
class SocketInputDStream[T: ClassTag](
    ssc_ : StreamingContext,
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[T](ssc_) {

  /** Creates the receiver for this stream, forwarding host, port, parser and storage level. */
  def getReceiver(): Receiver[T] = {
    new SocketReceiver(host, port, bytesToObjects, storageLevel)
  }
}
Receiver :
// Every Receiver carries a StorageLevel, supplied as a constructor argument
// and exposed as a public val.
abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable {
/**
 * Handler used to store received blocks, selected once when this value is initialized.
 *
 * If the receiver write-ahead log is enabled in the configuration, a checkpoint
 * directory is mandatory and a `WriteAheadLogBasedBlockHandler` is created; otherwise
 * a `BlockManagerBasedBlockHandler` is created. In both cases the receiver's storage
 * level is handed to the handler.
 *
 * Throws `SparkException` when the write-ahead log is enabled but no checkpoint
 * directory has been set.
 */
private val receivedBlockHandler: ReceivedBlockHandler = {
  if (WriteAheadLogUtils.enableReceiverLog(env.conf)) {
    if (checkpointDirOption.isEmpty) {
      throw new SparkException(
        "Cannot enable receiver write-ahead log without checkpoint directory set. " +
          "Please use streamingContext.checkpoint() to set the checkpoint directory. " +
          "See documentation for more details.")
    }
    // `.get` cannot fail here: the empty case was rejected just above.
    new WriteAheadLogBasedBlockHandler(env.blockManager, receiver.streamId,
      receiver.storageLevel, env.conf, hadoopConf, checkpointDirOption.get)
  } else {
    // The receiver's storageLevel is passed in when the BlockManagerBasedBlockHandler is created.
    new BlockManagerBasedBlockHandler(env.blockManager, receiver.storageLevel)
  }
}
/**
 * Implementation of a [[org.apache.spark.streaming.receiver.ReceivedBlockHandler]] which
 * stores the received blocks into a block manager with the specified storage level.
 */
// Block handler that puts received blocks into the given BlockManager, always using
// the storageLevel it was constructed with.
// NOTE(review): this excerpt is cut off inside storeBlock; any further match cases, the
// rest of the method and the closing braces are not visible here.
private[streaming] class BlockManagerBasedBlockHandler(
blockManager: BlockManager, storageLevel: StorageLevel)
extends ReceivedBlockHandler with Logging {
// Stores one received block under blockId, dispatching on the concrete block type.
def storeBlock(blockId: StreamBlockId, block: ReceivedBlock): ReceivedBlockStoreResult = {
// Record count; filled in by whichever case handles the block.
var numRecords = None: Option[Long]
val putResult: Seq[(BlockId, BlockStatus)] = block match {
case ArrayBufferBlock(arrayBuffer) =>
// The buffer size is known up front, so the count is recorded before the put.
numRecords = Some(arrayBuffer.size.toLong)
blockManager.putIterator(blockId, arrayBuffer.iterator, storageLevel,
tellMaster = true)
case IteratorBlock(iterator) =>
// Wrap the iterator in a CountingIterator before handing it to the block manager.
val countIterator = new CountingIterator(iterator)
// Pass in the configured storageLevel (MEMORY_AND_DISK_SER_2 when the socket
// text stream default is used).
val putResult = blockManager.putIterator(blockId, countIterator, storageLevel,
tellMaster = true)
// The count is read only after the put; presumably it is available once the
// iterator has been consumed — confirm against CountingIterator.
numRecords = countIterator.count
/**
 * Block handler used when the receiver write-ahead log is enabled.
 *
 * Only the storage-level normalization and the creation of the write-ahead log are
 * part of this excerpt; the remaining members of the class are not shown.
 *
 * @param blockManager block manager supplied by the caller
 * @param streamId id of the stream, used to derive the log directory
 * @param storageLevel storage level requested by the user; see `effectiveStorageLevel`
 * @param conf Spark configuration
 * @param hadoopConf Hadoop configuration passed to the write-ahead log
 * @param checkpointDir checkpoint directory under which the log directory is derived
 * @param clock clock instance (defaults to a new `SystemClock`)
 */
private[streaming] class WriteAheadLogBasedBlockHandler(
    blockManager: BlockManager,
    streamId: Int,
    storageLevel: StorageLevel,
    conf: SparkConf,
    hadoopConf: Configuration,
    checkpointDir: String,
    clock: Clock = new SystemClock
  ) extends ReceivedBlockHandler with Logging {

  // Timeout read from "spark.streaming.receiver.blockStoreTimeout" (default 30), in seconds.
  private val blockStoreTimeout = conf.getInt(
    "spark.streaming.receiver.blockStoreTimeout", 30).seconds

  // Storage level actually used: the requested disk / memory / off-heap flags are kept,
  // but the level is always serialized and has a single replica. A warning is logged
  // for each requested property that is overridden.
  private val effectiveStorageLevel = {
    if (storageLevel.deserialized) {
      logWarning(s"Storage level serialization ${storageLevel.deserialized} is not supported when" +
        s" write ahead log is enabled, change to serialization false")
    }
    if (storageLevel.replication > 1) {
      logWarning(s"Storage level replication ${storageLevel.replication} is unnecessary when " +
        s"write ahead log is enabled, change to replication 1")
    }

    // Last two arguments: deserialized = false, replication = 1.
    StorageLevel(storageLevel.useDisk, storageLevel.useMemory, storageLevel.useOffHeap, false, 1)
  }

  // Tell the user once, at construction, when the requested level was adjusted.
  if (storageLevel != effectiveStorageLevel) {
    logWarning(s"User defined storage level $storageLevel is changed to effective storage level " +
      s"$effectiveStorageLevel when write ahead log is enabled")
  }

  // Write-ahead log for this receiver, created in the log directory derived from the
  // checkpoint directory and the stream id.
  private val writeAheadLog = WriteAheadLogUtils.createLogForReceiver(
    conf, checkpointDirToLogDir(checkpointDir, streamId), hadoopConf)

  // NOTE(review): the remaining members of this class are not part of the excerpt.
}
WriteAheadLog :
public abstract class WriteAheadLog {
* Write the record to the log and return a record handle, which contains all the information
* necessary to read back the written record. The time is used to the index the record,
* such that it can be cleaned later. Note that implementations of this abstract class must
* ensure that the written data is durable and readable (using the record handle) by the
* time this function returns.
abstract public WriteAheadLogRecordHandle write(ByteBuffer record, long time);
* Read a written record based on the given record handle.
abstract public ByteBuffer read(WriteAheadLogRecordHandle handle);
* Read and return an iterator of all the records that have been written but not yet cleaned up.
abstract public Iterator readAll();
* Clean all the records that are older than the threshold time. It can wait for
* the completion of the deletion.
abstract public void clean(long threshTime, boolean waitForCompletion);
* Close this log and release any resources.
abstract public void close();
// Write-ahead log declared with a log directory, a rolling interval in seconds,
// a failure limit and a close-file-after-write flag.
// NOTE(review): `classFileBasedWriteAheadLog` and `extendsWriteAheadLogwith` look like
// `class FileBasedWriteAheadLog` and `extends WriteAheadLog with` whose spaces were lost
// in extraction — confirm against the upstream source; as written this does not parse.
private[streaming] classFileBasedWriteAheadLog(
conf: SparkConf,
logDirectory: String,
hadoopConf: Configuration,
rollingIntervalSecs: Int,
maxFailures: Int,
closeFileAfterWrite: Boolean
) extendsWriteAheadLogwith Logging {
/**
 * Write a byte buffer to the log file. This method synchronously writes the data in the
 * ByteBuffer to HDFS. When this method returns, the data is guaranteed to have been flushed
 * to HDFS, and will be available for readers to read.
 */
// Writes byteBuffer through the log writer obtained for `time`, retrying until a write
// succeeds or `maxFailures` failures have been counted. The whole body runs inside
// `synchronized`.
// NOTE(review): `defwrite` looks like `def write` with the space lost in extraction.
// NOTE(review): the braces in this excerpt do not balance — the body of the
// `if (closeFileAfterWrite)` and the closers of the try/catch/while/if appear to have
// been dropped. Restore them from the upstream source rather than guessing.
defwrite(byteBuffer: ByteBuffer, time: Long): FileBasedWriteAheadLogSegment = synchronized {
// Result of the last successful write; stays null if every attempt failed.
var fileSegment: FileBasedWriteAheadLogSegment = null
var failures = 0
// Most recent exception, rethrown if no attempt produced a segment.
var lastException: Exception = null
var succeeded = false
while (!succeeded && failures < maxFailures) {
try {
fileSegment = getLogWriter(time).write(byteBuffer)
if (closeFileAfterWrite) {
succeeded = true
} catch {
case ex: Exception =>
// Remember the failure, log it and count it towards maxFailures.
lastException = ex
logWarning("Failed to write to write ahead log")
failures += 1
// No segment was produced: report the failure count and rethrow the last exception.
if (fileSegment == null) {
logError(s"Failed to write to write ahead log after $failures failures")
throw lastException
// Reads back the record identified by the given handle.
// NOTE(review): `defread` looks like `def read` with the space lost in extraction.
// NOTE(review): the excerpt stops right after `finally {`; the cleanup code and the
// returned value are not visible here.
defread(segment: WriteAheadLogRecordHandle): ByteBuffer = {
// The generic handle is cast to the file-based segment type (fails for any other handle type).
val fileSegment = segment.asInstanceOf[FileBasedWriteAheadLogSegment]
var reader: FileBasedWriteAheadLogRandomReader = null
var byteBuffer: ByteBuffer = null
try {
// Open a random reader on the segment's file path and read this segment from it.
reader = new FileBasedWriteAheadLogRandomReader(fileSegment.path, hadoopConf)
byteBuffer = reader.read(fileSegment)
} finally {
// Reads the record described by `segment` from `instream`; the body runs inside `synchronized`.
// NOTE(review): the excerpt ends after the buffer allocation; filling the buffer and the
// returned value are not visible here.
def read(segment: FileBasedWriteAheadLogSegment): ByteBuffer = synchronized {
// Translated note: skip to the offset, then read.
// NOTE(review): no seek is visible in this excerpt — presumably elided; confirm.
val nextLength = instream.readInt()
// The length read from the stream must match the length recorded in the segment.
HdfsUtils.checkState(nextLength == segment.length,
s"Expected message length to be ${segment.length}, but was $nextLength")
// Buffer sized to hold exactly one record.
val buffer = new Array[Byte](nextLength)
// NOTE(review): the line carrying the `class` keyword and the class name is not part of
// this excerpt; what follows is the type-parameter list, the constructor parameters and
// the first members of an InputDStream[R] subclass.
// K / V are the key and value types, U / T are decoders for K and V respectively, and
// R is the element type produced by messageHandler.
K: ClassTag,
V: ClassTag,
U <: Decoder[K]: ClassTag,
T <: Decoder[V]: ClassTag,
R: ClassTag](
ssc_ : StreamingContext,
val kafkaParams: Map[String, String],
// Offsets keyed by topic and partition; presumably the starting offsets — confirm.
val fromOffsets: Map[TopicAndPartition, Long],
// Converts each message-with-metadata into an element of type R.
messageHandler: MessageAndMetadata[K, V] => R
) extends InputDStream[R](ssc_) with Logging {
// Retry count read from "spark.streaming.kafka.maxRetries", defaulting to 1.
val maxRetries = context.sparkContext.getConf.getInt(
"spark.streaming.kafka.maxRetries", 1)
// Keep this consistent with how other streams are named (e.g. "Flume polling stream [2]")
private[streaming] override def name: String = s"Kafka direct stream [$id]"
// Checkpoint data object specific to this stream type.
protected[streaming] override val checkpointData =
new DirectKafkaInputDStreamCheckpointData
// Builds the KafkaRDD for the batch at validTime, reports the batch's input info to the
// InputInfoTracker, and then advances currentOffsets to the offsets just used as the
// upper bound.
// NOTE(review): `defcompute` looks like `def compute` with the space lost in extraction.
// NOTE(review): the closing braces of the `map`/`filter` closures and the end of the
// method (including the returned Option) are not visible in this excerpt.
override defcompute(validTime: Time): Option[KafkaRDD[K, V, U, T, R]] = {
// Upper offsets for this batch: the latest leader offsets, passed through clamp.
val untilOffsets = clamp(latestLeaderOffsets(maxRetries))
// RDD covering the range from currentOffsets up to untilOffsets.
val rdd =KafkaRDD[K, V, U, T, R](
context.sparkContext, kafkaParams, currentOffsets, untilOffsets, messageHandler)
// Report the record number and metadata of this batch interval to InputInfoTracker.
val offsetRanges = currentOffsets.map { case (tp, fo) =>
// tp = topic and partition, fo = from-offset, uo = matching entry of untilOffsets.
val uo = untilOffsets(tp)
OffsetRange(tp.topic, tp.partition, fo, uo.offset)
// One description line per non-empty offset range.
val description = offsetRanges.filter { offsetRange =>
// Don't display empty ranges.
offsetRange.fromOffset != offsetRange.untilOffset
}.map { offsetRange =>
s"topic: ${offsetRange.topic}\tpartition: ${offsetRange.partition}\t" +
s"offsets: ${offsetRange.fromOffset} to ${offsetRange.untilOffset}"
// Copy offsetRanges to immutable.List to prevent from being modified by the user
val metadata = Map(
"offsets" -> offsetRanges.toList,
StreamInputInfo.METADATA_KEY_DESCRIPTION -> description)
// The record count reported for this batch is taken from rdd.count.
val inputInfo = StreamInputInfo(id, rdd.count, metadata)
ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)
// Advance the stream position: the next batch starts where this one ended.
currentOffsets = untilOffsets.map(kv => kv._1 -> kv._2.offset)
// NOTE(review): the line carrying the `class` keyword and the class name is not part of
// this excerpt; what follows is the type-parameter list, a private[spark] constructor
// and the start of getPartitions for an RDD[R] that also mixes in HasOffsetRanges.
K: ClassTag,
V: ClassTag,
U <: Decoder[_]: ClassTag,
T <: Decoder[_]: ClassTag,
R: ClassTag] private[spark] (
sc: SparkContext,
kafkaParams: Map[String, String],
// One offset range per partition of this RDD (see getPartitions).
val offsetRanges: Array[OffsetRange],
// (host, port) pair for each topic and partition.
leaders: Map[TopicAndPartition, (String, Int)],
// Converts each message-with-metadata into an element of type R.
messageHandler: MessageAndMetadata[K, V] => R
) extends RDD[R](sc, Nil) with Logging with HasOffsetRanges {
// Creates one KafkaRDDPartition per offset range, indexed by its position in the array
// and carrying the leader host and port looked up for that topic and partition.
// The lookup `leaders(...)` throws if the topic/partition has no entry in the map.
// NOTE(review): the excerpt ends inside the closure; its closing brace and the rest of
// the method are not visible here.
override def getPartitions: Array[Partition] = {
offsetRanges.zipWithIndex.map { case (o, i) =>
val (host, port) = leaders(TopicAndPartition(o.topic, o.partition))
new KafkaRDDPartition(i, o.topic, o.partition, o.fromOffset, o.untilOffset, host, port)