The most natural form of data fault tolerance is to keep replicas of the data; the other approach relies on a data source that supports replay.
Replication is built on the BlockManager and is controlled by the StorageLevel passed in when the input stream is created:
/**
* Create an input stream from TCP source hostname:port. Data is received using
* a TCP socket and the received bytes are interpreted as UTF8 encoded `\n` delimited
* lines.
* @param hostname Hostname to connect to for receiving data
* @param port Port to connect to for receiving data
* @param storageLevel Storage level to use for storing the received objects
* (default: StorageLevel.MEMORY_AND_DISK_SER_2)
*/
def socketTextStream(
hostname: String,
port: Int,
// Default storage level: serialized, kept in memory with spill to disk, 2 replicas
storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2
): ReceiverInputDStream[String] = withNamedScope("socket text stream") {
// Pass the storageLevel on to SocketInputDStream
socketStream[String](hostname, port, SocketReceiver.bytesToLines, storageLevel)
}
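For reference, a minimal sketch of calling this from user code (app name, host, and port are hypothetical; the default storage level is spelled out explicitly):
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf().setAppName("SocketWordCount") // hypothetical app name
val ssc = new StreamingContext(conf, Seconds(10))
// Same as the default: serialized, in memory with spill to disk, 2 replicas.
val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER_2)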
SocketInputDStream:
class SocketInputDStream[T: ClassTag](
ssc_ : StreamingContext,
host: String,
port: Int,
bytesToObjects: InputStream => Iterator[T],
storageLevel: StorageLevel
) extends ReceiverInputDStream[T](ssc_) {
def getReceiver(): Receiver[T] = {
new SocketReceiver(host, port, bytesToObjects, storageLevel)
}
}
Receiver:
// Every Receiver carries a storageLevel
abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable {
ReceiverSupervisorImpl:
private val receivedBlockHandler: ReceivedBlockHandler = {
if (WriteAheadLogUtils.enableReceiverLog(env.conf)) {
if (checkpointDirOption.isEmpty) {
throw new SparkException(
"Cannot enable receiver write-ahead log without checkpoint directory set. " +
"Please use streamingContext.checkpoint() to set the checkpoint directory. " +
"See documentation for more details.")
}
new WriteAheadLogBasedBlockHandler(env.blockManager, receiver.streamId,
receiver.storageLevel, env.conf, hadoopConf, checkpointDirOption.get)
} else {
// The storageLevel is passed along when constructing BlockManagerBasedBlockHandler.
new BlockManagerBasedBlockHandler(env.blockManager, receiver.storageLevel)
}
}
ReceivedBlockHandler:
/**
* Implementation of a [[org.apache.spark.streaming.receiver.ReceivedBlockHandler]] which
* stores the received blocks into a block manager with the specified storage level.
*/
private[streaming] class BlockManagerBasedBlockHandler(
blockManager: BlockManager, storageLevel: StorageLevel)
extends ReceivedBlockHandler with Logging {
def storeBlock(blockId: StreamBlockId, block: ReceivedBlock): ReceivedBlockStoreResult = {
var numRecords = None: Option[Long]
val putResult: Seq[(BlockId, BlockStatus)] = block match {
case ArrayBufferBlock(arrayBuffer) =>
numRecords = Some(arrayBuffer.size.toLong)
blockManager.putIterator(blockId, arrayBuffer.iterator, storageLevel,
tellMaster = true)
case IteratorBlock(iterator) =>
val countIterator = new CountingIterator(iterator)
// Store with the specified storageLevel, i.e. MEMORY_AND_DISK_SER_2 by default
val putResult = blockManager.putIterator(blockId, countIterator, storageLevel,
tellMaster = true)
numRecords = countIterator.count
putResult
...
When the WAL is enabled, a checkpoint directory must be set, because the write-ahead log files are written under the checkpoint directory.
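A minimal sketch of turning the receiver WAL on, using the standard spark.streaming.receiver.writeAheadLog.enable key (the app name and checkpoint path are hypothetical; the path must point at fault-tolerant storage such as HDFS):
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf()
  .setAppName("WALReceiverApp") // hypothetical app name
  .set("spark.streaming.receiver.writeAheadLog.enable", "true")
val ssc = new StreamingContext(conf, Seconds(10))
ssc.checkpoint("hdfs:///tmp/streaming-checkpoint") // hypothetical path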
WriteAheadLogBasedBlockHandler:
private[streaming] class WriteAheadLogBasedBlockHandler(
blockManager: BlockManager,
streamId: Int,
storageLevel: StorageLevel,
conf: SparkConf,
hadoopConf: Configuration,
checkpointDir: String,
clock: Clock = new SystemClock
) extends ReceivedBlockHandler with Logging {
private val blockStoreTimeout = conf.getInt(
"spark.streaming.receiver.blockStoreTimeout", 30).seconds
private val effectiveStorageLevel = {
if (storageLevel.deserialized) {
logWarning(s"Storage level serialization ${storageLevel.deserialized} is not supported when" +
s" write ahead log is enabled, change to serialization false")
}
if (storageLevel.replication > 1) {
logWarning(s"Storage level replication ${storageLevel.replication} is unnecessary when " +
s"write ahead log is enabled, change to replication 1")
}
StorageLevel(storageLevel.useDisk, storageLevel.useMemory, storageLevel.useOffHeap, false, 1)
}
if (storageLevel != effectiveStorageLevel) {
logWarning(s"User defined storage level $storageLevel is changed to effective storage level " +
s"$effectiveStorageLevel when write ahead log is enabled")
}
// The write-ahead log that manages rolling log files under the checkpoint directory
private val writeAheadLog = WriteAheadLogUtils.createLogForReceiver(
conf, checkpointDirToLogDir(checkpointDir, streamId), hadoopConf)
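Worked through with the default level, the downgrade above yields the following (a sketch; StorageLevel's factory method as in org.apache.spark.storage):
import org.apache.spark.storage.StorageLevel

val requested = StorageLevel.MEMORY_AND_DISK_SER_2 // deserialized = false, replication = 2
// The WAL already guarantees durability, so replication is forced back to 1:
val effective = StorageLevel(requested.useDisk, requested.useMemory,
  requested.useOffHeap, deserialized = false, replication = 1)
// effective == StorageLevel.MEMORY_AND_DISK_SER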
A custom WAL extends the WriteAheadLog abstract class and implements the interfaces for writing, reading, cleaning up, closing, and iterating over all records; a minimal sketch follows the listing below.
WriteAheadLog:
public abstract class WriteAheadLog {
/**
* Write the record to the log and return a record handle, which contains all the information
* necessary to read back the written record. The time is used to index the record,
* such that it can be cleaned later. Note that implementations of this abstract class must
* ensure that the written data is durable and readable (using the record handle) by the
* time this function returns.
*/
abstract public WriteAheadLogRecordHandle write(ByteBuffer record, long time);
/**
* Read a written record based on the given record handle.
*/
abstract public ByteBuffer read(WriteAheadLogRecordHandle handle);
/**
* Read and return an iterator of all the records that have been written but not yet cleaned up.
*/
abstract public Iterator<ByteBuffer> readAll();
/**
* Clean all the records that are older than the threshold time. It can wait for
* the completion of the deletion.
*/
abstract public void clean(long threshTime, boolean waitForCompletion);
/**
* Close this log and release any resources.
*/
abstract public void close();
}
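As a minimal, hedged sketch of such a subclass (class and handle names are made up for illustration, and it is in-memory only, so it offers no real durability):
import java.nio.ByteBuffer
import java.util.{Iterator => JIterator}
import scala.collection.JavaConverters._
import scala.collection.concurrent.TrieMap
import org.apache.spark.streaming.util.{WriteAheadLog, WriteAheadLogRecordHandle}

// Hypothetical handle: identifies a record by the time used to index it.
class InMemoryRecordHandle(val time: Long) extends WriteAheadLogRecordHandle

class InMemoryWriteAheadLog extends WriteAheadLog {
  private val records = new TrieMap[Long, Array[Byte]]()

  override def write(record: ByteBuffer, time: Long): WriteAheadLogRecordHandle = {
    val bytes = new Array[Byte](record.remaining())
    record.get(bytes) // copy out, since the caller may reuse the buffer
    records.put(time, bytes)
    new InMemoryRecordHandle(time)
  }

  override def read(handle: WriteAheadLogRecordHandle): ByteBuffer =
    ByteBuffer.wrap(records(handle.asInstanceOf[InMemoryRecordHandle].time))

  override def readAll(): JIterator[ByteBuffer] =
    records.toSeq.sortBy(_._1).map { case (_, b) => ByteBuffer.wrap(b) }.iterator.asJava

  override def clean(threshTime: Long, waitForCompletion: Boolean): Unit =
    records.keys.toSeq.filter(_ < threshTime).foreach(records.remove)

  override def close(): Unit = records.clear()
}
Such a class can be plugged in through the spark.streaming.receiver.writeAheadLog.class configuration key; see WriteAheadLogUtils for the constructor conventions it expects.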
FileBasedWriteAheadLog extends WriteAheadLog and implements the actual WAL operations.
FileBasedWriteAheadLog:
private[streaming] class FileBasedWriteAheadLog(
conf: SparkConf,
logDirectory: String,
hadoopConf: Configuration,
rollingIntervalSecs: Int,
maxFailures: Int,
closeFileAfterWrite: Boolean
) extends WriteAheadLog with Logging {
...
A failed write is retried: maxFailures, passed in at construction time (default 1), bounds the number of attempts.
FileBasedWriteAheadLog:
/**
* Write a byte buffer to the log file. This method synchronously writes the data in the
* ByteBuffer to HDFS. When this method returns, the data is guaranteed to have been flushed
* to HDFS, and will be available for readers to read.
*/
def write(byteBuffer: ByteBuffer, time: Long): FileBasedWriteAheadLogSegment = synchronized {
var fileSegment: FileBasedWriteAheadLogSegment = null
var failures = 0
var lastException: Exception = null
var succeeded = false
while (!succeeded && failures < maxFailures) {
try {
fileSegment = getLogWriter(time).write(byteBuffer)
if (closeFileAfterWrite) {
resetWriter()
}
succeeded = true
} catch {
case ex: Exception =>
lastException = ex
logWarning("Failed to write to write ahead log")
resetWriter()
failures += 1
}
}
if (fileSegment == null) {
logError(s"Failed to write to write ahead log after $failures failures")
throw lastException
}
fileSegment
}
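These constructor arguments are wired from configuration when the log is created through WriteAheadLogUtils; a hedged sketch of tuning them (keys as used in Spark 1.x, values purely for illustration):
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.streaming.receiver.writeAheadLog.enable", "true")
  // Allow a few more attempts before write() gives up (maxFailures above):
  .set("spark.streaming.receiver.writeAheadLog.maxFailures", "3")
  // Roll to a new log file every 60 seconds (rollingIntervalSecs above):
  .set("spark.streaming.receiver.writeAheadLog.rollingIntervalSecs", "60")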
A read goes through the read method of FileBasedWriteAheadLogRandomReader, which performs a random-access read of a single segment.
FileBasedWriteAheadLog:
def read(segment: WriteAheadLogRecordHandle): ByteBuffer = {
val fileSegment = segment.asInstanceOf[FileBasedWriteAheadLogSegment]
var reader: FileBasedWriteAheadLogRandomReader = null
var byteBuffer: ByteBuffer = null
try {
reader = new FileBasedWriteAheadLogRandomReader(fileSegment.path, hadoopConf)
byteBuffer = reader.read(fileSegment)
} finally {
reader.close()
}
byteBuffer
}
FileBasedWriteAheadLogRandomReader:
def read(segment: FileBasedWriteAheadLogSegment): ByteBuffer = synchronized {
assertOpen()
// Seek to the segment's offset, skipping everything before it
instream.seek(segment.offset)
val nextLength = instream.readInt()
HdfsUtils.checkState(nextLength == segment.length,
s"Expected message length to be ${segment.length}, but was $nextLength")
val buffer = new Array[Byte](nextLength)
instream.readFully(buffer)
ByteBuffer.wrap(buffer)
}
Spark Streaming can consume Kafka in two ways: the Receiver approach and the Direct approach. The Receiver approach tracks Kafka offset metadata through ZooKeeper; after a failure, Kafka is re-read from that offset, so records may be consumed more than once. The Direct approach talks to Kafka directly and manages the offsets itself, which makes exactly-once processing possible.
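A sketch of the Direct approach from the user side (broker address and topic name are hypothetical), showing how each batch's offset ranges are exposed so they can be stored for exactly-once bookkeeping:
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils}

val kafkaParams = Map("metadata.broker.list" -> "broker1:9092") // hypothetical broker
val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
  ssc, kafkaParams, Set("events")) // hypothetical topic
stream.foreachRDD { rdd =>
  // Persist these offsets atomically with the output to get exactly-once semantics.
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  offsetRanges.foreach(o => println(s"${o.topic} ${o.partition}: ${o.fromOffset} -> ${o.untilOffset}"))
}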
DirectKafkaInputDStream:
class DirectKafkaInputDStream[
K: ClassTag,
V: ClassTag,
U <: Decoder[K]: ClassTag,
T <: Decoder[V]: ClassTag,
R: ClassTag](
ssc_ : StreamingContext,
val kafkaParams: Map[String, String],
val fromOffsets: Map[TopicAndPartition, Long],
messageHandler: MessageAndMetadata[K, V] => R
) extends InputDStream[R](ssc_) with Logging {
val maxRetries = context.sparkContext.getConf.getInt(
"spark.streaming.kafka.maxRetries", 1)
// Keep this consistent with how other streams are named (e.g. "Flume polling stream [2]")
private[streaming] override def name: String = s"Kafka direct stream [$id]"
protected[streaming] override val checkpointData =
new DirectKafkaInputDStreamCheckpointData
...
override def compute(validTime: Time): Option[KafkaRDD[K, V, U, T, R]] = {
val untilOffsets = clamp(latestLeaderOffsets(maxRetries))
val rdd = KafkaRDD[K, V, U, T, R](
context.sparkContext, kafkaParams, currentOffsets, untilOffsets, messageHandler)
// Report the record number and metadata of this batch interval to InputInfoTracker.
val offsetRanges = currentOffsets.map { case (tp, fo) =>
val uo = untilOffsets(tp)
OffsetRange(tp.topic, tp.partition, fo, uo.offset)
}
val description = offsetRanges.filter { offsetRange =>
// Don't display empty ranges.
offsetRange.fromOffset != offsetRange.untilOffset
}.map { offsetRange =>
s"topic: ${offsetRange.topic}\tpartition: ${offsetRange.partition}\t" +
s"offsets: ${offsetRange.fromOffset} to ${offsetRange.untilOffset}"
}.mkString("\n")
// Copy offsetRanges to immutable.List to prevent from being modified by the user
val metadata = Map(
"offsets" -> offsetRanges.toList,
StreamInputInfo.METADATA_KEY_DESCRIPTION -> description)
val inputInfo = StreamInputInfo(id, rdd.count, metadata)
ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)
currentOffsets = untilOffsets.map(kv => kv._1 -> kv._2.offset)
Some(rdd)
}
KafkaRDD fetches its data from Kafka purely on the basis of offset ranges, one partition per range.
KafkaRDD:
class KafkaRDD[
K: ClassTag,
V: ClassTag,
U <: Decoder[_]: ClassTag,
T <: Decoder[_]: ClassTag,
R: ClassTag] private[spark] (
sc: SparkContext,
kafkaParams: Map[String, String],
val offsetRanges: Array[OffsetRange],
leaders: Map[TopicAndPartition, (String, Int)],
messageHandler: MessageAndMetadata[K, V] => R
) extends RDD[R](sc, Nil) with Logging with HasOffsetRanges {
override def getPartitions: Array[Partition] = {
offsetRanges.zipWithIndex.map { case (o, i) =>
val (host, port) = leaders(TopicAndPartition(o.topic, o.partition))
new KafkaRDDPartition(i, o.topic, o.partition, o.fromOffset, o.untilOffset, host, port)
}.toArray
}
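Because a partition is fully determined by its offset range, a lost batch can be replayed deterministically. A sketch of re-reading a recorded range with KafkaUtils.createRDD (the stored offsets are hypothetical; kafkaParams as in the earlier sketch):
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}

// Replay exactly the records of a failed batch from offsets recorded earlier.
val savedRanges = Array(OffsetRange("events", 0, 1000L, 2000L)) // hypothetical range
val replayed = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](
  sc, kafkaParams, savedRanges)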
The downside of fault tolerance is the time it costs: replication and logging both add overhead and reduce performance.
Summary:
On the Executor side, fault tolerance defaults to the MEMORY_AND_DISK_SER_2 storage level: if one Executor dies, the data can be fetched from its replica on another Executor and the computation continues.
Alternatively, the WAL approach writes received data to a log first, so that if an Executor dies the data can be recovered from the checkpoint directory. Data replay is built on Kafka: the offsets to read are set explicitly, so when a computation fails, the data is re-fetched from the last recorded offsets and recomputed.
Note:
Source: DT_大数据梦工厂 (Spark release customization series)