Spark is an in-memory distributed computing framework, so tracking and handling exceptions promptly during development is essential for keeping a big data application stable. Exceptions inevitably show up because of resource configuration and code quality problems, so for a series of common exceptions I want to lay out analysis and troubleshooting approaches from the angle of both the source code and hands-on experience.
Source code analysis
Spark
YarnAllocator
private[yarn] def processCompletedContainers(completedContainers: Seq[ContainerStatus]): Unit = {
  for (completedContainer <- completedContainers) {
    val containerId = completedContainer.getContainerId
    val alreadyReleased = releasedContainers.remove(containerId)
    val hostOpt = allocatedContainerToHostMap.get(containerId)
    val onHostStr = hostOpt.map(host => s" on host: $host").getOrElse("")
    val exitReason = if (!alreadyReleased) {
      // Decrement the number of executors running. The next iteration of
      // the ApplicationMaster's reporting thread will take care of allocating.
      numExecutorsRunning.decrementAndGet()
      logInfo("Completed container %s%s (state: %s, exit status: %s)".format(
        containerId,
        onHostStr,
        completedContainer.getState,
        completedContainer.getExitStatus))
      // Hadoop 2.2.X added a ContainerExitStatus we should switch to use
      // there are some exit status' we shouldn't necessarily count against us, but for
      // now I think its ok as none of the containers are expected to exit.
      val exitStatus = completedContainer.getExitStatus
      val (exitCausedByApp, containerExitReason) = exitStatus match {
        case ContainerExitStatus.SUCCESS =>
          (false, s"Executor for container $containerId exited because of a YARN event (e.g., " +
            "pre-emption) and not because of an error in the running job.")
        case ContainerExitStatus.PREEMPTED =>
          // Preemption is not the fault of the running tasks, since YARN preempts containers
          // merely to do resource sharing, and tasks that fail due to preempted executors could
          // just as easily finish on any other executor. See SPARK-8167.
          (false, s"Container ${containerId}${onHostStr} was preempted.")
        // Should probably still count memory exceeded exit codes towards task failures
        case VMEM_EXCEEDED_EXIT_CODE =>
          (true, memLimitExceededLogMessage(
            completedContainer.getDiagnostics,
            VMEM_EXCEEDED_PATTERN))
        case PMEM_EXCEEDED_EXIT_CODE =>
          (true, memLimitExceededLogMessage(
            completedContainer.getDiagnostics,
            PMEM_EXCEEDED_PATTERN))
        case _ =>
          // Enqueue the timestamp of failed executor
          failedExecutorsTimeStamps.enqueue(clock.getTimeMillis())
          (true, "Container marked as failed: " + containerId + onHostStr +
            ". Exit status: " + completedContainer.getExitStatus +
            ". Diagnostics: " + completedContainer.getDiagnostics)
      }
      if (exitCausedByApp) {
        logWarning(containerExitReason)
      } else {
        logInfo(containerExitReason)
      }
      ExecutorExited(exitStatus, exitCausedByApp, containerExitReason)
    } else {
      // If we have already released this container, then it must mean
      // that the driver has explicitly requested it to be killed
      ExecutorExited(completedContainer.getExitStatus, exitCausedByApp = false,
        s"Container $containerId exited from explicit termination request.")
    }

    for {
      host <- hostOpt
      containerSet <- allocatedHostToContainersMap.get(host)
    } {
      containerSet.remove(containerId)
      if (containerSet.isEmpty) {
        allocatedHostToContainersMap.remove(host)
      } else {
        allocatedHostToContainersMap.update(host, containerSet)
      }
      allocatedContainerToHostMap.remove(containerId)
    }

    containerIdToExecutorId.remove(containerId).foreach { eid =>
      executorIdToContainer.remove(eid)
      pendingLossReasonRequests.remove(eid) match {
        case Some(pendingRequests) =>
          // Notify application of executor loss reasons so it can decide whether it should abort
          pendingRequests.foreach(_.reply(exitReason))
        case None =>
          // We cannot find executor for pending reasons. This is because completed container
          // is processed before querying pending result. We should store it for later query.
          // This is usually happened when explicitly killing a container, the result will be
          // returned in one AM-RM communication. So query RPC will be later than this completed
          // container process.
          releasedExecutorLossReasons.put(eid, exitReason)
      }
      if (!alreadyReleased) {
        // The executor could have gone away (like no route to host, node failure, etc)
        // Notify backend about the failure of the executor
        numUnexpectedContainerRelease += 1
        driverRef.send(RemoveExecutor(eid, exitReason))
      }
    }
  }
}
private object YarnAllocator {
  val MEM_REGEX = "[0-9.]+ [KMG]B"
  val PMEM_EXCEEDED_PATTERN =
    Pattern.compile(s"$MEM_REGEX of $MEM_REGEX physical memory used")
  val VMEM_EXCEEDED_PATTERN =
    Pattern.compile(s"$MEM_REGEX of $MEM_REGEX virtual memory used")
  val VMEM_EXCEEDED_EXIT_CODE = -103
  val PMEM_EXCEEDED_EXIT_CODE = -104

  def memLimitExceededLogMessage(diagnostics: String, pattern: Pattern): String = {
    val matcher = pattern.matcher(diagnostics)
    val diag = if (matcher.find()) " " + matcher.group() + "." else ""
    ("Container killed by YARN for exceeding memory limits." + diag
      + " Consider boosting spark.yarn.executor.memoryOverhead.")
  }
}
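To make the memory-kill message concrete, here is a small self-contained sketch that runs the same pattern as memLimitExceededLogMessage against a hypothetical NodeManager diagnostics string (the text below is illustrative, modeled on typical YARN output, not captured from a real cluster):

import java.util.regex.Pattern

object MemLimitMessageDemo {
  // Same regexes as in the YarnAllocator companion object above.
  val MEM_REGEX = "[0-9.]+ [KMG]B"
  val PMEM_EXCEEDED_PATTERN =
    Pattern.compile(s"$MEM_REGEX of $MEM_REGEX physical memory used")

  def main(args: Array[String]): Unit = {
    // Hypothetical diagnostics text for a container killed on physical memory.
    val diagnostics =
      "Container [pid=12345,containerID=container_0000] is running beyond physical " +
        "memory limits. Current usage: 5.1 GB of 5 GB physical memory used; " +
        "7.2 GB of 10.5 GB virtual memory used. Killing container."

    val matcher = PMEM_EXCEEDED_PATTERN.matcher(diagnostics)
    if (matcher.find()) {
      // Prints the fragment YarnAllocator embeds in its warning:
      // 5.1 GB of 5 GB physical memory used
      println(matcher.group())
    }
  }
}

This extracted fragment is what ends up in the familiar "Container killed by YARN for exceeding memory limits" warning.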
Hadoop YARN
ContainerStatusProto
private void initFields() {
  containerId_ = org.apache.hadoop.yarn.proto.YarnProtos.ContainerIdProto.getDefaultInstance();
  state_ = org.apache.hadoop.yarn.proto.YarnProtos.ContainerStateProto.C_NEW;
  diagnostics_ = "N/A";
  exitStatus_ = -1000;
}

private byte memoizedIsInitialized = -1;

public final boolean isInitialized() {
  byte isInitialized = memoizedIsInitialized;
  if (isInitialized != -1) return isInitialized == 1;
  memoizedIsInitialized = 1;
  return true;
}

public void writeTo(com.google.protobuf.CodedOutputStream output)
    throws java.io.IOException {
  getSerializedSize();
  if (((bitField0_ & 0x00000001) == 0x00000001)) {
    output.writeMessage(1, containerId_);
  }
  if (((bitField0_ & 0x00000002) == 0x00000002)) {
    output.writeEnum(2, state_.getNumber());
  }
  if (((bitField0_ & 0x00000004) == 0x00000004)) {
    output.writeBytes(3, getDiagnosticsBytes());
  }
  if (((bitField0_ & 0x00000008) == 0x00000008)) {
    output.writeInt32(4, exitStatus_);
  }
  getUnknownFields().writeTo(output);
}
ContainerStatusProtoOrBuilder
int getExitStatus();
ContainerStatusPBImpl
@Override
public synchronized int getExitStatus() {
  ContainerStatusProtoOrBuilder p = viaProto ? proto : builder;
  return p.getExitStatus();
}
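For quick reference, the special exit codes that appear in the listings above can be labeled with a small helper. This is not part of Spark or Hadoop; it is a sketch that assumes the standard ContainerExitStatus constants (SUCCESS = 0, PREEMPTED = -102, INVALID = -1000) and mirrors the -103/-104 literals that YarnAllocator hardcodes:

import org.apache.hadoop.yarn.api.records.ContainerExitStatus

object ExitStatusLabels {
  // -1000 (ContainerExitStatus.INVALID) is the proto default set in initFields(),
  // i.e. "no exit status reported yet".
  def describe(exitStatus: Int): String = exitStatus match {
    case ContainerExitStatus.SUCCESS   => "container finished normally (0)"
    case ContainerExitStatus.PREEMPTED => "container preempted by YARN for resource sharing (-102)"
    case -103                          => "killed by YARN: virtual memory limit exceeded"
    case -104                          => "killed by YARN: physical memory limit exceeded"
    case ContainerExitStatus.INVALID   => "exit status not yet reported (-1000, the proto default)"
    case other                         => s"container failed with exit status $other"
  }
}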
On the role of spark.driver.memoryOverhead
Spark's configuration documentation describes it as "The amount of off-heap memory to be allocated per driver in cluster mode, in MiB unless otherwise specified."
Solutions
Remove RDD cache operations.
Increase the spark.storage.memoryFraction coefficient.
Increase spark.yarn.executor.memoryOverhead, for example:
--conf spark.yarn.executor.memoryOverhead=4096
Reduce spark.default.parallelism and spark.sql.shuffle.partitions (see the sketch after this list).
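A minimal sketch of setting the remaining knobs on a SparkConf before the context is created (values are illustrative, and the YARN overhead setting is usually passed on the command line as shown above so that it is in place when containers are requested):

import org.apache.spark.{SparkConf, SparkContext}

object TunedJobConf {
  def main(args: Array[String]): Unit = {
    // Illustrative values only; tune them against your own job and cluster.
    val conf = new SparkConf()
      .setAppName("tuned-job")                     // hypothetical application name
      .set("spark.storage.memoryFraction", "0.8")  // raised above the 0.6 default of the legacy memory manager
      .set("spark.default.parallelism", "400")     // fewer partitions for RDD shuffles
      .set("spark.sql.shuffle.partitions", "400")  // fewer partitions for SQL/DataFrame shuffles
    val sc = new SparkContext(conf)
    // ... job code ...
    sc.stop()
  }
}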
According to the source code:
private[spark] object MapStatus {
  def apply(loc: BlockManagerId, uncompressedSizes: Array[Long]): MapStatus = {
    if (uncompressedSizes.length > 2000) {
      HighlyCompressedMapStatus(loc, uncompressedSizes)
    } else {
      new CompressedMapStatus(loc, uncompressedSizes)
    }
  }
}
once the number of shuffle partitions exceeds 2000, Spark switches to HighlyCompressedMapStatus, so trying a partition count just above 2000 can improve performance by shrinking the per-task map status that is sent back to the driver.
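A hedged sketch of putting that threshold to work: pushing spark.sql.shuffle.partitions just past 2000 means each map task reports a HighlyCompressedMapStatus (a bitmap of empty blocks plus an average block size) instead of one compressed size per reduce partition. Names and numbers below are illustrative:

import org.apache.spark.sql.SparkSession

object MapStatusThresholdDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("mapstatus-threshold").getOrCreate()
    import spark.implicits._

    // More than 2000 reduce partitions => MapStatus.apply picks HighlyCompressedMapStatus.
    spark.conf.set("spark.sql.shuffle.partitions", "2001")

    val counts = (1 to 1000000).toDF("id")
      .groupBy($"id" % 100)
      .count()
    counts.show()

    spark.stop()
  }
}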