一直在想如何介绍Watchdog,思来想去还是源码中的注释比较给力:“This class calls its monitor every minute. Killing this process if they don't return”,简单粗暴。Android系统为了保证系统的稳定性,设计了Watchdog,专门负责监控Android系统的一些核心服务和线程,并且在这些服务和线程发生异常或者block时进行重启,并保存问题发生时的现场。Watchdog分为两类:检测硬件的hardware watchdog,以及检测system server关键服务和线程的system server watchdog(下面简称为sswd)。本文主要结合Android P代码分析后者的原理。
使用gdb工具从coredump解析出了系统watchdog线程中mHandlerCheckers集合的数据,便可以获取sswd检测的服务和线程
Watchdog监听的系统关键线程
[000] = 0x184cbaa0 Lcom/android/server/Watchdog$HandlerChecker; foreground thread
[001] = 0x184cbf70 Lcom/android/server/Watchdog$HandlerChecker; main thread
[002] = 0x184cbfa0 Lcom/android/server/Watchdog$HandlerChecker; ui thread
[003] = 0x184cbfd0 Lcom/android/server/Watchdog$HandlerChecker; i/o thread
[004] = 0x184cc000 Lcom/android/server/Watchdog$HandlerChecker; display thread
[005] = 0x184cc030 Lcom/android/server/Watchdog$HandlerChecker; ActivityManager
[006] = 0x184cc060 Lcom/android/server/Watchdog$HandlerChecker; PowerManagerService
[007] = 0x184cc090 Lcom/android/server/Watchdog$HandlerChecker; main//同main thread
[008] = 0x184cc0c0 Lcom/android/server/Watchdog$HandlerChecker; PackageManager
[009] = 0x184cc0f0 Lcom/android/server/Watchdog$HandlerChecker; PackageManager//同上
fg->mMonitors(deadlock监听)核心服务
[000] = 0x184cbf30 Lcom/android/server/Watchdog$BinderThreadMonitor;
[001] = 0x15b00a80 Lcom/android/server/am/ActivityManagerService;
[002] = 0x15b1f770 Lcom/android/server/power/PowerManagerService;
[003] = 0x172759f0 Lcom/sonymobile/server/mirrorpowersave/LcdPowerSaveService;
[004] = 0x15b02220 Lcom/android/server/wm/WindowManagerService;
[005] = 0x15e4ee58 Lcom/android/server/input/InputManagerService;
[006] = 0x15e78220 Lcom/android/server/NetworkManagementService;
[007] = 0x18028bf8 Lcom/android/server/media/MediaSessionService;
[008] = 0x1726a8b0 Lcom/android/server/media/MediaRouterService;
[009] = 0x13f0d010 Lcom/android/server/media/projection/MediaProjectionManagerService;
设定检测超时时间为60s,通过四种状态判定系统服务和线程的工作状态,自旋修改自身的状态
检测算法
@Override
public void run() {
boolean waitedHalf = false;
File initialStack = null;
final ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(true);
processCpuTracker.init();
while (true) {
final List blockedCheckers;//记录异常的服务
final String subject;
final boolean allowRestart;
int debuggerWasConnected = 0;
synchronized (this) {
long timeout = CHECK_INTERVAL;//决定检测频率,减少功耗
// Make sure we (re)spin the checkers that have become idle within
// this wait-and-check interval
for (int i=0; i 0) {
debuggerWasConnected--;
}
// NOTE: We use uptimeMillis() here because we do not want to increment the time we
// wait while asleep. If the device is asleep then the thing that we are waiting
// to timeout on is asleep as well and won't have a chance to run, causing a false
// positive on when to kill things.
long start = SystemClock.uptimeMillis();//记录开始时间
while (timeout > 0) {
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
try {
wait(timeout);//等待30s
} catch (InterruptedException e) {
Log.wtf(TAG, e);
}
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
}//30s继续执行
boolean fdLimitTriggered = false;
if (mOpenFdMonitor != null) {
fdLimitTriggered = mOpenFdMonitor.monitor();
}
//检测的主要算法
//检测分为两段时间前30s,后30s,检测结果分为四种
if (!fdLimitTriggered) {
final int waitState = evaluateCheckerCompletionLocked();//获取当前检测的状态
if (waitState == COMPLETED) {//正常,执行下一次检测
// The monitors have returned; reset
waitedHalf = false;
continue;
} else if (waitState == WAITING) {//执行过程中
// still waiting but within their configured intervals; back off and recheck
continue;
} else if (waitState == WAITED_HALF) {//等待超过30s
if (!waitedHalf) {//先打印一些cpu的使用信息
// We've waited half the deadlock-detection interval. Pull a stack
// trace and wait another half.
ArrayList pids = new ArrayList();
pids.add(Process.myPid());
initialStack = ActivityManagerService.dumpStackTraces(true, pids,
null, null, getInterestingNativePids());
waitedHalf = true;
processCpuTracker.update();
}
continue;
}
// something is overdue!超时发生,获取异常的服务和线程
blockedCheckers = getBlockedCheckersLocked();
subject = describeCheckersLocked(blockedCheckers);
} else {
blockedCheckers = Collections.emptyList();
subject = "Open FD high water mark reached";
}
allowRestart = mAllowRestart;
}
// Only kill the process if the debugger is not attached.
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
if (debuggerWasConnected >= 2) {
Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
} else if (debuggerWasConnected > 0) {
Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
} else if (!allowRestart) {
Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
} else {
Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
Slog.w(TAG, "*** GOODBYE!");
// Check if we should do system dump or not
if (errorHandlingInfo.mSystemDump) {
mActivity.forceCrashDump(errorHandlingInfo);
}
Process.killProcess(Process.myPid());
System.exit(10);//系统重启
}
waitedHalf = false;
}
}
检测关键类HandlerChecker
/**
 * Checks that the thread behind a given Handler is still making progress.
 *
 * A check is performed by posting this Runnable at the front of the handler's
 * message queue. When it runs, it invokes every registered Monitor and then
 * marks the check as completed. The watchdog thread reads the completion
 * state under the Watchdog lock to tell how long a check has been pending.
 */
public final class HandlerChecker implements Runnable {
    // Handler of the thread being checked.
    private final Handler mHandler;
    private final String mName;
    // Maximum time a check may stay pending, in milliseconds
    // (60s according to the accompanying article).
    private final long mWaitMax;
    // Monitors invoked on the checked thread to detect deadlocks. Per the
    // article, only the foreground-thread checker carries monitors.
    private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
    // True when no check is in flight.
    private boolean mCompleted;
    // Monitor currently being invoked, or null when none is.
    private Monitor mCurrentMonitor;
    // Uptime at which the in-flight check was scheduled.
    private long mStartTime;

    HandlerChecker(Handler handler, String name, long waitMaxMillis) {
        mHandler = handler;
        mName = name;
        mWaitMax = waitMaxMillis;
        mCompleted = true;
    }

    /** Registers a monitor to be invoked on every check. */
    public void addMonitor(Monitor monitor) {
        mMonitors.add(monitor);
    }

    /** Starts a new check unless one is already pending or the looper is idle-polling. */
    public void scheduleCheckLocked() {
        if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) {
            // If the target looper has recently been polling, then
            // there is no reason to enqueue our checker on it since that
            // is as good as it not being deadlocked. This avoid having
            // to do a context switch to check the thread. Note that we
            // only do this if we have no monitors, since those would need
            // to be executed at this point.
            mCompleted = true;
            return;
        }

        if (!mCompleted) {
            // we already have a check in flight, so no need
            return;
        }

        mCompleted = false;
        mCurrentMonitor = null;
        mStartTime = SystemClock.uptimeMillis();
        // Post at the head of the queue so the check runs before queued messages.
        mHandler.postAtFrontOfQueue(this);
    }

    /** Returns true when a check is pending and has exceeded mWaitMax. */
    public boolean isOverdueLocked() {
        return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
    }

    /**
     * Returns COMPLETED when no check is pending; otherwise WAITING, WAITED_HALF
     * or OVERDUE depending on whether the pending time is below half of mWaitMax,
     * below mWaitMax, or at least mWaitMax.
     */
    public int getCompletionStateLocked() {
        if (mCompleted) {
            return COMPLETED;
        } else {
            long latency = SystemClock.uptimeMillis() - mStartTime;
            if (latency < mWaitMax/2) {
                return WAITING;
            } else if (latency < mWaitMax) {
                return WAITED_HALF;
            }
        }
        return OVERDUE;
    }

    public Thread getThread() {
        return mHandler.getLooper().getThread();
    }

    public String getName() {
        return mName;
    }

    /** Describes where the check is stuck: in the handler itself or in a specific monitor. */
    public String describeBlockedStateLocked() {
        if (mCurrentMonitor == null) {
            return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
        } else {
            return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
                    + " on " + mName + " (" + getThread().getName() + ")";
        }
    }

    @Override
    public void run() {
        // Phase 1: deadlock detection - invoke each monitor in turn.
        final int size = mMonitors.size();
        for (int i = 0 ; i < size ; i++) {
            synchronized (Watchdog.this) {
                mCurrentMonitor = mMonitors.get(i);
            }
            // The article describes monitor() as trying to take the service's lock,
            // so it blocks here while that lock is held elsewhere.
            mCurrentMonitor.monitor();
        }

        // Phase 2: reaching this point covers two cases.
        //   case 1: mMonitors.size() == 0 - this was a check of whether the
        //           thread's looper/message queue is blocked.
        //   case 2: mMonitors.size() != 0 - this was a deadlock check of whether
        //           a service lock is held for too long.
        // Either way, this posted message has now executed and every monitor has
        // returned, so the check is recorded as completed.
        synchronized (Watchdog.this) {
            mCompleted = true;
            mCurrentMonitor = null;
        }
    }
}
当我们理解了SSWD的原理,会发现其实也并没有什么复杂之处。总结成一句话:SSWD每间隔30s检测一下系统关键的服务和线程,当出现60s超时时,重启SystemServer进程。在实际的开发测试中,遇到SSWD的问题时,我们还是需要依据log信息准确地分析原因。我们知道有两种SSWD问题,一种是死锁,一种是线程block;有些时候虽然报出的看似是死锁或者线程block,但其实是由于一些其他原因导致的。