SystemServer.java
/**
 * Excerpt of SystemServer.startOtherServices() showing the three Watchdog
 * touch points: obtaining the singleton, initializing it, and starting it
 * once the activity manager reports that the system is ready.
 * Lines consisting of "..." mark code omitted from the excerpt.
 */
private void startOtherServices() {
    ...
    // Create the watchdog (see section 1.2).
    final Watchdog watchdog = Watchdog.getInstance();
    // Initialize the watchdog (see section 1.3).
    watchdog.init(context, mActivityManagerService);
    ...
    mSystemServiceManager.startBootPhase(SystemService.PHASE_LOCK_SETTINGS_READY); // 480
    ...
    mActivityManagerService.systemReady(new Runnable() {
        public void run() {
            mSystemServiceManager.startBootPhase(
                    SystemService.PHASE_ACTIVITY_MANAGER_READY);
            ...
            // Start the watchdog thread (see section 1.4).
            Watchdog.getInstance().start();
            mSystemServiceManager.startBootPhase(
                    SystemService.PHASE_THIRD_PARTY_APPS_CAN_START);
        }
    });
}
从上面可以看到 watchdog 初始化的过程主要分为三步:创建实例(Watchdog.getInstance())、初始化(watchdog.init())、启动(Watchdog.getInstance().start())。
下面我们分这三步来分别看一下
Watchdog.java
/**
 * Returns the shared Watchdog, constructing it on the first call.
 *
 * @return the single Watchdog instance held in {@code sWatchdog}
 */
public static Watchdog getInstance() {
    if (sWatchdog != null) {
        return sWatchdog;
    }
    // First call: create and remember the instance.
    sWatchdog = new Watchdog();
    return sWatchdog;
}
可以看到这就是一个单例模式,下面看一下 Watchdog 的构造函数
Watchdog.java
private Watchdog() {
super("watchdog");
// 初始化各 handler checker
// fg 线程是最主要的 check 对象,同时各个 MonitorChecker 也会被添加到这个 HandlerChecker
mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
"foreground thread", DEFAULT_TIMEOUT);
mHandlerCheckers.add(mMonitorChecker);
// Add checker for main thread. We only do a quick check since there
// can be UI running on the thread.
mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
"main thread", DEFAULT_TIMEOUT));
// Add checker for shared UI thread.
mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
"ui thread", DEFAULT_TIMEOUT));
// And also check IO thread.
mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
"i/o thread", DEFAULT_TIMEOUT));
// And the display thread.
mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
"display thread", DEFAULT_TIMEOUT));
// Initialize monitor for Binder threads.
addMonitor(new BinderThreadMonitor());
}
可以看到这里的作用是初始化各个 HandlerChecker,并将他们添加到 mHandlerCheckers 这个 ArrayList 中
Watchdog.java
/**
 * Tracks the liveness of one Handler's thread, plus any Monitors attached to
 * it. This excerpt shows only the fields and the constructor; the methods
 * (scheduleCheckLocked(), run(), getCompletionStateLocked(), ...) are shown
 * separately.
 */
public final class HandlerChecker implements Runnable {
    private final Handler mHandler;
    private final String mName;      // name used when describing this checker
    private final long mWaitMax;     // maximum wait time, in milliseconds
    // Monitors checked on this handler. The element type is Monitor:
    // run() assigns mMonitors.get(i) to a Monitor field.
    private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
    private boolean mCompleted;      // whether the current check round has finished
    private Monitor mCurrentMonitor; // monitor currently being checked, if any
    private long mStartTime;         // uptime at which the current check started

    /**
     * @param handler       handler whose thread is checked
     * @param name          name of the checked thread
     * @param waitMaxMillis maximum time a check may take before it is overdue
     */
    HandlerChecker(Handler handler, String name, long waitMaxMillis) {
        mHandler = handler;
        mName = name;
        mWaitMax = waitMaxMillis;
        // Nothing is pending yet, so the checker starts out "completed".
        mCompleted = true;
    }
}
可以看到仅仅是初始化了一些成员变量,各个成员的含义见注释
Watchdog.java
/**
 * Registers a monitor on the foreground-thread checker (mMonitorChecker).
 *
 * @param monitor the monitor to add
 * @throws RuntimeException if the watchdog thread is already alive
 */
public void addMonitor(Monitor monitor) {
    synchronized (this) {
        final boolean alreadyRunning = isAlive();
        if (alreadyRunning) {
            throw new RuntimeException("Monitors can't be added once the Watchdog is running");
        }
        // All monitors are attached to the foreground-thread checker.
        mMonitorChecker.addMonitor(monitor);
    }
}
这里的作用是将 new BinderThreadMonitor() 添加到 mMonitorChecker 中,也就是 fg 线程的 HandlerChecker 中,这个 Monitor 是用来 check binder 线程的,用来确保其他进程可以与 system_server 进程通信
Watchdog.java
/**
 * Stores the content resolver and the ActivityManagerService, and registers
 * a receiver for ACTION_REBOOT broadcasts guarded by the REBOOT permission.
 *
 * @param context  context used to obtain the resolver and register the receiver
 * @param activity the ActivityManagerService instance
 */
public void init(Context context, ActivityManagerService activity) {
    mResolver = context.getContentResolver();
    mActivity = activity; // AMS

    final RebootRequestReceiver rebootReceiver = new RebootRequestReceiver();
    final IntentFilter rebootFilter = new IntentFilter(Intent.ACTION_REBOOT);
    context.registerReceiver(
            rebootReceiver, rebootFilter, android.Manifest.permission.REBOOT, null);
}
可以看到这里的作用主要是对 mResolver、mActivity 进行赋值,并且注册了一个 RebootRequestReceiver 来监听 ACTION_REBOOT 的广播
Watchdog.java
/**
 * Handles ACTION_REBOOT broadcasts: reboots when the integer extra "nowait"
 * is non-zero, otherwise logs the broadcast as unsupported.
 */
final class RebootRequestReceiver extends BroadcastReceiver {
    @Override
    public void onReceive(Context c, Intent intent) {
        final boolean rebootNow = intent.getIntExtra("nowait", 0) != 0;
        if (!rebootNow) {
            Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
            return;
        }
        rebootSystem("Received ACTION_REBOOT broadcast");
    }
}
/**
 * Asks the power manager service to reboot the device.
 *
 * @param reason human-readable reason, logged and passed on to reboot()
 */
void rebootSystem(String reason) {
    Slog.i(TAG, "Rebooting system because: " + reason);
    IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
    try {
        pms.reboot(false, reason, false);
    } catch (RemoteException ex) {
        // Still best-effort, but leave a trace instead of failing silently.
        Slog.w(TAG, "Failed to reboot system", ex);
    }
}
可以看到其是监听到广播并且条件符合的情况下通过 PMS 重启手机
Watchdog.getInstance().start() 这一步会启动 “watchdog” 线程,进而执行该线程的 run() 方法,下面我们来具体看一下 “watchdog” 线程是如何检测的
Watchdog.java
public void run() {
boolean waitedHalf = false;
while (true) {
final ArrayList blockedCheckers; // 用于记录被 block 的 Checkers
final String subject;
final boolean allowRestart;
int debuggerWasConnected = 0;
synchronized (this) {
long timeout = CHECK_INTERVAL; // 正常模式下为 30s
for (int i=0; i// 第一步,对每个 HandlerChecker 执行 scheduleCheckLocked() 方法
hc.scheduleCheckLocked();
}
if (debuggerWasConnected > 0) {
debuggerWasConnected--;
}
// 第二步,等待 30s
long start = SystemClock.uptimeMillis();
while (timeout > 0) {
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
try {
wait(timeout);
} catch (InterruptedException e) {
Log.wtf(TAG, e);
}
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
}
// 第三步,得悉 check 的结果
final int waitState = evaluateCheckerCompletionLocked();
if (waitState == COMPLETED) {
// check 通过,reset
waitedHalf = false;
continue;
} else if (waitState == WAITING) {
// ?没有搞清这里存在的意义
continue;
} else if (waitState == WAITED_HALF) { //
if (!waitedHalf) {
// We've waited half the deadlock-detection interval. Pull a stack
// trace and wait another half.
ArrayList pids = new ArrayList();
pids.add(Process.myPid());
ActivityManagerService.dumpStackTraces(true, pids, null, null,
NATIVE_STACKS_OF_INTEREST);
waitedHalf = true;
}
continue;
}
// block 超过 60s,获得被 block 的 Checkers 信息等
blockedCheckers = getBlockedCheckersLocked();
subject = describeCheckersLocked(blockedCheckers);
allowRestart = mAllowRestart;
}
// 第四步,走到这里意谓着系统很有可能 hung 住了
// First collect stack traces from all threads of the system process.
// Then kill this process so that the system will restart.
EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
ArrayList pids = new ArrayList();
pids.add(Process.myPid());
if (mPhonePid > 0) pids.add(mPhonePid);
// Pass !waitedHalf so that just in case we somehow wind up here without having
// dumped the halfway stacks, we properly re-initialize the trace file.
final File stack = ActivityManagerService.dumpStackTraces(
!waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
// Give some extra time to make sure the stack traces get written.
// The system's been hanging for a minute, another second or two won't hurt much.
SystemClock.sleep(2000);
// Pull our own kernel thread stacks as well if we're configured for that
if (RECORD_KERNEL_THREADS) {
dumpKernelStackTraces();
}
String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
String traceFileNameAmendment = "_SystemServer_WDT" + mTraceDateFormat.format(new Date());
if (tracesPath != null && tracesPath.length() != 0) {
File traceRenameFile = new File(tracesPath);
String newTracesPath;
int lpos = tracesPath.lastIndexOf (".");
if (-1 != lpos)
newTracesPath = tracesPath.substring (0, lpos) + traceFileNameAmendment + tracesPath.substring (lpos);
else
newTracesPath = tracesPath + traceFileNameAmendment;
traceRenameFile.renameTo(new File(newTracesPath));
tracesPath = newTracesPath;
}
final File newFd = new File(tracesPath);
// Try to add the error to the dropbox, but assuming that the ActivityManager
// itself may be deadlocked. (which has happened, causing this statement to
// deadlock and the watchdog as a whole to be ineffective)
Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
public void run() {
mActivity.addErrorToDropBox(
"watchdog", null, "system_server", null, null,
subject, null, newFd, null);
}
};
dropboxThread.start();
try {
dropboxThread.join(2000); // wait up to 2 seconds for it to return.
} catch (InterruptedException ignored) {}
// Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
Slog.e(TAG, "Triggering SysRq for system_server watchdog");
doSysRq('w');
doSysRq('l');
// At times, when user space watchdog traces don't give an indication on
// which component held a lock, because of which other threads are blocked,
// (thereby causing Watchdog), crash the device to analyze RAM dumps
boolean crashOnWatchdog = SystemProperties
.getBoolean("persist.sys.crashOnWatchdog", false);
if (crashOnWatchdog) {
// wait until the above blocked threads be dumped into kernel log
SystemClock.sleep(3000);
// now try to crash the target
doSysRq('c');
}
IActivityController controller;
...
// Only kill the process if the debugger is not attached.
if (...) {
} else if (!allowRestart) {
Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
} else {
Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
for (int i=0; i" stack trace:");
StackTraceElement[] stackTrace
= blockedCheckers.get(i).getThread().getStackTrace();
for (StackTraceElement element: stackTrace) {
Slog.w(TAG, " at " + element);
}
}
Slog.w(TAG, "*** GOODBYE!");
Process.killProcess(Process.myPid());
System.exit(10);
}
waitedHalf = false;
}
}
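可以看到其主要分为四步:(1)对每个 HandlerChecker 执行 scheduleCheckLocked();(2)等待 CHECK_INTERVAL(正常模式下为 30s);(3)通过 evaluateCheckerCompletionLocked() 得到 check 的结果;(4)若结果为超时(OVERDUE),则收集 traces 等信息并杀掉 system_server 进程。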
下面,我们从一、三、四步来分别进行分析
Watchdog.java#HandlerChecker
/**
 * Starts a new check round on this checker's handler unless one is already
 * pending. A checker with no monitors whose looper queue is polling is
 * marked completed without posting anything.
 */
public void scheduleCheckLocked() {
    final boolean hasNoMonitors = mMonitors.size() == 0;
    if (hasNoMonitors && mHandler.getLooper().getQueue().isPolling()) {
        // Target looper is polling and there are no monitors to run.
        mCompleted = true;
        return;
    }

    if (mCompleted) {
        // Begin a new round: reset state, then post this checker at the
        // front of the handler's message queue.
        mCompleted = false;
        mCurrentMonitor = null;
        mStartTime = SystemClock.uptimeMillis();
        mHandler.postAtFrontOfQueue(this);
    }
    // Otherwise a check is still in flight; nothing to reschedule.
}
可以看到第二步将 msg 放到 mHandler 的 msg 队列首,这样 mHandler 在处理完当前的 msg 后,就会处理到这个 msg,会调用到 HandlerChecker 的 run() 方法。
如果当前线程中存在耗时较长的操作,就会导致在某次 Handler Check 的时候 msg 不能立刻执行,这就是对 Handler check 的原理。
Watchdog.java#HandlerChecker
// Executed on the checked handler's thread after scheduleCheckLocked() posts
// this checker to the front of that handler's queue. Calls monitor() on each
// attached Monitor in order, then marks the round as completed.
public void run() {
// The monitor count is read once, before the loop.
final int size = mMonitors.size();
for (int i = 0 ; i < size ; i++) {
synchronized (Watchdog.this) {
// Record which monitor is about to run; describeBlockedStateLocked()
// reports it if this checker is later found blocked.
mCurrentMonitor = mMonitors.get(i);
}
// Called outside the Watchdog lock.
mCurrentMonitor.monitor();
}
synchronized (Watchdog.this) {
// Every monitor returned: this round is done.
mCompleted = true;
mCurrentMonitor = null;
}
}
可以看到,这里是对 HandlerChecker 中的每个 Monitor 执行 monitor() 方法,monitor() 实际上是一个拿锁操作,如果有其他线程一直持锁,譬如 “ActivityManager” 线程一直持着 AMS 的 this 锁(Monitor 要事先添加到 mMonitors 中,见后面),那么 monitor() 将一直被 block 无法返回,导致超时,这就是 Monitor Check 的原理。
Watchdog.java
private int evaluateCheckerCompletionLocked() {
int state = COMPLETED;
for (int i=0; ireturn state;
}
可以看到其是遍历所有的 HandlerChecker,并取出它们数值最大的状态,状态包含四种,分别是:COMPLETED = 0、WAITING = 1、WAITED_HALF = 2、OVERDUE = 3
Watchdog.java#HandlerChecker
/**
 * Classifies this checker's current round.
 *
 * @return COMPLETED if the round has finished; otherwise WAITING,
 *         WAITED_HALF or OVERDUE depending on how the elapsed time compares
 *         with mWaitMax/2 and mWaitMax
 */
public int getCompletionStateLocked() {
    if (mCompleted) {
        return COMPLETED;
    }

    final long elapsed = SystemClock.uptimeMillis() - mStartTime;
    if (elapsed < mWaitMax/2) {
        return WAITING;
    }
    return (elapsed < mWaitMax) ? WAITED_HALF : OVERDUE;
}
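分两种情况返回状态:若 mCompleted 为 true,返回 COMPLETED;否则根据已等待的时间 latency 返回:latency < mWaitMax/2 时为 WAITING,mWaitMax/2 <= latency < mWaitMax 时为 WAITED_HALF,latency >= mWaitMax 时为 OVERDUE。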
Watchdog.run()
// Excerpt from Watchdog.run(): how each combined wait state is handled.
if (waitState == COMPLETED) {
// 1. COMPLETED: reset waitedHalf to its initial value and start the next round.
waitedHalf = false;
continue;
} else if (waitState == WAITING) {
// 2. WAITING: go straight to the next round.
continue;
} else if (waitState == WAITED_HALF) {
if (!waitedHalf) {
// 3. First time in WAITED_HALF: dump traces, then run another round to
// see how the state develops.
ArrayList pids = new ArrayList();
pids.add(Process.myPid());
ActivityManagerService.dumpStackTraces(true, pids, null, null,
NATIVE_STACKS_OF_INTEREST);
waitedHalf = true;
}
continue;
}
// 4. OVERDUE: generally equivalent to a second consecutive WAITED_HALF.
blockedCheckers = getBlockedCheckersLocked();
subject = describeCheckersLocked(blockedCheckers);
allowRestart = mAllowRestart;
上面的注释列出了对于四种状态分别是如何处理的,注意第四种状态 OVERDUE 就是通常所说的 Watchdog 超时了,后面还需对其进行更多处理
Watchdog.java
private ArrayList getBlockedCheckersLocked() {
ArrayList checkers = new ArrayList();
for (int i=0; iif (hc.isOverdueLocked()) {
checkers.add(hc);
}
}
return checkers;
}
返回处于 OVERDUE 状态的 HandlerChecker 的 ArrayList
Watchdog.java
private String describeCheckersLocked(ArrayList checkers) {
StringBuilder builder = new StringBuilder(128);
for (int i=0; iif (builder.length() > 0) {
builder.append(", ");
}
builder.append(checkers.get(i).describeBlockedStateLocked());
}
return builder.toString();
}
Watchdog.java#HandlerChecker
/**
 * Describes where this checker is stuck.
 *
 * @return "Blocked in handler on ..." when no monitor is recorded as
 *         running, otherwise "Blocked in monitor <class> on ..."; both forms
 *         end with the checker name and its thread name in parentheses
 */
public String describeBlockedStateLocked() {
    final StringBuilder text = new StringBuilder();
    if (mCurrentMonitor != null) {
        text.append("Blocked in monitor ")
                .append(mCurrentMonitor.getClass().getName())
                .append(" on ");
    } else {
        text.append("Blocked in handler on ");
    }
    text.append(mName).append(" (").append(getThread().getName()).append(")");
    return text.toString();
}
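可以看到这里是将每个 Blocked 的 checker 的信息拼在一起,每个 Blocked 的 checker 的信息是由 describeBlockedStateLocked() 方法来获得的,主要分为两种情况:mCurrentMonitor 为 null 时,说明 block 在 handler 的消息处理上(“Blocked in handler on ...”);否则说明 block 在某个 Monitor 的 monitor() 调用上(“Blocked in monitor ...”)。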
并且这些信息会用于后面 log 的打印
ActivityManagerService.java
/**
 * Prepares the traces file named by the "dalvik.vm.stack-trace-file" system
 * property and then writes stack traces into it.
 *
 * @param clearTraces      when true, an existing traces file is deleted first
 * @param firstPids        pids passed on to the trace writer
 * @param processCpuTracker passed on to the trace writer; callers shown here pass null
 * @param lastPids         passed on to the trace writer; callers shown here pass null
 * @param nativeProcs      native process names passed on to the trace writer
 * @return the traces file, or null when the property is unset/empty or the
 *         file could not be prepared
 */
public static File dumpStackTraces(boolean clearTraces, ArrayList<Integer> firstPids,
        ProcessCpuTracker processCpuTracker, SparseArray<Boolean> lastPids, String[] nativeProcs) {
    // Per the original note, this is /data/anr/traces.txt by default.
    String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
    if (tracesPath == null || tracesPath.length() == 0) {
        return null;
    }
    File tracesFile = new File(tracesPath);
    try {
        File tracesDir = tracesFile.getParentFile();
        if (!tracesDir.exists()) {
            tracesDir.mkdirs();
            if (!SELinux.restorecon(tracesDir)) {
                return null;
            }
        }
        FileUtils.setPermissions(tracesDir.getPath(), 0775, -1, -1); // drwxrwxr-x
        // Delete the existing file only when asked to clear it; otherwise
        // the new traces are added to what is already there.
        if (clearTraces && tracesFile.exists()) tracesFile.delete();
        tracesFile.createNewFile();
        FileUtils.setPermissions(tracesFile.getPath(), 0666, -1, -1); // -rw-rw-rw-
    } catch (IOException e) {
        Slog.w(TAG, "Unable to prepare ANR traces file: " + tracesPath, e);
        return null;
    }
    // Write the traces.
    dumpStackTraces(tracesPath, firstPids, processCpuTracker, lastPids, nativeProcs);
    return tracesFile;
}
可以看到这里主要是初始化目录和文件等,可以通过设置 clearTraces 来决定是否清除之前的 traces,这就是 Watchdog traces 文件中可以有两个时间点的 traces 的原因。
Watchdog.java
/**
 * Dumps kernel stacks into the file named by the
 * "dalvik.vm.stack-trace-file" system property via native_dumpKernelStacks().
 *
 * @return the traces file, or null when the property is unset or empty
 */
private File dumpKernelStackTraces() {
    final String path = SystemProperties.get("dalvik.vm.stack-trace-file", null);
    final boolean hasPath = path != null && path.length() != 0;
    if (!hasPath) {
        return null;
    }
    native_dumpKernelStacks(path);
    return new File(path);
}
通过调用 native_dumpKernelStacks(tracesPath) 来 dump kernel traces,即下面的方法
android_server_Watchdog.cpp
// Appends the kernel stack of every thread of the current process to the
// file at `path`. Lines consisting of "..." mark code omitted from the
// excerpt (including where `path`, `buf`, `taskdir` and the `done` label
// are set up).
static void dumpKernelStacks(JNIEnv* env, jobject clazz, jstring pathStr) {
...
// Open for append, creating the file (mode rw-rw-rw-) if it does not exist.
int outFd = open(path, O_WRONLY | O_APPEND | O_CREAT,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH);
if (outFd < 0) {
ALOGE("Unable to open stack dump file: %d (%s)", errno, strerror(errno));
goto done;
}
// Section header identifying this process.
snprintf(buf, sizeof(buf), "\n----- begin pid %d kernel stacks -----\n", getpid());
write(outFd, buf, strlen(buf));
// look up the list of all threads in this process
snprintf(buf, sizeof(buf), "/proc/%d/task", getpid());
taskdir = opendir(buf);
if (taskdir != NULL) {
struct dirent * ent;
while ((ent = readdir(taskdir)) != NULL) {
// Entries whose name does not parse to a tid in 1..65535 are skipped.
int tid = atoi(ent->d_name);
if (tid > 0 && tid <= 65535) {
// dump each stack trace
dumpOneStack(tid, outFd);
}
}
closedir(taskdir);
}
...
}
可以看出这里是通过 /proc/%d/task 节点获取进程的所有线程信息,然后再通过 dumpOneStack 方法 dump 每个线程的 stack
android_server_Watchdog.cpp
// Copies the contents of /proc/<tid>/stack to outFd, preceded by a
// "<path>:" header line and followed by a blank line. Logs an error and
// writes nothing if the stack file cannot be opened.
static void dumpOneStack(int tid, int outFd) {
    char buf[64];
    snprintf(buf, sizeof(buf), "/proc/%d/stack", tid);

    const int stackFd = open(buf, O_RDONLY);
    if (stackFd < 0) {
        ALOGE("Unable to open stack of tid %d : %d (%s)", tid, errno, strerror(errno));
        return;
    }

    // Header for readability: reuse the path still held in buf.
    strncat(buf, ":\n", sizeof(buf) - strlen(buf) - 1);
    write(outFd, buf, strlen(buf));

    // Copy the stack dump text chunk by chunk until EOF or a read error.
    for (;;) {
        const int nBytes = read(stackFd, buf, sizeof(buf));
        if (nBytes <= 0) {
            break;
        }
        write(outFd, buf, nBytes);
    }

    write(outFd, "\n", 1);
    close(stackFd);
}
通过读取 /proc/%d/stack 节点来 dump 每个线程的 kernel stack
Watchdog.java
/**
 * Writes a single command character to /proc/sysrq-trigger. Failures are
 * logged and otherwise ignored.
 *
 * @param c the SysRq command character to write
 */
private void doSysRq(char c) {
    try {
        FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
        try {
            sysrq_trigger.write(c);
        } finally {
            // Close even when write() throws, so the descriptor is not leaked.
            sysrq_trigger.close();
        }
    } catch (IOException e) {
        Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
    }
}
通过向节点 /proc/sysrq-trigger 写入字符,触发 kernel 操作
不详细介绍了
Watchdog 是一个运行在 system_server 进程的名为 “watchdog” 的线程,可以看到:
- Watchdog 超时后会在 /data/anr 目录下生成 “traces_SystemServer_WDT时间戳.txt” 的 traces 文件,其是由 dump traces 的文件 “traces.txt” 重命名而来的
- 通过 /proc/%d/task 节点获取进程的所有线程信息
- 通过 /proc/%d/stack 节点来 dump 每个线程的 kernel stack