Android ANR原理代码分析(三)

分析解决ANR

LOG原理

收集

系统发生ANR之后,会通过com.android.server.am.ProcessRecord#appNotResponding收集相关的log信息。主要代码逻辑 如下:

/**
 * Handles an "application not responding" (ANR) event for this process record.
 *
 * <p>In order, the method:
 * <ol>
 *   <li>notifies the window process controller early, handing it a callback that kills the
 *       process;</li>
 *   <li>refreshes CPU stats when CPU monitoring is enabled;</li>
 *   <li>returns without doing anything else if the system is shutting down, or this process
 *       is already marked not-responding, crashing, or killed;</li>
 *   <li>marks the process as not responding and writes {@code AM_ANR} to the event log;</li>
 *   <li>collects the PIDs whose stacks are to be dumped ({@code firstPids},
 *       {@code lastPids}, native PIDs);</li>
 *   <li>builds the main-log message, dumps stack traces, and appends CPU usage;</li>
 *   <li>writes the stats-log atom and a dropbox entry tagged {@code "anr"};</li>
 *   <li>either kills the process (silent ANR, not being debugged) or posts
 *       {@code SHOW_NOT_RESPONDING_UI_MSG} to the UI handler.</li>
 * </ol>
 *
 * @param activityShortComponentName short name of the unresponsive component; may be null, in
 *        which case it is omitted from the log and reported as "unknown" in the stats log
 * @param aInfo application info passed through to {@code AppNotRespondingDialog.Data}
 * @param parentShortComponentName short name of the parent component; may be null
 * @param parentProcess controller of the parent process; may be null. Its pid (if &gt; 0) is
 *        added to the first-dumped PIDs and its owner is passed to the dropbox entry
 * @param aboveSystem passed through to {@code AppNotRespondingDialog.Data}
 * @param annotation reason text for the ANR; may be null
 * @param onlyDumpSelf when true, no other Java process is added to the PID lists, and native
 *        stacks are dumped only if this process is in {@code NATIVE_STACKS_OF_INTEREST}
 */
void appNotResponding(String activityShortComponentName, ApplicationInfo aInfo,
            String parentShortComponentName, WindowProcessController parentProcess,
            boolean aboveSystem, String annotation, boolean onlyDumpSelf) {
        // PIDs whose stacks are dumped first: this process, its parent, MY_PID, and
        // persistent / treat-like-activity processes from the LRU list.
        ArrayList<Integer> firstPids = new ArrayList<>(5);
        // Remaining LRU processes; only passed to dumpStackTraces for non-silent ANRs.
        SparseArray<Boolean> lastPids = new SparseArray<>(20);

        // Early notification, before any log collection; the callback kills this process.
        mWindowProcessController.appEarlyNotResponding(annotation, () -> kill("anr",
                  ApplicationExitInfo.REASON_ANR, true));

        // Captured up front; used below as the reference time for the CPU usage printouts.
        long anrTime = SystemClock.uptimeMillis();
        if (isMonitorCpuUsage()) {
            mService.updateCpuStatsNow();
        }

        final boolean isSilentAnr;
        synchronized (mService) {
            // PowerManager.reboot() can block for a long time, so ignore ANRs while shutting down.
            if (mService.mAtmInternal.isShuttingDown()) {
                Slog.i(TAG, "During shutdown skipping ANR: " + this + " " + annotation);
                return;
            } else if (isNotResponding()) {
                Slog.i(TAG, "Skipping duplicate ANR: " + this + " " + annotation);
                return;
            } else if (isCrashing()) {
                Slog.i(TAG, "Crashing app skipping ANR: " + this + " " + annotation);
                return;
            } else if (killedByAm) {
                Slog.i(TAG, "App already killed by AM skipping ANR: " + this + " " + annotation);
                return;
            } else if (killed) {
                Slog.i(TAG, "Skipping died app ANR: " + this + " " + annotation);
                return;
            }

            // In case we come through here for the same app before completing
            // this one, mark as anring now so we will bail out.
            setNotResponding(true);

            // Log the ANR to the event log.
            // Note: `info` here is the field; the local StringBuilder named `info` is only
            // declared further down.
            EventLog.writeEvent(EventLogTags.AM_ANR, userId, pid, processName, info.flags,
                    annotation);

            // Dump thread traces as quickly as we can, starting with "interesting" processes.
            firstPids.add(pid);

            // Don't dump other PIDs if it's a background ANR or is requested to only dump self.
            isSilentAnr = isSilentAnr();
            if (!isSilentAnr && !onlyDumpSelf) {
                // Falls back to this process's own pid when there is no usable parent.
                int parentPid = pid;
                if (parentProcess != null && parentProcess.getPid() > 0) {
                    parentPid = parentProcess.getPid();
                }
                if (parentPid != pid) firstPids.add(parentPid);

                if (MY_PID != pid && MY_PID != parentPid) firstPids.add(MY_PID);

                // Walk the LRU list from the last index down to 0, skipping PIDs already added.
                for (int i = getLruProcessList().size() - 1; i >= 0; i--) {
                    ProcessRecord r = getLruProcessList().get(i);
                    if (r != null && r.thread != null) {
                        int myPid = r.pid;
                        if (myPid > 0 && myPid != pid && myPid != parentPid && myPid != MY_PID) {
                            if (r.isPersistent()) {
                                firstPids.add(myPid);
                                if (DEBUG_ANR) Slog.i(TAG, "Adding persistent proc: " + r);
                            } else if (r.treatLikeActivity) {
                                firstPids.add(myPid);
                                if (DEBUG_ANR) Slog.i(TAG, "Adding likely IME: " + r);
                            } else {
                                lastPids.put(myPid, Boolean.TRUE);
                                if (DEBUG_ANR) Slog.i(TAG, "Adding ANR proc: " + r);
                            }
                        }
                    }
                }
            }
        }

        // Log the ANR to the main log.
        // From here on the local `info` shadows the field of the same name; the field is
        // reached through `this.info` below.
        StringBuilder info = new StringBuilder();
        info.setLength(0);
        info.append("ANR in ").append(processName);
        if (activityShortComponentName != null) {
            info.append(" (").append(activityShortComponentName).append(")");
        }
        info.append("\n");
        info.append("PID: ").append(pid).append("\n");
        if (annotation != null) {
            info.append("Reason: ").append(annotation).append("\n");
        }
        // As written, the "Parent:" line is emitted only when the parent name equals the
        // activity name.
        if (parentShortComponentName != null
                && parentShortComponentName.equals(activityShortComponentName)) {
            info.append("Parent: ").append(parentShortComponentName).append("\n");
        }

        // `report` is the text handed to the dropbox entry; it starts with the memory
        // pressure (PSI) state.
        StringBuilder report = new StringBuilder();
        report.append(MemoryPressureUtil.currentPsiState());
        ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(true);

        // don't dump native PIDs for background ANRs unless it is the process of interest
        String[] nativeProcs = null;
        if (isSilentAnr || onlyDumpSelf) {
            for (int i = 0; i < NATIVE_STACKS_OF_INTEREST.length; i++) {
                if (NATIVE_STACKS_OF_INTEREST[i].equals(processName)) {
                    nativeProcs = new String[] { processName };
                    break;
                }
            }
        } else {
            nativeProcs = NATIVE_STACKS_OF_INTEREST;
        }

        // Resolve the native process names to PIDs and box them into a list for dumpStackTraces.
        int[] pids = nativeProcs == null ? null : Process.getPidsForCommands(nativeProcs);
        ArrayList<Integer> nativePids = null;

        if (pids != null) {
            nativePids = new ArrayList<>(pids.length);
            for (int i : pids) {
                nativePids.add(i);
            }
        }

        // For background ANRs, don't pass the ProcessCpuTracker to
        // avoid spending 1/2 second collecting stats to rank lastPids.
        StringWriter tracesFileException = new StringWriter();
        // To hold the start and end offset to the ANR trace file respectively.
        final long[] offsets = new long[2];
        File tracesFile = ActivityManagerService.dumpStackTraces(firstPids,
                isSilentAnr ? null : processCpuTracker, isSilentAnr ? null : lastPids,
                nativePids, tracesFileException, offsets);

        // Second CPU stats refresh, after the stack dump; the service-wide tracker's state
        // goes into `report`, and the load line plus `report` go into the main-log message.
        if (isMonitorCpuUsage()) {
            mService.updateCpuStatsNow();
            synchronized (mService.mProcessCpuTracker) {
                report.append(mService.mProcessCpuTracker.printCurrentState(anrTime));
            }
            info.append(processCpuTracker.printCurrentLoad());
            info.append(report);
        }
        report.append(tracesFileException.getBuffer());

        // State of the local tracker created above (the one passed to dumpStackTraces).
        info.append(processCpuTracker.printCurrentState(anrTime));

        Slog.e(TAG, info.toString());
        if (tracesFile == null) {
            // There is no trace file, so dump (only) the alleged culprit's threads to the log
            Process.sendSignal(pid, Process.SIGNAL_QUIT);
        } else if (offsets[1] > 0) {
            // We've dumped into the trace file successfully
            mService.mProcessList.mAppExitInfoTracker.scheduleLogAnrTrace(
                    pid, uid, getPackageList(), tracesFile, offsets[0], offsets[1]);
        }

        FrameworkStatsLog.write(FrameworkStatsLog.ANR_OCCURRED, uid, processName,
                activityShortComponentName == null ? "unknown": activityShortComponentName,
                annotation,
                (this.info != null) ? (this.info.isInstantApp()
                        ? FrameworkStatsLog.ANROCCURRED__IS_INSTANT_APP__TRUE
                        : FrameworkStatsLog.ANROCCURRED__IS_INSTANT_APP__FALSE)
                        : FrameworkStatsLog.ANROCCURRED__IS_INSTANT_APP__UNAVAILABLE,
                isInterestingToUserLocked()
                        ? FrameworkStatsLog.ANROCCURRED__FOREGROUND_STATE__FOREGROUND
                        : FrameworkStatsLog.ANROCCURRED__FOREGROUND_STATE__BACKGROUND,
                getProcessClassEnum(),
                (this.info != null) ? this.info.packageName : "");
        final ProcessRecord parentPr = parentProcess != null
                ? (ProcessRecord) parentProcess.mOwner : null;
        mService.addErrorToDropBox("anr", this, processName, activityShortComponentName,
                parentShortComponentName, parentPr, annotation, report.toString(), tracesFile,
                null);

        // When the window process controller returns true, nothing further is done here
        // (no battery-stats note, no kill, no dialog).
        if (mWindowProcessController.appNotResponding(info.toString(), () -> kill("anr",
                ApplicationExitInfo.REASON_ANR, true),
                () -> {
                    synchronized (mService) {
                        mService.mServices.scheduleServiceTimeoutLocked(this);
                    }
                })) {
            return;
        }

        synchronized (mService) {
            // mBatteryStatsService can be null if the AMS is constructed with injector only. This
            // will only happen in tests.
            if (mService.mBatteryStatsService != null) {
                mService.mBatteryStatsService.noteProcessAnr(processName, uid);
            }

            // Silent ANRs are killed without a dialog, unless the process is being debugged.
            if (isSilentAnr() && !isDebugging()) {
                kill("bg anr", ApplicationExitInfo.REASON_ANR, true);
                return;
            }

            // Set the app's notResponding state, and look up the errorReportReceiver
            makeAppNotRespondingLocked(activityShortComponentName,
                    annotation != null ? "ANR " + annotation : "ANR", info.toString());

            // mUiHandler can be null if the AMS is constructed with injector only. This will only
            // happen in tests.
            if (mService.mUiHandler != null) {
                // Bring up the infamous App Not Responding dialog
                Message msg = Message.obtain();
                msg.what = ActivityManagerService.SHOW_NOT_RESPONDING_UI_MSG;
                msg.obj = new AppNotRespondingDialog.Data(this, aInfo, aboveSystem);

                mService.mUiHandler.sendMessage(msg);
            }
        }
    }

通过代码我们可以看到,发生ANR之后,都是通过com.android.server.am.ProcessRecord#appNotResponding的方法来进行通知(在较早的Android版本中,这部分逻辑位于com.android.server.am.AppErrors#appNotResponding)。在appNotResponding里面会进行如下的动作:

  • 判断mService.mController,如果不等于null,就需要告诉当前的controller发生了ANR。相当于是Activity的一种代理,可以应用于UIAutomator自动化测试里面收集ANR信息。
  • 调用com.android.server.am.ActivityManagerService#updateCpuStatsNow开始采集当前的CPU使用信息。主要是读取每个进程对应的/proc/${pid}/stat以及/proc/stat里面的信息进行统计,关于这两个节点表示的意思可以参见上面的文档。而且每次统计的CPU信息也会保存到BatteryStatsService里面。
  • 如果是关机、Crashing、被am killed、已经杀死的进程或者已经弹出了ANR对话框,那么不再执行相关逻辑。
  • 通过EventLog.writeEvent(EventLogTags.AM_ANR, app.userId, app.pid,app.processName, app.info.flags, annotation);将当前app的userid、pid等信息打印到Event log里面。
  • MemoryPressureUtil.currentPsiState(): 打印当前的内存压力。
  • 并且通过mService.dumpStackTraces往/data/anr/traces.txt里面输出当前感兴趣进程的堆栈信息。包括最近使用的几个进程的信息,以及native层默认感兴趣的进程的信息,这个和我们在WatchDog机制里面讲解的类似。
  • 往main log里面陆续输出相关log信息。
  • 通过mService.addErrorToDropBox往dropbox里面产生一个anr文件,文件名形如 data_app_anr@<时间戳>.txt.gz。内容和/data/anr/traces.txt里面不太一致。
  • 如果是后台ANR,则直接调用系统的kill方法,杀掉当前的app,往event log里面输出am_kill的信息,然后直接返回。如果是前台ANR,那么会弹出我们肉眼可以看到的对话框。

采集cpu信息主要读取/proc/${pid}/stat和/proc/stat这两个节点的信息,接下来分析下这两个节点的含义。

/proc/${pid}/stat

在ANR的状态统计中,大量用到/proc/${pid}/stat里面的数据,为了能够正常阅读ANR LOG输出机制的相关代码以及正确解读输出来的相关LOG信息,首先我们必须弄清楚这个概念以及里面的内容含义, 可以参见kernel/Documentation/filesystems/proc.txt里面的stat描述。

手动查看com.android.settings进程的stat:通过adb shell ps | grep settings获取当前的pid是4345,然后我们输入adb shell cat /proc/4345/stat,有如下结果:

4345 (ndroid.settings) S 2604 2604 0 0 -1 1077936448 39731 0 1 0 705 156 0 0 20 0 14 0 12203 1640771584 26653 18446744073709551615 366503874560 366503889508 549475827216 549475817472 547750451276 0 4612 0 34040 18446744073709551615 0 0 17 7 0 0 0 0 0 366503893608 366503895048 367382896640 549475830414 549475830513 549475830513 549475831774 0

从左向右,总共有52个数据。第一个表示:process id,第三个表示当前进程的状态。每一个数据表示的意思可以查看表1:

表1

id Field Content
1 pid process id
2 tcomm filename of the executable
3 state state (R is running, S is sleeping, D is sleeping in an uninterruptible wait, Z is zombie, T is traced or stopped)
4 ppid process id of the parent process
5 pgrp pgrp of the process
6 sid session id
7 tty_nr tty the process uses
8 tty_pgrp pgrp of the tty
9 flags task flags
10 min_flt number of minor faults
11 cmin_flt number of minor faults with child’s
12 maj_flt number of major faults
13 cmaj_flt number of major faults with child’s
14 utime user mode jiffies
15 stime kernel mode jiffies
16 cutime user mode jiffies with child’s
17 cstime kernel mode jiffies with child’s
18 priority priority level
19 nice nice level
20 num_threads number of threads
21 it_real_value (obsolete, always 0)
22 start_time time the process started after system boot
23 vsize virtual memory size
24 rss resident set memory size
25 rsslim current limit in bytes on the rss
26 start_code address above which program text can run
27 end_code address below which program text can run
28 start_stack address of the start of the main process stack
29 esp current value of ESP
30 eip current value of EIP
31 pending bitmap of pending signals
32 blocked bitmap of blocked signals
33 sigign bitmap of ignored signals
34 sigcatch bitmap of caught signals
35 wchan address where process went to sleep
36 0 (place holder)
37 0 (place holder)
38 exit_signal signal to send to parent thread on exit
39 task_cpu which CPU the task is scheduled on
40 rt_priority realtime priority
41 policy scheduling policy (man sched_setscheduler)
42 blkio_ticks time spent waiting for block IO
43 gtime guest time of the task in jiffies
44 cgtime guest time of the task children in jiffies
45 start_data address above which program data+bss is placed
46 end_data address below which program data+bss is placed
47 start_brk address above which program heap can be expanded with brk()
48 arg_start address above which program command line is placed
49 arg_end address below which program command line is placed
50 env_start address above which program environment is placed
51 env_end address below which program environment is placed
52 exit_code the thread’s exit_code in the form reported by the waitpid system call

在ANR输出的log信息里面,针对单个进程,主要读取的是/proc/${pid}/stat里面第10、12、14、15项的值,具体代码见com.android.internal.os.ProcessCpuTracker#PROCESS_STATS_FORMAT。当调用Process.readProcFile(st.statFile.toString(), PROCESS_STATS_FORMAT, null, procStats, null)时,会返回PROCESS_STATS_FORMAT中增加了PROC_OUT_LONG标志的那些字段:

	// Indices into the procStats array that is read after the readProcFile call below.
	static final int PROCESS_STAT_MINOR_FAULTS = 0;
    static final int PROCESS_STAT_MAJOR_FAULTS = 1;
    static final int PROCESS_STAT_UTIME = 2;
    static final int PROCESS_STAT_STIME = 3;

                    // Excerpt from inside a loop: when the stat file cannot be read, this
                    // entry is skipped and the loop moves on.
                    if (!Process.readProcFile(st.statFile.toString(),
                            PROCESS_STATS_FORMAT, null, procStats, null)) {
                        continue;
                    }
                    // Fault counters are taken as-is.
                    final long minfaults = procStats[PROCESS_STAT_MINOR_FAULTS];
                    final long majfaults = procStats[PROCESS_STAT_MAJOR_FAULTS];
                    // utime/stime are scaled by mJiffyMillis — presumably converting jiffies
                    // to milliseconds; confirm against the definition of mJiffyMillis.
                    final long utime = procStats[PROCESS_STAT_UTIME] * mJiffyMillis;
                    final long stime = procStats[PROCESS_STAT_STIME] * mJiffyMillis;

proc/stat

除了获取每一个应用的具体信息之外,还会通过/proc/stat获取整个系统的当前状态。

说明位于:Documentation/filesystems/proc.txt

字段 含义
user: normal processes executing in user mode
nice: niced processes executing in user mode
system: processes executing in kernel mode
idle: twiddling thumbs
iowait: waiting for I/O to complete
irq: servicing interrupts
softirq: servicing softirqs
steal: involuntary wait

Stats 类

ProcessCpuTracker里面,有一个Stats的类,主要记录了每次采样的信息。系统每次调用collectStats读取/proc/${pid}/stat的时候计算,主要有如下关键的属性:

  • base_uptime; 调用collectStats读取/proc/${pid}/stat时,系统的SystemClock.uptimeMillis()时间。

  • rel_uptime; 本次调用时的base_uptime-上次调用时的base_uptime,也就是说,这个值表示两次计算stat之间的时间差。最后ANR信息里面输出的类似17% 18820/perfd: 11% user + 6.8% kernel / faults: 1358 minor这样的信息,计算出来的17%里面的分母就是用的rel_uptime,意思是:两次采样之间,${pid}的cpu使用时间占总采样时间的比例。

  • base_utime; 上次采样,进程用户态空间占用CPU的时间。

  • base_stime; 上次采样,进程内核态占用CPU的时间。

  • rel_utime; 本次采样,进程用户态空间占用CPU的时间。

  • rel_stime; 本次采样,进程内核态空间占用CPU的时间。

  • base_minfaults; 上次采样, 次要的错误。minor fault, 帮助文档原始描述:The number of minor faults the process has made which have not required loading a memory page from disk。如果minor faults过高,说明当前进程有大量的内存访问操作。

  • base_majfaults; 上次采样,主要的错误。major faults,帮助文档原始描述:The number of major faults the process has made which have required loading a memory page from disk. 说明系统当前有大量的从disk读取数据的操作(IO操作)。

  • rel_minfaults; 本次采样,次要的错误。

  • rel_majfaults; 本次采样,主要的错误。

ANR LOG 格式解析

main log里面会出现如下信息:

PID: 6519
    Reason: Input dispatching timed out (4c6226a com.xiaosayidao.debugdemo/com.xiaosayidao.debugdemo.MainActivity (server) is not responding. Waited 5002ms for KeyEvent(deviceId=-1, source=0x00000101, displayId=0, action=DOWN, flags=0x00000048, keyCode=4, scanCode=0, metaState=0x00000000, repeatCount=0), policyFlags=0x6b000002)
    Parent: com.xiaosayidao.debugdemo/.MainActivity
    Load: 0.39 / 31.41 / 56.52
    // 内存压力 
    ----- Output from /proc/pressure/memory -----
    some avg10=0.00 avg60=0.00 avg300=0.34 total=14994254
    full avg10=0.00 avg60=0.00 avg300=0.00 total=35476
    ----- End output from /proc/pressure/memory -----
    
    CPU usage from 11382ms to 0ms ago (2021-03-10 06:07:48.385 to 2021-03-10 06:07:59.767):
      0.8% 2168/com.android.providers.media.module: 0.6% user + 0.1% kernel / faults: 638 minor
      19% 6519/com.xiaosayidao.debugdemo: 10% user + 9.6% kernel / faults: 14631 minor
      15% 482/system_server: 10% user + 4.3% kernel / faults: 8477 minor
      5.4% 328/surfaceflinger: 4.7% user + 0.7% kernel / faults: 1001 minor
      3.6% 306/android.hardware.graphics.composer@2.2-service: 1.2% user + 2.3% kernel / faults: 12110 minor
      1.3% 1595/com.android.systemui: 0.9% user + 0.3% kernel / faults: 416 minor
      0.5% 17/kworker/1:0-cgroup_pidlist_destroy: 0% user + 0.5% kernel
      0.4% 6488/kworker/0:0-virtio_vsock: 0% user + 0.4% kernel
      0.3% 324/audioserver: 0.2% user + 0% kernel
      0.2% 293/android.hardware.audio.service: 0% user + 0.1% kernel
      0.1% 163/logd: 0% user + 0.1% kernel / faults: 1 minor
      0.1% 295/android.hardware.bluetooth@1.1-service.sim: 0% user + 0% kernel
      0% 117/kworker/u4:2: 0% user + 0% kernel
      0% 433/logcat: 0% user + 0% kernel / faults: 2 minor
      0% 1803/com.android.phone: 0% user + 0% kernel / faults: 10 minor
      0% 5773/adbd: 0% user + 0% kernel / faults: 124 minor
    33% TOTAL: 21% user + 11% kernel + 0.1% iowait + 0.2% irq + 0% softirq
    CPU usage from 6ms to 244ms later (2021-03-10 06:07:59.773 to 2021-03-10 06:08:00.011):
      37% 2168/com.android.providers.media.module: 28% user + 9.4% kernel / faults: 15 minor
        9.4% 6161/Thread-13: 4.7% user + 4.7% kernel
        4.7% 2374/Thread-5: 4.7% user + 0% kernel
        4.7% 2375/Thread-6: 4.7% user + 0% kernel
        4.7% 6148/Thread-8: 0% user + 4.7% kernel
        4.7% 6149/Thread-9: 4.7% user + 0% kernel
        4.7% 6150/Thread-10: 0% user + 4.7% kernel
        4.7% 6160/Thread-12: 4.7% user + 0% kernel
        4.7% 6187/Thread-14: 4.7% user + 0% kernel
      31% 482/system_server: 22% user + 8.9% kernel / faults: 468 minor
        22% 6333/Binder:482_16: 17% user + 4.4% kernel
        4.4% 6477/Binder:482_17: 0% user + 4.4% kernel
        4.4% 6641/AnrConsumer: 0% user + 4.4% kernel
      34% 6519/com.xiaosayidao.debugdemo: 14% user + 19% kernel / faults: 307 minor
        34% 6519/yidao.debugdemo: 14% user + 19% kernel
      4.5% 1595/com.android.systemui: 4.5% user + 0% kernel / faults: 18 minor
      4.8% 5773/adbd: 0% user + 4.8% kernel / faults: 182 minor
       +0% 6647/shell svc 6646: 0% user + 0% kernel
     +0% 6646/logcat: 0% user + 0% kernel
    60% TOTAL: 35% user + 25% kernel
  • CPU usage from: 上次采样(mLastSampleTime)到请求打印当前CPU使用信息的运行时间。对于CPU信息来说,当我们调用dump的时候,此时CPU采集信息可能已经过去了好久了。由于代码里面调用了两次,所以会输出两次"CPU usage from"。

  • to 6236ms ago: 本次采样(mCurrentSampleTime)到请求打印当前CPU使用信息的运行时间。

  • (2021-03-10 06:07:48.385 to 2021-03-10 06:07:59.767): 上次采样真实时钟时间(mLastSampleWallTime)到本次采样真实时钟时间(mCurrentSampleWallTime)。

  • with 99% awake: 采样期间,手机有1%处于深度睡眠。

  • 19% 6519/com.xiaosayidao.debugdemo: 10% user + 9.6% kernel / faults: 14631 minor : user+system+iowait+irq+softIrq,当前进程的用户态空间占用时间+系统空间占用时间+iowait时间+硬件中断时间+软件中断的时间之和,在当前测量时间段内的时间占比。

  • 33% TOTAL: 21% user + 11% kernel + 0.1% iowait + 0.2% irq + 0% softirq. 此信息是读取/proc/stat获取的相关值,关于/proc/stat相关数据结构,可以参考前面的描述。最终得到的也是在某一个测量时间段内,用户空间占用时间+系统空间占用时间+iowait时间+硬件中断时间+软件中断时间的占比。

占比超过100%

在有一些log文件中,计算出来的百分比超过了100%,这不是太奇怪了么?
实际上是正常的,因为total时间是真实的,比如我们统计的总共是5s,那么total就是5s。当cpu是多核时,那么user+system+iowait+irq+softIrq可能显示的是多核的时间。比如,此应用程序在5s的时间内,同时使用了3个核,每个核心都是90%,那么此时的占比就是270%。比如下面的例子:滴答拼车占用了794%,说明完全占用了8个核心的CPU时间。

CPU usage from 4434ms to -1005ms ago (2017-09-15 04:11:08.233 to 2017-09-15 04:11:13.672) with 99% awake:
794% 11538/com.didapinche.booking: 792% user + 2.3% kernel / faults: 2435 minor

cpu usage from 显示负数

如上面例子显示,-1005ms ago。查看代码如下,由于now是发生ANR的时间,mCurrentSampleTime是当前请求CPU采样的时间。也就是说:当发生ANR之后,系统请求打印当前的CPU使用信息之前,系统在某处又请求了一次CPU采样。这样就造成了发生ANR的时间小于CPU采样的时间,从而值就会变成负数。

如果是负值,可以认为是:本次采样结束于发生ANR之后的1005ms。

// Prints the sampling window relative to `now`.
pw.print("CPU usage from ");
if (now > mLastSampleTime) {
            // `now` is after the start of the window: print both offsets as "ms ago".
            // The second value (now - mCurrentSampleTime) is negative when
            // mCurrentSampleTime is later than `now`.
            pw.print(now-mLastSampleTime);
            pw.print("ms to ");
            pw.print(now-mCurrentSampleTime);
            pw.print("ms ago");
} else {
            // `now` is not after the start of the window: print both offsets as "ms later".
            pw.print(mLastSampleTime-now);
            pw.print("ms to ");
            pw.print(mCurrentSampleTime-now);
            pw.print("ms later");
}

诊断ANR

通过原理分析可以知道,发生ANR的原因是主Handler上的Message分发或者执行超时。

但是导致超时的原因可能很多,实际开发过程中,可能遇到如下场景:

  • 应用在主线程上非常缓慢地执行涉及 I/O 的操作。
  • 应用在主线程上进行长时间的计算。
  • 主线程在对另一个进程进行同步 binder 调用,而后者需要很长时间才能返回。
  • 主线程处于阻塞状态,为发生在另一个线程上的长操作等待同步的块。
  • 主线程在进程中或通过 binder 调用与另一个线程之间发生死锁。主线程不只是在等待长操作执行完毕,而且处于死锁状态。如需更多信息,请参阅维基百科上的死锁。
  • 内存泄露。
  • CPU饥饿。CPU占用百分比过高。

分析ANR原因

分析ANR的主要方法如下:

  • main log. 查看 ANR in 关键字。
  • event log: am_anr 关键字。
  • trace文件: 主要生成在"/data/anr"目录下,main log里面会输出相关信息。比如: ActivityManager: Dumping to /data/anr/anr_2021-03-10-06-08-00-015.
adb root
    adb shell ls /data/anr
    adb pull /data/anr/<filename>
  • 严格模式:使用 StrictMode 有助于您在开发应用时发现主线程上的意外 I/O 操作。您可以在应用级别或 Activity 级别使用 StrictMode。
  • 启用后台 ANR 对话框:只有在设备的开发者选项中启用了显示所有 ANR 时,Android 才会针对花费过长时间处理广播消息的应用显示 ANR 对话框。因此,系统并不会始终向用户显示后台 ANR 对话框,但应用仍可能会遇到性能问题。
  • TraceView:您可以使用 TraceView 在查看用例时获取正在运行的应用的跟踪信息,并找出主线程繁忙的位置。如需了解如何使用 TraceView,请参阅使用 TraceView 和 dmtracedump 分析性能。

接下来,使用具体的log日志针对不同的原因进行详细的分解。

应用在主线程上非常缓慢地执行涉及 I/O 的操作。

  • main log信息:
ActivityManager: Dumping to /data/anr/anr_2021-03-10-06-08-00-015
2021-03-10 14:08:01.596 482-6641/system_process E/ActivityManager: ANR in com.xiaosayidao.debugdemo (com.xiaosayidao.debugdemo/.MainActivity)
    PID: 6519
    Reason: Input dispatching timed out (4c6226a com.xiaosayidao.debugdemo/com.xiaosayidao.debugdemo.MainActivity (server) is not responding. Waited 5002ms for KeyEvent(deviceId=-1, source=0x00000101, displayId=0, action=DOWN, flags=0x00000048, keyCode=4, scanCode=0, metaState=0x00000000, repeatCount=0), policyFlags=0x6b000002)
    Parent: com.xiaosayidao.debugdemo/.MainActivity
    Load: 0.39 / 31.41 / 56.52
    ----- Output from /proc/pressure/memory -----
    some avg10=0.00 avg60=0.00 avg300=0.34 total=14994254
    full avg10=0.00 avg60=0.00 avg300=0.00 total=35476
    ----- End output from /proc/pressure/memory -----
    
    CPU usage from 11382ms to 0ms ago (2021-03-10 06:07:48.385 to 2021-03-10 06:07:59.767):
      0.8% 2168/com.android.providers.media.module: 0.6% user + 0.1% kernel / faults: 638 minor
      19% 6519/com.xiaosayidao.debugdemo: 10% user + 9.6% kernel / faults: 14631 minor
      15% 482/system_server: 10% user + 4.3% kernel / faults: 8477 minor
      5.4% 328/surfaceflinger: 4.7% user + 0.7% kernel / faults: 1001 minor
      3.6% 306/android.hardware.graphics.composer@2.2-service: 1.2% user + 2.3% kernel / faults: 12110 minor
      1.3% 1595/com.android.systemui: 0.9% user + 0.3% kernel / faults: 416 minor
      0.5% 17/kworker/1:0-cgroup_pidlist_destroy: 0% user + 0.5% kernel
      0.4% 6488/kworker/0:0-virtio_vsock: 0% user + 0.4% kernel
      0.3% 324/audioserver: 0.2% user + 0% kernel
      0.2% 293/android.hardware.audio.service: 0% user + 0.1% kernel
      0.1% 163/logd: 0% user + 0.1% kernel / faults: 1 minor
      0.1% 295/android.hardware.bluetooth@1.1-service.sim: 0% user + 0% kernel
      0% 117/kworker/u4:2: 0% user + 0% kernel
      0% 433/logcat: 0% user + 0% kernel / faults: 2 minor
      0% 1803/com.android.phone: 0% user + 0% kernel / faults: 10 minor
      0% 5773/adbd: 0% user + 0% kernel / faults: 124 minor
    33% TOTAL: 21% user + 11% kernel + 0.1% iowait + 0.2% irq + 0% softirq
    CPU usage from 6ms to 244ms later (2021-03-10 06:07:59.773 to 2021-03-10 06:08:00.011):
      37% 2168/com.android.providers.media.module: 28% user + 9.4% kernel / faults: 15 minor
        9.4% 6161/Thread-13: 4.7% user + 4.7% kernel
        4.7% 2374/Thread-5: 4.7% user + 0% kernel
        4.7% 2375/Thread-6: 4.7% user + 0% kernel
        4.7% 6148/Thread-8: 0% user + 4.7% kernel
        4.7% 6149/Thread-9: 4.7% user + 0% kernel
        4.7% 6150/Thread-10: 0% user + 4.7% kernel
        4.7% 6160/Thread-12: 4.7% user + 0% kernel
        4.7% 6187/Thread-14: 4.7% user + 0% kernel
      31% 482/system_server: 22% user + 8.9% kernel / faults: 468 minor
        22% 6333/Binder:482_16: 17% user + 4.4% kernel
        4.4% 6477/Binder:482_17: 0% user + 4.4% kernel
        4.4% 6641/AnrConsumer: 0% user + 4.4% kernel
      34% 6519/com.xiaosayidao.debugdemo: 14% user + 19% kernel / faults: 307 minor
        34% 6519/yidao.debugdemo: 14% user + 19% kernel
      4.5% 1595/com.android.systemui: 4.5% user + 0% kernel / faults: 18 minor
      4.8% 5773/adbd: 0% user + 4.8% kernel / faults: 182 minor
       +0% 6647/shell svc 6646: 0% user + 0% kernel
     +0% 6646/logcat: 0% user + 0% kernel
    60% TOTAL: 35% user + 25% kernel
2021-03-10 14:08:01.597 482-6641/system_process D/ActivityManager: Completed ANR of com.xiaosayidao.debugdemo in 1829ms, latency 0ms
2021-03-10 14:08:01.608 482-6705/system_process I/DropBoxManagerService: add tag=data_app_anr isTagEnabled=true flags=0x2
2021-03-10 14:08:01.687 482-482/system_process I/RenderThread: type=1400 audit(0.0:243): avc: denied { execmem } for scontext=u:r:system_server:s0 tcontext=u:r:system_server:s0 tclass=process permissive=1 b/65201432
2021-03-10 14:08:02.327 482-5723/system_process I/OpenGLRenderer: Davey! duration=718ms; Flags=1, IntendedVsync=939710444333, Vsync=939710444333, OldestInputEvent=9223372036854775807, NewestInputEvent=0, HandleInputStart=939710968461, AnimationStart=939710968791, PerformTraversalsStart=939710968941, DrawStart=939721373996, SyncQueued=939721609596, SyncStart=939721724957, IssueDrawCommandsStart=939721761427, SwapBuffers=940424952612, FrameCompleted=940429220954, DequeueBufferDuration=121210, QueueBufferDuration=136240, GpuCompleted=8314254617093341183, 
2021-03-10 14:08:02.327 482-509/system_process W/Looper: Slow dispatch took 719ms android.ui h=android.view.Choreographer$FrameHandler c=android.view.Choreographer$FrameDisplayEventReceiver@64528f4 m=0
2021-03-10 14:08:02.328 482-509/system_process W/Looper: Slow delivery took 718ms android.ui h=android.view.ViewRootImpl$ViewRootHandler c=null m=31
2021-03-10 14:08:02.328 482-509/system_process I/Choreographer: Skipped 42 frames!  The application may be doing too much work on its main thread.
2021-03-10 14:08:02.343 482-509/system_process W/Looper: Drained
2021-03-10 14:08:05.595 482-482/system_process W/WindowManager: removeWindowToken: Attempted to remove non-existing token: android.os.Binder@3bf1dca
  • traces.txt 文件内容参考:
  • 解决方案:https://developer.android.com/training/basics/network-ops

应用在主线程上进行长时间的计算。

如前面分析可知,四大组件的关键生命周期方法如果超时,那么就会导致ANR。

主线程在对另一个进程进行同步 binder 调用,而后者需要很长时间才能返回。

  • trace信息:
"main" prio=5 tid=1 Native
  | group="main" sCount=1 dsCount=0 flags=1 obj=0x71904680 self=0xe0641c10
  | sysTid=4644 nice=-10 cgrp=top-app sched=0/0 handle=0xeec84478
  | state=S schedstat=( 231878217 164445640 466 ) utm=20 stm=2 core=0 HZ=100
  | stack=0xff15f000-0xff161000 stackSize=8192KB
  | held mutexes=
  native: #00 pc 00000b97  [vdso] (__kernel_vsyscall+7)
  native: #01 pc 000cda5c  /apex/com.android.runtime/lib/bionic/libc.so (__ioctl+28)
  native: #02 pc 0008145a  /apex/com.android.runtime/lib/bionic/libc.so (ioctl+58)
  native: #03 pc 000512cb  /system/lib/libbinder.so (android::IPCThreadState::talkWithDriver(bool)+331)
  native: #04 pc 0005269f  /system/lib/libbinder.so (android::IPCThreadState::waitForResponse(android::Parcel*, int*)+47)
  native: #05 pc 000523a1  /system/lib/libbinder.so (android::IPCThreadState::transact(int, unsigned int, android::Parcel const&, android::Parcel*, unsigned int)+177)
  native: #06 pc 000491d9  /system/lib/libbinder.so (android::BpBinder::transact(unsigned int, android::Parcel const&, android::Parcel*, unsigned int)+153)
  native: #07 pc 0012301f  /system/lib/libandroid_runtime.so (android_os_BinderProxy_transact(_JNIEnv*, _jobject*, int, _jobject*, _jobject*, int)+143)
  at android.os.BinderProxy.transactNative(Native method)
  at android.os.BinderProxy.transact(BinderProxy.java:550)
  at android.app.IActivityTaskManager$Stub$Proxy.getAppTasks(IActivityTaskManager.java:5439)
  at android.app.ActivityManager.getAppTasks(ActivityManager.java:1939)
  at com.xiaosayidao.debugdemo.anr.AnrFragment.binderBlocked(AnrFragment.java:118)
  at com.xiaosayidao.debugdemo.anr.AnrFragment.onItemClick(AnrFragment.java:107)
  at com.xiaosayidao.debugdemo.MyAdapter.lambda$onBindViewHolder$0$MyAdapter(MyAdapter.java:54)
  at com.xiaosayidao.debugdemo.-$$Lambda$MyAdapter$Mj25d8sLFZl2JzWUWgclp3AeDOE.onClick(lambda:-1)
  at android.view.View.performClick(View.java:7448)
  at android.view.View.performClickInternal(View.java:7425)
  at android.view.View.access$3600(View.java:810)
  at android.view.View$PerformClick.run(View.java:28305)
  at android.os.Handler.handleCallback(Handler.java:938)
  at android.os.Handler.dispatchMessage(Handler.java:99)
  at android.os.Looper.loop(Looper.java:223)
  at android.app.ActivityThread.main(ActivityThread.java:7660)
  at java.lang.reflect.Method.invoke(Native method)
  at com.android.internal.os.RuntimeInit$MethodAndArgsCaller.run(RuntimeInit.java:592)
  at com.android.internal.os.ZygoteInit.main(ZygoteInit.java:947)
  • 对端信息:
"Binder:3324_4" prio=5 tid=95 Sleeping
  | group="main" sCount=1 dsCount=0 flags=1 obj=0x142422d8 self=0xb6750a10
  | sysTid=3506 nice=-10 cgrp=foreground sched=0/0 handle=0xb42e61e0
  | state=S schedstat=( 131527913 469153906 846 ) utm=9 stm=4 core=0 HZ=100
  | stack=0xb41eb000-0xb41ed000 stackSize=1008KB
  | held mutexes=
  at java.lang.Thread.sleep(Native method)
  - sleeping on <0x03bfb302> (a java.lang.Object)
  at java.lang.Thread.sleep(Thread.java:442)
  - locked <0x03bfb302> (a java.lang.Object)
  at java.lang.Thread.sleep(Thread.java:358)
  at com.android.server.wm.ActivityTaskManagerService.getAppTasks(ActivityTaskManagerService.java:3128)
  at android.app.IActivityTaskManager$Stub.onTransact(IActivityTaskManager.java:2538)
  at android.os.Binder.execTransactInternal(Binder.java:1154)
  at android.os.Binder.execTransact(Binder.java:1123)

主线程处于阻塞状态,为发生在另一个线程上的长操作等待同步的块。

  • 代码举例:
/**
 * Demo: a posted main-handler callback waits for a monitor held by a background task.
 *
 * <p>Starts a {@code LockTask}, which synchronizes on {@code mObject} while it runs, then
 * posts a Runnable to {@code mHandler} with a 1000 ms delay that synchronizes on the same
 * {@code mObject} before showing a Toast.
 */
void synchronizedBlock() {
        // Start the background task that takes the lock on mObject.
        new LockTask().execute();
       
        // Posted with a 1000 ms delay; the Runnable needs the same lock.
        mHandler.postDelayed(new Runnable() {
            @Override
            public void run() {
                synchronized (mObject) {
                    // Only reached once the monitor on mObject can be acquired.
                    Toast.makeText(mActivityContext,
                            mActivityContext.getResources().getText(R.string.lock_dev),
                            Toast.LENGTH_SHORT).show();
                }
            }
        }, 1000);


    }

    /**
     * Background task that holds the monitor on {@code mObject} for the whole duration of
     * {@code AnrUtils.waitTime(...)}, called with twice
     * {@code AnrUtils.DEFAULT_INPUT_DISPATCHING_TIMEOUT_NANOS}.
     */
    public class LockTask extends AsyncTask<Integer[], Integer, Long> {
        @Override
        protected Long doInBackground(Integer[]... integers) {
            synchronized (mObject) {
                Log.d(TAG, "synchronizedBlock: task wait");
                // This is a long-running operation, which makes
                // the lock last for a long time
                AnrUtils.waitTime(AnrUtils.DEFAULT_INPUT_DISPATCHING_TIMEOUT_NANOS * 2,
                        "synchronized ");
            }
            // No result is produced.
            return null;
        }
    }
  • trace 信息:
"AsyncTask #1" prio=5 tid=2 Runnable
  | group="main" sCount=0 dsCount=0 flags=0 obj=0x12c66180 self=0xe4045410
  | sysTid=17661 nice=10 cgrp=top-app sched=0/0 handle=0xbe73c1e0
  | state=R schedstat=( 8259291933 201088775 3222 ) utm=824 stm=1 core=1 HZ=100
  | stack=0xbe639000-0xbe63b000 stackSize=1040KB
  | held mutexes= "mutator lock"(shared held)
  at com.xiaosayidao.debugdemo.anr.AnrUtils.waitTime(AnrUtils.java:46)
  at com.xiaosayidao.debugdemo.anr.AnrFragment$LockTask.doInBackground(AnrFragment.java:165)
  - locked <0x0471ab90> (a java.lang.Object)
  at com.xiaosayidao.debugdemo.anr.AnrFragment$LockTask.doInBackground(AnrFragment.java:158)
  at android.os.AsyncTask$3.call(AsyncTask.java:394)
  at java.util.concurrent.FutureTask.run(FutureTask.java:266)
  at android.os.AsyncTask$SerialExecutor$1.run(AsyncTask.java:305)
  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1167)
  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:641)
  at java.lang.Thread.run(Thread.java:923)


"main" prio=5 tid=1 Blocked
  | group="main" sCount=1 dsCount=0 flags=1 obj=0x723852e0 self=0xe4042a10
  | sysTid=17621 nice=-10 cgrp=top-app sched=0/0 handle=0xf261b478
  | state=S schedstat=( 259859865 230557628 582 ) utm=23 stm=2 core=0 HZ=100
  | stack=0xff293000-0xff295000 stackSize=8192KB
  | held mutexes=
  at com.xiaosayidao.debugdemo.anr.AnrFragment$2.run(AnrFragment.java:146)
  - waiting to lock <0x0471ab90> (a java.lang.Object) held by thread 2
  at android.os.Handler.handleCallback(Handler.java:938)
  at android.os.Handler.dispatchMessage(Handler.java:99)
  at android.os.Looper.loop(Looper.java:223)
  at android.app.ActivityThread.main(ActivityThread.java:7660)
  at java.lang.reflect.Method.invoke(Native method)
  at com.android.internal.os.RuntimeInit$MethodAndArgsCaller.run(RuntimeInit.java:592)
  at com.android.internal.os.ZygoteInit.main(ZygoteInit.java:947)

主线程在进程中或通过 binder 调用与另一个线程之间发生死锁。主线程不只是在等待长操作执行完毕,而且处于死锁状态。

此类场景较复杂,以具体的实例演示相关的log信息。

解决问题

  1. 您应该将主线程中运行的工作移至工作线程
  2. IO 操作(例如网络和存储操作)应该移到工作线程。
  3. 请确保将持有锁的时间降到最少,或者最好从一开始就评估应用是否需要持有锁。如果您使用锁来确定何时根据工作线程的处理情况来更新界面,请使用 onProgressUpdate() 和 onPostExecute() 之类的机制在工作线程和主线程之间进行通信。
  4. 如果是广播接收器里面超时。建议将长时间运行的操作移至 IntentService,因为它使用工作线程来执行其工作。

你可能感兴趣的:(Android ANR原理代码分析(三))