native 死锁解析方法

一. 概述

    我们平时在分析 system_server watchdog 问题时,有时会遇到一些 native 层的死锁问题。通常的做法是把所有可疑线程的调用栈通过 symbols 逐一解析,从中找出可疑的死锁线程,这个方法比较耗费体力和脑力。这篇文章将介绍一个更为简便的方法:通过 gdb 调试直接打印出对应持锁的线程。

 

二.通过GDB查看线程锁

    我们通过aosp代码找到锁的定义: http://192.99.106.107:8080/xref/android-10.0.0_r31/xref/bionic/libc/bionic/pthread_mutex.cpp#454

struct pthread_mutex_internal_t {
    _Atomic(uint16_t) state;
    uint16_t __pad;
    union {
        atomic_int owner_tid;
        PIMutex pi_mutex;
    };
    char __reserved[28];

    PIMutex& ToPIMutex() {
        return pi_mutex;
    }

    void FreePIMutex() {
    }
} __attribute__((aligned(4)));

我们发现里边有一个 owner_tid 字段,那么当有线程持锁时,这个 owner_tid 保存的就是持锁线程的 tid。

那么我们直接实践一下吧,直接制造一个在等锁的案例:

frameworks/native/services/sensorservice/SensorService.cpp
status_t SensorService::dump(int fd, const Vector<String16>& args) {
    ......
#ifdef HAS_SENSOR_CONTROL
        if (3 == args.size() && args[0] == String16("sensorsControl")) {
            if (NO_ERROR == defaultSensorControl()->executeCommand(this, args)) {
                ALOGW("pzc Sensors SensorService::dump1");
                Mutex::Autolock _l(mLock);
                ALOGW("pzc Sensors SensorService::mLock11");
                char value[PROPERTY_VALUE_MAX];
                property_get("sensor.debug", value, "0");
                if (atoi(value) == 1) {
                    ALOGW("pzc Sensors SensorService::mLock12");
                    sleep(30);
                    Mutex::Autolock _l(mLock);
                    ALOGW("pzc Sensors SensorService::mLock13");
                }
                sleep(5);
                ALOGW("pzc Sensors SensorService::dump2");
                return NO_ERROR;
            }
        }
#endif
   ......

这里在执行 dumpsys sensorservice 时(需要先把属性 sensor.debug 设为 1),dump 线程会持有 mLock 并 sleep 30 秒,此时我们去注册一些 sensor 就可以出现等锁的情况了。中间的步骤就不一一说了,直接看 gdb 调试的结果吧:

(gdb) bt
#0  syscall () at bionic/libc/arch-arm64/bionic/syscall.S:41
#1  0x0000007d77592f6c in __futex (ftx=<optimized out>, op=<optimized out>, value=<optimized out>, timeout=0x0, bitset=-1) at bionic/libc/private/bionic_futex.h:45
#2  FutexWithTimeout (ftx=0x7ce89cf130, op=137, value=2, use_realtime_clock=<optimized out>, abs_timeout=<optimized out>, bitset=-1) at bionic/libc/bionic/bionic_futex.cpp:58
#3  __futex_wait_ex (ftx=<optimized out>, shared=<optimized out>, value=<optimized out>, use_realtime_clock=<optimized out>, abs_timeout=<optimized out>) at bionic/libc/bionic/bionic_futex.cpp:63
#4  0x0000007d775f722c in NonPI::NormalMutexLock (mutex=0x7ce89cf130, use_realtime_clock=false, abs_timeout_or_null=0x0, shared=<optimized out>) at bionic/libc/bionic/pthread_mutex.cpp:607
#5  NonPI::MutexLockWithTimeout (mutex=0x7ce89cf130, use_realtime_clock=false, abs_timeout_or_null=0x0) at bionic/libc/bionic/pthread_mutex.cpp:710
#6  0x0000007c89b773d0 in android::Mutex::lock (this=0x7ce89cf130) at system/core/libutils/include/utils/Mutex.h:183
#7  android::Mutex::Autolock::Autolock (this=<optimized out>, mutex=...) at system/core/libutils/include/utils/Mutex.h:132
#8  android::SensorService::populateActiveConnections (this=0x7ce89cf000, activeConnections=0x7c6362ab80) at frameworks/native/services/sensorservice/SensorService.cpp:1935
#9  0x0000007c89b79c1c in android::SensorService::threadLoop (this=0x7ce89cf000) at frameworks/native/services/sensorservice/SensorService.cpp:818
#10 0x0000007c89b7a564 in non-virtual thunk to android::SensorService::threadLoop() ()
   from /media/pzc/9eaf5fc7-55e5-4005-a86f-6a262079fa3a/LOG/J11/0.2.28.cn_mtbf_hang/out/target/product/qssi/symbols/system/lib64/libsensorservice.so
#11 0x0000007d7995a5f4 in android::Thread::_threadLoop (user=0x7ce89cf020) at system/core/libutils/Threads.cpp:746
#12 0x0000007d795e6ce4 in android::AndroidRuntime::javaThreadShell (args=<optimized out>) at frameworks/base/core/jni/AndroidRuntime.cpp:1400
#13 0x0000007d775f6474 in __pthread_start (arg=0x7c6362ad50) at bionic/libc/bionic/pthread_create.cpp:338
#14 0x0000007d77594230 in __start_thread (fn=0x7d775f644c <__pthread_start(void*)>, arg=0x7c6362ad50) at bionic/libc/bionic/clone.cpp:53
(gdb) f 5
#5  NonPI::MutexLockWithTimeout (mutex=0x6f8f756630, use_realtime_clock=false, abs_timeout_or_null=0x0) at bionic/libc/bionic/pthread_mutex.cpp:775
775	        if (RecursiveOrErrorcheckMutexWait(mutex, shared, old_state, use_realtime_clock,
(gdb) ptype mutex
type = struct pthread_mutex_internal_t {
    uint16_t state;
    uint16_t __pad;
    union {
        atomic_int owner_tid;
        PIMutex pi_mutex;
    };
    char __reserved[28];
  public:
    PIMutex & ToPIMutex(void);
    void FreePIMutex(void);
} *
(gdb) p mutex
$1 = (pthread_mutex_internal_t *) 0x7dd823af30
(gdb) p *(pthread_mutex_internal_t *) 0x7dd823af30
$2 = {
  state = 2, 
  __pad = 0, 
  {
    owner_tid = 0, 
    pi_mutex = {
      type = 0 '\000', 
      shared = false, 
      counter = 0, 
      owner_tid = 0
    }
  }, 
  __reserved = '\000' <repeats 27 times>
}



1.构建gdb调试环境

2.先找到在等锁的线程:可以通过 thread apply all bt 打印所有线程的调用栈,从中找到一个在等 sensor 锁的线程,然后通过 t 命令(thread 的缩写)切换到对应的线程

3.通过bt查看调用栈

4.切换到NonPI::MutexLockWithTimeout这个栈帧上

5.通过p 命令打印mutex

6.查看owner_tid

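不幸的是我们发现 owner_tid=0。这是因为默认创建的是普通类型的 mutex(调用栈中走的是 NonPI::NormalMutexLock),这种 mutex 不会记录持锁线程,也就是说这个功能默认是没有打开的。我们需要添加一些代码,把 mutex 的类型改成 PTHREAD_MUTEX_ERRORCHECK 来把功能打开,patch 如下: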


diff --git a/libutils/include/utils/Mutex.h b/libutils/include/utils/Mutex.h
index 1325bf3..5c70ac3 100644
--- a/libutils/include/utils/Mutex.h
+++ b/libutils/include/utils/Mutex.h
@@ -160,10 +160,20 @@
 #if !defined(_WIN32)
 
 inline Mutex::Mutex() {
-    pthread_mutex_init(&mMutex, nullptr);
+    // pthread_mutex_init(&mMutex, nullptr);
+    pthread_mutexattr_t attr;
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK );
+    pthread_mutex_init(&mMutex, &attr);
+    pthread_mutexattr_destroy(&attr);
 }
 inline Mutex::Mutex(__attribute__((unused)) const char* name) {
-    pthread_mutex_init(&mMutex, nullptr);
+    // pthread_mutex_init(&mMutex, nullptr);
+    pthread_mutexattr_t attr;
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK );
+    pthread_mutex_init(&mMutex, &attr);
+    pthread_mutexattr_destroy(&attr);
 }
 inline Mutex::Mutex(int type, __attribute__((unused)) const char* name) {
     if (type == SHARED) {
@@ -173,7 +183,12 @@
         pthread_mutex_init(&mMutex, &attr);
         pthread_mutexattr_destroy(&attr);
     } else {
-        pthread_mutex_init(&mMutex, nullptr);
+        // pthread_mutex_init(&mMutex, nullptr);
+        pthread_mutexattr_t attr;
+        pthread_mutexattr_init(&attr);
+        pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
+        pthread_mutex_init(&mMutex, &attr);
+        pthread_mutexattr_destroy(&attr);
     }
 }
 inline Mutex::~Mutex() {

我们加上patch后重新编译 libutils.so,并把它push到手机里,重复上边的操作,再次打印mutex:

(gdb) p mutex
$1 = (pthread_mutex_internal_t *) 0x6f8f756630
(gdb) p *(pthread_mutex_internal_t *) 0x6f8f756630
$2 = {
  state = 32770, 
  __pad = 0, 
  {
    owner_tid = 2972, 
    pi_mutex = {
      type = 156 '\234', 
      shared = 11, 
      counter = 0, 
      owner_tid = 0
    }
  }, 
  __reserved = '\000' <repeats 27 times>
}

这下我们发现 owner_tid 有值了,持锁的是 tid 为 2972 的线程。我们在 info thread 的输出里找到 Target Id 为 1616.2972 的线程(gdb 线程编号 134),然后用 gdb 切换到这个线程上去看看吧:

(gdb) info thread
  Id   Target Id         Frame
  ......
  134  Thread 1616.2972 "Binder:1616_8" nanosleep () at bionic/libc/arch-arm64/syscalls/nanosleep.S:7
  ......

(gdb) t 134
[Switching to thread 134 (Thread 1616.2972)]
#0  nanosleep () at bionic/libc/arch-arm64/syscalls/nanosleep.S:7
7	    svc     #0
(gdb) bt
#0  nanosleep () at bionic/libc/arch-arm64/syscalls/nanosleep.S:7
#1  0x000000702a042d08 in sleep (seconds=30) at bionic/libc/upstream-freebsd/lib/libc/gen/sleep.c:58
#2  0x0000006f3b843718 in android::SensorService::dump (this=<optimized out>, fd=283, args=...) at frameworks/native/services/sensorservice/SensorService.cpp:433
#3  0x0000007028d2cce0 in android::BBinder::onTransact (this=<optimized out>, code=<optimized out>, data=..., reply=<optimized out>) at frameworks/native/libs/binder/Binder.cpp:247
#4  0x000000702acefbdc in android::BnSensorServer::onTransact (this=<optimized out>, code=550352544, data=..., reply=<optimized out>, flags=16) at frameworks/native/libs/sensor/ISensorServer.cpp:277
#5  0x0000007028d2c67c in android::BBinder::transact (this=0x6f8f756508, code=1598311760, data=..., reply=<optimized out>, flags=<optimized out>) at frameworks/native/libs/binder/Binder.cpp:134
#6  0x0000007028d399f8 in android::IPCThreadState::executeCommand (this=<optimized out>, cmd=<optimized out>) at frameworks/native/libs/binder/IPCThreadState.cpp:1247
#7  0x0000007028d393cc in android::IPCThreadState::getAndExecuteCommand (this=0x6f20f26c00) at frameworks/native/libs/binder/IPCThreadState.cpp:520
#8  0x0000007028d39ca4 in android::IPCThreadState::joinThreadPool (this=0x6f20f26c00, isMain=false) at frameworks/native/libs/binder/IPCThreadState.cpp:619
#9  0x0000007028d5fd98 in android::PoolThread::threadLoop (this=0x6f9535e260) at frameworks/native/libs/binder/ProcessState.cpp:67
#10 0x00000070287f4610 in android::Thread::_threadLoop (user=0x6f9535e260) at system/core/libutils/Threads.cpp:746
#11 0x000000702a6efce4 in android::AndroidRuntime::javaThreadShell (args=<optimized out>) at frameworks/base/core/jni/AndroidRuntime.cpp:1400
#12 0x000000702a07f474 in __pthread_start (arg=0x6f20cdbd50) at bionic/libc/bionic/pthread_create.cpp:338
#13 0x000000702a01d230 in __start_thread (fn=0x702a07f44c <__pthread_start(void*)>, arg=0x6f20cdbd50) at bionic/libc/bionic/clone.cpp:53

我们发现这个线程正拿着锁在 sleep,而这段代码正是我们为了调试而加上的。

 

三.扩展:kernel死锁解析

    参考“https://blog.csdn.net/aa787282301/article/details/103464927” ->2.3 Kernel ramdump分析

你可能感兴趣的:(安卓系统,c/c++,gdb)