信号量死锁排查手段

信号量死锁的表现就是系统卡死,但在多线程程序中并不容易跟踪。
一种排查方法是在加锁宏中增加打印信息:

/*
 * MODULE_LOCK - acquire sem_handle, tracing entry to and exit from the call.
 *
 * Logs "lock in" before calling os_acquire_sem() with OS_WAIT_TIMEOUT_MAX as
 * the timeout and "lock out" once that call has returned. When the result is
 * non-zero it logs "fail" and executes `return ERR_ACQUIRE_SEM_FAIL;` in the
 * function that expanded the macro, so the macro hides a return statement.
 */
#define MODULE_LOCK(sem_handle)                                      \
	do {                                                             \
		bool cret;                                                   \
		module_pwarning("lock in\n");                                \
		cret = os_acquire_sem(sem_handle, OS_WAIT_TIMEOUT_MAX);      \
		module_pwarning("lock out\n");                               \
		if (!cret)                                                   \
			break; /* acquired: leave the do/while(0) wrapper */     \
		module_perror("fail\n");                                     \
		return ERR_ACQUIRE_SEM_FAIL;                                 \
	} while (0)

这种方法的缺点是:死锁后并不能直接看出是哪个锁发生了死锁,需要先猜测是哪个锁,再放开对应的打印信息;而放开锁的打印后,打印信息会非常多,影响系统运行。

在项目实施过程中想到了另外一种思路,简单说就是将获取信号量者记录下来,当信号量死锁的时候将记录的访问者打印出来。

具体实施如下:
增加信号量管理结构

/* Bookkeeping record kept for one semaphore. */
typedef struct
{
    /* NOTE(review): presumably marks whether this record slot is in use - confirm. */
    bool is_used;
    /* NOTE(review): presumably the semaphore handle this record describes - confirm. */
    u32 handle;
    /* Whether the semaphore is currently acquired. */
    bool is_acquired;
    /* Function name of the acquirer. */
    const s8 *acquired_func_record;
    /* Line number of the acquirer. */
    u32 acquired_func_line_record;
} os_sem_manage_t;

在声明 os_acquire_sem 时通过宏替换,自动增加 __FUNCTION__、__LINE__ 两个参数。

/*
 * Tracing variant of the semaphore acquire call: besides sem_handle and
 * timeout_ms it receives the caller's function name and line number.
 */
s32 os_acquire_sem_trace(const s8* call_func, u32 line_num, u32 sem_handle,u32 timeout_ms);
/*
 * Redirects every os_acquire_sem(sem_handle, timeout_ms) call to
 * os_acquire_sem_trace(), passing __FUNCTION__ and __LINE__ of the call site.
 * NOTE(review): a function-like macro is also expanded on member calls spelled
 * `x.os_acquire_sem(...)`; confirm this macro is not visible (or is #undef'd)
 * wherever such a member is invoked.
 */
#define os_acquire_sem(sem_handle, timeout_ms) os_acquire_sem_trace(__FUNCTION__, __LINE__, sem_handle, timeout_ms)
/*
 * os_acquire_sem_trace - acquire a semaphore and record who acquired it.
 *
 * call_func, line_num: function name and line number of the call site (the
 *                      os_acquire_sem() macro passes __FUNCTION__/__LINE__).
 * sem_handle:          handle whose record is looked up in g_os_sem_manage[].
 * timeout_ms:          timeout forwarded to g_os_opt.os_acquire_sem();
 *                      OS_WAIT_TIMEOUT_MAX selects the wait-forever path.
 *
 * Returns 0 on success, `true` (1) when no record matches sem_handle, and
 * otherwise the non-zero code returned by g_os_opt.os_acquire_sem().
 */
s32 os_acquire_sem_trace(const s8* call_func, u32 line_num, u32 sem_handle,u32 timeout_ms)
{
    ......

    /*
     * Find the management record for this handle.
     * NOTE(review): the published listing lost the text between '<' and '>'
     * here (only "for(i=0;i" and "=OS_SEM_MAX)" survived). The loop bound and
     * the ">=" check follow from those fragments; the match condition below is
     * reconstructed from the is_used/handle fields - confirm against os.c.
     */
    for (i = 0; i < OS_SEM_MAX; i++)
    {
        if (g_os_sem_manage[i].is_used && g_os_sem_manage[i].handle == sem_handle)
        {
            break;
        }
    }
    if (i >= OS_SEM_MAX)
    {
        os_perror("sem_handle:0x%x cannot find record info \n", sem_handle);
        return true;
    }

    ......

    /*
     * Acquire the semaphore.
     * Finite timeout: try once and report the failure.
     * Wait-forever timeout: retry in 60 s slices and report after every slice
     * that did not succeed, so a deadlock shows both the recorded holder and
     * the blocked caller.
     * NOTE(review): the messages print &g_os_sem_manage[i] with %x; confirm
     * os_perror's formatter accepts a pointer for that conversion.
     */
    if (timeout_ms != OS_WAIT_TIMEOUT_MAX)
    {
        ret = g_os_opt.os_acquire_sem(sem_handle, timeout_ms);
        if (ret)
        {
            os_perror("acquire sem failed sem_handle:0x%x sem_manage[%d]:0x%x is_acquired:0x%x, last_acquired_func[%s:%d] curr_acquire_func[%s:%d]\n", 
                sem_handle, i, &g_os_sem_manage[i], 
                g_os_sem_manage[i].is_acquired, g_os_sem_manage[i].acquired_func_record, g_os_sem_manage[i].acquired_func_line_record,
                call_func, line_num);
            return ret;
        }
    }
    else
    {
        while (1)
        {
            ret = g_os_opt.os_acquire_sem(sem_handle, 60000);
            if (ret == 0)
            {
                break;
            }

            /* Every non-zero result is reported with the same message. */
            os_perror("acquire sem failed sem_handle:0x%x sem_manage[%d]:0x%x is_acquired:0x%x, last_acquired_func[%s:%d] curr_acquire_func[%s:%d]\n", 
                sem_handle, i, &g_os_sem_manage[i], 
                g_os_sem_manage[i].is_acquired, g_os_sem_manage[i].acquired_func_record, g_os_sem_manage[i].acquired_func_line_record,
                call_func, line_num);

            /*
             * Only result 2 keeps waiting; any other failure is returned.
             * NOTE(review): 2 is presumably the timeout code - confirm.
             */
            if (ret != 2)
            {
                return ret;
            }
        }
    }

    /* Record the acquirer; name and line are kept only with CONTROL_PRINTF. */
    g_os_sem_manage[i].is_acquired = true;
#ifdef CONTROL_PRINTF
    g_os_sem_manage[i].acquired_func_record = call_func;
    g_os_sem_manage[i].acquired_func_line_record = line_num;
#endif

    return 0;
}

信号量释放时将记录状态清空

/*
 * Release counterpart of os_acquire_sem(); most of the body is elided in this
 * listing. The visible part resets the acquirer record of entry i.
 * NOTE(review): no return statement is visible before the closing brace even
 * though the return type is bool - confirm against the full source.
 * NOTE(review): these fields are reset unconditionally, while the acquire path
 * writes the function/line fields only under CONTROL_PRINTF - confirm intended.
 */
bool os_release_sem(u32 sem_handle)
{
	......
	
	/* Find the management record */
	和os_acquire_sem基本相同......
	
    /* Clear the acquisition state */
    g_os_sem_manage[i].is_acquired = false;
    g_os_sem_manage[i].acquired_func_record = "none";
    g_os_sem_manage[i].acquired_func_line_record = INVALID_VALUE;
    
}

出错时打印信息:

Line 79947: [16:06:04:361][I:ERROR][T:70/01/01 00:01:16:105][F:os.c][C:os_acquire_sem_trace][L:964] acquire sem failed sem_handle:0x812ab880 sem_manage[8]:0x810370a0 is_acquired:0x1, last_acquired_func[module_set_volume:221] curr_acquire_func[module_show_intra_frame:453]
Line 79953: [16:07:15:211][I:ERROR][T:70/01/01 00:02:26:958][F:os.c][C:os_acquire_sem_trace][L:964] acquire sem failed sem_handle:0x812ab880 sem_manage[8]:0x810370a0 is_acquired:0x1, last_acquired_func[module_set_volume:221] curr_acquire_func[module_clear_intra_frame:477]
Line 80014: [16:08:25:749][I:ERROR][T:70/01/01 00:03:37:500][F:os.c][C:os_acquire_sem_trace][L:964] acquire sem failed sem_handle:0x812ab880 sem_manage[8]:0x810370a0 is_acquired:0x1, last_acquired_func[module_set_volume:221] curr_acquire_func[module_set_aspect_ratio:403]

你可能感兴趣的:(调试手段)