信号量死锁的表现就是系统卡死,但在多线程程序中,要定位是哪里发生了死锁并不容易。
一种排查方法是在加锁宏中增加打印信息:
/*
 * MODULE_LOCK - acquire a semaphore, tracing entry to and exit from the call.
 *
 * Logs "lock in" before and "lock out" after os_acquire_sem(), which is
 * called with OS_WAIT_TIMEOUT_MAX as its timeout. If the acquire reports a
 * non-zero result, "fail" is logged and the *enclosing function* returns
 * ERR_ACQUIRE_SEM_FAIL, so the macro can only be used inside functions whose
 * return type can carry that code.
 */
#define MODULE_LOCK(sem_handle)                                               \
    do {                                                                      \
        bool acquire_failed;                                                  \
        module_pwarning("lock in\n");                                         \
        acquire_failed = os_acquire_sem(sem_handle, OS_WAIT_TIMEOUT_MAX);     \
        module_pwarning("lock out\n");                                        \
        if (acquire_failed) {                                                 \
            module_perror("fail\n");                                          \
            return ERR_ACQUIRE_SEM_FAIL;                                      \
        }                                                                     \
    } while (0)
这种方法的缺点是:死锁后并不能直接看出是哪个锁发生了死锁,需要先猜测是哪个锁,再放开对应的打印信息;而放开锁的打印后,打印信息会非常多,影响系统运行。
在项目实施过程中想到了另外一种思路,简单说就是将获取信号量者记录下来,当信号量死锁的时候将记录的访问者打印出来。
具体实施如下:
增加信号量管理结构
/* Per-semaphore management record, used to report who last acquired a semaphore. */
typedef struct
{
bool is_used; /* NOTE(review): presumably marks this slot as in use - confirm */
u32 handle; /* NOTE(review): presumably the handle of the semaphore this slot tracks - confirm */
bool is_acquired; /* whether the semaphore is currently acquired */
const s8* acquired_func_record; /* function name of the acquirer */
u32 acquired_func_line_record; /* line number of the acquirer */
}os_sem_manage_t;
在声明os_acquire_sem时,通过宏替换为其增加__FUNCTION__、__LINE__两个参数,这样原有的调用代码无需修改。
/*
 * Tracing variant of the acquire call: in addition to the semaphore handle
 * and the timeout it receives the caller's function name and line number.
 */
s32 os_acquire_sem_trace(const s8* call_func, u32 line_num, u32 sem_handle,u32 timeout_ms);
/* Routes every os_acquire_sem() call through the tracing variant, capturing the call site. */
#define os_acquire_sem(sem_handle, timeout_ms) os_acquire_sem_trace(__FUNCTION__, __LINE__, sem_handle, timeout_ms)
/*
 * os_acquire_sem_trace - acquire a semaphore and remember who acquired it.
 *
 * call_func, line_num: call site of the acquire (supplied by the
 *                      os_acquire_sem() wrapper macro).
 * sem_handle:          handle of the semaphore to acquire.
 * timeout_ms:          wait limit; OS_WAIT_TIMEOUT_MAX selects the
 *                      wait-forever path.
 *
 * Returns 0 on success. Returns non-zero if no management record matches
 * sem_handle, or the underlying acquire's error code if it fails.
 *
 * The "......" lines are code elided in this excerpt (presumably including
 * the declarations of i and ret).
 */
s32 os_acquire_sem_trace(const s8* call_func, u32 line_num, u32 sem_handle,u32 timeout_ms)
{
......
    /*
     * Locate the management record for this handle.
     * NOTE(review): the loop condition and match test were lost from the
     * excerpt; they are reconstructed from the is_used/handle fields of
     * os_sem_manage_t - confirm against the real source.
     */
    for (i = 0; i < OS_SEM_MAX; i++)
    {
        if (g_os_sem_manage[i].is_used && (g_os_sem_manage[i].handle == sem_handle))
        {
            break;
        }
    }
    if (i >= OS_SEM_MAX)
    {
        os_perror("sem_handle:0x%x cannot find record info \n", sem_handle);
        return true; /* non-zero: callers treat any non-zero result as failure */
    }
......
    /*
     * Acquire the semaphore.
     * Bounded timeout:  try once and log on failure.
     * Wait-forever:     retry in 60 s slices and log after every failed
     *                   slice, so a deadlock keeps reporting the last holder.
     *
     * The callee is parenthesised so that the function-like macro
     * os_acquire_sem() does not expand on the member name.
     *
     * NOTE(review): &g_os_sem_manage[i] is printed with %x, which only
     * matches where a pointer has the size of unsigned int.
     */
    if (timeout_ms != OS_WAIT_TIMEOUT_MAX)
    {
        ret = (g_os_opt.os_acquire_sem)(sem_handle, timeout_ms);
        if (ret)
        {
            os_perror("acquire sem failed sem_handle:0x%x sem_manage[%d]:0x%x is_acquired:0x%x, last_acquired_func[%s:%d] curr_acquire_func[%s:%d]\n",
                      sem_handle, i, &g_os_sem_manage[i],
                      g_os_sem_manage[i].is_acquired, g_os_sem_manage[i].acquired_func_record, g_os_sem_manage[i].acquired_func_line_record,
                      call_func, line_num);
            return ret;
        }
    }
    else
    {
        for (;;)
        {
            ret = (g_os_opt.os_acquire_sem)(sem_handle, 60000);
            if (ret == 0)
            {
                break;
            }

            /* Every failed slice is logged with the last holder and the current caller. */
            os_perror("acquire sem failed sem_handle:0x%x sem_manage[%d]:0x%x is_acquired:0x%x, last_acquired_func[%s:%d] curr_acquire_func[%s:%d]\n",
                      sem_handle, i, &g_os_sem_manage[i],
                      g_os_sem_manage[i].is_acquired, g_os_sem_manage[i].acquired_func_record, g_os_sem_manage[i].acquired_func_line_record,
                      call_func, line_num);

            /*
             * A result of 2 keeps waiting (NOTE(review): presumably the
             * timeout code of the underlying call - confirm); any other
             * failure is returned to the caller.
             */
            if (ret != 2)
            {
                return ret;
            }
        }
    }

    /* Record the new holder; name and line are only stored when CONTROL_PRINTF is defined. */
    g_os_sem_manage[i].is_acquired = true;
#ifdef CONTROL_PRINTF
    g_os_sem_manage[i].acquired_func_record = call_func;
    g_os_sem_manage[i].acquired_func_line_record = line_num;
#endif
    return 0;
}
信号量释放时将记录状态清空
/*
 * os_release_sem - release path for a semaphore; resets the holder record
 * kept for sem_handle.
 *
 * Most of the body is elided in this excerpt ("......"); only the reset of
 * the record is shown.
 * NOTE(review): no return statement is visible although the function
 * returns bool - presumably it sits in the elided part; confirm.
 * NOTE(review): the actual release call is not shown. If it runs before the
 * reset below, a new holder's record could be overwritten - confirm the
 * ordering.
 */
bool os_release_sem(u32 sem_handle)
{
......
/* Locate the management record */
和os_acquire_sem基本相同......
/* Reset the holder record */
g_os_sem_manage[i].is_acquired = false;
g_os_sem_manage[i].acquired_func_record = "none";
g_os_sem_manage[i].acquired_func_line_record = INVALID_VALUE;
}
出错时打印信息:
Line 79947: [16:06:04:361][I:ERROR][T:70/01/01 00:01:16:105][F:os.c][C:os_acquire_sem_trace][L:964] acquire sem failed sem_handle:0x812ab880 sem_manage[8]:0x810370a0 is_acquired:0x1, last_acquired_func[module_set_volume:221] curr_acquire_func[module_show_intra_frame:453]
Line 79953: [16:07:15:211][I:ERROR][T:70/01/01 00:02:26:958][F:os.c][C:os_acquire_sem_trace][L:964] acquire sem failed sem_handle:0x812ab880 sem_manage[8]:0x810370a0 is_acquired:0x1, last_acquired_func[module_set_volume:221] curr_acquire_func[module_clear_intra_frame:477]
Line 80014: [16:08:25:749][I:ERROR][T:70/01/01 00:03:37:500][F:os.c][C:os_acquire_sem_trace][L:964] acquire sem failed sem_handle:0x812ab880 sem_manage[8]:0x810370a0 is_acquired:0x1, last_acquired_func[module_set_volume:221] curr_acquire_func[module_set_aspect_ratio:403]