最近调试的一个程序老是在pthread_barrier_wait的futex(.., FUTEX_WAIT, ...)处死锁,于是找来glibc的源代码,结合调试研究了一下这个函数,下面谈谈我的理解。
/* Wait on barrier.

   Takes the barrier's internal lock, records one more arrival by
   decrementing LEFT, and then either
     - (last arrival, LEFT reached 0) bumps CURR_EVENT and wakes every
       thread sleeping on it, or
     - (earlier arrival) releases the lock and sleeps on CURR_EVENT until
       its value changes.
   On the way out every thread atomically increments LEFT again; the
   thread whose increment brings LEFT back to INIT_COUNT releases the
   lock.  Note that the last arrival does NOT release the lock right
   after waking the others: it stays held until the last thread leaves.

   Returns PTHREAD_BARRIER_SERIAL_THREAD in the thread that performed
   the wake-up and 0 in all other threads.  */
int
pthread_barrier_wait (barrier)
     pthread_barrier_t *barrier;
{
  struct pthread_barrier *ibarrier = (struct pthread_barrier *) barrier;
  int result = 0;

  /* Make sure we are alone.  */
  lll_lock (ibarrier->lock, ibarrier->private ^ FUTEX_PRIVATE_FLAG);

  /* One more arrival.  */
  --ibarrier->left;

  /* Are these all?  */
  if (ibarrier->left == 0)
    {
      /* Yes.  Increment the event counter to avoid invalid wake-ups and
         tell the current waiters that it is their turn.  */
      ++ibarrier->curr_event;

      /* Wake up everybody.  */
      lll_futex_wake (&ibarrier->curr_event, INT_MAX,
                      ibarrier->private ^ FUTEX_PRIVATE_FLAG);

      /* This is the thread which finished the serialization.  */
      result = PTHREAD_BARRIER_SERIAL_THREAD;
    }
  else
    {
      /* The number of the event we are waiting for.  The barrier's event
         number must be bumped before we continue.  It is sampled while
         the lock is still held.  */
      unsigned int event = ibarrier->curr_event;

      /* Before suspending, make the barrier available to others.  */
      lll_unlock (ibarrier->lock, ibarrier->private ^ FUTEX_PRIVATE_FLAG);

      /* Wait for the event counter of the barrier to change.  The loop
         re-checks CURR_EVENT after every return from the wait, so a
         return without a change of the counter just waits again.  */
      do
        lll_futex_wait (&ibarrier->curr_event, event,
                        ibarrier->private ^ FUTEX_PRIVATE_FLAG);
      while (event == ibarrier->curr_event);
    }

  /* Make sure the init_count is stored locally or in a register.  */
  unsigned int init_count = ibarrier->init_count;

  /* If this was the last woken thread, unlock.  */
  if (atomic_increment_val (&ibarrier->left) == init_count)
    /* We are done.  */
    lll_unlock (ibarrier->lock, ibarrier->private ^ FUTEX_PRIVATE_FLAG);

  return result;
}
图1. pthread_barrier_wait的基本代码
/* i386 assembly implementation of pthread_barrier_wait.

   The barrier pointer is loaded from 8(%esp) (after %ebx has been
   pushed) into %ebx and stays there for the whole function.  MUTEX,
   LEFT, CURR_EVENT, INIT_COUNT and PRIVATE are field offsets and
   LOCK / ENTER_KERNEL are macros, all defined outside this listing.

   Result in %eax: %esi (zeroed before the futex wait) for threads that
   slept on CURR_EVENT, -1 for the thread that woke the others.

   Local labels:
     1:  mutex was contended on entry   -> __lll_lock_wait,   back to 2
     6:  unlock before sleeping left MUTEX non-zero
                                        -> __lll_unlock_wake, back to 7
     9:  final unlock by a woken waiter left MUTEX non-zero
                                        -> __lll_unlock_wake, back to 10
     4:  final unlock by the waking thread left MUTEX non-zero
                                        -> __lll_unlock_wake, back to 5  */
	.text

	.globl	pthread_barrier_wait
	.type	pthread_barrier_wait,@function
	.align	16
pthread_barrier_wait:
	cfi_startproc
	pushl	%ebx
	cfi_adjust_cfa_offset(4)
	cfi_offset(%ebx, -8)

	movl	8(%esp), %ebx

	/* Get the mutex.  */
	movl	$1, %edx
	xorl	%eax, %eax
	LOCK
	cmpxchgl %edx, MUTEX(%ebx)
	jnz	1f

	/* One less waiter.  If this was the last one needed wake
	   everybody.  */
2:	subl	$1, LEFT(%ebx)
	je	3f

	/* There are more threads to come.  */
	pushl	%esi
	cfi_adjust_cfa_offset(4)
	cfi_offset(%esi, -12)

	/* Remember the current event number in %edx.  */
#if CURR_EVENT == 0
	movl	(%ebx), %edx
#else
	movl	CURR_EVENT(%ebx), %edx
#endif

	/* Release the mutex.  */
	LOCK
	subl	$1, MUTEX(%ebx)
	jne	6f

	/* Wait for the remaining threads.  The call will return immediately
	   if the CURR_EVENT memory has meanwhile been changed.  */
7:
#if FUTEX_WAIT == 0
	movl	PRIVATE(%ebx), %ecx
#else
	movl	$FUTEX_WAIT, %ecx
	orl	PRIVATE(%ebx), %ecx
#endif
	xorl	%esi, %esi
8:	movl	$SYS_futex, %eax
	ENTER_KERNEL

	/* Don't return on spurious wakeups.  The syscall does not change
	   any register except %eax so there is no need to reload any of
	   them.  */
#if CURR_EVENT == 0
	cmpl	%edx, (%ebx)
#else
	cmpl	%edx, CURR_EVENT(%ebx)
#endif
	je	8b

	/* Increment LEFT.  If this brings the count back to the initial
	   count unlock the object.  */
	movl	$1, %edx
	movl	INIT_COUNT(%ebx), %ecx
	LOCK
	xaddl	%edx, LEFT(%ebx)
	subl	$1, %ecx
	cmpl	%ecx, %edx
	jne	10f

	/* Release the mutex.  We cannot release the lock before
	   waking the waiting threads since otherwise a new thread might
	   arrive and gets waken up, too.  */
	LOCK
	subl	$1, MUTEX(%ebx)
	jne	9f

	/* Note: %esi is still zero.  */
10:	movl	%esi, %eax		/* != PTHREAD_BARRIER_SERIAL_THREAD */

	popl	%esi
	cfi_adjust_cfa_offset(-4)
	cfi_restore(%esi)
	popl	%ebx
	cfi_adjust_cfa_offset(-4)
	cfi_restore(%ebx)
	ret

	cfi_adjust_cfa_offset(4)
	cfi_offset(%ebx, -8)

	/* The necessary number of threads arrived.  */
3:
#if CURR_EVENT == 0
	addl	$1, (%ebx)
#else
	addl	$1, CURR_EVENT(%ebx)
#endif

	/* Wake up all waiters.  The count is a signed number in the kernel
	   so 0x7fffffff is the highest value.  */
	movl	$0x7fffffff, %edx
	movl	$FUTEX_WAKE, %ecx
	orl	PRIVATE(%ebx), %ecx
	movl	$SYS_futex, %eax
	ENTER_KERNEL

	/* Increment LEFT.  If this brings the count back to the initial
	   count unlock the object.  */
	movl	$1, %edx
	movl	INIT_COUNT(%ebx), %ecx
	LOCK
	xaddl	%edx, LEFT(%ebx)
	subl	$1, %ecx
	cmpl	%ecx, %edx
	jne	5f

	/* Release the mutex.  We cannot release the lock before
	   waking the waiting threads since otherwise a new thread might
	   arrive and gets waken up, too.  */
	LOCK
	subl	$1, MUTEX(%ebx)
	jne	4f

5:	orl	$-1, %eax		/* == PTHREAD_BARRIER_SERIAL_THREAD */

	popl	%ebx
	cfi_adjust_cfa_offset(-4)
	cfi_restore(%ebx)
	ret

	cfi_adjust_cfa_offset(4)
	cfi_offset(%ebx, -8)

	/* Slow path: the cmpxchg at function entry did not obtain the
	   mutex.  Call __lll_lock_wait, then continue at 2.  */
1:	movl	PRIVATE(%ebx), %ecx
	leal	MUTEX(%ebx), %edx
	xorl	$LLL_SHARED, %ecx
	call	__lll_lock_wait
	jmp	2b

	/* Slow path: the waking thread's final decrement left MUTEX
	   non-zero.  Call __lll_unlock_wake, then return -1 via 5.  */
4:	movl	PRIVATE(%ebx), %ecx
	leal	MUTEX(%ebx), %eax
	xorl	$LLL_SHARED, %ecx
	call	__lll_unlock_wake
	jmp	5b

	cfi_adjust_cfa_offset(4)
	cfi_offset(%esi, -12)

	/* Slow path: the decrement before sleeping left MUTEX non-zero.
	   Call __lll_unlock_wake, then go on to the futex wait at 7.  */
6:	movl	PRIVATE(%ebx), %ecx
	leal	MUTEX(%ebx), %eax
	xorl	$LLL_SHARED, %ecx
	call	__lll_unlock_wake
	jmp	7b

	/* Slow path: a woken waiter's final decrement left MUTEX non-zero.
	   Call __lll_unlock_wake, then return %esi via 10.  */
9:	movl	PRIVATE(%ebx), %ecx
	leal	MUTEX(%ebx), %eax
	xorl	$LLL_SHARED, %ecx
	call	__lll_unlock_wake
	jmp	10b
	cfi_endproc
	.size	pthread_barrier_wait,.-pthread_barrier_wait
图2. pthread_barrier_wait真正实现
glibc的pthread_barrier_wait其实是在../nptl/sysdeps/unix/sysv/linux/i386/i686/../i486/pthread_barrier_wait.S:31用汇编实现的,但是要表达的意思跟../nptl/pthread_barrier_wait.c的C语言实现一样。
假设程序有4个线程,设有两个barrier。
首先对barrier的操作用mutex保护起来,如图1第9行所示,使用了ibarrier->lock这个整数当mutex。
结构体指针barrier指向的结构体如下所示
/* Internal representation of a barrier, as accessed by
   pthread_barrier_wait through a cast of the pthread_barrier_t pointer.  */
struct pthread_barrier
{
  /* Event number; incremented by the last arriving thread and used as
     the futex word on which the earlier arrivals sleep.  */
  unsigned int curr_event;
  /* Integer used as a low-level lock protecting the barrier state.  */
  int lock;
  /* Number of threads still missing; decremented on arrival and
     incremented again when a thread leaves the barrier.  */
  unsigned int left;
  /* Number of threads the barrier was initialised for; LEFT is compared
     against it to detect the last thread leaving.  */
  unsigned int init_count;
  /* Value combined with the futex flags on every lock/futex operation
     to select private or shared futexes.  */
  int private;
};
在pthread_barrier_init中初始化成了
{curr_event = 0, lock = 0, left = 4, init_count = 4, private = 128}
在第一个barrier处,假设线程T1, T2, T3, T4按照先后顺序执行了pthread_barrier_wait,且在第一个barrier处没有两个线程同时等待在lock处,因此T1, T2, T3在获取了lock后,把left--然后释放lock进入futex(&curr_event, FUTEX_WAIT_PRIVATE /*==128*/, 0, NULL)睡眠,T4在获取lock把left--后,发现left为0了,因此执行futex(&curr_event, FUTEX_WAKE_PRIVATE /*129*/, 0x7FFFFFFF)唤醒T1, T2, T3,在这之前先执行++curr_event,以便下次进入barrier与这次区分,T4返回PTHREAD_BARRIER_SERIAL_THREAD == -1,其他线程返回0,注意到T4还没有释放lock,而这个lock是由最后一个离开barrier的线程释放的,如图1第42行所示,每个线程在离开barrier时会把left++,如图1第40行所示。
假设还有线程没有退出第一个barrier,因此lock == 1,这时退出的线程在进入第二个barrier时由图2的第16行->121->__lll_lock_wait(../nptl/sysdeps/unix/sysv/linux/i386/i686/../i486/lowlevellock.S, line 123.)。__lll_lock_wait如图3所示
/* Slow path of the low-level lock.

   As used here, the address of the lock word arrives in %edx and is
   moved to %ebx.  %eax is compared against 2 on entry; %ecx is passed
   to LOAD_FUTEX_WAIT.  LOAD_FUTEX_WAIT and ENTER_KERNEL are macros
   defined outside this listing (presumably they set up and issue a
   futex wait -- confirm against lowlevellock.S).

   The loop stores 2 into the lock word with xchgl and returns once the
   previous value was 0; otherwise it issues the SYS_futex call and
   tries again.  %edx, %ebx and %esi are preserved for the caller.  */
	.globl	__lll_lock_wait
	.type	__lll_lock_wait,@function
	.hidden	__lll_lock_wait
	.align	16
__lll_lock_wait:
	cfi_startproc
	pushl	%edx
	cfi_adjust_cfa_offset(4)
	pushl	%ebx
	cfi_adjust_cfa_offset(4)
	pushl	%esi
	cfi_adjust_cfa_offset(4)
	cfi_offset(%edx, -8)
	cfi_offset(%ebx, -12)
	cfi_offset(%esi, -16)

	movl	%edx, %ebx
	movl	$2, %edx
	xorl	%esi, %esi	/* No timeout.  */
	LOAD_FUTEX_WAIT (%ecx)

	/* If %eax is not already 2, skip the first syscall and go straight
	   to the exchange.  */
	cmpl	%edx, %eax	/* NB:	 %edx == 2 */
	jne 2f

1:	movl	$SYS_futex, %eax
	ENTER_KERNEL

	/* Store 2 into the lock word; retry while the old value was
	   non-zero.  */
2:	movl	%edx, %eax
	xchgl	%eax, (%ebx)	/* NB:	 lock is implied */

	testl	%eax, %eax
	jnz	1b

	popl	%esi
	cfi_adjust_cfa_offset(-4)
	cfi_restore(%esi)
	popl	%ebx
	cfi_adjust_cfa_offset(-4)
	cfi_restore(%ebx)
	popl	%edx
	cfi_adjust_cfa_offset(-4)
	cfi_restore(%edx)
	ret
	cfi_endproc
	.size	__lll_lock_wait,.-__lll_lock_wait
图3. __lll_lock_wait代码
假设T2, T3, T4都离开了第一个barrier,T1还没有离开第一个barrier。barrier的要求就是在所有的线程离开上一个barrier后才能进入下一个barrier,因此T2, T3, T4沿着上面的路径进入__lll_lock_wait并把lock设为2,然后通过futex(&lock, 128, 2, NULL)进入睡眠。
这时T1要离开第一个barrier了,在把lock--后,如图2第68行,发现lock不为0,因此69行->135行->__lll_unlock_wake。
(如果是T4最后一个离开第一个barrier则为110行->123行->__lll_unlock_wake。总之都是进入了__lll_unlock_wake。)我们发现133行也有一个__lll_unlock_wake,它的进入路径为33行->130行->__lll_unlock_wake,这在什么情况下发生呢?
请看下面分析。
/* Slow path of the low-level unlock.

   The address of the lock word arrives in %eax.  The routine stores 0
   into the lock word and then issues a SYS_futex call with a count of
   1 in %edx.  %ecx is passed to LOAD_FUTEX_WAKE; that macro and
   ENTER_KERNEL are defined outside this listing.  %ebx, %ecx and %edx
   are preserved for the caller.  */
	.globl	__lll_unlock_wake
	.type	__lll_unlock_wake,@function
	.hidden	__lll_unlock_wake
	.align	16
__lll_unlock_wake:
	cfi_startproc
	pushl	%ebx
	cfi_adjust_cfa_offset(4)
	pushl	%ecx
	cfi_adjust_cfa_offset(4)
	pushl	%edx
	cfi_adjust_cfa_offset(4)
	cfi_offset(%ebx, -8)
	cfi_offset(%ecx, -12)
	cfi_offset(%edx, -16)

	movl	%eax, %ebx
	/* Mark the lock as free before waking anybody.  */
	movl	$0, (%eax)
	LOAD_FUTEX_WAKE (%ecx)
	movl	$1, %edx	/* Wake one thread.  */
	movl	$SYS_futex, %eax
	ENTER_KERNEL

	popl	%edx
	cfi_adjust_cfa_offset(-4)
	cfi_restore(%edx)
	popl	%ecx
	cfi_adjust_cfa_offset(-4)
	cfi_restore(%ecx)
	popl	%ebx
	cfi_adjust_cfa_offset(-4)
	cfi_restore(%ebx)
	ret
	cfi_endproc
	.size	__lll_unlock_wake,.-__lll_unlock_wake
图4. __lll_unlock_wake代码
__lll_unlock_wake在../nptl/sysdeps/unix/sysv/linux/i386/i686/../i486/lowlevellock.S, line 363,如图4所示,
唤醒其他线程前把lock设为了0,图4第17行,然后futex(&lock, 129, 1)唤醒一个线程。醒来的线程沿着图2第122行->19行->33行->130行->__lll_unlock_wake继续唤醒下一个线程。
可以得出下面结论:
1)进入图2的9:的是在这个barrier执行了futex(&curr_event, FUTEX_WAIT_PRIVATE /*==128*/, 0, NULL),退出时发现有线程试图进入下一个barrier而在lock上进入睡眠,因而去唤醒其中的一个线程。
2)进入图2的4:的是在这个barrier执行了futex(&curr_event, FUTEX_WAKE_PRIVATE /*129*/, 0x7FFFFFFF),退出时发现有线程试图进入下一个barrier而在lock上进入睡眠,因而去唤醒其中的一个线程。
3)进入图2的6:的是试图进入barrier而发现还有线程没有结束上个barrier而在lock进入睡眠的线程被唤醒后再去唤醒其他的在lock上睡眠的线程。或者是试图进入第一个barrier获取lock后,发现有试图进入这个barrier而在lock上睡眠的线程,因而要去唤醒睡眠的线程。这里回答了上面的问题,也就是说,在获取lock执行一系列操作,然后在释放lock时发现有在lock上睡眠的线程,因此进入6。
这三类线程回跳的地址有很大的差别,因为1)2)还没有结束第一个barrier,而3)已经结束了第一个barrier,1)为在curr_event上睡眠的线程,而2)为唤醒所有的在curr_event上睡眠的线程,两者要求的返回值不同。因此
1)回跳到10:,返回0
2)回跳到5:,返回-1
3)回跳到7:,在curr_event上进入睡眠。