spinlock internal

A spinlock is used for CPU synchronization on shared memory in a multiprocessor environment. A processor locks the data bus, accesses a shared variable (call it spinlock_t), and compares it with a predefined value that means "locked". If they are not equal, the processor stores that value into spinlock_t to mark the lock as taken; if they are equal, it repeats the procedure until spinlock_t is no longer locked (it spins on spinlock_t). The procedure must be supported by hardware primitives such as a lock instruction.
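In modern GCC the same spin-on-exchange idea can be sketched in a few lines (the names spin_lock/spin_unlock here are made up for illustration; this is just the concept, not the pthread code):

//minimal test-and-set spinlock sketch using GCC __atomic builtins
typedef int my_spinlock_t;

static void spin_lock(my_spinlock_t *lock)
{
  /* atomically write 1 and read the old value; keep spinning while
     the old value was already 1 (someone else holds the lock) */
  while (__atomic_exchange_n(lock, 1, __ATOMIC_ACQUIRE))
    ; /* spin */
}

static void spin_unlock(my_spinlock_t *lock)
{
  /* store 0 with release semantics so the critical section's writes
     become visible before the lock looks free */
  __atomic_store_n(lock, 0, __ATOMIC_RELEASE);
}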

pthread is a famous multithreaded development library, and it implements a mutual-exclusion lock called the mutex. The mutex is itself implemented with a spinlock. Let's have a look at how the mutex is implemented in the pthread (LinuxThreads) library.

//code snippet extracted from spinlock.c in pthread library
struct _pthread_fastlock
{
  long int __status;   /* "Free" or "taken" or head of waiting list */
  int __spinlock;      /* Used by compare_and_swap emulation. Also,
              adaptive SMP lock stores spin count here. */
};
void __pthread_alt_lock(struct _pthread_fastlock * lock,
                pthread_descr self)
{
#if defined HAS_COMPARE_AND_SWAP
  long oldstatus, newstatus;
#endif
  struct wait_node wait_node;

#if defined TEST_FOR_COMPARE_AND_SWAP
  if (!__pthread_has_cas)
#endif
#if !defined HAS_COMPARE_AND_SWAP || defined TEST_FOR_COMPARE_AND_SWAP
  {
    int suspend_needed = 0;
    __pthread_acquire(&lock->__spinlock);

    if (lock->__status == 0)
      lock->__status = 1;
    else {
      if (self == NULL)
        self = thread_self();

      wait_node.abandoned = 0;
      wait_node.next = (struct wait_node *) lock->__status;
      wait_node.thr = self;
      lock->__status = (long) &wait_node;
      suspend_needed = 1;
    }

    __pthread_release(&lock->__spinlock);

    if (suspend_needed)
      suspend (self);
    return;
  }
#endif

#if defined HAS_COMPARE_AND_SWAP
  do {
    oldstatus = lock->__status;
    if (oldstatus == 0) {
      newstatus = 1;
    } else {
      if (self == NULL)
        self = thread_self();
      wait_node.thr = self;
      newstatus = (long) &wait_node;
    }
    wait_node.abandoned = 0;
    wait_node.next = (struct wait_node *) oldstatus;
    /* Make sure the store in wait_node.next completes before performing
       the compare-and-swap */
    MEMORY_BARRIER();
  } while(! __compare_and_swap(&lock->__status, oldstatus, newstatus));

  /* Suspend. Note that unlike in __pthread_lock, we don't worry
     here about spurious wakeup. That's because this lock is not
     used in situations where that can happen; the restart can
     only come from the previous lock owner. */

  if (oldstatus != 0)
    suspend(self);

  READ_MEMORY_BARRIER();
#endif
}
From the code above we can see that pthread implements its spinlock in two ways: one is test_and_set, the other is the MCS list-based queuing lock.

Let's look at the first one.

//in pthread_mutex_lock, pthread calls the function "testandset" (via __pthread_acquire) to take the spin lock
static void __pthread_acquire(int * spinlock)
{
  int cnt = 0;
  struct timespec tm;

  READ_MEMORY_BARRIER();

//the loop spins on the spinlock with testandset, so every attempt writes
//the variable; hundreds of such writes make this a very expensive operation.
  while (testandset(spinlock)) { 
    if (cnt < MAX_SPIN_COUNT) {
      sched_yield();
      cnt++;
    } else {
      tm.tv_sec = 0;
      tm.tv_nsec = SPIN_SLEEP_DURATION;
      nanosleep(&tm, NULL);
      cnt = 0;
    }
  }
}
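The matching __pthread_release is not shown in the snippet above; in LinuxThreads it boils down to a memory barrier followed by storing 0. A one-line sketch with a GCC builtin (my paraphrase, not the real code):

//release side: publish the critical section's writes, then free the lock
static void spin_release(int *spinlock)
{
  __atomic_store_n(spinlock, 0, __ATOMIC_RELEASE);
}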

//In x86 architecture:
//"XCHG exchanges two operands. The operands can be in either order. 
//If a memory operand is involved, BUS LOCK is asserted for the duration of the exchange,
//regardless of the presence or absence of the LOCK prefix or of the value of the IOPL." 
//(extracted from 80386 programmer's reference manual.)

PT_EI long int
testandset (int *spinlock)
{
  long int ret;

  __asm__ __volatile__(
       /* atomically swap %0 and %1: ret receives the old value of
          *spinlock (0 = was free, 1 = was already taken) and
          *spinlock is set to 1 */
       "xchgl %0, %1"
       : "=r"(ret), "=m"(*spinlock)
       : "0"(1), "m"(*spinlock)
       : "memory");

  /* returning 0 means the lock was free and we have just acquired it;
     nonzero means it was already taken */
  return ret;
}
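For comparison, the same operation can be written with a GCC builtin instead of inline assembly (illustrative only; the real pthread code uses the asm above):

//builtin equivalent of testandset: atomically store 1 and return the
//previous value (0 means we have just acquired the lock)
static long testandset_builtin(int *spinlock)
{
  return __sync_lock_test_and_set(spinlock, 1);
}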

//in the POWER architecture:
/*
lwarx and stwcx. together perform an atomic update of shared storage:
load-and-reserve plus store-conditional.
lwarx loads a word and creates a reservation.
stwcx. stores a word only if the reservation created by lwarx is still valid.
a processor can hold only one reservation at a time.
*/

//a typical compare-and-swap emulation.
PT_EI long int
testandset (int *p)
{
  long int ret, val = 1;
  MEMORY_BARRIER ();

  __asm__ __volatile__ (
       "0:    lwarx %0,0,%1 ;"//load and reserve, load value *p into register %0
       "      cmpwi  0,%0,0;"//compare register %0 with 0
       "      bne 1f;"//if not equal then jump to 1:. this command means lock has been taken
       "      stwcx. %2,0,%1;"//if equal then it means lock has not been taken then store 1 into *p if reservation is still valid
       "      bne- 0b;"//if stwcx. stores succeeds, then EQ bit in cr0 is 1.
                       //if store fails then EQ bit in CR0 is 0 then it jumps to 0: 
                      //and continue to try to acquire lock.
       "1:    "
    : "=&r"(ret)
    : "r"(p), "r" (val)
    : "cr0", "memory");//cr0 will be changed by stwcw. and we need reload ret so memory must be specified in the clobbered list.
  MEMORY_BARRIER ();
  return ret != 0;
}

Finally, all of these lock operations must be supported by a hardware lock. test_and_set is a typical compare-and-swap emulation: lock the data bus and compare the shared variable with a value to test whether it is locked. It returns the old value, so 0 means the lock was free (and is now ours) and nonzero means it was already taken. On x86 it always writes 1 to the variable "spinlock", even when the lock is already held; on POWER the lwarx/stwcx. version skips the store when it sees the lock is already taken.

The problem with test_and_set is that it keeps writing the spinlock variable even though only one processor can acquire the lock. On a cache-coherent architecture every such write invalidates the whole cache line on the other processors, which badly hurts performance.
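A standard way to reduce those writes (not what this pthread code does, shown here only for contrast) is test-and-test-and-set: spin on a plain read, which stays in the local cache, and attempt the expensive atomic write only when the lock looks free:

//test-and-test-and-set sketch using GCC __atomic builtins (illustrative)
static void ttas_lock(int *spinlock)
{
  for (;;) {
    /* read-only spin: no bus lock and no cache-line invalidation */
    while (__atomic_load_n(spinlock, __ATOMIC_RELAXED) != 0)
      ; /* spin */
    /* the lock looks free: one atomic write to try to take it */
    if (__atomic_exchange_n(spinlock, 1, __ATOMIC_ACQUIRE) == 0)
      return; /* old value was 0, lock acquired */
  }
}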

The second implementation is the MCS list-based queuing lock, proposed in the paper "Algorithms for Scalable Synchronization on Shared-Memory Multiprocessors".

void __pthread_alt_lock(struct _pthread_fastlock * lock,
                pthread_descr self)
{
#if defined HAS_COMPARE_AND_SWAP
  long oldstatus, newstatus;
#endif
  struct wait_node wait_node;

#if defined HAS_COMPARE_AND_SWAP
  do {
    oldstatus = lock->__status;
    if (oldstatus == 0) {
      newstatus = 1;
    } else {
      if (self == NULL)
        self = thread_self();
      wait_node.thr = self;
      newstatus = (long) &wait_node;
    }
    wait_node.abandoned = 0;
    wait_node.next = (struct wait_node *) oldstatus;
    /* Make sure the store in wait_node.next completes before performing
       the compare-and-swap */
    MEMORY_BARRIER();
  } while(! __compare_and_swap(&lock->__status, oldstatus, newstatus));        
  //a thread that gets the lock finds lock->__status == 0 and sets it to 1.
  //a thread that cannot get the lock sets lock->__status to the address of
  //its wait_node, i.e. pushes itself onto the head of the waiting list.
  //the loop retries on the local oldstatus variable, which is very cheap.

  /* Suspend. Note that unlike in __pthread_lock, we don't worry
  here about spurious wakeup. That's because this lock is not
  used in situations where that can happen; the restart can
  only come from the previous lock owner. */

  if (oldstatus != 0)
    suspend(self); //oldstatus != 0 means the thread did not acquire the
                   //lock and has to suspend itself to wait for it.
  READ_MEMORY_BARRIER();
#endif
}
/*__pthread_alt_lock, which is called by pthread_mutex_lock, takes the lock if it has not been taken;
otherwise it puts the current thread on the waiting list and suspends it with a thread signal (pthread_sigsuspend).*/

/*__pthread_alt_unlock releases the lock and wakes up the thread with the highest priority in the waiting list.
(try it: will all the threads in the waiting list be removed?)
the trick is that lock->__status stores the address of the wait_node of each thread that could not get the lock and had to suspend.*/
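The real __pthread_alt_unlock also scans the list for the highest-priority waiter and skips abandoned nodes; the following is only my simplified sketch of the core idea (pop one wait_node off the list stored in __status and restart its thread; the value 1 marks the end of the chain):

//simplified sketch of the unlock side (my reading of the code, not the
//real __pthread_alt_unlock: priority scanning and abandoned nodes are
//ignored, and the lock is assumed to be held)
static void alt_unlock_sketch(struct _pthread_fastlock *lock)
{
  for (;;) {
    long head = lock->__status;
    if (head == 1) {
      /* locked but no waiters: just mark the lock free */
      if (__compare_and_swap(&lock->__status, head, 0))
        return;
    } else {
      /* head points at a wait_node: unlink it and wake its thread;
         __status keeps the rest of the chain (ending in 1), so the
         lock stays held and ownership passes to the woken thread */
      struct wait_node *wn = (struct wait_node *) head;
      if (__compare_and_swap(&lock->__status, head, (long) wn->next)) {
        restart(wn->thr); /* LinuxThreads internal wakeup */
        return;
      }
    }
  }
}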

//function "__compare_and_swap" in __pthread_alt_lock in ppc architecture:
PT_EI int
__compare_and_swap (long int *p, long int oldval, long int newval)
{
  long int ret;

  __asm__ __volatile__ (
       "0:    ldarx %0,0,%1 ;"
       "      xor. %0,%3,%0;"    //here it uses xor for testing whether *p == oldval (always succeed)
       "      bne 1f;"
       "      stdcx. %2,0,%1;"
       "      bne- 0b;"
       "1:    "
    : "=&r"(ret)
    : "r"(p), "r"(newval), "r"(oldval)
    : "cr0", "memory");
  /* This version of __compare_and_swap is to be used when acquiring
     a lock, so we don't need to worry about whether other memory
     operations have completed, but we do need to be sure that any loads
     after this point really occur after we have acquired the lock.  */
  __asm__ __volatile__ ("isync" : : : "memory");
  return (int)(ret == 0);
}

//I think the i386 __compare_and_swap does extra work on each retry:
//when *p != oldval, cmpxchgl loads the current value of *p into eax,
//and the outer loop must issue the whole locked instruction again.
PT_EI int
__compare_and_swap (long int *p, long int oldval, long int newval)
{
  char ret;
  long int readval;

  __asm__ __volatile__ ("lock; cmpxchgl %3, %1; sete %0"
            : "=q" (ret), "=m" (*p), "=a" (readval)
            : "r" (newval), "m" (*p), "a" (oldval)
            : "memory");
  return ret;
}
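For reference, both assembly versions implement the same contract as this GCC builtin: atomically replace *p with newval if and only if it still equals oldval, and report whether that happened (illustrative, not from the pthread source):

//portable equivalent of __compare_and_swap
static int compare_and_swap_builtin(long *p, long oldval, long newval)
{
  return __sync_bool_compare_and_swap(p, oldval, newval);
}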

With this method, the algorithm puts all the processors waiting for the lock into a queue in FIFO order, so it does not have to keep writing the variable "spinlock". Each waiter just writes its wait node into the queue, and lock->__status holds the head node of the queue.
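To make the queue concrete, here is a hypothetical trace of lock->__status with three contending threads A, B and C (the node names are made up):

/* hypothetical trace of lock->__status:
   initially                   __status == 0
   A: CAS(0 -> 1)              __status == 1         A owns the lock
   B: CAS(1 -> &B_node)        __status == &B_node   B_node.next == 1, B suspends
   C: CAS(&B_node -> &C_node)  __status == &C_node   C_node.next == &B_node, C suspends
   A unlocks: one node is removed and its thread is woken; __status keeps
   the rest of the chain, and ownership passes to the woken thread */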

So on average the algorithm only has to call compare_and_swap once, write the thread's node address into the queue, and suspend the thread to wait for the lock. This improves efficiency quite a lot and also serves requests in FIFO order.

All the methods above must be supported by hardware atomic operations, such as "lock" on x86 and "lwarx"/"stwcx." and "ldarx"/"stdcx." on PowerPC. Algorithm optimization mainly lies in reducing the number of writes and hardware lock operations.


reference:

1. http://www.cs.ucla.edu/~kohler/class/04f-aos/l14.txt (really good notes)

2. Paper: J. M. Mellor-Crummey and M. L. Scott, "Algorithms for Scalable Synchronization on Shared-Memory Multiprocessors", ACM Transactions on Computer Systems, 1991.

3. pthread library source code: http://ftp.gnu.org/gnu/glibc/

