参考资料:https://www.ibm.com/developerworks/cn/linux/l-cn-mcsspinlock/index.html
The qspinlock family builds on MCS internally, but callers still use the same spin_lock/spin_unlock and spin_lock_bh/spin_unlock_bh interface, so the change is transparent to users.
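A minimal usage sketch of that public interface (my_lock, my_counter and the two helpers are made-up names for illustration only):
#include <linux/spinlock.h>
static DEFINE_SPINLOCK(my_lock);        /* hypothetical lock protecting my_counter */
static unsigned long my_counter;
void counter_inc(void)
{
	spin_lock(&my_lock);            /* ends up in queued_spin_lock() when qspinlock is enabled */
	my_counter++;
	spin_unlock(&my_lock);
}
void counter_inc_bh(void)
{
	spin_lock_bh(&my_lock);         /* same lock, but also disables softirqs on this CPU */
	my_counter++;
	spin_unlock_bh(&my_lock);
}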
The main MCS optimization is to move most of the spinning onto a per-CPU node, so that multiple CPUs no longer read and write the same lock word over and over (every modification of a shared lock invalidates the other processors' cache lines, and the resulting cache-coherency traffic floods the system bus and memory, dragging down overall performance). Queuing is implemented with a linked list of these nodes.
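The effect is easiest to see in a textbook MCS lock. The sketch below is a standalone illustration in kernel style (not the kernel's own code): each CPU spins only on the locked flag inside its own node, and the nodes form a FIFO queue through the next pointers:
struct mcs_node {
	struct mcs_node *next;
	int locked;                          /* 0: keep spinning, 1: lock handed to us */
};
void mcs_lock(struct mcs_node **tail, struct mcs_node *me)
{
	struct mcs_node *prev;
	me->next = NULL;
	me->locked = 0;
	prev = xchg(tail, me);               /* atomically append ourselves to the queue */
	if (prev) {                          /* queue was not empty: wait for the hand-over */
		WRITE_ONCE(prev->next, me);
		while (!READ_ONCE(me->locked))
			cpu_relax();         /* spin on OUR cacheline only, no global bouncing */
	}
}
void mcs_unlock(struct mcs_node **tail, struct mcs_node *me)
{
	struct mcs_node *next = READ_ONCE(me->next);
	if (!next) {                         /* nobody visibly queued behind us */
		if (cmpxchg(tail, me, NULL) == me)
			return;              /* queue is really empty now */
		while (!(next = READ_ONCE(me->next)))
			cpu_relax();         /* a successor is still linking itself in */
	}
	WRITE_ONCE(next->locked, 1);         /* hand the lock to the successor */
}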
Source:
#define arch_spin_lock(l) queued_spin_lock(l)
#define arch_spin_unlock(l) queued_spin_unlock(l)
//qspinlock
A qspinlock is a single atomic value whose bits are split into the fields below; the layout differs with the number of CPUs.
/*
* Bitfields in the atomic value:
*
* When NR_CPUS < 16K
* 0- 7: locked byte
* 8: pending
* 9-15: not used
* 16-17: tail index
* 18-31: tail cpu (+1)
*
* When NR_CPUS >= 16K
* 0- 7: locked byte
* 8: pending
* 9-10: tail index
* 11-31: tail cpu (+1)
*/
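The tail index / tail cpu fields above are packed and unpacked by encode_tail()/decode_tail(); roughly as follows (a sketch following kernel/locking/qspinlock.c, with the offsets implied by the layout above):
static inline u32 encode_tail(int cpu, int idx)
{
	u32 tail;
	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;   /* cpu stored +1 so that 0 means "no tail" */
	tail |= idx << _Q_TAIL_IDX_OFFSET;         /* idx < 4: which per-cpu node is in use */
	return tail;
}
static inline struct mcs_spinlock *decode_tail(u32 tail)
{
	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
	return per_cpu_ptr(&mcs_nodes[idx], cpu);
}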
/**
* queued_spin_lock - acquire a queued spinlock
* @lock: Pointer to queued spinlock structure
*/
static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
u32 val;
/* Try to grab the lock; on success return right away, otherwise queue up in queued_spin_lock_slowpath() */
val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);
if (likely(val == 0))
return;
queued_spin_lock_slowpath(lock, val);
}
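The slowpath below parks waiters on per-cpu MCS nodes. For context, the node type and the per-cpu array look roughly like this (a sketch based on kernel/locking/mcs_spinlock.h and qspinlock.c; the four nodes cover the task/softirq/hardirq/NMI nesting levels):
struct mcs_spinlock {
	struct mcs_spinlock *next;   /* next waiter in the queue */
	int locked;                  /* set to 1 when the lock is handed to this node */
	int count;                   /* nesting count, only meaningful in node[0] */
};
#define MAX_NODES	4
static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);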
/**
* queued_spin_lock_slowpath - acquire the queued spinlock
* @lock: Pointer to queued spinlock structure
* @val: Current value of the queued spinlock 32-bit word
*
* (queue tail, pending bit, lock value)
*
* fast : slow : unlock
* : :
* uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
* : | ^--------.------. / :
* : v \ \ | :
* pending : (0,1,1) +--> (0,1,0) \ | :
* : | ^--' | | :
* : v | | :
* uncontended : (n,x,y) +--> (n,0,0) --' | :
* queue : | ^--' | :
* : v | :
* contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
* queue : ^--' :
*/
void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
struct mcs_spinlock *prev, *next, *node;
u32 new, old, tail;
int idx;
BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
if (pv_enabled())
goto queue;
if (virt_spin_lock(lock))
return;
/*
* wait for in-progress pending->locked hand-overs
*
* 0,1,0 -> 0,0,1
*/
if (val == _Q_PENDING_VAL) {
while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
cpu_relax();
}
/*
* trylock || pending
*
* 0,0,0 -> 0,0,1 ; trylock
* 0,0,1 -> 0,1,1 ; pending
*/
for (;;) {
/*
* If we observe any contention; queue.
*/
if (val & ~_Q_LOCKED_MASK)//any of the pending/tail bits set means contention: go queue
goto queue;
new = _Q_LOCKED_VAL;
if (val == new)
new |= _Q_PENDING_VAL;//lock is held: set pending to announce that we are next
old = atomic_cmpxchg(&lock->val, val, new);
if (old == val)
break;//cmpxchg succeeded: we either took the lock or set pending
val = old;
}
/*
* we won the trylock
*/
if (new == _Q_LOCKED_VAL)
return;//trylock won: lock acquired, nothing more to do
//pending is set: from here on, other CPUs (apart from the lock owner) either wait at the (val == _Q_PENDING_VAL) check above or go into the queue
/*
* we're pending, wait for the owner to go away.
*
* *,1,1 -> *,1,0
*
* this wait loop must be a load-acquire such that we match the
* store-release that clears the locked bit and create lock
* sequentiality; this is because not all clear_pending_set_locked()
* implementations imply full barriers.
*/
while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
cpu_relax();
/*
* take ownership and clear the pending bit.
*
* *,1,0 -> *,0,1
*/
//clear pending and set the locked byte; once pending is clear, another CPU can take the pending slot or start queuing
clear_pending_set_locked(lock);
return;
/*
* End of pending bit optimistic spinning and beginning of MCS
* queuing.
*/
queue:
node = this_cpu_ptr(&mcs_nodes[0]);
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);//encode our (cpu, node index) as the tail value
node += idx; //select the node for this nesting level
node->locked = 0;
node->next = NULL;
pv_init_node(node);
/*
* We touched a (possibly) cold cacheline in the per-cpu queue node;
* attempt the trylock once more in the hope someone let go while we
* weren't watching.
*/
//try to take the lock once more before queuing
if (queued_spin_trylock(lock))
goto release;
/*
* We have already touched the queueing cacheline; don't bother with
* pending stuff.
*
* p,*,* -> n,*,*
*/
old = xchg_tail(lock, tail);//publish our node as the new queue tail in lock->val
/*
* if there was a previous node; link it and wait until reaching the
* head of the waitqueue.
*/
if (old & _Q_TAIL_MASK) {//there was a previous tail: decode it to find the node ahead of us
prev = decode_tail(old);
WRITE_ONCE(prev->next, node);//link our node onto the MCS queue
pv_wait_node(node);//paravirt hook; the real wait is arch_mcs_spin_lock_contended() below, spinning on our own node->locked until the previous waiter hands over
arch_mcs_spin_lock_contended(&node->locked);
}
/*
* we're at the head of the waitqueue, wait for the owner & pending to
* go away.
*
* *,x,y -> *,0,0
*
* this wait loop must use a load-acquire such that we match the
* store-release that clears the locked bit and create lock
* sequentiality; this is because the set_locked() function below
* does not imply a full barrier.
*
*/
pv_wait_head(lock, node);//paravirt hook for the queue head; the real wait is the loop below
while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)//wait until both the pending bit and the locked byte are clear
cpu_relax();
//we are the queue head and owner/pending are gone: the lock is ours to claim, everyone else queues behind us
/*
* claim the lock:
*
* n,0,0 -> 0,0,1 : lock, uncontended
* *,0,0 -> *,0,1 : lock, contended
*
* If the queue head is the only one in the queue (lock value == tail),
* clear the tail code and grab the lock. Otherwise, we only need
* to grab the lock.
*/
for (;;) {
if (val != tail) {//the tail has moved on, so we are not the last waiter: just set the locked byte
set_locked(lock);//the tail is left pointing at the later waiter
break;
}
old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
if (old == val)
goto release; /* No contention */
val = old;
}
/*
* contended path; wait for next, release.
*/
while (!(next = READ_ONCE(node->next)))//wait for the next waiter to finish linking itself in
cpu_relax();
//set next->locked to pass queue-head ownership on (and kick the next vCPU under paravirt)
arch_mcs_spin_unlock_contended(&next->locked);
pv_kick_node(lock, next);
release:
/*
* release the node
*/
this_cpu_dec(mcs_nodes[0].count);
}
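For completeness, the matching unlock only needs to clear the locked byte with release semantics; the generic version is roughly as follows (a sketch following asm-generic/qspinlock.h of this era):
static __always_inline void queued_spin_unlock(struct qspinlock *lock)
{
	/*
	 * Only the locked byte is cleared; pending and tail are untouched,
	 * so a waiting queue head sees locked == 0 and takes over.
	 */
	smp_store_release((u8 *)&lock->val, 0);
}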
//qrwlock
The lock consists of two parts, cnts and wait_lock; cnts is further split into wmode and rcnts. wait_lock is used for queuing when the lock is contended, wmode is the writer flag, and rcnts holds the reader count.
rwlock targets workloads with many concurrent readers and only rarely contended writes.
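A minimal usage sketch (my_rwlock, my_value and the helpers are made-up names; the qrwlock internals sit behind read_lock()/write_lock()):
#include <linux/spinlock.h>
static DEFINE_RWLOCK(my_rwlock);        /* hypothetical lock protecting my_value */
static int my_value;
int value_read(void)
{
	int v;
	read_lock(&my_rwlock);          /* many readers may hold this concurrently */
	v = my_value;
	read_unlock(&my_rwlock);
	return v;
}
void value_write(int v)
{
	write_lock(&my_rwlock);         /* exclusive: waits for all readers and writers */
	my_value = v;
	write_unlock(&my_rwlock);
}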
struct __qrwlock {
union {
atomic_t cnts;
struct {
#ifdef __LITTLE_ENDIAN
u8 wmode; /* Writer mode */
u8 rcnts[3]; /* Reader counts */
#else
u8 rcnts[3]; /* Reader counts */
u8 wmode; /* Writer mode */
#endif
};
};
arch_spinlock_t lock;
};
typedef struct qrwlock {
atomic_t cnts;
arch_spinlock_t wait_lock;
} arch_rwlock_t;
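cnts is interpreted through a small set of constants; roughly (a sketch following include/asm-generic/qrwlock.h):
#define _QW_WAITING	1		/* a writer is waiting for the lock */
#define _QW_LOCKED	0xff		/* a writer holds the lock (whole wmode byte) */
#define _QW_WMASK	0xff		/* writer mode mask (low byte of cnts) */
#define _QR_SHIFT	8		/* reader count lives above the writer byte */
#define _QR_BIAS	(1U << _QR_SHIFT)	/* one reader's worth of count */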
#define arch_read_lock(l) queued_read_lock(l)
#define arch_write_lock(l) queued_write_lock(l)
/**
* queued_read_lock - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
*/
static inline void queued_read_lock(struct qrwlock *lock)
{
u32 cnts;
//bump the reader count right away; if no writer is present the read lock is taken and we return, otherwise fall into the slowpath
cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts);
if (likely(!(cnts & _QW_WMASK)))
return;
/* The slowpath will decrement the reader count, if necessary. */
//the count was already bumped above, so the slowpath may have to drop it again
queued_read_lock_slowpath(lock, cnts);
}
/**
* queued_write_lock - acquire write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
static inline void queued_write_lock(struct qrwlock *lock)
{
/* Optimize for the unfair lock case where the fair flag is 0. */
//if the lock is completely free (no readers and no writer), take it directly
if (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)
return;
//otherwise take the slowpath
queued_write_lock_slowpath(lock);
}
/**
* queued_read_unlock - release read lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
static inline void queued_read_unlock(struct qrwlock *lock)
{
/*
* Atomically decrement the reader count
*/
(void)atomic_sub_return_release(_QR_BIAS, &lock->cnts);//release the read lock: subtract one reader's worth (_QR_BIAS, i.e. 1 << _QR_SHIFT) from the count
}
/**
* queued_write_unlock - release write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
static inline void queued_write_unlock(struct qrwlock *lock)
{
smp_store_release((u8 *)&lock->cnts, 0);//release the write lock: clear the writer byte (the low byte of cnts) with release semantics
}
/**
* queued_read_lock_slowpath - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
* @cnts: Current qrwlock lock value
*/
void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
{
/*
* Readers come here when they cannot get the lock without waiting
*/
if (unlikely(in_interrupt())) {//interrupt-context path
/*
* Readers in interrupt context will get the lock immediately
* if the writer is just waiting (not holding the lock yet).
* The rspin_until_writer_unlock() function returns immediately
* in this case. Otherwise, they will spin (with ACQUIRE
* semantics) until the lock is available without waiting in
* the queue.
*/
//in interrupt context the reader gets priority: even if another CPU has already set _QW_WAITING,
//this reader runs first and only spins if a writer actually holds the lock
rspin_until_writer_unlock(lock, cnts);
return;
}
atomic_sub(_QR_BIAS, &lock->cnts);//back out the reader-count bump from the fast path
/*
* Put the reader into the wait queue
*/
arch_spin_lock(&lock->wait_lock);//join the wait_lock queue
/*
* The ACQUIRE semantics of the following spinning code ensure
* that accesses can't leak upwards out of our subsequent critical
* section in the case that the lock is currently held for write.
*/
cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
rspin_until_writer_unlock(lock, cnts);//wait until the writer releases the lock
/*
* Signal the next one in queue to become queue head (release wait_lock so the next waiter can enter the contention path)
*/
arch_spin_unlock(&lock->wait_lock);
}
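rspin_until_writer_unlock(), used above, spins only while a writer actually holds the lock (wmode == _QW_LOCKED) and returns immediately if the writer is merely waiting; roughly (a sketch following kernel/locking/qrwlock.c of this era):
static __always_inline void
rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
{
	while ((cnts & _QW_WMASK) == _QW_LOCKED) {
		cpu_relax_lowlatency();
		cnts = atomic_read_acquire(&lock->cnts);
	}
}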
/**
* queued_write_lock_slowpath - acquire write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
void queued_write_lock_slowpath(struct qrwlock *lock)
{
u32 cnts;
/* Put the writer into the wait queue; wait_lock is itself a qspinlock */
arch_spin_lock(&lock->wait_lock);
/* Try to acquire the lock directly if no reader is present */
//try to take the lock directly; on success drop wait_lock and return
if (!atomic_read(&lock->cnts) &&
(atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
goto unlock;
/*
* Set the waiting flag to notify readers that a writer is pending,
* or wait for a previous writer to go away.
*/
//tell the active readers that a writer is waiting, or wait for the previous writer to finish
for (;;) {
struct __qrwlock *l = (struct __qrwlock *)lock;
//1. If no writer is present (readers may still be), set wmode to tell readers that a writer is waiting, so new readers must queue for the lock.
//2. If a writer is present, loop until it releases the lock. When that writer drops the lock, a CPU that is just entering
//   queued_read_lock()/queued_write_lock() may still barge in ahead of us, but anyone already queued is held back by wait_lock,
//   so the amount of barging is bounded and the lock cannot starve.
if (!READ_ONCE(l->wmode) &&
(cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))//a CAS is needed here because a barging locker may race with us
break;
cpu_relax_lowlatency();
}
//_QW_WAITING is now set: new readers and writers alike have to queue on wait_lock
/* When no more readers, set the locked flag */
//wait for all readers to drain, then switch _QW_WAITING to _QW_LOCKED
for (;;) {
cnts = atomic_read(&lock->cnts);
//the cmpxchg is needed because readers may still change cnts concurrently: queued_read_lock() does
//atomic_add on the fast path and, on failure, queued_read_lock_slowpath() does atomic_sub to back it out
if ((cnts == _QW_WAITING) &&
(atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
_QW_LOCKED) == _QW_WAITING))
break;
cpu_relax_lowlatency();
}
unlock:
arch_spin_unlock(&lock->wait_lock);
}