Linux 4.4 kernel MCS / queued spinlocks

Reference: https://www.ibm.com/developerworks/cn/linux/l-cn-mcsspinlock/index.html

The qspinlock implementation uses MCS internally; callers still go through the usual spin_lock/spin_unlock and spin_lock_bh/spin_unlock_bh interfaces, so the change is transparent to users.

The main MCS optimization is moving most of the spinning onto a per-CPU node, so that multiple CPUs do not keep reading and writing the same lock word (every modification of a global lock invalidates the cacheline on all other processors, and the resulting cache-coherency traffic loads the system bus and memory, significantly hurting overall performance). Queuing is implemented with a linked list of these nodes, as the sketch below illustrates.
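To make the idea concrete, here is a minimal user-space MCS lock sketch using C11 atomics. It is not kernel code and all names are illustrative: each waiter spins only on the locked flag of its own node, and unlock hands the lock to the successor by clearing that flag.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct mcs_node {
	_Atomic(struct mcs_node *) next;
	atomic_bool locked;		/* true while this waiter must keep spinning */
};

struct mcs_lock {
	_Atomic(struct mcs_node *) tail;	/* NULL when the lock is free */
};

static void mcs_acquire(struct mcs_lock *lock, struct mcs_node *node)
{
	struct mcs_node *prev;

	atomic_store(&node->next, NULL);
	atomic_store(&node->locked, true);

	/* Publish ourselves as the new tail; prev is the old tail. */
	prev = atomic_exchange(&lock->tail, node);
	if (prev) {
		/* Queue was not empty: link in and spin on our *own* flag only. */
		atomic_store(&prev->next, node);
		while (atomic_load_explicit(&node->locked, memory_order_acquire))
			;	/* cpu_relax()/pause in real code */
	}
	/* prev == NULL: the lock was free and we now own it. */
}

static void mcs_release(struct mcs_lock *lock, struct mcs_node *node)
{
	struct mcs_node *next = atomic_load(&node->next);

	if (!next) {
		/* No visible successor: try to swing the tail back to empty. */
		struct mcs_node *expected = node;
		if (atomic_compare_exchange_strong(&lock->tail, &expected, NULL))
			return;
		/* A successor is in the middle of linking in; wait for it. */
		while (!(next = atomic_load(&node->next)))
			;
	}
	/* Hand over: clear the successor's private flag with release semantics. */
	atomic_store_explicit(&next->locked, false, memory_order_release);
}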



Source:

#define arch_spin_lock(l)		queued_spin_lock(l)
#define arch_spin_unlock(l)		queued_spin_unlock(l)
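Nothing changes at the call site; a spinlock is still taken with spin_lock()/spin_unlock(), which on a qspinlock kernel end up in queued_spin_lock()/queued_spin_unlock(). A small illustrative example (the lock name and function are made up):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);

static void example(void)
{
	spin_lock(&my_lock);	/* -> raw_spin_lock -> arch_spin_lock -> queued_spin_lock */
	/* critical section */
	spin_unlock(&my_lock);	/* -> ... -> arch_spin_unlock -> queued_spin_unlock */
}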

//spinlock

A qspinlock is a single atomic value split into the bitfields below; the layout differs depending on the number of CPUs.

/*
 * Bitfields in the atomic value:
 *
 * When NR_CPUS < 16K
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used
 * 16-17: tail index
 * 18-31: tail cpu (+1)
 *
 * When NR_CPUS >= 16K
 *  0- 7: locked byte
 *     8: pending
 *  9-10: tail index
 * 11-31: tail cpu (+1)
 */
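From memory, the masks in qspinlock_types.h are derived from this layout roughly as follows (check the header in your tree; the kernel uses a _Q_SET_MASK() helper, expanded here for readability):

#define _Q_LOCKED_OFFSET	0
#define _Q_LOCKED_BITS		8
#define _Q_LOCKED_MASK		(((1U << _Q_LOCKED_BITS) - 1) << _Q_LOCKED_OFFSET)

#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
#if CONFIG_NR_CPUS < (1U << 14)
#define _Q_PENDING_BITS		8	/* bits 9-15 unused; pending gets a full byte */
#else
#define _Q_PENDING_BITS		1
#endif

#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
#define _Q_TAIL_IDX_BITS	2	/* up to 4 nesting contexts: task/softirq/hardirq/nmi */

#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)

#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)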

/**
 * queued_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 */
static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
	u32 val;
        /* try to take the lock with a single cmpxchg; on success return immediately, otherwise queue up in queued_spin_lock_slowpath() */
	val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);
	if (likely(val == 0))
		return;
	queued_spin_lock_slowpath(lock, val);
}
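The slowpath below also calls queued_spin_trylock(), which is not shown in this post. Sketched from the generic header of that era (the exact cmpxchg variant may differ), it is the same cmpxchg guarded by a plain read so the atomic is skipped when the lock is visibly busy:

static __always_inline int queued_spin_trylock(struct qspinlock *lock)
{
	/* only attempt the cmpxchg if the lock word currently looks free */
	if (!atomic_read(&lock->val) &&
	   (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0))
		return 1;
	return 0;
}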
/**
 * queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
 *                       :       | ^--'              |           |  :
 *                       :       v                   |           |  :
 * uncontended           :    (n,x,y) +--> (n,0,0) --'           |  :
 *   queue               :       | ^--'                          |  :
 *                       :       v                               |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
	struct mcs_spinlock *prev, *next, *node;
	u32 new, old, tail;
	int idx;

	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

	if (pv_enabled())
		goto queue;

	if (virt_spin_lock(lock))
		return;

	/*
	 * wait for in-progress pending->locked hand-overs
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
			cpu_relax();
	}

	/*
	 * trylock || pending
	 *
	 * 0,0,0 -> 0,0,1 ; trylock
	 * 0,0,1 -> 0,1,1 ; pending
	 */
	for (;;) {
		/*
		 * If we observe any contention; queue.
		 */
		if (val & ~_Q_LOCKED_MASK)	/* any tail or pending bit set means contention: go queue */
			goto queue;

		new = _Q_LOCKED_VAL;
		if (val == new)
			new |= _Q_PENDING_VAL;	/* lock is held: set pending to announce we are next */

		old = atomic_cmpxchg(&lock->val, val, new);
		if (old == val)
			break;	/* cmpxchg succeeded: we either took the lock or set pending */

		val = old;
	}

	/*
	 * we won the trylock
	 */
	if (new == _Q_LOCKED_VAL)
		return;	/* we got the lock outright */

	/* pending is set: other CPUs (apart from the owner) now either wait at the (val == _Q_PENDING_VAL) check above or go into the queue */
	/*
	 * we're pending, wait for the owner to go away.
	 *
	 * *,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all clear_pending_set_locked()
	 * implementations imply full barriers.
	 */
	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
		cpu_relax();

	/*
	 * take ownership and clear the pending bit.
	 *
	 * *,1,0 -> *,0,1
	 */
	/* clear pending and set the locked byte; once pending is clear, another CPU can become the pending waiter or queue up */
	clear_pending_set_locked(lock);
	return;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	node = this_cpu_ptr(&mcs_nodes[0]);
	idx = node->count++;
	tail = encode_tail(smp_processor_id(), idx);	/* encode (cpu, idx) into the tail bits */

	node += idx;	/* pick this CPU's node for the current nesting level */
	node->locked = 0;
	node->next = NULL;
	pv_init_node(node);

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	/* try the lock once more before queuing */
	if (queued_spin_trylock(lock))
		goto release;

	/*
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);	/* publish this node as the new queue tail */

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {	/* there was a previous tail: decode it */
		prev = decode_tail(old);
		WRITE_ONCE(prev->next, node);	/* link this node onto the MCS list */

		pv_wait_node(node);	/* paravirt hook */
		arch_mcs_spin_lock_contended(&node->locked);	/* spin until our predecessor sets node->locked */
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 *
	 */
	pv_wait_head(lock, node);	/* paravirt hook: wait as queue head */
	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)	/* wait for both the locked and pending bits to clear */
		cpu_relax();


	/* we are now at the head of the queue and owner/pending are gone: claim the lock; everyone else has to queue */
	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,0,0 -> *,0,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail),
	 * clear the tail code and grab the lock. Otherwise, we only need
	 * to grab the lock.
	 */
	for (;;) {
		if (val != tail) {	/* someone queued behind us: just set the locked byte, leave the tail intact */
			set_locked(lock);
			break;
		}
		old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
		if (old == val)
			goto release;	/* No contention */

		val = old;
	}

	/*
	 * contended path; wait for next, release.
	 */
	while (!(next = READ_ONCE(node->next)))	/* wait for the successor to finish linking itself in */
		cpu_relax();

	/* set next->locked to hand MCS ownership to the successor; pv_kick_node wakes its vCPU if needed */
	arch_mcs_spin_unlock_contended(&next->locked);
	pv_kick_node(lock, next);

release:
	/*
	 * release the node
	 */
	this_cpu_dec(mcs_nodes[0].count);
}
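For completeness, two pieces referenced above are not shown in this post. The sketches below follow my recollection of the 4.4 generic code and may differ in detail from your tree: the unlock side only has to clear the locked byte (x86 overrides this with a plain release byte store of 0), and encode_tail()/decode_tail() map between (cpu, idx) and the tail bits.

static __always_inline void queued_spin_unlock(struct qspinlock *lock)
{
	/* only the locked byte is released; pending and tail belong to the waiters */
	smp_mb__before_atomic();
	atomic_sub(_Q_LOCKED_VAL, &lock->val);
}

static inline u32 encode_tail(int cpu, int idx)
{
	u32 tail;

	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;	/* +1 so that tail == 0 means "no waiter" */
	tail |= idx << _Q_TAIL_IDX_OFFSET;
	return tail;
}

static inline struct mcs_spinlock *decode_tail(u32 tail)
{
	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

	return per_cpu_ptr(&mcs_nodes[idx], cpu);	/* back to the waiter's per-cpu node */
}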

//qrwlock

The lock is made up of two fields, cnts and wait_lock; cnts is further split into wmode and rcnts. wait_lock is used to queue contending lockers, wmode is the writer flag, and rcnts holds the reader count.

rwlocks are meant for workloads with many concurrent readers and only rare write contention.

struct __qrwlock {
	union {
		atomic_t cnts;
		struct {
#ifdef __LITTLE_ENDIAN
			u8 wmode;	/* Writer mode   */
			u8 rcnts[3];	/* Reader counts */
#else
			u8 rcnts[3];	/* Reader counts */
			u8 wmode;	/* Writer mode   */
#endif
		};
	};
	arch_spinlock_t	lock;
};

typedef struct qrwlock {
	atomic_t		cnts;
	arch_spinlock_t		wait_lock;
} arch_rwlock_t;
#define arch_read_lock(l)	queued_read_lock(l)
#define arch_write_lock(l)	queued_write_lock(l)
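The bit constants used below are, from memory, roughly the following (see qrwlock.h in your tree):

#define _QW_WAITING	1		/* a writer is waiting (wmode byte)      */
#define _QW_LOCKED	0xff		/* a writer holds the lock (wmode byte)  */
#define _QW_WMASK	0xff		/* writer mode mask: low byte of cnts    */
#define _QR_SHIFT	8		/* reader count lives above the low byte */
#define _QR_BIAS	(1U << _QR_SHIFT)	/* one reader == +0x100 on cnts  */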


/**
 * queued_read_lock - acquire read lock of a queue rwlock
 * @lock: Pointer to queue rwlock structure
 */
static inline void queued_read_lock(struct qrwlock *lock)
{
	u32 cnts;

	/* bump the reader count; if no writer is present the read lock is taken and we return, otherwise enter the slowpath */
	cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts);
	if (likely(!(cnts & _QW_WMASK)))
		return;

	/* The slowpath will decrement the reader count, if necessary. */
	/* the count was already incremented above, so the slowpath may have to undo it */
	queued_read_lock_slowpath(lock, cnts);
}

/**
 * queued_write_lock - acquire write lock of a queue rwlock
 * @lock : Pointer to queue rwlock structure
 */
static inline void queued_write_lock(struct qrwlock *lock)
{
	/* Optimize for the unfair lock case where the fair flag is 0. */
	/* if the lock is completely free (no readers, no writer), take it directly */
	if (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)
		return;

	/* otherwise go through the slowpath */
	queued_write_lock_slowpath(lock);
}

/**
 * queued_read_unlock - release read lock of a queue rwlock
 * @lock : Pointer to queue rwlock structure
 */
static inline void queued_read_unlock(struct qrwlock *lock)
{
	/*
	 * Atomically decrement the reader count
	 */
	(void)atomic_sub_return_release(_QR_BIAS, &lock->cnts);	/* drop the read lock: subtract _QR_BIAS (0x100) from cnts */
}

/**
 * queued_write_unlock - release write lock of a queue rwlock
 * @lock : Pointer to queue rwlock structure
 */
static inline void queued_write_unlock(struct qrwlock *lock)
{
	smp_store_release((u8 *)&lock->cnts, 0);	/* drop the write lock: clear the writer byte (low byte of cnts on little-endian) */
}
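As with qspinlock, all of this hides behind the usual rwlock API. An illustrative usage example (lock name and functions are made up):

#include <linux/spinlock.h>

static DEFINE_RWLOCK(my_rwlock);

static void reader(void)
{
	read_lock(&my_rwlock);		/* -> arch_read_lock -> queued_read_lock */
	/* read-side critical section */
	read_unlock(&my_rwlock);
}

static void writer(void)
{
	write_lock(&my_rwlock);		/* -> arch_write_lock -> queued_write_lock */
	/* write-side critical section */
	write_unlock(&my_rwlock);
}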


/**
 * queued_read_lock_slowpath - acquire read lock of a queue rwlock
 * @lock: Pointer to queue rwlock structure
 * @cnts: Current qrwlock lock value
 */
void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
{
	/*
	 * Readers come here when they cannot get the lock without waiting
	 */
	if (unlikely(in_interrupt())) {	/* interrupt context */
		/*
		 * Readers in interrupt context will get the lock immediately
		 * if the writer is just waiting (not holding the lock yet).
		 * The rspin_until_writer_unlock() function returns immediately
		 * in this case. Otherwise, they will spin (with ACQUIRE
		 * semantics) until the lock is available without waiting in
		 * the queue.
		 */
		 /* a reader in interrupt context gets priority: if another CPU has only
		  * set _QW_WAITING (the writer does not hold the lock yet), this reader
		  * is allowed to go first instead of queuing */
		rspin_until_writer_unlock(lock, cnts);
		return;
	}
	atomic_sub(_QR_BIAS, &lock->cnts);	/* undo the reader-count increment from the fastpath */

	/*
	 * Put the reader into the wait queue
	 */
	arch_spin_lock(&lock->wait_lock);	/* queue up on wait_lock */

	/*
	 * The ACQUIRE semantics of the following spinning code ensure
	 * that accesses can't leak upwards out of our subsequent critical
	 * section in the case that the lock is currently held for write.
	 */
	cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
	rspin_until_writer_unlock(lock, cnts);	/* wait for the writer to release the lock */

	/*
	 * Signal the next one in queue to become queue head (drop wait_lock so the next waiter can start contending)
	 */
	arch_spin_unlock(&lock->wait_lock);
}
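rspin_until_writer_unlock() is not shown above. Sketched from memory of the 4.4 source (the exact acquire primitive may differ), it spins only while a writer actually holds the lock (wmode == _QW_LOCKED), which is why it returns immediately when a writer is merely waiting:

static __always_inline void
rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
{
	/* keep re-reading cnts until the writer byte no longer says "locked" */
	while ((cnts & _QW_WMASK) == _QW_LOCKED) {
		cpu_relax_lowlatency();
		cnts = atomic_read_acquire(&lock->cnts);
	}
}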

/**
 * queued_write_lock_slowpath - acquire write lock of a queue rwlock
 * @lock : Pointer to queue rwlock structure
 */
void queued_write_lock_slowpath(struct qrwlock *lock)
{
	u32 cnts;

	/* Put the writer into the wait queue; wait_lock is itself a qspinlock */
	arch_spin_lock(&lock->wait_lock);

	/* Try to acquire the lock directly if no reader is present */
	/* on success, jump to unlock: release wait_lock and return */
	if (!atomic_read(&lock->cnts) &&
	    (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
		goto unlock;

	/*
	 * Set the waiting flag to notify readers that a writer is pending,
	 * or wait for a previous writer to go away.
	 */
	 /* tell current readers that a writer is pending, or wait for a previous writer to finish */
	for (;;) {
		struct __qrwlock *l = (struct __qrwlock *)lock;

		/*
		 * 1. No writer present (readers may still hold the lock): set wmode to
		 *    _QW_WAITING so that new readers have to queue for the lock.
		 * 2. A writer is present: loop until it releases the lock. At that moment a
		 *    CPU just entering queued_read_lock/queued_write_lock may still barge in,
		 *    but CPUs already queued on wait_lock cannot, so barging is bounded and
		 *    the queued waiters are not starved.
		 */
		if (!READ_ONCE(l->wmode) &&
		   (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))	/* cmpxchg because a barging locker may race with us */
			break;

		cpu_relax_lowlatency();
	}

	/* _QW_WAITING is now set, so new readers and writers all have to queue on wait_lock */
	/* When no more readers, set the locked flag */
	/* wait for all readers to drain, then set _QW_LOCKED */
	for (;;) {
		cnts = atomic_read(&lock->cnts);

		/*
		 * The cmpxchg has to be atomic against readers: queued_read_lock_slowpath()
		 * may be doing a concurrent atomic_sub (read_lock first does atomic_add and,
		 * if it fails to get the lock, undoes it with atomic_sub in the slowpath).
		 */
		if ((cnts == _QW_WAITING) &&
		    (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,  
					    _QW_LOCKED) == _QW_WAITING))
			break;

		cpu_relax_lowlatency();
	}
unlock:
	arch_spin_unlock(&lock->wait_lock);
}

