先看一下ftrace的结果。可惜ocfs2_wait_for_mask不能被ftrace跟踪,也不知道为什么。不过,它直接调用了wait_for_completion,所以就用这个函数来代替;这个函数耗费的时间最多!
0) | __ocfs2_cluster_lock() { 0) | wait_for_completion() { ------------------------------------------ 0) iomaker-10882 => ocfs2dc-10793 ------------------------------------------ 0) 0.000 us | ocfs2_dlm_lock(); ------------------------------------------ 0) ocfs2dc-10793 => iomaker-10882 ------------------------------------------ 0) ! 11609.94 us | } 0) 0.000 us | ocfs2_dlm_lock(); 0) ! 443.137 us | wait_for_completion(); 0) ! 12053.08 us | } /* __ocfs2_cluster_lock */
这个函数分析起来没那么容易,又臭又长。分段过一遍代码吧:
1362 static int __ocfs2_cluster_lock(struct ocfs2_super *osb, 1363 struct ocfs2_lock_res *lockres, 1364 int level, 1365 u32 lkm_flags, 1366 int arg_flags, 1367 int l_subclass, 1368 unsigned long caller_ip) 1369 { // lockres是->ip_inode_lockres, level=EX, lkm_flags=0, arg_flags=0, subclass=IO_LS_NORMAL // caller_ip=__RET_IP_,不知道为什么需要这个参数? 1370 struct ocfs2_mask_waiter mw; 1371 int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); 1372 int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ 1373 unsigned long flags; 1374 unsigned int gen; 1375 int noqueue_attempted = 0; 1376 //ocfs2_mask_waiter结构设计的挺巧妙;mask暗指标志位掩码,lockres->l_flags有许多标志位,如OCFS2_LOCK_BUSY, //OCFS2_LOCK_BLOCKED,OCFS2_LOCK_PENDING,etc.,->mw_mask用来指示哪一个bit,->mw_goal用来表示希望这个 //bit是0或1; waiter就意味着wait_for_completion这个位变成我们想要的值。 1377 ocfs2_init_mask_waiter(&mw); 1378 //ocfs2_inode_inode_lops->flags=LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB //所以if成立 1379 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) 1380 lkm_flags |= DLM_LKF_VALBLK;
割...
1382 again: //哪些情况会goto到这儿? //#1493行,从ocfs2_dlm_lock中成功返回,但是BUSY标记还没有清除掉,这意味着ast还没被调用或返回,因为所有类型的ast都去清除这个标记; //#1520行,因为args_flags=0,#1514行的if语句不成立,所以这行根本执行不到; //#1525行,1524行if语句一定成立,即ret=0 1383 wait = 0; 1384 1385 spin_lock_irqsave(&lockres->l_lock, flags); 1386 1387 if (catch_signals && signal_pending(current)) { 1388 ret = -ERESTARTSYS; 1389 goto unlock; 1390 } 1391 1392 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, 1393 "Cluster lock called on freeing lockres %s! flags " 1394 "0x%lx\n", lockres->l_name, lockres->l_flags); 1395 1396 /* We only compare against the currently granted level 1397 * here. If the lock is blocked waiting on a downconvert, 1398 * we'll get caught below. */ 1399 if (lockres->l_flags & OCFS2_LOCK_BUSY && 1400 level > lockres->l_level) { //BUSY表示还有dlm lock请求没有返回,必须等着... 1401 /* is someone sitting in dlm_lock? If so, wait on 1402 * them. */ 1403 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); 1404 wait = 1; 1405 goto unlock; //unlock处,#1502行,不能理解!!! 总之,很快就进入等待函数了... 1406 } 1407 1408 if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) { 1409 /* 1410 * We've upconverted. If the lock now has a level we can 1411 * work with, we take it. If, however, the lock is not at the 1412 * required level, we go thru the full cycle. One way this could 1413 * happen is if a process requesting an upconvert to PR is 1414 * closely followed by another requesting upconvert to an EX. 1415 * If the process requesting EX lands here, we want it to 1416 * continue attempting to upconvert and let the process 1417 * requesting PR take the lock. 1418 * If multiple processes request upconvert to PR, the first one 1419 * here will take the lock. The others will have to go thru the 1420 * OCFS2_LOCK_BLOCKED check to ensure that there is no pending 1421 * downconvert request. 
1422 */ //这段注释非常清楚 1423 if (level <= lockres->l_level) 1424 goto update_holders; 1425 } 1426 1427 if (lockres->l_flags & OCFS2_LOCK_BLOCKED && 1428 !ocfs2_may_continue_on_blocked_lock(lockres, level)) { //BLOCKED: blocked waiting for downconvert; //ocfs2_may_continue_on...在想要的锁和->l_blocking兼容时,返回1; 我猜是为了避免重复等待 1429 /* is the lock is currently blocked on behalf of 1430 * another node */ 1431 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0); 1432 wait = 1; 1433 goto unlock;
割...
1436 if (level > lockres->l_level) { //申请的锁级别要高于当前granted lock level 1437 if (noqueue_attempted > 0) { //noqueue_attempted一直等于0,所以可以无视这个if语句; 1438 ret = -EAGAIN; 1439 goto unlock; 1440 } //lkm_flags不会将DLM_LKF_NOQUEUE置位,所以也可以无视这个if语句 1441 if (lkm_flags & DLM_LKF_NOQUEUE) 1442 noqueue_attempted = 1; 1443 //->l_action用来指示ast回调时执行哪个动作,有OCFS2_AST_ATTACH, OCFS2_AST_CONVERT, OCFS2_AST_DOWNCONVERT; 1444 if (lockres->l_action != OCFS2_AST_INVALID) 1445 mlog(ML_ERROR, "lockres %s has action %u pending\n", 1446 lockres->l_name, lockres->l_action); 1447 1448 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { //如果OCFS2_LOCK_ATTACHED为0,表示该锁资源的LVB还没有初始化,也意味着这是初次对该资源加锁; 1449 lockres->l_action = OCFS2_AST_ATTACH; 1450 lkm_flags &= ~DLM_LKF_CONVERT; 1451 } else { //否则,一定是申请锁转换 1452 lockres->l_action = OCFS2_AST_CONVERT; 1453 lkm_flags |= DLM_LKF_CONVERT; 1454 } 1455 1456 lockres->l_requested = level; 1457 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 1458 gen = lockres_set_pending(lockres); 1459 spin_unlock_irqrestore(&lockres->l_lock, flags); 1460 1461 BUG_ON(level == DLM_LOCK_IV); 1462 BUG_ON(level == DLM_LOCK_NL); 1463 1464 mlog(ML_BASTS, "lockres %s, convert from %d to %d\n", 1465 lockres->l_name, lockres->l_level, level); 1467 /* call dlm_lock to upgrade lock now */ 1468 ret = ocfs2_dlm_lock(osb->cconn, 1469 level, 1470 &lockres->l_lksb, 1471 lkm_flags, 1472 lockres->l_name, 1473 OCFS2_LOCK_ID_MAX_LEN - 1); 1474 lockres_clear_pending(lockres, gen, osb); 1475 if (ret) { 1476 if (!(lkm_flags & DLM_LKF_NOQUEUE) || 1477 (ret != -EAGAIN)) { 1478 ocfs2_log_dlm_error("ocfs2_dlm_lock", 1479 ret, lockres); 1480 } 1481 ocfs2_recover_from_dlm_error(lockres, 1); 1482 goto out; 1483 } 1484 1485 mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n", 1486 lockres->l_name); 1487 1488 /* At this point we've gone inside the dlm and need to 1489 * complete our work regardless. */ 1490 catch_signals = 0; 1491 1492 /* wait for busy to clear and carry on */ 1493 goto again; 1494 }
割...
1496 update_holders: 1497 /* Ok, if we get here then we're good to go. */ //能走到这一步,说明已经成功拿到了想要的锁 1498 ocfs2_inc_holders(lockres, level); 1499 1500 ret = 0; 1501 unlock: //#1502行,不清楚要干什么? 1502 lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); 1503 1504 spin_unlock_irqrestore(&lockres->l_lock, flags); 1505 out: 1506 /* 1507 * This is helping work around a lock inversion between the page lock 1508 * and dlm locks. One path holds the page lock while calling aops 1509 * which block acquiring dlm locks. The voting thread holds dlm 1510 * locks while acquiring page locks while down converting data locks. 1511 * This block is helping an aop path notice the inversion and back 1512 * off to unlock its page lock before trying the dlm lock again. 1513 */ //因为args_flags=0,这个if语句不会成立,直接无视 1514 if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && 1515 mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { 1516 wait = 0; 1517 if (lockres_remove_mask_waiter(lockres, &mw)) 1518 ret = -EAGAIN; 1519 else 1520 goto again; 1521 } //资源被占着,而且锁不兼容,只能慢慢等了!!! 1522 if (wait) { 1523 ret = ocfs2_wait_for_mask(&mw); 1524 if (ret == 0) 1525 goto again; 1526 mlog_errno(ret); 1527 } 1528 ocfs2_update_lock_stats(lockres, level, &mw, ret); 1542 return ret; 1543 }