From the ftrace output, unsurprisingly, the biggest time sink is indeed __ocfs2_cluster_lock():
 0)               |  ocfs2_inode_lock_full_nested() {
 0)   0.000 us    |    ocfs2_wait_for_recovery();
 0) ! 12026.56 us |    __ocfs2_cluster_lock();
 0)   0.000 us    |    ocfs2_wait_for_recovery();
 0)   0.000 us    |    ocfs2_inode_lock_update();
 0) ! 12026.56 us |  }
 0)   0.000 us    |  ocfs2_inode_unlock();
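As an aside, a trace like this can be reproduced with the function_graph tracer alone. The small userspace helper below is only a sketch of that setup, not something from the original measurement: it assumes tracefs is mounted at /sys/kernel/debug/tracing, that it runs as root, and that the slow OCFS2 workload is triggered during the sleep window.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define TRACE_DIR "/sys/kernel/debug/tracing/"	/* assumed tracefs mount point */

static int write_str(const char *file, const char *val)
{
	char path[256];
	int fd, ret;

	snprintf(path, sizeof(path), TRACE_DIR "%s", file);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ret = write(fd, val, strlen(val)) < 0 ? -1 : 0;
	close(fd);
	return ret;
}

int main(void)
{
	/* Graph only ocfs2_inode_lock_full_nested() and its children. */
	write_str("set_graph_function", "ocfs2_inode_lock_full_nested");
	write_str("current_tracer", "function_graph");
	write_str("tracing_on", "1");

	sleep(10);			/* run the slow workload during this window */

	write_str("tracing_on", "0");
	return system("cat " TRACE_DIR "trace");	/* dump the captured call graph */
}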
Likewise, a quick pass over this function (fs/ocfs2/dlmglue.c, around line 2276 in this kernel):
/*
 * returns < 0 error if the callback will never be called, otherwise
 * the result of the lock will be communicated via the callback.
 */
int ocfs2_inode_lock_full_nested(struct inode *inode,
				 struct buffer_head **ret_bh,
				 int ex,
				 int arg_flags,
				 int subclass)
{
	/* here: arg_flags == 0, subclass == OI_LS_NORMAL */
	int status, level, acquired;
	u32 dlm_flags;
	struct ocfs2_lock_res *lockres = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *local_bh = NULL;

	BUG_ON(!inode);

	/* Judging from this log message, the inode lock is the META lock.
	 * Question: is there a corresponding address_space lock? I suspect
	 * not, because it is the inode that is shared between nodes; each
	 * node's page cache is independent. */
	mlog(0, "inode %llu, take %s META lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     ex ? "EXMODE" : "PRMODE");

	status = 0;
	acquired = 0;
	/* We'll allow faking a readonly metadata lock for
	 * rodevices. */
	if (ocfs2_is_hard_readonly(osb)) {
		if (ex)
			status = -EROFS;
		goto bail;
	}

	if (ocfs2_mount_local(osb))
		goto local;

	/* Everything from here down to the "local:" label is the
	 * cluster-specific handling. arg_flags is 0 on entry, so this
	 * branch is always taken. */
	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
		ocfs2_wait_for_recovery(osb);

	/* ->ip_inode_lockres should be the META lock; ->ip_rw_lockres is
	 * presumably the read/write, i.e. data, lock. */
	lockres = &OCFS2_I(inode)->ip_inode_lockres;
	/* level == DLM_LOCK_EX in the traced case, since ex != 0 */
	level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
	dlm_flags = 0;
	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
		dlm_flags |= DLM_LKF_NOQUEUE;

	/* dlm_flags == 0, arg_flags == 0 */
	status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
				      arg_flags, subclass, _RET_IP_);
	if (status < 0) {
		if (status != -EAGAIN && status != -EIOCBRETRY)
			mlog_errno(status);
		goto bail;
	}

	/* Notify the error cleanup path to drop the cluster lock. */
	acquired = 1;

	/* We wait twice because a node may have died while we were in
	 * the lower dlm layers. The second time though, we've
	 * committed to owning this lock so we don't allow signals to
	 * abort the operation. */
	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
		ocfs2_wait_for_recovery(osb);

local:
	/*
	 * We only see this flag if we're being called from
	 * ocfs2_read_locked_inode(). It means we're locking an inode
	 * which hasn't been populated yet, so clear the refresh flag
	 * and let the caller handle it.
	 */
	if (inode->i_state & I_NEW) {
		status = 0;	/* this path did not show up in the ftrace output? */
		if (lockres)
			ocfs2_complete_lock_res_refresh(lockres, 0);
		goto bail;
	}

	/* This is fun. The caller may want a bh back, or it may
	 * not. ocfs2_inode_lock_update definitely wants one in, but
	 * may or may not read one, depending on what's in the
	 * LVB. The result of all of this is that we've *only* gone to
	 * disk if we have to, so the complexity is worthwhile. */

	/* With the inode lock held, this first discards the cached inode
	 * metadata and then calls ocfs2_refresh_inode_from_lvb() to update
	 * a few key fields of the inode; the LVB is reportedly refreshed
	 * via network transfer. */
	status = ocfs2_inode_lock_update(inode, &local_bh);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto bail;
	}

	if (ret_bh) {
		status = ocfs2_assign_bh(inode, ret_bh, local_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

bail:
	if (status < 0) {
		if (ret_bh && (*ret_bh)) {
			brelse(*ret_bh);
			*ret_bh = NULL;
		}
		if (acquired)
			ocfs2_inode_unlock(inode, ex);
	}

	if (local_bh)
		brelse(local_bh);

	return status;
}
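To put the function in context, here is a hedged sketch of what a typical call site looks like; show_inode_size() is a hypothetical helper, and the snippet assumes the ocfs2 internal headers (dlmglue.h, inode.h). The commonly used ocfs2_inode_lock() is a thin wrapper that passes arg_flags = 0 and subclass = OI_LS_NORMAL, which is exactly the case traced above.

/* Hypothetical caller, for illustration only. */
static int show_inode_size(struct inode *inode)
{
	struct buffer_head *bh = NULL;
	int status;

	/* ex = 0: a PRMODE (shared) cluster lock is enough to read metadata. */
	status = ocfs2_inode_lock(inode, &bh, 0);
	if (status < 0) {
		mlog_errno(status);
		return status;
	}

	/* bh now holds the inode's on-disk block, refreshed from the LVB
	 * or from disk by ocfs2_inode_lock_update() as seen above. */
	pr_info("inode %llu: i_size=%lld\n",
		(unsigned long long)OCFS2_I(inode)->ip_blkno,
		(long long)i_size_read(inode));

	ocfs2_inode_unlock(inode, 0);
	brelse(bh);
	return 0;
}

Whether PRMODE or EXMODE is requested also matters for the stall seen above: an EX request cannot be granted until the other nodes downconvert their locks, which is presumably where the ~12 ms inside __ocfs2_cluster_lock() went.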