ftrace:
 3)               |  ocfs2_readpage() {
 3)               |    ocfs2_inode_lock_with_page() {
 3)   0.126 us    |      down_read();
 3)   0.098 us    |      up_read();
 3) + 11.225 us   |      ocfs2_inode_unlock();
 3) ! 11945.43 us |    }
 3) ! 11946.01 us |  }
代码分析:
274 static int ocfs2_readpage(struct file *file, struct page *page) 275 { 276 struct inode *inode = page->mapping->host; 277 struct ocfs2_inode_info *oi = OCFS2_I(inode); 278 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; 279 int ret, unlock = 1; 280 281 trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, 282 (page ? page->index : 0)); 283 /* ocfs2_inode_lock_with_page()是个痛点,为了避免lock inversion又引入了live lock问题,这些 具体问题的细节,暂时还没理解! 调用该函数时,page已经locked, 试探性申请dlm锁,如果dlm资源正在被不可兼容的占用,那么主动放弃申请, unlock page, 然后再去申请dlm锁;为了当好人,让ocfs2dc获得page lock,如果dlm锁被写进程抢走,那么 在写锁降级的时候,要等待刷盘,很浪费时间的;肯定还暗藏其它问题,要不然怎么会浪费那么多时间! 284 ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page); 285 if (ret != 0) { 286 if (ret == AOP_TRUNCATED_PAGE) 287 unlock = 0; 288 mlog_errno(ret); 289 goto out; 290 } 291 292 if (down_read_trylock(&oi->ip_alloc_sem) == 0) { 293 /* 294 * Unlock the page and cycle ip_alloc_sem so that we don't 295 * busyloop waiting for ip_alloc_sem to unlock 296 */ 297 ret = AOP_TRUNCATED_PAGE; 298 unlock_page(page); 299 unlock = 0; 300 down_read(&oi->ip_alloc_sem); 301 up_read(&oi->ip_alloc_sem); 302 goto out_inode_unlock; 303 }
割...
305 /* 306 * i_size might have just been updated as we grabed the meta lock. We 307 * might now be discovering a truncate that hit on another node. 308 * block_read_full_page->get_block freaks out if it is asked to read 309 * beyond the end of a file, so we check here. Callers 310 * (generic_file_read, vm_ops->fault) are clever enough to check i_size 311 * and notice that the page they just read isn't needed. 312 * 313 * XXX sys_readahead() seems to get that wrong? 314 */ 315 if (start >= i_size_read(inode)) { 316 zero_user(page, 0, PAGE_SIZE); 317 SetPageUptodate(page); 318 ret = 0; 319 goto out_alloc; 320 } 321 322 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) 323 ret = ocfs2_readpage_inline(inode, page); 324 else 325 ret = block_read_full_page(page, ocfs2_get_block); /* 我觉得unlock不应该设置为0, 因为block_read_full_page函数有可能没有unlock page, 所以我改了这一块,只要page锁着,就让unlock等于1,虽然性能上去了,但是读会发生io错误; 可能不像我想得那么简单,因为block_read_full_page中可能会调用submit_bh进行IO, 是异步的,后面可能 需要等待IO完成,才会去unlock page, 难怪读会发生io错误 */ 326 unlock = 0; 327 328 out_alloc: 329 up_read(&OCFS2_I(inode)->ip_alloc_sem); 330 out_inode_unlock: 331 ocfs2_inode_unlock(inode, 0); 332 out: 333 if (unlock) 334 unlock_page(page); 335 return ret; 336 }