先来看一下 ftrace 的输出:时间基本上都花在了 ocfs2_write_begin() 函数中。
 1)               |  generic_perform_write() {
 1) ! 12184.13 us |    ocfs2_write_begin();
 1)   0.341 us    |    iov_iter_copy_from_user_atomic();
 1) + 11.759 us   |    ocfs2_write_end();
 1) ! 12198.37 us |  }
下面再对 generic_perform_write() 作简单分析:
/*
 * generic_perform_write - copy the data described by @i into the page cache
 * of @file's mapping, starting at file position @pos.
 *
 * @file: file being written; only file->f_mapping is read directly here.
 * @i:    iterator over the source segments; it is advanced as data is copied.
 * @pos:  byte position in the file at which writing starts.
 *
 * Each loop iteration handles at most the remainder of one page-cache page:
 * the filesystem's ->write_begin() is called, the data is copied with page
 * faults disabled, and ->write_end() is called.
 *
 * Returns the number of bytes written if any were written, otherwise the
 * last status value (0 or a negative error).
 */
static ssize_t generic_perform_write(struct file *file,
				struct iov_iter *i, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = 0;

	/* Annotator's note: this special case can be ignored for the analysis. */
	/*
	 * Copies from kernel address space cannot fail (NFSD is a big user).
	 */
	if (segment_eq(get_fs(), KERNEL_DS))
		flags |= AOP_FLAG_UNINTERRUPTIBLE;

	/* Annotator's note: the do { } while () body repeats the same steps per chunk. */
	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
		void *fsdata;

		/*
		 * offset: position of pos inside its page (pos is the file
		 * position, not a C pointer).
		 * bytes: amount to write in this iteration, i.e. the smaller of
		 * the room left in the current page and the data remaining in
		 * the iterator; if the data does not fit, the current page is
		 * filled first and the rest is left for later iterations.
		 */
		offset = (pos & (PAGE_CACHE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
						iov_iter_count(i));

again:

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		/* Annotator's note: the unlikely() branches are skipped in this walkthrough. */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		/*
		 * Annotator's note (OCFS2-specific, not visible in this
		 * function; to be analysed separately, TODO confirm):
		 * ->write_begin() resolves to ocfs2_write_begin(), which takes
		 * the inode cluster lock in EX mode via ocfs2_inode_lock();
		 * that lock invalidates the inode's page cache on other nodes.
		 * It also does down_write() on ip_alloc_sem so that ->readpage()
		 * and space allocation cannot run concurrently during the
		 * subsequent ocfs2_write().
		 */
		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
						&page, &fsdata);
		if (unlikely(status))
			break;

		/*
		 * If the mapping is writably mapped (e.g. a shared mmap() of
		 * the file by a user process), flush the data cache for this
		 * page before copying into it.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/* Copy from user space, with page faults disabled around the copy. */
		pagefault_disable();
		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		pagefault_enable();
		flush_dcache_page(page);

		mark_page_accessed(page);
		/*
		 * Annotator's note (OCFS2-specific, not visible here; TODO
		 * confirm): the counterpart of write_begin. Per the annotator
		 * it calls block_commit_write(), updates the inode/dinode
		 * accounting, commits the transaction with ocfs2_commit_trans()
		 * and frees the write helper structure.
		 */
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;

		/*
		 * Annotator's note (callee not shown here; TODO confirm): this
		 * calls balance_dirty_pages_ratelimited_cr(mapping, 1); the
		 * argument 1 reflects that each loop iteration handles a single
		 * page. It periodically checks how dirty memory is and starts
		 * writeback when needed.
		 */
		balance_dirty_pages_ratelimited(mapping);

	} while (iov_iter_count(i));

	return written ? written : status;
}