ftrace:
0) | vfs_read() { 0) | generic_file_aio_read() { 0) | do_generic_file_read() { 0) 0.195 us | find_get_page(); 0) 0.109 us | find_get_page(); 0) 0.263 us | touch_atime(); 0) 2.820 us | } 0) 3.559 us | } 0) 5.063 us | }
其中,下面这段调用路径大量重复出现:
2) ! 11994.70 us | ocfs2_readpage(); 2) 0.149 us | find_get_page(); 2) | page_cache_sync_readahead() { 2) 0.547 us | __page_cache_alloc(); 2) 0.181 us | __page_cache_alloc(); 2) 0.184 us | __page_cache_alloc(); 2) 0.178 us | __page_cache_alloc(); 2) 4.190 us | } 2) 0.073 us | find_get_page(); 2) 0.176 us | __page_cache_alloc(); 2) ! 11983.20 us | ocfs2_readpage(); 2) 0.150 us | find_get_page(); 2) | page_cache_sync_readahead() { 2) 0.525 us | __page_cache_alloc(); 2) 0.176 us | __page_cache_alloc(); 2) 0.222 us | __page_cache_alloc(); 2) 0.185 us | __page_cache_alloc(); 2) 4.198 us | } 2) 0.068 us | find_get_page(); 2) 0.182 us | __page_cache_alloc(); 2) ! 11986.95 us | ocfs2_readpage();
分析do_generic_file_read()函数:
1258 /** 1259 * do_generic_file_read - generic file read routine 1260 * @filp: the file to read 1261 * @ppos: current file position 1262 * @desc: read_descriptor 1263 * @actor: read method 1264 * 1265 * This is a generic file read routine, and uses the 1266 * mapping->a_ops->readpage() function for the actual low-level stuff. 1267 * 1268 * This is really ugly. But the goto's actually try to clarify some 1269 * of the logic when it comes to error handling etc. 1270 */ 1271 static void do_generic_file_read(struct file *filp, loff_t *ppos, 1272 read_descriptor_t *desc, read_actor_t actor) 1273 { 1274 struct address_space *mapping = filp->f_mapping; 1275 struct inode *inode = mapping->host; 1276 struct file_ra_state *ra = &filp->f_ra; 1277 pgoff_t index; 1278 pgoff_t last_index; 1279 pgoff_t prev_index; 1280 unsigned long offset; /* offset into pagecache page */ 1281 unsigned int prev_offset; 1282 int error; 1283 1284 index = *ppos >> PAGE_CACHE_SHIFT; //the readahead state (ra->prev_pos) holds the position of the previous read 1285 prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; 1286 prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); //last_index is the page index just past the end of this request: (*ppos + desc->count) rounded up to a page boundary 1287 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 1288 offset = *ppos & ~PAGE_CACHE_MASK; 1289 1290 for (;;) { //each iteration copies at most one page; the page containing EOF is handled specially 1291 struct page *page; 1292 pgoff_t end_index; 1293 loff_t isize; 1294 unsigned long nr, ret; 1295 1296 cond_resched();
割....
1297 find_page: //look up the page keyed by index in the address_space radix tree (->page_tree) and take a reference on it 1298 page = find_get_page(mapping, index); 1299 if (!page) { //NOTE(review): in tests with concurrent readers and writers on the same file this lookup kept missing; the cause is not yet understood. Sample trace: // 2) | page_cache_sync_readahead() { // 2) 0.857 us | __page_cache_alloc(); // 2) 0.205 us | __page_cache_alloc(); // 2) 0.185 us | __page_cache_alloc(); // 2) 0.180 us | __page_cache_alloc(); // 2) 5.096 us | } 1300 page_cache_sync_readahead(mapping, 1301 ra, filp, 1302 index, last_index - index); 1303 page = find_get_page(mapping, index); //NOTE(review): the page is apparently still missing after the synchronous readahead, and __page_cache_alloc() runs again (presumably from the no_cached_page path - confirm): // 2) 0.088 us | find_get_page(); // 2) 0.223 us | __page_cache_alloc(); 1304 if (unlikely(page == NULL)) 1305 goto no_cached_page; 1306 } //#define PageReadahead(page) test_bit(PG_readahead, &(page)->flags) 1307 if (PageReadahead(page)) { /** The page carries the readahead flag, which suggests readahead is paying off, so more pages are probably allocated ahead of the reader: 1) | page_cache_async_readahead() { 1) 1.114 us | __page_cache_alloc(); ... 1) 0.318 us | __page_cache_alloc(); 1) + 45.123 us | } In the tests, whenever this branch was taken ocfs2_readpage was not called, but a lot of time was often lost around here, e.g. 0) ! 2390.071 us | __lock_page_killable(); */ 1308 page_cache_async_readahead(mapping, 1309 ra, filp, page, 1310 index, last_index - index); 1311 } 1312 if (!PageUptodate(page)) { //the page is not up to date 1313 if (inode->i_blkbits == PAGE_CACHE_SHIFT || 1314 !mapping->a_ops->is_partially_uptodate) //block size equals page size, or the mapping provides no is_partially_uptodate op: no partial check is possible 1315 goto page_not_up_to_date; 1316 if (!trylock_page(page)) //the non-blocking lock attempt failed: fall back to the blocking lock_page_killable() path 1317 goto page_not_up_to_date; 1318 /* Did it get truncated before we got the lock? */ //"the lock" is the page lock just taken by trylock_page() 1319 if (!page->mapping) 1320 goto page_not_up_to_date_locked; 1321 if (!mapping->a_ops->is_partially_uptodate(page, 1322 desc, offset)) //the partial up-to-date check failed for this read: take the full read path 1323 goto page_not_up_to_date_locked; 1324 unlock_page(page); 1325 }
割...
1326 page_ok: //the page is in the page cache and usable for this read 1327 /* 1328 * i_size must be checked after we know the page is Uptodate. 1329 * 1330 * Checking i_size after the check allows us to calculate 1331 * the correct value for "nr", which means the zero-filled 1332 * part of the page is not copied back to userspace (unless 1333 * another truncate extends the file - this is desired though). 1334 */ 1335 1336 isize = i_size_read(inode); 1337 end_index = (isize - 1) >> PAGE_CACHE_SHIFT; 1338 if (unlikely(!isize || index > end_index)) { 1339 page_cache_release(page); 1340 goto out; 1341 } 1342 1343 /* nr is the maximum number of bytes to copy from this page */ 1344 nr = PAGE_CACHE_SIZE; 1345 if (index == end_index) { 1346 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; 1347 if (nr <= offset) { 1348 page_cache_release(page); 1349 goto out; 1350 } 1351 } 1352 nr = nr - offset; 1353 1354 /* If users can be writing to this page using arbitrary 1355 * virtual addresses, take care about potential aliasing 1356 * before reading the page on the kernel side. 1357 */ 1358 if (mapping_writably_mapped(mapping)) 1359 flush_dcache_page(page); 1360 1361 /* 1362 * When a sequential read accesses a page several times, 1363 * only mark it as accessed the first time. 1364 */ 1365 if (prev_index != index || offset != prev_offset) 1366 mark_page_accessed(page); 1367 prev_index = index; 1368 1369 /* 1370 * Ok, we have the page, and it's up-to-date, so 1371 * now we can copy it to user space... 1372 * 1373 * The actor routine returns how many bytes were actually used.. 1374 * NOTE! This may not be the same as how much of a user buffer 1375 * we filled up (we may be padding etc), so we can only update 1376 * "pos" here (the actor routine has to update the user buffer 1377 * pointers and the remaining count). 1378 */ 1379 ret = actor(desc, page, offset, nr); 1380 offset += ret; 1381 index += offset >> PAGE_CACHE_SHIFT; 1382 offset &= ~PAGE_CACHE_MASK; 1383 prev_offset = offset; 1384 //Q: why is the page released here?
//The data has already been copied from the kernel page to user space, but what if the next read hits this same page? Presumably this only drops the reference taken by find_get_page() and the page itself stays in the page cache - confirm. 1385 page_cache_release(page); //nr is the number of bytes to copy from this page, ret is the number of bytes actually copied; //if the whole chunk was copied and the request is not yet satisfied, go on with the next iteration 1386 if (ret == nr && desc->count) 1387 continue; 1388 goto out;
割...
1390 page_not_up_to_date: /* A long time is sometimes spent here: 1) ! 10082.08 us | __lock_page_killable(); */ 1391 /* Get exclusive access to the page ... */ 1392 error = lock_page_killable(page); 1393 if (unlikely(error)) 1394 goto readpage_error; 1395 1396 page_not_up_to_date_locked: 1397 /* Did it get truncated before we got the lock? */ 1398 if (!page->mapping) { 1399 unlock_page(page); 1400 page_cache_release(page); //the page no longer belongs to a mapping, so drop it and restart the loop from the top 1401 continue; 1402 } 1403 1404 /* Did somebody else fill it already? */ 1405 if (PageUptodate(page)) { 1406 unlock_page(page); 1407 goto page_ok; 1408 }
割...
1410 readpage: 1411 /* 1412 * A previous I/O error may have been due to temporary 1413 * failures, eg. multipath errors. 1414 * PG_error will be set again if readpage fails. 1415 */ //clears a possibly stale error flag before the read is retried, as the comment above explains 1416 ClearPageError(page); 1417 /* Start the actual read. The read will unlock the page. */ /* This calls ocfs2_readpage(), by far the most expensive step of the read and consistently so: 1) ! 12604.57 us | ocfs2_readpage(); Open question: readpage is where the page cache submits the read request to the block layer, so why does the read unlock the page? Presumably the page stays locked while the I/O is in flight and is unlocked when it completes - confirm. */ 1418 error = mapping->a_ops->readpage(filp, page); 1419 1420 if (unlikely(error)) { 1421 if (error == AOP_TRUNCATED_PAGE) { 1422 page_cache_release(page); 1423 goto find_page; 1424 } 1425 goto readpage_error; 1426 } 1427 1428 if (!PageUptodate(page)) { //Q: how can a page that was just read in from the block layer be not up to date? Presumably the read may still be in flight when readpage() returns - confirm. //not observed in the tests so far 1429 error = lock_page_killable(page); 1430 if (unlikely(error)) 1431 goto readpage_error; 1432 if (!PageUptodate(page)) { 1433 if (page->mapping == NULL) { 1434 /* 1435 * invalidate_mapping_pages got it 1436 */ 1437 unlock_page(page); 1438 page_cache_release(page); 1439 goto find_page; 1440 } 1441 unlock_page(page); 1442 shrink_readahead_size_eio(filp, ra); 1443 error = -EIO; 1444 goto readpage_error; 1445 } 1446 unlock_page(page); 1447 } 1448 //page_ok: the page is ready; copy it from kernel to user space next, then continue or finish depending on how much is left 1449 goto page_ok;
割...
1451 readpage_error: 1452 /* UHHUH! A synchronous read error occurred. Report it */ 1453 desc->error = error; 1454 page_cache_release(page); 1455 goto out; 1456 1457 no_cached_page: 1458 /* 1459 * Ok, it wasn't cached, so we need to create a new 1460 * page.. 1461 */ 1462 page = page_cache_alloc_cold(mapping); 1463 if (!page) { 1464 desc->error = -ENOMEM; 1465 goto out; 1466 } 1467 error = add_to_page_cache_lru(page, mapping, 1468 index, GFP_KERNEL); 1469 if (error) { 1470 page_cache_release(page); 1471 if (error == -EEXIST) 1472 goto find_page; 1473 desc->error = error; 1474 goto out; 1475 } 1476 goto readpage; 1477 }
割...
1479 out: 1480 ra->prev_pos = prev_index; 1481 ra->prev_pos <<= PAGE_CACHE_SHIFT; 1482 ra->prev_pos |= prev_offset; 1483 1484 *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; 1485 file_accessed(filp); 1486 }