iostat统计磁盘信息的时候,使用的是/proc/diskstats 。而/proc/diskstats是谁在写入呢?
主要数据结构:
//genhd.h struct disk_stats { unsigned long sectors[2]; /* READs and WRITEs */ unsigned long ios[2]; unsigned long merges[2]; unsigned long ticks[2]; // jiffies差 unsigned long io_ticks; // 从入队列到完成io的时间 unsigned long time_in_queue; };
proc初始化:
//block/genhd.c static int __init proc_genhd_init(void) { proc_create("diskstats", 0, NULL, &proc_diskstats_operations); proc_create("partitions", 0, NULL, &proc_partitions_operations); return 0; }
/*
 * File operations for the "diskstats" proc entry: only open is specific to
 * diskstats; read, llseek and release are the seq_* helpers.
 */
static const struct file_operations proc_diskstats_operations = {
	.open		= diskstats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
/*
 * open handler for the "diskstats" proc entry: passes the file and the
 * diskstats_op table to seq_open() and returns its result unchanged.
 * The inode argument is not used.
 */
static int diskstats_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &diskstats_op);
}
/*
 * Iterator callbacks handed to seq_open() by diskstats_open. start/next/stop
 * are the disk_seqf_* helpers; diskstats_show is the callback that formats
 * the output.
 */
static const struct seq_operations diskstats_op = {
	.start	= disk_seqf_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
	.show	= diskstats_show
};
看到,diskstats_show这个函数才是关键:
/*
 * .show callback of diskstats_op: prints one line per partition returned by
 * the partition iterator.
 *
 * NOTE(review): the article elides the beginning of the body ("......");
 * presumably gp, piter, hd, cpu and buf are declared and set up there, and
 * the return statement is also not shown in this excerpt.
 */
static int diskstats_show(struct seq_file *seqf, void *v)
{
	......
	disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
	while ((hd = disk_part_iter_next(&piter))) {
		/* Update this partition's stats inside the part_stat lock before reading them. */
		cpu = part_stat_lock();
		part_round_stats(cpu, hd);
		part_stat_unlock();
		/*
		 * Output: major, minor, name, then 11 statistics. The four time
		 * values (ticks[READ], ticks[WRITE], io_ticks, time_in_queue) are
		 * converted from jiffies to milliseconds.
		 */
		seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
			   "%u %lu %lu %lu %u %u %u %u\n",
			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
			   disk_name(gp, hd->partno, buf),
			   part_stat_read(hd, ios[READ]),
			   part_stat_read(hd, merges[READ]),
			   part_stat_read(hd, sectors[READ]),
			   jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
			   part_stat_read(hd, ios[WRITE]),
			   part_stat_read(hd, merges[WRITE]),
			   part_stat_read(hd, sectors[WRITE]),
			   jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
			   part_in_flight(hd),	/* the only value not read through part_stat_read() */
			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
			   jiffies_to_msecs(part_stat_read(hd, time_in_queue))
			);
	}
	disk_part_iter_exit(&piter);
}
/proc/diskstats各列的具体含义参考下面:
$cat /proc/diskstats
22 0 hdc 159807 57894 6328277 1476593 179991 467858 5184662 2664218 0 886604 4140851
$cat /sys/block/hdc/stat
159807 57894 6328277 1476593 179989 467844 5184534 2664218 0 886604 4140851
/proc/diskstats文件比/sys/block/hdc/stat文件多3个域,从左至右分别对应主设备号、次设备号和设备名称。后续的11个域在这两个文件里是相同的,它们的含义将在下面解释。除了第9个域,所有的域都是从启动时开始的累积值。
第1个域:读磁盘的次数,成功完成读的总次数。
第2个域:合并读次数;第6个域:合并写次数。为了效率,相邻的读和写可能会被合并。例如两次4K的读在最终被提交到磁盘之前可能会合并成一次8K的读,此时它只被计数(和排队)为一次I/O操作。这两个域使你知道这样的合并操作有多频繁。
第3个域:读扇区的数量,成功读取的扇区总数。
第4个域:读花费的毫秒数,这是所有读操作所花费的毫秒数(用__make_request()到end_that_request_last()测量)。
第5个域:写完成的次数,成功写完成的总次数。
第7个域:写扇区的数量,成功写入的扇区总数。
第8个域:写花费的毫秒数,这是所有写操作所花费的毫秒数(用__make_request()到end_that_request_last()测量)。
第9个域:当前正在进行的I/O数量,只有这个域会回落到0(其余的域都是累积值)。当请求被交给相应的request_queue_t时增加,请求完成时减小。
第10个域:花在I/O操作上的毫秒数,只要第9个域不为0,这个域就会增长。
第11个域:加权的、花在I/O操作上的毫秒数。在每次I/O开始、I/O结束、I/O合并时这个域都会增加,增量为正在进行的I/O数量(第9个域)乘以自上次更新以来花在I/O上的毫秒数。它可以方便地衡量I/O完成时间以及可能正在累积的积压请求。
而驱动层需要怎么提供这些数据呢?driver需要调用类似这样的一组函数:
part_stat_inc、part_stat_add、__part_stat_add(其中part_stat_add是调用 __part_stat_add,只不过它同时操作partition)
iostat是怎么根据/proc/diskstats来得到各项数据呢?
1 //iostat.c function read_diskstats_stat 2 if ((fp = fopen(DISKSTATS, "r")) == NULL) 3 return; 4 5 while (fgets(line, 256, fp) != NULL) { 6 7 /* major minor name rio rmerge rsect ruse wio wmerge wsect wuse running use aveq */ 8 i = sscanf(line, "%u %u %s %lu %lu %llu %lu %lu %lu %llu %lu %lu %lu %lu", 9 &major, &minor, dev_name, 10 &rd_ios, &rd_merges_or_rd_sec, &rd_sec_or_wr_ios, &rd_ticks_or_wr_sec, 11 &wr_ios, &wr_merges, &wr_sec, &wr_ticks, &ios_pgr, &tot_ticks, &rq_ticks); 12 13 if (i == 14) { 14 /* Device */ 15 sdev.rd_ios = rd_ios; 16 sdev.rd_merges = rd_merges_or_rd_sec; 17 sdev.rd_sectors = rd_sec_or_wr_ios; 18 sdev.rd_ticks = rd_ticks_or_wr_sec; 19 sdev.wr_ios = wr_ios; 20 sdev.wr_merges = wr_merges; 21 sdev.wr_sectors = wr_sec; 22 sdev.wr_ticks = wr_ticks; 23 sdev.ios_pgr = ios_pgr; 24 sdev.tot_ticks = tot_ticks; 25 sdev.rq_ticks = rq_ticks; 26 } 27 else if (i == 7) { 28 /* Partition */ 29 if (DISPLAY_EXTENDED(flags) || (!dlist_idx && !DISPLAY_PARTITIONS(flags))) 30 continue; 31 32 sdev.rd_ios = rd_ios; 33 sdev.rd_sectors = rd_merges_or_rd_sec; 34 sdev.wr_ios = rd_sec_or_wr_ios; 35 sdev.wr_sectors = rd_ticks_or_wr_sec; 36 } 37 else 38 /* Unknown entry: Ignore it */ 39 continue;
1 void write_ext_stat(int curr, unsigned long long itv, int flags, int fctr, 2 struct io_hdr_stats *shi, struct io_stats *ioi, 3 struct io_stats *ioj) 4 { 5 unsigned long long rd_sec, wr_sec; 6 double tput, util, await, svctm, arqsz, nr_ios; 7 8 /* 9 * Counters overflows are possible, but don't need to be handled in 10 * a special way: the difference is still properly calculated if the 11 * result is of the same type as the two values. 12 * Exception is field rq_ticks which is incremented by the number of 13 * I/O in progress times the number of milliseconds spent doing I/O. 14 * But the number of I/O in progress (field ios_pgr) happens to be 15 * sometimes negative... 16 */ 17 nr_ios = (ioi->rd_ios - ioj->rd_ios) + (ioi->wr_ios - ioj->wr_ios); 18 tput = ((double) nr_ios) * HZ / itv; 19 util = S_VALUE(ioj->tot_ticks, ioi->tot_ticks, itv); 20 svctm = tput ? util / tput : 0.0; 21 /* 22 * Kernel gives ticks already in milliseconds for all platforms 23 * => no need for further scaling. 24 */ 25 await = nr_ios ? 26 ((ioi->rd_ticks - ioj->rd_ticks) + (ioi->wr_ticks - ioj->wr_ticks)) / 27 nr_ios : 0.0; 28 29 rd_sec = ioi->rd_sectors - ioj->rd_sectors; 30 if ((ioi->rd_sectors < ioj->rd_sectors) && (ioj->rd_sectors <= 0xffffffff)) 31 rd_sec &= 0xffffffff; 32 wr_sec = ioi->wr_sectors - ioj->wr_sectors; 33 if ((ioi->wr_sectors < ioj->wr_sectors) && (ioj->wr_sectors <= 0xffffffff)) 34 wr_sec &= 0xffffffff; 35 36 arqsz = nr_ios ? 
(rd_sec + wr_sec) / nr_ios : 0.0; 37 38 /* DEV rrq/s wrq/s r/s w/s rsec wsec rqsz qusz await svctm %util */ 39 printf("%-13s %8.2f %8.2f %7.2f %7.2f %8.2f %8.2f %8.2f %8.2f %7.2f %6.2f %6.2f\n", 40 shi->name, 41 S_VALUE(ioj->rd_merges, ioi->rd_merges, itv), 42 S_VALUE(ioj->wr_merges, ioi->wr_merges, itv), 43 S_VALUE(ioj->rd_ios, ioi->rd_ios, itv), 44 S_VALUE(ioj->wr_ios, ioi->wr_ios, itv), 45 ll_s_value(ioj->rd_sectors, ioi->rd_sectors, itv) / fctr, 46 ll_s_value(ioj->wr_sectors, ioi->wr_sectors, itv) / fctr, 47 arqsz, 48 S_VALUE(ioj->rq_ticks, ioi->rq_ticks, itv) / 1000.0, 49 await, 50 /* The ticks output is biased to output 1000 ticks per second */ 51 svctm, 52 /* Again: Ticks in milliseconds */ 53 util / 10.0); 54 }