首先我们看sd_revalidate_disk(),这个函数很重要,一定程度上来说,正是这个函数从硬件和软件两个方面掀起了我们了解scsi磁盘的性高潮.这个函数它不是一个函数在战斗,它完全是贾宝玉林黛玉方世玉附体,由这一个函数可以牵连出N个函数.而这N个函数中的一些函数本身又有好几百行,所以我们算是陷进去了.
1496 /**
1497 * sd_revalidate_disk - called the first time a new disk is seen,
1498 * performs disk spin up, read_capacity, etc.
1499 * @disk: struct gendisk we care about
1500 **/
1501 static int sd_revalidate_disk(struct gendisk *disk)
1502 {
1503 struct scsi_disk *sdkp = scsi_disk(disk);
1504 struct scsi_device *sdp = sdkp->device;
1505 unsigned char *buffer;
1506 unsigned ordered;
1507
1508 SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp,
1509 "sd_revalidate_disk\n"));
1510
1511 /*
1512 * If the device is offline, don't try and read capacity or any
1513 * of the other niceties.
1514 */
1515 if (!scsi_device_online(sdp))
1516 goto out;
1517
1518 buffer = kmalloc(SD_BUF_SIZE, GFP_KERNEL | __GFP_DMA);
1519 if (!buffer) {
1520 sd_printk(KERN_WARNING, sdkp, "sd_revalidate_disk: Memory "
1521 "allocation failure.\n");
1522 goto out;
1523 }
1524
1525 /* defaults, until the device tells us otherwise */
1526 sdp->sector_size = 512;
1527 sdkp->capacity = 0;
1528 sdkp->media_present = 1;
1529 sdkp->write_prot = 0;
1530 sdkp->WCE = 0;
1531 sdkp->RCD = 0;
1532
1533 sd_spinup_disk(sdkp);
1534
1535 /*
1536 * Without media there is no reason to ask; moreover, some devices
1537 * react badly if we do.
1538 */
1539 if (sdkp->media_present) {
1540 sd_read_capacity(sdkp, buffer);
1541 sd_read_write_protect_flag(sdkp, buffer);
1542 sd_read_cache_type(sdkp, buffer);
1543 }
1544
1545 /*
1546 * We now have all cache related info, determine how we deal
1547 * with ordered requests. Note that as the current SCSI
1548 * dispatch function can alter request order, we cannot use
1549 * QUEUE_ORDERED_TAG_* even when ordered tag is supported.
1550 */
1551 if (sdkp->WCE)
1552 ordered = sdkp->DPOFUA
1553 ? QUEUE_ORDERED_DRAIN_FUA : QUEUE_ORDERED_DRAIN_FLUSH;
1554 else
1555 ordered = QUEUE_ORDERED_DRAIN;
1556
1557 blk_queue_ordered(sdkp->disk->queue, ordered, sd_prepare_flush);
1558
1559 set_capacity(disk, sdkp->capacity);
1560 kfree(buffer);
1561
1562 out:
1563 return 0;
1564 }
用我们经常用错的一个成语来说,就是首当其冲的函数便是sd_spinup_disk().
1005 /*
1006 * spinup disk - called only in sd_revalidate_disk()
1007 */
1008 static void
1009 sd_spinup_disk(struct scsi_disk *sdkp)
1010 {
1011 unsigned char cmd[10];
1012 unsigned long spintime_expire = 0;
1013 int retries, spintime;
1014 unsigned int the_result;
1015 struct scsi_sense_hdr sshdr;
1016 int sense_valid = 0;
1017
1018 spintime = 0;
1019
1020 /* Spin up drives, as required. Only do this at boot time */
1021 /* Spinup needs to be done for module loads too. */
1022 do {
1023 retries = 0;
1024
1025 do {
1026 cmd[0] = TEST_UNIT_READY;
1027 memset((void *) &cmd[1], 0, 9);
1028
1029 the_result = scsi_execute_req(sdkp->device, cmd,
1030 DMA_NONE, NULL, 0,
1031 &sshdr, SD_TIMEOUT,
1032 SD_MAX_RETRIES);
1033
1034 /*
1035 * If the drive has indicated to us that it
1036 * doesn't have any media in it, don't bother
1037 * with any more polling.
1038 */
1039 if (media_not_present(sdkp, &sshdr))
1040 return;
1041
1042 if (the_result)
1043 sense_valid = scsi_sense_valid(&sshdr);
1044 retries++;
1045 } while (retries < 3 &&
1046 (!scsi_status_is_good(the_result) ||
1047 ((driver_byte(the_result) & DRIVER_SENSE) &&
1048 sense_valid && sshdr.sense_key == UNIT_ATTENTION)));
1049
1050 if ((driver_byte(the_result) & DRIVER_SENSE) == 0) {
1051 /* no sense, TUR either succeeded or failed
1052 * with a status error */
1053 if(!spintime && !scsi_status_is_good(the_result)) {
1054 sd_printk(KERN_NOTICE, sdkp, "Unit Not Ready\n");
1055 sd_print_result(sdkp, the_result);
1056 }
1057 break;
1058 }
1059
1060 /*
1061 * The device does not want the automatic start to be issued.
1062 */
1063 if (sdkp->device->no_start_on_add) {
1064 break;
1065 }
1066
1067 /*
1068 * If manual intervention is required, or this is an
1069 * absent USB storage device, a spinup is meaningless.
1070 */
1071 if (sense_valid &&
1072 sshdr.sense_key == NOT_READY &&
1073 sshdr.asc == 4 && sshdr.ascq == 3) {
1074 break; /* manual intervention required */
1075
1076 /*
1077 * Issue command to spin up drive when not ready
1078 */
1079 } else if (sense_valid && sshdr.sense_key == NOT_READY) {
1080 if (!spintime) {
1081 sd_printk(KERN_NOTICE, sdkp, "Spinning up disk...");
1082 cmd[0] = START_STOP;
1083 cmd[1] = 1; /* Return immediately */
1084 memset((void *) &cmd[2], 0, 8);
1085 cmd[4] = 1; /* Start spin cycle */
1086 scsi_execute_req(sdkp->device, cmd, DMA_NONE,
1087 NULL, 0, &sshdr,
1088 SD_TIMEOUT, SD_MAX_RETRIES);
1089 spintime_expire = jiffies + 100 * HZ;
1090 spintime = 1;
1091 }
1092 /* Wait 1 second for next try */
1093 msleep(1000);
1094 printk(".");
1095
1096 /*
1097 * Wait for USB flash devices with slow firmware.
1098 * Yes, this sense key/ASC combination shouldn't
1099 * occur here. It's characteristic of these devices.
1100 */
1101 } else if (sense_valid &&
1102 sshdr.sense_key == UNIT_ATTENTION &&
1103 sshdr.asc == 0x28) {
1104 if (!spintime) {
1105 spintime_expire = jiffies + 5 * HZ;
1106 spintime = 1;
1107 }
1108 /* Wait 1 second for next try */
1109 msleep(1000);
1110 } else {
1111 /* we don't understand the sense code, so it's
1112 * probably pointless to loop */
1113 if(!spintime) {
1114 sd_printk(KERN_NOTICE, sdkp, "Unit Not Ready\n");
1115 sd_print_sense_hdr(sdkp, &sshdr);
1116 }
1117 break;
1118 }
1119
1120 } while (spintime && time_before_eq(jiffies, spintime_expire));
1121
1122 if (spintime) {
1123 if (scsi_status_is_good(the_result))
1124 printk("ready\n");
1125 else
1126 printk("not responding...\n");
1127 }
1128 }
顾名思义,spinup_disk就是让磁盘转起来.然而,要看明白这个函数,你就不得不对SCSI spec有一定了解了.
这个函数虽然复杂,但是我们本着擒贼先擒王的思想,重点关注这个函数中最有价值的那行代码,没错,即使是曲阳路易买得超市门口看自行车的大妈都知道,这个函数中最有价值的那行代码一定是1029行,scsi_execute_req()函数的调用.这个函数算是scsi核心层提供的,咱们只管调用不用管实现.我们在include/scsi/scsi_device.h中能找到它的声明:
297 extern int scsi_execute_req(struct scsi_device *sdev, const unsigned char *cmd,
298 int data_direction, void *buffer, unsigned bufflen,
299 struct scsi_sense_hdr *, int timeout, int retries);
和usb核心层一样,scsi核心层也提供了大量的函数让我们调用,这些函数极大的便利了我们编写scsi设备驱动程序.我们只要准备好参数传递给这个函数,然后就万事大吉了,等着判断函数返回值就是了,至于需要传递的数据,则已经被填充在我们的参数中的buffer里边了.这就好比我每天上班的时候把自行车停在西直门城铁站外,到了晚上下班回来的时候,自行车框里自然而然的就被填充满了,什么都有,香烟盒,卫生纸,吃剩的苹果,嚼过的口香糖,偶尔还有用过的避孕套,总而言之,首都人民的热情一次次的让我感动得泪流满面,让我觉得北漂的日子并不孤独.
这个函数说白了就是执行一个scsi命令,其第一个参数不必多说,就是我们的struct scsi_device的结构体指针,咱们这个故事里就这么一个.第二个参数则是代表着命令,cmd嘛,就是command.其实每一个参数的意思都很明了.
咱们结合我们的代码来看我们具体传递了怎样的参数.第一个sdkp->device这没得说,第二个,cmd,咱们在1011行申请的一个unsigned char类型的数组,总共10个元素,1026行给赋了值为TEST_UNIT_READY.Test Unit Ready就是一个很基本的SCSI命令.DMA_NONE代表传输方向,buffer和bufflen咱们用不上,因为这个命令就是测试设备准备好了没有,不需要传递什么数据.
所以正常来讲,咱们这么一调用scsi_execute_req()以执行这个Test Unit Ready命令,返回的结果基本上都是好的,除非设备真的有毛病.
当然你要说有没有出错的时候,那当然也是有的.比如下面这个例子,
[root@localhost dev]# ls sd*
sda sda1 sda10 sda11 sda12 sda13 sda14 sda2 sda3 sda5 sda6 sda7 sda8 sda9 sdb sdc sdd sde sdf
[root@localhost ~]# sg_turs /dev/sda
Completed 1 Test Unit Ready commands with 0 errors
[root@localhost ~]# sg_turs /dev/sdb
Completed 1 Test Unit Ready commands with 0 errors
[root@localhost ~]# sg_turs /dev/sdc
Completed 1 Test Unit Ready commands with 0 errors
[root@localhost ~]# sg_turs /dev/sde
Completed 1 Test Unit Ready commands with 0 errors
[root@localhost ~]# sg_turs /dev/sdf
test unit ready: Fixed format, current; Sense key: Not Ready
Additional sense: Medium not present
Completed 1 Test Unit Ready commands with 1 errors
这里sg_turs这个命令就是用来手工发送Test Unit Ready用的.不过要使用这个命令,你得安装sg3_utils系列软件包.
[root@localhost dev]# rpm -qa | grep sg3_utils
sg3_utils-devel-1.20-2.1
sg3_utils-1.20-2.1
sg3_utils-libs-1.20-2.1
我们看到在我测试的这五块硬盘(sda,sdb,sdc,sde,sdf)中,前四块都没有问题,但是第五块,也就是sdf,就报错了(Medium not present,即里面没有介质).所以在执行完命令之后,我们用the_result记录下结果,并且在1046行调用scsi_status_is_good()来判断结果.关于scsi_status_is_good()以及和它相关的一些宏定义于include/scsi/scsi.h文件中:
125 /*
126 * SCSI Architecture Model (SAM) Status codes. Taken from SAM-3 draft
127 * T10/1561-D Revision 4 Draft dated 7th November 2002.
128 */
129 #define SAM_STAT_GOOD 0x00
130 #define SAM_STAT_CHECK_CONDITION 0x02
131 #define SAM_STAT_CONDITION_MET 0x04
132 #define SAM_STAT_BUSY 0x08
133 #define SAM_STAT_INTERMEDIATE 0x10
134 #define SAM_STAT_INTERMEDIATE_CONDITION_MET 0x14
135 #define SAM_STAT_RESERVATION_CONFLICT 0x18
136 #define SAM_STAT_COMMAND_TERMINATED 0x22 /* obsolete in SAM-3 */
137 #define SAM_STAT_TASK_SET_FULL 0x28
138 #define SAM_STAT_ACA_ACTIVE 0x30
139 #define SAM_STAT_TASK_ABORTED 0x40
140
141 /** scsi_status_is_good - check the status return.
142 *
143 * @status: the status passed up from the driver (including host and
144 * driver components)
145 *
146 * This returns true for known good conditions that may be treated as
147 * command completed normally
148 */
149 static inline int scsi_status_is_good(int status)
150 {
151 /*
152 * FIXME: bit0 is listed as reserved in SCSI-2, but is
153 * significant in SCSI-3. For now, we follow the SCSI-2
154 * behaviour and ignore reserved bits.
155 */
156 status &= 0xfe;
157 return ((status == SAM_STAT_GOOD) ||
158 (status == SAM_STAT_INTERMEDIATE) ||
159 (status == SAM_STAT_INTERMEDIATE_CONDITION_MET) ||
160 /* FIXME: this is obsolete in SAM-3 */
161 (status == SAM_STAT_COMMAND_TERMINATED));
162 }
上面的那些宏被称为状态码, scsi_execute_req()的返回值就是这些状态码中的一个.而其中可以被认为是good的状态就是scsi_status_is_good函数中列出来的这四种,当然理论上来说最理想的就是SAM_STAT_GOOD,而另外这几种也勉强算是可以接受,将就将就的让它过去.
不过有一点必须明白的是,the_result和状态码还是有区别的,毕竟状态码只有那么多,用8位来表示足矣,而the_result我们看到是unsigned int,显然它不只是8位,于是我们就充分利用资源,因此就有了下面这些宏,
358 /*
359 * Use these to separate status msg and our bytes
360 *
361 * These are set by:
362 *
363 * status byte = set from target device
364 * msg_byte = return status from host adapter itself.
365 * host_byte = set by low-level driver to indicate status.
366 * driver_byte = set by mid-level.
367 */
368 #define status_byte(result) (((result) >> 1) & 0x7f)
369 #define msg_byte(result) (((result) >> 8) & 0xff)
370 #define host_byte(result) (((result) >> 16) & 0xff)
371 #define driver_byte(result) (((result) >> 24) & 0xff)
372 #define suggestion(result) (driver_byte(result) & SUGGEST_MASK)
也就是说除了最低的那个byte是作为status byte用,剩下的byte我们也没浪费,它们都被用来承载信息,其中driver_byte,即bit24到bit31,这8位被用来承载mid-level设置的信息.而这里用它和DRIVER_SENSE相与,则判断的是是否有sense data,我们当初在usb-storage故事中就说过,scsi世界里的sense data就是错误信息.这里1025行至1048行的这个do-while循环就是如果不成功就最多重复三次,循环结束了之后,1050行再次判断有没有sense data,如果没有,则说明也许成功了.
Scsi子系统最无耻的地方就在于错误判断的代码特别的多.而针对sense data的处理则是错误判断的一部分.
8 /*
9 * This is a slightly modified SCSI sense "descriptor" format header.
10 * The addition is to allow the 0x70 and 0x71 response codes. The idea
11 * is to place the salient data from either "fixed" or "descriptor" sense
12 * format into one structure to ease application processing.
13 *
14 * The original sense buffer should be kept around for those cases
15 * in which more information is required (e.g. the LBA of a MEDIUM ERROR).
16 */
17 struct scsi_sense_hdr { /* See SPC-3 section 4.5 */
18 u8 response_code; /* permit: 0x0, 0x70, 0x71, 0x72, 0x73 */
19 u8 sense_key;
20 u8 asc;
21 u8 ascq;
22 u8 byte4;
23 u8 byte5;
24 u8 byte6;
25 u8 additional_length; /* always 0 for fixed sense format */
26 };
27
28 static inline int scsi_sense_valid(struct scsi_sense_hdr *sshdr)
29 {
30 if (!sshdr)
31 return 0;
32
33 return (sshdr->response_code & 0x70) == 0x70;
34 }
这里定义的struct scsi_sense_hdr就是被用来描述一个sense data.“hdr”就是header的意思,因为sense data可能长度比较长,但是其前8个bytes是最重要的,所以这部分被叫做header,或者说头部,大多数情况下只要理睬头部就够了.
我们看函数scsi_execute_req()中第六个参数是struct scsi_sense_hdr *sshdr,换言之,如果命令执行出错了,那么sense data就会通过这个参数返回.所以咱们定义了sshdr,然后咱们通过判断它和它的各个成员,来决定下一步.
而sense data中,最基本的一个元素叫做response_code,它相当于为一个sense data定了性,即它属于哪一个类别,因为sense data毕竟有很多种.response code总共就是7个bits,目前使用的值只有70h,71h,72h,73h,其它的像00h到6Fh以及74h到7Eh这些都是保留的,以备将来之用.所以scsi_sense_valid()判断的就是response code是不是落在0x70这一档:代码里实际检查的是(response_code & 0x70) == 0x70,这个条件比较宽松,0x70到0x7F都能通过,不过目前真正有定义的只有0x70,0x71,0x72,0x73这四个值;不满足这个条件的就是invalid.这就是scsi_sense_valid()做的事情.
关于sense data,事实上,坊间一直流传着一本叫做SCSI Primary Commands(SPC)的秘籍,在这本秘籍的第四章,确切的说是4.5节,名字就叫做Sense data,即这一节是专门介绍Sense Data的.Sense data中最有意义的东西叫做sense key和sense code.这两个概念基本上确定了你这个错误究竟是什么错误.
1048行,我们判断sshdr的sense_key是不是等于UNIT_ATTENTION,这个信息表示这个设备可能被重置了或者可移动的介质发生了变化,或者更通俗一点说,只要设备发生了一些变化,然后它希望引起主机控制器的关注,比如说设备原本是on-line的,突然变成了off-line,或者反过来,设备从off-line回到了on-line.在正式读写设备之前,如果有UNIT_ATTENTION条件,必须把它给清除掉.而这(清除UNIT ATTENTION)也正是Test Unit Ready的工作之一.
而如果sense key等于NOT_READY,则表明这个logical unit不能被访问.(NOT READY: Indicates that the logical unit is not accessible.)而如果sense key等于NOT READY,而asc等于04h,ascq等于03h,这表明“Logical Unit Not Ready, Manual Intervention Required”.(详见SPC-4,附录D部分)这说明需要人工干预.
当然大多数情况下,应该执行的是1079行这个else if所包含的代码.即磁盘确实应该是NOT_READY,于是我们需要发送下一个命令,即START STOP,在另一部江湖武功秘籍名为SCSI Block Commands-2(SBC-2)的书中,5.17节专门介绍了START STOP UNIT这个命令.这个命令简而言之,就相当于电源开关,SBC-2中Table 48给出了这个命令的格式:
(原文此处的图片已丢失,下面按SBC-2 Table 48重建START STOP UNIT命令的格式,每行一个字节,位从bit7到bit0:)
Byte 0: OPERATION CODE (1Bh)
Byte 1: bit7-bit1 Reserved | bit0 IMMED
Byte 2: Reserved
Byte 3: Reserved
Byte 4: bit7-bit4 POWER CONDITION | bit3-bit2 Reserved | bit1 LOEJ | bit0 START
Byte 5: CONTROL
结合代码看,咱们把cmd[4]设置为1,实际上就等于是把这张图里的START位设置为1.而在SBC-2中,这个START位的含义如下:
If the START bit is set to zero, then the logical unit shall transition to the stopped power condition, disable the idle condition timer if it is active (see SPC-3), and disable the standby condition timer if it is active (see SPC-3). If the START bit set to one, then the logical unit shall transition to the active power condition, enable the idle condition timer if it is active, and enable the standby condition timer if it is active.
很明显,这就是真正的电源开关.因此,1086行再次调用scsi_execute_req以执行START STOP UNIT命令,就是真正的让硬盘转起来.或者用郭富城的话说,动起来!
于是我们就很清楚从1022行直到1120行这一百行代码的do-while循环的意思了.其理想情况的流程就是:
1. 软件说:磁盘磁盘我问你,你准备好了没有?
2. 磁盘说:没有!
3. 软件说:磁盘磁盘你听着,你快给我转起来!
4. 软件:睡眠1000毫秒之后重复第一步的问题.(但磁盘这次可能走第二步,也可能走第五步.)
5. 磁盘说:是的,我准备好了,我们时刻准备着.
6. 这时,1057行break语句会被执行,从而循环结束.sd_spinup_disk()函数也就结束了它的使命.
7. 在第一次走到第三步的时候(也就是发出START STOP UNIT命令之后),会把spintime_expire设置为当前时间之后100秒(jiffies + 100 * HZ),即这个时间为软件忍耐极限,磁盘你只要在100秒之内给我动起来,我就既往不咎,倘若给你100秒你还敬酒不吃吃罚酒,那就没办法了,while循环自然结束,1126行这个printk语句执行,告诉上级说,not responding,换言之,这厮没救了,整个一扶不起的阿斗.