好了,了解完必要的scsi设备驱动知识以后,我们就可以安心分析scsi_request_fn函数了。大家回忆一下对,这个函数指针通过几次传递并最终在blk_init_queue_node()中被赋予了q->request_fn。所以这一层的重点就是这个scsi_request_fn函数。
在看scsi_request_fn之前,注意回忆一下scsi_alloc_queue函数的1598行至1560行还赋了三个函数指针:
1590 struct request_queue *scsi_alloc_queue(struct scsi_device *sdev) 1591 { 1592 struct request_queue *q; 1593 1594 q = __scsi_alloc_queue(sdev->host, scsi_request_fn); 1595 if (!q) 1596 return NULL; 1597 1598 blk_queue_prep_rq(q, scsi_prep_fn); 1599 blk_queue_issue_flush_fn(q, scsi_issue_flush_fn); 1600 blk_queue_softirq_done(q, scsi_softirq_done); 1601 return q; 1602 }
143 void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn) 144 { 145 q->prep_rq_fn = pfn; 146 }
313 void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff) 314 { 315 q->issue_flush_fn = iff; 316 } 173 void blk_queue_softirq_done(request_queue_t *q, softirq_done_fn *fn) 174 { 175 q->softirq_done_fn = fn; 176 } |
分别是把scsi_prep_fn赋给了q->prep_rq_fn,把scsi_issue_flush_fn赋给了q->issue_flush_fn,把scsi_softirq_done赋给了q->softirq_done_fn。尤其是scsi_prep_fn我们马上就会用到。
好,让我们继续前面的话题,重点关注scsi_request_fn():
1422 static void scsi_request_fn(struct request_queue *q) 1423 { 1424 struct scsi_device *sdev = q->queuedata; 1425 struct scsi_Host *shost; 1426 struct scsi_cmnd *cmd; 1427 struct request *req; 1428 1429 if (!sdev) { 1430 printk("scsi: killing requests for dead queue/n"); 1431 while ((req = elv_next_request(q)) != NULL) 1432 scsi_kill_request(req, q); 1433 return; 1434 } 1435 1436 if(!get_device(&sdev->sdev_gendev)) 1437 /* We must be tearing the block queue down already */ 1438 return; 1439 1440 /* 1441 * To start with, we keep looping until the queue is empty, or until 1442 * the host is no longer able to accept any more requests. 1443 */ 1444 shost = sdev->host; 1445 while (!blk_queue_plugged(q)) { 1446 int rtn; 1447 /* 1448 * get next queueable request. We do this early to make sure 1449 * that the request is fully prepared even if we cannot 1450 * accept it. 1451 */ 1452 req = elv_next_request(q); 1453 if (!req || !scsi_dev_queue_ready(q, sdev)) 1454 break; 1455 1456 if (unlikely(!scsi_device_online(sdev))) { 1457 sdev_printk(KERN_ERR, sdev, 1458 "rejecting I/O to offline device/n"); 1459 scsi_kill_request(req, q); 1460 continue; 1461 } 1462 1463 1464 /* 1465 * Remove the request from the request list. 1466 */ 1467 if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req))) 1468 blkdev_dequeue_request(req); 1469 sdev->device_busy++; /* 说明命令正在执行中 */ 1470 1471 spin_unlock(q->queue_lock); 1472 cmd = req->special; 1473 if (unlikely(cmd == NULL)) { 1474 printk(KERN_CRIT "impossible request in %s./n" 1475 "please mail a stack trace to " 1476 "[email protected]/n", 1477 __FUNCTION__); 1478 blk_dump_rq_flags(req, "foo"); 1479 BUG(); 1480 } 1481 spin_lock(shost->host_lock); 1482 1483 if (!scsi_host_queue_ready(q, shost, sdev)) 1484 goto not_ready; 1485 if (sdev->single_lun) { 1486 if (scsi_target(sdev)->starget_sdev_user && 1487 scsi_target(sdev)->starget_sdev_user != sdev) 1488 goto not_ready; 1489 scsi_target(sdev)->starget_sdev_user = sdev; 1490 } 1491 shost->host_busy++; 1492 1493 /* 1494 * XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will 1495 * take the lock again. 1496 */ 1497 spin_unlock_irq(shost->host_lock); 1498 1499 /* 1500 * Finally, initialize any error handling parameters, and set up 1501 * the timers for timeouts. 1502 */ 1503 scsi_init_cmd_errh(cmd); 1504 1505 /* 1506 * Dispatch the command to the low-level driver. 1507 */ 1508 rtn = scsi_dispatch_cmd(cmd); 1509 spin_lock_irq(q->queue_lock); 1510 if(rtn) { 1511 /* we're refusing the command; because of 1512 * the way locks get dropped, we need to 1513 * check here if plugging is required */ 1514 if(sdev->device_busy == 0) 1515 blk_plug_device(q); 1516 1517 break; 1518 } 1519 } 1520 1521 goto out; 1522 1523 not_ready: 1524 spin_unlock_irq(shost->host_lock); 1525 1526 /* 1527 * lock q, handle tag, requeue req, and decrement device_busy. We 1528 * must return with queue_lock held. 1529 * 1530 * Decrementing device_busy without checking it is OK, as all such 1531 * cases (host limits or settings) should run the queue at some 1532 * later time. 1533 */ 1534 spin_lock_irq(q->queue_lock); 1535 blk_requeue_request(q, req); 1536 sdev->device_busy--; 1537 if(sdev->device_busy == 0) 1538 blk_plug_device(q); 1539 out: 1540 /* must be careful here...if we trigger the ->remove() function 1541 * we cannot be holding the q lock */ 1542 spin_unlock_irq(q->queue_lock); 1543 put_device(&sdev->sdev_gendev); 1544 spin_lock_irq(q->queue_lock); 1545 } |
scsi_request_fn函数为scsi设备请求队列处理函数,前面看到该函数被注册到了request_queue->request_fn上。块设备请求的bio最终会merge到request queue中,然后通过unplug_fn函数调用request_queue->request_fn,实现scsi_reuqest_fn函数的调用。
scsi_request_fn函数实现了请求队列的处理,首先1452-1468行按照电梯算法从请求队列中摘取一个request,所以我们首先关注1452行的elv_next_request(),来自block/elevator.c:
712 struct request *elv_next_request(request_queue_t *q) 713 { 714 struct request *rq; 715 int ret; 716 717 while ((rq = __elv_next_request(q)) != NULL) { 718 if (!(rq->cmd_flags & REQ_STARTED)) { 719 /* 720 * This is the first time the device driver 721 * sees this request (possibly after 722 * requeueing). Notify IO scheduler. 723 */ 724 if (blk_sorted_rq(rq)) 725 elv_activate_rq(q, rq); 726 727 /* 728 * just mark as started even if we don't start 729 * it, a request that has been delayed should 730 * not be passed by new incoming requests 731 */ 732 rq->cmd_flags |= REQ_STARTED; 733 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 734 } 735 736 if (!q->boundary_rq || q->boundary_rq == rq) { 737 q->end_sector = rq_end_sector(rq); 738 q->boundary_rq = NULL; 739 } 740 741 if ((rq->cmd_flags & REQ_DONTPREP) || !q->prep_rq_fn) 742 break; 743 744 ret = q->prep_rq_fn(q, rq); 745 if (ret == BLKPREP_OK) { 746 break; 747 } else if (ret == BLKPREP_DEFER) { 748 /* 749 * the request may have been (partially) prepped. 750 * we need to keep this request in the front to 751 * avoid resource deadlock. REQ_STARTED will 752 * prevent other fs requests from passing this one. 753 */ 754 rq = NULL; 755 break; 756 } else if (ret == BLKPREP_KILL) { 757 int nr_bytes = rq->hard_nr_sectors << 9; 758 759 if (!nr_bytes) 760 nr_bytes = rq->data_len; 761 762 blkdev_dequeue_request(rq); 763 rq->cmd_flags |= REQ_QUIET; 764 end_that_request_chunk(rq, 0, nr_bytes); 765 end_that_request_last(rq, 0); 766 } else { 767 printk(KERN_ERR "%s: bad return=%d/n", __FUNCTION__, 768 ret); 769 break; 770 } 771 } 772 773 return rq; 774 } |
它调用的__elv_next_request()仍然来自block/elevator.c:
696 static inline struct request *__elv_next_request(request_queue_t *q) 697 { 698 struct request *rq; 699 700 while (1) { 701 while (!list_empty(&q->queue_head)) { 702 rq = list_entry_rq(q->queue_head.next); 703 if (blk_do_ordered(q, &rq)) 704 return rq; 705 } 706 707 if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) 708 return NULL; 709 } 710 } |
由于我们在I/O调度层中插入了一个request,所以这里q->queue_head不可能为空。所以702行从中取出一个request来。然后是blk_do_ordered(),来自block/ll_rw_blk.c:
478 int blk_do_ordered(request_queue_t *q, struct request **rqp) 479 { 480 struct request *rq = *rqp; 481 int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); 482 483 if (!q->ordseq) { 484 if (!is_barrier) 485 return 1; 486 487 if (q->next_ordered != QUEUE_ORDERED_NONE) { 488 *rqp = start_ordered(q, rq); 489 return 1; 490 } else { 491 /* 492 * This can happen when the queue switches to 493 * ORDERED_NONE while this request is on it. 494 */ 495 blkdev_dequeue_request(rq); 496 end_that_request_first(rq, -EOPNOTSUPP, 497 rq->hard_nr_sectors); 498 end_that_request_last(rq, -EOPNOTSUPP); 499 *rqp = NULL; 500 return 0; 501 } 502 } 503 504 /* 505 * Ordered sequence in progress 506 */ 507 508 /* Special requests are not subject to ordering rules. */ 509 if (!blk_fs_request(rq) && 510 rq != &q->pre_flush_rq && rq != &q->post_flush_rq) 511 return 1; 512 513 if (q->ordered & QUEUE_ORDERED_TAG) { 514 /* Ordered by tag. Blocking the next barrier is enough. */ 515 if (is_barrier && rq != &q->bar_rq) 516 *rqp = NULL; 517 } else { 518 /* Ordered by draining. Wait for turn. */ 519 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); 520 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) 521 *rqp = NULL 522 } 523 524 return 1; 525 } |
首先看一下blk_fs_request,
528 #define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS)
很显然,咱们从来没有设置这个标识,所以不去管它。
所以在咱们这个上下文里,is_barrier一定是0。所以,blk_do_ordered二话不说,直接返回1。那么回到__elv_next_request以后,703行这个if条件是满足的,所以也就是返回rq,下面的那个elevator_dispatch_fn根本不会执行的。另一方面,我们从__elv_next_request返回,回到elv_next_request()的时候,只要request queue不是空的,那么返回值就是队列中最前边的那个request。
继续在elv_next_request中往下走,request得到了,cmd_flags其实整个故事中设置REQ_STARTED的也就是这里,732行。所以在我们执行732行之前,这个flag是没有设置的。因此,if条件是满足的。
而blk_sorted_rq又是一个宏,来自include/linux/blkdev.h:
543 #define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED)
很显然,咱们也从来没有设置过这个flag,所以这里不关我们的事。
当然了,对于noop,即便执行下一个函数也没有意义,因为这个elv_activate_rq()来自block/elevator.c:
272 static void elv_activate_rq(request_queue_t *q, struct request *rq) 273 { 274 elevator_t *e = q->elevator; 275 276 if (e->ops->elevator_activate_req_fn) 277 e->ops->elevator_activate_req_fn(q, rq); 278 } |
我们假设使用最简单的noop电梯算法,即根本就没有这个指针,所以不去管他。
这时候,我们设置REQ_STARTED这个flag,最开始我们在elevator_init()中,有这么一句:
230 q->boundary_rq = NULL;
于是rq_end_sector会被执行,这其实也只是一个很简单的宏:
172 #define rq_end_sector(rq) ((rq)->sector + (rq)->nr_sectors)
同时,boundary_rq还是被置为NULL。
回到elv_next_request中,接下来744行,由于我们把prep_rq_fn赋上了scsi_prep_fn,所以我们要看一下这个scsi_prep_fn(),这个来自drivers/scsi/scsi_lib.c的函数:
1093static int scsi_prep_fn(struct request_queue *q, struct request *req) 1094{ 1095 struct scsi_device *sdev = q->queuedata; 1096 struct scsi_cmnd *cmd; 1097 int specials_only = 0; 1098 1099 /* 1100 * Just check to see if the device is online. If it isn't, we 1101 * refuse to process any commands. The device must be brought 1102 * online before trying any recovery commands 1103 */ 1104 if (unlikely(!scsi_device_online(sdev))) { 1105 sdev_printk(KERN_ERR, sdev, 1106 "rejecting I/O to offline device/n"); 1107 goto kill; 1108 } 1109 if (unlikely(sdev->sdev_state != SDEV_RUNNING)) { 1110 /* OK, we're not in a running state don't prep 1111 * user commands */ 1112 if (sdev->sdev_state == SDEV_DEL) { 1113 /* Device is fully deleted, no commands 1114 * at all allowed down */ 1115 sdev_printk(KERN_ERR, sdev, 1116 "rejecting I/O to dead device/n"); 1117 goto kill; 1118 } 1119 /* OK, we only allow special commands (i.e. not 1120 * user initiated ones */ 1121 specials_only = sdev->sdev_state; 1122 } 1123 1124 /* 1125 * Find the actual device driver associated with this command. 1126 * The SPECIAL requests are things like character device or 1127 * ioctls, which did not originate from ll_rw_blk. Note that 1128 * the special field is also used to indicate the cmd for 1129 * the remainder of a partially fulfilled request that can 1130 * come up when there is a medium error. We have to treat 1131 * these two cases differently. We differentiate by looking 1132 * at request->cmd, as this tells us the real story. 1133 */ 1134 if (req->flags & REQ_SPECIAL && req->special) { 1135 cmd = req->special; 1136 } else if (req->flags & (REQ_CMD | REQ_BLOCK_PC)) { 1137 1138 if(unlikely(specials_only) && !(req->flags & REQ_SPECIAL)) { 1139 if(specials_only == SDEV_QUIESCE || 1140 specials_only == SDEV_BLOCK) 1141 goto defer; 1142 1143 sdev_printk(KERN_ERR, sdev, 1144 "rejecting I/O to device being removed/n"); 1145 goto kill; 1146 } 1147 1148 1149 /* 1150 * Now try and find a command block that we can use. 1151 */ 1152 if (!req->special) { 1153 cmd = scsi_get_command(sdev, GFP_ATOMIC); 1154 if (unlikely(!cmd)) 1155 goto defer; 1156 } else 1157 cmd = req->special; 1158 1159 /* pull a tag out of the request if we have one */ 1160 cmd->tag = req->tag; 1161 } else { 1162 blk_dump_rq_flags(req, "SCSI bad req"); 1163 goto kill; 1164 } 1165 1166 /* note the overloading of req->special. When the tag 1167 * is active it always means cmd. If the tag goes 1168 * back for re-queueing, it may be reset */ 1169 req->special = cmd; 1170 cmd->request = req; 1171 1172 /* 1173 * FIXME: drop the lock here because the functions below 1174 * expect to be called without the queue lock held. Also, 1175 * previously, we dequeued the request before dropping the 1176 * lock. We hope REQ_STARTED prevents anything untoward from 1177 * happening now. 1178 */ 1179 if (req->flags & (REQ_CMD | REQ_BLOCK_PC)) { 1180 int ret; 1181 1182 /* 1183 * This will do a couple of things: 1184 * 1) Fill in the actual SCSI command. 1185 * 2) Fill in any other upper-level specific fields 1186 * (timeout). 1187 * 1188 * If this returns 0, it means that the request failed 1189 * (reading past end of disk, reading offline device, 1190 * etc). This won't actually talk to the device, but 1191 * some kinds of consistency checking may cause the 1192 * request to be rejected immediately. 1193 */ 1194 1195 /* 1196 * This sets up the scatter-gather table (allocating if 1197 * required). 1198 */ 1199 ret = scsi_init_io(cmd); 1200 switch(ret) { 1201 /* For BLKPREP_KILL/DEFER the cmd was released */ 1202 case BLKPREP_KILL: 1203 goto kill; 1204 case BLKPREP_DEFER: 1205 goto defer; 1206 } 1207 1208 /* 1209 * Initialize the actual SCSI command for this request. 1210 */ 1211 if (req->flags & REQ_BLOCK_PC) { 1212 scsi_setup_blk_pc_cmnd(cmd); 1213 } else if (req->rq_disk) { 1214 struct scsi_driver *drv; 1215 1216 drv = *(struct scsi_driver **)req->rq_disk->private_data; 1217 if (unlikely(!drv->init_command(cmd))) { 1218 scsi_release_buffers(cmd); 1219 scsi_put_command(cmd); 1220 goto kill; 1221 } 1222 } 1223 } 1224 1225 /* 1226 * The request is now prepped, no need to come back here 1227 */ 1228 req->flags |= REQ_DONTPREP; 1229 return BLKPREP_OK; 1230 1231 defer: 1232 /* If we defer, the elv_next_request() returns NULL, but the 1233 * queue must be restarted, so we plug here if no returning 1234 * command will automatically do that. */ 1235 if (sdev->device_busy == 0) 1236 blk_plug_device(q); 1237 return BLKPREP_DEFER; 1238 kill: 1239 req->errors = DID_NO_CONNECT << 16; 1240 return BLKPREP_KILL; 1241} |
大家还记得我们前面使用__make_request函数创建一个request的时候,曾经通过init_request_from_bio(req, bio)初始化请求描述符中的字段。其中把request的设置flags字段中的REQ_CMD标识,说明这次request是一个标准的读或写操作。注意,前面我们并没有设置REQ_BLOCK_PC标识。
所以scsi_prep_fn函数首先会进入1136那个条件分支。1138-1146的代码是对该块设备状态的一个检查,一般不会出什么问题。随后1153行调用scsi_get_command函数给我们这个request对应的scsi_device分配一个scsi_cmnd结构,其地址赋给函数内部变量cmd指针:
struct scsi_cmnd *scsi_get_command(struct scsi_device *dev, gfp_t gfp_mask) { struct scsi_cmnd *cmd;
/* Bail if we can't get a reference to the device */ if (!get_device(&dev->sdev_gendev)) return NULL;
cmd = __scsi_get_command(dev->host, gfp_mask);
if (likely(cmd != NULL)) { unsigned long flags;
memset(cmd, 0, sizeof(*cmd)); cmd->device = dev; init_timer(&cmd->eh_timeout); INIT_LIST_HEAD(&cmd->list); spin_lock_irqsave(&dev->list_lock, flags); list_add_tail(&cmd->list, &dev->cmd_list); spin_unlock_irqrestore(&dev->list_lock, flags); cmd->jiffies_at_alloc = jiffies; } else put_device(&dev->sdev_gendev);
return cmd; }
static struct scsi_cmnd *__scsi_get_command(struct Scsi_Host *shost, gfp_t gfp_mask) { struct scsi_cmnd *cmd;
cmd = kmem_cache_alloc(shost->cmd_pool->slab, gfp_mask | shost->cmd_pool->gfp_mask);
if (unlikely(!cmd)) { unsigned long flags;
spin_lock_irqsave(&shost->free_list_lock, flags); if (likely(!list_empty(&shost->free_list))) { cmd = list_entry(shost->free_list.next, struct scsi_cmnd, list); list_del_init(&cmd->list); } spin_unlock_irqrestore(&shost->free_list_lock, flags); }
return cmd; } |
看不懂这个分配函数的回去好好看一下“scsi设备驱动体系架构”最后那个图,我就不多费口舌了。回到scsi_prep_fn中,1160行把reqest的tag赋给这个全新的scsi_cmnd结构;然后1169、1170行把这个reqest和scsi_cmnd联系起来。随后又进入1179行条件判断,1199行,调用scsi_init_io函数初始化这个scsi_cmnd结构:
static int scsi_init_io(struct scsi_cmnd *cmd) { struct request *req = cmd->request; struct scatterlist *sgpnt; int count;
/* * if this is a rq->data based REQ_BLOCK_PC, setup for a non-sg xfer */ if ((req->flags & REQ_BLOCK_PC) && !req->bio) { cmd->request_bufflen = req->data_len; cmd->request_buffer = req->data; req->buffer = req->data; cmd->use_sg = 0; return 0; }
/* * we used to not use scatter-gather for single segment request, * but now we do (it makes highmem I/O easier to support without * kmapping pages) */ cmd->use_sg = req->nr_phys_segments;
/* * if sg table allocation fails, requeue request later. */ sgpnt = scsi_alloc_sgtable(cmd, GFP_ATOMIC); if (unlikely(!sgpnt)) { scsi_unprep_request(req); return BLKPREP_DEFER; }
cmd->request_buffer = (char *) sgpnt; cmd->request_bufflen = req->nr_sectors << 9; if (blk_pc_request(req)) cmd->request_bufflen = req->data_len; req->buffer = NULL;
/* * Next, walk the list, and fill in the addresses and sizes of * each segment. */ count = blk_rq_map_sg(req->q, req, cmd->request_buffer);
/* * mapped well, send it off */ if (likely(count <= cmd->use_sg)) { cmd->use_sg = count; return 0; }
printk(KERN_ERR "Incorrect number of segments after building list/n"); printk(KERN_ERR "counted %d, received %d/n", count, cmd->use_sg); printk(KERN_ERR "req nr_sec %lu, cur_nr_sec %u/n", req->nr_sectors, req->current_nr_sectors);
/* release the command and kill it */ scsi_release_buffers(cmd); scsi_put_command(cmd); return BLKPREP_KILL; } |
一般情况下,scsi_init_io返回0,否则致命错误,导致scsi_prep_fn退出。继续走,由于我们并没有设置REQ_BLOCK_PC标识,而且req的rq_disk是存在的,gendisk,忘了?那你完了。所以scsi_prep_fn函数来到1217行,执行本函数中最重要的过程,drv->init_command。这个drv是啥?来自gendisk的private_data字段。还记得sd_probe吗?我们在其中把它赋值给了对应scsi_disk结构的driver字段,就是前面那个sd_template常量,别告诉我你又忘了。如果真忘了,那就好好从头开始,从scsi磁盘驱动的初始化函数init_sd开始。
我们知道sd_template常量的init_command指针指向sd_init_command函数地址,所以下面就来看看sd_init_command这个函数,十分重要,来自drivers/scsi/sd.c:
366static int sd_init_command(struct scsi_cmnd * SCpnt) 367{ 368 struct scsi_device *sdp = SCpnt->device; 369 struct request *rq = SCpnt->request; 370 struct gendisk *disk = rq->rq_disk; 371 sector_t block = rq->sector; 372 unsigned int this_count = SCpnt->request_bufflen >> 9; 373 unsigned int timeout = sdp->timeout; 374 375 SCSI_LOG_HLQUEUE(1, printk("sd_init_command: disk=%s, block=%llu, " 376 "count=%d/n", disk->disk_name, 377 (unsigned long long)block, this_count)); 378 379 if (!sdp || !scsi_device_online(sdp) || 380 block + rq->nr_sectors > get_capacity(disk)) { 381 SCSI_LOG_HLQUEUE(2, printk("Finishing %ld sectors/n", 382 rq->nr_sectors)); 383 SCSI_LOG_HLQUEUE(2, printk("Retry with 0x%p/n", SCpnt)); 384 return 0; 385 } 386 387 if (sdp->changed) { 388 /* 389 * quietly refuse to do anything to a changed disc until 390 * the changed bit has been reset 391 */ 392 /* printk("SCSI disk has been changed. Prohibiting further I/O./n"); */ 393 return 0; 394 } 395 SCSI_LOG_HLQUEUE(2, printk("%s : block=%llu/n", 396 disk->disk_name, (unsigned long long)block)); 397 398 /* 399 * If we have a 1K hardware sectorsize, prevent access to single 400 * 512 byte sectors. In theory we could handle this - in fact 401 * the scsi cdrom driver must be able to handle this because 402 * we typically use 1K blocksizes, and cdroms typically have 403 * 2K hardware sectorsizes. Of course, things are simpler 404 * with the cdrom, since it is read-only. For performance 405 * reasons, the filesystems should be able to handle this 406 * and not force the scsi disk driver to use bounce buffers 407 * for this. 408 */ 409 if (sdp->sector_size == 1024) { 410 if ((block & 1) || (rq->nr_sectors & 1)) { 411 printk(KERN_ERR "sd: Bad block number requested"); 412 return 0; 413 } else { 414 block = block >> 1; 415 this_count = this_count >> 1; 416 } 417 } 418 if (sdp->sector_size == 2048) { 419 if ((block & 3) || (rq->nr_sectors & 3)) { 420 printk(KERN_ERR "sd: Bad block number requested"); 421 return 0; 422 } else { 423 block = block >> 2; 424 this_count = this_count >> 2; 425 } 426 } 427 if (sdp->sector_size == 4096) { 428 if ((block & 7) || (rq->nr_sectors & 7)) { 429 printk(KERN_ERR "sd: Bad block number requested"); 430 return 0; 431 } else { 432 block = block >> 3; 433 this_count = this_count >> 3; 434 } 435 } 436 if (rq_data_dir(rq) == WRITE) { 437 if (!sdp->writeable) { 438 return 0; 439 } 440 SCpnt->cmnd[0] = WRITE_6; 441 SCpnt->sc_data_direction = DMA_TO_DEVICE; 442 } else if (rq_data_dir(rq) == READ) { 443 SCpnt->cmnd[0] = READ_6; 444 SCpnt->sc_data_direction = DMA_FROM_DEVICE; 445 } else { 446 printk(KERN_ERR "sd: Unknown command %lx/n", rq->flags); 447/* overkill panic("Unknown sd command %lx/n", rq->flags); */ 448 return 0; 449 } 450 451 SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks./n", 452 disk->disk_name, (rq_data_dir(rq) == WRITE) ? 453 "writing" : "reading", this_count, rq->nr_sectors)); 454 455 SCpnt->cmnd[1] = 0; 456 457 if (block > 0xffffffff) { 458 SCpnt->cmnd[0] += READ_16 - READ_6; 459 SCpnt->cmnd[1] |= blk_fua_rq(rq) ? 0x8 : 0; 460 SCpnt->cmnd[2] = sizeof(block) > 4 ? (unsigned char) (block >> 56) & 0xff : 0; 461 SCpnt->cmnd[3] = sizeof(block) > 4 ? (unsigned char) (block >> 48) & 0xff : 0; 462 SCpnt->cmnd[4] = sizeof(block) > 4 ? (unsigned char) (block >> 40) & 0xff : 0; 463 SCpnt->cmnd[5] = sizeof(block) > 4 ? (unsigned char) (block >> 32) & 0xff : 0; 464 SCpnt->cmnd[6] = (unsigned char) (block >> 24) & 0xff; 465 SCpnt->cmnd[7] = (unsigned char) (block >> 16) & 0xff; 466 SCpnt->cmnd[8] = (unsigned char) (block >> 8) & 0xff; 467 SCpnt->cmnd[9] = (unsigned char) block & 0xff; 468 SCpnt->cmnd[10] = (unsigned char) (this_count >> 24) & 0xff; 469 SCpnt->cmnd[11] = (unsigned char) (this_count >> 16) & 0xff; 470 SCpnt->cmnd[12] = (unsigned char) (this_count >> 8) & 0xff; 471 SCpnt->cmnd[13] = (unsigned char) this_count & 0xff; 472 SCpnt->cmnd[14] = SCpnt->cmnd[15] = 0; 473 } else if ((this_count > 0xff) || (block > 0x1fffff) || 474 SCpnt->device->use_10_for_rw) { 475 if (this_count > 0xffff) 476 this_count = 0xffff; 477 478 SCpnt->cmnd[0] += READ_10 - READ_6; 479 SCpnt->cmnd[1] |= blk_fua_rq(rq) ? 0x8 : 0; 480 SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff; 481 SCpnt->cmnd[3] = (unsigned char) (block >> 16) & 0xff; 482 SCpnt->cmnd[4] = (unsigned char) (block >> 8) & 0xff; 483 SCpnt->cmnd[5] = (unsigned char) block & 0xff; 484 SCpnt->cmnd[6] = SCpnt->cmnd[9] = 0; 485 SCpnt->cmnd[7] = (unsigned char) (this_count >> 8) & 0xff; 486 SCpnt->cmnd[8] = (unsigned char) this_count & 0xff; 487 } else { 488 if (unlikely(blk_fua_rq(rq))) { 489 /* 490 * This happens only if this drive failed 491 * 10byte rw command with ILLEGAL_REQUEST 492 * during operation and thus turned off 493 * use_10_for_rw. 494 */ 495 printk(KERN_ERR "sd: FUA write on READ/WRITE(6) drive/n"); 496 return 0; 497 } 498 499 SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f); 500 SCpnt->cmnd[2] = (unsigned char) ((block >> 8) & 0xff); 501 SCpnt->cmnd[3] = (unsigned char) block & 0xff; 502 SCpnt->cmnd[4] = (unsigned char) this_count; 503 SCpnt->cmnd[5] = 0; 504 } 505 SCpnt->request_bufflen = this_count * sdp->sector_size; 506 507 /* 508 * We shouldn't disconnect in the middle of a sector, so with a dumb 509 * host adapter, it's safe to assume that we can at least transfer 510 * this many bytes between each connect / disconnect. 511 */ 512 SCpnt->transfersize = sdp->sector_size; 513 SCpnt->underflow = this_count << 9; 514 SCpnt->allowed = SD_MAX_RETRIES; 515 SCpnt->timeout_per_command = timeout; 516 517 /* 518 * This is the completion routine we use. This is matched in terms 519 * of capability to this function. 520 */ 521 SCpnt->done = sd_rw_intr; 522 523 /* 524 * This indicates that the command is ready from our end to be 525 * queued. 526 */ 527 return 1; 528} |
这个函数很重要,看似也很长,但是对照着前面scsi块设备驱动体系架构仔细看看,就会发现其实代码虽多,但很好理解。379~394检查一下磁盘状态,正常的话就不进入相应的条件分支。409~435行,根据扇区大小对内部变量block和this_count进行调整,其中block表示将要对磁盘读写的起始扇区号,this_count表示将要读入scsi_cmnd对应的那个缓冲区的字节数。这个缓冲区是通过前面scsi_init_io函数调用scsi_alloc_sgtable获得的,感兴趣的同学可以深入研究一下。
继续走,436行,通过rq_data_dir宏获得request的传输方向:
#define rq_data_dir(rq) ((rq)->flags & 1)
如果是WRITE就把scsi命令设置成WRITE_6,否则设置成READ_6。457-478是针对有些磁盘的大扇区的处理,我们略过,然后499-503初始化CDB的其他字段,大家可以对照“scsi设备驱动体系架构”中CDB的格式来分析这些代码的意思。最后,sd_init_command函数初始化scsi_cmnd的其他字段,并返回到scsi_prep_fn函数中。由于sd_init_command返回的是1,最终,正常的话,scsi_prep_fn函数返回BLKPREP_OK。prep表示prepare的意思,用我们的母语说就是准备的意思,最后BLKPREP_OK就说明准备好了,或者说准备就绪。而scsi_prep_fn()也将返回这个值,返回之前还设置了cmd_flags中的REQ_DONTPREP。(注意elv_next_request()函数741行判断的就是设这个flag。)
回到elv_next_request()中,由于返回值是BLKPREP_OK,所以746行我们就break了。换言之,我们取到了一个request,我们为之准备好了scsi命令,我们下一步就该是执行这个命令了。所以我们不需要再在elv_next_request()中滞留。我们终于回到了scsi_request_fn(),结束了elv_next_request,又要看下一个,不只是一个,而是两个,1467行,一个宏加一个函数,宏是blk_queue_tagged,来自include/linux/blkdev.h:
#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
而函数是blk_queue_start_tag,来自block/ll_rw_blk.c:
1122 int blk_queue_start_tag(request_queue_t *q, struct request *rq) 1123 { 1124 struct blk_queue_tag *bqt = q->queue_tags; 1125 int tag; 1126 1127 if (unlikely((rq->cmd_flags & REQ_QUEUED))) { 1128 printk(KERN_ERR 1129 "%s: request %p for device [%s] already tagged %d", 1130 __FUNCTION__, rq, 1131 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); 1132 BUG(); 1133 } 1134 1135 /* 1136 * Protect against shared tag maps, as we may not have exclusive 1137 * access to the tag map. 1138 */ 1139 do { 1140 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); 1141 if (tag >= bqt->max_depth) 1142 return 1; 1143 1144 } while (test_and_set_bit(tag, bqt->tag_map)); 1145 1146 rq->cmd_flags |= REQ_QUEUED; 1147 rq->tag = tag; 1148 bqt->tag_index[tag] = rq; 1149 blkdev_dequeue_request(rq); 1150 list_add(&rq->queuelist, &bqt->busy_list); 1151 bqt->busy++; 1152 return 0; 1153 } |
对于我们大多数人来说,这两个函数的返回值都是0。
也因此,下一个函数blkdev_dequeue_request()就会被执行。来自include/linux/blkdev.h:
725 static inline void blkdev_dequeue_request(struct request *req) 726 { 727 elv_dequeue_request(req->q, req); 728 } |
而elv_dequeue_request来自block/elevator.c:
778 void elv_dequeue_request(request_queue_t *q, struct request *rq) 779 { 780 BUG_ON(list_empty(&rq->queuelist)); 781 BUG_ON(ELV_ON_HASH(rq)); 782 783 list_del_init(&rq->queuelist); 784 785 /* 786 * the time frame between a request being removed from the lists 787 * and to it is freed is accounted as io that is in progress at 788 * the driver side. 789 */ 790 if (blk_account_rq(rq)) 791 q->in_flight++; 792 } |
现在这个社会就是利用与被利用的关系,既然这个request已经没有了利用价值,我们已经从它身上得到了我们想要的scsi命令,那么我们完全可以过河拆桥卸磨杀驴了。list_del_init把这个request从request queue队列里删除掉。
而下面这个blk_account_rq也是一个来自include/linux/blkdev.h的宏:
536 #define blk_account_rq(rq) (blk_rq_started(rq) && blk_fs_request(rq))
很显然,至少第二个条件我们是不满足的。所以不用多说,结束这个elv_dequeue_request。
现在是时候去执行scsi命令了,回到scsi_request_fn函数中elv_next_request执行完毕之后,req的special就存放对scsi硬件设备发出“特殊”命令的请求所使用的数据的指针,1472行,把它赋给内部 scsi_cmnd型变量cmd。然后1508行调用scsi_dispatch_cmd函数执行这个cmd。
整个块设备驱动层的处理就结束了,我还是在网上找到一个图,正好可以总结上面的过程:
从前面分析可以看出,请求队列queue是top level与middle level之间的纽带。上层请求会在请求队列中维护,处理函数的方法由上下各层提供。在请求队列的处理过程中,将普通的块设备请求转换成标准的scsi命令,然后再通过middle level与low level之间的接口将请求递交给scsi host。