Linux那些事儿之我是Block层(7)scsi命令的前世今生(一)

现在我们块设备也有了,队列也有了,要提交请求也就可以开始提交了.那就让我们来研究一下如何提交请求如何处理请求吧.不过哥们儿有言在先,出错处理的那些乱七八糟的代码咱们就不理睬了.

仍然以scsi磁盘举例,最初scsi这边发送的是scsi命令,可是从block走就得变成request,然而走到usb-storage那边又得变回scsi命令.换言之,这整个过程scsi命令要变两次身.

首先让我们从sd那边很常用的一个函数开始,我们来看scsi命令是如何在光天化日之下被偷梁换柱的变成了request,这个函数就是scsi_execute_req().来自drivers/scsi/scsi_lib.c:

    216 int scsi_execute_req(struct scsi_device *sdev, const unsigned char *cmd,

    217                      int data_direction, void *buffer, unsigned bufflen,

    218                      struct scsi_sense_hdr *sshdr, int timeout, int retries)

    219 {

    220         char *sense = NULL;

    221         int result;

    222

    223         if (sshdr) {

    224                 sense = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_NOIO);

    225                 if (!sense)

    226                         return DRIVER_ERROR << 24;

    227         }

    228         result = scsi_execute(sdev, cmd, data_direction, buffer, bufflen,

    229                               sense, timeout, retries, 0);

    230         if (sshdr)

    231                 scsi_normalize_sense(sense, SCSI_SENSE_BUFFERSIZE, sshdr);

    232

    233         kfree(sense);

    234         return result;

    235 }

这里面最需要关注的就是一个函数,scsi_execute(),来自同一个文件.

    164 /**

    165  * scsi_execute - insert request and wait for the result

    166  * @sdev:       scsi device

    167  * @cmd:        scsi command

    168  * @data_direction: data direction

    169  * @buffer:     data buffer

    170  * @bufflen:    len of buffer

    171  * @sense:      optional sense buffer

    172  * @timeout:    request timeout in seconds

    173  * @retries:    number of times to retry request

    174  * @flags:      or into request flags;

    175  *

    176  * returns the req->errors value which is the scsi_cmnd result

    177  * field.

    178  **/

    179 int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,

    180                  int data_direction, void *buffer, unsigned bufflen,

    181                  unsigned char *sense, int timeout, int retries, int flags)

    182 {

    183         struct request *req;

    184         int write = (data_direction == DMA_TO_DEVICE);

    185         int ret = DRIVER_ERROR << 24;

    186

    187         req = blk_get_request(sdev->request_queue, write, __GFP_WAIT);

    188

    189         if (bufflen &&  blk_rq_map_kern(sdev->request_queue, req,

    190                                         buffer, bufflen, __GFP_WAIT))

    191                 goto out;

    192

    193         req->cmd_len = COMMAND_SIZE(cmd[0]);

    194         memcpy(req->cmd, cmd, req->cmd_len);

    195         req->sense = sense;

    196         req->sense_len = 0;

    197         req->retries = retries;

    198         req->timeout = timeout;

    199         req->cmd_type = REQ_TYPE_BLOCK_PC;

    200         req->cmd_flags |= flags | REQ_QUIET | REQ_PREEMPT;

    201

    202         /*

    203          * head injection *required* here otherwise quiesce won't work

    204          */

    205         blk_execute_rq(req->q, NULL, req, 1);

    206

    207         ret = req->errors;

    208  out:

    209         blk_put_request(req);

    210

    211         return ret;

    212 }

首先被调用的是blk_get_request.来自block/ll_rw_blk.c:

   2215 struct request *blk_get_request(request_queue_t *q, int rw, gfp_t gfp_mask)

   2216 {

   2217         struct request *rq;

   2218

   2219         BUG_ON(rw != READ && rw != WRITE);

   2220

   2221         spin_lock_irq(q->queue_lock);

   2222         if (gfp_mask & __GFP_WAIT) {

   2223                 rq = get_request_wait(q, rw, NULL);

   2224         } else {

   2225                 rq = get_request(q, rw, NULL, gfp_mask);

   2226                 if (!rq)

   2227                         spin_unlock_irq(q->queue_lock);

   2228         }

   2229         /* q->queue_lock is unlocked at this point */

   2230

   2231         return rq;

   2232 }

注意到我们调用这个函数的时候,第三个参数(gfp_mask)确实是__GFP_WAIT.所以2223行会被执行.get_request_wait()来自同一个文件:

   2173 static struct request *get_request_wait(request_queue_t *q, int rw_flags,

   2174                                         struct bio *bio)

   2175 {

   2176         const int rw = rw_flags & 0x01;

   2177         struct request *rq;

   2178

   2179         rq = get_request(q, rw_flags, bio, GFP_NOIO);

   2180         while (!rq) {

   2181                 DEFINE_WAIT(wait);

   2182                 struct request_list *rl = &q->rq;

   2183

   2184                 prepare_to_wait_exclusive(&rl->wait[rw], &wait,

   2185                                 TASK_UNINTERRUPTIBLE);

   2186

   2187                 rq = get_request(q, rw_flags, bio, GFP_NOIO);

   2188

   2189                 if (!rq) {

   2190                         struct io_context *ioc;

   2191

   2192                         blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);

   2193

   2194                         __generic_unplug_device(q);

   2195                         spin_unlock_irq(q->queue_lock);

   2196                         io_schedule();

   2197

   2198                         /*

   2199                          * After sleeping, we become a "batching" process and

   2200                          * will be able to allocate at least one request, and

   2201                          * up to a big batch of them for a small period time.

   2202                          * See ioc_batching, ioc_set_batching

   2203                          */

   2204                         ioc = current_io_context(GFP_NOIO, q->node);

   2205                         ioc_set_batching(q, ioc);

   2206

   2207                         spin_lock_irq(q->queue_lock);

   2208                 }

   2209                 finish_wait(&rl->wait[rw], &wait);

   2210         }

   2211

   2212         return rq;

   2213 }

而真正被调用的又是get_request(),仍然是来自同一个文件.

   2063 /*

   2064  * Get a free request, queue_lock must be held.

   2065  * Returns NULL on failure, with queue_lock held.

   2066  * Returns !NULL on success, with queue_lock *not held*.

   2067  */

   2068 static struct request *get_request(request_queue_t *q, int rw_flags,

   2069                                    struct bio *bio, gfp_t gfp_mask)

   2070 {

   2071         struct request *rq = NULL;

   2072         struct request_list *rl = &q->rq;

   2073         struct io_context *ioc = NULL;

   2074         const int rw = rw_flags & 0x01;

   2075         int may_queue, priv;

   2076

   2077         may_queue = elv_may_queue(q, rw_flags);

   2078         if (may_queue == ELV_MQUEUE_NO)

   2079                 goto rq_starved;

   2080

   2081         if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {

   2082                 if (rl->count[rw]+1 >= q->nr_requests) {

   2083                         ioc = current_io_context(GFP_ATOMIC, q->node);

   2084                         /*

   2085                          * The queue will fill after this allocation, so set

   2086                          * it as full, and mark this process as "batching".

   2087                          * This process will be allowed to complete a batch of

   2088                          * requests, others will be blocked.

   2089                          */

   2090                         if (!blk_queue_full(q, rw)) {

   2091                                 ioc_set_batching(q, ioc);

   2092                                 blk_set_queue_full(q, rw);

   2093                         } else {

   2094                                 if (may_queue != ELV_MQUEUE_MUST

   2095                                                 && !ioc_batching(q, ioc)) {

   2096                                         /*

   2097                                          * The queue is full and the allocating

   2098                                          * process is not a "batcher", and not

   2099                                          * exempted by the IO scheduler

   2100                                          */

   2101                                         goto out;

   2102                                 }

   2103                         }

   2104                 }

   2105                 blk_set_queue_congested(q, rw);

   2106         }

   2107

   2108         /*

   2109          * Only allow batching queuers to allocate up to 50% over the defined

   2110          * limit of requests, otherwise we could have thousands of requests

   2111          * allocated with any setting of ->nr_requests

   2112          */

   2113         if (rl->count[rw] >= (3 * q->nr_requests / 2))

   2114                 goto out;

   2115

   2116         rl->count[rw]++;

   2117         rl->starved[rw] = 0;

   2118

   2119         priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);

   2120         if (priv)

   2121                 rl->elvpriv++;

   2122

   2123         spin_unlock_irq(q->queue_lock);

   2124

   2125         rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);

   2126         if (unlikely(!rq)) {

   2127                 /*

   2128                  * Allocation failed presumably due to memory. Undo anything

   2129                  * we might have messed up.

   2130                  *

   2131                  * Allocating task should really be put onto the front of the

   2132                  * wait queue, but this is pretty rare.

   2133                  */

   2134                 spin_lock_irq(q->queue_lock);

   2135                 freed_request(q, rw, priv);

   2136

   2137                 /*

   2138                  * in the very unlikely event that allocation failed and no

   2139                  * requests for this direction was pending, mark us starved

   2140                  * so that freeing of a request in the other direction will

   2141                  * notice us. another possible fix would be to split the

   2142                  * rq mempool into READ and WRITE

   2143                  */

   2144 rq_starved:

   2145                 if (unlikely(rl->count[rw] == 0))

   2146                         rl->starved[rw] = 1;

   2147

   2148                 goto out;

   2149         }

   2150

   2151         /*

   2152          * ioc may be NULL here, and ioc_batching will be false. That's

   2153          * OK, if the queue is under the request limit then requests need

   2154          * not count toward the nr_batch_requests limit. There will always

   2155          * be some limit enforced by BLK_BATCH_TIME.

   2156          */

   2157         if (ioc_batching(q, ioc))

   2158                 ioc->nr_batch_requests--;

   2159

   2160         rq_init(q, rq);

   2161

   2162         blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);

   2163 out:

   2164         return rq;

   2165 }

这个elv_may_queue来自block/elevator.c:

    848 int elv_may_queue(request_queue_t *q, int rw)

    849 {

    850         elevator_t *e = q->elevator;

    851

    852         if (e->ops->elevator_may_queue_fn)

    853                 return e->ops->elevator_may_queue_fn(q, rw);

    854

    855         return ELV_MQUEUE_MAY;

    856 }

属于我们的那个elevator_t结构体变量是当初我们在elevator_init()中调用elevator_alloc()申请的.它的ops显然是和具体我们采用了哪种电梯有关系的.这里我们为了简便起见,做一个最不要脸的选择,选择“noop”,这种最简单最原始的机制.再一次贴出它的elevator_type.

     87 static struct elevator_type elevator_noop = {

     88         .ops = {

     89                 .elevator_merge_req_fn          = noop_merged_requests,

     90                 .elevator_dispatch_fn           = noop_dispatch,

     91                 .elevator_add_req_fn            = noop_add_request,

     92                 .elevator_queue_empty_fn        = noop_queue_empty,

     93                 .elevator_former_req_fn         = noop_former_request,

     94                 .elevator_latter_req_fn         = noop_latter_request,

     95                 .elevator_init_fn               = noop_init_queue,

     96                 .elevator_exit_fn               = noop_exit_queue,

     97         },

     98         .elevator_name = "noop",

     99         .elevator_owner = THIS_MODULE,

    100 };

是不是觉得很开心. 对于我们选择的这种noop的电梯,elevator_may_queue_fn根本就没有定义哎.虽然我们这样做很无耻,但是谁叫我们不幸生在现在的中国呢?只要我们够作践,够胆大,够无耻,够疯狂,所谓的道德底线不是“大底”,重心可以下移,完全有向下突破的机会.

带着一个返回值ELV_MQUEUE_MAY,我们返回到get_request()中来.rl又是什么呢?2072行我们让它指向了q->rq.在这样一个危急关头,我不得不搬出一个复杂的结构体了,它就是request_queue,或者叫request_queue_t,定义于include/linux/blkdev.h:

     38 struct request_queue;

     39 typedef struct request_queue request_queue_t;

    360 struct request_queue

    361 {

    362         /*

    363          * Together with queue_head for cacheline sharing

    364          */

    365         struct list_head        queue_head;

    366         struct request          *last_merge;

    367         elevator_t              *elevator;

    368

    369         /*

    370          * the queue request freelist, one for reads and one for writes

    371          */

    372         struct request_list     rq;

    373

    374         request_fn_proc         *request_fn;

    375         make_request_fn         *make_request_fn;

    376         prep_rq_fn              *prep_rq_fn;

    377         unplug_fn               *unplug_fn;

    378         merge_bvec_fn           *merge_bvec_fn;

    379         issue_flush_fn          *issue_flush_fn;

    380         prepare_flush_fn        *prepare_flush_fn;

    381         softirq_done_fn         *softirq_done_fn;

    382

    383         /*

    384          * Dispatch queue sorting

    385          */

    386         sector_t                end_sector;

    387         struct request          *boundary_rq;

    388

    389         /*

    390          * Auto-unplugging state

    391          */

    392         struct timer_list       unplug_timer;

    393         int                     unplug_thresh;  /* After this many requests */

    394         unsigned long           unplug_delay;   /* After this many jiffies */

    395         struct work_struct      unplug_work;

    396

    397         struct backing_dev_info backing_dev_info;

    398

    399         /*

    400          * The queue owner gets to use this for whatever they like.

    401          * ll_rw_blk doesn't touch it.

    402          */

    403         void                    *queuedata;

    404

    405         /*

    406          * queue needs bounce pages for pages above this limit

    407          */

    408         unsigned long           bounce_pfn;

    409         gfp_t                   bounce_gfp;

    410

    411         /*

    412          * various queue flags, see QUEUE_* below

    413          */

    414         unsigned long           queue_flags;

    415

    416         /*

    417          * protects queue structures from reentrancy. ->__queue_lock should

    418          * _never_ be used directly, it is queue private. always use

    419          * ->queue_lock.

    420          */

    421         spinlock_t              __queue_lock;

    422         spinlock_t              *queue_lock;

    423

    424         /*

    425          * queue kobject

    426          */

    427         struct kobject kobj;

    428

    429         /*

    430          * queue settings

    431          */

    432         unsigned long           nr_requests;    /* Max # of requests */

    433         unsigned int            nr_congestion_on;

    434         unsigned int            nr_congestion_off;

    435         unsigned int            nr_batching;

    436

    437         unsigned int            max_sectors;

    438         unsigned int            max_hw_sectors;

    439         unsigned short          max_phys_segments;

    440         unsigned short          max_hw_segments;

    441         unsigned short          hardsect_size;

    442         unsigned int            max_segment_size;

    443

    444         unsigned long           seg_boundary_mask;

    445         unsigned int            dma_alignment;

    446

    447         struct blk_queue_tag    *queue_tags;

    448

    449         unsigned int            nr_sorted;

    450         unsigned int            in_flight;

    451

    452         /*

    453          * sg stuff

    454          */

    455         unsigned int            sg_timeout;

    456         unsigned int            sg_reserved_size;

    457         int                     node;

    458 #ifdef CONFIG_BLK_DEV_IO_TRACE

    459         struct blk_trace        *blk_trace;

    460 #endif

    461         /*

    462          * reserved for flush operations

    463          */

    464         unsigned int            ordered, next_ordered, ordseq;

    465         int                     orderr, ordcolor;

    466         struct request          pre_flush_rq, bar_rq, post_flush_rq;

    467         struct request          *orig_bar_rq;

    468         unsigned int            bi_size;

    469

    470         struct mutex            sysfs_lock;

    471 };

这里我们看到了rq其实是struct request_list结构体变量.这个结构体定义于同一个文件.

    131 struct request_list {

    132         int count[2];

    133         int starved[2];

    134         int elvpriv;

    135         mempool_t *rq_pool;

    136         wait_queue_head_t wait[2];

    137 };

不过这些我们现在都不想看,我们想看的只有其中的几个函数,第一个是2125行的blk_alloc_request().来自ll_rw_blk.c:

   1970 static struct request *

   1971 blk_alloc_request(request_queue_t *q, int rw, int priv, gfp_t gfp_mask)

   1972 {

   1973         struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);

   1974

   1975         if (!rq)

   1976                 return NULL;

   1977

   1978         /*

   1979          * first three bits are identical in rq->cmd_flags and bio->bi_rw,

   1980          * see bio.h and blkdev.h

   1981          */

   1982         rq->cmd_flags = rw | REQ_ALLOCED;

   1983

   1984         if (priv) {

   1985                 if (unlikely(elv_set_request(q, rq, gfp_mask))) {

   1986                         mempool_free(rq, q->rq.rq_pool);

   1987                         return NULL;

   1988                 }

   1989                 rq->cmd_flags |= REQ_ELVPRIV;

   1990         }

   1991

   1992         return rq;

   1993 }

其它我们不懂没有关系,至少我们从1973行可以看出这里申请了一个struct request的结构体指针,换句话说,此前,我们已经有了请求队列,但是没有实质性的元素,从这一刻起,我们有了一个真正的request.虽然现在还没有进入到队伍中去,但这只是早晚的事儿了.

下一个rq_init().

    238 static void rq_init(request_queue_t *q, struct request *rq)

    239 {

    240         INIT_LIST_HEAD(&rq->queuelist);

    241         INIT_LIST_HEAD(&rq->donelist);

    242

    243         rq->errors = 0;

    244         rq->bio = rq->biotail = NULL;

    245         INIT_HLIST_NODE(&rq->hash);

    246         RB_CLEAR_NODE(&rq->rb_node);

    247         rq->ioprio = 0;

    248         rq->buffer = NULL;

    249         rq->ref_count = 1;

    250         rq->q = q;

    251         rq->special = NULL;

    252         rq->data_len = 0;

    253         rq->data = NULL;

    254         rq->nr_phys_segments = 0;

    255         rq->sense = NULL;

    256         rq->end_io = NULL;

    257         rq->end_io_data = NULL;

    258         rq->completion_data = NULL;

    259 }

这个函数在干什么不用我说,浦东金杨新村卖麻辣烫的大妈都知道,对刚申请的rq进行初始化.

然后,get_request()就开开心心的返回了,正常情况下,get_request_wait()也会跟着返回,再接着,blk_get_request()也就返回了.我们也带着申请好初始化好的req回到scsi_execute()中去,而接下来一段代码就是我们最关心的,req的真正的赋值.比如req->cmd_len,req->cmd等等,就是这样被赋上的.换言之,我们的scsi命令就是这样被request拖下水的,从此它们之间不再是以前那种水留不住落花的漂泊,落花走不进水的世界的关系,而是沦落到了一荣俱荣一损俱损狼狈为奸的关系.

至此,完成了第一次变身,scsi命令到request的变身.

 

你可能感兴趣的:(linux,struct,IOC,null,buffer,Allocation)