硬盘的每次读写都会牵涉到DMA的过程,而文件系统对硬盘的I/O请求不是连续的,数据所在的物理内存页也是不连续的,如果能够将这些不连续的内存页组合到一起,再启用DMA操作,那么这些数据就能够一次传输完成,这样也就能高效的传输数据。以Silicon Image 3114为例,可以将不连续的物理内存页和该页的长度组合放到physical region descriptor table里,physical region descriptor table 结构如下:
Before the controller starts a master transfer it is given a pointer to a Physical Region Descriptor Table. This table contains some number of Physical Region Descriptors (PRD) which describe areas of memory that are involved in the data transfer. The descriptor table must be aligned on a four byte boundary and the table cannot cross a 64K boundary in memory. The physical memory region to be transferred is described by a Physical Region Descriptor (PRD). The data transfer will proceed until all regions described by the PRDs in the table have been transferred. Each Physical Region Descriptor entry is 8 bytes in length. The first 4 bytes specify the byte address of a physical memory region. The next two bytes specify the count of the region in bytes (64K byte limit per region). A value of zero in these two bytes indicates 64K. Bit 7 of the last byte indicates the end of the table; bus master operation terminates when the last descriptor has been retired.
从上面的描述可以得知:要进行数据的传输,必须将相应的物理页地址以及该物理页内数据的长度填充到PRD里面。
首先在驱动初始化的时候会为每个port分配相应的DMA总线地址,该地址指向PRD table。
/*
 * Per-port init: allocate the PRD (Physical Region Descriptor) table used
 * for bus-master DMA.  dmam_alloc_coherent() returns the table's kernel
 * virtual address (ap->prd) and its bus address (ap->prd_dma); the bus
 * address is later programmed into the controller.
 */
int ata_port_start(struct ata_port *ap)
{
	struct device *dev = ap->dev;
	int rc;

	/* coherent (non-cached/consistent) buffer usable for DMA */
	ap->prd = dmam_alloc_coherent(dev, ATA_PRD_TBL_SZ, &ap->prd_dma,
				      GFP_KERNEL);
	if (!ap->prd)
		return -ENOMEM;

	/* per-port pad buffer for transfers not ending on a 32-bit boundary */
	rc = ata_pad_alloc(ap, dev);
	if (rc)
		return rc;

	DPRINTK("prd alloc, virt %p, dma %llx\n", ap->prd,
		(unsigned long long)ap->prd_dma);
	return 0;
}
其中分配函数的原型为:
void *dmam_alloc_coherent(struct device *dev, size_t size,
                          dma_addr_t *dma_handle, gfp_t gfp);
该函数处理了缓冲区的分配和映射,返回值是缓冲区的内核虚拟地址,被驱动程序使用,dma_handle 保存了相关的总线地址,该函数对分配的缓冲区做了一些处理,从而缓冲区可以用于DMA;
该函数具体的调用在ata_host_start里面
/*
 * Fragment of ata_host_start(): invoke ->port_start() (which allocates
 * the PRD table, see ata_port_start above) once for every port on the
 * controller.
 */
for (i = 0; i < host->n_ports; i++) {
	struct ata_port *ap = host->ports[i];

	if (ap->ops->port_start) {
		rc = ap->ops->port_start(ap);
		if (rc) {
			ata_port_printk(ap, KERN_ERR, "failed to "
					"start port (errno=%d)\n", rc);
			goto err_out;
		}
	}
	/* keep error handling frozen until the host is fully started */
	ata_eh_freeze_port(ap);
}
可以看出sata控制器有多少个端口就会分配多少次,最终获得的dma总线地址会在dma建立的过程中写到sata控制器相应的寄存器里面;
/*
 * Program the bus-master DMA engine for a queued command:
 *  - write the PRD table bus address into the BMDMA table register,
 *  - set the transfer direction with the start bit still clear,
 *  - issue the ATA read/write command to the device.
 */
void ata_bmdma_setup(struct ata_queued_cmd *qc)
{
	struct ata_port *ap = qc->ap;
	unsigned int rw = (qc->tf.flags & ATA_TFLAG_WRITE);
	u8 dmactl;

	/* load PRD table addr. */
	mb();	/* make sure PRD table writes are visible to controller */
	iowrite32(ap->prd_dma, ap->ioaddr.bmdma_addr + ATA_DMA_TABLE_OFS);

	/* specify data direction, triple-check start bit is clear */
	dmactl = ioread8(ap->ioaddr.bmdma_addr + ATA_DMA_CMD);
	dmactl &= ~(ATA_DMA_WR | ATA_DMA_START);
	/* ATA_DMA_WR = "write to memory", i.e. a device-to-host read */
	if (!rw)
		dmactl |= ATA_DMA_WR;
	iowrite8(dmactl, ap->ioaddr.bmdma_addr + ATA_DMA_CMD);

	/* issue r/w command */
	ap->ops->exec_command(ap, &qc->tf);
}
文件系统需要写到硬盘的数据保存在page里面,那么这个过程又是怎么和DMA建立关系的呢?scsi层的scsi_init_io函数即是做了这些工作,它将bio下相应的page转换到struct scatterlist的page里,具体的过程如下:
在scsi层将请求转换为scsi命令函数里,对应的请求类型是REQ_TYPE_FS;
/*
 * Block-layer prep callback: validate the device state and turn the
 * request into a SCSI command.  Filesystem requests (REQ_TYPE_FS) are
 * routed to scsi_setup_fs_cmnd() below.
 */
static int scsi_prep_fn(struct request_queue *q, struct request *req)
{
	struct scsi_device *sdev = q->queuedata;
	int ret = BLKPREP_OK;

	/*
	 * If the device is not in running state we will reject some
	 * or all commands.
	 */
	if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
		switch (sdev->sdev_state) {
		case SDEV_OFFLINE:
			/*
			 * If the device is offline we refuse to process any
			 * commands. The device must be brought online
			 * before trying any recovery commands.
			 */
			sdev_printk(KERN_ERR, sdev,
				    "rejecting I/O to offline device\n");
			ret = BLKPREP_KILL;
			break;
		case SDEV_DEL:
			/*
			 * If the device is fully deleted, we refuse to
			 * process any commands as well.
			 */
			sdev_printk(KERN_ERR, sdev,
				    "rejecting I/O to dead device\n");
			ret = BLKPREP_KILL;
			break;
		case SDEV_QUIESCE:
		case SDEV_BLOCK:
			/*
			 * If the devices is blocked we defer normal commands.
			 */
			if (!(req->cmd_flags & REQ_PREEMPT))
				ret = BLKPREP_DEFER;
			break;
		default:
			/*
			 * For any other not fully online state we only allow
			 * special commands. In particular any user initiated
			 * command is not allowed.
			 */
			if (!(req->cmd_flags & REQ_PREEMPT))
				ret = BLKPREP_KILL;
			break;
		}

		if (ret != BLKPREP_OK)
			goto out;
	}

	switch (req->cmd_type) {
	case REQ_TYPE_BLOCK_PC:
		ret = scsi_setup_blk_pc_cmnd(sdev, req);
		break;
	case REQ_TYPE_FS:
		/* build the SCSI command for a filesystem request */
		ret = scsi_setup_fs_cmnd(sdev, req);
		break;
	default:
		/*
		 * All other command types are not supported.
		 *
		 * Note that these days the SCSI subsystem does not use
		 * REQ_TYPE_SPECIAL requests anymore.  These are only used
		 * (directly or via blk_insert_request) by non-SCSI drivers.
		 */
		blk_dump_rq_flags(req, "SCSI bad req");
		ret = BLKPREP_KILL;
		break;
	}

 out:
	switch (ret) {
	case BLKPREP_KILL:
		req->errors = DID_NO_CONNECT << 16;
		break;
	case BLKPREP_DEFER:
		/*
		 * If we defer, the elv_next_request() returns NULL, but the
		 * queue must be restarted, so we plug here if no returning
		 * command will automatically do that.
		 */
		if (sdev->device_busy == 0)
			blk_plug_device(q);
		break;
	default:
		req->cmd_flags |= REQ_DONTPREP;
	}

	return ret;
}

/*
 * Build the SCSI command for a filesystem (REQ_TYPE_FS) request:
 * allocate a scsi_cmnd, set up its scatter-gather buffers via
 * scsi_init_io(), then let the upper-level driver fill in the CDB.
 */
static int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
{
	struct scsi_cmnd *cmd;
	struct scsi_driver *drv;
	int ret;

	/*
	 * Filesystem requests must transfer data.
	 */
	BUG_ON(!req->nr_phys_segments);

	cmd = scsi_get_cmd_from_req(sdev, req);
	if (unlikely(!cmd))
		return BLKPREP_DEFER;

	ret = scsi_init_io(cmd);
	if (unlikely(ret))
		return ret;

	/*
	 * Initialize the actual SCSI command for this request.
	 */
	drv = *(struct scsi_driver **)req->rq_disk->private_data;
	if (unlikely(!drv->init_command(cmd))) {
		scsi_release_buffers(cmd);
		scsi_put_command(cmd);
		return BLKPREP_KILL;
	}

	return BLKPREP_OK;
}

/*
 * Function:    scsi_init_io()
 *
 * Purpose:     SCSI I/O initialize function.  Allocates a scatter-gather
 *              table and maps the request's pages into it via
 *              blk_rq_map_sg().
 *
 * Arguments:   cmd   - Command descriptor we wish to initialize
 *
 * Returns:     0 on success
 *              BLKPREP_DEFER if the failure is retryable
 *              BLKPREP_KILL if the failure is fatal
 */
static int scsi_init_io(struct scsi_cmnd *cmd)
{
	struct request *req = cmd->request;
	struct scatterlist *sgpnt;
	int count;

	/*
	 * We used to not use scatter-gather for single segment request,
	 * but now we do (it makes highmem I/O easier to support without
	 * kmapping pages)
	 */
	cmd->use_sg = req->nr_phys_segments;

	/*
	 * If sg table allocation fails, requeue request later.
	 */
	sgpnt = scsi_alloc_sgtable(cmd, GFP_ATOMIC);
	if (unlikely(!sgpnt)) {
		scsi_unprep_request(req);
		return BLKPREP_DEFER;
	}

	req->buffer = NULL;
	cmd->request_buffer = (char *) sgpnt;
	if (blk_pc_request(req))
		cmd->request_bufflen = req->data_len;
	else
		cmd->request_bufflen = req->nr_sectors << 9;

	/*
	 * Next, walk the list, and fill in the addresses and sizes of
	 * each segment.
	 */
	count = blk_rq_map_sg(req->q, req, cmd->request_buffer);
	if (likely(count <= cmd->use_sg)) {
		/* number of scatterlist entries actually produced */
		cmd->use_sg = count;
		return BLKPREP_OK;
	}

	printk(KERN_ERR "Incorrect number of segments after building list\n");
	printk(KERN_ERR "counted %d, received %d\n", count, cmd->use_sg);
	printk(KERN_ERR "req nr_sec %lu, cur_nr_sec %u\n", req->nr_sectors,
	       req->current_nr_sectors);

	/* release the command and kill it */
	scsi_release_buffers(cmd);
	scsi_put_command(cmd);
	return BLKPREP_KILL;
}
将请求的bio里面的page和scatter list里的page关联起来
/*
 * Walk every bio_vec of every bio in the request and build a scatterlist,
 * merging physically contiguous segments when the queue allows clustering.
 * A request always carries at least one bio_vec, so the return value is
 * positive, and the number of scatterlist entries is never more than the
 * request's physical segment count.
 */
int blk_rq_map_sg(struct request_queue *q, struct request *rq,
		  struct scatterlist *sg)
{
	struct bio_vec *bvec, *bvprv;
	struct bio *bio;
	int nsegs, i, cluster;

	nsegs = 0;
	cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);

	/*
	 * for each bio in rq
	 */
	bvprv = NULL;
	rq_for_each_bio(bio, rq) {
		/*
		 * for each segment in bio
		 */
		bio_for_each_segment(bvec, bio, i) {
			int nbytes = bvec->bv_len;

			if (bvprv && cluster) {
				/* try to extend the previous sg entry;
				 * bail to new_segment on any limit */
				if (sg[nsegs - 1].length + nbytes > q->max_segment_size)
					goto new_segment;
				if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
					goto new_segment;
				if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
					goto new_segment;

				sg[nsegs - 1].length += nbytes;
			} else {
new_segment:
				/* start a fresh scatterlist entry pointing
				 * at this bio_vec's page */
				memset(&sg[nsegs],0,sizeof(struct scatterlist));
				sg[nsegs].page = bvec->bv_page;
				sg[nsegs].length = nbytes;
				sg[nsegs].offset = bvec->bv_offset;

				nsegs++;
			}
			bvprv = bvec;
		} /* segments in bio */
	} /* bios in rq */

	return nsegs;
}
到此,将文件系统里的具体page和scatter list 的page关联起来了
/*
 * Translate a SCSI command into an ATA queued command: attach the SCSI
 * midlayer's scatterlist for DMA, run the xlat function to build the
 * taskfile, then issue the command to the hardware.
 */
static int ata_scsi_translate(struct ata_device *dev, struct scsi_cmnd *cmd,
			      void (*done)(struct scsi_cmnd *),
			      ata_xlat_func_t xlat_func)
{
	struct ata_queued_cmd *qc;
	int is_io = xlat_func == ata_scsi_rw_xlat;

	VPRINTK("ENTER\n");

	if (unlikely(ata_scmd_need_defer(dev, is_io)))
		goto defer;

	qc = ata_scsi_qc_new(dev, cmd, done);
	if (!qc)
		goto err_mem;

	/* data is present; dma-map it */
	if (cmd->sc_data_direction == DMA_FROM_DEVICE ||
	    cmd->sc_data_direction == DMA_TO_DEVICE) {
		if (unlikely(cmd->request_bufflen < 1)) {
			ata_dev_printk(dev, KERN_WARNING,
				       "WARNING: zero len r/w req\n");
			goto err_did;
		}

		/* reuse the scatterlist built by the SCSI layer directly */
		if (cmd->use_sg)
			ata_sg_init(qc, cmd->request_buffer, cmd->use_sg);
		else
			ata_sg_init_one(qc, cmd->request_buffer,
					cmd->request_bufflen);

		qc->dma_dir = cmd->sc_data_direction;
	}

	qc->complete_fn = ata_scsi_qc_complete;

	if (xlat_func(qc))
		goto early_finish;

	/* select device, send command to hardware */
	ata_qc_issue(qc);

	VPRINTK("EXIT\n");
	return 0;

early_finish:
	ata_qc_free(qc);
	qc->scsidone(cmd);
	DPRINTK("EXIT - early finish (good or error)\n");
	return 0;

err_did:
	ata_qc_free(qc);
	cmd->result = (DID_ERROR << 16);
	qc->scsidone(cmd);
err_mem:
	DPRINTK("EXIT - internal\n");
	return 0;

defer:
	DPRINTK("EXIT - defer\n");
	return SCSI_MLQUEUE_DEVICE_BUSY;
}

/* Attach a scatterlist to a queued command (no DMA mapping done yet). */
void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
		 unsigned int n_elem)
{
	qc->flags |= ATA_QCFLAG_SG;
	qc->__sg = sg;
	qc->n_elem = n_elem;
	qc->orig_n_elem = n_elem;
}

/*
 * Issue a queued command: mark it active, DMA-map its buffers
 * (ata_sg_setup for scatterlists), let the port driver prepare the
 * PRD table (->qc_prep, e.g. ata_fill_sg below) and start the command.
 */
void ata_qc_issue(struct ata_queued_cmd *qc)
{
	struct ata_port *ap = qc->ap;

	/* Make sure only one non-NCQ command is outstanding. The
	 * check is skipped for old EH because it reuses active qc to
	 * request ATAPI sense.
	 */
	WARN_ON(ap->ops->error_handler && ata_tag_valid(ap->active_tag));

	if (qc->tf.protocol == ATA_PROT_NCQ) {
		WARN_ON(ap->sactive & (1 << qc->tag));
		ap->sactive |= 1 << qc->tag;
	} else {
		WARN_ON(ap->sactive);
		ap->active_tag = qc->tag;
	}

	qc->flags |= ATA_QCFLAG_ACTIVE;
	ap->qc_active |= 1 << qc->tag;

	if (ata_should_dma_map(qc)) {
		if (qc->flags & ATA_QCFLAG_SG) {
			if (ata_sg_setup(qc))
				goto sg_err;
		} else if (qc->flags & ATA_QCFLAG_SINGLE) {
			if (ata_sg_setup_one(qc))
				goto sg_err;
		}
	} else {
		qc->flags &= ~ATA_QCFLAG_DMAMAP;
	}

	ap->ops->qc_prep(qc);

	qc->err_mask |= ap->ops->qc_issue(qc);
	if (unlikely(qc->err_mask))
		goto err;
	return;

sg_err:
	qc->flags &= ~ATA_QCFLAG_DMAMAP;
	qc->err_mask |= AC_ERR_SYSTEM;
err:
	ata_qc_complete(qc);
}

/*
 * DMA-map the command's scatterlist.  If the last entry does not end on
 * a 32-bit boundary, trim it and route the remaining bytes through the
 * per-port pad buffer allocated in ata_port_start().
 */
static int ata_sg_setup(struct ata_queued_cmd *qc)
{
	struct ata_port *ap = qc->ap;
	struct scatterlist *sg = qc->__sg;
	/* last scatterlist entry */
	struct scatterlist *lsg = &sg[qc->n_elem - 1];
	int n_elem, pre_n_elem, dir, trim_sg = 0;

	VPRINTK("ENTER, ata%u\n", ap->print_id);
	WARN_ON(!(qc->flags & ATA_QCFLAG_SG));

	/* we must lengthen transfers to end on a 32-bit boundary */
	qc->pad_len = lsg->length & 3;
	if (qc->pad_len) {
		void *pad_buf = ap->pad + (qc->tag * ATA_DMA_PAD_SZ);
		struct scatterlist *psg = &qc->pad_sgent;
		unsigned int offset;

		WARN_ON(qc->dev->class != ATA_DEV_ATAPI);

		memset(pad_buf, 0, ATA_DMA_PAD_SZ);

		/*
		 * psg->page/offset are used to copy to-be-written
		 * data in this function or read data in ata_sg_clean.
		 */
		offset = lsg->offset + lsg->length - qc->pad_len;
		psg->page = nth_page(lsg->page, offset >> PAGE_SHIFT);
		psg->offset = offset_in_page(offset);

		if (qc->tf.flags & ATA_TFLAG_WRITE) {
			void *addr = kmap_atomic(psg->page, KM_IRQ0);
			memcpy(pad_buf, addr + psg->offset, qc->pad_len);
			kunmap_atomic(addr, KM_IRQ0);
		}

		sg_dma_address(psg) = ap->pad_dma + (qc->tag * ATA_DMA_PAD_SZ);
		sg_dma_len(psg) = ATA_DMA_PAD_SZ;
		/* trim last sg */
		lsg->length -= qc->pad_len;
		if (lsg->length == 0)
			trim_sg = 1;

		DPRINTK("padding done, sg[%d].length=%u pad_len=%u\n",
			qc->n_elem - 1, lsg->length, qc->pad_len);
	}

	pre_n_elem = qc->n_elem;
	if (trim_sg && pre_n_elem)
		pre_n_elem--;

	if (!pre_n_elem) {
		n_elem = 0;
		goto skip_map;
	}

	dir = qc->dma_dir;
	n_elem = dma_map_sg(ap->dev, sg, pre_n_elem, dir);
	if (n_elem < 1) {
		/* restore last sg */
		lsg->length += qc->pad_len;
		return -1;
	}

	DPRINTK("%d sg elements mapped\n", n_elem);

skip_map:
	qc->n_elem = n_elem;

	return 0;
}

/*
 * Fill the port's PRD table from the DMA-mapped scatterlist, splitting
 * any segment whose physical address range would cross a 64K boundary
 * (a PRD entry may describe at most 64K and must not span one).
 */
static void ata_fill_sg(struct ata_queued_cmd *qc)
{
	struct ata_port *ap = qc->ap;
	struct scatterlist *sg;
	unsigned int idx;

	WARN_ON(qc->__sg == NULL);
	WARN_ON(qc->n_elem == 0 && qc->pad_len == 0);

	idx = 0;
	ata_for_each_sg(sg, qc) {
		u32 addr, offset;
		u32 sg_len, len;

		/* determine if physical DMA addr spans 64K boundary.
		 * Note h/w doesn't support 64-bit, so we unconditionally
		 * truncate dma_addr_t to u32.
		 */
		addr = (u32) sg_dma_address(sg);
		sg_len = sg_dma_len(sg);

		while (sg_len) {
			offset = addr & 0xffff;
			len = sg_len;
			if ((offset + sg_len) > 0x10000)
				len = 0x10000 - offset;

			/* fill one PRD entry: bus address + byte count */
			ap->prd[idx].addr = cpu_to_le32(addr);
			ap->prd[idx].flags_len = cpu_to_le32(len & 0xffff);
			VPRINTK("PRD[%u] = (0x%X, 0x%X)\n", idx, addr, len);

			idx++;
			sg_len -= len;
			addr += len;
		}
	}

	/* set the physical region descriptor table's end-of-table flag */
	if (idx)
		ap->prd[idx - 1].flags_len |= cpu_to_le32(ATA_PRD_EOT);
}
总结:不同厂商的sata控制器对PRD的实现不同,但是总的来说都是将相应的物理页以及物理页内数据长度填充到PRD里面。