* [PATCH] libata: eliminate the home grown dma padding in favour of that provided by the block layer
@ 2007-12-31 21:56 James Bottomley
2007-12-31 22:56 ` Jeff Garzik
` (3 more replies)
0 siblings, 4 replies; 33+ messages in thread
From: James Bottomley @ 2007-12-31 21:56 UTC (permalink / raw)
To: linux-scsi, linux-ide; +Cc: Jens Axboe, FUJITA Tomonori, Jeff Garzik
ATA requires that all DMA transfers begin and end on word boundaries.
Because of this, a large amount of machinery grew up in ide to adjust
scatterlists on this basis. However, as of 2.5, the block layer has a
dma_alignment variable which ensures both the beginning and length of a
DMA transfer are aligned on the dma_alignment boundary. Although the
block layer does adjust the beginning of the transfer to ensure this
happens, it doesn't actually adjust the length, it merely makes sure
that space is allocated for transfers beyond the declared length. The
upshot of this is that scatterlists may be padded to any size between
the actual length and the length adjusted to the dma_alignment safely
knowing that memory is allocated in this region.
Right at the moment, SCSI takes the default dma_alignment which is on a
512 byte boundary. Note that this alignment only applies to transfers
coming in from user space. However, since all kernel allocations are
automatically aligned on a minimum of 32 byte boundaries, it is safe to
adjust them in this manner as well.
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
This also fixes a current panic in libsas with SATAPI devices. I was
trying to trace a bad interaction between the libata padding and the new
sglist code which was the root cause, and ended up concluding that the
whole libata padding thing was redundant.
In a future improvement, I think we can relax SCSI dma_alignment (it's
causing almost every user space command to be copied instead of directly
passed through) and allow devices and transports to build up their own
requirements instead.
James
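For anyone checking the arithmetic: the expression the patch adds in
ata_sg_setup_one()/ata_sg_setup() is just a power-of-two round-up of the
transfer length into the slack the block layer guarantees. A minimal
userspace sketch of the idea (illustrative only, not part of the patch):

#include <assert.h>

/* mirror of the constants the patch introduces in libata.h */
enum { ATA_DMA_PAD_SZ = 4, ATA_DMA_PAD_MASK = ATA_DMA_PAD_SZ - 1 };

/* round a transfer length up to the next pad boundary */
static unsigned int pad_round_up(unsigned int len)
{
	/* equivalent to the patch's
	 * len += ATA_DMA_PAD_MASK -
	 *        ((ATA_DMA_PAD_MASK + len) & ATA_DMA_PAD_MASK);
	 */
	return (len + ATA_DMA_PAD_MASK) & ~ATA_DMA_PAD_MASK;
}

int main(void)
{
	assert(pad_round_up(512) == 512);	/* already aligned: unchanged */
	assert(pad_round_up(510) == 512);	/* lands in block-layer slack */
	return 0;
}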
drivers/ata/ahci.c | 5 -
drivers/ata/libata-core.c | 126 +++---------------------------------------
drivers/ata/libata-scsi.c | 18 +-----
drivers/ata/pata_icside.c | 8 --
drivers/ata/sata_fsl.c | 17 -----
drivers/ata/sata_mv.c | 5 -
drivers/ata/sata_nv.c | 2 +-
drivers/ata/sata_promise.c | 2 +-
drivers/ata/sata_qstor.c | 2 +-
drivers/ata/sata_sil24.c | 5 -
drivers/scsi/ipr.c | 4 +-
drivers/scsi/libsas/sas_ata.c | 4 +-
include/linux/libata.h | 35 -----------
13 files changed, 25 insertions(+), 208 deletions(-)
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 54f38c2..4dd318d 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1965,16 +1965,11 @@ static int ahci_port_start(struct ata_port *ap)
struct ahci_port_priv *pp;
void *mem;
dma_addr_t mem_dma;
- int rc;
pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
if (!pp)
return -ENOMEM;
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
mem = dmam_alloc_coherent(dev, AHCI_PORT_PRIV_DMA_SZ, &mem_dma,
GFP_KERNEL);
if (!mem)
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 4753a18..97db82f 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -60,6 +60,8 @@
#include <linux/io.h>
#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_dbg.h>
#include <scsi/scsi_host.h>
#include <linux/libata.h>
#include <asm/semaphore.h>
@@ -4464,7 +4466,6 @@ void ata_sg_clean(struct ata_queued_cmd *qc)
struct ata_port *ap = qc->ap;
struct scatterlist *sg = qc->__sg;
int dir = qc->dma_dir;
- void *pad_buf = NULL;
WARN_ON(!(qc->flags & ATA_QCFLAG_DMAMAP));
WARN_ON(sg == NULL);
@@ -4474,34 +4475,14 @@ void ata_sg_clean(struct ata_queued_cmd *qc)
VPRINTK("unmapping %u sg elements\n", qc->n_elem);
- /* if we padded the buffer out to 32-bit bound, and data
- * xfer direction is from-device, we must copy from the
- * pad buffer back into the supplied buffer
- */
- if (qc->pad_len && !(qc->tf.flags & ATA_TFLAG_WRITE))
- pad_buf = ap->pad + (qc->tag * ATA_DMA_PAD_SZ);
-
if (qc->flags & ATA_QCFLAG_SG) {
if (qc->n_elem)
dma_unmap_sg(ap->dev, sg, qc->n_elem, dir);
- /* restore last sg */
- sg_last(sg, qc->orig_n_elem)->length += qc->pad_len;
- if (pad_buf) {
- struct scatterlist *psg = &qc->pad_sgent;
- void *addr = kmap_atomic(sg_page(psg), KM_IRQ0);
- memcpy(addr + psg->offset, pad_buf, qc->pad_len);
- kunmap_atomic(addr, KM_IRQ0);
- }
} else {
if (qc->n_elem)
dma_unmap_single(ap->dev,
sg_dma_address(&sg[0]), sg_dma_len(&sg[0]),
dir);
- /* restore sg */
- sg->length += qc->pad_len;
- if (pad_buf)
- memcpy(qc->buf_virt + sg->length - qc->pad_len,
- pad_buf, qc->pad_len);
}
qc->flags &= ~ATA_QCFLAG_DMAMAP;
@@ -4526,7 +4507,7 @@ static void ata_fill_sg(struct ata_queued_cmd *qc)
unsigned int idx;
WARN_ON(qc->__sg == NULL);
- WARN_ON(qc->n_elem == 0 && qc->pad_len == 0);
+ WARN_ON(qc->n_elem == 0);
idx = 0;
ata_for_each_sg(sg, qc) {
@@ -4580,7 +4561,7 @@ static void ata_fill_sg_dumb(struct ata_queued_cmd *qc)
unsigned int idx;
WARN_ON(qc->__sg == NULL);
- WARN_ON(qc->n_elem == 0 && qc->pad_len == 0);
+ WARN_ON(qc->n_elem == 0);
idx = 0;
ata_for_each_sg(sg, qc) {
@@ -4774,7 +4755,6 @@ void ata_sg_init_one(struct ata_queued_cmd *qc, void *buf, unsigned int buflen)
qc->__sg = &qc->sgent;
qc->n_elem = 1;
- qc->orig_n_elem = 1;
qc->buf_virt = buf;
qc->nbytes = buflen;
qc->cursg = qc->__sg;
@@ -4802,7 +4782,6 @@ void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
qc->flags |= ATA_QCFLAG_SG;
qc->__sg = sg;
qc->n_elem = n_elem;
- qc->orig_n_elem = n_elem;
qc->cursg = qc->__sg;
}
@@ -4825,50 +4804,18 @@ static int ata_sg_setup_one(struct ata_queued_cmd *qc)
int dir = qc->dma_dir;
struct scatterlist *sg = qc->__sg;
dma_addr_t dma_address;
- int trim_sg = 0;
/* we must lengthen transfers to end on a 32-bit boundary */
- qc->pad_len = sg->length & 3;
- if (qc->pad_len) {
- void *pad_buf = ap->pad + (qc->tag * ATA_DMA_PAD_SZ);
- struct scatterlist *psg = &qc->pad_sgent;
-
- WARN_ON(qc->dev->class != ATA_DEV_ATAPI);
-
- memset(pad_buf, 0, ATA_DMA_PAD_SZ);
-
- if (qc->tf.flags & ATA_TFLAG_WRITE)
- memcpy(pad_buf, qc->buf_virt + sg->length - qc->pad_len,
- qc->pad_len);
-
- sg_dma_address(psg) = ap->pad_dma + (qc->tag * ATA_DMA_PAD_SZ);
- sg_dma_len(psg) = ATA_DMA_PAD_SZ;
- /* trim sg */
- sg->length -= qc->pad_len;
- if (sg->length == 0)
- trim_sg = 1;
-
- DPRINTK("padding done, sg->length=%u pad_len=%u\n",
- sg->length, qc->pad_len);
- }
-
- if (trim_sg) {
- qc->n_elem--;
- goto skip_map;
- }
-
+ sg->length += ATA_DMA_PAD_MASK -
+ ((ATA_DMA_PAD_MASK + sg->length) & ATA_DMA_PAD_MASK);
dma_address = dma_map_single(ap->dev, qc->buf_virt,
sg->length, dir);
- if (dma_mapping_error(dma_address)) {
- /* restore sg */
- sg->length += qc->pad_len;
+ if (dma_mapping_error(dma_address))
return -1;
- }
sg_dma_address(sg) = dma_address;
sg_dma_len(sg) = sg->length;
-skip_map:
DPRINTK("mapped buffer of %d bytes for %s\n", sg_dma_len(sg),
qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read");
@@ -4893,69 +4840,23 @@ static int ata_sg_setup(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
struct scatterlist *sg = qc->__sg;
- struct scatterlist *lsg = sg_last(qc->__sg, qc->n_elem);
- int n_elem, pre_n_elem, dir, trim_sg = 0;
+ int n_elem, pre_n_elem, dir;
VPRINTK("ENTER, ata%u\n", ap->print_id);
WARN_ON(!(qc->flags & ATA_QCFLAG_SG));
/* we must lengthen transfers to end on a 32-bit boundary */
- qc->pad_len = lsg->length & 3;
- if (qc->pad_len) {
- void *pad_buf = ap->pad + (qc->tag * ATA_DMA_PAD_SZ);
- struct scatterlist *psg = &qc->pad_sgent;
- unsigned int offset;
-
- WARN_ON(qc->dev->class != ATA_DEV_ATAPI);
-
- memset(pad_buf, 0, ATA_DMA_PAD_SZ);
-
- /*
- * psg->page/offset are used to copy to-be-written
- * data in this function or read data in ata_sg_clean.
- */
- offset = lsg->offset + lsg->length - qc->pad_len;
- sg_init_table(psg, 1);
- sg_set_page(psg, nth_page(sg_page(lsg), offset >> PAGE_SHIFT),
- qc->pad_len, offset_in_page(offset));
-
- if (qc->tf.flags & ATA_TFLAG_WRITE) {
- void *addr = kmap_atomic(sg_page(psg), KM_IRQ0);
- memcpy(pad_buf, addr + psg->offset, qc->pad_len);
- kunmap_atomic(addr, KM_IRQ0);
- }
-
- sg_dma_address(psg) = ap->pad_dma + (qc->tag * ATA_DMA_PAD_SZ);
- sg_dma_len(psg) = ATA_DMA_PAD_SZ;
- /* trim last sg */
- lsg->length -= qc->pad_len;
- if (lsg->length == 0)
- trim_sg = 1;
-
- DPRINTK("padding done, sg[%d].length=%u pad_len=%u\n",
- qc->n_elem - 1, lsg->length, qc->pad_len);
- }
-
+ sg->length += ATA_DMA_PAD_MASK -
+ ((ATA_DMA_PAD_MASK + sg->length) & ATA_DMA_PAD_MASK);
pre_n_elem = qc->n_elem;
- if (trim_sg && pre_n_elem)
- pre_n_elem--;
-
- if (!pre_n_elem) {
- n_elem = 0;
- goto skip_map;
- }
dir = qc->dma_dir;
n_elem = dma_map_sg(ap->dev, sg, pre_n_elem, dir);
- if (n_elem < 1) {
- /* restore last sg */
- lsg->length += qc->pad_len;
+ if (n_elem < 1)
return -1;
- }
DPRINTK("%d sg elements mapped\n", n_elem);
-skip_map:
qc->n_elem = n_elem;
return 0;
@@ -6617,17 +6518,12 @@ void ata_host_resume(struct ata_host *host)
int ata_port_start(struct ata_port *ap)
{
struct device *dev = ap->dev;
- int rc;
ap->prd = dmam_alloc_coherent(dev, ATA_PRD_TBL_SZ, &ap->prd_dma,
GFP_KERNEL);
if (!ap->prd)
return -ENOMEM;
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
DPRINTK("prd alloc, virt %p, dma %llx\n", ap->prd,
(unsigned long long)ap->prd_dma);
return 0;
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index a883bb0..571fba5 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -832,15 +832,6 @@ static void ata_scsi_dev_config(struct scsi_device *sdev,
/* configure max sectors */
blk_queue_max_sectors(sdev->request_queue, dev->max_sectors);
- /* SATA DMA transfers must be multiples of 4 byte, so
- * we need to pad ATAPI transfers using an extra sg.
- * Decrement max hw segments accordingly.
- */
- if (dev->class == ATA_DEV_ATAPI) {
- struct request_queue *q = sdev->request_queue;
- blk_queue_max_hw_segments(q, q->max_hw_segments - 1);
- }
-
if (dev->flags & ATA_DFLAG_AN)
set_bit(SDEV_EVT_MEDIA_CHANGE, sdev->supported_events);
@@ -3527,7 +3518,7 @@ EXPORT_SYMBOL_GPL(ata_sas_port_alloc);
* @ap: Port to initialize
*
* Called just after data structures for each port are
- * initialized. Allocates DMA pad.
+ * initialized.
*
* May be used as the port_start() entry in ata_port_operations.
*
@@ -3536,7 +3527,7 @@ EXPORT_SYMBOL_GPL(ata_sas_port_alloc);
*/
int ata_sas_port_start(struct ata_port *ap)
{
- return ata_pad_alloc(ap, ap->dev);
+ return 0;
}
EXPORT_SYMBOL_GPL(ata_sas_port_start);
@@ -3544,8 +3535,6 @@ EXPORT_SYMBOL_GPL(ata_sas_port_start);
* ata_port_stop - Undo ata_sas_port_start()
* @ap: Port to shut down
*
- * Frees the DMA pad.
- *
* May be used as the port_stop() entry in ata_port_operations.
*
* LOCKING:
@@ -3554,7 +3543,6 @@ EXPORT_SYMBOL_GPL(ata_sas_port_start);
void ata_sas_port_stop(struct ata_port *ap)
{
- ata_pad_free(ap, ap->dev);
}
EXPORT_SYMBOL_GPL(ata_sas_port_stop);
@@ -3603,7 +3591,7 @@ EXPORT_SYMBOL_GPL(ata_sas_port_destroy);
*
* RETURNS:
* Zero.
- */
+ */
int ata_sas_slave_configure(struct scsi_device *sdev, struct ata_port *ap)
{
diff --git a/drivers/ata/pata_icside.c b/drivers/ata/pata_icside.c
index 842fe08..d53809c 100644
--- a/drivers/ata/pata_icside.c
+++ b/drivers/ata/pata_icside.c
@@ -303,12 +303,6 @@ static int icside_dma_init(struct pata_icside_info *info)
}
-static int pata_icside_port_start(struct ata_port *ap)
-{
- /* No PRD to alloc */
- return ata_pad_alloc(ap, ap->dev);
-}
-
static struct scsi_host_template pata_icside_sht = {
.module = THIS_MODULE,
.name = DRV_NAME,
@@ -388,8 +382,6 @@ static struct ata_port_operations pata_icside_port_ops = {
.irq_clear = ata_dummy_noret,
.irq_on = ata_irq_on,
- .port_start = pata_icside_port_start,
-
.bmdma_stop = pata_icside_bmdma_stop,
.bmdma_status = pata_icside_bmdma_status,
};
diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c
index d015b4a..7fdab46 100644
--- a/drivers/ata/sata_fsl.c
+++ b/drivers/ata/sata_fsl.c
@@ -354,8 +354,8 @@ static unsigned int sata_fsl_fill_sg(struct ata_queued_cmd *qc, void *cmd_desc,
ata_port_printk(qc->ap, KERN_ERR,
"s/g len unaligned : 0x%x\n", sg_len);
- if ((num_prde == (SATA_FSL_MAX_PRD_DIRECT - 1)) &&
- (qc->n_iter + 1 != qc->n_elem)) {
+ if (num_prde == (SATA_FSL_MAX_PRD_DIRECT - 1) &&
+ sg_next(sg) != NULL) {
VPRINTK("setting indirect prde\n");
prd_ptr_to_indirect_ext = prd;
prd->dba = cpu_to_le32(indirect_ext_segment_paddr);
@@ -600,21 +600,9 @@ static int sata_fsl_port_start(struct ata_port *ap)
if (!pp)
return -ENOMEM;
- /*
- * allocate per command dma alignment pad buffer, which is used
- * internally by libATA to ensure that all transfers ending on
- * unaligned boundaries are padded, to align on Dword boundaries
- */
- retval = ata_pad_alloc(ap, dev);
- if (retval) {
- kfree(pp);
- return retval;
- }
-
mem = dma_alloc_coherent(dev, SATA_FSL_PORT_PRIV_DMA_SZ, &mem_dma,
GFP_KERNEL);
if (!mem) {
- ata_pad_free(ap, dev);
kfree(pp);
return -ENOMEM;
}
@@ -693,7 +681,6 @@ static void sata_fsl_port_stop(struct ata_port *ap)
dma_free_coherent(dev, SATA_FSL_PORT_PRIV_DMA_SZ,
pp->cmdslot, pp->cmdslot_paddr);
- ata_pad_free(ap, dev);
kfree(pp);
}
diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
index 37b850a..94bf504 100644
--- a/drivers/ata/sata_mv.c
+++ b/drivers/ata/sata_mv.c
@@ -1054,7 +1054,6 @@ static int mv_port_start(struct ata_port *ap)
void *mem;
dma_addr_t mem_dma;
unsigned long flags;
- int rc;
pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
if (!pp)
@@ -1066,10 +1065,6 @@ static int mv_port_start(struct ata_port *ap)
return -ENOMEM;
memset(mem, 0, MV_PORT_PRIV_DMA_SZ);
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
/* First item in chunk of DMA memory:
* 32-slot command request table (CRQB), 32 bytes each in size
*/
diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index ed5dc7c..7eaacc7 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -2000,7 +2000,7 @@ static void nv_swncq_fill_sg(struct ata_queued_cmd *qc)
struct ata_prd *prd;
WARN_ON(qc->__sg == NULL);
- WARN_ON(qc->n_elem == 0 && qc->pad_len == 0);
+ WARN_ON(qc->n_elem == 0);
prd = pp->prd + ATA_MAX_PRD * qc->tag;
diff --git a/drivers/ata/sata_promise.c b/drivers/ata/sata_promise.c
index 7914def..242949c 100644
--- a/drivers/ata/sata_promise.c
+++ b/drivers/ata/sata_promise.c
@@ -548,7 +548,7 @@ static void pdc_fill_sg(struct ata_queued_cmd *qc)
return;
WARN_ON(qc->__sg == NULL);
- WARN_ON(qc->n_elem == 0 && qc->pad_len == 0);
+ WARN_ON(qc->n_elem == 0);
idx = 0;
ata_for_each_sg(sg, qc) {
diff --git a/drivers/ata/sata_qstor.c b/drivers/ata/sata_qstor.c
index 2f1de6e..258c482 100644
--- a/drivers/ata/sata_qstor.c
+++ b/drivers/ata/sata_qstor.c
@@ -291,7 +291,7 @@ static unsigned int qs_fill_sg(struct ata_queued_cmd *qc)
u8 *prd = pp->pkt + QS_CPB_BYTES;
WARN_ON(qc->__sg == NULL);
- WARN_ON(qc->n_elem == 0 && qc->pad_len == 0);
+ WARN_ON(qc->n_elem == 0);
nelem = 0;
ata_for_each_sg(sg, qc) {
diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index 96fd526..228c36c 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -1232,7 +1232,6 @@ static int sil24_port_start(struct ata_port *ap)
union sil24_cmd_block *cb;
size_t cb_size = sizeof(*cb) * SIL24_MAX_CMDS;
dma_addr_t cb_dma;
- int rc;
pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
if (!pp)
@@ -1245,10 +1244,6 @@ static int sil24_port_start(struct ata_port *ap)
return -ENOMEM;
memset(cb, 0, cb_size);
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
pp->cmd_block = cb;
pp->cmd_block_dma = cb_dma;
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index 9018ee8..987f44a 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -5140,7 +5140,7 @@ static void ipr_build_ata_ioadl(struct ipr_cmnd *ipr_cmd,
struct ipr_ioarcb *ioarcb = &ipr_cmd->ioarcb;
struct ipr_ioadl_desc *ioadl = ipr_cmd->ioadl;
struct ipr_ioadl_desc *last_ioadl = NULL;
- int len = qc->nbytes + qc->pad_len;
+ int len = qc->nbytes;
struct scatterlist *sg;
if (len == 0)
@@ -5205,7 +5205,7 @@ static unsigned int ipr_qc_issue(struct ata_queued_cmd *qc)
ioarcb->cmd_pkt.request_type = IPR_RQTYPE_ATA_PASSTHRU;
ioarcb->cmd_pkt.flags_hi |= IPR_FLAGS_HI_NO_LINK_DESC;
ioarcb->cmd_pkt.flags_hi |= IPR_FLAGS_HI_NO_ULEN_CHK;
- ipr_cmd->dma_use_sg = qc->pad_len ? qc->n_elem + 1 : qc->n_elem;
+ ipr_cmd->dma_use_sg = qc->n_elem;
ipr_build_ata_ioadl(ipr_cmd, qc);
regs->flags |= IPR_ATA_FLAG_STATUS_ON_GOOD_COMPLETION;
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 0829b55..78bd2f3 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -178,8 +178,8 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
task->uldd_task = qc;
if (is_atapi_taskfile(&qc->tf)) {
memcpy(task->ata_task.atapi_packet, qc->cdb, qc->dev->cdb_len);
- task->total_xfer_len = qc->nbytes + qc->pad_len;
- task->num_scatter = qc->pad_len ? qc->n_elem + 1 : qc->n_elem;
+ task->total_xfer_len = qc->nbytes;
+ task->num_scatter = qc->n_elem;
} else {
ata_for_each_sg(sg, qc) {
num++;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 124033c..2f40d57 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -282,7 +282,7 @@ enum {
/* size of buffer to pad xfers ending on unaligned boundaries */
ATA_DMA_PAD_SZ = 4,
- ATA_DMA_PAD_BUF_SZ = ATA_DMA_PAD_SZ * ATA_MAX_QUEUE,
+ ATA_DMA_PAD_MASK = ATA_DMA_PAD_SZ - 1,
/* ering size */
ATA_ERING_SIZE = 32,
@@ -446,12 +446,9 @@ struct ata_queued_cmd {
unsigned long flags; /* ATA_QCFLAG_xxx */
unsigned int tag;
unsigned int n_elem;
- unsigned int n_iter;
- unsigned int orig_n_elem;
int dma_dir;
- unsigned int pad_len;
unsigned int sect_size;
unsigned int nbytes;
@@ -461,7 +458,6 @@ struct ata_queued_cmd {
unsigned int cursg_ofs;
struct scatterlist sgent;
- struct scatterlist pad_sgent;
void *buf_virt;
/* DO NOT iterate over __sg manually, use ata_for_each_sg() */
@@ -606,9 +602,6 @@ struct ata_port {
struct ata_prd *prd; /* our SG list */
dma_addr_t prd_dma; /* and its DMA mapping */
- void *pad; /* array of DMA pad buffers */
- dma_addr_t pad_dma;
-
struct ata_ioports ioaddr; /* ATA cmd/ctl/dma register blocks */
u8 ctl; /* cache of ATA control register */
@@ -1080,24 +1073,15 @@ extern void ata_port_pbar_desc(struct ata_port *ap, int bar, ssize_t offset,
static inline struct scatterlist *
ata_qc_first_sg(struct ata_queued_cmd *qc)
{
- qc->n_iter = 0;
if (qc->n_elem)
return qc->__sg;
- if (qc->pad_len)
- return &qc->pad_sgent;
return NULL;
}
static inline struct scatterlist *
ata_qc_next_sg(struct scatterlist *sg, struct ata_queued_cmd *qc)
{
- if (sg == &qc->pad_sgent)
- return NULL;
- if (++qc->n_iter < qc->n_elem)
- return sg_next(sg);
- if (qc->pad_len)
- return &qc->pad_sgent;
- return NULL;
+ return sg_next(sg);
}
#define ata_for_each_sg(sg, qc) \
@@ -1343,9 +1327,7 @@ static inline void ata_qc_reinit(struct ata_queued_cmd *qc)
qc->cursg_ofs = 0;
qc->nbytes = qc->curbytes = 0;
qc->n_elem = 0;
- qc->n_iter = 0;
qc->err_mask = 0;
- qc->pad_len = 0;
qc->sect_size = ATA_SECT_SIZE;
ata_tf_init(qc->dev, &qc->tf);
@@ -1379,19 +1361,6 @@ static inline unsigned int __ac_err_mask(u8 status)
return mask;
}
-static inline int ata_pad_alloc(struct ata_port *ap, struct device *dev)
-{
- ap->pad_dma = 0;
- ap->pad = dmam_alloc_coherent(dev, ATA_DMA_PAD_BUF_SZ,
- &ap->pad_dma, GFP_KERNEL);
- return (ap->pad == NULL) ? -ENOMEM : 0;
-}
-
-static inline void ata_pad_free(struct ata_port *ap, struct device *dev)
-{
- dmam_free_coherent(dev, ATA_DMA_PAD_BUF_SZ, ap->pad, ap->pad_dma);
-}
-
static inline struct ata_port *ata_shost_to_port(struct Scsi_Host *host)
{
return *(struct ata_port **)&host->hostdata[0];
* Re: [PATCH] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2007-12-31 21:56 [PATCH] libata: eliminate the home grown dma padding in favour of that provided by the block layer James Bottomley
@ 2007-12-31 22:56 ` Jeff Garzik
2008-01-03 7:58 ` FUJITA Tomonori
` (2 subsequent siblings)
3 siblings, 0 replies; 33+ messages in thread
From: Jeff Garzik @ 2007-12-31 22:56 UTC (permalink / raw)
To: James Bottomley; +Cc: linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
James Bottomley wrote:
> ATA requires that all DMA transfers begin and end on word boundaries.
> Because of this, a large amount of machinery grew up in ide to adjust
> scatterlists on this basis. However, as of 2.5, the block layer has a
> dma_alignment variable which ensures both the beginning and length of a
> DMA transfer are aligned on the dma_alignment boundary. Although the
> block layer does adjust the beginning of the transfer to ensure this
> happens, it doesn't actually adjust the length, it merely makes sure
> that space is allocated for transfers beyond the declared length. The
> upshot of this is that scatterlists may be padded to any size between
> the actual length and the length adjusted to the dma_alignment safely
> knowing that memory is allocated in this region.
>
> Right at the moment, SCSI takes the default dma_alignment which is on a
> 512 byte boundary. Note that this alignment only applies to transfers
> coming in from user space. However, since all kernel allocations are
> automatically aligned on a minimum of 32 byte boundaries, it is safe to
> adjust them in this manner as well.
>
> Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
>
> ---
>
> This also fixes a current panic in libsas with SATAPI devices. I was
> trying to trace a bad interaction between the libata padding and the new
> sglist code which was the root cause, and ended up concluding that the
> whole libata padding thing was redundant.
>
> In a future improvement, I think we can relax SCSI dma_alignment (it's
> causing almost every user space command to be copied instead of directly
> passed through) and allow devices and transports to build up their own
> requirements instead.
Initial read: ACK, with thanks. I'll queue this into libata#upstream,
unless you have other suggestions.
This is definitely the sort of thing that low-level drivers (and their
helper libs, e.g. libata) should not need to do.
Jeff
* Re: [PATCH] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2007-12-31 21:56 [PATCH] libata: eliminate the home grown dma padding in favour of that provided by the block layer James Bottomley
2007-12-31 22:56 ` Jeff Garzik
@ 2008-01-03 7:58 ` FUJITA Tomonori
2008-01-03 15:12 ` James Bottomley
2008-01-09 2:10 ` Tejun Heo
2008-01-18 23:14 ` [PATCH RESEND] " James Bottomley
3 siblings, 1 reply; 33+ messages in thread
From: FUJITA Tomonori @ 2008-01-03 7:58 UTC (permalink / raw)
To: James.Bottomley; +Cc: linux-scsi, linux-ide, Jens.Axboe, fujita.tomonori, jeff
On Mon, 31 Dec 2007 15:56:08 -0600
James Bottomley <James.Bottomley@HansenPartnership.com> wrote:
> ATA requires that all DMA transfers begin and end on word boundaries.
> Because of this, a large amount of machinery grew up in ide to adjust
> scatterlists on this basis. However, as of 2.5, the block layer has a
> dma_alignment variable which ensures both the beginning and length of a
> DMA transfer are aligned on the dma_alignment boundary. Although the
> block layer does adjust the beginning of the transfer to ensure this
> happens, it doesn't actually adjust the length, it merely makes sure
> that space is allocated for transfers beyond the declared length. The
> upshot of this is that scatterlists may be padded to any size between
> the actual length and the length adjusted to the dma_alignment safely
> knowing that memory is allocated in this region.
Great!
> diff --git a/include/linux/libata.h b/include/linux/libata.h
> index 124033c..2f40d57 100644
> --- a/include/linux/libata.h
> +++ b/include/linux/libata.h
> @@ -282,7 +282,7 @@ enum {
>
> /* size of buffer to pad xfers ending on unaligned boundaries */
> ATA_DMA_PAD_SZ = 4,
> - ATA_DMA_PAD_BUF_SZ = ATA_DMA_PAD_SZ * ATA_MAX_QUEUE,
> + ATA_DMA_PAD_MASK = ATA_DMA_PAD_SZ - 1,
>
> /* ering size */
> ATA_ERING_SIZE = 32,
> @@ -446,12 +446,9 @@ struct ata_queued_cmd {
> unsigned long flags; /* ATA_QCFLAG_xxx */
> unsigned int tag;
> unsigned int n_elem;
> - unsigned int n_iter;
> - unsigned int orig_n_elem;
>
> int dma_dir;
>
> - unsigned int pad_len;
> unsigned int sect_size;
>
> unsigned int nbytes;
> @@ -461,7 +458,6 @@ struct ata_queued_cmd {
> unsigned int cursg_ofs;
>
> struct scatterlist sgent;
> - struct scatterlist pad_sgent;
> void *buf_virt;
>
> /* DO NOT iterate over __sg manually, use ata_for_each_sg() */
> @@ -606,9 +602,6 @@ struct ata_port {
> struct ata_prd *prd; /* our SG list */
> dma_addr_t prd_dma; /* and its DMA mapping */
>
> - void *pad; /* array of DMA pad buffers */
> - dma_addr_t pad_dma;
> -
> struct ata_ioports ioaddr; /* ATA cmd/ctl/dma register blocks */
>
> u8 ctl; /* cache of ATA control register */
> @@ -1080,24 +1073,15 @@ extern void ata_port_pbar_desc(struct ata_port *ap, int bar, ssize_t offset,
> static inline struct scatterlist *
> ata_qc_first_sg(struct ata_queued_cmd *qc)
> {
> - qc->n_iter = 0;
> if (qc->n_elem)
> return qc->__sg;
> - if (qc->pad_len)
> - return &qc->pad_sgent;
> return NULL;
> }
>
> static inline struct scatterlist *
> ata_qc_next_sg(struct scatterlist *sg, struct ata_queued_cmd *qc)
> {
> - if (sg == &qc->pad_sgent)
> - return NULL;
> - if (++qc->n_iter < qc->n_elem)
> - return sg_next(sg);
> - if (qc->pad_len)
> - return &qc->pad_sgent;
> - return NULL;
> + return sg_next(sg);
> }
>
> #define ata_for_each_sg(sg, qc) \
How about removing ata_qc_first_sg and ata_qc_next_sg completely?
Now we can just replace ata_qc_next_sg with sg_next. qc->__sg seems to
be always initialized to NULL so we can remove ata_qc_first_sg too.
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 4f6404c..2774882 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1054,25 +1054,8 @@ extern void ata_port_pbar_desc(struct ata_port *ap, int bar, ssize_t offset,
const char *name);
#endif
-/*
- * qc helpers
- */
-static inline struct scatterlist *
-ata_qc_first_sg(struct ata_queued_cmd *qc)
-{
- if (qc->n_elem)
- return qc->__sg;
- return NULL;
-}
-
-static inline struct scatterlist *
-ata_qc_next_sg(struct scatterlist *sg, struct ata_queued_cmd *qc)
-{
- return sg_next(sg);
-}
-
#define ata_for_each_sg(sg, qc) \
- for (sg = ata_qc_first_sg(qc); sg; sg = ata_qc_next_sg(sg, qc))
+ for (sg = qc->__sg; sg; sg = sg_next(sg))
static inline unsigned int ata_tag_valid(unsigned int tag)
{
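(A hedged userspace mock of why this works — illustrative only, these are
made-up names, not the kernel's struct scatterlist: once no pad entry has
to be spliced in, iteration is a plain walk of a NULL-terminated list,
which is exactly what sg_next() provides.)

#include <stdio.h>

/* illustrative stand-in for struct scatterlist */
struct sg_mock {
	unsigned int length;
	struct sg_mock *next;	/* NULL marks the end of the list */
};

/* the simplified ata_for_each_sg reduces to this shape */
#define for_each_sg_mock(sg, first) \
	for ((sg) = (first); (sg); (sg) = (sg)->next)

int main(void)
{
	struct sg_mock c = { 100, NULL };
	struct sg_mock b = { 4096, &c };
	struct sg_mock a = { 4096, &b };
	struct sg_mock *sg;
	unsigned int total = 0;

	for_each_sg_mock(sg, &a)
		total += sg->length;

	printf("total transfer length: %u\n", total);	/* prints 8292 */
	return 0;
}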
* Re: [PATCH] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-01-03 7:58 ` FUJITA Tomonori
@ 2008-01-03 15:12 ` James Bottomley
0 siblings, 0 replies; 33+ messages in thread
From: James Bottomley @ 2008-01-03 15:12 UTC (permalink / raw)
To: FUJITA Tomonori; +Cc: linux-scsi, linux-ide, Jens.Axboe, fujita.tomonori, jeff
On Thu, 2008-01-03 at 16:58 +0900, FUJITA Tomonori wrote:
> On Mon, 31 Dec 2007 15:56:08 -0600
> James Bottomley <James.Bottomley@HansenPartnership.com> wrote:
> > @@ -1080,24 +1073,15 @@ extern void ata_port_pbar_desc(struct ata_port *ap, int bar, ssize_t offset,
> > static inline struct scatterlist *
> > ata_qc_first_sg(struct ata_queued_cmd *qc)
> > {
> > - qc->n_iter = 0;
> > if (qc->n_elem)
> > return qc->__sg;
> > - if (qc->pad_len)
> > - return &qc->pad_sgent;
> > return NULL;
> > }
> >
> > static inline struct scatterlist *
> > ata_qc_next_sg(struct scatterlist *sg, struct ata_queued_cmd *qc)
> > {
> > - if (sg == &qc->pad_sgent)
> > - return NULL;
> > - if (++qc->n_iter < qc->n_elem)
> > - return sg_next(sg);
> > - if (qc->pad_len)
> > - return &qc->pad_sgent;
> > - return NULL;
> > + return sg_next(sg);
> > }
> >
> > #define ata_for_each_sg(sg, qc) \
>
> How about removing ata_qc_first_sg and ata_qc_next_sg completely?
>
> Now we can just replace ata_qc_next_sg with sg_next. qc->__sg seems to
> be always initialized to NULL so we can remove ata_qc_first_sg too.
Sure ... I assumed (without actually looking) that the inlines were
there because they were an API used throughout the drivers. Actually,
grep tells me they're only used in the ata_for_each_sg macro, so this
patch looks good.
Thanks,
James
> diff --git a/include/linux/libata.h b/include/linux/libata.h
> index 4f6404c..2774882 100644
> --- a/include/linux/libata.h
> +++ b/include/linux/libata.h
> @@ -1054,25 +1054,8 @@ extern void ata_port_pbar_desc(struct ata_port *ap, int bar, ssize_t offset,
> const char *name);
> #endif
>
> -/*
> - * qc helpers
> - */
> -static inline struct scatterlist *
> -ata_qc_first_sg(struct ata_queued_cmd *qc)
> -{
> - if (qc->n_elem)
> - return qc->__sg;
> - return NULL;
> -}
> -
> -static inline struct scatterlist *
> -ata_qc_next_sg(struct scatterlist *sg, struct ata_queued_cmd *qc)
> -{
> - return sg_next(sg);
> -}
> -
> #define ata_for_each_sg(sg, qc) \
> - for (sg = ata_qc_first_sg(qc); sg; sg = ata_qc_next_sg(sg, qc))
> + for (sg = qc->__sg; sg; sg = sg_next(sg))
>
> static inline unsigned int ata_tag_valid(unsigned int tag)
> {
* Re: [PATCH] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2007-12-31 21:56 [PATCH] libata: eliminate the home grown dma padding in favour of that provided by the block layer James Bottomley
2007-12-31 22:56 ` Jeff Garzik
2008-01-03 7:58 ` FUJITA Tomonori
@ 2008-01-09 2:10 ` Tejun Heo
2008-01-09 4:24 ` James Bottomley
2008-01-18 23:14 ` [PATCH RESEND] " James Bottomley
3 siblings, 1 reply; 33+ messages in thread
From: Tejun Heo @ 2008-01-09 2:10 UTC (permalink / raw)
To: James Bottomley
Cc: linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori, Jeff Garzik
James Bottomley wrote:
> ATA requires that all DMA transfers begin and end on word boundaries.
> Because of this, a large amount of machinery grew up in ide to adjust
> scatterlists on this basis. However, as of 2.5, the block layer has a
> dma_alignment variable which ensures both the beginning and length of a
> DMA transfer are aligned on the dma_alignment boundary. Although the
> block layer does adjust the beginning of the transfer to ensure this
> happens, it doesn't actually adjust the length, it merely makes sure
> that space is allocated for transfers beyond the declared length. The
> upshot of this is that scatterlists may be padded to any size between
> the actual length and the length adjusted to the dma_alignment safely
> knowing that memory is allocated in this region.
>
> Right at the moment, SCSI takes the default dma_alignment which is on a
> 512 byte boundary. Note that this alignment only applies to transfers
> coming in from user space. However, since all kernel allocations are
> automatically aligned on a minimum of 32 byte boundaries, it is safe to
> adjust them in this manner as well.
>
> Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
>
> ---
>
> This also fixes a current panic in libsas with SATAPI devices. I was
> trying to trace a bad interaction between the libata padding and the new
> sglist code which was the root cause, and ended up concluding that the
> whole libata padding thing was redundant.
>
> In a future improvement, I think we can relax SCSI dma_alignment (it's
> causing almost every user space command to be copied instead of directly
> passed through) and allow devices and transports to build up their own
> requirements instead.
Great, but this intersects with currently queued and pending changes for
libata-dev#upstream and needs to be merged. Also, DMA alignment at the
block layer isn't enough for ATA. ATA needs drain buffers for ATAPI
commands with variable-length responses. :-(
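(To make the drain point concrete — a hedged sketch of the idea, not the
libata implementation: an ATAPI device, not the host, decides the actual
response length, so the host must keep consuming data into a scratch
buffer after the mapped sg list is exhausted; block-layer padding alone
cannot cover that. Sizes and names below are illustrative.)

#include <stdio.h>
#include <string.h>

#define DRAIN_CHUNK 32	/* illustrative scratch size, not libata's */

/* consume a device response that may exceed the mapped buffer */
static void consume_response(const unsigned char *dev, size_t dev_len,
			     unsigned char *buf, size_t buf_len)
{
	unsigned char drain[DRAIN_CHUNK];
	size_t kept = dev_len < buf_len ? dev_len : buf_len;
	size_t off;

	memcpy(buf, dev, kept);			/* data the caller asked for */

	for (off = kept; off < dev_len; off += DRAIN_CHUNK) {
		size_t n = dev_len - off;
		if (n > DRAIN_CHUNK)
			n = DRAIN_CHUNK;
		memcpy(drain, dev + off, n);	/* excess is thrown away */
	}
	printf("kept %zu of %zu bytes\n", kept, dev_len);
}

int main(void)
{
	unsigned char response[100], buf[64];

	memset(response, 0xab, sizeof(response));
	consume_response(response, sizeof(response), buf, sizeof(buf));
	return 0;
}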
--
tejun
* Re: [PATCH] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-01-09 2:10 ` Tejun Heo
@ 2008-01-09 4:24 ` James Bottomley
2008-01-09 5:13 ` Tejun Heo
0 siblings, 1 reply; 33+ messages in thread
From: James Bottomley @ 2008-01-09 4:24 UTC (permalink / raw)
To: Tejun Heo; +Cc: linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori, Jeff Garzik
On Wed, 2008-01-09 at 11:10 +0900, Tejun Heo wrote:
> James Bottomley wrote:
> > ATA requires that all DMA transfers begin and end on word boundaries.
> > Because of this, a large amount of machinery grew up in ide to adjust
> > scatterlists on this basis. However, as of 2.5, the block layer has a
> > dma_alignment variable which ensures both the beginning and length of a
> > DMA transfer are aligned on the dma_alignment boundary. Although the
> > block layer does adjust the beginning of the transfer to ensure this
> > happens, it doesn't actually adjust the length, it merely makes sure
> > that space is allocated for transfers beyond the declared length. The
> > upshot of this is that scatterlists may be padded to any size between
> > the actual length and the length adjusted to the dma_alignment safely
> > knowing that memory is allocated in this region.
> >
> > Right at the moment, SCSI takes the default dma_alignment which is on a
> > 512 byte boundary. Note that this alignment only applies to transfers
> > coming in from user space. However, since all kernel allocations are
> > automatically aligned on a minimum of 32 byte boundaries, it is safe to
> > adjust them in this manner as well.
> >
> > Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
> >
> > ---
> >
> > This also fixes a current panic in libsas with SATAPI devices. I was
> > trying to trace a bad interaction between the libata padding and the new
> > sglist code which was the root cause, and ended up concluding that the
> > whole libata padding thing was redundant.
> >
> > In a future improvement, I think we can relax SCSI dma_alignment (it's
> > causing almost every user space command to be copied instead of directly
> > passed through) and allow devices and transports to build up their own
> > requirements instead.
>
> Great but this intersects with currently queued and pending changes for
> libata-dev#upstream and needs to be merged.
That's OK, I can do the merge ... that's what git is for.
> Also, DMA alignment at
> block layer isn't enough for ATA. ATA needs drain buffers for ATAPI
> commands with variable length response. :-(
OK, where is this in the libata code? The dma_pad size is only 4 bytes,
so this drain, I assume, is only a word long? Given the word alignment
requirements of ATA, doesn't this still mean it's only draining up to the
word boundary anyway (so the code is still correct)?
James
* Re: [PATCH] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-01-09 4:24 ` James Bottomley
@ 2008-01-09 5:13 ` Tejun Heo
2008-01-09 15:13 ` James Bottomley
0 siblings, 1 reply; 33+ messages in thread
From: Tejun Heo @ 2008-01-09 5:13 UTC (permalink / raw)
To: James Bottomley
Cc: linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori, Jeff Garzik
James Bottomley wrote:
>> Also, DMA alignment at
>> block layer isn't enough for ATA. ATA needs drain buffers for ATAPI
>> commands with variable length response. :-(
>
> OK, where is this in the libata code? The dma_pad size is only 4 bytes,
> so this drain, I assume is only a word long? Given the word alignment
> requirements of ATA doesn't this still mean it's only draining up to the
> word boundary anyway (so the code is still correct)?
Patch is acked but not merged yet.
http://git.kernel.org/?p=linux/kernel/git/tj/libata-dev.git;a=commit;h=6cd22a0febc74bebe52d58eb22271b8770892a2d
The full function can be read from the following. It's
ata_sg_setup_extra().
http://git.kernel.org/?p=linux/kernel/git/tj/libata-dev.git;a=blob;f=drivers/ata/libata-core.c;h=d763c072e6cefc724ea24cb68a7adf47b340f054;hb=6cd22a0febc74bebe52d58eb22271b8770892a2d
--
tejun
* Re: [PATCH] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-01-09 5:13 ` Tejun Heo
@ 2008-01-09 15:13 ` James Bottomley
0 siblings, 0 replies; 33+ messages in thread
From: James Bottomley @ 2008-01-09 15:13 UTC (permalink / raw)
To: Tejun Heo; +Cc: linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori, Jeff Garzik
On Wed, 2008-01-09 at 14:13 +0900, Tejun Heo wrote:
> James Bottomley wrote:
> >> Also, DMA alignment at
> >> block layer isn't enough for ATA. ATA needs drain buffers for ATAPI
> >> commands with variable length response. :-(
> >
> > OK, where is this in the libata code? The dma_pad size is only 4 bytes,
> > so this drain, I assume is only a word long? Given the word alignment
> > requirements of ATA doesn't this still mean it's only draining up to the
> > word boundary anyway (so the code is still correct)?
>
> Patch is acked but not merged yet.
>
> http://git.kernel.org/?p=linux/kernel/git/tj/libata-dev.git;a=commit;h=6cd22a0febc74bebe52d58eb22271b8770892a2d
>
> The full function can be read from the following. It's
> ata_sg_setup_extra().
>
> http://git.kernel.org/?p=linux/kernel/git/tj/libata-dev.git;a=blob;f=drivers/ata/libata-core.c;h=d763c072e6cefc724ea24cb68a7adf47b340f054;hb=6cd22a0febc74bebe52d58eb22271b8770892a2d
OK, but that patch was sent to the mailing list on 4 Jan ... five days
after this one. It's a little hard to take unposted patches into
account ...
It's a fairly comprehensive merge clash ... where is this drain patch on
the upstream track? Because currently ipr and aic94xx panic the system
without the dma padding removal (I just figured -rc6 was a bit late for
major surgery like this, but I was planning a backport if it stood up in
2.6.24).
James
* [PATCH RESEND] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2007-12-31 21:56 [PATCH] libata: eliminate the home grown dma padding in favour of that provided by the block layer James Bottomley
` (2 preceding siblings ...)
2008-01-09 2:10 ` Tejun Heo
@ 2008-01-18 23:14 ` James Bottomley
2008-02-01 19:40 ` [PATCH RESEND number 2] " James Bottomley
3 siblings, 1 reply; 33+ messages in thread
From: James Bottomley @ 2008-01-18 23:14 UTC (permalink / raw)
To: linux-scsi; +Cc: linux-ide, Jens Axboe, FUJITA Tomonori, Jeff Garzik, Tejun Heo
ATA requires that all DMA transfers begin and end on word boundaries.
Because of this, a large amount of machinery grew up in ide to adjust
scatterlists on this basis. However, as of 2.5, the block layer has a
dma_alignment variable which ensures both the beginning and length of a
DMA transfer are aligned on the dma_alignment boundary. Although the
block layer does adjust the beginning of the transfer to ensure this
happens, it doesn't actually adjust the length, it merely makes sure
that space is allocated for transfers beyond the declared length. The
upshot of this is that scatterlists may be padded to any size between
the actual length and the length adjusted to the dma_alignment safely
knowing that memory is allocated in this region.
Right at the moment, SCSI takes the default dma_alignment which is on a
512 byte boundary. Note that this alignment only applies to transfers
coming in from user space. However, since all kernel allocations are
automatically aligned on a minimum of 32 byte boundaries, it is safe to
adjust them in this manner as well.
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
drivers/ata/ahci.c | 5 --
drivers/ata/libata-core.c | 146 +++++-----------------------------------
drivers/ata/libata-scsi.c | 16 +----
drivers/ata/pata_icside.c | 8 --
drivers/ata/sata_fsl.c | 17 +----
drivers/ata/sata_mv.c | 5 --
drivers/ata/sata_sil24.c | 5 --
drivers/scsi/ipr.c | 4 +-
drivers/scsi/libsas/sas_ata.c | 4 +-
include/linux/libata.h | 28 +--------
10 files changed, 28 insertions(+), 210 deletions(-)
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 49761bc..9b363ea 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1964,16 +1964,11 @@ static int ahci_port_start(struct ata_port *ap)
struct ahci_port_priv *pp;
void *mem;
dma_addr_t mem_dma;
- int rc;
pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
if (!pp)
return -ENOMEM;
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
mem = dmam_alloc_coherent(dev, AHCI_PORT_PRIV_DMA_SZ, &mem_dma,
GFP_KERNEL);
if (!mem)
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 06ff781..9802fe3 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -60,6 +60,8 @@
#include <linux/io.h>
#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_dbg.h>
#include <scsi/scsi_host.h>
#include <linux/libata.h>
#include <asm/semaphore.h>
@@ -4476,30 +4478,13 @@ void ata_sg_clean(struct ata_queued_cmd *qc)
struct ata_port *ap = qc->ap;
struct scatterlist *sg = qc->sg;
int dir = qc->dma_dir;
- void *pad_buf = NULL;
WARN_ON(sg == NULL);
- VPRINTK("unmapping %u sg elements\n", qc->mapped_n_elem);
+ VPRINTK("unmapping %u sg elements\n", qc->n_elem);
- /* if we padded the buffer out to 32-bit bound, and data
- * xfer direction is from-device, we must copy from the
- * pad buffer back into the supplied buffer
- */
- if (qc->pad_len && !(qc->tf.flags & ATA_TFLAG_WRITE))
- pad_buf = ap->pad + (qc->tag * ATA_DMA_PAD_SZ);
-
- if (qc->mapped_n_elem)
- dma_unmap_sg(ap->dev, sg, qc->mapped_n_elem, dir);
- /* restore last sg */
- if (qc->last_sg)
- *qc->last_sg = qc->saved_last_sg;
- if (pad_buf) {
- struct scatterlist *psg = &qc->extra_sg[1];
- void *addr = kmap_atomic(sg_page(psg), KM_IRQ0);
- memcpy(addr + psg->offset, pad_buf, qc->pad_len);
- kunmap_atomic(addr, KM_IRQ0);
- }
+ if (qc->n_elem)
+ dma_unmap_sg(ap->dev, sg, qc->n_elem, dir);
qc->flags &= ~ATA_QCFLAG_DMAMAP;
qc->sg = NULL;
@@ -4765,97 +4750,6 @@ void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
qc->cursg = qc->sg;
}
-static unsigned int ata_sg_setup_extra(struct ata_queued_cmd *qc,
- unsigned int *n_elem_extra,
- unsigned int *nbytes_extra)
-{
- struct ata_port *ap = qc->ap;
- unsigned int n_elem = qc->n_elem;
- struct scatterlist *lsg, *copy_lsg = NULL, *tsg = NULL, *esg = NULL;
-
- *n_elem_extra = 0;
- *nbytes_extra = 0;
-
- /* needs padding? */
- qc->pad_len = qc->nbytes & 3;
-
- if (likely(!qc->pad_len))
- return n_elem;
-
- /* locate last sg and save it */
- lsg = sg_last(qc->sg, n_elem);
- qc->last_sg = lsg;
- qc->saved_last_sg = *lsg;
-
- sg_init_table(qc->extra_sg, ARRAY_SIZE(qc->extra_sg));
-
- if (qc->pad_len) {
- struct scatterlist *psg = &qc->extra_sg[1];
- void *pad_buf = ap->pad + (qc->tag * ATA_DMA_PAD_SZ);
- unsigned int offset;
-
- WARN_ON(qc->dev->class != ATA_DEV_ATAPI);
-
- memset(pad_buf, 0, ATA_DMA_PAD_SZ);
-
- /* psg->page/offset are used to copy to-be-written
- * data in this function or read data in ata_sg_clean.
- */
- offset = lsg->offset + lsg->length - qc->pad_len;
- sg_set_page(psg, nth_page(sg_page(lsg), offset >> PAGE_SHIFT),
- qc->pad_len, offset_in_page(offset));
-
- if (qc->tf.flags & ATA_TFLAG_WRITE) {
- void *addr = kmap_atomic(sg_page(psg), KM_IRQ0);
- memcpy(pad_buf, addr + psg->offset, qc->pad_len);
- kunmap_atomic(addr, KM_IRQ0);
- }
-
- sg_dma_address(psg) = ap->pad_dma + (qc->tag * ATA_DMA_PAD_SZ);
- sg_dma_len(psg) = ATA_DMA_PAD_SZ;
-
- /* Trim the last sg entry and chain the original and
- * padding sg lists.
- *
- * Because chaining consumes one sg entry, one extra
- * sg entry is allocated and the last sg entry is
- * copied to it if the length isn't zero after padded
- * amount is removed.
- *
- * If the last sg entry is completely replaced by
- * padding sg entry, the first sg entry is skipped
- * while chaining.
- */
- lsg->length -= qc->pad_len;
- if (lsg->length) {
- copy_lsg = &qc->extra_sg[0];
- tsg = &qc->extra_sg[0];
- } else {
- n_elem--;
- tsg = &qc->extra_sg[1];
- }
-
- esg = &qc->extra_sg[1];
-
- (*n_elem_extra)++;
- (*nbytes_extra) += 4 - qc->pad_len;
- }
-
- if (copy_lsg)
- sg_set_page(copy_lsg, sg_page(lsg), lsg->length, lsg->offset);
-
- sg_chain(lsg, 1, tsg);
- sg_mark_end(esg);
-
- /* sglist can't start with chaining sg entry, fast forward */
- if (qc->sg == lsg) {
- qc->sg = tsg;
- qc->cursg = tsg;
- }
-
- return n_elem;
-}
-
/**
* ata_sg_setup - DMA-map the scatter-gather table associated with a command.
* @qc: Command with scatter-gather table to be mapped.
@@ -4872,26 +4766,29 @@ static unsigned int ata_sg_setup_extra(struct ata_queued_cmd *qc,
static int ata_sg_setup(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
- unsigned int n_elem, n_elem_extra, nbytes_extra;
+ unsigned int n_elem;
+ int pad = qc->nbytes & ATA_DMA_PAD_MASK;
VPRINTK("ENTER, ata%u\n", ap->print_id);
- n_elem = ata_sg_setup_extra(qc, &n_elem_extra, &nbytes_extra);
+ n_elem = qc->n_elem;
+
+ /* needs padding? */
+ if (pad) {
+ struct scatterlist *sg = sg_last(qc->sg, n_elem);
+ qc->nbytes += ATA_DMA_PAD_SZ - pad;
+ sg->length += ATA_DMA_PAD_SZ - pad;
+ }
- if (n_elem) {
+ if (qc->n_elem) {
n_elem = dma_map_sg(ap->dev, qc->sg, n_elem, qc->dma_dir);
- if (n_elem < 1) {
- /* restore last sg */
- if (qc->last_sg)
- *qc->last_sg = qc->saved_last_sg;
+ if (n_elem < 1)
return -1;
- }
+
DPRINTK("%d sg elements mapped\n", n_elem);
}
- qc->n_elem = qc->mapped_n_elem = n_elem;
- qc->n_elem += n_elem_extra;
- qc->nbytes += nbytes_extra;
+ qc->n_elem = n_elem;
qc->flags |= ATA_QCFLAG_DMAMAP;
return 0;
@@ -6581,17 +6478,12 @@ void ata_host_resume(struct ata_host *host)
int ata_port_start(struct ata_port *ap)
{
struct device *dev = ap->dev;
- int rc;
ap->prd = dmam_alloc_coherent(dev, ATA_PRD_TBL_SZ, &ap->prd_dma,
GFP_KERNEL);
if (!ap->prd)
return -ENOMEM;
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
DPRINTK("prd alloc, virt %p, dma %llx\n", ap->prd,
(unsigned long long)ap->prd_dma);
return 0;
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 58f0208..c4aeaab 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -835,15 +835,6 @@ static void ata_scsi_dev_config(struct scsi_device *sdev,
/* configure max sectors */
blk_queue_max_sectors(sdev->request_queue, dev->max_sectors);
- /* SATA DMA transfers must be multiples of 4 byte, so
- * we need to pad ATAPI transfers using an extra sg.
- * Decrement max hw segments accordingly.
- */
- if (dev->class == ATA_DEV_ATAPI) {
- struct request_queue *q = sdev->request_queue;
- blk_queue_max_hw_segments(q, q->max_hw_segments - 1);
- }
-
if (dev->class == ATA_DEV_ATA)
sdev->manage_start_stop = 1;
@@ -3551,7 +3542,7 @@ EXPORT_SYMBOL_GPL(ata_sas_port_alloc);
* @ap: Port to initialize
*
* Called just after data structures for each port are
- * initialized. Allocates DMA pad.
+ * initialized.
*
* May be used as the port_start() entry in ata_port_operations.
*
@@ -3560,7 +3551,7 @@ EXPORT_SYMBOL_GPL(ata_sas_port_alloc);
*/
int ata_sas_port_start(struct ata_port *ap)
{
- return ata_pad_alloc(ap, ap->dev);
+ return 0;
}
EXPORT_SYMBOL_GPL(ata_sas_port_start);
@@ -3568,8 +3559,6 @@ EXPORT_SYMBOL_GPL(ata_sas_port_start);
* ata_port_stop - Undo ata_sas_port_start()
* @ap: Port to shut down
*
- * Frees the DMA pad.
- *
* May be used as the port_stop() entry in ata_port_operations.
*
* LOCKING:
@@ -3578,7 +3567,6 @@ EXPORT_SYMBOL_GPL(ata_sas_port_start);
void ata_sas_port_stop(struct ata_port *ap)
{
- ata_pad_free(ap, ap->dev);
}
EXPORT_SYMBOL_GPL(ata_sas_port_stop);
diff --git a/drivers/ata/pata_icside.c b/drivers/ata/pata_icside.c
index 5b8586d..f97068b 100644
--- a/drivers/ata/pata_icside.c
+++ b/drivers/ata/pata_icside.c
@@ -304,12 +304,6 @@ static int icside_dma_init(struct pata_icside_info *info)
}
-static int pata_icside_port_start(struct ata_port *ap)
-{
- /* No PRD to alloc */
- return ata_pad_alloc(ap, ap->dev);
-}
-
static struct scsi_host_template pata_icside_sht = {
.module = THIS_MODULE,
.name = DRV_NAME,
@@ -389,8 +383,6 @@ static struct ata_port_operations pata_icside_port_ops = {
.irq_clear = ata_dummy_noret,
.irq_on = ata_irq_on,
- .port_start = pata_icside_port_start,
-
.bmdma_stop = pata_icside_bmdma_stop,
.bmdma_status = pata_icside_bmdma_status,
};
diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c
index d041709..5d40e4b 100644
--- a/drivers/ata/sata_fsl.c
+++ b/drivers/ata/sata_fsl.c
@@ -356,8 +356,8 @@ static unsigned int sata_fsl_fill_sg(struct ata_queued_cmd *qc, void *cmd_desc,
ata_port_printk(qc->ap, KERN_ERR,
"s/g len unaligned : 0x%x\n", sg_len);
- if ((num_prde == (SATA_FSL_MAX_PRD_DIRECT - 1)) &&
- (qc->n_iter + 1 != qc->n_elem)) {
+ if (num_prde == (SATA_FSL_MAX_PRD_DIRECT - 1) &&
+ sg_next(sg) != NULL) {
VPRINTK("setting indirect prde\n");
prd_ptr_to_indirect_ext = prd;
prd->dba = cpu_to_le32(indirect_ext_segment_paddr);
@@ -602,21 +602,9 @@ static int sata_fsl_port_start(struct ata_port *ap)
if (!pp)
return -ENOMEM;
- /*
- * allocate per command dma alignment pad buffer, which is used
- * internally by libATA to ensure that all transfers ending on
- * unaligned boundaries are padded, to align on Dword boundaries
- */
- retval = ata_pad_alloc(ap, dev);
- if (retval) {
- kfree(pp);
- return retval;
- }
-
mem = dma_alloc_coherent(dev, SATA_FSL_PORT_PRIV_DMA_SZ, &mem_dma,
GFP_KERNEL);
if (!mem) {
- ata_pad_free(ap, dev);
kfree(pp);
return -ENOMEM;
}
@@ -695,7 +683,6 @@ static void sata_fsl_port_stop(struct ata_port *ap)
dma_free_coherent(dev, SATA_FSL_PORT_PRIV_DMA_SZ,
pp->cmdslot, pp->cmdslot_paddr);
- ata_pad_free(ap, dev);
kfree(pp);
}
diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
index 7e72463..73fb2ba 100644
--- a/drivers/ata/sata_mv.c
+++ b/drivers/ata/sata_mv.c
@@ -1054,7 +1054,6 @@ static int mv_port_start(struct ata_port *ap)
void *mem;
dma_addr_t mem_dma;
unsigned long flags;
- int rc;
pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
if (!pp)
@@ -1066,10 +1065,6 @@ static int mv_port_start(struct ata_port *ap)
return -ENOMEM;
memset(mem, 0, MV_PORT_PRIV_DMA_SZ);
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
/* First item in chunk of DMA memory:
* 32-slot command request table (CRQB), 32 bytes each in size
*/
diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index b4b1f91..df7988d 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -1234,7 +1234,6 @@ static int sil24_port_start(struct ata_port *ap)
union sil24_cmd_block *cb;
size_t cb_size = sizeof(*cb) * SIL24_MAX_CMDS;
dma_addr_t cb_dma;
- int rc;
pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
if (!pp)
@@ -1247,10 +1246,6 @@ static int sil24_port_start(struct ata_port *ap)
return -ENOMEM;
memset(cb, 0, cb_size);
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
pp->cmd_block = cb;
pp->cmd_block_dma = cb_dma;
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index 73270ff..6762e89 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -5140,7 +5140,7 @@ static void ipr_build_ata_ioadl(struct ipr_cmnd *ipr_cmd,
struct ipr_ioarcb *ioarcb = &ipr_cmd->ioarcb;
struct ipr_ioadl_desc *ioadl = ipr_cmd->ioadl;
struct ipr_ioadl_desc *last_ioadl = NULL;
- int len = qc->nbytes + qc->pad_len;
+ int len = qc->nbytes;
struct scatterlist *sg;
unsigned int si;
@@ -5206,7 +5206,7 @@ static unsigned int ipr_qc_issue(struct ata_queued_cmd *qc)
ioarcb->cmd_pkt.request_type = IPR_RQTYPE_ATA_PASSTHRU;
ioarcb->cmd_pkt.flags_hi |= IPR_FLAGS_HI_NO_LINK_DESC;
ioarcb->cmd_pkt.flags_hi |= IPR_FLAGS_HI_NO_ULEN_CHK;
- ipr_cmd->dma_use_sg = qc->pad_len ? qc->n_elem + 1 : qc->n_elem;
+ ipr_cmd->dma_use_sg = qc->n_elem;
ipr_build_ata_ioadl(ipr_cmd, qc);
regs->flags |= IPR_ATA_FLAG_STATUS_ON_GOOD_COMPLETION;
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 0996f86..7cd05b5 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -178,8 +178,8 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
task->uldd_task = qc;
if (ata_is_atapi(qc->tf.protocol)) {
memcpy(task->ata_task.atapi_packet, qc->cdb, qc->dev->cdb_len);
- task->total_xfer_len = qc->nbytes + qc->pad_len;
- task->num_scatter = qc->pad_len ? qc->n_elem + 1 : qc->n_elem;
+ task->total_xfer_len = qc->nbytes;
+ task->num_scatter = qc->n_elem;
} else {
for_each_sg(qc->sg, sg, qc->n_elem, si)
xfer += sg->length;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index ccb0556..8ee6dfc 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -278,7 +278,7 @@ enum {
/* size of buffer to pad xfers ending on unaligned boundaries */
ATA_DMA_PAD_SZ = 4,
- ATA_DMA_PAD_BUF_SZ = ATA_DMA_PAD_SZ * ATA_MAX_QUEUE,
+ ATA_DMA_PAD_MASK = ATA_DMA_PAD_SZ - 1,
/* ering size */
ATA_ERING_SIZE = 32,
@@ -457,12 +457,9 @@ struct ata_queued_cmd {
unsigned long flags; /* ATA_QCFLAG_xxx */
unsigned int tag;
unsigned int n_elem;
- unsigned int n_iter;
- unsigned int mapped_n_elem;
int dma_dir;
- unsigned int pad_len;
unsigned int sect_size;
unsigned int nbytes;
@@ -472,10 +469,7 @@ struct ata_queued_cmd {
struct scatterlist *cursg;
unsigned int cursg_ofs;
- struct scatterlist *last_sg;
- struct scatterlist saved_last_sg;
struct scatterlist sgent;
- struct scatterlist extra_sg[2];
struct scatterlist *sg;
@@ -620,9 +614,6 @@ struct ata_port {
struct ata_prd *prd; /* our SG list */
dma_addr_t prd_dma; /* and its DMA mapping */
- void *pad; /* array of DMA pad buffers */
- dma_addr_t pad_dma;
-
struct ata_ioports ioaddr; /* ATA cmd/ctl/dma register blocks */
u8 ctl; /* cache of ATA control register */
@@ -1363,11 +1354,7 @@ static inline void ata_qc_reinit(struct ata_queued_cmd *qc)
qc->cursg_ofs = 0;
qc->nbytes = qc->raw_nbytes = qc->curbytes = 0;
qc->n_elem = 0;
- qc->mapped_n_elem = 0;
- qc->n_iter = 0;
qc->err_mask = 0;
- qc->pad_len = 0;
- qc->last_sg = NULL;
qc->sect_size = ATA_SECT_SIZE;
ata_tf_init(qc->dev, &qc->tf);
@@ -1422,19 +1409,6 @@ static inline unsigned int __ac_err_mask(u8 status)
return mask;
}
-static inline int ata_pad_alloc(struct ata_port *ap, struct device *dev)
-{
- ap->pad_dma = 0;
- ap->pad = dmam_alloc_coherent(dev, ATA_DMA_PAD_BUF_SZ,
- &ap->pad_dma, GFP_KERNEL);
- return (ap->pad == NULL) ? -ENOMEM : 0;
-}
-
-static inline void ata_pad_free(struct ata_port *ap, struct device *dev)
-{
- dmam_free_coherent(dev, ATA_DMA_PAD_BUF_SZ, ap->pad, ap->pad_dma);
-}
-
static inline struct ata_port *ata_shost_to_port(struct Scsi_Host *host)
{
return *(struct ata_port **)&host->hostdata[0];
--
1.5.3.7
* [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-01-18 23:14 ` [PATCH RESEND] " James Bottomley
@ 2008-02-01 19:40 ` James Bottomley
2008-02-01 20:02 ` Jeff Garzik
0 siblings, 1 reply; 33+ messages in thread
From: James Bottomley @ 2008-02-01 19:40 UTC (permalink / raw)
To: linux-scsi; +Cc: linux-ide, Jens Axboe, FUJITA Tomonori, Jeff Garzik, Tejun Heo
Could we please get this in ... I thought I mentioned several times that
it fixes a fatal oops in both aic94xx and ipr.
James
---
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Mon, 31 Dec 2007 15:56:08 -0600
Subject: libata: eliminate the home grown dma padding in favour of that provided by the block layer
ATA requires that all DMA transfers begin and end on word boundaries.
Because of this, a large amount of machinery grew up in ide to adjust
scatterlists on this basis. However, as of 2.5, the block layer has a
dma_alignment variable which ensures both the beginning and length of a
DMA transfer are aligned on the dma_alignment boundary. Although the
block layer does adjust the beginning of the transfer to ensure this
happens, it doesn't actually adjust the length, it merely makes sure
that space is allocated for transfers beyond the declared length. The
upshot of this is that scatterlists may be padded to any size between
the actual length and the length adjusted to the dma_alignment safely
knowing that memory is allocated in this region.
Right at the moment, SCSI takes the default dma_alignment which is on a
512 byte boundary. Note that this alignment only applies to transfers
coming in from user space. However, since all kernel allocations are
automatically aligned on a minimum of 32 byte boundaries, it is safe to
adjust them in this manner as well.
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
drivers/ata/ahci.c | 5 --
drivers/ata/libata-core.c | 146 +++++-----------------------------------
drivers/ata/libata-scsi.c | 23 +-----
drivers/ata/pata_icside.c | 8 --
drivers/ata/sata_fsl.c | 17 +----
drivers/ata/sata_mv.c | 5 --
drivers/ata/sata_sil24.c | 5 --
drivers/scsi/ipr.c | 4 +-
drivers/scsi/libsas/sas_ata.c | 4 +-
include/linux/libata.h | 28 +--------
10 files changed, 30 insertions(+), 215 deletions(-)
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 6f089b8..2a5a77a 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1977,16 +1977,11 @@ static int ahci_port_start(struct ata_port *ap)
struct ahci_port_priv *pp;
void *mem;
dma_addr_t mem_dma;
- int rc;
pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
if (!pp)
return -ENOMEM;
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
mem = dmam_alloc_coherent(dev, AHCI_PORT_PRIV_DMA_SZ, &mem_dma,
GFP_KERNEL);
if (!mem)
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index bdbd55a..679a404 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -60,6 +60,8 @@
#include <linux/io.h>
#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_dbg.h>
#include <scsi/scsi_host.h>
#include <linux/libata.h>
#include <asm/semaphore.h>
@@ -4476,30 +4478,13 @@ void ata_sg_clean(struct ata_queued_cmd *qc)
struct ata_port *ap = qc->ap;
struct scatterlist *sg = qc->sg;
int dir = qc->dma_dir;
- void *pad_buf = NULL;
WARN_ON(sg == NULL);
- VPRINTK("unmapping %u sg elements\n", qc->mapped_n_elem);
+ VPRINTK("unmapping %u sg elements\n", qc->n_elem);
- /* if we padded the buffer out to 32-bit bound, and data
- * xfer direction is from-device, we must copy from the
- * pad buffer back into the supplied buffer
- */
- if (qc->pad_len && !(qc->tf.flags & ATA_TFLAG_WRITE))
- pad_buf = ap->pad + (qc->tag * ATA_DMA_PAD_SZ);
-
- if (qc->mapped_n_elem)
- dma_unmap_sg(ap->dev, sg, qc->mapped_n_elem, dir);
- /* restore last sg */
- if (qc->last_sg)
- *qc->last_sg = qc->saved_last_sg;
- if (pad_buf) {
- struct scatterlist *psg = &qc->extra_sg[1];
- void *addr = kmap_atomic(sg_page(psg), KM_IRQ0);
- memcpy(addr + psg->offset, pad_buf, qc->pad_len);
- kunmap_atomic(addr, KM_IRQ0);
- }
+ if (qc->n_elem)
+ dma_unmap_sg(ap->dev, sg, qc->n_elem, dir);
qc->flags &= ~ATA_QCFLAG_DMAMAP;
qc->sg = NULL;
@@ -4765,97 +4750,6 @@ void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
qc->cursg = qc->sg;
}
-static unsigned int ata_sg_setup_extra(struct ata_queued_cmd *qc,
- unsigned int *n_elem_extra,
- unsigned int *nbytes_extra)
-{
- struct ata_port *ap = qc->ap;
- unsigned int n_elem = qc->n_elem;
- struct scatterlist *lsg, *copy_lsg = NULL, *tsg = NULL, *esg = NULL;
-
- *n_elem_extra = 0;
- *nbytes_extra = 0;
-
- /* needs padding? */
- qc->pad_len = qc->nbytes & 3;
-
- if (likely(!qc->pad_len))
- return n_elem;
-
- /* locate last sg and save it */
- lsg = sg_last(qc->sg, n_elem);
- qc->last_sg = lsg;
- qc->saved_last_sg = *lsg;
-
- sg_init_table(qc->extra_sg, ARRAY_SIZE(qc->extra_sg));
-
- if (qc->pad_len) {
- struct scatterlist *psg = &qc->extra_sg[1];
- void *pad_buf = ap->pad + (qc->tag * ATA_DMA_PAD_SZ);
- unsigned int offset;
-
- WARN_ON(qc->dev->class != ATA_DEV_ATAPI);
-
- memset(pad_buf, 0, ATA_DMA_PAD_SZ);
-
- /* psg->page/offset are used to copy to-be-written
- * data in this function or read data in ata_sg_clean.
- */
- offset = lsg->offset + lsg->length - qc->pad_len;
- sg_set_page(psg, nth_page(sg_page(lsg), offset >> PAGE_SHIFT),
- qc->pad_len, offset_in_page(offset));
-
- if (qc->tf.flags & ATA_TFLAG_WRITE) {
- void *addr = kmap_atomic(sg_page(psg), KM_IRQ0);
- memcpy(pad_buf, addr + psg->offset, qc->pad_len);
- kunmap_atomic(addr, KM_IRQ0);
- }
-
- sg_dma_address(psg) = ap->pad_dma + (qc->tag * ATA_DMA_PAD_SZ);
- sg_dma_len(psg) = ATA_DMA_PAD_SZ;
-
- /* Trim the last sg entry and chain the original and
- * padding sg lists.
- *
- * Because chaining consumes one sg entry, one extra
- * sg entry is allocated and the last sg entry is
- * copied to it if the length isn't zero after padded
- * amount is removed.
- *
- * If the last sg entry is completely replaced by
- * padding sg entry, the first sg entry is skipped
- * while chaining.
- */
- lsg->length -= qc->pad_len;
- if (lsg->length) {
- copy_lsg = &qc->extra_sg[0];
- tsg = &qc->extra_sg[0];
- } else {
- n_elem--;
- tsg = &qc->extra_sg[1];
- }
-
- esg = &qc->extra_sg[1];
-
- (*n_elem_extra)++;
- (*nbytes_extra) += 4 - qc->pad_len;
- }
-
- if (copy_lsg)
- sg_set_page(copy_lsg, sg_page(lsg), lsg->length, lsg->offset);
-
- sg_chain(lsg, 1, tsg);
- sg_mark_end(esg);
-
- /* sglist can't start with chaining sg entry, fast forward */
- if (qc->sg == lsg) {
- qc->sg = tsg;
- qc->cursg = tsg;
- }
-
- return n_elem;
-}
-
/**
* ata_sg_setup - DMA-map the scatter-gather table associated with a command.
* @qc: Command with scatter-gather table to be mapped.
@@ -4872,26 +4766,29 @@ static unsigned int ata_sg_setup_extra(struct ata_queued_cmd *qc,
static int ata_sg_setup(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
- unsigned int n_elem, n_elem_extra, nbytes_extra;
+ unsigned int n_elem;
+ int pad = qc->nbytes & ATA_DMA_PAD_MASK;
VPRINTK("ENTER, ata%u\n", ap->print_id);
- n_elem = ata_sg_setup_extra(qc, &n_elem_extra, &nbytes_extra);
+ n_elem = qc->n_elem;
+
+ /* needs padding? */
+ if (pad) {
+ struct scatterlist *sg = sg_last(qc->sg, n_elem);
+ qc->nbytes += ATA_DMA_PAD_SZ - pad;
+ sg->length += ATA_DMA_PAD_SZ - pad;
+ }
- if (n_elem) {
+ if (qc->n_elem) {
n_elem = dma_map_sg(ap->dev, qc->sg, n_elem, qc->dma_dir);
- if (n_elem < 1) {
- /* restore last sg */
- if (qc->last_sg)
- *qc->last_sg = qc->saved_last_sg;
+ if (n_elem < 1)
return -1;
- }
+
DPRINTK("%d sg elements mapped\n", n_elem);
}
- qc->n_elem = qc->mapped_n_elem = n_elem;
- qc->n_elem += n_elem_extra;
- qc->nbytes += nbytes_extra;
+ qc->n_elem = n_elem;
qc->flags |= ATA_QCFLAG_DMAMAP;
return 0;
@@ -6566,17 +6463,12 @@ void ata_host_resume(struct ata_host *host)
int ata_port_start(struct ata_port *ap)
{
struct device *dev = ap->dev;
- int rc;
ap->prd = dmam_alloc_coherent(dev, ATA_PRD_TBL_SZ, &ap->prd_dma,
GFP_KERNEL);
if (!ap->prd)
return -ENOMEM;
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
DPRINTK("prd alloc, virt %p, dma %llx\n", ap->prd,
(unsigned long long)ap->prd_dma);
return 0;
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index c02c490..8ffff44 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -832,24 +832,12 @@ static void ata_scsi_dev_config(struct scsi_device *sdev,
/* configure max sectors */
blk_queue_max_sectors(sdev->request_queue, dev->max_sectors);
- /* SATA DMA transfers must be multiples of 4 byte, so
- * we need to pad ATAPI transfers using an extra sg.
- * Decrement max hw segments accordingly.
- */
- if (dev->class == ATA_DEV_ATAPI) {
- struct request_queue *q = sdev->request_queue;
- blk_queue_max_hw_segments(q, q->max_hw_segments - 1);
-
- /* set the min alignment */
- blk_queue_update_dma_alignment(sdev->request_queue,
- ATA_DMA_PAD_SZ - 1);
- } else
+ if (dev->class == ATA_DEV_ATA) {
/* ATA devices must be sector aligned */
blk_queue_update_dma_alignment(sdev->request_queue,
ATA_SECT_SIZE - 1);
-
- if (dev->class == ATA_DEV_ATA)
sdev->manage_start_stop = 1;
+ }
if (dev->flags & ATA_DFLAG_AN)
set_bit(SDEV_EVT_MEDIA_CHANGE, sdev->supported_events);
@@ -3555,7 +3543,7 @@ EXPORT_SYMBOL_GPL(ata_sas_port_alloc);
* @ap: Port to initialize
*
* Called just after data structures for each port are
- * initialized. Allocates DMA pad.
+ * initialized.
*
* May be used as the port_start() entry in ata_port_operations.
*
@@ -3564,7 +3552,7 @@ EXPORT_SYMBOL_GPL(ata_sas_port_alloc);
*/
int ata_sas_port_start(struct ata_port *ap)
{
- return ata_pad_alloc(ap, ap->dev);
+ return 0;
}
EXPORT_SYMBOL_GPL(ata_sas_port_start);
@@ -3572,8 +3560,6 @@ EXPORT_SYMBOL_GPL(ata_sas_port_start);
* ata_port_stop - Undo ata_sas_port_start()
* @ap: Port to shut down
*
- * Frees the DMA pad.
- *
* May be used as the port_stop() entry in ata_port_operations.
*
* LOCKING:
@@ -3582,7 +3568,6 @@ EXPORT_SYMBOL_GPL(ata_sas_port_start);
void ata_sas_port_stop(struct ata_port *ap)
{
- ata_pad_free(ap, ap->dev);
}
EXPORT_SYMBOL_GPL(ata_sas_port_stop);
diff --git a/drivers/ata/pata_icside.c b/drivers/ata/pata_icside.c
index 5b8586d..f97068b 100644
--- a/drivers/ata/pata_icside.c
+++ b/drivers/ata/pata_icside.c
@@ -304,12 +304,6 @@ static int icside_dma_init(struct pata_icside_info *info)
}
-static int pata_icside_port_start(struct ata_port *ap)
-{
- /* No PRD to alloc */
- return ata_pad_alloc(ap, ap->dev);
-}
-
static struct scsi_host_template pata_icside_sht = {
.module = THIS_MODULE,
.name = DRV_NAME,
@@ -389,8 +383,6 @@ static struct ata_port_operations pata_icside_port_ops = {
.irq_clear = ata_dummy_noret,
.irq_on = ata_irq_on,
- .port_start = pata_icside_port_start,
-
.bmdma_stop = pata_icside_bmdma_stop,
.bmdma_status = pata_icside_bmdma_status,
};
diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c
index 922d7b2..9323dd0 100644
--- a/drivers/ata/sata_fsl.c
+++ b/drivers/ata/sata_fsl.c
@@ -355,8 +355,8 @@ static unsigned int sata_fsl_fill_sg(struct ata_queued_cmd *qc, void *cmd_desc,
ata_port_printk(qc->ap, KERN_ERR,
"s/g len unaligned : 0x%x\n", sg_len);
- if ((num_prde == (SATA_FSL_MAX_PRD_DIRECT - 1)) &&
- (qc->n_iter + 1 != qc->n_elem)) {
+ if (num_prde == (SATA_FSL_MAX_PRD_DIRECT - 1) &&
+ sg_next(sg) != NULL) {
VPRINTK("setting indirect prde\n");
prd_ptr_to_indirect_ext = prd;
prd->dba = cpu_to_le32(indirect_ext_segment_paddr);
@@ -601,21 +601,9 @@ static int sata_fsl_port_start(struct ata_port *ap)
if (!pp)
return -ENOMEM;
- /*
- * allocate per command dma alignment pad buffer, which is used
- * internally by libATA to ensure that all transfers ending on
- * unaligned boundaries are padded, to align on Dword boundaries
- */
- retval = ata_pad_alloc(ap, dev);
- if (retval) {
- kfree(pp);
- return retval;
- }
-
mem = dma_alloc_coherent(dev, SATA_FSL_PORT_PRIV_DMA_SZ, &mem_dma,
GFP_KERNEL);
if (!mem) {
- ata_pad_free(ap, dev);
kfree(pp);
return -ENOMEM;
}
@@ -694,7 +682,6 @@ static void sata_fsl_port_stop(struct ata_port *ap)
dma_free_coherent(dev, SATA_FSL_PORT_PRIV_DMA_SZ,
pp->cmdslot, pp->cmdslot_paddr);
- ata_pad_free(ap, dev);
kfree(pp);
}
diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
index 7e72463..73fb2ba 100644
--- a/drivers/ata/sata_mv.c
+++ b/drivers/ata/sata_mv.c
@@ -1054,7 +1054,6 @@ static int mv_port_start(struct ata_port *ap)
void *mem;
dma_addr_t mem_dma;
unsigned long flags;
- int rc;
pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
if (!pp)
@@ -1066,10 +1065,6 @@ static int mv_port_start(struct ata_port *ap)
return -ENOMEM;
memset(mem, 0, MV_PORT_PRIV_DMA_SZ);
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
/* First item in chunk of DMA memory:
* 32-slot command request table (CRQB), 32 bytes each in size
*/
diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index b4b1f91..df7988d 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -1234,7 +1234,6 @@ static int sil24_port_start(struct ata_port *ap)
union sil24_cmd_block *cb;
size_t cb_size = sizeof(*cb) * SIL24_MAX_CMDS;
dma_addr_t cb_dma;
- int rc;
pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
if (!pp)
@@ -1247,10 +1246,6 @@ static int sil24_port_start(struct ata_port *ap)
return -ENOMEM;
memset(cb, 0, cb_size);
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
pp->cmd_block = cb;
pp->cmd_block_dma = cb_dma;
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index 73270ff..6762e89 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -5140,7 +5140,7 @@ static void ipr_build_ata_ioadl(struct ipr_cmnd *ipr_cmd,
struct ipr_ioarcb *ioarcb = &ipr_cmd->ioarcb;
struct ipr_ioadl_desc *ioadl = ipr_cmd->ioadl;
struct ipr_ioadl_desc *last_ioadl = NULL;
- int len = qc->nbytes + qc->pad_len;
+ int len = qc->nbytes;
struct scatterlist *sg;
unsigned int si;
@@ -5206,7 +5206,7 @@ static unsigned int ipr_qc_issue(struct ata_queued_cmd *qc)
ioarcb->cmd_pkt.request_type = IPR_RQTYPE_ATA_PASSTHRU;
ioarcb->cmd_pkt.flags_hi |= IPR_FLAGS_HI_NO_LINK_DESC;
ioarcb->cmd_pkt.flags_hi |= IPR_FLAGS_HI_NO_ULEN_CHK;
- ipr_cmd->dma_use_sg = qc->pad_len ? qc->n_elem + 1 : qc->n_elem;
+ ipr_cmd->dma_use_sg = qc->n_elem;
ipr_build_ata_ioadl(ipr_cmd, qc);
regs->flags |= IPR_ATA_FLAG_STATUS_ON_GOOD_COMPLETION;
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 0996f86..7cd05b5 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -178,8 +178,8 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
task->uldd_task = qc;
if (ata_is_atapi(qc->tf.protocol)) {
memcpy(task->ata_task.atapi_packet, qc->cdb, qc->dev->cdb_len);
- task->total_xfer_len = qc->nbytes + qc->pad_len;
- task->num_scatter = qc->pad_len ? qc->n_elem + 1 : qc->n_elem;
+ task->total_xfer_len = qc->nbytes;
+ task->num_scatter = qc->n_elem;
} else {
for_each_sg(qc->sg, sg, qc->n_elem, si)
xfer += sg->length;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 4374c42..be2ace3 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -278,7 +278,7 @@ enum {
/* size of buffer to pad xfers ending on unaligned boundaries */
ATA_DMA_PAD_SZ = 4,
- ATA_DMA_PAD_BUF_SZ = ATA_DMA_PAD_SZ * ATA_MAX_QUEUE,
+ ATA_DMA_PAD_MASK = ATA_DMA_PAD_SZ - 1,
/* ering size */
ATA_ERING_SIZE = 32,
@@ -457,12 +457,9 @@ struct ata_queued_cmd {
unsigned long flags; /* ATA_QCFLAG_xxx */
unsigned int tag;
unsigned int n_elem;
- unsigned int n_iter;
- unsigned int mapped_n_elem;
int dma_dir;
- unsigned int pad_len;
unsigned int sect_size;
unsigned int nbytes;
@@ -472,10 +469,7 @@ struct ata_queued_cmd {
struct scatterlist *cursg;
unsigned int cursg_ofs;
- struct scatterlist *last_sg;
- struct scatterlist saved_last_sg;
struct scatterlist sgent;
- struct scatterlist extra_sg[2];
struct scatterlist *sg;
@@ -620,9 +614,6 @@ struct ata_port {
struct ata_prd *prd; /* our SG list */
dma_addr_t prd_dma; /* and its DMA mapping */
- void *pad; /* array of DMA pad buffers */
- dma_addr_t pad_dma;
-
struct ata_ioports ioaddr; /* ATA cmd/ctl/dma register blocks */
u8 ctl; /* cache of ATA control register */
@@ -1366,11 +1357,7 @@ static inline void ata_qc_reinit(struct ata_queued_cmd *qc)
qc->cursg_ofs = 0;
qc->nbytes = qc->raw_nbytes = qc->curbytes = 0;
qc->n_elem = 0;
- qc->mapped_n_elem = 0;
- qc->n_iter = 0;
qc->err_mask = 0;
- qc->pad_len = 0;
- qc->last_sg = NULL;
qc->sect_size = ATA_SECT_SIZE;
ata_tf_init(qc->dev, &qc->tf);
@@ -1425,19 +1412,6 @@ static inline unsigned int __ac_err_mask(u8 status)
return mask;
}
-static inline int ata_pad_alloc(struct ata_port *ap, struct device *dev)
-{
- ap->pad_dma = 0;
- ap->pad = dmam_alloc_coherent(dev, ATA_DMA_PAD_BUF_SZ,
- &ap->pad_dma, GFP_KERNEL);
- return (ap->pad == NULL) ? -ENOMEM : 0;
-}
-
-static inline void ata_pad_free(struct ata_port *ap, struct device *dev)
-{
- dmam_free_coherent(dev, ATA_DMA_PAD_BUF_SZ, ap->pad, ap->pad_dma);
-}
-
static inline struct ata_port *ata_shost_to_port(struct Scsi_Host *host)
{
return *(struct ata_port **)&host->hostdata[0];
--
1.5.3.8
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-01 19:40 ` [PATCH RESEND number 2] " James Bottomley
@ 2008-02-01 20:02 ` Jeff Garzik
2008-02-01 21:09 ` James Bottomley
0 siblings, 1 reply; 33+ messages in thread
From: Jeff Garzik @ 2008-02-01 20:02 UTC (permalink / raw)
To: James Bottomley
Cc: linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori, Tejun Heo
James Bottomley wrote:
> Could we please get this in ... I thought I mentioned several times that
> it fixes a fatal oops in both aic94xx and ipr.
Tejun has a persistent objection... see other email.
Jeff
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-01 20:02 ` Jeff Garzik
@ 2008-02-01 21:09 ` James Bottomley
2008-02-03 3:04 ` Tejun Heo
0 siblings, 1 reply; 33+ messages in thread
From: James Bottomley @ 2008-02-01 21:09 UTC (permalink / raw)
To: Jeff Garzik; +Cc: linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori, Tejun Heo
On Fri, 2008-02-01 at 15:02 -0500, Jeff Garzik wrote:
> James Bottomley wrote:
> > Could we please get this in ... I thought I mentioned several times that
> > it fixes a fatal oops in both aic94xx and ipr.
>
> Tejun has a persistent objection... see other email.
Actually, see other email .. I meant that this patch (eliminate dma
padding) is independent of the drain one.
James
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-01 21:09 ` James Bottomley
@ 2008-02-03 3:04 ` Tejun Heo
2008-02-03 4:32 ` James Bottomley
0 siblings, 1 reply; 33+ messages in thread
From: Tejun Heo @ 2008-02-03 3:04 UTC (permalink / raw)
To: James Bottomley
Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
James Bottomley wrote:
> On Fri, 2008-02-01 at 15:02 -0500, Jeff Garzik wrote:
>> James Bottomley wrote:
>>> Could we please get this in ... I thought I mentioned several times that
>>> it fixes a fatal oops in both aic94xx and ipr.
>> Tejun has a persistent objection... see other email.
>
> Actually, see other email .. I meant that this patch (eliminate dma
> padding) is independent of the drain one.
Sorry about the delay.
There's a problem here. For the blk layer dma padding itself, it's okay,
but the problem is that it blocks the pending draining patch without
supplying a usable alternative at the moment. I agree that the long term
solution should be in the block layer && I understand that it causes
problems for SAS controllers, but for the moment, if we don't include the
existing draining patch, far more ATAPI devices are affected. So, it's a
catch-22 situation.
I think the best solution is to update block layer draining such that it
can be included together before the merge window closes. I'll dig into it.
Thanks.
--
tejun
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-03 3:04 ` Tejun Heo
@ 2008-02-03 4:32 ` James Bottomley
2008-02-03 7:37 ` Tejun Heo
0 siblings, 1 reply; 33+ messages in thread
From: James Bottomley @ 2008-02-03 4:32 UTC (permalink / raw)
To: Tejun Heo; +Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
On Sun, 2008-02-03 at 12:04 +0900, Tejun Heo wrote:
> James Bottomley wrote:
> > On Fri, 2008-02-01 at 15:02 -0500, Jeff Garzik wrote:
> >> James Bottomley wrote:
> >>> Could we please get this in ... I thought I mentioned several times that
> >>> it fixes a fatal oops in both aic94xx and ipr.
> >> Tejun has a persistent objection... see other email.
> >
> > Actually, see other email .. I meant that this patch (eliminate dma
> > padding) is independent of the drain one.
>
> Sorry about the delay.
>
> There's a problem here. For the blk layer dma padding itself, it's okay,
> but the problem is that it blocks the pending draining patch without
> supplying a usable alternative at the moment.
The alternative is already there. The current drain infrastructure is
already in the block layer. The patch I sent merely activates it for
ATA. Is there some issue with this that I haven't foreseen? My analysis
is that it should do everything your patch does (except at the block
layer and in a manner that doesn't trigger the aic94xx panic).
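(For reference, a minimal sketch of what activating it looks like at
slave configure time, using the blk_queue_dma_drain() signature as it
stands upstream; ATAPI_MAX_DRAIN is whatever overrun ceiling the LLD
picks:)

	/* sketch: hook the block layer drain up for one device's queue */
	void *buf = kmalloc(ATAPI_MAX_DRAIN, GFP_KERNEL);

	if (buf)
		blk_queue_dma_drain(sdev->request_queue, buf, ATAPI_MAX_DRAIN);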
> I agree that the long term
> solution should be in the block layer && I understand that it causes
> problems for SAS controllers, but for the moment, if we don't include the
> existing draining patch, far more ATAPI devices are affected. So, it's a
> catch-22 situation.
>
> I think the best solution is to update block layer draining such that it
> can be included together before the merge window closes. I'll dig into it.
Like I said, the block layer pieces are already upstream. All we need
is the ATA bits and I think it should all work ... unless there's some
part I haven't thought of?
James
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-03 4:32 ` James Bottomley
@ 2008-02-03 7:37 ` Tejun Heo
2008-02-03 14:38 ` James Bottomley
0 siblings, 1 reply; 33+ messages in thread
From: Tejun Heo @ 2008-02-03 7:37 UTC (permalink / raw)
To: James Bottomley
Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
James Bottomley wrote:
>> I think the best solution is to update block layer draining such that it
>> can be included together before the merge window closes. I'll dig into it.
>
> Like I said, the block layer pieces are already upstream. All we need
> is the ATA bits and I think it should all work ... unless there's some
> part I haven't thought of?
Yeah, I think we'll probably need to add rq->raw_data_len and I'd really
like to map a longer drain area, but I think the basics are there. What's
needed is updating libata accordingly and testing it. I'm currently
away from all my toys. I'll implement the ATA part, test it and submit
the patch on Monday.
Thanks.
--
tejun
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-03 7:37 ` Tejun Heo
@ 2008-02-03 14:38 ` James Bottomley
2008-02-03 15:14 ` Tejun Heo
0 siblings, 1 reply; 33+ messages in thread
From: James Bottomley @ 2008-02-03 14:38 UTC (permalink / raw)
To: Tejun Heo; +Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
On Sun, 2008-02-03 at 16:37 +0900, Tejun Heo wrote:
> James Bottomley wrote:
> >> I think the best solution is to update block layer draining such that it
> >> can be included together before the merge window closes. I'll dig into it.
> >
> > Like I said, the block layer pieces are already upstream. All we need
> > is the ATA bits and I think it should all work ... unless there's some
> > part I haven't thought of?
>
> Yeah, I think we'll probably need to add rq->raw_data_len and I'd really
> like to map a longer drain area, but I think the basics are there.
I'm reluctant to add another parameter to the request, but this one you
can calculate: you just do it wherever you work out the size of the
request. If data_len is the true data length and total_data_len is the
data length plus the drain length, the calculation fragment is
if (blk_pc_request(req))
data_len = req->data_len;
else
data_len = req->nr_sectors << 9;
total_data_len = data_len + req->q->dma_drain_size;
If the request has already been mapped by scsi, then data_len is
actually scsi_cmnd->sdb.length
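(Folded into a helper for illustration; blk_rq_total_data_len() is a
hypothetical name, not an existing interface:)

	/* hypothetical helper: true data length plus the drain slack */
	static inline unsigned int blk_rq_total_data_len(struct request *req)
	{
		unsigned int data_len;

		if (blk_pc_request(req))
			data_len = req->data_len;
		else
			data_len = req->nr_sectors << 9;

		return data_len + req->q->dma_drain_size;
	}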
> What's needed is updating libata accordingly and testing it.
Actually, I sent the patch to do this a few days ago:
http://marc.info/?l=linux-ide&m=120189565418258
But I've attached it again.
> I'm currently away from all my toys. I'll implement the ATA part,
> test it and submit the patch on Monday.
Great, will look forward to the results. Like I said, I think turning
on draining in slave configure should be narrowed from all ATAPI devices
to AHCI ATAPI devices only, unless there's evidence that some other
implementation that uses DMA to transfer PIO is similarly broken (I
know the aic94xx works fine, for instance ...)
James
---
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Thu, 10 Jan 2008 11:42:50 -0600
Subject: libata: implement drain buffers
This just updates the libata slave configure routine to take advantage
of the block layer drain buffers.
I suspect I should also be checking for AHCI as well as ATA_DEV_ATAPI,
but I couldn't see how to do that easily.
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
drivers/ata/libata-scsi.c | 26 ++++++++++++++++++++++----
1 files changed, 22 insertions(+), 4 deletions(-)
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 8ffff44..a28351c 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -826,8 +826,8 @@ static void ata_scsi_sdev_config(struct scsi_device *sdev)
sdev->max_device_blocked = 1;
}
-static void ata_scsi_dev_config(struct scsi_device *sdev,
- struct ata_device *dev)
+static int ata_scsi_dev_config(struct scsi_device *sdev,
+ struct ata_device *dev)
{
/* configure max sectors */
blk_queue_max_sectors(sdev->request_queue, dev->max_sectors);
@@ -839,6 +839,16 @@ static void ata_scsi_dev_config(struct scsi_device *sdev,
sdev->manage_start_stop = 1;
}
+ if (dev->class == ATA_DEV_ATAPI) {
+ struct request_queue *q = sdev->request_queue;
+ void *buf = kmalloc(ATAPI_MAX_DRAIN, GFP_KERNEL);
+ if (!buf) {
+ sdev_printk(KERN_ERR, sdev, "drain buffer allocation failed\n");
+ return -ENOMEM;
+ }
+ blk_queue_dma_drain(q, buf, ATAPI_MAX_DRAIN);
+ }
+
if (dev->flags & ATA_DFLAG_AN)
set_bit(SDEV_EVT_MEDIA_CHANGE, sdev->supported_events);
@@ -849,6 +859,8 @@ static void ata_scsi_dev_config(struct scsi_device *sdev,
depth = min(ATA_MAX_QUEUE - 1, depth);
scsi_adjust_queue_depth(sdev, MSG_SIMPLE_TAG, depth);
}
+
+ return 0;
}
/**
@@ -867,13 +879,14 @@ int ata_scsi_slave_config(struct scsi_device *sdev)
{
struct ata_port *ap = ata_shost_to_port(sdev->host);
struct ata_device *dev = __ata_scsi_find_dev(ap, sdev);
+ int rc = 0;
ata_scsi_sdev_config(sdev);
if (dev)
- ata_scsi_dev_config(sdev, dev);
+ rc = ata_scsi_dev_config(sdev, dev);
- return 0;
+ return rc;
}
/**
@@ -895,6 +908,7 @@ void ata_scsi_slave_destroy(struct scsi_device *sdev)
struct ata_port *ap = ata_shost_to_port(sdev->host);
unsigned long flags;
struct ata_device *dev;
+ struct request_queue *q = sdev->request_queue;
if (!ap->ops->error_handler)
return;
@@ -908,6 +922,10 @@ void ata_scsi_slave_destroy(struct scsi_device *sdev)
ata_port_schedule_eh(ap);
}
spin_unlock_irqrestore(ap->lock, flags);
+
+ kfree(q->dma_drain_buffer);
+ q->dma_drain_buffer = NULL;
+ q->dma_drain_size = 0;
}
/**
--
1.5.3.8
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-03 14:38 ` James Bottomley
@ 2008-02-03 15:14 ` Tejun Heo
2008-02-03 16:12 ` James Bottomley
0 siblings, 1 reply; 33+ messages in thread
From: Tejun Heo @ 2008-02-03 15:14 UTC (permalink / raw)
To: James Bottomley
Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
James Bottomley wrote:
> I'm reluctant to add another parameter to the request, but this one you
> can calculate: you just do it wherever you work out the size of the
> request. If data_len is the true data length and total_data_len is the
> data length plus the drain length, the calculation fragment is
>
> if (blk_pc_request(req))
> data_len = req->data_len;
> else
> data_len = req->nr_sectors << 9;
> total_data_len = data_len + req->q->dma_drain_size;
>
> If the request has already been mapped by scsi, then data_len is
> actually scsi_cmnd->sdb.length
We either need to add a field or a helper: rq->data_len should
probably record the size with the drain buffer attached, and we either add
raw_data_len or blk_rq_raw_data_len(). That size is the length of data
in the sg and should be programmed into the controller etc. For ATAPI the
raw size is only used to program the chunk size for odd devices.
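(To make the dual sizing concrete, a minimal sketch using the names
under discussion; none of this is settled API:)

	/* rq->data_len:     length of data in the sg, drain buffer
	 *                   included; this is what gets programmed into
	 *                   the controller
	 * rq->raw_data_len: the size actually requested; for ATAPI it
	 *                   only programs the chunk size for odd devices */
	static inline unsigned int blk_rq_raw_data_len(struct request *rq)
	{
		return rq->raw_data_len;
	}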
>> What's needed is updating libata accordingly and testing it.
>
> Actually, I sent the patch to do this a few days ago:
>
> http://marc.info/?l=linux-ide&m=120189565418258
>
> But I've attached it again.
Thanks a lot.
>> I'm currently away from all my toys. I'll implement the ATA part,
>> test it and submit the patch on Monday.
>
> Great, will look forward to the results. Like I said, I think turning
> on draining in slave configure should be narrowed from all ATAPI devices
> to AHCI ATAPI devices only, unless there's evidence that some other
> implementation that uses DMA to transfer PIO is similarly broken (I
> know the aic94xx works fine, for instance ...)
What do you mean by aic94xx working fine? Does the controller
automatically throw away extra data FISes?
Off to bed now. See you tomorrow.
--
tejun
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-03 15:14 ` Tejun Heo
@ 2008-02-03 16:12 ` James Bottomley
2008-02-03 16:38 ` Jeff Garzik
2008-02-04 1:28 ` Tejun Heo
0 siblings, 2 replies; 33+ messages in thread
From: James Bottomley @ 2008-02-03 16:12 UTC (permalink / raw)
To: Tejun Heo; +Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
On Mon, 2008-02-04 at 00:14 +0900, Tejun Heo wrote:
> James Bottomley wrote:
> > I'm reluctant to add another parameter to the request, but this one you
> > can calculate: you just do it wherever you work out the size of the
> > request. If data_len is the true data length and total_data_len is the
> > data length plus the drain length, the calculation fragment is
> >
> > if (blk_pc_request(req))
> > data_len = req->data_len;
> > else
> > data_len = req->nr_sectors << 9;
> > total_data_len = data_len + req->q->dma_drain_size;
> >
> > If the request has already been mapped by scsi, then data_len is
> > actually scsi_cmnd->sdb.length
>
> We either need to add a field or a helper: rq->data_len should
> probably record the size with the drain buffer attached, and we either add
> raw_data_len or blk_rq_raw_data_len(). That size is the length of data
> in the sg and should be programmed into the controller etc. For ATAPI the
> raw size is only used to program the chunk size for odd devices.
OK, could you show me an example of where you need it and I'll come up
with the macro ... that should also help us decide whether it needs to
be in block or in libata alone. Note that aic94xx only wants the true
size (we effectively treat the drain element as non-existent), and I
anticipate this being true of most conforming implementations. It's
only the problem HBAs that need to know how much slack they have for DMA
overruns.
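(Presumably something along these lines; blk_rq_dma_slack() is a
hypothetical name, just to sketch the macro offered above:)

	/* hypothetical: how much a problem HBA may DMA past the true size */
	#define blk_rq_dma_slack(rq)	((rq)->q->dma_drain_size)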
> >> What's needed is updating libata accordingly and testing it.
> >
> > Actually, I sent the patch to do this a few days ago:
> >
> > http://marc.info/?l=linux-ide&m=120189565418258
> >
> > But I've attached it again.
>
> Thanks a lot.
>
> >> I'm currently away from all my toys. I'll implement the ATA part,
> >> test it and submit the patch on Monday.
> >
> > Great, will look forward to the results. Like I said, I think turning
> > on draining in slave configure should be narrowed from all ATAPI devices
> > to AHCI ATAPI devices only, unless there's evidence that some other
> > implementation that uses DMA to transfer PIO is similarly broken (I
> > know the aic94xx works fine, for instance ...)
>
> What do you mean by aic94xx working fine? Does the controller
> automatically throw away extra data FISes?
The aic94xx sequencer has a very finely honed sense of DMA transfers.
It's fully automated, and handles both ATA DMA and ATA PIO in the
sequencer engine (so all the driver sees is DMA).
It reports both underrun and overrun conditions. For a DMA underrun
(the device transfers less than expected), it just returns what it has and
how much was missing as the residual. For an overrun (as in, the device
tried to take more than it was programmed to send on either read or
write), in PIO it does seem to zero fill or discard and then simply
report task complete with overrun and let libsas sort it out. I suspect
for DMA it first tries DMAT before taking other actions, but I'd need a
protocol analyser (or the sequencer docs) to be sure.
We handle overruns as error conditions in both SAS and ATA at the
moment, but the point is that the ATAPI device is fully happy and
quiesced when we do this.
James
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-03 16:12 ` James Bottomley
@ 2008-02-03 16:38 ` Jeff Garzik
2008-02-03 17:12 ` James Bottomley
2008-02-04 1:28 ` Tejun Heo
1 sibling, 1 reply; 33+ messages in thread
From: Jeff Garzik @ 2008-02-03 16:38 UTC (permalink / raw)
To: James Bottomley
Cc: Tejun Heo, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
James Bottomley wrote:
> The aic94xx sequencer has a very finely honed sense of DMA transfers.
> It's fully automated, and handles both ATA DMA and ATA PIO in the
> sequencer engine (so all the driver sees is DMA).
ditto AHCI, and most other DMA engines
> It reports both underrun and overrun conditions. For a DMA underrun
ditto AHCI, and most other DMA engines
> (the device transfers less than expected), it just returns what it has and
> how much was missing as the residual. For an overrun (as in, the device
> tried to take more than it was programmed to send on either read or
> write), in PIO it does seem to zero fill or discard and then simply
> report task complete with overrun and let libsas sort it out. I suspect
> for DMA it first tries DMAT before taking other actions, but I'd need a
> protocol analyser (or the sequencer docs) to be sure.
Almost every other DMA engine on the planet besides aic94xx is pretty
much the same... you set up an s/g table, and it reports overrun or
underrun via an interrupt + status register bit.
It sounds like aic94xx might do more work in the firmware -- that counts
as "advanced", since some of the DMA engine cleanup clearly occurs in
firmware, rather than pushed to kernel software.
Nowhere do I see anything about AHCI that is "broken." It has standard
DMA engine behavior found in storage and non-storage hardware.
> We handle overruns as error conditions in both SAS and ATA at the
> moment, but the point is that the ATAPI device is fully happy and
> quiesced when we do this.
That may be the result of aic94xx handling extra FIS's in the firmware,
something we cannot depend on for purely silicon-based devices.
mvsas, broadsas, ahci, sata_sil24, and others behave similarly...
Please don't mistake lack of firmware cleanup as "broken hardware."
Jeff
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-03 16:38 ` Jeff Garzik
@ 2008-02-03 17:12 ` James Bottomley
2008-02-04 1:21 ` Tejun Heo
0 siblings, 1 reply; 33+ messages in thread
From: James Bottomley @ 2008-02-03 17:12 UTC (permalink / raw)
To: Jeff Garzik; +Cc: Tejun Heo, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
On Sun, 2008-02-03 at 11:38 -0500, Jeff Garzik wrote:
> James Bottomley wrote:
> > The aic94xx sequencer has a very finely honed sense of DMA transfers.
> > It's fully automated, and handles both ATA DMA and ATA PIO in the
> > sequencer engine (so all the driver sees is DMA).
>
> ditto AHCI, and most other DMA engines
>
>
> > It reports both underrun and overrun conditions. For a DMA underrun
>
> ditto AHCI, and most other DMA engines
>
>
> > (the device transfers less than expected), it just returns what it has and
> > how much was missing as the residual. For an overrun (as in, the device
> > tried to take more than it was programmed to send on either read or
> > write), in PIO it does seem to zero fill or discard and then simply
> > report task complete with overrun and let libsas sort it out. I suspect
> > for DMA it first tries DMAT before taking other actions, but I'd need a
> > protocol analyser (or the sequencer docs) to be sure.
>
> Almost every other DMA engine on the planet besides aic94xx is pretty
> much the same... you set up an s/g table, and it reports overrun or
> underrun via an interrupt + status register bit.
>
> It sounds like aic94xx might do more work in the firmware -- that counts
> as "advanced", since some of the DMA engine cleanup clearly occurs in
> firmware, rather than pushed to kernel software.
>
> Nowhere do I see anything about AHCI that is "broken." It has standard
> DMA engine behavior found in storage and non-storage hardware.
I'm only really going by what Tejun says about AHCI. The problem as I
understand it is data overrun on PIO mode commands. AHCI apparently
(like aic94xx) processes these internally and doesn't actually use the
libata pio handlers, so it just uses an internal buffer to receive the
PIO and DMA it into memory. However, Tejun says (and he'll correct me
if I'm paraphrasing wrongly, since this was on IRC a while ago) that jmb
ahci and sata_sil24 both error out in different (but fairly nasty) ways
if they get extra PIO data that there's no place in the SG list to
deposit. This seems to be why he wants to introduce a DMA drain in
addition to the existing PIO drain. I'd certainly characterise this
behaviour as "broken" ... especially as not all AHCI implementations
apparently have the bug ... some do exactly the right thing on PIO
overruns and don't need the drain element.
> > We handle overruns as error conditions in both SAS and ATA at the
> > moment, but the point is that the ATAPI device is fully happy and
> > quiesced when we do this.
>
> That may be the result of aic94xx handling extra FIS's in the firmware,
> something we cannot depend on for purely silicon-based devices.
>
> mvsas, broadsas, ahci, sata_sil24, and others behave similarly...
> Please don't mistake lack of firmware cleanup as "broken hardware."
OK, well this is probably me coming from the SCSI world. Data overruns
happen (especially if you allow users to control the command
parameters), and just discarding the overrun has been common practice
for decades.
Correct me if I'm wrong, but I think ATAPI7v3 covers the PIO overrun
case in 20.4 and indicates precisely how the HBA should handle the
problem. AHCI also includes the PxIS.OFS bits precisely for this case.
The problem seems to come when using this mechanism hangs the device (as
it apparently does in a limited number of AHCI implementations) and we
just have to take the data in via DMA from the HBA (hence the need for a
drain buffer).
James
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-03 17:12 ` James Bottomley
@ 2008-02-04 1:21 ` Tejun Heo
0 siblings, 0 replies; 33+ messages in thread
From: Tejun Heo @ 2008-02-04 1:21 UTC (permalink / raw)
To: James Bottomley
Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
James Bottomley wrote:
> I'm only really going by what Tejun says about AHCI. The problem as I
> understand it is data overrun on PIO mode commands. AHCI apparently
> (like aic94xx) processes these internally and doesn't actually use the
> libata pio handlers, so it just uses an internal buffer to receive the
> PIO and DMA it into memory. However, Tejun says (and he'll correct me
> if I'm paraphrasing wrongly, since this was on IRC a while ago) that jmb
> ahci and sata_sil24 both error out in different (but fairly nasty) ways
> if they get extra PIO data that there's no place in the SG list to
> deposit. This seems to be why he wants to introduce a DMA drain in
> addition to the existing PIO drain. I'd certainly characterise this
> behaviour as "broken" ... especially as not all AHCI implementations
> apparently have the bug ... some do exactly the right thing on PIO
> overruns and don't need the drain element.
JMB ahci triggers an internal error (or was it a timeout? my memory is a bit
blurry now) on overflow. ICHx ahci and sata_sil24 corrupt data by
offsetting the last FIS containing odd bytes by a byte if the data buffer is
not aligned to 4 bytes, under certain conditions.
[and some explanations on why the aligning and draining stuff]
Also note that ignoring overflow and/or appending the drain buffer
shouldn't be applied to READ/WRITE. Over/underflow should just cause an
HSM violation on RWs and friends. We can do this for each driver, or
rather each controller, by enabling OFS (ahci) or DRD (sil24), but the catch
is that it's pretty darn difficult to verify it actually works. It not
only depends on the specific controller but also on which ATAPI device is
attached and how it behaves depending on chunk size, transfer size in the
CDB and command protocol, including where it splits data FISes.
For example, IIRC, the above offset-by-one condition occurs on ICHx
ahci's (I've tested 7, 8 and 9), under the PIO protocol, when the device
decides to use a separate data FIS for the last three bytes, and I
don't know why it doesn't happen for DMA. It's probably because the
ATAPI devices I have don't split the FIS there, but who knows?
So, I think we're far better off implementing a generic mechanism at
some higher layer. libata core was okay. Block layer is much better.
The overhead is insignificant and the aligning and draining aren't
needed for hot path commands anyway.
Thanks.
--
tejun
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-03 16:12 ` James Bottomley
2008-02-03 16:38 ` Jeff Garzik
@ 2008-02-04 1:28 ` Tejun Heo
2008-02-04 9:25 ` Tejun Heo
1 sibling, 1 reply; 33+ messages in thread
From: Tejun Heo @ 2008-02-04 1:28 UTC (permalink / raw)
To: James Bottomley
Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
James Bottomley wrote:
> On Mon, 2008-02-04 at 00:14 +0900, Tejun Heo wrote:
>> James Bottomley wrote:
>>> I'm reluctant to add another parameter to the request, but this one you
>>> can calculate: you just do it wherever you work out the size of the
>>> request. If data_len is the true data length and total_data_len is the
>>> data length plus the drain length, the calculation fragment is
>>>
>>> if (blk_pc_request(req))
>>> data_len = req->data_len;
>>> else
>>> data_len = req->nr_sectors << 9;
>>> total_data_len = data_len + req->q->dma_drain_size;
>>>
>>> If the request has already been mapped by scsi, then data_len is
>>> actually scsi_cmnd->sdb.length
>> We either need to add a field or a helper: rq->data_len should
>> probably record the size with the drain buffer attached, and we either add
>> raw_data_len or blk_rq_raw_data_len(). That size is the length of data
>> in the sg and should be programmed into the controller etc. For ATAPI the
>> raw size is only used to program the chunk size for odd devices.
>
> OK, could you show me an example of where you need it and I'll come up
> with the macro ... that should also help us decide whether it needs to
> be in block or in libata alone. Note that aic94xx only wants the true
> size (we effectively treat the drain element as non-existent), and I
> anticipate this being true of most conforming implementations. It's
> only the problem HBAs that need to know how much slack they have for DMA
> overruns.
Some ATA controllers, including SFF BMDMA and the libata PIO HSM, need the
number of bytes mapped in the sg table. Yeah, it can be calculated with
a simple macro, but it's also a fundamentally confusing dual-sizing
which should be made as clear as possible. Plus, it can be difficult to
find out when somebody used the wrong thing, so what I'm saying is that
we need to make it easy.
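(The kind of mixup I mean, roughly, using the names proposed above;
dev_xfer_len stands in for whatever a driver programs into the device:)

	/* wrong: tells the device about pad/drain bytes it never asked for */
	dev_xfer_len = rq->data_len;
	/* right: the device-visible length; the sg/DMA programming side
	 * keeps using rq->data_len */
	dev_xfer_len = rq->raw_data_len;

Anyways, please lemme work on it a bit. I'll get back to you guys soon.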
Thanks.
--
tejun
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-04 1:28 ` Tejun Heo
@ 2008-02-04 9:25 ` Tejun Heo
2008-02-04 14:43 ` Tejun Heo
2008-02-04 15:43 ` James Bottomley
0 siblings, 2 replies; 33+ messages in thread
From: Tejun Heo @ 2008-02-04 9:25 UTC (permalink / raw)
To: James Bottomley
Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
Tejun Heo wrote:
> Some ATA controllers, including SFF BMDMA and the libata PIO HSM, need the
> number of bytes mapped in the sg table. Yeah, it can be calculated with
> a simple macro, but it's also a fundamentally confusing dual-sizing
> which should be made as clear as possible. Plus, it can be difficult to
> find out when somebody used the wrong thing, so what I'm saying is that
> we need to make it easy. Anyways, please lemme work on it a bit. I'll
> get back to you guys soon.
Okay, here's the first draft combined patch. Only compile tested (expect
it to be broken), but it should be functionally equivalent to the
ata_sg_setup_extra() based implementation, albeit with a shorter drain
buffer size. Several things to note...
* fsl last sg check isn't included here. Will split it out and post
it separately.
* rq->raw_data_len added. Rationales...
- All this padding and draining is to prevent controllers from
crapping themselves when the data buffer is shorter than they'd like it
to be. Any controller which talks MMC (or SPC for that matter)
should be ready for transfers shorter than the buffer, so feeding an
enlarged buffer size is inherently safer than feeding the length;
hence the primary data length field, rq->data_len, contains the
adjusted length.
- raw_data_len can't be easily deduced from data_len. The other way
is possible but with both aligning and draining and command
filtering, calculating it later is messy.
* Draining configuration is done in sr as it's the driver for MMC. It
can move both ways - either into the SCSI midlayer, as SPC and other
commands do variable length responses too, or into libata, if all
non-ATA controllers are happy without such workarounds. If you ask
me, I'm inclined to move it into the SCSI midlayer, as the added overhead
is insignificant (especially with drain_needed added) and it won't
break anything (well, theoretically, at least).
* Padding via aligning seems a bit too hacky to me. It doesn't even
cover all sg cases. I think we'll need improvements there, but for
the time being, this should do.
I'll test and report in a few hours.
Thanks.
Index: work/block/blk-core.c
===================================================================
--- work.orig/block/blk-core.c
+++ work/block/blk-core.c
@@ -116,6 +116,7 @@ void rq_init(struct request_queue *q, st
rq->ref_count = 1;
rq->q = q;
rq->special = NULL;
+ rq->raw_data_len = 0;
rq->data_len = 0;
rq->data = NULL;
rq->nr_phys_segments = 0;
@@ -1982,6 +1983,7 @@ void blk_rq_bio_prep(struct request_queu
rq->hard_cur_sectors = rq->current_nr_sectors;
rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
rq->buffer = bio_data(bio);
+ rq->raw_data_len = bio->bi_size;
rq->data_len = bio->bi_size;
rq->bio = rq->biotail = bio;
Index: work/block/blk-map.c
===================================================================
--- work.orig/block/blk-map.c
+++ work/block/blk-map.c
@@ -19,6 +19,7 @@ int blk_rq_append_bio(struct request_que
rq->biotail->bi_next = bio;
rq->biotail = bio;
+ rq->raw_data_len += bio->bi_size;
rq->data_len += bio->bi_size;
}
return 0;
@@ -56,8 +57,10 @@ static int __blk_rq_map_user(struct requ
if (!(uaddr & queue_dma_alignment(q)) &&
!(len & queue_dma_alignment(q)))
bio = bio_map_user(q, NULL, uaddr, len, reading);
- else
+ else {
bio = bio_copy_user(q, uaddr, len, reading);
+ rq->data_len = roundup(len, queue_dma_alignment(q) + 1);
+ }
if (IS_ERR(bio))
return PTR_ERR(bio);
Index: work/include/linux/blkdev.h
===================================================================
--- work.orig/include/linux/blkdev.h
+++ work/include/linux/blkdev.h
@@ -214,6 +214,7 @@ struct request {
unsigned int cmd_len;
unsigned char cmd[BLK_MAX_CDB];
+ unsigned int raw_data_len;
unsigned int data_len;
unsigned int sense_len;
void *data;
@@ -256,6 +257,7 @@ struct bio_vec;
typedef int (merge_bvec_fn) (struct request_queue *, struct bio *, struct bio_vec *);
typedef void (prepare_flush_fn) (struct request_queue *, struct request *);
typedef void (softirq_done_fn)(struct request *);
+typedef int (dma_drain_needed_fn)(struct request *);
enum blk_queue_state {
Queue_down,
@@ -292,6 +294,7 @@ struct request_queue
merge_bvec_fn *merge_bvec_fn;
prepare_flush_fn *prepare_flush_fn;
softirq_done_fn *softirq_done_fn;
+ dma_drain_needed_fn *dma_drain_needed;
/*
* Dispatch queue sorting
@@ -696,8 +699,9 @@ extern void blk_queue_max_hw_segments(st
extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
extern void blk_queue_hardsect_size(struct request_queue *, unsigned short);
extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
-extern int blk_queue_dma_drain(struct request_queue *q, void *buf,
- unsigned int size);
+extern int blk_queue_dma_drain(struct request_queue *q,
+ dma_drain_needed_fn *dma_drain_needed,
+ void *buf, unsigned int size);
extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
Index: work/block/blk-merge.c
===================================================================
--- work.orig/block/blk-merge.c
+++ work/block/blk-merge.c
@@ -220,7 +220,7 @@ new_segment:
bvprv = bvec;
} /* segments in rq */
- if (q->dma_drain_size) {
+ if (q->dma_drain_size && q->dma_drain_needed(rq)) {
sg->page_link &= ~0x02;
sg = sg_next(sg);
sg_set_page(sg, virt_to_page(q->dma_drain_buffer),
@@ -228,6 +228,7 @@ new_segment:
((unsigned long)q->dma_drain_buffer) &
(PAGE_SIZE - 1));
nsegs++;
+ rq->data_len += q->dma_drain_size;
}
if (sg)
Index: work/block/bsg.c
===================================================================
--- work.orig/block/bsg.c
+++ work/block/bsg.c
@@ -437,14 +437,14 @@ static int blk_complete_sgv4_hdr_rq(stru
}
if (rq->next_rq) {
- hdr->dout_resid = rq->data_len;
- hdr->din_resid = rq->next_rq->data_len;
+ hdr->dout_resid = rq->raw_data_len;
+ hdr->din_resid = rq->next_rq->raw_data_len;
blk_rq_unmap_user(bidi_bio);
blk_put_request(rq->next_rq);
} else if (rq_data_dir(rq) == READ)
- hdr->din_resid = rq->data_len;
+ hdr->din_resid = rq->raw_data_len;
else
- hdr->dout_resid = rq->data_len;
+ hdr->dout_resid = rq->raw_data_len;
/*
* If the request generated a negative error number, return it
Index: work/block/scsi_ioctl.c
===================================================================
--- work.orig/block/scsi_ioctl.c
+++ work/block/scsi_ioctl.c
@@ -266,7 +266,7 @@ static int blk_complete_sghdr_rq(struct
hdr->info = 0;
if (hdr->masked_status || hdr->host_status || hdr->driver_status)
hdr->info |= SG_INFO_CHECK;
- hdr->resid = rq->data_len;
+ hdr->resid = rq->raw_data_len;
hdr->sb_len_wr = 0;
if (rq->sense_len && hdr->sbp) {
@@ -528,6 +528,7 @@ static int __blk_send_generic(struct req
rq = blk_get_request(q, WRITE, __GFP_WAIT);
rq->cmd_type = REQ_TYPE_BLOCK_PC;
rq->data = NULL;
+ rq->raw_data_len = 0;
rq->data_len = 0;
rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
memset(rq->cmd, 0, sizeof(rq->cmd));
Index: work/block/blk-settings.c
===================================================================
--- work.orig/block/blk-settings.c
+++ work/block/blk-settings.c
@@ -296,6 +296,7 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
* blk_queue_dma_drain - Set up a drain buffer for excess dma.
*
* @q: the request queue for the device
+ * @drain_needed: fn which returns non-zero if drain is needed for the request
* @buf: physically contiguous buffer
* @size: size of the buffer in bytes
*
@@ -315,14 +316,16 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
* device can support otherwise there won't be room for the drain
* buffer.
*/
-int blk_queue_dma_drain(struct request_queue *q, void *buf,
- unsigned int size)
+int blk_queue_dma_drain(struct request_queue *q,
+ dma_drain_needed_fn *dma_drain_needed,
+ void *buf, unsigned int size)
{
if (q->max_hw_segments < 2 || q->max_phys_segments < 2)
return -EINVAL;
/* make room for appending the drain */
--q->max_hw_segments;
--q->max_phys_segments;
+ q->dma_drain_needed = dma_drain_needed;
q->dma_drain_buffer = buf;
q->dma_drain_size = size;
Index: work/drivers/scsi/sr.c
===================================================================
--- work.orig/drivers/scsi/sr.c
+++ work/drivers/scsi/sr.c
@@ -557,11 +557,30 @@ static void sr_release(struct cdrom_devi
}
+static int sr_drain_needed(struct request *rq)
+{
+ if (likely(!blk_pc_request(rq)))
+ return 0;
+
+ switch (rq->cmd[0]) {
+ case GPCMD_READ_10:
+ case GPCMD_READ_12:
+ case GPCMD_WRITE_10:
+ case GPCMD_WRITE_12:
+ case GPCMD_WRITE_AND_VERIFY_10:
+ return 0;
+ }
+
+ return 1;
+}
+
static int sr_probe(struct device *dev)
{
struct scsi_device *sdev = to_scsi_device(dev);
+ struct request_queue *queue = sdev->request_queue;
struct gendisk *disk;
- struct scsi_cd *cd;
+ struct scsi_cd *cd = NULL;
+ void *drain_buf = NULL;
int minor, error;
error = -ENODEV;
@@ -573,11 +592,15 @@ static int sr_probe(struct device *dev)
if (!cd)
goto fail;
+ drain_buf = kmalloc(SR_DRAIN_SIZE, queue->bounce_gfp | GFP_KERNEL);
+ if (!drain_buf)
+ goto fail;
+
kref_init(&cd->kref);
disk = alloc_disk(1);
if (!disk)
- goto fail_free;
+ goto fail;
spin_lock(&sr_index_lock);
minor = find_first_zero_bit(sr_index_bits, SR_DISKS);
@@ -615,13 +638,14 @@ static int sr_probe(struct device *dev)
/* FIXME: need to handle a get_capabilities failure properly ?? */
get_capabilities(cd);
- blk_queue_prep_rq(sdev->request_queue, sr_prep_fn);
+ blk_queue_prep_rq(queue, sr_prep_fn);
+ blk_queue_dma_drain(queue, sr_drain_needed, drain_buf, SR_DRAIN_SIZE);
sr_vendor_init(cd);
disk->driverfs_dev = &sdev->sdev_gendev;
set_capacity(disk, cd->capacity);
disk->private_data = &cd->driver;
- disk->queue = sdev->request_queue;
+ disk->queue = queue;
cd->cdi.disk = disk;
if (register_cdrom(&cd->cdi))
@@ -637,9 +661,9 @@ static int sr_probe(struct device *dev)
fail_put:
put_disk(disk);
-fail_free:
- kfree(cd);
fail:
+ kfree(cd);
+ kfree(drain_buf);
return error;
}
@@ -894,6 +918,12 @@ static void sr_kref_release(struct kref
static int sr_remove(struct device *dev)
{
struct scsi_cd *cd = dev_get_drvdata(dev);
+ struct scsi_device *sdev = to_scsi_device(dev);
+ struct request_queue *queue = sdev->request_queue;
+
+ kfree(queue->dma_drain_buffer);
+ queue->dma_drain_buffer = NULL;
+ queue->dma_drain_size = 0;
del_gendisk(cd->disk);
Index: work/drivers/scsi/sr.h
===================================================================
--- work.orig/drivers/scsi/sr.h
+++ work/drivers/scsi/sr.h
@@ -22,6 +22,7 @@
#define MAX_RETRIES 3
#define SR_TIMEOUT (30 * HZ)
+#define SR_DRAIN_SIZE PAGE_SIZE
struct scsi_device;
Index: work/drivers/ata/libata-core.c
===================================================================
--- work.orig/drivers/ata/libata-core.c
+++ work/drivers/ata/libata-core.c
@@ -4476,30 +4476,13 @@ void ata_sg_clean(struct ata_queued_cmd
struct ata_port *ap = qc->ap;
struct scatterlist *sg = qc->sg;
int dir = qc->dma_dir;
- void *pad_buf = NULL;
WARN_ON(sg == NULL);
- VPRINTK("unmapping %u sg elements\n", qc->mapped_n_elem);
+ VPRINTK("unmapping %u sg elements\n", qc->n_elem);
- /* if we padded the buffer out to 32-bit bound, and data
- * xfer direction is from-device, we must copy from the
- * pad buffer back into the supplied buffer
- */
- if (qc->pad_len && !(qc->tf.flags & ATA_TFLAG_WRITE))
- pad_buf = ap->pad + (qc->tag * ATA_DMA_PAD_SZ);
-
- if (qc->mapped_n_elem)
- dma_unmap_sg(ap->dev, sg, qc->mapped_n_elem, dir);
- /* restore last sg */
- if (qc->last_sg)
- *qc->last_sg = qc->saved_last_sg;
- if (pad_buf) {
- struct scatterlist *psg = &qc->extra_sg[1];
- void *addr = kmap_atomic(sg_page(psg), KM_IRQ0);
- memcpy(addr + psg->offset, pad_buf, qc->pad_len);
- kunmap_atomic(addr, KM_IRQ0);
- }
+ if (qc->n_elem)
+ dma_unmap_sg(ap->dev, sg, qc->n_elem, dir);
qc->flags &= ~ATA_QCFLAG_DMAMAP;
qc->sg = NULL;
@@ -4765,97 +4748,6 @@ void ata_sg_init(struct ata_queued_cmd *
qc->cursg = qc->sg;
}
-static unsigned int ata_sg_setup_extra(struct ata_queued_cmd *qc,
- unsigned int *n_elem_extra,
- unsigned int *nbytes_extra)
-{
- struct ata_port *ap = qc->ap;
- unsigned int n_elem = qc->n_elem;
- struct scatterlist *lsg, *copy_lsg = NULL, *tsg = NULL, *esg = NULL;
-
- *n_elem_extra = 0;
- *nbytes_extra = 0;
-
- /* needs padding? */
- qc->pad_len = qc->nbytes & 3;
-
- if (likely(!qc->pad_len))
- return n_elem;
-
- /* locate last sg and save it */
- lsg = sg_last(qc->sg, n_elem);
- qc->last_sg = lsg;
- qc->saved_last_sg = *lsg;
-
- sg_init_table(qc->extra_sg, ARRAY_SIZE(qc->extra_sg));
-
- if (qc->pad_len) {
- struct scatterlist *psg = &qc->extra_sg[1];
- void *pad_buf = ap->pad + (qc->tag * ATA_DMA_PAD_SZ);
- unsigned int offset;
-
- WARN_ON(qc->dev->class != ATA_DEV_ATAPI);
-
- memset(pad_buf, 0, ATA_DMA_PAD_SZ);
-
- /* psg->page/offset are used to copy to-be-written
- * data in this function or read data in ata_sg_clean.
- */
- offset = lsg->offset + lsg->length - qc->pad_len;
- sg_set_page(psg, nth_page(sg_page(lsg), offset >> PAGE_SHIFT),
- qc->pad_len, offset_in_page(offset));
-
- if (qc->tf.flags & ATA_TFLAG_WRITE) {
- void *addr = kmap_atomic(sg_page(psg), KM_IRQ0);
- memcpy(pad_buf, addr + psg->offset, qc->pad_len);
- kunmap_atomic(addr, KM_IRQ0);
- }
-
- sg_dma_address(psg) = ap->pad_dma + (qc->tag * ATA_DMA_PAD_SZ);
- sg_dma_len(psg) = ATA_DMA_PAD_SZ;
-
- /* Trim the last sg entry and chain the original and
- * padding sg lists.
- *
- * Because chaining consumes one sg entry, one extra
- * sg entry is allocated and the last sg entry is
- * copied to it if the length isn't zero after padded
- * amount is removed.
- *
- * If the last sg entry is completely replaced by
- * padding sg entry, the first sg entry is skipped
- * while chaining.
- */
- lsg->length -= qc->pad_len;
- if (lsg->length) {
- copy_lsg = &qc->extra_sg[0];
- tsg = &qc->extra_sg[0];
- } else {
- n_elem--;
- tsg = &qc->extra_sg[1];
- }
-
- esg = &qc->extra_sg[1];
-
- (*n_elem_extra)++;
- (*nbytes_extra) += 4 - qc->pad_len;
- }
-
- if (copy_lsg)
- sg_set_page(copy_lsg, sg_page(lsg), lsg->length, lsg->offset);
-
- sg_chain(lsg, 1, tsg);
- sg_mark_end(esg);
-
- /* sglist can't start with chaining sg entry, fast forward */
- if (qc->sg == lsg) {
- qc->sg = tsg;
- qc->cursg = tsg;
- }
-
- return n_elem;
-}
-
/**
* ata_sg_setup - DMA-map the scatter-gather table associated with a command.
* @qc: Command with scatter-gather table to be mapped.
@@ -4872,26 +4764,15 @@ static unsigned int ata_sg_setup_extra(s
static int ata_sg_setup(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
- unsigned int n_elem, n_elem_extra, nbytes_extra;
VPRINTK("ENTER, ata%u\n", ap->print_id);
- n_elem = ata_sg_setup_extra(qc, &n_elem_extra, &nbytes_extra);
+ qc->n_elem = dma_map_sg(ap->dev, qc->sg, qc->n_elem, qc->dma_dir);
+ if (qc->n_elem < 1)
+ return -1;
- if (n_elem) {
- n_elem = dma_map_sg(ap->dev, qc->sg, n_elem, qc->dma_dir);
- if (n_elem < 1) {
- /* restore last sg */
- if (qc->last_sg)
- *qc->last_sg = qc->saved_last_sg;
- return -1;
- }
- DPRINTK("%d sg elements mapped\n", n_elem);
- }
+ DPRINTK("%d sg elements mapped\n", n_elem);
- qc->n_elem = qc->mapped_n_elem = n_elem;
- qc->n_elem += n_elem_extra;
- qc->nbytes += nbytes_extra;
qc->flags |= ATA_QCFLAG_DMAMAP;
return 0;
@@ -6566,19 +6447,12 @@ void ata_host_resume(struct ata_host *ho
int ata_port_start(struct ata_port *ap)
{
struct device *dev = ap->dev;
- int rc;
ap->prd = dmam_alloc_coherent(dev, ATA_PRD_TBL_SZ, &ap->prd_dma,
GFP_KERNEL);
if (!ap->prd)
return -ENOMEM;
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
- DPRINTK("prd alloc, virt %p, dma %llx\n", ap->prd,
- (unsigned long long)ap->prd_dma);
return 0;
}
Index: work/drivers/ata/libata-scsi.c
===================================================================
--- work.orig/drivers/ata/libata-scsi.c
+++ work/drivers/ata/libata-scsi.c
@@ -832,18 +832,11 @@ static void ata_scsi_dev_config(struct s
/* configure max sectors */
blk_queue_max_sectors(sdev->request_queue, dev->max_sectors);
- /* SATA DMA transfers must be multiples of 4 byte, so
- * we need to pad ATAPI transfers using an extra sg.
- * Decrement max hw segments accordingly.
- */
- if (dev->class == ATA_DEV_ATAPI) {
- struct request_queue *q = sdev->request_queue;
- blk_queue_max_hw_segments(q, q->max_hw_segments - 1);
-
+ if (dev->class == ATA_DEV_ATAPI)
/* set the min alignment */
blk_queue_update_dma_alignment(sdev->request_queue,
ATA_DMA_PAD_SZ - 1);
- } else
+ else
/* ATA devices must be sector aligned */
blk_queue_update_dma_alignment(sdev->request_queue,
ATA_SECT_SIZE - 1);
@@ -2500,7 +2493,7 @@ static unsigned int atapi_xlat(struct at
* want to set it properly, and for DMA where it is
* effectively meaningless.
*/
- nbytes = min(qc->nbytes, (unsigned int)63 * 1024);
+ nbytes = min(scmd->request->raw_data_len, (unsigned int)63 * 1024);
/* Most ATAPI devices which honor transfer chunk size don't
* behave according to the spec when odd chunk size which
@@ -3564,7 +3557,7 @@ EXPORT_SYMBOL_GPL(ata_sas_port_alloc);
*/
int ata_sas_port_start(struct ata_port *ap)
{
- return ata_pad_alloc(ap, ap->dev);
+ return 0;
}
EXPORT_SYMBOL_GPL(ata_sas_port_start);
@@ -3582,7 +3575,6 @@ EXPORT_SYMBOL_GPL(ata_sas_port_start);
void ata_sas_port_stop(struct ata_port *ap)
{
- ata_pad_free(ap, ap->dev);
}
EXPORT_SYMBOL_GPL(ata_sas_port_stop);
Index: work/drivers/ata/sata_fsl.c
===================================================================
--- work.orig/drivers/ata/sata_fsl.c
+++ work/drivers/ata/sata_fsl.c
@@ -601,21 +601,9 @@ static int sata_fsl_port_start(struct at
if (!pp)
return -ENOMEM;
- /*
- * allocate per command dma alignment pad buffer, which is used
- * internally by libATA to ensure that all transfers ending on
- * unaligned boundaries are padded, to align on Dword boundaries
- */
- retval = ata_pad_alloc(ap, dev);
- if (retval) {
- kfree(pp);
- return retval;
- }
-
mem = dma_alloc_coherent(dev, SATA_FSL_PORT_PRIV_DMA_SZ, &mem_dma,
GFP_KERNEL);
if (!mem) {
- ata_pad_free(ap, dev);
kfree(pp);
return -ENOMEM;
}
@@ -694,7 +682,6 @@ static void sata_fsl_port_stop(struct at
dma_free_coherent(dev, SATA_FSL_PORT_PRIV_DMA_SZ,
pp->cmdslot, pp->cmdslot_paddr);
- ata_pad_free(ap, dev);
kfree(pp);
}
Index: work/drivers/ata/sata_mv.c
===================================================================
--- work.orig/drivers/ata/sata_mv.c
+++ work/drivers/ata/sata_mv.c
@@ -1157,17 +1157,13 @@ static int mv_port_start(struct ata_port
struct mv_port_priv *pp;
void __iomem *port_mmio = mv_ap_base(ap);
unsigned long flags;
- int tag, rc;
+ int tag;
pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
if (!pp)
return -ENOMEM;
ap->private_data = pp;
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
pp->crqb = dma_pool_alloc(hpriv->crqb_pool, GFP_KERNEL, &pp->crqb_dma);
if (!pp->crqb)
return -ENOMEM;
Index: work/drivers/ata/sata_sil24.c
===================================================================
--- work.orig/drivers/ata/sata_sil24.c
+++ work/drivers/ata/sata_sil24.c
@@ -1234,7 +1234,6 @@ static int sil24_port_start(struct ata_p
union sil24_cmd_block *cb;
size_t cb_size = sizeof(*cb) * SIL24_MAX_CMDS;
dma_addr_t cb_dma;
- int rc;
pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
if (!pp)
@@ -1247,10 +1246,6 @@ static int sil24_port_start(struct ata_p
return -ENOMEM;
memset(cb, 0, cb_size);
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
pp->cmd_block = cb;
pp->cmd_block_dma = cb_dma;
Index: work/include/linux/libata.h
===================================================================
--- work.orig/include/linux/libata.h
+++ work/include/linux/libata.h
@@ -278,7 +278,6 @@ enum {
/* size of buffer to pad xfers ending on unaligned boundaries */
ATA_DMA_PAD_SZ = 4,
- ATA_DMA_PAD_BUF_SZ = ATA_DMA_PAD_SZ * ATA_MAX_QUEUE,
/* ering size */
ATA_ERING_SIZE = 32,
@@ -458,11 +457,9 @@ struct ata_queued_cmd {
unsigned int tag;
unsigned int n_elem;
unsigned int n_iter;
- unsigned int mapped_n_elem;
int dma_dir;
- unsigned int pad_len;
unsigned int sect_size;
unsigned int nbytes;
@@ -473,9 +470,7 @@ struct ata_queued_cmd {
unsigned int cursg_ofs;
struct scatterlist *last_sg;
- struct scatterlist saved_last_sg;
struct scatterlist sgent;
- struct scatterlist extra_sg[2];
struct scatterlist *sg;
@@ -620,9 +615,6 @@ struct ata_port {
struct ata_prd *prd; /* our SG list */
dma_addr_t prd_dma; /* and its DMA mapping */
- void *pad; /* array of DMA pad buffers */
- dma_addr_t pad_dma;
-
struct ata_ioports ioaddr; /* ATA cmd/ctl/dma register blocks */
u8 ctl; /* cache of ATA control register */
@@ -1366,10 +1358,8 @@ static inline void ata_qc_reinit(struct
qc->cursg_ofs = 0;
qc->nbytes = qc->raw_nbytes = qc->curbytes = 0;
qc->n_elem = 0;
- qc->mapped_n_elem = 0;
qc->n_iter = 0;
qc->err_mask = 0;
- qc->pad_len = 0;
qc->last_sg = NULL;
qc->sect_size = ATA_SECT_SIZE;
@@ -1425,19 +1415,6 @@ static inline unsigned int __ac_err_mask
return mask;
}
-static inline int ata_pad_alloc(struct ata_port *ap, struct device *dev)
-{
- ap->pad_dma = 0;
- ap->pad = dmam_alloc_coherent(dev, ATA_DMA_PAD_BUF_SZ,
- &ap->pad_dma, GFP_KERNEL);
- return (ap->pad == NULL) ? -ENOMEM : 0;
-}
-
-static inline void ata_pad_free(struct ata_port *ap, struct device *dev)
-{
- dmam_free_coherent(dev, ATA_DMA_PAD_BUF_SZ, ap->pad, ap->pad_dma);
-}
-
static inline struct ata_port *ata_shost_to_port(struct Scsi_Host *host)
{
return *(struct ata_port **)&host->hostdata[0];
Index: work/drivers/scsi/libsas/sas_ata.c
===================================================================
--- work.orig/drivers/scsi/libsas/sas_ata.c
+++ work/drivers/scsi/libsas/sas_ata.c
@@ -178,8 +178,8 @@ static unsigned int sas_ata_qc_issue(str
task->uldd_task = qc;
if (ata_is_atapi(qc->tf.protocol)) {
memcpy(task->ata_task.atapi_packet, qc->cdb, qc->dev->cdb_len);
- task->total_xfer_len = qc->nbytes + qc->pad_len;
- task->num_scatter = qc->pad_len ? qc->n_elem + 1 : qc->n_elem;
+ task->total_xfer_len = qc->nbytes;
+ task->num_scatter = qc->n_elem;
} else {
for_each_sg(qc->sg, sg, qc->n_elem, si)
xfer += sg->length;
Index: work/drivers/scsi/ipr.c
===================================================================
--- work.orig/drivers/scsi/ipr.c
+++ work/drivers/scsi/ipr.c
@@ -5140,7 +5140,7 @@ static void ipr_build_ata_ioadl(struct i
struct ipr_ioarcb *ioarcb = &ipr_cmd->ioarcb;
struct ipr_ioadl_desc *ioadl = ipr_cmd->ioadl;
struct ipr_ioadl_desc *last_ioadl = NULL;
- int len = qc->nbytes + qc->pad_len;
+ int len = qc->nbytes;
struct scatterlist *sg;
unsigned int si;
@@ -5206,7 +5206,7 @@ static unsigned int ipr_qc_issue(struct
ioarcb->cmd_pkt.request_type = IPR_RQTYPE_ATA_PASSTHRU;
ioarcb->cmd_pkt.flags_hi |= IPR_FLAGS_HI_NO_LINK_DESC;
ioarcb->cmd_pkt.flags_hi |= IPR_FLAGS_HI_NO_ULEN_CHK;
- ipr_cmd->dma_use_sg = qc->pad_len ? qc->n_elem + 1 : qc->n_elem;
+ ipr_cmd->dma_use_sg = qc->n_elem;
ipr_build_ata_ioadl(ipr_cmd, qc);
regs->flags |= IPR_ATA_FLAG_STATUS_ON_GOOD_COMPLETION;
Index: work/drivers/ata/ahci.c
===================================================================
--- work.orig/drivers/ata/ahci.c
+++ work/drivers/ata/ahci.c
@@ -1979,16 +1979,11 @@ static int ahci_port_start(struct ata_po
struct ahci_port_priv *pp;
void *mem;
dma_addr_t mem_dma;
- int rc;
pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
if (!pp)
return -ENOMEM;
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
mem = dmam_alloc_coherent(dev, AHCI_PORT_PRIV_DMA_SZ, &mem_dma,
GFP_KERNEL);
if (!mem)
Index: work/drivers/ata/pata_icside.c
===================================================================
--- work.orig/drivers/ata/pata_icside.c
+++ work/drivers/ata/pata_icside.c
@@ -304,12 +304,6 @@ static int icside_dma_init(struct pata_i
}
-static int pata_icside_port_start(struct ata_port *ap)
-{
- /* No PRD to alloc */
- return ata_pad_alloc(ap, ap->dev);
-}
-
static struct scsi_host_template pata_icside_sht = {
.module = THIS_MODULE,
.name = DRV_NAME,
@@ -389,8 +383,6 @@ static struct ata_port_operations pata_i
.irq_clear = ata_dummy_noret,
.irq_on = ata_irq_on,
- .port_start = pata_icside_port_start,
-
.bmdma_stop = pata_icside_bmdma_stop,
.bmdma_status = pata_icside_bmdma_status,
};
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-04 9:25 ` Tejun Heo
@ 2008-02-04 14:43 ` Tejun Heo
2008-02-04 16:23 ` James Bottomley
2008-02-04 15:43 ` James Bottomley
1 sibling, 1 reply; 33+ messages in thread
From: Tejun Heo @ 2008-02-04 14:43 UTC (permalink / raw)
To: James Bottomley
Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
And, here's the working version. I'll split and post them tomorrow.
Thanks.
Index: work/block/blk-core.c
===================================================================
--- work.orig/block/blk-core.c
+++ work/block/blk-core.c
@@ -116,6 +116,7 @@ void rq_init(struct request_queue *q, st
rq->ref_count = 1;
rq->q = q;
rq->special = NULL;
+ rq->raw_data_len = 0;
rq->data_len = 0;
rq->data = NULL;
rq->nr_phys_segments = 0;
@@ -1982,6 +1983,7 @@ void blk_rq_bio_prep(struct request_queu
rq->hard_cur_sectors = rq->current_nr_sectors;
rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
rq->buffer = bio_data(bio);
+ rq->raw_data_len = bio->bi_size;
rq->data_len = bio->bi_size;
rq->bio = rq->biotail = bio;
Index: work/block/blk-map.c
===================================================================
--- work.orig/block/blk-map.c
+++ work/block/blk-map.c
@@ -19,6 +19,7 @@ int blk_rq_append_bio(struct request_que
rq->biotail->bi_next = bio;
rq->biotail = bio;
+ rq->raw_data_len += bio->bi_size;
rq->data_len += bio->bi_size;
}
return 0;
@@ -139,6 +140,25 @@ int blk_rq_map_user(struct request_queue
ubuf += ret;
}
+ /*
+ * __blk_rq_map_user() copies the buffers if starting address
+ * or length aren't aligned. As the copied buffer is always
+ * page aligned, we know for a fact that there's enough room
+ * for padding. Extend the last bio and update rq->data_len
+ * accordingly.
+ *
+ * On unmap, bio_uncopy_user() will use unmodified
+ * bio_map_data pointed to by bio->bi_private.
+ */
+ if (len & queue_dma_alignment(q)) {
+ unsigned int pad_len = (queue_dma_alignment(q) & ~len) + 1;
+ struct bio *bio = rq->biotail;
+
+ bio->bi_io_vec[bio->bi_vcnt - 1].bv_len += pad_len;
+ bio->bi_size += pad_len;
+ rq->data_len += pad_len;
+ }
+
rq->buffer = rq->data = NULL;
return 0;
unmap_rq:
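For reference, the pad arithmetic above is easy to sanity-check in
isolation. A minimal userspace sketch, assuming a 4-byte DMA alignment
so that queue_dma_alignment(q) yields the mask 3 (stand-in names, not
kernel code):

#include <stdio.h>

int main(void)
{
	unsigned int mask = 3;	/* stand-in for queue_dma_alignment(q) */
	unsigned int len;

	for (len = 1; len <= 8; len++) {
		if (len & mask) {	/* same test as the patch */
			unsigned int pad_len = (mask & ~len) + 1;

			/* each padded length lands on the next boundary:
			 * 1->4, 2->4, 3->4, 5->8, 6->8, 7->8 */
			printf("len=%u pad_len=%u padded=%u\n",
			       len, pad_len, len + pad_len);
		}
	}
	return 0;
}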
Index: work/include/linux/blkdev.h
===================================================================
--- work.orig/include/linux/blkdev.h
+++ work/include/linux/blkdev.h
@@ -214,6 +214,7 @@ struct request {
unsigned int cmd_len;
unsigned char cmd[BLK_MAX_CDB];
+ unsigned int raw_data_len;
unsigned int data_len;
unsigned int sense_len;
void *data;
@@ -256,6 +257,7 @@ struct bio_vec;
typedef int (merge_bvec_fn) (struct request_queue *, struct bio *, struct bio_vec *);
typedef void (prepare_flush_fn) (struct request_queue *, struct request *);
typedef void (softirq_done_fn)(struct request *);
+typedef int (dma_drain_needed_fn)(struct request *);
enum blk_queue_state {
Queue_down,
@@ -292,6 +294,7 @@ struct request_queue
merge_bvec_fn *merge_bvec_fn;
prepare_flush_fn *prepare_flush_fn;
softirq_done_fn *softirq_done_fn;
+ dma_drain_needed_fn *dma_drain_needed;
/*
* Dispatch queue sorting
@@ -696,8 +699,9 @@ extern void blk_queue_max_hw_segments(st
extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
extern void blk_queue_hardsect_size(struct request_queue *, unsigned short);
extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
-extern int blk_queue_dma_drain(struct request_queue *q, void *buf,
- unsigned int size);
+extern int blk_queue_dma_drain(struct request_queue *q,
+ dma_drain_needed_fn *dma_drain_needed,
+ void *buf, unsigned int size);
extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
Index: work/block/blk-merge.c
===================================================================
--- work.orig/block/blk-merge.c
+++ work/block/blk-merge.c
@@ -220,7 +220,7 @@ new_segment:
bvprv = bvec;
} /* segments in rq */
- if (q->dma_drain_size) {
+ if (q->dma_drain_size && q->dma_drain_needed(rq)) {
sg->page_link &= ~0x02;
sg = sg_next(sg);
sg_set_page(sg, virt_to_page(q->dma_drain_buffer),
@@ -228,6 +228,7 @@ new_segment:
((unsigned long)q->dma_drain_buffer) &
(PAGE_SIZE - 1));
nsegs++;
+ rq->data_len += q->dma_drain_size;
}
if (sg)
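To spell out the contract of the new hook: blk_rq_map_sg() appends the
queue's drain page as one extra sg element, and bumps rq->data_len by
the drain size, only when the registered callback returns non-zero for
the request. A minimal sketch of such a callback (hypothetical driver
code, not part of this patch):

/* drain every BLOCK_PC command issued on this queue */
static int example_drain_needed(struct request *rq)
{
	return blk_pc_request(rq);
}

It would be registered at queue setup time with
blk_queue_dma_drain(q, example_drain_needed, buf, size), exactly as
sr_probe() does below with sr_drain_needed().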
Index: work/block/bsg.c
===================================================================
--- work.orig/block/bsg.c
+++ work/block/bsg.c
@@ -437,14 +437,14 @@ static int blk_complete_sgv4_hdr_rq(stru
}
if (rq->next_rq) {
- hdr->dout_resid = rq->data_len;
- hdr->din_resid = rq->next_rq->data_len;
+ hdr->dout_resid = rq->raw_data_len;
+ hdr->din_resid = rq->next_rq->raw_data_len;
blk_rq_unmap_user(bidi_bio);
blk_put_request(rq->next_rq);
} else if (rq_data_dir(rq) == READ)
- hdr->din_resid = rq->data_len;
+ hdr->din_resid = rq->raw_data_len;
else
- hdr->dout_resid = rq->data_len;
+ hdr->dout_resid = rq->raw_data_len;
/*
* If the request generated a negative error number, return it
Index: work/block/scsi_ioctl.c
===================================================================
--- work.orig/block/scsi_ioctl.c
+++ work/block/scsi_ioctl.c
@@ -266,7 +266,7 @@ static int blk_complete_sghdr_rq(struct
hdr->info = 0;
if (hdr->masked_status || hdr->host_status || hdr->driver_status)
hdr->info |= SG_INFO_CHECK;
- hdr->resid = rq->data_len;
+ hdr->resid = rq->raw_data_len;
hdr->sb_len_wr = 0;
if (rq->sense_len && hdr->sbp) {
@@ -528,6 +528,7 @@ static int __blk_send_generic(struct req
rq = blk_get_request(q, WRITE, __GFP_WAIT);
rq->cmd_type = REQ_TYPE_BLOCK_PC;
rq->data = NULL;
+ rq->raw_data_len = 0;
rq->data_len = 0;
rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
memset(rq->cmd, 0, sizeof(rq->cmd));
Index: work/drivers/scsi/scsi_lib.c
===================================================================
--- work.orig/drivers/scsi/scsi_lib.c
+++ work/drivers/scsi/scsi_lib.c
@@ -1015,10 +1015,6 @@ static int scsi_init_sgtable(struct requ
}
req->buffer = NULL;
- if (blk_pc_request(req))
- sdb->length = req->data_len;
- else
- sdb->length = req->nr_sectors << 9;
/*
* Next, walk the list, and fill in the addresses and sizes of
@@ -1027,6 +1023,10 @@ static int scsi_init_sgtable(struct requ
count = blk_rq_map_sg(req->q, req, sdb->table.sgl);
BUG_ON(count > sdb->table.nents);
sdb->table.nents = count;
+ if (blk_pc_request(req))
+ sdb->length = req->data_len;
+ else
+ sdb->length = req->nr_sectors << 9;
return BLKPREP_OK;
}
Index: work/block/blk-settings.c
===================================================================
--- work.orig/block/blk-settings.c
+++ work/block/blk-settings.c
@@ -296,6 +296,7 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
* blk_queue_dma_drain - Set up a drain buffer for excess dma.
*
* @q: the request queue for the device
+ * @dma_drain_needed: fn which returns non-zero if drain is necessary
* @buf: physically contiguous buffer
* @size: size of the buffer in bytes
*
@@ -315,14 +316,16 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
* device can support otherwise there won't be room for the drain
* buffer.
*/
-int blk_queue_dma_drain(struct request_queue *q, void *buf,
- unsigned int size)
+int blk_queue_dma_drain(struct request_queue *q,
+ dma_drain_needed_fn *dma_drain_needed,
+ void *buf, unsigned int size)
{
if (q->max_hw_segments < 2 || q->max_phys_segments < 2)
return -EINVAL;
/* make room for appending the drain */
--q->max_hw_segments;
--q->max_phys_segments;
+ q->dma_drain_needed = dma_drain_needed;
q->dma_drain_buffer = buf;
q->dma_drain_size = size;
Index: work/drivers/scsi/sr.c
===================================================================
--- work.orig/drivers/scsi/sr.c
+++ work/drivers/scsi/sr.c
@@ -557,11 +557,30 @@ static void sr_release(struct cdrom_devi
}
+static int sr_drain_needed(struct request *rq)
+{
+ if (likely(!blk_pc_request(rq)))
+ return 0;
+
+ switch (rq->cmd[0]) {
+ case GPCMD_READ_10:
+ case GPCMD_READ_12:
+ case GPCMD_WRITE_10:
+ case GPCMD_WRITE_12:
+ case GPCMD_WRITE_AND_VERIFY_10:
+ return 0;
+ }
+
+ return 1;
+}
+
static int sr_probe(struct device *dev)
{
struct scsi_device *sdev = to_scsi_device(dev);
+ struct request_queue *queue = sdev->request_queue;
struct gendisk *disk;
- struct scsi_cd *cd;
+ struct scsi_cd *cd = NULL;
+ void *drain_buf = NULL;
int minor, error;
error = -ENODEV;
@@ -573,11 +592,15 @@ static int sr_probe(struct device *dev)
if (!cd)
goto fail;
+ drain_buf = kmalloc(SR_DRAIN_SIZE, queue->bounce_gfp | GFP_KERNEL);
+ if (!drain_buf)
+ goto fail;
+
kref_init(&cd->kref);
disk = alloc_disk(1);
if (!disk)
- goto fail_free;
+ goto fail;
spin_lock(&sr_index_lock);
minor = find_first_zero_bit(sr_index_bits, SR_DISKS);
@@ -615,13 +638,14 @@ static int sr_probe(struct device *dev)
/* FIXME: need to handle a get_capabilities failure properly ?? */
get_capabilities(cd);
- blk_queue_prep_rq(sdev->request_queue, sr_prep_fn);
+ blk_queue_prep_rq(queue, sr_prep_fn);
+ blk_queue_dma_drain(queue, sr_drain_needed, drain_buf, SR_DRAIN_SIZE);
sr_vendor_init(cd);
disk->driverfs_dev = &sdev->sdev_gendev;
set_capacity(disk, cd->capacity);
disk->private_data = &cd->driver;
- disk->queue = sdev->request_queue;
+ disk->queue = queue;
cd->cdi.disk = disk;
if (register_cdrom(&cd->cdi))
@@ -637,9 +661,9 @@ static int sr_probe(struct device *dev)
fail_put:
put_disk(disk);
-fail_free:
- kfree(cd);
fail:
+ kfree(cd);
+ kfree(drain_buf);
return error;
}
@@ -894,6 +918,12 @@ static void sr_kref_release(struct kref
static int sr_remove(struct device *dev)
{
struct scsi_cd *cd = dev_get_drvdata(dev);
+ struct scsi_device *sdev = to_scsi_device(dev);
+ struct request_queue *queue = sdev->request_queue;
+
+ kfree(queue->dma_drain_buffer);
+ queue->dma_drain_buffer = NULL;
+ queue->dma_drain_size = 0;
del_gendisk(cd->disk);
Index: work/drivers/scsi/sr.h
===================================================================
--- work.orig/drivers/scsi/sr.h
+++ work/drivers/scsi/sr.h
@@ -22,6 +22,7 @@
#define MAX_RETRIES 3
#define SR_TIMEOUT (30 * HZ)
+#define SR_DRAIN_SIZE PAGE_SIZE
struct scsi_device;
Index: work/drivers/ata/libata-core.c
===================================================================
--- work.orig/drivers/ata/libata-core.c
+++ work/drivers/ata/libata-core.c
@@ -4476,30 +4476,13 @@ void ata_sg_clean(struct ata_queued_cmd
struct ata_port *ap = qc->ap;
struct scatterlist *sg = qc->sg;
int dir = qc->dma_dir;
- void *pad_buf = NULL;
WARN_ON(sg == NULL);
- VPRINTK("unmapping %u sg elements\n", qc->mapped_n_elem);
+ VPRINTK("unmapping %u sg elements\n", qc->n_elem);
- /* if we padded the buffer out to 32-bit bound, and data
- * xfer direction is from-device, we must copy from the
- * pad buffer back into the supplied buffer
- */
- if (qc->pad_len && !(qc->tf.flags & ATA_TFLAG_WRITE))
- pad_buf = ap->pad + (qc->tag * ATA_DMA_PAD_SZ);
-
- if (qc->mapped_n_elem)
- dma_unmap_sg(ap->dev, sg, qc->mapped_n_elem, dir);
- /* restore last sg */
- if (qc->last_sg)
- *qc->last_sg = qc->saved_last_sg;
- if (pad_buf) {
- struct scatterlist *psg = &qc->extra_sg[1];
- void *addr = kmap_atomic(sg_page(psg), KM_IRQ0);
- memcpy(addr + psg->offset, pad_buf, qc->pad_len);
- kunmap_atomic(addr, KM_IRQ0);
- }
+ if (qc->n_elem)
+ dma_unmap_sg(ap->dev, sg, qc->n_elem, dir);
qc->flags &= ~ATA_QCFLAG_DMAMAP;
qc->sg = NULL;
@@ -4765,97 +4748,6 @@ void ata_sg_init(struct ata_queued_cmd *
qc->cursg = qc->sg;
}
-static unsigned int ata_sg_setup_extra(struct ata_queued_cmd *qc,
- unsigned int *n_elem_extra,
- unsigned int *nbytes_extra)
-{
- struct ata_port *ap = qc->ap;
- unsigned int n_elem = qc->n_elem;
- struct scatterlist *lsg, *copy_lsg = NULL, *tsg = NULL, *esg = NULL;
-
- *n_elem_extra = 0;
- *nbytes_extra = 0;
-
- /* needs padding? */
- qc->pad_len = qc->nbytes & 3;
-
- if (likely(!qc->pad_len))
- return n_elem;
-
- /* locate last sg and save it */
- lsg = sg_last(qc->sg, n_elem);
- qc->last_sg = lsg;
- qc->saved_last_sg = *lsg;
-
- sg_init_table(qc->extra_sg, ARRAY_SIZE(qc->extra_sg));
-
- if (qc->pad_len) {
- struct scatterlist *psg = &qc->extra_sg[1];
- void *pad_buf = ap->pad + (qc->tag * ATA_DMA_PAD_SZ);
- unsigned int offset;
-
- WARN_ON(qc->dev->class != ATA_DEV_ATAPI);
-
- memset(pad_buf, 0, ATA_DMA_PAD_SZ);
-
- /* psg->page/offset are used to copy to-be-written
- * data in this function or read data in ata_sg_clean.
- */
- offset = lsg->offset + lsg->length - qc->pad_len;
- sg_set_page(psg, nth_page(sg_page(lsg), offset >> PAGE_SHIFT),
- qc->pad_len, offset_in_page(offset));
-
- if (qc->tf.flags & ATA_TFLAG_WRITE) {
- void *addr = kmap_atomic(sg_page(psg), KM_IRQ0);
- memcpy(pad_buf, addr + psg->offset, qc->pad_len);
- kunmap_atomic(addr, KM_IRQ0);
- }
-
- sg_dma_address(psg) = ap->pad_dma + (qc->tag * ATA_DMA_PAD_SZ);
- sg_dma_len(psg) = ATA_DMA_PAD_SZ;
-
- /* Trim the last sg entry and chain the original and
- * padding sg lists.
- *
- * Because chaining consumes one sg entry, one extra
- * sg entry is allocated and the last sg entry is
- * copied to it if the length isn't zero after padded
- * amount is removed.
- *
- * If the last sg entry is completely replaced by
- * padding sg entry, the first sg entry is skipped
- * while chaining.
- */
- lsg->length -= qc->pad_len;
- if (lsg->length) {
- copy_lsg = &qc->extra_sg[0];
- tsg = &qc->extra_sg[0];
- } else {
- n_elem--;
- tsg = &qc->extra_sg[1];
- }
-
- esg = &qc->extra_sg[1];
-
- (*n_elem_extra)++;
- (*nbytes_extra) += 4 - qc->pad_len;
- }
-
- if (copy_lsg)
- sg_set_page(copy_lsg, sg_page(lsg), lsg->length, lsg->offset);
-
- sg_chain(lsg, 1, tsg);
- sg_mark_end(esg);
-
- /* sglist can't start with chaining sg entry, fast forward */
- if (qc->sg == lsg) {
- qc->sg = tsg;
- qc->cursg = tsg;
- }
-
- return n_elem;
-}
-
/**
* ata_sg_setup - DMA-map the scatter-gather table associated with a command.
* @qc: Command with scatter-gather table to be mapped.
@@ -4872,26 +4764,27 @@ static unsigned int ata_sg_setup_extra(s
static int ata_sg_setup(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
- unsigned int n_elem, n_elem_extra, nbytes_extra;
VPRINTK("ENTER, ata%u\n", ap->print_id);
- n_elem = ata_sg_setup_extra(qc, &n_elem_extra, &nbytes_extra);
+ if (ata_is_atapi(qc->tf.protocol)) {
+ struct scatterlist *sg;
+ int i;
- if (n_elem) {
- n_elem = dma_map_sg(ap->dev, qc->sg, n_elem, qc->dma_dir);
- if (n_elem < 1) {
- /* restore last sg */
- if (qc->last_sg)
- *qc->last_sg = qc->saved_last_sg;
- return -1;
- }
- DPRINTK("%d sg elements mapped\n", n_elem);
+ ata_dev_printk(qc->dev, KERN_INFO, "XXX cmd=%02x n_elem=%u nbytes=%u dma_dir=%d\n",
+ qc->cdb[0], qc->n_elem, qc->nbytes, qc->dma_dir);
+
+ for_each_sg(qc->sg, sg, qc->n_elem, i)
+ ata_dev_printk(qc->dev, KERN_INFO, "YYY pfn=%lu offset=%u length=%u\n",
+ page_to_pfn(sg_page(sg)), sg->offset, sg->length);
}
- qc->n_elem = qc->mapped_n_elem = n_elem;
- qc->n_elem += n_elem_extra;
- qc->nbytes += nbytes_extra;
+ qc->n_elem = dma_map_sg(ap->dev, qc->sg, qc->n_elem, qc->dma_dir);
+ if (qc->n_elem < 1)
+ return -1;
+
+ DPRINTK("%d sg elements mapped\n", n_elem);
+
qc->flags |= ATA_QCFLAG_DMAMAP;
return 0;
@@ -5955,9 +5848,6 @@ void ata_qc_issue(struct ata_queued_cmd
*/
BUG_ON(ata_is_data(prot) && (!qc->sg || !qc->n_elem || !qc->nbytes));
- /* ata_sg_setup() may update nbytes */
- qc->raw_nbytes = qc->nbytes;
-
if (ata_is_dma(prot) || (ata_is_pio(prot) &&
(ap->flags & ATA_FLAG_PIO_DMA)))
if (ata_sg_setup(qc))
@@ -6566,19 +6456,12 @@ void ata_host_resume(struct ata_host *ho
int ata_port_start(struct ata_port *ap)
{
struct device *dev = ap->dev;
- int rc;
ap->prd = dmam_alloc_coherent(dev, ATA_PRD_TBL_SZ, &ap->prd_dma,
GFP_KERNEL);
if (!ap->prd)
return -ENOMEM;
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
- DPRINTK("prd alloc, virt %p, dma %llx\n", ap->prd,
- (unsigned long long)ap->prd_dma);
return 0;
}
Index: work/drivers/ata/libata-scsi.c
===================================================================
--- work.orig/drivers/ata/libata-scsi.c
+++ work/drivers/ata/libata-scsi.c
@@ -832,18 +832,11 @@ static void ata_scsi_dev_config(struct s
/* configure max sectors */
blk_queue_max_sectors(sdev->request_queue, dev->max_sectors);
- /* SATA DMA transfers must be multiples of 4 byte, so
- * we need to pad ATAPI transfers using an extra sg.
- * Decrement max hw segments accordingly.
- */
- if (dev->class == ATA_DEV_ATAPI) {
- struct request_queue *q = sdev->request_queue;
- blk_queue_max_hw_segments(q, q->max_hw_segments - 1);
-
+ if (dev->class == ATA_DEV_ATAPI)
/* set the min alignment */
blk_queue_update_dma_alignment(sdev->request_queue,
ATA_DMA_PAD_SZ - 1);
- } else
+ else
/* ATA devices must be sector aligned */
blk_queue_update_dma_alignment(sdev->request_queue,
ATA_SECT_SIZE - 1);
@@ -2500,7 +2493,9 @@ static unsigned int atapi_xlat(struct at
* want to set it properly, and for DMA where it is
* effectively meaningless.
*/
- nbytes = min(qc->nbytes, (unsigned int)63 * 1024);
+ nbytes = min(scmd->request->raw_data_len, (unsigned int)63 * 1024);
+ ata_dev_printk(qc->dev, KERN_INFO, "XXX raw_data_len=%u\n",
+ scmd->request->raw_data_len);
/* Most ATAPI devices which honor transfer chunk size don't
* behave according to the spec when odd chunk size which
@@ -3564,7 +3559,7 @@ EXPORT_SYMBOL_GPL(ata_sas_port_alloc);
*/
int ata_sas_port_start(struct ata_port *ap)
{
- return ata_pad_alloc(ap, ap->dev);
+ return 0;
}
EXPORT_SYMBOL_GPL(ata_sas_port_start);
@@ -3582,7 +3577,6 @@ EXPORT_SYMBOL_GPL(ata_sas_port_start);
void ata_sas_port_stop(struct ata_port *ap)
{
- ata_pad_free(ap, ap->dev);
}
EXPORT_SYMBOL_GPL(ata_sas_port_stop);
Index: work/drivers/ata/sata_fsl.c
===================================================================
--- work.orig/drivers/ata/sata_fsl.c
+++ work/drivers/ata/sata_fsl.c
@@ -601,21 +601,9 @@ static int sata_fsl_port_start(struct at
if (!pp)
return -ENOMEM;
- /*
- * allocate per command dma alignment pad buffer, which is used
- * internally by libATA to ensure that all transfers ending on
- * unaligned boundaries are padded, to align on Dword boundaries
- */
- retval = ata_pad_alloc(ap, dev);
- if (retval) {
- kfree(pp);
- return retval;
- }
-
mem = dma_alloc_coherent(dev, SATA_FSL_PORT_PRIV_DMA_SZ, &mem_dma,
GFP_KERNEL);
if (!mem) {
- ata_pad_free(ap, dev);
kfree(pp);
return -ENOMEM;
}
@@ -694,7 +682,6 @@ static void sata_fsl_port_stop(struct at
dma_free_coherent(dev, SATA_FSL_PORT_PRIV_DMA_SZ,
pp->cmdslot, pp->cmdslot_paddr);
- ata_pad_free(ap, dev);
kfree(pp);
}
Index: work/drivers/ata/sata_mv.c
===================================================================
--- work.orig/drivers/ata/sata_mv.c
+++ work/drivers/ata/sata_mv.c
@@ -1157,17 +1157,13 @@ static int mv_port_start(struct ata_port
struct mv_port_priv *pp;
void __iomem *port_mmio = mv_ap_base(ap);
unsigned long flags;
- int tag, rc;
+ int tag;
pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
if (!pp)
return -ENOMEM;
ap->private_data = pp;
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
pp->crqb = dma_pool_alloc(hpriv->crqb_pool, GFP_KERNEL, &pp->crqb_dma);
if (!pp->crqb)
return -ENOMEM;
Index: work/drivers/ata/sata_sil24.c
===================================================================
--- work.orig/drivers/ata/sata_sil24.c
+++ work/drivers/ata/sata_sil24.c
@@ -1234,7 +1234,6 @@ static int sil24_port_start(struct ata_p
union sil24_cmd_block *cb;
size_t cb_size = sizeof(*cb) * SIL24_MAX_CMDS;
dma_addr_t cb_dma;
- int rc;
pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
if (!pp)
@@ -1247,10 +1246,6 @@ static int sil24_port_start(struct ata_p
return -ENOMEM;
memset(cb, 0, cb_size);
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
pp->cmd_block = cb;
pp->cmd_block_dma = cb_dma;
Index: work/include/linux/libata.h
===================================================================
--- work.orig/include/linux/libata.h
+++ work/include/linux/libata.h
@@ -278,7 +278,6 @@ enum {
/* size of buffer to pad xfers ending on unaligned boundaries */
ATA_DMA_PAD_SZ = 4,
- ATA_DMA_PAD_BUF_SZ = ATA_DMA_PAD_SZ * ATA_MAX_QUEUE,
/* ering size */
ATA_ERING_SIZE = 32,
@@ -458,24 +457,19 @@ struct ata_queued_cmd {
unsigned int tag;
unsigned int n_elem;
unsigned int n_iter;
- unsigned int mapped_n_elem;
int dma_dir;
- unsigned int pad_len;
unsigned int sect_size;
unsigned int nbytes;
- unsigned int raw_nbytes;
unsigned int curbytes;
struct scatterlist *cursg;
unsigned int cursg_ofs;
struct scatterlist *last_sg;
- struct scatterlist saved_last_sg;
struct scatterlist sgent;
- struct scatterlist extra_sg[2];
struct scatterlist *sg;
@@ -620,9 +614,6 @@ struct ata_port {
struct ata_prd *prd; /* our SG list */
dma_addr_t prd_dma; /* and its DMA mapping */
- void *pad; /* array of DMA pad buffers */
- dma_addr_t pad_dma;
-
struct ata_ioports ioaddr; /* ATA cmd/ctl/dma register blocks */
u8 ctl; /* cache of ATA control register */
@@ -1364,12 +1355,10 @@ static inline void ata_qc_reinit(struct
qc->flags = 0;
qc->cursg = NULL;
qc->cursg_ofs = 0;
- qc->nbytes = qc->raw_nbytes = qc->curbytes = 0;
+ qc->nbytes = qc->curbytes = 0;
qc->n_elem = 0;
- qc->mapped_n_elem = 0;
qc->n_iter = 0;
qc->err_mask = 0;
- qc->pad_len = 0;
qc->last_sg = NULL;
qc->sect_size = ATA_SECT_SIZE;
@@ -1425,19 +1414,6 @@ static inline unsigned int __ac_err_mask
return mask;
}
-static inline int ata_pad_alloc(struct ata_port *ap, struct device *dev)
-{
- ap->pad_dma = 0;
- ap->pad = dmam_alloc_coherent(dev, ATA_DMA_PAD_BUF_SZ,
- &ap->pad_dma, GFP_KERNEL);
- return (ap->pad == NULL) ? -ENOMEM : 0;
-}
-
-static inline void ata_pad_free(struct ata_port *ap, struct device *dev)
-{
- dmam_free_coherent(dev, ATA_DMA_PAD_BUF_SZ, ap->pad, ap->pad_dma);
-}
-
static inline struct ata_port *ata_shost_to_port(struct Scsi_Host *host)
{
return *(struct ata_port **)&host->hostdata[0];
Index: work/drivers/scsi/libsas/sas_ata.c
===================================================================
--- work.orig/drivers/scsi/libsas/sas_ata.c
+++ work/drivers/scsi/libsas/sas_ata.c
@@ -178,8 +178,8 @@ static unsigned int sas_ata_qc_issue(str
task->uldd_task = qc;
if (ata_is_atapi(qc->tf.protocol)) {
memcpy(task->ata_task.atapi_packet, qc->cdb, qc->dev->cdb_len);
- task->total_xfer_len = qc->nbytes + qc->pad_len;
- task->num_scatter = qc->pad_len ? qc->n_elem + 1 : qc->n_elem;
+ task->total_xfer_len = qc->nbytes;
+ task->num_scatter = qc->n_elem;
} else {
for_each_sg(qc->sg, sg, qc->n_elem, si)
xfer += sg->length;
Index: work/drivers/scsi/ipr.c
===================================================================
--- work.orig/drivers/scsi/ipr.c
+++ work/drivers/scsi/ipr.c
@@ -5140,7 +5140,7 @@ static void ipr_build_ata_ioadl(struct i
struct ipr_ioarcb *ioarcb = &ipr_cmd->ioarcb;
struct ipr_ioadl_desc *ioadl = ipr_cmd->ioadl;
struct ipr_ioadl_desc *last_ioadl = NULL;
- int len = qc->nbytes + qc->pad_len;
+ int len = qc->nbytes;
struct scatterlist *sg;
unsigned int si;
@@ -5206,7 +5206,7 @@ static unsigned int ipr_qc_issue(struct
ioarcb->cmd_pkt.request_type = IPR_RQTYPE_ATA_PASSTHRU;
ioarcb->cmd_pkt.flags_hi |= IPR_FLAGS_HI_NO_LINK_DESC;
ioarcb->cmd_pkt.flags_hi |= IPR_FLAGS_HI_NO_ULEN_CHK;
- ipr_cmd->dma_use_sg = qc->pad_len ? qc->n_elem + 1 : qc->n_elem;
+ ipr_cmd->dma_use_sg = qc->n_elem;
ipr_build_ata_ioadl(ipr_cmd, qc);
regs->flags |= IPR_ATA_FLAG_STATUS_ON_GOOD_COMPLETION;
Index: work/drivers/ata/ahci.c
===================================================================
--- work.orig/drivers/ata/ahci.c
+++ work/drivers/ata/ahci.c
@@ -1979,16 +1979,11 @@ static int ahci_port_start(struct ata_po
struct ahci_port_priv *pp;
void *mem;
dma_addr_t mem_dma;
- int rc;
pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
if (!pp)
return -ENOMEM;
- rc = ata_pad_alloc(ap, dev);
- if (rc)
- return rc;
-
mem = dmam_alloc_coherent(dev, AHCI_PORT_PRIV_DMA_SZ, &mem_dma,
GFP_KERNEL);
if (!mem)
Index: work/drivers/ata/pata_icside.c
===================================================================
--- work.orig/drivers/ata/pata_icside.c
+++ work/drivers/ata/pata_icside.c
@@ -304,12 +304,6 @@ static int icside_dma_init(struct pata_i
}
-static int pata_icside_port_start(struct ata_port *ap)
-{
- /* No PRD to alloc */
- return ata_pad_alloc(ap, ap->dev);
-}
-
static struct scsi_host_template pata_icside_sht = {
.module = THIS_MODULE,
.name = DRV_NAME,
@@ -389,8 +383,6 @@ static struct ata_port_operations pata_i
.irq_clear = ata_dummy_noret,
.irq_on = ata_irq_on,
- .port_start = pata_icside_port_start,
-
.bmdma_stop = pata_icside_bmdma_stop,
.bmdma_status = pata_icside_bmdma_status,
};
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-04 9:25 ` Tejun Heo
2008-02-04 14:43 ` Tejun Heo
@ 2008-02-04 15:43 ` James Bottomley
1 sibling, 0 replies; 33+ messages in thread
From: James Bottomley @ 2008-02-04 15:43 UTC (permalink / raw)
To: Tejun Heo; +Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
On Mon, 2008-02-04 at 18:25 +0900, Tejun Heo wrote:
> Tejun Heo wrote:
> > Some ATA controllers including SFF BMDMA and libata PIO HSM need the
> > number of bytes mapped in the sg table. Yeah, it can be calculated with
> > a simple macro but it also is a fundamentally confusing dual-sizing
> > which should be made as clear as possible. Plus, it can be difficult to
> > find out when somebody used the wrong thing, so what I'm saying is that
> > we need to make it easy. Anyways, please lemme work on it a bit. I'll
> > get back to you guys soon.
>
> Okay, here's first draft combined patch. Only compile tested (expect
> it to be broken) but it should be functionally equivalent to
> ata_sg_setup_extra() based implementation albeit with shorter drain
> buffer size. Several things to note...
I noticed you posted an update ... I'll go over the code in that one.
> * fsl last sg check isn't included here. Will split it out and post
> it separately.
>
> * rq->raw_data_len added. Rationales...
>
> - All these padding and draining are to prevent controllers from
> crapping themselves when data buffer is shorter than it likes it
> to be. Any controller which talks MMC (or SPC for that matter)
> should be ready for transfers shorter than buffer so feeding
> enlarged buffer size is inherently safer than feeding the length,
> so the primary data length field, rq->data_len, contains the
> adjusted length.
Actually, no they don't. Overrun termination is pretty much standard in
most HBAs that do MMC (including the strange block one). Allowing an
overrun is simply unsafe and a security risk, so all apart from the ATA
ones allow us to terminate the transaction safely.
> - raw_data_len can't be easily deduced from data_len. The other way
> is possible but with both aligning and draining and command
> filtering, calculating it later is messy.
Like I said, it's simply the command size plus the drain buffer which
you get from the queue. I'll redo your patches to show how it can be
done without adding the extra space to every request.
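In sketch form (hypothetical helper; assuming the only adjustment made
to the request was appending the queue's drain):

/* hypothetical: recover the submitted length from the padded one */
static inline unsigned int rq_submitted_len(struct request *rq,
					    int drain_used)
{
	return rq->data_len - (drain_used ? rq->q->dma_drain_size : 0);
}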
> * Draining configuration is done in sr as it's the driver for MMC. It
> can move both ways - either into SCSI midlayer as SPC and other
> commands do variable length responses too or into libata if all
> non-ATA controllers are happy without such workarounds. If you ask
> me, I'm inclined to move it into SCSI midlayer as the added overhead
> is insignificant (especially with drain_needed added) and it won't
> break anything (well, theoretically, at least).
Let me think about this one. The only consumer of draining is libata.
It might make sense to process the drains in sr.c but it certainly
doesn't make sense to turn them on indiscriminately in there because
*none* of the SCSI users needs them.
> * Padding via aligning seems a bit too hacky to me. It doesn't even
> cover all sg cases. I think we'll need improvements there, well,
> but for the time being, this should do.
That's essentially what the dma_alignment parameter was designed for ...
we can always change the way it's handled later; I just updated
block to use what exists already.
> I'll test and report in a few hours.
Great, thanks.
James
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-04 14:43 ` Tejun Heo
@ 2008-02-04 16:23 ` James Bottomley
2008-02-05 0:06 ` Tejun Heo
0 siblings, 1 reply; 33+ messages in thread
From: James Bottomley @ 2008-02-04 16:23 UTC (permalink / raw)
To: Tejun Heo; +Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
On Mon, 2008-02-04 at 23:43 +0900, Tejun Heo wrote:
> And, here's the working version. I'll split and post them tomorrow.
>
> Thanks.
>
> Index: work/block/blk-core.c
> ===================================================================
> --- work.orig/block/blk-core.c
> +++ work/block/blk-core.c
> @@ -116,6 +116,7 @@ void rq_init(struct request_queue *q, st
> rq->ref_count = 1;
> rq->q = q;
> rq->special = NULL;
> + rq->raw_data_len = 0;
> rq->data_len = 0;
> rq->data = NULL;
> rq->nr_phys_segments = 0;
> @@ -1982,6 +1983,7 @@ void blk_rq_bio_prep(struct request_queu
> rq->hard_cur_sectors = rq->current_nr_sectors;
> rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
> rq->buffer = bio_data(bio);
> + rq->raw_data_len = bio->bi_size;
> rq->data_len = bio->bi_size;
>
> rq->bio = rq->biotail = bio;
> Index: work/block/blk-map.c
> ===================================================================
> --- work.orig/block/blk-map.c
> +++ work/block/blk-map.c
> @@ -19,6 +19,7 @@ int blk_rq_append_bio(struct request_que
> rq->biotail->bi_next = bio;
> rq->biotail = bio;
>
> + rq->raw_data_len += bio->bi_size;
> rq->data_len += bio->bi_size;
> }
> return 0;
> @@ -139,6 +140,25 @@ int blk_rq_map_user(struct request_queue
> ubuf += ret;
> }
>
> + /*
> + * __blk_rq_map_user() copies the buffers if starting address
> + * or length aren't aligned. As the copied buffer is always
> + * page aligned, we know for a fact that there's enough room
> + * for padding. Extend the last bio and update rq->data_len
> + * accordingly.
> + *
> + * On unmap, bio_uncopy_user() will use unmodified
> + * bio_map_data pointed to by bio->bi_private.
> + */
> + if (len & queue_dma_alignment(q)) {
> + unsigned int pad_len = (queue_dma_alignment(q) & ~len) + 1;
> + struct bio *bio = rq->biotail;
> +
> + bio->bi_io_vec[bio->bi_vcnt - 1].bv_len += pad_len;
> + bio->bi_size += pad_len;
> + rq->data_len += pad_len;
> + }
> +
> rq->buffer = rq->data = NULL;
> return 0;
> unmap_rq:
> Index: work/include/linux/blkdev.h
> ===================================================================
> --- work.orig/include/linux/blkdev.h
> +++ work/include/linux/blkdev.h
> @@ -214,6 +214,7 @@ struct request {
> unsigned int cmd_len;
> unsigned char cmd[BLK_MAX_CDB];
>
> + unsigned int raw_data_len;
> unsigned int data_len;
> unsigned int sense_len;
> void *data;
Please, no ... none of this is necessary. You're wasting four bytes in
every request on a quantity we already know from the queue.
The point of the original code is that the drain is always waiting for
you silently in the sg list, but never shown in the length. That means
it's entirely up to you whether you use it or ignore it. It also means
that there's no need for the discriminating function in block, you
either do or don't use the drain element.
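Concretely, the model being described (a sketch with hypothetical
names, not code from either patch):

/* the mapped table may end with the drain element ... */
count = blk_rq_map_sg(q, rq, sgl);	/* count includes the drain */

/* ... but the length the driver programs stays the raw one unless
 * the driver explicitly opts in */
xfer_len = rq->data_len;		/* drain not counted */
if (device_wants_drain)			/* hypothetical predicate */
	xfer_len += q->dma_drain_size;	/* drain counted */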
> @@ -256,6 +257,7 @@ struct bio_vec;
> typedef int (merge_bvec_fn) (struct request_queue *, struct bio *, struct bio_vec *);
> typedef void (prepare_flush_fn) (struct request_queue *, struct request *);
> typedef void (softirq_done_fn)(struct request *);
> +typedef int (dma_drain_needed_fn)(struct request *);
>
> enum blk_queue_state {
> Queue_down,
> @@ -292,6 +294,7 @@ struct request_queue
> merge_bvec_fn *merge_bvec_fn;
> prepare_flush_fn *prepare_flush_fn;
> softirq_done_fn *softirq_done_fn;
> + dma_drain_needed_fn *dma_drain_needed;
Like I said, not necessary because the MMC devices are all single
command queues, so for them, you simply apply the drain buffer the whole
time in block and decide whether to use it in the issue layer.
> /*
> * Dispatch queue sorting
> @@ -696,8 +699,9 @@ extern void blk_queue_max_hw_segments(st
> extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
> extern void blk_queue_hardsect_size(struct request_queue *, unsigned short);
> extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
> -extern int blk_queue_dma_drain(struct request_queue *q, void *buf,
> - unsigned int size);
> +extern int blk_queue_dma_drain(struct request_queue *q,
> + dma_drain_needed_fn *dma_drain_needed,
> + void *buf, unsigned int size);
> extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
> extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
> extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
> Index: work/block/blk-merge.c
> ===================================================================
> --- work.orig/block/blk-merge.c
> +++ work/block/blk-merge.c
> @@ -220,7 +220,7 @@ new_segment:
> bvprv = bvec;
> } /* segments in rq */
>
> - if (q->dma_drain_size) {
> + if (q->dma_drain_size && q->dma_drain_needed(rq)) {
> sg->page_link &= ~0x02;
> sg = sg_next(sg);
> sg_set_page(sg, virt_to_page(q->dma_drain_buffer),
> @@ -228,6 +228,7 @@ new_segment:
> ((unsigned long)q->dma_drain_buffer) &
> (PAGE_SIZE - 1));
> nsegs++;
> + rq->data_len += q->dma_drain_size;
> }
>
> if (sg)
> Index: work/block/bsg.c
> ===================================================================
> --- work.orig/block/bsg.c
> +++ work/block/bsg.c
> @@ -437,14 +437,14 @@ static int blk_complete_sgv4_hdr_rq(stru
> }
>
> if (rq->next_rq) {
> - hdr->dout_resid = rq->data_len;
> - hdr->din_resid = rq->next_rq->data_len;
> + hdr->dout_resid = rq->raw_data_len;
> + hdr->din_resid = rq->next_rq->raw_data_len;
> blk_rq_unmap_user(bidi_bio);
> blk_put_request(rq->next_rq);
> } else if (rq_data_dir(rq) == READ)
> - hdr->din_resid = rq->data_len;
> + hdr->din_resid = rq->raw_data_len;
> else
> - hdr->dout_resid = rq->data_len;
> + hdr->dout_resid = rq->raw_data_len;
>
> /*
> * If the request generated a negative error number, return it
> Index: work/block/scsi_ioctl.c
> ===================================================================
> --- work.orig/block/scsi_ioctl.c
> +++ work/block/scsi_ioctl.c
> @@ -266,7 +266,7 @@ static int blk_complete_sghdr_rq(struct
> hdr->info = 0;
> if (hdr->masked_status || hdr->host_status || hdr->driver_status)
> hdr->info |= SG_INFO_CHECK;
> - hdr->resid = rq->data_len;
> + hdr->resid = rq->raw_data_len;
> hdr->sb_len_wr = 0;
>
> if (rq->sense_len && hdr->sbp) {
> @@ -528,6 +528,7 @@ static int __blk_send_generic(struct req
> rq = blk_get_request(q, WRITE, __GFP_WAIT);
> rq->cmd_type = REQ_TYPE_BLOCK_PC;
> rq->data = NULL;
> + rq->raw_data_len = 0;
> rq->data_len = 0;
> rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
> memset(rq->cmd, 0, sizeof(rq->cmd));
> Index: work/drivers/scsi/scsi_lib.c
> ===================================================================
> --- work.orig/drivers/scsi/scsi_lib.c
> +++ work/drivers/scsi/scsi_lib.c
> @@ -1015,10 +1015,6 @@ static int scsi_init_sgtable(struct requ
> }
>
> req->buffer = NULL;
> - if (blk_pc_request(req))
> - sdb->length = req->data_len;
> - else
> - sdb->length = req->nr_sectors << 9;
>
> /*
> * Next, walk the list, and fill in the addresses and sizes of
> @@ -1027,6 +1023,10 @@ static int scsi_init_sgtable(struct requ
> count = blk_rq_map_sg(req->q, req, sdb->table.sgl);
> BUG_ON(count > sdb->table.nents);
> sdb->table.nents = count;
> + if (blk_pc_request(req))
> + sdb->length = req->data_len;
> + else
> + sdb->length = req->nr_sectors << 9;
> return BLKPREP_OK;
> }
>
> Index: work/block/blk-settings.c
> ===================================================================
> --- work.orig/block/blk-settings.c
> +++ work/block/blk-settings.c
> @@ -296,6 +296,7 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
> * blk_queue_dma_drain - Set up a drain buffer for excess dma.
> *
> * @q: the request queue for the device
> + * @dma_drain_needed: fn which returns non-zero if drain is necessary
> * @buf: physically contiguous buffer
> * @size: size of the buffer in bytes
> *
> @@ -315,14 +316,16 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
> * device can support otherwise there won't be room for the drain
> * buffer.
> */
> -int blk_queue_dma_drain(struct request_queue *q, void *buf,
> - unsigned int size)
> +int blk_queue_dma_drain(struct request_queue *q,
> + dma_drain_needed_fn *dma_drain_needed,
> + void *buf, unsigned int size)
> {
> if (q->max_hw_segments < 2 || q->max_phys_segments < 2)
> return -EINVAL;
> /* make room for appending the drain */
> --q->max_hw_segments;
> --q->max_phys_segments;
> + q->dma_drain_needed = dma_drain_needed;
> q->dma_drain_buffer = buf;
> q->dma_drain_size = size;
All of these block changes go away if you simply use the drain where
it's needed.
> Index: work/drivers/scsi/sr.c
> ===================================================================
> --- work.orig/drivers/scsi/sr.c
> +++ work/drivers/scsi/sr.c
> @@ -557,11 +557,30 @@ static void sr_release(struct cdrom_devi
>
> }
>
> +static int sr_drain_needed(struct request *rq)
> +{
> + if (likely(!blk_pc_request(rq)))
> + return 0;
> +
> + switch (rq->cmd[0]) {
> + case GPCMD_READ_10:
> + case GPCMD_READ_12:
> + case GPCMD_WRITE_10:
> + case GPCMD_WRITE_12:
> + case GPCMD_WRITE_AND_VERIFY_10:
> + return 0;
> + }
> +
> + return 1;
> +}
> +
> static int sr_probe(struct device *dev)
> {
> struct scsi_device *sdev = to_scsi_device(dev);
> + struct request_queue *queue = sdev->request_queue;
> struct gendisk *disk;
> - struct scsi_cd *cd;
> + struct scsi_cd *cd = NULL;
> + void *drain_buf = NULL;
> int minor, error;
>
> error = -ENODEV;
> @@ -573,11 +592,15 @@ static int sr_probe(struct device *dev)
> if (!cd)
> goto fail;
>
> + drain_buf = kmalloc(SR_DRAIN_SIZE, queue->bounce_gfp | GFP_KERNEL);
> + if (!drain_buf)
> + goto fail;
> +
> kref_init(&cd->kref);
>
> disk = alloc_disk(1);
> if (!disk)
> - goto fail_free;
> + goto fail;
>
> spin_lock(&sr_index_lock);
> minor = find_first_zero_bit(sr_index_bits, SR_DISKS);
> @@ -615,13 +638,14 @@ static int sr_probe(struct device *dev)
>
> /* FIXME: need to handle a get_capabilities failure properly ?? */
> get_capabilities(cd);
> - blk_queue_prep_rq(sdev->request_queue, sr_prep_fn);
> + blk_queue_prep_rq(queue, sr_prep_fn);
> + blk_queue_dma_drain(queue, sr_drain_needed, drain_buf, SR_DRAIN_SIZE);
> sr_vendor_init(cd);
>
> disk->driverfs_dev = &sdev->sdev_gendev;
> set_capacity(disk, cd->capacity);
> disk->private_data = &cd->driver;
> - disk->queue = sdev->request_queue;
> + disk->queue = queue;
> cd->cdi.disk = disk;
>
> if (register_cdrom(&cd->cdi))
> @@ -637,9 +661,9 @@ static int sr_probe(struct device *dev)
>
> fail_put:
> put_disk(disk);
> -fail_free:
> - kfree(cd);
> fail:
> + kfree(cd);
> + kfree(drain_buf);
> return error;
> }
>
> @@ -894,6 +918,12 @@ static void sr_kref_release(struct kref
> static int sr_remove(struct device *dev)
> {
> struct scsi_cd *cd = dev_get_drvdata(dev);
> + struct scsi_device *sdev = to_scsi_device(dev);
> + struct request_queue *queue = sdev->request_queue;
> +
> + kfree(queue->dma_drain_buffer);
> + queue->dma_drain_buffer = NULL;
> + queue->dma_drain_size = 0;
>
> del_gendisk(cd->disk);
This is basically all setup. There are no changes that actually use it. Since
the only consumer of this is libata, all of this needs to be gated by
whether it's a libata device ... which really suggests it needs to be
done in libata.
Plus, tape devices are also ATAPI and since the problem seems to be
handling of ATAPI pio commands, they need all of this too. It really
is, I think, better just to do the setup in the libata slave configure
if the device is atapi.
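A sketch of that alternative - roughly what the setup would look like
from libata's device configuration, with ATAPI_DRAIN_SZ and
atapi_drain_needed() as hypothetical names:

/* in ata_scsi_dev_config(), for ATAPI devices only */
if (dev->class == ATA_DEV_ATAPI) {
	struct request_queue *q = sdev->request_queue;
	void *buf = kmalloc(ATAPI_DRAIN_SZ, q->bounce_gfp | GFP_KERNEL);

	/* drain is a best-effort workaround; skip it on allocation
	 * failure rather than failing the device */
	if (buf)
		blk_queue_dma_drain(q, atapi_drain_needed, buf,
				    ATAPI_DRAIN_SZ);
}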
For all the rest, I think code demonstrates better ... well, except the
sata_fsl.c update ... you leave in qc->n_iter, but it's only ever set to
zero ... doesn't that cause this driver to break?
I analysed all the places you use the expanded data length. Keeping the
true length in the request and adding the drain only where necessary
requires, I think, this update over my published libata drain patch. I've also
added the request accessor to the drain buffer. Could you verify this
actually works? If it does, it's better than all the additions to block
and sr. It would probably help to resend the series rebased against git
current.
James
---
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index a28351c..acf6a8b 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -2496,6 +2496,8 @@ static unsigned int atapi_xlat(struct ata_queued_cmd *qc)
qc->tf.command = ATA_CMD_PACKET;
qc->nbytes = scsi_bufflen(scmd);
+ if (blk_pc_request(scmd->request))
+ qc->nbytes += blk_rq_drain_size(scmd->request);
/* check whether ATAPI DMA is safe */
if (!using_pio && ata_check_atapi_dma(qc))
@@ -2832,6 +2834,8 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc)
* cover scatter/gather case.
*/
qc->nbytes = scsi_bufflen(scmd);
+ if (ata_is_atapi(qc->tf.protocol))
+ qc->nbytes += blk_rq_drain_size(scmd->request);
/* request result TF and be quiet about device error */
qc->flags |= ATA_QCFLAG_RESULT_TF | ATA_QCFLAG_QUIET;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 90392a9..a526066 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -676,6 +676,11 @@ extern void blk_complete_request(struct request *);
extern unsigned int blk_rq_bytes(struct request *rq);
extern unsigned int blk_rq_cur_bytes(struct request *rq);
+static inline int blk_rq_drain_size(struct request *rq)
+{
+ return rq->q->dma_drain_size;
+}
+
static inline void blkdev_dequeue_request(struct request *req)
{
elv_dequeue_request(req->q, req);
^ permalink raw reply related [flat|nested] 33+ messages in thread
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-04 16:23 ` James Bottomley
@ 2008-02-05 0:06 ` Tejun Heo
2008-02-05 0:32 ` James Bottomley
0 siblings, 1 reply; 33+ messages in thread
From: Tejun Heo @ 2008-02-05 0:06 UTC (permalink / raw)
To: James Bottomley
Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
James Bottomley wrote:
> Plus, tape devices are also ATAPI and since the problem seems to be
> handling of ATAPI pio commands, they need all of this too. It really
> is, I think, better just to do the setup in the libata slave configure
> if the device is atapi.
If no SCSI HBA needs it, fine.
> For all the rest, I think code demonstrates better ... well, except the
> sata_fsl.c update ... you leave in qc->n_iter, but it's only ever set to
> zero ... doesn't that cause this driver to break?
Probably. I was gonna cross check with your patch while splitting.
They seem mostly identical so I'll probably use your patch with some
modifications.
> I analysed all the places you use the expanded data length. To keep the
> true length in the request and add the drain if necessary, I think
> requires this update over my published libata drain patch. I've also
> added the request accessor to the drain buffer. Could you verify this
> actually works? If it does, it's better than all the additions to block
> and sr. It would probably help to resend the series rebased against git
> current.
No, it doesn't. Drain needs to extend the sg table too and it makes
controllers lax about buffer overruns for commands they shouldn't be.
Gee.. What's the deal with extra four bytes? If you're worried about
the size of struct request, chopping off PC request part into a separate
struct would be far better strategy than making what's already confusing
more confusing.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-05 0:06 ` Tejun Heo
@ 2008-02-05 0:32 ` James Bottomley
2008-02-05 0:43 ` Tejun Heo
0 siblings, 1 reply; 33+ messages in thread
From: James Bottomley @ 2008-02-05 0:32 UTC (permalink / raw)
To: Tejun Heo; +Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
On Tue, 2008-02-05 at 09:06 +0900, Tejun Heo wrote:
> James Bottomley wrote:
> > Plus, tape devices are also ATAPI and since the problem seems to be
> > handling of ATAPI pio commands, they need all of this too. It really
> > is, I think, better just to do the setup in the libata slave configure
> > if the device is atapi.
>
> If no SCSI HBA needs it, fine.
>
> > For all the rest, I think code demonstrates better ... well, except the
> > sata_fsl.c update ... you leave in qc->n_iter, but it's only ever set to
> > zero ... doesn't that cause this driver to break?
>
> Probably. I was gonna cross check with your patch while splitting.
> They seem mostly identical so I'll probably use your patch with some
> modifications.
>
> > I analysed all the places you use the expanded data length. Keeping the
> > true length in the request and adding the drain only where necessary
> > requires, I think, this update over my published libata drain patch. I've also
> > added the request accessor to the drain buffer. Could you verify this
> > actually works? If it does, it's better than all the additions to block
> > and sr. It would probably help to resend the series rebased against git
> > current.
>
> No, it doesn't. Drain needs to extend the sg table too
The patches do extend the sg table.
> and it makes controllers lax about buffer overruns for commands they shouldn't be.
So adjust the qc->nbytes to show no drain element in libata ... although
the extra element gets mapped, the HBA will be none the wiser if there's
zero length allocated to it. Thus the entire system behaves exactly
like the drain element isn't present. The block layer is really just
making the drain element available ... you don't have to use it.
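To make that concrete, a minimal sketch of the idea (not the posted
patch; prd_write() is a hypothetical stand-in for the driver-specific
PRD programming, and the field names are assumptions):

	/*
	 * Program every sg entry, including the block layer's trailing
	 * drain entry, but report only the real payload in qc->nbytes.
	 * A controller that honours the byte count then never touches
	 * the drain element.
	 */
	static void sketch_fill_prd(struct ata_queued_cmd *qc)
	{
		struct scatterlist *sg;
		unsigned int si;

		for_each_sg(qc->sg, sg, qc->n_elem, si)
			prd_write(qc->ap, si, sg_dma_address(sg),
				  sg_dma_len(sg));
		/* qc->nbytes deliberately excludes the drain element */
	}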
> Gee.. What's the deal with extra four bytes? If you're worried about
> the size of struct request, chopping off PC request part into a separate
> struct would be far better strategy than making what's already confusing
> more confusing.
Well, we are trying to slim down the request structure and scsi_cmnd
structure. However, there's no inherent advantage to counting this all
in the block layer. Think of it just as a transparent passthrough.
You've told the block layer to make sure you have room for a drain
element when it constructs the SG list, which it does. Whether you use
it (by adding blk_rq_drain_size() on to the actual length you hand down)
or not is entirely up to you ... there's no need to push the decision on
that one into the block layer, since use or not is pretty much a cost
free thing.
James
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-05 0:32 ` James Bottomley
@ 2008-02-05 0:43 ` Tejun Heo
2008-02-05 0:53 ` James Bottomley
0 siblings, 1 reply; 33+ messages in thread
From: Tejun Heo @ 2008-02-05 0:43 UTC (permalink / raw)
To: James Bottomley
Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
Hello,
James Bottomley wrote:
>> No, it doesn't. Drain needs to extend the sg table too
>
> The patches do extend the sg table.
Hmm... Where?
>> and it makes controllers lax about buffer overruns for commands they shouldn't be.
>
> So adjust the qc->nbytes to show no drain element in libata ... although
> the extra element gets mapped, the HBA will be none the wiser if there's
> zero length allocated to it. Thus the entire system behaves exactly
> like the drain element isn't present. The block layer is really just
> making the drain element available ... you don't have to use it.
Adjusting qc->nbytes isn't enough. libata will have to trim at the end
of the sglist. With both draining and padding, that means libata will
have to trim one sg entry and a few more bytes. The block layer is
behaving differently, and in a very noticeable way: it's sending
inconsistent values in data_len and the sg list.
>> Gee.. What's the deal with extra four bytes? If you're worried about
>> the size of struct request, chopping off PC request part into a separate
>> struct would be far better strategy than making what's already confusing
>> more confusing.
>
> Well, we are trying to slim down the request structure and scsi_cmnd
> structure. However, there's no inherent advantage to counting this all
> in the block layer. Think of it just as a transparent passthrough.
> You've told the block layer to make sure you have room for a drain
> element when it constructs the SG list, which it does. Whether you use
> it (by adding blk_rq_drain_size() on to the actual length you hand down)
> or not is entirely up to you ... there's no need to push the decision on
> that one into the block layer, since use or not is pretty much a cost
> free thing.
Okay, I see your point. The biggest problem is that by doing that the
blk layer breaks a pretty important rule - keeping data_len consistent
with the sg list - and the bugs it's gonna cause will be very subtle
and dangerous. This is definitely where we can spend four extra bytes.
Thanks.
--
tejun
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-05 0:43 ` Tejun Heo
@ 2008-02-05 0:53 ` James Bottomley
2008-02-05 1:07 ` Tejun Heo
0 siblings, 1 reply; 33+ messages in thread
From: James Bottomley @ 2008-02-05 0:53 UTC (permalink / raw)
To: Tejun Heo; +Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
On Tue, 2008-02-05 at 09:43 +0900, Tejun Heo wrote:
> Hello,
>
> James Bottomley wrote:
> >> No, it doesn't. Drain needs to extend the sg table too
> >
> > The patches do extend the sg table.
>
> Hmm... Where?
It's the block component; it's already in git head with this patch:
commit fa0ccd837e3dddb44c7db2f128a8bb7e4eabc21a
Author: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Thu Jan 10 11:30:36 2008 -0600
block: implement drain buffers
The description is pretty reasonable. The way it works is by making
block add the drain buffer to the sg table as the last element.
In practice, if you simply use the length block gives you, you find
yourself doing DMA to every element in the sg table except the last
one. If you expand the length to anything up to blk_rq_drain_size(),
you then use the last element for the drain.
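As a rough illustration of how a driver opts in, assuming the
blk_queue_dma_drain() interface from that commit (the buffer size
constant and the slave_configure placement are assumptions, and error
unwinding is trimmed):

	static int sketch_slave_configure(struct scsi_device *sdev)
	{
		/* one drain buffer per queue, sized for the worst case */
		void *buf = kmalloc(ATAPI_MAX_DRAIN, GFP_KERNEL);

		if (!buf)
			return -ENOMEM;
		return blk_queue_dma_drain(sdev->request_queue, buf,
					   ATAPI_MAX_DRAIN);
	}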
> >> and it makes controllers lax about buffer overruns for commands they shouldn't be.
> >
> > So adjust the qc->nbytes to show no drain element in libata ... although
> > the extra element gets mapped, the HBA will be none the wiser if there's
> > zero length allocated to it. Thus the entire system behaves exactly
> > like the drain element isn't present. The block layer is really just
> > making the drain element available ... you don't have to use it.
>
> Adjusting qc->nbytes isn't enough. libata will have to trim at the end
> of the sglist. With both draining and padding, that means libata will
> have to trim one sg entry and a few more bytes. The block layer is
> behaving differently, and in a very noticeable way: it's sending
> inconsistent values in data_len and the sg list.
No, that's the point. With all the space available, you don't need to
trim at all ... you simply don't use it. The rationale is the way the
sg element stuff works. If you program a DMA table (PRD table in IDE
speak) into a controller, it doesn't care if you don't use the last few
pieces. With the default length set, the DMA controller won't use the
last programmed element, but it's there if you expand the length or tell
it to for an overrun.
> >> Gee.. What's the deal with extra four bytes? If you're worried about
> >> the size of struct request, chopping off PC request part into a separate
> >> struct would be far better strategy than making what's already confusing
> >> more confusing.
> >
> > Well, we are trying to slim down the request structure and scsi_cmnd
> > structure. However, there's no inherent advantage to counting this all
> > in the block layer. Think of it just as a transparent passthrough.
> > You've told the block layer to make sure you have room for a drain
> > element when it constructs the SG list, which it does. Whether you use
> > it (by adding blk_rq_drain_size() on to the actual length you hand down)
> > or not is entirely up to you ... there's no need to push the decision on
> > that one into the block layer, since use or not is pretty much a cost
> > free thing.
>
> Okay, I see your point. The biggest problem is that by doing that the
> blk layer breaks a pretty important rule - keeping data_len consistent
> with the sg list - and the bugs it's gonna cause will be very subtle
> and dangerous. This is definitely where we can spend four extra bytes.
The lengths don't have to be consistent (otherwise we wouldn't need to
calculate them separately); the request's reported length just has to be
less than the sg table's actual length. We even make use of this property
in SCSI when we can't fit a request into a single command ... we are
allowed to chunk up the request as we complete it. I'm not saying it's
best practice, but it is acceptable behaviour.
James
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-05 0:53 ` James Bottomley
@ 2008-02-05 1:07 ` Tejun Heo
2008-02-05 5:03 ` James Bottomley
0 siblings, 1 reply; 33+ messages in thread
From: Tejun Heo @ 2008-02-05 1:07 UTC (permalink / raw)
To: James Bottomley
Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
James Bottomley wrote:
>>>> No, it doesn't. Drain needs to extend the sg table too
>>> The patches do extend the sg table.
>> Hmm... Where?
>
> It's the block component; it's already in git head with this patch:
>
> commit fa0ccd837e3dddb44c7db2f128a8bb7e4eabc21a
> Author: James Bottomley <James.Bottomley@HansenPartnership.com>
> Date: Thu Jan 10 11:30:36 2008 -0600
>
> block: implement drain buffers
>
> The description is pretty reasonable. The way it works is by making
> block add the drain buffer to the sg table as the last element.
>
> In practice, if you simply use the length block gives you, you find
> yourself doing DMA to every element in the sg table except the last
> one. If you expand the length to anything up to blk_rq_drain_size(),
> you then use the last element for the drain.
Oops, I was talking about padding. Sorry about that.
>> Adjusting qc->nbytes isn't enough. libata will have to trim at the end
>> of the sglist. With both draining and padding, that means libata will
>> have to trim one sg entry and a few more bytes. The block layer is
>> behaving differently, and in a very noticeable way: it's sending
>> inconsistent values in data_len and the sg list.
>
> No, that's the point. With all the space available, you don't need to
> trim at all ... you simply don't use it. The rationale is the way the
> sg element stuff works. If you program a DMA table (PRD table in IDE
> speak) into a controller, it doesn't care if you don't use the last few
> pieces. With the default length set, the DMA controller won't use the
> last programmed element, but it's there if you expand the length or tell
> it to for an overrun.
Not all controllers have a place to set 'default length'. They just run
till sg list terminates.
>> Okay, I see your point. The biggest problem is that by doing that the
>> blk layer breaks a pretty important rule - keeping data_len consistent
>> with the sg list - and the bugs it's gonna cause will be very subtle
>> and dangerous. This is definitely where we can spend four extra bytes.
>
> The lengths don't have to be consistent (otherwise we wouldn't need to
> calculate them separately); the request's reported length just has to be
> less than the sg table's actual length. We even make use of this property
> in SCSI when we can't fit a request into a single command ... we are
> allowed to chunk up the request as we complete it. I'm not saying it's
> best practice, but it is acceptable behaviour.
libata definitely isn't ready for that and I don't think expanding the
behavior to the whole block layer is a wise thing to do when the best
practice can be bought at the cost of 4 bytes per request. No?
--
tejun
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-05 1:07 ` Tejun Heo
@ 2008-02-05 5:03 ` James Bottomley
2008-02-05 5:22 ` Tejun Heo
0 siblings, 1 reply; 33+ messages in thread
From: James Bottomley @ 2008-02-05 5:03 UTC (permalink / raw)
To: Tejun Heo; +Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
On Tue, 2008-02-05 at 10:07 +0900, Tejun Heo wrote:
> James Bottomley wrote:
> >>>> No, it doesn't. Drain needs to extend the sg table too
> >>> The patches do extend the sg table.
> >> Hmm... Where?
> >
> > It's the block component; it's already in git head with this patch:
> >
> > commit fa0ccd837e3dddb44c7db2f128a8bb7e4eabc21a
> > Author: James Bottomley <James.Bottomley@HansenPartnership.com>
> > Date: Thu Jan 10 11:30:36 2008 -0600
> >
> > block: implement drain buffers
> >
> > The description is pretty reasonable. The way it works is by making
> > block add the drain buffer to the sg table as the last element.
> >
> > In practice, if you simply use the length block gives you, you find
> > yourself doing DMA to every element in the sg table except the last
> > one. If you expand the length to anything up to blk_rq_drain_size(),
> > you then use the last element for the drain.
>
> Oops, I was talking about padding. Sorry about that.
Oh, OK ... I thought the changelog was pretty explicit. However, it
works because the dma_alignment parameter of the block layer ensures that
all elements of the sg list begin and end on an address that conforms to
the dma_alignment. It does this basically by forcing a copy of incoming
user commands if they don't conform. It does nothing with kernel-based
commands, but you can always assume for direct DMA that they begin
and end on a cache line boundary ... and if they don't, there's always
a spare allocation to expand them.
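For reference, declaring that alignment is a one-liner; a sketch,
assuming it runs at slave configure time and that ATA_DMA_PAD_SZ (4)
is the required word size:

	/* make block copy/align user buffers to ATA word boundaries */
	blk_queue_dma_alignment(sdev->request_queue, ATA_DMA_PAD_SZ - 1);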
> >> Adjusting qc->nbytes isn't enough. libata will have to trim at the end
> >> of the sglist. With both draining and padding, that means libata will
> >> have to trim one sg entry and a few more bytes. The block layer is
> >> behaving differently, and in a very noticeable way: it's sending
> >> inconsistent values in data_len and the sg list.
> >
> > No, that's the point. With all the space available, you don't need to
> > trim at all ... you simply don't use it. The rationale is the way the
> > sg element stuff works. If you program a DMA table (PRD table in IDE
> > speak) into a controller, it doesn't care if you don't use the last few
> > pieces. With the default length set, the DMA controller won't use the
> > last programmed element, but it's there if you expand the length or tell
> > it to for an overrun.
>
> Not all controllers have a place to set 'default length'. They just run
> till sg list terminates.
True ... but the traversal of the sg list terminates when the data
transfer finishes ... this is the basis of your drain patch (and mine).
Basically we don't require that anything transfer into the drain, but
it's there just in case the transfer overruns and it needs to be used.
> >> Okay, I see your point. The biggest problem is that by doing that the
> >> blk layer breaks a pretty important rule - keeping data_len consistent
> >> with the sg list - and the bugs it's gonna cause will be very subtle
> >> and dangerous. This is definitely where we can spend four extra bytes.
> >
> > The lengths don't have to be consistent (otherwise we wouldn't need to
> > calculate them separately); the request's reported length just has to be
> > less than the sg table's actual length. We even make use of this property
> > in SCSI when we can't fit a request into a single command ... we are
> > allowed to chunk up the request as we complete it. I'm not saying it's
> > best practice, but it is acceptable behaviour.
>
> libata definitely isn't ready for that and I don't think expanding the
> behavior to whole block layer is a wise thing to do when the best
> practice can be bought at the cost of 4 bytes per request. No?
But the whole basis of both patches is inconsistent lengths: we're
expecting a transfer of X, but we program the element lists for Y (>X)
and see if the transfer actually needs >X (but hopefully <Y).
James
* Re: [PATCH RESEND number 2] libata: eliminate the home grown dma padding in favour of that provided by the block layer
2008-02-05 5:03 ` James Bottomley
@ 2008-02-05 5:22 ` Tejun Heo
0 siblings, 0 replies; 33+ messages in thread
From: Tejun Heo @ 2008-02-05 5:22 UTC (permalink / raw)
To: James Bottomley
Cc: Jeff Garzik, linux-scsi, linux-ide, Jens Axboe, FUJITA Tomonori
James Bottomley wrote:
>> Oops, I was talking about padding. Sorry about that.
>
> Oh, OK ... I thought the changelog was pretty explicit. However, it
> works because the dma_alignment parameter of the block layer ensures that
> all elements of the sg list begin and end on an address that conforms to
> the dma_alignment. It does this basically by forcing a copy of incoming
> user commands if they don't conform. It does nothing with kernel-based
> commands, but you can always assume for direct DMA that they begin
> and end on a cache line boundary ... and if they don't, there's always
> a spare allocation to expand them.
Yeah, the area is expanded but the bio list isn't updated accordingly,
so in turn the sg list won't be either. I found where you worked around
this. The following chunk is added to ata_sg_setup().
+	/* needs padding? */
+	if (pad) {
+		struct scatterlist *sg = sg_last(qc->sg, n_elem);
+		qc->nbytes += ATA_DMA_PAD_SZ - pad;
+		sg->length += ATA_DMA_PAD_SZ - pad;
+	}
So, you add the area way up in the stack and adjust the sg way down the
stack. This is too fragile. Actually, it's already broken, and it
demonstrates very well why the block layer shouldn't do half of the job
and defer the rest to the lower layer.
With draining, the above code ends up extending the sg entry for the
drain page by pad bytes, yet there are no extra bytes allocated after
the drain page. The controller will end up contaminating pad bytes of
the next page, and the resulting corruption will be extremely difficult
to reproduce and track down.
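One hedged sketch of an ordering that avoids the contamination: apply
the pad to the last real data entry before the drain entry is appended
(n_data_elem, drain_needed, drain_buf and drain_len are hypothetical
names, not fields of the posted patch):

	/* pad the final data entry first ... */
	if (pad) {
		struct scatterlist *last = &qc->sg[n_data_elem - 1];

		last->length += ATA_DMA_PAD_SZ - pad;
		qc->nbytes   += ATA_DMA_PAD_SZ - pad;
	}
	/* ... then append the drain entry, so the pad can never spill
	 * past the drain page */
	if (drain_needed)
		sg_set_buf(&qc->sg[n_elem++], drain_buf, drain_len);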
>> Not all controllers have a place to set 'default length'. They just run
>> till sg list terminates.
>
> True ... but the traversal of the sg list terminates when the data
> transfer finishes ... this is the basis of your drain patch (and mine).
> Basically we don't require that anything transfer into the drain, but
> it's there just in case the transfer overruns and it needs to be used.
The thing is that there is no way to detect a data overrun if the sg
list doesn't terminate. For data transfer commands, the sg list must
terminate precisely at the request boundary. If a 4k drain page is
automatically appended, the controller will happily overflow into that
4k without notifying anyone. Or worse, the device can suck in a few
extra blocks without triggering an error. The sgl should never contain
extra entries for any command which can affect data integrity.
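A sketch of the kind of predicate this implies - append the drain only
for command types where a device-side overrun is legal (roughly the
shape a later libata helper took; the details are assumptions here):

	static int sketch_drain_needed(struct request *rq)
	{
		/* filesystem requests must never get extra entries */
		if (!blk_pc_request(rq))
			return 0;
		/* nor writes or zero-length commands */
		if (!rq->data_len || (rq->cmd_flags & REQ_RW))
			return 0;
		/* only misc ATAPI packet commands may legally overrun */
		return atapi_cmd_type(rq->cmd[0]) == ATAPI_MISC;
	}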
>> libata definitely isn't ready for that and I don't think expanding the
>> behavior to the whole block layer is a wise thing to do when the best
>> practice can be bought at the cost of 4 bytes per request. No?
>
> but the whole basis of both patches is inconsistent lengths: we're
> expecting a transfer of X, but we program the element lists for Y (>X)
> and see if the transfer actually needs >X (but hopefully <Y).
Yes, it's dual-sizing. There's the size the upper layer requested and
there's the slightly different size the lower layer wants to see (most
of the time). The block layer sits in between and matches the impedance
by extending the request as necessary, but from either side the picture
is still consistent. The lower layer (mostly) just sees ->data_len and
a matching sgl. The upper layer sends and receives the data length it
requested.
The difference is that with your approach, what the lower layer sees
becomes inconsistent in itself, half of the impedance matching is left
to the lower layer, and that will cause nasty problems down the road.
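Concretely, the four bytes buy a second length field so each side keeps
an internally consistent view; a sketch of the idea (the raw_data_len
name is an assumption, not a settled interface):

	struct request {
		/* ... */
		unsigned int data_len;		/* padded/drained size the
						   LLD programs; matches
						   the sgl */
		unsigned int raw_data_len;	/* size the submitter asked
						   for; completed upward */
		/* ... */
	};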
Thanks.
--
tejun
Thread overview: 33+ messages
2007-12-31 21:56 [PATCH] libata: eliminate the home grown dma padding in favour of that provided by the block layer James Bottomley
2007-12-31 22:56 ` Jeff Garzik
2008-01-03 7:58 ` FUJITA Tomonori
2008-01-03 15:12 ` James Bottomley
2008-01-09 2:10 ` Tejun Heo
2008-01-09 4:24 ` James Bottomley
2008-01-09 5:13 ` Tejun Heo
2008-01-09 15:13 ` James Bottomley
2008-01-18 23:14 ` [PATCH RESEND] " James Bottomley
2008-02-01 19:40 ` [PATCH RESEND number 2] " James Bottomley
2008-02-01 20:02 ` Jeff Garzik
2008-02-01 21:09 ` James Bottomley
2008-02-03 3:04 ` Tejun Heo
2008-02-03 4:32 ` James Bottomley
2008-02-03 7:37 ` Tejun Heo
2008-02-03 14:38 ` James Bottomley
2008-02-03 15:14 ` Tejun Heo
2008-02-03 16:12 ` James Bottomley
2008-02-03 16:38 ` Jeff Garzik
2008-02-03 17:12 ` James Bottomley
2008-02-04 1:21 ` Tejun Heo
2008-02-04 1:28 ` Tejun Heo
2008-02-04 9:25 ` Tejun Heo
2008-02-04 14:43 ` Tejun Heo
2008-02-04 16:23 ` James Bottomley
2008-02-05 0:06 ` Tejun Heo
2008-02-05 0:32 ` James Bottomley
2008-02-05 0:43 ` Tejun Heo
2008-02-05 0:53 ` James Bottomley
2008-02-05 1:07 ` Tejun Heo
2008-02-05 5:03 ` James Bottomley
2008-02-05 5:22 ` Tejun Heo
2008-02-04 15:43 ` James Bottomley