linux-scsi.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] add bidi support for block pc requests
@ 2007-05-08  2:25 FUJITA Tomonori
  2007-05-08 18:53 ` Boaz Harrosh
  0 siblings, 1 reply; 43+ messages in thread
From: FUJITA Tomonori @ 2007-05-08  2:25 UTC (permalink / raw)
  To: James.Bottomley; +Cc: linux-scsi, bhalevy, jens.axboe, hch, akpm, michaelc

Here is an updated version of the patch to add bidi support to block
pc requests. Bugs spotted by Benny were fixed.

This patch can be applied cleanly to the scsi-misc git tree and is on
the top of the following patch to add linked request support:

http://marc.info/?l=linux-scsi&m=117835587615642&w=2

---
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 7 May 2007 16:42:24 +0900
Subject: [PATCH] add bidi support to scsi pc commands

This adds bidi support to block pc requests.

A bidi request uses req->next_rq pointer for an in request.

This patch introduces a new structure, scsi_data_buffer to hold the
data buffer information. To avoid making the scsi_cmnd structure fatter, the
scsi mid-layer uses cmnd->request->next_rq->special pointer for
a scsi_data_buffer structure. LLDs don't touch the second request
(req->next_rq) so it's safe to use req->special.

scsi_blk_pc_done() always completes the whole command so
scsi_end_request() simply completes the bidi chunk too.

A helper function, scsi_bidi_data_buffer(), is for LLDs to access
the scsi_data_buffer structure easily.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
---
 drivers/scsi/scsi_lib.c  |  128 +++++++++++++++++++++++++++++++++++++++------
 include/scsi/scsi_cmnd.h |   14 +++++
 2 files changed, 125 insertions(+), 17 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 61fbcdc..ba874a6 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -66,6 +66,12 @@ static struct scsi_host_sg_pool scsi_sg_pools[] = {
 
 static void scsi_run_queue(struct request_queue *q);
 
+struct scsi_data_buffer *scsi_bidi_data_buffer(struct scsi_cmnd *cmd)
+{
+	return blk_bidi_rq(cmd->request) ? cmd->request->next_rq->special : NULL;
+}
+EXPORT_SYMBOL(scsi_bidi_data_buffer);
+
 /*
  * Function:	scsi_unprep_request()
  *
@@ -81,10 +87,14 @@ static void scsi_run_queue(struct request_queue *q);
 static void scsi_unprep_request(struct request *req)
 {
 	struct scsi_cmnd *cmd = req->special;
+	struct scsi_data_buffer *sdb = scsi_bidi_data_buffer(cmd);
 
 	req->cmd_flags &= ~REQ_DONTPREP;
 	req->special = NULL;
-
+	if (sdb) {
+		kfree(sdb);
+		req->next_rq->special = NULL;
+	}
 	scsi_put_command(cmd);
 }
 
@@ -657,6 +667,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int uptodate,
 	request_queue_t *q = cmd->device->request_queue;
 	struct request *req = cmd->request;
 	unsigned long flags;
+	struct scsi_data_buffer *sdb = scsi_bidi_data_buffer(cmd);
 
 	/*
 	 * If there are blocks left over at the end, set up the command
@@ -685,6 +696,14 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int uptodate,
 		}
 	}
 
+	/*
+	 * a REQ_BLOCK_PC command is always completed fully so just
+	 * end the bidi chunk.
+	 */
+	if (sdb)
+		end_that_request_chunk(req->next_rq, uptodate,
+				       sdb->request_bufflen);
+
 	add_disk_randomness(req->rq_disk);
 
 	spin_lock_irqsave(q->queue_lock, flags);
@@ -701,34 +720,35 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int uptodate,
 	return NULL;
 }
 
-struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask)
+static struct scatterlist *do_scsi_alloc_sgtable(unsigned short use_sg,
+						 unsigned short *sglist_len,
+						 gfp_t gfp_mask)
 {
 	struct scsi_host_sg_pool *sgp;
-	struct scatterlist *sgl;
 
-	BUG_ON(!cmd->use_sg);
+	BUG_ON(!use_sg);
 
-	switch (cmd->use_sg) {
+	switch (use_sg) {
 	case 1 ... 8:
-		cmd->sglist_len = 0;
+		*sglist_len = 0;
 		break;
 	case 9 ... 16:
-		cmd->sglist_len = 1;
+		*sglist_len = 1;
 		break;
 	case 17 ... 32:
-		cmd->sglist_len = 2;
+		*sglist_len = 2;
 		break;
 #if (SCSI_MAX_PHYS_SEGMENTS > 32)
 	case 33 ... 64:
-		cmd->sglist_len = 3;
+		*sglist_len = 3;
 		break;
 #if (SCSI_MAX_PHYS_SEGMENTS > 64)
 	case 65 ... 128:
-		cmd->sglist_len = 4;
+		*sglist_len = 4;
 		break;
 #if (SCSI_MAX_PHYS_SEGMENTS  > 128)
 	case 129 ... 256:
-		cmd->sglist_len = 5;
+		*sglist_len = 5;
 		break;
 #endif
 #endif
@@ -737,11 +757,14 @@ struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask)
 		return NULL;
 	}
 
-	sgp = scsi_sg_pools + cmd->sglist_len;
-	sgl = mempool_alloc(sgp->pool, gfp_mask);
-	return sgl;
+	sgp = scsi_sg_pools + *sglist_len;
+	return mempool_alloc(sgp->pool, gfp_mask);
 }
 
+struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask)
+{
+	return do_scsi_alloc_sgtable(cmd->use_sg, &cmd->sglist_len, gfp_mask);
+}
 EXPORT_SYMBOL(scsi_alloc_sgtable);
 
 void scsi_free_sgtable(struct scatterlist *sgl, int index)
@@ -775,6 +798,8 @@ EXPORT_SYMBOL(scsi_free_sgtable);
  */
 static void scsi_release_buffers(struct scsi_cmnd *cmd)
 {
+	struct scsi_data_buffer *sdb = scsi_bidi_data_buffer(cmd);
+
 	if (cmd->use_sg)
 		scsi_free_sgtable(cmd->request_buffer, cmd->sglist_len);
 
@@ -784,6 +809,15 @@ static void scsi_release_buffers(struct scsi_cmnd *cmd)
 	 */
 	cmd->request_buffer = NULL;
 	cmd->request_bufflen = 0;
+
+	if (sdb) {
+		if (sdb->use_sg)
+			scsi_free_sgtable(sdb->request_buffer, sdb->sglist_len);
+		sdb->request_buffer = NULL;
+		sdb->request_bufflen = 0;
+		kfree(sdb);
+		cmd->request->next_rq->special = NULL;
+	}
 }
 
 /*
@@ -834,6 +868,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 	}
 
 	if (blk_pc_request(req)) { /* SG_IO ioctl from block level */
+		struct scsi_data_buffer *sdb = scsi_bidi_data_buffer(cmd);
 		req->errors = result;
 		if (result) {
 			clear_errors = 0;
@@ -850,6 +885,8 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 			}
 		}
 		req->data_len = cmd->resid;
+		if (sdb)
+			req->next_rq->data_len = sdb->resid;
 	}
 
 	/*
@@ -1075,6 +1112,38 @@ static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev,
 	return cmd;
 }
 
+static int scsi_data_buffer_init(struct scsi_cmnd *cmd)
+{
+	struct scatterlist *sgpnt;
+	struct scsi_data_buffer *sdb = scsi_bidi_data_buffer(cmd);
+	struct request *req = cmd->request;
+	int count;
+
+	sdb->use_sg = req->next_rq->nr_phys_segments;
+	sgpnt = do_scsi_alloc_sgtable(sdb->use_sg, &sdb->sglist_len,
+				      GFP_ATOMIC);
+	if (unlikely(!sgpnt)) {
+		scsi_free_sgtable(cmd->request_buffer, cmd->sglist_len);
+		scsi_unprep_request(req);
+		return BLKPREP_DEFER;
+	}
+
+	req->next_rq->buffer = NULL;
+	sdb->request_buffer = (char *) sgpnt;
+	sdb->request_bufflen = req->next_rq->data_len;
+
+	count = blk_rq_map_sg(req->q, req->next_rq, sgpnt);
+	if (likely(count <= sdb->use_sg)) {
+		sdb->use_sg = count;
+		return BLKPREP_OK;
+	}
+
+	scsi_release_buffers(cmd);
+	scsi_put_command(cmd);
+
+	return BLKPREP_KILL;
+}
+
 static void scsi_blk_pc_done(struct scsi_cmnd *cmd)
 {
 	BUG_ON(!blk_pc_request(cmd->request));
@@ -1090,10 +1159,22 @@ static void scsi_blk_pc_done(struct scsi_cmnd *cmd)
 static int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
 {
 	struct scsi_cmnd *cmd;
+	struct scsi_data_buffer *sdb = NULL;
+
+	if (blk_bidi_rq(req)) {
+		sdb = kzalloc(sizeof(*sdb), GFP_ATOMIC);
+		if (unlikely(!sdb))
+			return BLKPREP_DEFER;
+	}
 
 	cmd = scsi_get_cmd_from_req(sdev, req);
-	if (unlikely(!cmd))
+	if (unlikely(!cmd)) {
+		kfree(sdb);
 		return BLKPREP_DEFER;
+	}
+
+	if (sdb)
+		req->next_rq->special = sdb;
 
 	/*
 	 * BLOCK_PC requests may transfer data, in which case they must
@@ -1109,6 +1190,12 @@ static int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
 		ret = scsi_init_io(cmd);
 		if (unlikely(ret))
 			return ret;
+
+		if (sdb) {
+			ret = scsi_data_buffer_init(cmd);
+			if (ret != BLKPREP_OK)
+				return ret;
+		}
 	} else {
 		BUG_ON(req->data_len);
 		BUG_ON(req->data);
@@ -1122,13 +1209,15 @@ static int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
 	BUILD_BUG_ON(sizeof(req->cmd) > sizeof(cmd->cmnd));
 	memcpy(cmd->cmnd, req->cmd, sizeof(cmd->cmnd));
 	cmd->cmd_len = req->cmd_len;
-	if (!req->data_len)
+	if (sdb)
+		cmd->sc_data_direction = DMA_BIDIRECTIONAL;
+	else if (!req->data_len)
 		cmd->sc_data_direction = DMA_NONE;
 	else if (rq_data_dir(req) == WRITE)
 		cmd->sc_data_direction = DMA_TO_DEVICE;
 	else
 		cmd->sc_data_direction = DMA_FROM_DEVICE;
-	
+
 	cmd->transfersize = req->data_len;
 	cmd->allowed = req->retries;
 	cmd->timeout_per_command = req->timeout;
@@ -1178,6 +1267,11 @@ static int scsi_prep_fn(struct request_queue *q, struct request *req)
 	struct scsi_device *sdev = q->queuedata;
 	int ret = BLKPREP_OK;
 
+	if (WARN_ON(!blk_pc_request(req) && blk_bidi_rq(req))) {
+		ret = BLKPREP_KILL;
+		goto out;
+	}
+
 	/*
 	 * If the device is not in running state we will reject some
 	 * or all commands.
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index a2e0c10..597c48c 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -28,6 +28,18 @@ struct scsi_pointer {
 	volatile int phase;
 };
 
+struct scsi_data_buffer {
+	unsigned short use_sg;		/* Number of pieces of scatter-gather */
+	unsigned short sglist_len;	/* size of malloc'd scatter-gather list */
+	void *request_buffer;		/* Actual requested buffer */
+	unsigned request_bufflen;	/* Actual request size */
+	/*
+	 * Number of bytes requested to be transferred less actual
+	 * number transferred (0 if not supported)
+	*/
+	int resid;
+};
+
 struct scsi_cmnd {
 	struct scsi_device *device;
 	struct list_head list;  /* scsi_cmnd participates in queue lists */
@@ -135,4 +147,6 @@ extern void scsi_kunmap_atomic_sg(void *virt);
 extern struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t);
 extern void scsi_free_sgtable(struct scatterlist *, int);
 
+extern struct scsi_data_buffer *scsi_bidi_data_buffer(struct scsi_cmnd *);
+
 #endif /* _SCSI_SCSI_CMND_H */
-- 
1.4.4.3


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-08  2:25 [PATCH v2] add bidi support for block pc requests FUJITA Tomonori
@ 2007-05-08 18:53 ` Boaz Harrosh
  2007-05-08 20:01   ` James Bottomley
  0 siblings, 1 reply; 43+ messages in thread
From: Boaz Harrosh @ 2007-05-08 18:53 UTC (permalink / raw)
  To: FUJITA Tomonori, James.Bottomley
  Cc: linux-scsi, bhalevy, jens.axboe, hch, akpm, michaelc

[-- Attachment #1: Type: text/plain, Size: 4543 bytes --]

FUJITA Tomonori wrote:
> Here is an updated version of the patch to add bidi support to block
> pc requests. Bugs spotted by Benny were fixed.
> 
> This patch can be applied cleanly to the scsi-misc git tree and is on
> the top of the following patch to add linked request support:
> 
> http://marc.info/?l=linux-scsi&m=117835587615642&w=2
> 
> ---
> From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
> Date: Mon, 7 May 2007 16:42:24 +0900
> Subject: [PATCH] add bidi support to scsi pc commands
> 
> This adds bidi support to block pc requests.
> 
> A bidi request uses req->next_rq pointer for an in request.
> 
> This patch introduces a new structure, scsi_data_buffer to hold the
> data buffer information. To avoid make scsi_cmnd structure fatter, the
> scsi mid-layer uses cmnd->request->next_rq->special pointer for
> a scsi_data_buffer structure. LLDs don't touch the second request
> (req->next_rq) so it's safe to use req->special.
> 
> scsi_blk_pc_done() always completes the whole command so
> scsi_end_request() simply completes the bidi chunk too.
> 
> A helper function, scsi_bidi_data_buffer() is for LLDs to access to
> the scsi_data_buffer structure easily.
> 
> Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
> ---
> @@ -685,6 +696,14 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int uptodate,
>  		}
>  	}
>  
> +	/*
> +	 * a REQ_BLOCK_PC command is always completed fully so just do
> +	 * end the bidi chunk.
> +	 */
> +	if (sdb)
> +		end_that_request_chunk(req->next_rq, uptodate,
> +				       sdb->request_bufflen);
> +

sdb->request_bufflen was zeroed in scsi_release_buffers which is called before
scsi_end_request.

>  static void scsi_blk_pc_done(struct scsi_cmnd *cmd)
>  {
>  	BUG_ON(!blk_pc_request(cmd->request));
> @@ -1090,10 +1159,22 @@ static void scsi_blk_pc_done(struct scsi_cmnd *cmd)
>  static int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
>  {
>  	struct scsi_cmnd *cmd;
> +	struct scsi_data_buffer *sdb = NULL;
> +
> +	if (blk_bidi_rq(req)) {
> +		sdb = kzalloc(sizeof(*sdb), GFP_ATOMIC);
> +		if (unlikely(!sdb))
> +			return BLKPREP_DEFER;
> +	}
>  
>  	cmd = scsi_get_cmd_from_req(sdev, req);
> -	if (unlikely(!cmd))
> +	if (unlikely(!cmd)) {
> +		kfree(sdb);
>  		return BLKPREP_DEFER;
> +	}
> +
> +	if (sdb)
> +		req->next_rq->special = sdb;
>  
>  	/*
>  	 * BLOCK_PC requests may transfer data, in which case they must

Before I get to my main concern here I have one comment. in scsi_get_cmd_from_req()
there is a code path in which a scsi_cmnd is taken from special and is not newly
allocated. It is best to move bidi allocation to scsi_get_cmd_from_req() and allocate
new sdb only if we get a new Command. (See my attached patch for an example)

OK! My main concern is with kzalloc(sizeof(*sdb), GFP_ATOMIC);
All IO allocations should come from slabs in case of a memory starved system.
There are 3 possible solutions I see:
1. Use struct scsi_cmnd for bidi_read and not a special scsi_data_buffer.
   Attached is above solution. It is by far the simplest of the three.
   So simple that even Jens's BigIO patch can almost apply at scsi_lib.c. (and vice versa)
   What's hanged on the request->next_rq->special is a second scsi_cmnd.
   The command is taken from regular command pool. This solution, though
   wasteful of some memory is very stable.

2. Force the users of BIDI to allocate scsi_data_buffer(s)
   Users will allocate a slab for scsi_data_buffers and hang them on next_rq->special before
   hand. Then free them together with the second request. This approach can be very stable, But
   it is bad because it is a layering violation. When block and SCSI layers decide to change
   the wiring of bidi. Users will have to change with them.

3. Let SCSI-ml manage a slab for scsi_data_buff's
   Have a flag to  scsi_setup_command_freelist() or a new API. When a device is flagged
   bidi-able we could 1-Allocate bigger slots to have extra memory for SDBs together
   with the command itself. or 2-allocate yet another slab for SDBs.

The 3rd approach is clean, and I can submit a patch for it. But I think it is not currently
necessary. The first solution (See attached patch) we can all live with, I hope. Since for
me it gives me the stability I need. And for the rest of the kernel it is the minimum
change, layered on existing building blocks.

If any one wants to try it out I put up the usual patchset that exercise this approach here.
http://www.bhalevy.com/open-osd/download/double_rq_bidi

Thanks
Boaz


[-- Attachment #2: 0002-scsi-ml-bidi-support.patch --]
[-- Type: text/plain, Size: 10884 bytes --]

>From ac8f0d3df711c5d01a269fde6a4ecce3000c3f68 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 8 May 2007 14:04:46 +0300
Subject: [PATCH] scsi-ml bidi support

    At the block level bidi request uses req->next_rq pointer for a second
    bidi_read request.
    At Scsi-midlayer a second scsi_cmnd structure is used for the sglists
    of the bidi_read part. This command is put on request->next_rq->special.
    So struct scsi_cmnd is not changed. And scsi-ml minimally changes.

    - Define scsi_end_bidi_request() to do what scsi_end_request() does but for a
      bidi request. This is necessary because bidi commands are a bit tricky here.
      (See comments in body)

    - scsi_release_buffers() will also release the bidi_read buffers.
      I have changed the check from "if (cmd->use_sg)" to
      "if(cmd->request_buffer)" because it can now be called on half
      prepared commands. (and use_sg is no longer relevant here)

    - scsi_io_completion() will now call scsi_end_bidi_request on bidi commands
      and return.

    - The previous work done in scsi_init_io() is now done in a new
      scsi_init_data_buf() (which is 95% identical to old scsi_init_io())
      The new scsi_init_io() will call the above twice if needed also for
      the bidi_read command.

    - scsi_get_cmd_from_req() (called from scsi_setup_blk_pc_cmnd()) in the
      case that a new command is allocated, will also allocate a bidi_read
      command and hang it on cmd->request->next_rq->special. Only at this
      point is a command bidi.

    - scsi_setup_blk_pc_cmnd() will update sc_dma_direction to DMA_BIDIRECTIONAL
      which is not correct. It is only here for compatibility with iSCSI bidi
      driver. At final patch this will be a DMA_TO_DEVICE for this command and
      DMA_FROM_DEVICE for the bidi_read command. A driver must call scsi_bidi_cmnd()
      to work on bidi commands.

    - Define scsi_bidi_cmnd() to return true if it is a bidi command and a
      second command was allocated.

    - Define scsi_in()/scsi_out() to return the in or out commands from this command
      This API is to isolate users from the mechanics of bidi, and is also a short
      hand to common used code. (Even in scsi_lib.c)

    - In scsi_error.c at scsi_send_eh_cmnd() make sure bidi-lld is not confused by
      a get-sense command that looks like bidi. This is done by putting NULL at
      request->next_rq, and restoring before exit.
---
 drivers/scsi/scsi_error.c |    5 ++
 drivers/scsi/scsi_lib.c   |  133 ++++++++++++++++++++++++++++++++++++++++++---
 include/scsi/scsi_cmnd.h  |   19 ++++++-
 3 files changed, 148 insertions(+), 9 deletions(-)

diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index e8350c5..169f4b4 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -620,6 +620,7 @@ static int scsi_send_eh_cmnd(struct scsi_cmnd *scmd, unsigned char *cmnd,
 	unsigned char old_cmd_len;
 	unsigned old_bufflen;
 	void *old_buffer;
+	struct request* old_next_rq;
 	int rtn;
 
 	/*
@@ -636,6 +637,9 @@ static int scsi_send_eh_cmnd(struct scsi_cmnd *scmd, unsigned char *cmnd,
 	old_cmd_len = scmd->cmd_len;
 	old_use_sg = scmd->use_sg;
 
+	old_next_rq = scmd->request->next_rq;
+	scmd->request->next_rq = NULL;
+
 	memset(scmd->cmnd, 0, sizeof(scmd->cmnd));
 	memcpy(scmd->cmnd, cmnd, cmnd_size);
 
@@ -734,6 +738,7 @@ static int scsi_send_eh_cmnd(struct scsi_cmnd *scmd, unsigned char *cmnd,
 	/*
 	 * Restore original data
 	 */
+	scmd->request->next_rq = old_next_rq;
 	scmd->request_buffer = old_buffer;
 	scmd->request_bufflen = old_bufflen;
 	memcpy(scmd->cmnd, old_cmnd, sizeof(scmd->cmnd));
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 61fbcdc..0f49195 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -85,6 +85,10 @@ static void scsi_unprep_request(struct request *req)
 	req->cmd_flags &= ~REQ_DONTPREP;
 	req->special = NULL;
 
+	if (scsi_bidi_cmnd(cmd)) {
+		scsi_put_command(scsi_in(cmd));
+		cmd->request->next_rq->special = NULL;
+	}
 	scsi_put_command(cmd);
 }
 
@@ -701,6 +705,52 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int uptodate,
 	return NULL;
 }
 
+/*
+ * Bidi commands Must be complete as a whole, both sides at once.
+ * If part of the bytes were written and lld returned
+ * scsi_in()->resid or scsi_out()->resid this information will be left
+ * in req->data_len and req->next_rq->data_len. The upper-layer driver can
+ * decide what to do with this information.
+ */
+void scsi_end_bidi_request(struct scsi_cmnd *cmd)
+{
+	struct scsi_cmnd* bidi_in = scsi_in(cmd);
+	request_queue_t *q = cmd->device->request_queue;
+	struct request *req = cmd->request;
+	unsigned long flags;
+
+	end_that_request_chunk(req, 1, req->data_len);
+	req->data_len = cmd->resid;
+	end_that_request_chunk(req->next_rq, 1, req->next_rq->data_len);
+	req->next_rq->data_len = bidi_in->resid;
+
+	req->next_rq->special = NULL;
+	scsi_put_command(bidi_in);
+
+	/*
+	 *FIXME: If ll_rw_blk.c is changed to also put_request(req->next_rq)
+	 *       in end_that_request_last() then this WARN_ON must be removed.
+	 *       Otherwise, upper-driver must have registered an end_io.
+	 */
+	WARN_ON(!req->end_io);
+
+	/* FIXME: the following code can be shared with scsi_end_request */
+	add_disk_randomness(req->rq_disk);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (blk_rq_tagged(req))
+		blk_queue_end_tag(q, req);
+
+	end_that_request_last(req, 1); /* always good for now */
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	/*
+	 * This will goose the queue request function at the end, so we don't
+	 * need to worry about launching another command.
+	 */
+	scsi_next_command(cmd);
+}
+
 struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask)
 {
 	struct scsi_host_sg_pool *sgp;
@@ -775,7 +825,7 @@ EXPORT_SYMBOL(scsi_free_sgtable);
  */
 static void scsi_release_buffers(struct scsi_cmnd *cmd)
 {
-	if (cmd->use_sg)
+	if (cmd->request_buffer)
 		scsi_free_sgtable(cmd->request_buffer, cmd->sglist_len);
 
 	/*
@@ -784,6 +834,15 @@ static void scsi_release_buffers(struct scsi_cmnd *cmd)
 	 */
 	cmd->request_buffer = NULL;
 	cmd->request_bufflen = 0;
+
+	if (scsi_bidi_cmnd(cmd)) {
+		struct scsi_cmnd* bidi_in = scsi_in(cmd);
+		if (bidi_in->request_buffer)
+			scsi_free_sgtable(bidi_in->request_buffer,
+			                           bidi_in->sglist_len);
+		bidi_in->request_buffer = NULL;
+		bidi_in->request_bufflen = 0;
+	}
 }
 
 /*
@@ -849,9 +908,15 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 				req->sense_len = len;
 			}
 		}
+		if (scsi_bidi_cmnd(cmd)) {
+			scsi_end_bidi_request(cmd);
+			return;
+		}
 		req->data_len = cmd->resid;
 	}
 
+	BUG_ON(blk_bidi_rq(req));
+
 	/*
 	 * Next deal with any sectors which we were able to correctly
 	 * handle.
@@ -978,17 +1043,18 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 EXPORT_SYMBOL(scsi_io_completion);
 
 /*
- * Function:    scsi_init_io()
+ * Function:    scsi_init_data_buf()
  *
- * Purpose:     SCSI I/O initialize function.
+ * Purpose:     SCSI I/O initialize helper.
+ *              maps the request buffers for the given cmd.
  *
- * Arguments:   cmd   - Command descriptor we wish to initialize
+ * Arguments:   cmd   - Command whose data we wish to initialize
  *
  * Returns:     0 on success
  *		BLKPREP_DEFER if the failure is retryable
  *		BLKPREP_KILL if the failure is fatal
  */
-static int scsi_init_io(struct scsi_cmnd *cmd)
+static int scsi_init_data_buff(struct scsi_cmnd *cmd)
 {
 	struct request     *req = cmd->request;
 	struct scatterlist *sgpnt;
@@ -1032,12 +1098,45 @@ static int scsi_init_io(struct scsi_cmnd *cmd)
 	printk(KERN_ERR "req nr_sec %lu, cur_nr_sec %u\n", req->nr_sectors,
 			req->current_nr_sectors);
 
-	/* release the command and kill it */
-	scsi_release_buffers(cmd);
-	scsi_put_command(cmd);
 	return BLKPREP_KILL;
 }
 
+/*
+ * Function:    scsi_init_io()
+ *
+ * Purpose:     SCSI I/O initialize function.
+ *
+ * Arguments:   cmd   - Command descriptor we wish to initialize
+ *
+ * Returns:     0 on success
+ *		BLKPREP_DEFER if the failure is retryable
+ *		BLKPREP_KILL if the failure is fatal
+ */
+static int scsi_init_io(struct scsi_cmnd *cmd)
+{
+	struct request *req = cmd->request;
+	int error;
+
+	error = scsi_init_data_buff(cmd);
+	if (error)
+		goto err_exit;
+
+	if (scsi_bidi_cmnd(cmd)) {
+		error = scsi_init_data_buff(scsi_in(cmd));
+		if (error)
+			goto err_exit;
+	}
+
+	req->buffer = NULL;
+	return 0;
+
+err_exit:
+	scsi_release_buffers(cmd);
+	if (error == BLKPREP_KILL)
+		scsi_unprep_request(req);
+	return error;
+}
+
 static int scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk,
 			       sector_t *error_sector)
 {
@@ -1064,6 +1163,22 @@ static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev,
 		if (unlikely(!cmd))
 			return NULL;
 		req->special = cmd;
+		if (blk_bidi_rq(req)) {
+			struct scsi_cmnd *bidi_in_cmd;
+			/*
+			 * second bidi request must be blk_pc_request for use of
+			 * correct sizes.
+			 */
+			WARN_ON(!blk_pc_request(req->next_rq));
+
+			bidi_in_cmd = scsi_get_command(sdev, GFP_ATOMIC);
+			if (unlikely(!bidi_in_cmd)) {
+				scsi_put_command(cmd);
+				return NULL;
+			}
+			req->next_rq->special = bidi_in_cmd;
+			bidi_in_cmd->request = req->next_rq;
+		}
 	} else {
 		cmd = req->special;
 	}
@@ -1124,6 +1239,8 @@ static int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
 	cmd->cmd_len = req->cmd_len;
 	if (!req->data_len)
 		cmd->sc_data_direction = DMA_NONE;
+	else if (blk_bidi_rq(req))
+		cmd->sc_data_direction = DMA_BIDIRECTIONAL;
 	else if (rq_data_dir(req) == WRITE)
 		cmd->sc_data_direction = DMA_TO_DEVICE;
 	else
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index a2e0c10..1223412 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -2,11 +2,11 @@
 #define _SCSI_SCSI_CMND_H
 
 #include <linux/dma-mapping.h>
+#include <linux/blkdev.h>
 #include <linux/list.h>
 #include <linux/types.h>
 #include <linux/timer.h>
 
-struct request;
 struct scatterlist;
 struct Scsi_Host;
 struct scsi_device;
@@ -135,4 +135,21 @@ extern void scsi_kunmap_atomic_sg(void *virt);
 extern struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t);
 extern void scsi_free_sgtable(struct scatterlist *, int);
 
+static inline int scsi_bidi_cmnd(struct scsi_cmnd *cmd)
+{
+	return blk_bidi_rq(cmd->request) &&
+	       (cmd->request->next_rq->special != NULL);
+}
+
+static inline struct scsi_cmnd *scsi_in(struct scsi_cmnd *cmd)
+{
+	return scsi_bidi_cmnd(cmd) ?
+	       cmd->request->next_rq->special : cmd;
+}
+
+static inline struct scsi_cmnd *scsi_out(struct scsi_cmnd *cmd)
+{
+	return cmd;
+}
+
 #endif /* _SCSI_SCSI_CMND_H */
-- 
1.5.0.4.402.g8035


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-08 18:53 ` Boaz Harrosh
@ 2007-05-08 20:01   ` James Bottomley
  2007-05-09  7:46     ` Boaz Harrosh
  2007-05-09 10:49     ` FUJITA Tomonori
  0 siblings, 2 replies; 43+ messages in thread
From: James Bottomley @ 2007-05-08 20:01 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: FUJITA Tomonori, linux-scsi, bhalevy, jens.axboe, hch, akpm,
	michaelc

On Tue, 2007-05-08 at 21:53 +0300, Boaz Harrosh wrote:
> Before I get to my main concern here I have one comment. in scsi_get_cmd_from_req()
> there is a code path in which a scsi_cmnd is taken from special and is not newly
> allocated. It is best to move bidi allocation to scsi_get_cmd_from_req() and allocate
> new sdb only if we get a new Command. (See my attached patch for an example)
> 
> OK! My main concern is with kzalloc(sizeof(*sdb), GFP_ATOMIC);
> All IO allocations should come from slabs in case of a memory starved system.

I think you'll find that kzalloc comes directly out of a slab for this
size of allocation anyway ... you mean you want to see a dedicated pool
for this specific allocation?

> There are 3 possible solutions I see:
> 1. Use struct scsi_cmnd for bidi_read and not a special scsi_data_buffer.
>    Attached is above solution. It is by far the simplest of the three.
>    So simple that even Jens's BigIO patch can almost apply at scsi_lib.c. (and vise versa)
>    What's hanged on the request->next_rq->special is a second scsi_cmnd.
>    The command is taken from regular command pool. This solution, though
>    wasteful of some memory is very stable.

There's another problem in that it destroys our forward progress
guarantee.  There's always a single reserve command for every HBA so
that forward progress for freeing memory can always be made in the
system even if the command slab is out and we have to reclaim memory
through a HBA with no outstanding commands.  Allocating two commands per
bidirectional request hoses that guarantee ... it could be fixed up by
increasing the reserve pool to 2, but that's adding further unwanted
complexity ...

> 2. Force the users of BIDI to allocate scsi_data_buffer(s)
>    Users will allocate a slab for scsi_data_buffers and hang them on nex_rq->special before
>    hand. Than free them together with second request. This approach can be very stable, But
>    it is bad because it is a layering violation. When block and SCSI layers decide to change
>    the wiring of bidi. Users will have to change with them.

I Agree.

> 3. Let SCSI-ml manage a slab for scsi_data_buff's
>    Have a flag to  scsi_setup_command_freelist() or a new API. When a device is flagged
>    bidi-able we could 1-Allocate bigger slots to have extra memory for SDBs together
>    with the command itself. or 2-allocate yet another slab for SDBs.

if you're worried about allocations, doing a single allocate is better

> The 3rd approach is clean, and I can submit a patch for it. But I think it is not currently
> necessary. The first solution (See attached patch) we can all live with, I hope. Since for
> me it gives me the stability I need. And for the rest of the kernel it is the minimum
> change, layered on existing building blocks.


There's actually a fourth option you haven't considered:

Roll all the required sglist definitions (request_bufflen,
request_buffer, use_sg and sglist_len) into the sgtable pools.

We're getting very close to the point where someone gets to sweep
through the drivers eliminating the now superfluous non-sg path in the
queuecommand.  When that happens the only cases become no transfer or SG
backed commands.  At this point we can do a consolidation of the struct
scsi_cmnd fields.  This does represent the ideal time to sweep the sg
list handling fields into the sgtable and simply have a single pointer
to struct sgtable in the scsi_cmnd (== NULL is the signal for a no
transfer command).

James



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-08 20:01   ` James Bottomley
@ 2007-05-09  7:46     ` Boaz Harrosh
  2007-05-09 10:52       ` FUJITA Tomonori
  2007-05-16 17:29       ` Boaz Harrosh
  2007-05-09 10:49     ` FUJITA Tomonori
  1 sibling, 2 replies; 43+ messages in thread
From: Boaz Harrosh @ 2007-05-09  7:46 UTC (permalink / raw)
  To: James Bottomley
  Cc: FUJITA Tomonori, linux-scsi, bhalevy, jens.axboe, hch, akpm,
	michaelc

James Bottomley wrote:
> I think you'll find that kzalloc comes directly out of a slab for this
> size of allocation anyway ... you mean you want to see a dedicated pool
> for this specific allocation?
Yes. As you said below, so we can always send IO for "forward progress
of freeing memory". My test machine is a Linux cluster in front of a
pNFS over OSD. The HPC cluster is diskless. It will reach this
situation very fast.

> There's another problem in that it destroys our forward progress
> guarantee.  There's always a single reserve command for every HBA so
> that forward progress for freeing memory can always be made in the
> system even if the command slab is out and we have to reclaim memory
> through a HBA with no outstanding commands.  Allocating two commands per
> bidirectional request hoses that guarantee ... it could be fixed up by
> increasing the reserve pool to 2, but that's adding further unwanted
> complexity ...
> 
Thanks for catching it! I was afraid of that. If we stick with this solution
in the interim until we do what you suggested below, we will need to put one
more for bidi. It should not be a complicated pool thing, just a reserved one
for the bidi case.

> 
> There's actually a fourth option you haven't considered:
> 
> Roll all the required sglist definitions (request_bufflen,
> request_buffer, use_sg and sglist_len) into the sgtable pools.
> 
> We're getting very close to the point where someone gets to sweep
> through the drivers eliminating the now superfluous non-sg path in the
> queuecommand.  When that happens the only cases become no transfer or SG
> backed commands.  At this point we can do a consolidation of the struct
> scsi_cmnd fields.  This does represent the ideal time to sweep the sg
> list handling fields into the sgtable and simply have a single pointer
> to struct sgtable in the scsi_cmnd (== NULL is the signal for a no
> transfer command).
> 
This is a great idea. Let me see if I understand what you mean.
1. An sgtable is a single allocation with an sgtable header type
   at the beginning and a variable-size array of struct scatterlist.
   something like:
   struct sgtable {
   	struct sgtable_header {
		unsigned sg_count, sglist_len, length;
		struct sgtable* next; //for Jens's big io
   	} hdr;
	struct scatterlist sglist[];
   }
   Slabs are put up for above sgtable of different sizes as
   done today. (Should they be sized on different ARCHs to
   align on page boundaries?)

2. The way we can do this in stages: Meaning put up code that has
   both sets of API, Transfer drivers one-by-one to new API, deprecate
   old API for a kernel cycle or two. Then submit the last piece of
   code that removes the old API.
   It can be done. We just need to copy sgtable_header fields
   to the old fields, and let them stick around for a while.

3. The second bidi sgtable will hang on request->next_rq->special.

> James
> 
> 
If everyone agrees on something like above. I can do it right away.
It's a solution I wouldn't even dream of. Thanks

Boaz

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-08 20:01   ` James Bottomley
  2007-05-09  7:46     ` Boaz Harrosh
@ 2007-05-09 10:49     ` FUJITA Tomonori
  1 sibling, 0 replies; 43+ messages in thread
From: FUJITA Tomonori @ 2007-05-09 10:49 UTC (permalink / raw)
  To: James.Bottomley
  Cc: bharrosh, fujita.tomonori, linux-scsi, bhalevy, jens.axboe, hch,
	akpm, michaelc

From: James Bottomley <James.Bottomley@SteelEye.com>
Subject: Re: [PATCH v2] add bidi support for block pc requests
Date: Tue, 08 May 2007 15:01:37 -0500

> Roll all the required sglist definitions (request_bufflen,
> request_buffer, use_sg and sglist_len) into the sgtable pools.
> 
> We're getting very close to the point where someone gets to sweep
> through the drivers eliminating the now superfluous non-sg path in the
> queuecommand.

The non-use-sg case is still alive?

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-09  7:46     ` Boaz Harrosh
@ 2007-05-09 10:52       ` FUJITA Tomonori
  2007-05-09 13:58         ` Boaz Harrosh
  2007-05-16 17:29       ` Boaz Harrosh
  1 sibling, 1 reply; 43+ messages in thread
From: FUJITA Tomonori @ 2007-05-09 10:52 UTC (permalink / raw)
  To: bharrosh
  Cc: James.Bottomley, fujita.tomonori, linux-scsi, bhalevy, jens.axboe,
	hch, akpm, michaelc

From: Boaz Harrosh <bharrosh@panasas.com>
Subject: Re: [PATCH v2] add bidi support for block pc requests
Date: Wed, 09 May 2007 10:46:34 +0300

> > Roll all the required sglist definitions (request_bufflen,
> > request_buffer, use_sg and sglist_len) into the sgtable pools.
> > 
> > We're getting very close to the point where someone gets to sweep
> > through the drivers eliminating the now superfluous non-sg path in the
> > queuecommand.  When that happens the only cases become no transfer or SG
> > backed commands.  At this point we can do a consolidation of the struct
> > scsi_cmnd fields.  This does represent the ideal time to sweep the sg
> > list handling fields into the sgtable and simply have a single pointer
> > to struct sgtable in the scsi_cmnd (== NULL is the signal for a no
> > transfer command).
> > 
> This is a grate Idea. Let me see if I understand what you mean.
> 1. An sgtable is a single allocation with an sgtable header type
>    at the begining and a veriable size array of struct scatterlist.
>    something like:
>    struct sgtable {
>    	struct sgtable_header {
> 		unsigned sg_count, sglist_len, length;
> 		struct sgtable* next; //for Jens's big io
>    	} hdr;
> 	struct scatterlist sglist[];
>    }

Can we have more simple sgtable?

struct sgtable {
	unsigned use_sg;
	unsigned length;
	unsigned sglist_len;
	struct scatterlist sglist[0];
};


Then we could do something like this:

struct scsi_host_sgtable_pool {
	size_t size;
	char *name;
	struct kmem_cache *slab;
	mempool_t *pool;
};

int __init scsi_init_queue(void)
{
	for (i = 0; i < SG_MEMPOOL_NR; i++) {
		struct scsi_host_sgtable_pool *sgtbp = scsi_sgtable_pools + i;
		int size = sizeof(struct sgtable) + sgp->size * sizeof(struct scatterlist);

		sgp->slab = kmem_cache_create(sgp->name, size, 0,
				SLAB_HWCACHE_ALIGN, NULL, NULL);
		if (!sgp->slab) {
			printk(KERN_ERR "SCSI: can't init sg slab %s\n",
					sgp->name);
		}

		sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
						     sgp->slab);


Jens' chaining sg lists adds sg->next so we don't need
sgtable->next. We can just add __use_sg to struct sgtable.


> 2. The way we can do this in stages: Meaning put up code that has
>    both sets of API, Transfer drivers one-by-one to new API, deprecate
>    old API for a kernel cycle or two. Than submit last piece of
>    code that removes the old API.
>    It can be done. We just need to copy sgtable_header fields
>    to the old fields, and let them stick around for a while.

That's not bad, but can we convert all the LLDs all at once?

The changes to scsi mid-layer are almost done. Probably, you can just
add sgtable stuff (and scsi_dma_map/scsi_dma_unmap helper functions
that Christoph proposed long ago) to my previous patch. It can be done
for several hours. So you have enough time for the LLDs' changes.


> 3. The second bidi sgtable will hang on request->next_rq->special.

I think so.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-09 10:52       ` FUJITA Tomonori
@ 2007-05-09 13:58         ` Boaz Harrosh
  2007-05-09 14:54           ` FUJITA Tomonori
  2007-05-09 14:55           ` James Bottomley
  0 siblings, 2 replies; 43+ messages in thread
From: Boaz Harrosh @ 2007-05-09 13:58 UTC (permalink / raw)
  To: FUJITA Tomonori, James.Bottomley, jens.axboe
  Cc: linux-scsi, bhalevy, hch, akpm, michaelc

FUJITA Tomonori wrote:
> From: Boaz Harrosh <bharrosh@panasas.com>
> Subject: Re: [PATCH v2] add bidi support for block pc requests
> Date: Wed, 09 May 2007 10:46:34 +0300
> 
>>> Roll all the required sglist definitions (request_bufflen,
>>> request_buffer, use_sg and sglist_len) into the sgtable pools.
>>>
>>> We're getting very close to the point where someone gets to sweep
>>> through the drivers eliminating the now superfluous non-sg path in the
>>> queuecommand.  When that happens the only cases become no transfer or SG
>>> backed commands.  At this point we can do a consolidation of the struct
>>> scsi_cmnd fields.  This does represent the ideal time to sweep the sg
>>> list handling fields into the sgtable and simply have a single pointer
>>> to struct sgtable in the scsi_cmnd (== NULL is the signal for a no
>>> transfer command).
>>>
>> This is a grate Idea. Let me see if I understand what you mean.
>> 1. An sgtable is a single allocation with an sgtable header type
>>    at the begining and a veriable size array of struct scatterlist.
>>    something like:
>>    struct sgtable {
>>    	struct sgtable_header {
>> 		unsigned sg_count, sglist_len, length;
>> 		struct sgtable* next; //for Jens's big io
>>    	} hdr;
>> 	struct scatterlist sglist[];
>>    }
> 
> Can we have more simple sgtable?
> 
> struct sgtable {
> 	unsigned use_sg;
> 	unsigned length;
> 	unsigned sglist_len;
> 	struct scatterlist sglist[0];
> };
> 
Yes sure. It was just an example.
One comment, use_sg => sg_count.
By the way I have forgotten some fields so it should be:

struct sgtable {
 	unsigned short sg_count;
 	unsigned short sg_pool; /* note that this is the pool index and not the actual count */
 	unsigned length;
 	unsigned resid;
 	struct scatterlist sglist[0];
};

resid/length together with request->data_len can be optimized, this is the current system.

> 
> Then we could do something like this:
> 
> struct scsi_host_sgtable_pool {
> 	size_t size;
> 	char *name;
> 	struct kmem_cache *slab;
> 	mempool_t *pool;
> };
> 
> int __init scsi_init_queue(void)
> {
> 	for (i = 0; i < SG_MEMPOOL_NR; i++) {
> 		struct scsi_host_sgtable_pool *sgtbp = scsi_sgtable_pools + i;
> 		int size = sizeof(struct sgtable) + sgp->size * sizeof(struct scatterlist);
> 
> 		sgp->slab = kmem_cache_create(sgp->name, size, 0,
> 				SLAB_HWCACHE_ALIGN, NULL, NULL);
> 		if (!sgp->slab) {
> 			printk(KERN_ERR "SCSI: can't init sg slab %s\n",
> 					sgp->name);
> 		}
> 
> 		sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
> 						     sgp->slab);
> 
> 
I think we can do a better job here by fitting exactly the number of scatterlist
entries that will take up full pages including the header. This is sizes
dependent and can be compile-time calculated. For example in x86_64, with header,
145 scatterlist will fit in a page so this is kind of magic number for this arch.

could someone explain why the scatterlist max count needs to be a power-of-2 number?

> Jens' chaining sg lists adds sg->next so we don't need
> sgtable->next. We can just add __use_sg to struct sgtable.
> 
Yes but it uses the last struct scatterlist for the ->next, this way it is saved.
On the other hand it wastes a pointer in small IOs. So I guess this is Jens's
call.
If the "->next" in both cases will point to a struct sgtable above than we don't need
__use_sg since sg_pool(sglist_len) holds the pool this came from. If not __use_sg
must be added to struct sgtable.

It looks like the proposed change races with Jens's work, so it's better done after
his. It would be nice if he incorporates some of the constraints from here
beforehand.

> 
>> 2. The way we can do this in stages: Meaning put up code that has
>>    both sets of API, Transfer drivers one-by-one to new API, deprecate
>>    old API for a kernel cycle or two. Than submit last piece of
>>    code that removes the old API.
>>    It can be done. We just need to copy sgtable_header fields
>>    to the old fields, and let them stick around for a while.
> 
> That's not bad, but can we convert all the LLDs all at once?
> 
> The changes to scsi mid-layer are almost done. Probabaly, you can just
> add sgtable stuff (and scsi_dma_map/scsi_dma_unmap helper functions
> that Christoph proposed long ago) to my previous patch. It can be done
> for several hours. So you have enough time for the LLDs' changes.
I am here to do any work needed. I will wait to see what is decided.

I already have a 2.6.20 tree with all llds converted to things like
"scsi_uni(cmd)->sg" where scsi_uni() was pointing to:
struct scsi_data_buffer {
 	unsigned short sg_count;
 	unsigned short sg_pool; /* note that this is the pool index and not the actual count */
 	unsigned length;
 	unsigned resid;
 	struct scatterlist *sg;
};
as part of my old bidi work. So a simple search and replace can be done to the new API/names.
> 
> 
>> 3. The second bidi sgtable will hang on request->next_rq->special.
> 
> I think so.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-09 13:58         ` Boaz Harrosh
@ 2007-05-09 14:54           ` FUJITA Tomonori
  2007-05-09 14:55           ` James Bottomley
  1 sibling, 0 replies; 43+ messages in thread
From: FUJITA Tomonori @ 2007-05-09 14:54 UTC (permalink / raw)
  To: bharrosh
  Cc: fujita.tomonori, James.Bottomley, jens.axboe, linux-scsi, bhalevy,
	hch, akpm, michaelc

From: Boaz Harrosh <bharrosh@panasas.com>
Subject: Re: [PATCH v2] add bidi support for block pc requests
Date: Wed, 09 May 2007 16:58:24 +0300

> FUJITA Tomonori wrote:
> > From: Boaz Harrosh <bharrosh@panasas.com>
> > Subject: Re: [PATCH v2] add bidi support for block pc requests
> > Date: Wed, 09 May 2007 10:46:34 +0300
> > 
> >>> Roll all the required sglist definitions (request_bufflen,
> >>> request_buffer, use_sg and sglist_len) into the sgtable pools.
> >>>
> >>> We're getting very close to the point where someone gets to sweep
> >>> through the drivers eliminating the now superfluous non-sg path in the
> >>> queuecommand.  When that happens the only cases become no transfer or SG
> >>> backed commands.  At this point we can do a consolidation of the struct
> >>> scsi_cmnd fields.  This does represent the ideal time to sweep the sg
> >>> list handling fields into the sgtable and simply have a single pointer
> >>> to struct sgtable in the scsi_cmnd (== NULL is the signal for a no
> >>> transfer command).
> >>>
> >> This is a grate Idea. Let me see if I understand what you mean.
> >> 1. An sgtable is a single allocation with an sgtable header type
> >>    at the begining and a veriable size array of struct scatterlist.
> >>    something like:
> >>    struct sgtable {
> >>    	struct sgtable_header {
> >> 		unsigned sg_count, sglist_len, length;
> >> 		struct sgtable* next; //for Jens's big io
> >>    	} hdr;
> >> 	struct scatterlist sglist[];
> >>    }
> > 
> > Can we have more simple sgtable?
> > 
> > struct sgtable {
> > 	unsigned use_sg;
> > 	unsigned length;
> > 	unsigned sglist_len;
> > 	struct scatterlist sglist[0];
> > };
> > 
> Yes sure. It was just an example.
> One comment, use_sg => sg_count.
> By the way I have forgotten some fields so it should be:
> 
> struct sgtable {
>  	unsigned short sg_count;
>  	unsigned short sg_pool; /* note that this is the pool index and not the actual count */
>  	unsigned length;
>  	unsigned resid;
>  	struct scatterlist sglist[0];
> };
> 
> resid/length together with request->data_len can be optimized, this is the current system.
> 
> > 
> > Then we could do something like this:
> > 
> > struct scsi_host_sgtable_pool {
> > 	size_t size;
> > 	char *name;
> > 	struct kmem_cache *slab;
> > 	mempool_t *pool;
> > };
> > 
> > int __init scsi_init_queue(void)
> > {
> > 	for (i = 0; i < SG_MEMPOOL_NR; i++) {
> > 		struct scsi_host_sgtable_pool *sgtbp = scsi_sgtable_pools + i;
> > 		int size = sizeof(struct sgtable) + sgp->size * sizeof(struct scatterlist);
> > 
> > 		sgp->slab = kmem_cache_create(sgp->name, size, 0,
> > 				SLAB_HWCACHE_ALIGN, NULL, NULL);
> > 		if (!sgp->slab) {
> > 			printk(KERN_ERR "SCSI: can't init sg slab %s\n",
> > 					sgp->name);
> > 		}
> > 
> > 		sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
> > 						     sgp->slab);
> > 
> > 
> I think we can do a better job here by fitting exactly the number of scatterlist
> entries that will take up full pages including the header. This is sizes
> dependent and can be compile-time calculated. For example in x86_64, with header,
> 145 scatterlist will fit in a page so this is kind of magic number for this arch.
> 
> could someone explain why we need scatterlist-max-count a base-2 number?

To let the slab allocator do a better job?

We can improve these issues later without disturbing LLDs. No need to
do this right now.


> > Jens' chaining sg lists adds sg->next so we don't need
> > sgtable->next. We can just add __use_sg to struct sgtable.
> > 
> Yes but it uses the last struct scatterlist for the ->next, this way
> it is saved.  On the other hand it wastes a pointer in small IOs. So
> I guess this is Jens's call.  If the "->next" in both cases will
> point to a struct sgtable above

I think that some non scsi stuff also need chaining sg. so sg needs a
chaining scheme; sg->next, crypto layer's hack (not to add sg->next),
etc. Instead of using sg's chaining, having sgtable->next is wrong.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-09 13:58         ` Boaz Harrosh
  2007-05-09 14:54           ` FUJITA Tomonori
@ 2007-05-09 14:55           ` James Bottomley
  2007-05-09 16:54             ` Boaz Harrosh
  1 sibling, 1 reply; 43+ messages in thread
From: James Bottomley @ 2007-05-09 14:55 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: FUJITA Tomonori, jens.axboe, linux-scsi, bhalevy, hch, akpm,
	michaelc

On Wed, 2007-05-09 at 16:58 +0300, Boaz Harrosh wrote:
> >> 1. An sgtable is a single allocation with an sgtable header type
> >>    at the begining and a veriable size array of struct scatterlist.
> >>    something like:
> >>    struct sgtable {
> >>    	struct sgtable_header {
> >> 		unsigned sg_count, sglist_len, length;
> >> 		struct sgtable* next; //for Jens's big io
> >>    	} hdr;
> >> 	struct scatterlist sglist[];
> >>    }
> > 
> > Can we have more simple sgtable?
> > 
> > struct sgtable {
> > 	unsigned use_sg;
> > 	unsigned length;
> > 	unsigned sglist_len;
> > 	struct scatterlist sglist[0];
> > };
> > 
> Yes sure. It was just an example.
> One comment, use_sg => sg_count.
> By the way I have forgotten some fields so it should be:
> 
> struct sgtable {
>  	unsigned short sg_count;
>  	unsigned short sg_pool; /* note that this is the pool index and not the actual count */
>  	unsigned length;
>  	unsigned resid;
>  	struct scatterlist sglist[0];
> };
> 
> resid/length together with request->data_len can be optimized, this is the current system.

Actually, the first order of business is to use accessors on the command
pointers in the drivers to free them from the internal layout of the
structure (and where it is allocated).

> > 
> > Then we could do something like this:
> > 
> > struct scsi_host_sgtable_pool {
> > 	size_t size;
> > 	char *name;
> > 	struct kmem_cache *slab;
> > 	mempool_t *pool;
> > };
> > 
> > int __init scsi_init_queue(void)
> > {
> > 	for (i = 0; i < SG_MEMPOOL_NR; i++) {
> > 		struct scsi_host_sgtable_pool *sgtbp = scsi_sgtable_pools + i;
> > 		int size = sizeof(struct sgtable) + sgp->size * sizeof(struct scatterlist);
> > 
> > 		sgp->slab = kmem_cache_create(sgp->name, size, 0,
> > 				SLAB_HWCACHE_ALIGN, NULL, NULL);
> > 		if (!sgp->slab) {
> > 			printk(KERN_ERR "SCSI: can't init sg slab %s\n",
> > 					sgp->name);
> > 		}
> > 
> > 		sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
> > 						     sgp->slab);
> > 
> > 
> I think we can do a better job here by fitting exactly the number of scatterlist
> entries that will take up full pages including the header. This is sizes
> dependent and can be compile-time calculated. For example in x86_64, with header,
> 145 scatterlist will fit in a page so this is kind of magic number for this arch.
> 
> could someone explain why we need scatterlist-max-count a base-2 number?

Power of 2 is the most efficient way to manage space vs pool number in
an arbitrarily sized system.  This, too, can be accessor abstracted ...

> > Jens' chaining sg lists adds sg->next so we don't need
> > sgtable->next. We can just add __use_sg to struct sgtable.
> > 
> Yes but it uses the last struct scatterlist for the ->next, this way it is saved.
> On the other hand it wastes a pointer in small IOs. So I guess this is Jens's
> call.
> If the "->next" in both cases will point to a struct sgtable above than we don't need
> __use_sg since sg_pool(sglist_len) holds the pool this came from. If not __use_sg
> must be added to struct sgtable.
> 
> It looks like proposed change races with Jens's work so it's better be done after
> his. It could be nice if he incorporates some of the constrains from here before
> hand.
> 
> > 
> >> 2. The way we can do this in stages: Meaning put up code that has
> >>    both sets of API, Transfer drivers one-by-one to new API, deprecate
> >>    old API for a kernel cycle or two. Than submit last piece of
> >>    code that removes the old API.
> >>    It can be done. We just need to copy sgtable_header fields
> >>    to the old fields, and let them stick around for a while.
> > 
> > That's not bad, but can we convert all the LLDs all at once?

No, that's why you do the accessors.  Convert all of the common drivers
to accessors on the current structure, then throw the switch to convert
to the new structure (whatever form is finally settled on).  Then any
unconverted drivers break and people fix the ones they care about.

> > The changes to scsi mid-layer are almost done. Probabaly, you can just
> > add sgtable stuff (and scsi_dma_map/scsi_dma_unmap helper functions
> > that Christoph proposed long ago) to my previous patch. It can be done
> > for several hours. So you have enough time for the LLDs' changes.
> I am here to do any work needed. I will wait to see what is decided.
> 
> I already have a 2.6.20 tree with all llds converted to things like
> "scsi_uni(cmd)->sg" where scsi_uni() was pointing to:
> struct scsi_data_buffer {
>  	unsigned short sg_count;
>  	unsigned short sg_pool; /* note that this is the pool index and not the actual count */
>  	unsigned length;
>  	unsigned resid;
>  	struct scatterlist *sg;
> };
> as part of my old bidi work. So a simple search and replace can be done to the new API/names.
> > 
> > 
> >> 3. The second bidi sgtable will hang on request->next_rq->special.
> > 
> > I think so.

James



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-09 14:55           ` James Bottomley
@ 2007-05-09 16:54             ` Boaz Harrosh
  2007-05-10  6:53               ` FUJITA Tomonori
  0 siblings, 1 reply; 43+ messages in thread
From: Boaz Harrosh @ 2007-05-09 16:54 UTC (permalink / raw)
  To: James Bottomley
  Cc: FUJITA Tomonori, jens.axboe, linux-scsi, bhalevy, hch, akpm,
	michaelc

James Bottomley wrote:
> Actually, the first order of business is to use accessors on the command
> pointers in the drivers to free them from the internal layout of the
> structure (and where it is allocated).
> 
Thanks! I totally second that. Let me look into my old patches and come
up with all the needed accessors. I hope 3-5 will be enough.
I will send some suggestions tomorrow.
> 
> No, that's why you do the accessors.  Convert all of the common drivers
> to accessors on the current structure, then throw the switch to convert
> to the new structure (whatever form is finally settled on).  Then any
> unconverted drivers break and people fix the ones they care about.

Last time I was able to compile 97% of drivers and convert by search-and-replace
the rest. Not a huge deal.

> 
> James
> 
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-09 16:54             ` Boaz Harrosh
@ 2007-05-10  6:53               ` FUJITA Tomonori
  2007-05-10  7:45                 ` FUJITA Tomonori
  0 siblings, 1 reply; 43+ messages in thread
From: FUJITA Tomonori @ 2007-05-10  6:53 UTC (permalink / raw)
  To: bharrosh
  Cc: James.Bottomley, fujita.tomonori, jens.axboe, linux-scsi, bhalevy,
	hch, akpm, michaelc

From: Boaz Harrosh <bharrosh@panasas.com>
Subject: Re: [PATCH v2] add bidi support for block pc requests
Date: Wed, 09 May 2007 19:54:32 +0300

> James Bottomley wrote:
> > Actually, the first order of business is to use accessors on the command
> > pointers in the drivers to free them from the internal layout of the
> > structure (and where it is allocated).
> > 
> Thanks! I totally second that. Let me look into my old patches and come
> up with all the needed accessors. I hope 3-5 will be enough.
> I will send some suggestions tomorrow.
> > 
> > No, that's why you do the accessors.  Convert all of the common drivers
> > to accessors on the current structure, then throw the switch to convert
> > to the new structure (whatever form is finally settled on).  Then any
> > unconverted drivers break and people fix the ones they care about.
> 
> Last time I was able to compile 97% of drivers and convert by search-and-replace
> the rest. Not a huge deal.

We need to remove the non-use-sg code in the drivers and convert
them. So it's a bit more complicated than search-and-replace.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-10  6:53               ` FUJITA Tomonori
@ 2007-05-10  7:45                 ` FUJITA Tomonori
  2007-05-10 12:37                   ` Boaz Harrosh
  0 siblings, 1 reply; 43+ messages in thread
From: FUJITA Tomonori @ 2007-05-10  7:45 UTC (permalink / raw)
  To: bharrosh
  Cc: James.Bottomley, jens.axboe, linux-scsi, bhalevy, hch, akpm,
	michaelc

From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Subject: Re: [PATCH v2] add bidi support for block pc requests
Date: Thu, 10 May 2007 15:53:22 +0900

> From: Boaz Harrosh <bharrosh@panasas.com>
> Subject: Re: [PATCH v2] add bidi support for block pc requests
> Date: Wed, 09 May 2007 19:54:32 +0300
> 
> > James Bottomley wrote:
> > > Actually, the first order of business is to use accessors on the command
> > > pointers in the drivers to free them from the internal layout of the
> > > structure (and where it is allocated).
> > > 
> > Thanks! I totally second that. Let me look into my old patches and come
> > up with all the needed accessors. I hope 3-5 will be enough.
> > I will send some suggestions tomorrow.
> > > 
> > > No, that's why you do the accessors.  Convert all of the common drivers
> > > to accessors on the current structure, then throw the switch to convert
> > > to the new structure (whatever form is finally settled on).  Then any
> > > unconverted drivers break and people fix the ones they care about.
> > 
> > Last time I was able to compile 97% of drivers and convert by search-and-replace
> > the rest. Not a huge deal.
> 
> We need to remove the non-use-sg code in the drivers and convert
> them. So it's a bit more complicated than search-and-replace.

Here's a patch to convert ibmvscsi to use helper functions (though it
needs more testings).

---
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index fbc1d5c..fb764ff 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -353,20 +353,18 @@ static void unmap_cmd_data(struct srp_cm
 	}
 }
 
-static int map_sg_list(int num_entries, 
-		       struct scatterlist *sg,
-		       struct srp_direct_buf *md)
+static int map_sg_list(struct scsi_cmnd *cmd, int nseg, struct srp_direct_buf *md)
 {
 	int i;
+	struct scatterlist *sg;
 	u64 total_length = 0;
 
-	for (i = 0; i < num_entries; ++i) {
+	scsi_for_each_sg(cmd, nseg, i) {
 		struct srp_direct_buf *descr = md + i;
-		struct scatterlist *sg_entry = &sg[i];
-		descr->va = sg_dma_address(sg_entry);
-		descr->len = sg_dma_len(sg_entry);
+		descr->va = sg_dma_address(sg);
+		descr->len = sg_dma_len(sg);
 		descr->key = 0;
-		total_length += sg_dma_len(sg_entry);
+		total_length += sg_dma_len(sg);
  	}
 	return total_length;
 }
@@ -384,43 +382,39 @@ static int map_sg_data(struct scsi_cmnd
 		       struct srp_event_struct *evt_struct,
 		       struct srp_cmd *srp_cmd, struct device *dev)
 {
-
 	int sg_mapped;
 	u64 total_length = 0;
-	struct scatterlist *sg = cmd->request_buffer;
 	struct srp_direct_buf *data =
 		(struct srp_direct_buf *) srp_cmd->add_data;
 	struct srp_indirect_buf *indirect =
 		(struct srp_indirect_buf *) data;
 
-	sg_mapped = dma_map_sg(dev, sg, cmd->use_sg, DMA_BIDIRECTIONAL);
-
-	if (sg_mapped == 0)
+	sg_mapped = scsi_dma_map(dev, cmd);
+	if (!sg_mapped)
+		return 1;
+	else if (sg_mapped < 0)
 		return 0;
+	else if (sg_mapped > SG_ALL) {
+		printk(KERN_ERR
+		       "ibmvscsi: More than %d mapped sg entries, got %d\n",
+		       SG_ALL, sg_mapped);
+		return 0;
+	}
 
 	set_srp_direction(cmd, srp_cmd, sg_mapped);
 
 	/* special case; we can use a single direct descriptor */
 	if (sg_mapped == 1) {
-		data->va = sg_dma_address(&sg[0]);
-		data->len = sg_dma_len(&sg[0]);
-		data->key = 0;
+		map_sg_list(cmd, sg_mapped, data);
 		return 1;
 	}
 
-	if (sg_mapped > SG_ALL) {
-		printk(KERN_ERR
-		       "ibmvscsi: More than %d mapped sg entries, got %d\n",
-		       SG_ALL, sg_mapped);
-		return 0;
-	}
-
 	indirect->table_desc.va = 0;
 	indirect->table_desc.len = sg_mapped * sizeof(struct srp_direct_buf);
 	indirect->table_desc.key = 0;
 
 	if (sg_mapped <= MAX_INDIRECT_BUFS) {
-		total_length = map_sg_list(sg_mapped, sg,
+		total_length = map_sg_list(cmd, sg_mapped,
 					   &indirect->desc_list[0]);
 		indirect->len = total_length;
 		return 1;
@@ -429,61 +423,27 @@ static int map_sg_data(struct scsi_cmnd
 	/* get indirect table */
 	if (!evt_struct->ext_list) {
 		evt_struct->ext_list = (struct srp_direct_buf *)
-			dma_alloc_coherent(dev, 
+			dma_alloc_coherent(dev,
 					   SG_ALL * sizeof(struct srp_direct_buf),
 					   &evt_struct->ext_list_token, 0);
 		if (!evt_struct->ext_list) {
 			printk(KERN_ERR
 			       "ibmvscsi: Can't allocate memory for indirect table\n");
 			return 0;
-			
 		}
 	}
 
-	total_length = map_sg_list(sg_mapped, sg, evt_struct->ext_list);	
+	total_length = map_sg_list(cmd, sg_mapped, evt_struct->ext_list);
 
 	indirect->len = total_length;
 	indirect->table_desc.va = evt_struct->ext_list_token;
 	indirect->table_desc.len = sg_mapped * sizeof(indirect->desc_list[0]);
 	memcpy(indirect->desc_list, evt_struct->ext_list,
 	       MAX_INDIRECT_BUFS * sizeof(struct srp_direct_buf));
-	
  	return 1;
 }
 
 /**
- * map_single_data: - Maps memory and initializes memory decriptor fields
- * @cmd:	struct scsi_cmnd with the memory to be mapped
- * @srp_cmd:	srp_cmd that contains the memory descriptor
- * @dev:	device for which to map dma memory
- *
- * Called by map_data_for_srp_cmd() when building srp cmd from scsi cmd.
- * Returns 1 on success.
-*/
-static int map_single_data(struct scsi_cmnd *cmd,
-			   struct srp_cmd *srp_cmd, struct device *dev)
-{
-	struct srp_direct_buf *data =
-		(struct srp_direct_buf *) srp_cmd->add_data;
-
-	data->va =
-		dma_map_single(dev, cmd->request_buffer,
-			       cmd->request_bufflen,
-			       DMA_BIDIRECTIONAL);
-	if (dma_mapping_error(data->va)) {
-		printk(KERN_ERR
-		       "ibmvscsi: Unable to map request_buffer for command!\n");
-		return 0;
-	}
-	data->len = cmd->request_bufflen;
-	data->key = 0;
-
-	set_srp_direction(cmd, srp_cmd, 1);
-
-	return 1;
-}
-
-/**
  * map_data_for_srp_cmd: - Calls functions to map data for srp cmds
  * @cmd:	struct scsi_cmnd with the memory to be mapped
  * @srp_cmd:	srp_cmd that contains the memory descriptor
@@ -513,11 +473,7 @@ static int map_data_for_srp_cmd(struct s
 		return 0;
 	}
 
-	if (!cmd->request_buffer)
-		return 1;
-	if (cmd->use_sg)
-		return map_sg_data(cmd, evt_struct, srp_cmd, dev);
-	return map_single_data(cmd, srp_cmd, dev);
+	return map_sg_data(cmd, evt_struct, srp_cmd, dev);
 }
 
 /* ------------------------------------------------------------
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9f7482d..dd6fdf6 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2292,3 +2292,29 @@ void scsi_kunmap_atomic_sg(void *virt)
 	kunmap_atomic(virt, KM_BIO_SRC_IRQ);
 }
 EXPORT_SYMBOL(scsi_kunmap_atomic_sg);
+
+int scsi_dma_map(struct device *dev, struct scsi_cmnd *cmd)
+{
+	int nseg = 0;
+
+	if (cmd->use_sg) {
+		struct scatterlist *sg =
+			(struct scatterlist *) cmd->request_buffer;
+
+		nseg = dma_map_sg(dev, sg, cmd->use_sg, cmd->sc_data_direction);
+		if (unlikely(!nseg))
+			return -ENOMEM;
+	}
+	return nseg;
+}
+EXPORT_SYMBOL(scsi_dma_map);
+
+void scsi_dma_unmap(struct device *dev, struct scsi_cmnd *cmd)
+{
+	if (cmd->use_sg) {
+		struct scatterlist *sg =
+			(struct scatterlist *) cmd->request_buffer;
+		dma_unmap_sg(dev, sg, cmd->use_sg, cmd->sc_data_direction);
+	}
+}
+EXPORT_SYMBOL(scsi_dma_unmap);
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index d6948d0..799f204 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -138,4 +138,17 @@ extern void scsi_kunmap_atomic_sg(void *
 extern struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t);
 extern void scsi_free_sgtable(struct scatterlist *, int);
 
+extern int scsi_dma_map(struct device *dev, struct scsi_cmnd *cmd);
+extern void scsi_dma_unmap(struct device *dev, struct scsi_cmnd *cmd);
+
+/* moved to scatterlist.h after chaining sg */
+#define sg_next(sg) ((sg) + 1)
+
+#define scsi_for_each_sg(cmd, nseg, i)					\
+	for (i = 0, sg = (cmd)->request_buffer; i < nseg;		\
+		sg = sg_next(sg), i++)					\
+
+#define scsi_sg_count(cmd) ((cmd)->use_sg)
+#define scsi_bufferlen(cmd) ((cmd)->request_bufflen)
+
 #endif /* _SCSI_SCSI_CMND_H */

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-10  7:45                 ` FUJITA Tomonori
@ 2007-05-10 12:37                   ` Boaz Harrosh
  2007-05-10 13:01                     ` FUJITA Tomonori
  2007-05-10 15:10                     ` Douglas Gilbert
  0 siblings, 2 replies; 43+ messages in thread
From: Boaz Harrosh @ 2007-05-10 12:37 UTC (permalink / raw)
  To: FUJITA Tomonori
  Cc: James.Bottomley, jens.axboe, linux-scsi, bhalevy, hch, akpm,
	michaelc

FUJITA Tomonori wrote:
> From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
> Subject: Re: [PATCH v2] add bidi support for block pc requests
> Date: Thu, 10 May 2007 15:53:22 +0900
> 
>> From: Boaz Harrosh <bharrosh@panasas.com>
>> Subject: Re: [PATCH v2] add bidi support for block pc requests
>> Date: Wed, 09 May 2007 19:54:32 +0300
>>
>>> James Bottomley wrote:
>>>> Actually, the first order of business is to use accessors on the command
>>>> pointers in the drivers to free them from the internal layout of the
>>>> structure (and where it is allocated).
>>>>
>>> Thanks! I totally second that. Let me look into my old patches and come
>>> up with all the needed accessors. I hope 3-5 will be enough.
>>> I will send some suggestions tomorrow.
>>>> No, that's why you do the accessors.  Convert all of the common drivers
>>>> to accessors on the current structure, then throw the switch to convert
>>>> to the new structure (whatever form is finally settled on).  Then any
>>>> unconverted drivers break and people fix the ones they care about.
>>> Last time I was able to compile 97% of drivers and convert by search-and-replace
>>> the rest. Not a huge deal.
>> We need to remove the non-use-sg code in the drivers and convert
>> them. So it's a bit more complicated than search-and-replace.
> 
> Here's a patch to convert ibmvscsi to use helper functions (though it
> needs more testings).
> 
> ---
> ---
> diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
> index d6948d0..799f204 100644
> --- a/include/scsi/scsi_cmnd.h
> +++ b/include/scsi/scsi_cmnd.h
> @@ -138,4 +138,17 @@ extern void scsi_kunmap_atomic_sg(void *
>  extern struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t);
>  extern void scsi_free_sgtable(struct scatterlist *, int);
>  
> +extern int scsi_dma_map(struct device *dev, struct scsi_cmnd *cmd);
> +extern void scsi_dma_unmap(struct device *dev, struct scsi_cmnd *cmd);
> +
> +/* moved to scatterlist.h after chaining sg */
> +#define sg_next(sg) ((sg) + 1)
> +
> +#define scsi_for_each_sg(cmd, nseg, i)					\
> +	for (i = 0, sg = (cmd)->request_buffer; i < nseg;		\
> +		sg = sg_next(sg), i++)					\
> +

Better we do like Jens's patch
+#define for_each_sg(sglist, sg, nr, __i)	\
+	for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg))

I think that we should wait for Jens pending patchset and do this work on top
of his, then use his sg macros directly. This way the cleaners can also be
watchful for any drivers that might break with big I/O sent to them.

> +#define scsi_sg_count(cmd) ((cmd)->use_sg)
> +#define scsi_bufferlen(cmd) ((cmd)->request_bufflen)
> +
>  #endif /* _SCSI_SCSI_CMND_H */

Above helpers look good. However I am missing 2 of them:

1.
+#define scsi_sgl(cmd) ((struct scatterlist*)(cmd)->request_buffer)

This is for drivers like iSCSI that do not do any dma mapping, as dma
is done at the lower level in the NICs. And lots of other places that just
transfer the sglist around.

2.
+#define scsi_resid(cmd) ((cmd)->resid)

Boaz



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-10 12:37                   ` Boaz Harrosh
@ 2007-05-10 13:01                     ` FUJITA Tomonori
  2007-05-10 15:10                     ` Douglas Gilbert
  1 sibling, 0 replies; 43+ messages in thread
From: FUJITA Tomonori @ 2007-05-10 13:01 UTC (permalink / raw)
  To: jens.axboe, bharrosh
  Cc: fujita.tomonori, James.Bottomley, linux-scsi, bhalevy, hch, akpm,
	michaelc

From: Boaz Harrosh <bharrosh@panasas.com>
Subject: Re: [PATCH v2] add bidi support for block pc requests
Date: Thu, 10 May 2007 15:37:48 +0300

> > +/* moved to scatterlist.h after chaining sg */
> > +#define sg_next(sg) ((sg) + 1)
> > +
> > +#define scsi_for_each_sg(cmd, nseg, i)					\
> > +	for (i = 0, sg = (cmd)->request_buffer; i < nseg;		\
> > +		sg = sg_next(sg), i++)					\
> > +
> 
> Better we do like Jens's patch
> +#define for_each_sg(sglist, sg, nr, __i)	\
> +	for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg))
> 
> I think that we should wait for Jens pending patchset and do this work on top
> of his, then use his sg macros directly. This way the cleaners can also be
> watchfully for any drivers that might brake with big IO sent to them.

Yeah, we can use sg list's macro something like:

#define scsi_for_each_sg(cmd, sg, nseg, __i)			\
	for_each_sg((cmd)->request_buffer, sg, nseg, __i)	

Seems that Jens converted lots of LLDs to use for_each_sg. If
for_each_sg goes first, we can just replace for_each_sg (but after
all, we need to touch LLDs to remove the non-use-sg path). On the
other hand, if we go first, there is no conversion for for_each_sg in
LLDs.

Jens, do you want the sg list stuff done before our cleanups?

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-10 12:37                   ` Boaz Harrosh
  2007-05-10 13:01                     ` FUJITA Tomonori
@ 2007-05-10 15:10                     ` Douglas Gilbert
  2007-05-10 15:48                       ` Boaz Harrosh
  2007-05-11 16:26                       ` James Bottomley
  1 sibling, 2 replies; 43+ messages in thread
From: Douglas Gilbert @ 2007-05-10 15:10 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: FUJITA Tomonori, James.Bottomley, jens.axboe, linux-scsi, bhalevy,
	hch, akpm, michaelc

Boaz Harrosh wrote:
> FUJITA Tomonori wrote:
>> From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
>> Subject: Re: [PATCH v2] add bidi support for block pc requests
>> Date: Thu, 10 May 2007 15:53:22 +0900
>>
>>> From: Boaz Harrosh <bharrosh@panasas.com>
>>> Subject: Re: [PATCH v2] add bidi support for block pc requests
>>> Date: Wed, 09 May 2007 19:54:32 +0300
>>>
>>>> James Bottomley wrote:
>>>>> Actually, the first order of business is to use accessors on the command
>>>>> pointers in the drivers to free them from the internal layout of the
>>>>> structure (and where it is allocated).
>>>>>
>>>> Thanks! I totally second that. Let me look into my old patches and come
>>>> up with all the needed accessors. I hope 3-5 will be enough.
>>>> I will send some suggestions tomorrow.
>>>>> No, that's why you do the accessors.  Convert all of the common drivers
>>>>> to accessors on the current structure, then throw the switch to convert
>>>>> to the new structure (whatever form is finally settled on).  Then any
>>>>> unconverted drivers break and people fix the ones they care about.
>>>> Last time I was able to compile 97% of drivers and convert by search-and-replace
>>>> the rest. Not a huge deal.
>>> We need to remove the non-use-sg code in the drivers and convert
>>> them. So it's a bit more complicated than search-and-replace.
>> Here's a patch to convert ibmvscsi to use helper functions (though it
>> needs more testings).
>>
>> ---
>> ---
>> diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
>> index d6948d0..799f204 100644
>> --- a/include/scsi/scsi_cmnd.h
>> +++ b/include/scsi/scsi_cmnd.h
>> @@ -138,4 +138,17 @@ extern void scsi_kunmap_atomic_sg(void *
>>  extern struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t);
>>  extern void scsi_free_sgtable(struct scatterlist *, int);
>>  
>> +extern int scsi_dma_map(struct device *dev, struct scsi_cmnd *cmd);
>> +extern void scsi_dma_unmap(struct device *dev, struct scsi_cmnd *cmd);
>> +
>> +/* moved to scatterlist.h after chaining sg */
>> +#define sg_next(sg) ((sg) + 1)
>> +
>> +#define scsi_for_each_sg(cmd, nseg, i)					\
>> +	for (i = 0, sg = (cmd)->request_buffer; i < nseg;		\
>> +		sg = sg_next(sg), i++)					\
>> +
> 
> Better we do like Jens's patch
> +#define for_each_sg(sglist, sg, nr, __i)	\
> +	for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg))
> 
> I think that we should wait for Jens pending patchset and do this work on top
> of his, then use his sg macros directly. This way the cleaners can also be
> watchfully for any drivers that might brake with big IO sent to them.
> 
>> +#define scsi_sg_count(cmd) ((cmd)->use_sg)
>> +#define scsi_bufferlen(cmd) ((cmd)->request_bufflen)
>> +
>>  #endif /* _SCSI_SCSI_CMND_H */
> 
> Above helpers look good. However I am missing 2 of them:
> 
> 1.
> +#define scsi_sgl(cmd) ((struct scatterlist*)(cmd)->request_buffer)
> 
> This is for drivers like iSCSI that do not do any dma mapping, as dma
> is done at the lower level in the NICs. And lots of other places that just
> transfer the sglist around.
> 
> 2.
> +#define scsi_resid(cmd) ((cmd)->resid)

I have defined resid in the past as a signed (32 bit)
integer following the CAM spec. The cases are:
   - resid=0 : initiator's DMA engine got (or sent?) the
               number of bytes it expected
   - resid>0 : dxfer_len-resid bytes received, less than
               expected
   - resid<0 : more bytes received (or could have been)
               than indicated by dxfer_len

Some definitions of resid make it unsigned which rules out
the less common resid<0 case. Can this case happen? Does it
happen in practice? Naturally no more than dxfer_len should
be placed in the scatter gather list.

SPI and SAS do not convey dxfer_len (or data direction) to
a target in their transport frames. This means that the
relevant field in the cdb (e.g. transfer length) dictates
how much data a target sends back to an initiator in the
case of a read like operation. So that opens up the
possibility that dxfer_len and the cdb disagree.

FCP does convey dxfer_len and data_direction to a target.
So a FCP target can detect a mismatch between this and the
cdb and send resid information (either under or over) in its
response frame back to the initiator. Alternatively it
can treat the "over" case as an error (fcp3r04.pdf section
9.4.2).
iSCSI and SRP?

The resid<0 case may become more common if a driver or
application does not properly take account of protection
information being sent together with the requested
data.

So should we keep resid signed?

Doug Gilbert

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-10 15:10                     ` Douglas Gilbert
@ 2007-05-10 15:48                       ` Boaz Harrosh
  2007-05-11 16:26                       ` James Bottomley
  1 sibling, 0 replies; 43+ messages in thread
From: Boaz Harrosh @ 2007-05-10 15:48 UTC (permalink / raw)
  To: dougg
  Cc: FUJITA Tomonori, James.Bottomley, jens.axboe, linux-scsi, bhalevy,
	hch, akpm, michaelc

Douglas Gilbert wrote:
> Boaz Harrosh wrote:
>> FUJITA Tomonori wrote:
>>> From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
>>> Subject: Re: [PATCH v2] add bidi support for block pc requests
>>> Date: Thu, 10 May 2007 15:53:22 +0900
>>>
>>>> From: Boaz Harrosh <bharrosh@panasas.com>
>>>> Subject: Re: [PATCH v2] add bidi support for block pc requests
>>>> Date: Wed, 09 May 2007 19:54:32 +0300
>>>>
>>>>> James Bottomley wrote:
>>>>>> Actually, the first order of business is to use accessors on the command
>>>>>> pointers in the drivers to free them from the internal layout of the
>>>>>> structure (and where it is allocated).
>>>>>>
>>>>> Thanks! I totally second that. Let me look into my old patches and come
>>>>> up with all the needed accessors. I hope 3-5 will be enough.
>>>>> I will send some suggestions tomorrow.
>>>>>> No, that's why you do the accessors.  Convert all of the common drivers
>>>>>> to accessors on the current structure, then throw the switch to convert
>>>>>> to the new structure (whatever form is finally settled on).  Then any
>>>>>> unconverted drivers break and people fix the ones they care about.
>>>>> Last time I was able to compile 97% of drivers and convert by search-and-replace
>>>>> the rest. Not a huge deal.
>>>> We need to remove the non-use-sg code in the drivers and convert
>>>> them. So it's a bit more complicated than search-and-replace.
>>> Here's a patch to convert ibmvscsi to use helper functions (though it
>>> needs more testings).
>>>
>>> ---
>>> ---
>>> diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
>>> index d6948d0..799f204 100644
>>> --- a/include/scsi/scsi_cmnd.h
>>> +++ b/include/scsi/scsi_cmnd.h
>>> @@ -138,4 +138,17 @@ extern void scsi_kunmap_atomic_sg(void *
>>>  extern struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t);
>>>  extern void scsi_free_sgtable(struct scatterlist *, int);
>>>  
>>> +extern int scsi_dma_map(struct device *dev, struct scsi_cmnd *cmd);
>>> +extern void scsi_dma_unmap(struct device *dev, struct scsi_cmnd *cmd);
>>> +
>>> +/* moved to scatterlist.h after chaining sg */
>>> +#define sg_next(sg) ((sg) + 1)
>>> +
>>> +#define scsi_for_each_sg(cmd, nseg, i)					\
>>> +	for (i = 0, sg = (cmd)->request_buffer; i < nseg;		\
>>> +		sg = sg_next(sg), i++)					\
>>> +
>> Better we do like Jens's patch
>> +#define for_each_sg(sglist, sg, nr, __i)	\
>> +	for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg))
>>
>> I think that we should wait for Jens pending patchset and do this work on top
>> of his, then use his sg macros directly. This way the cleaners can also be
>> watchfully for any drivers that might brake with big IO sent to them.
>>
>>> +#define scsi_sg_count(cmd) ((cmd)->use_sg)
>>> +#define scsi_bufferlen(cmd) ((cmd)->request_bufflen)
>>> +
>>>  #endif /* _SCSI_SCSI_CMND_H */
>> Above helpers look good. However I am missing 2 of them:
>>
>> 1.
>> +#define scsi_sgl(cmd) ((struct scatterlist*)(cmd)->request_buffer)
>>
>> This is for drivers like iSCSI that do not do any dma mapping, as dma
>> is done at the lower level in the NICs. And lots of other places that just
>> transfer the sglist around.
>>
>> 2.
>> +#define scsi_resid(cmd) ((cmd)->resid)
> 
> I have defined resid in the past as a signed (32 bit)
> integer following the CAM spec. The cases are:
>    - resid=0 : initiator's DMA engine got (or sent?) the
>                number of bytes it expected
>    - resid>0 : dxfer_len-resid bytes received, less than
>                expected
>    - resid<0 : more bytes received (or could have been)
>                than indicated by dxfer_len
> 
> Some definitions of resid make it unsigned which rules out
> the less common resid<0 case. Can this case happen? Does it
> happen in practice? Naturally no more than dxfer_len should
> be placed in the scatter gather list.
> 
> SPI and SAS do not convey dxfer_len (or data direction) to
> a target in their transport frames. This means that the
> relevant field in the cdb (e.g. transfer length) dictates
> how much data a target sends back to an initiator in the
> case of a read like operation. So that opens up the
> possibility that dxfer_len and the cdb disagree.
> 
> FCP does convey dxfer_len and data_direction to a target.
> So a FCP target can detect a mismatch between this and the
> cdb and send resid information (either under or over) in its
> response frame back to the initiator. Alternatively it
> can treat the "over" case as an error (fcp3r04.pdf section
> 9.4.2).
> iSCSI and SRP?
> 
> The resid<0 case may become more common if a driver or
> application does not properly take account of protection
> information being sent together with the requested
> data.
> 
> So should we keep resid signed?
> 
> Doug Gilbert
You are probably right; it's just that I was afraid of this: req->data_len = sc->resid
and I saw in iSCSI :

iscsi_scsi_cmd_rsp()
...

		if (res_count > 0 && res_count <= sc->request_bufflen)
			sc->resid = res_count;
		else
			sc->result = (DID_BAD_TARGET << 16) | rhdr->cmd_status;

So I'm confused. It looks like the struct scsi_cmnd member resid is only the
one case of 0 <= target_resid <= request_bufflen, and the other cases are covered by
an error result?

Boaz


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-10 15:10                     ` Douglas Gilbert
  2007-05-10 15:48                       ` Boaz Harrosh
@ 2007-05-11 16:26                       ` James Bottomley
  1 sibling, 0 replies; 43+ messages in thread
From: James Bottomley @ 2007-05-11 16:26 UTC (permalink / raw)
  To: dougg
  Cc: Boaz Harrosh, FUJITA Tomonori, jens.axboe, linux-scsi, bhalevy,
	hch, akpm, michaelc

On Thu, 2007-05-10 at 11:10 -0400, Douglas Gilbert wrote:
> > +#define scsi_resid(cmd) ((cmd)->resid)
> 
> I have defined resid in the past as a signed (32 bit)
> integer following the CAM spec. The cases are:
>    - resid=0 : initiator's DMA engine got (or sent?) the
>                number of bytes it expected
>    - resid>0 : dxfer_len-resid bytes received, less than
>                expected
>    - resid<0 : more bytes received (or could have been)
>                than indicated by dxfer_len
> 
> Some definitions of resid make it unsigned which rules out
> the less common resid<0 case. Can this case happen? Does it
> happen in practice? Naturally no more than dxfer_len should
> be placed in the scatter gather list.

Attempted overrun is usually, and correctly in my opinion, treated as a
fatal error in most drivers, so for them the resid < 0 case can never
happen.

> SPI and SAS do not convey dxfer_len (or data direction) to
> a target in their transport frames. This means that the
> relevant field in the cdb (e.g. transfer length) dictates
> how much data a target sends back to an initiator in the
> case of a read like operation. So that opens up the
> possibility that dxfer_len and the cdb disagree.

Well, actually the drivers do (or those few that pay attention).
dxfer_len is used to program the SG lists into the device DMA engine, so
drivers can retrieve the actuals from the DMA engine at the end of the
transfer and set resid accordingly.

> FCP does convey dxfer_len and data_direction to a target.
> So a FCP target can detect a mismatch between this and the
> cdb and send resid information (either under or over) in its
> response frame back to the initiator. Alternatively it
> can treat the "over" case as an error (fcp3r04.pdf section
> 9.4.2).
> iSCSI and SRP?
> 
> The resid<0 case may become more common if a driver or
> application does not properly take account of protection
> information being sent together with the requested
> data.
> 
> So should we keep resid signed?

I don't really see a compelling reason to alter the driver behaviour (at
least for those where overrun is fatal).  However, since resid is signed
in the current structure, it makes sense to propagate that.

James



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-09  7:46     ` Boaz Harrosh
  2007-05-09 10:52       ` FUJITA Tomonori
@ 2007-05-16 17:29       ` Boaz Harrosh
  2007-05-16 17:53         ` Jens Axboe
  1 sibling, 1 reply; 43+ messages in thread
From: Boaz Harrosh @ 2007-05-16 17:29 UTC (permalink / raw)
  To: James Bottomley, FUJITA Tomonori
  Cc: linux-scsi, bhalevy, jens.axboe, hch, akpm, michaelc

Boaz Harrosh wrote:
> James Bottomley wrote:
>> 
>> There's actually a fourth option you haven't considered:
>>
>> Roll all the required sglist definitions (request_bufflen,
>> request_buffer, use_sg and sglist_len) into the sgtable pools.
>>
> This is a grate Idea. Let me see if I understand what you mean.
> ...
> ...

Hi Dear James, list.
I have worked on the proposed solution (if anyone is interested, see the
URL below)

Now it works and all but I hit some problems.
What happens is that in 64 bit architectures, well in x86_64,
the sizeof scatterlist structure is 32 bytes which means that
we can only fit exactly 128 of them in a page. But together with
the scsi_sg_table header we are down to 127. Also if we want
good alignment/packing of other sized pools we want:
static struct scsi_host_sg_pool scsi_sg_pools[] = {
	SP(7),
	SP(15),
	SP(31),
	SP(63),
	SP(127)
};

now there are 2 issues with this:
1. Even if I do blk_queue_max_phys_segments(q, 127); I still get
   requests for use_sg=128 which will crash the kernel.

2. If I do SPs of 7,15,31,63,128 or even 8,16,32,64,128 it will
   boot and work but clearly it does not fit in one page. So either
   my sata drivers and iSCSI, which I test, are not sensitive to
   scatterlists fit one page. Or kernel gives me 2 contiguous pages?

I do not see a way out of this problem. I think that even with Jens's
chaining of sg_tables 128 is a magic number that we don't want to cross

It leaves me with option 3 on the bidi front. I think it would be best
to just allocate another global mem_pool of scsi_data_buffer(s) just like
we do for scsi_io_context. The uni scsi_data_buffer will be embedded in
struct scsi_cmnd and the bidi one will be allocated from this pool.
I am open to any suggestions.

If anyone wants to see I have done 2 versions of this work. One on top
of Jens's sg-chaining work (ver 5). And one on top of Tomo's cleanup
git. both can be found here:
  http://www.bhalevy.com/open-osd/download/scsi_sg_table/

Boaz


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-16 17:29       ` Boaz Harrosh
@ 2007-05-16 17:53         ` Jens Axboe
  2007-05-16 18:06           ` James Bottomley
  2007-05-17  2:57           ` FUJITA Tomonori
  0 siblings, 2 replies; 43+ messages in thread
From: Jens Axboe @ 2007-05-16 17:53 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: James Bottomley, FUJITA Tomonori, linux-scsi, bhalevy, hch, akpm,
	michaelc

On Wed, May 16 2007, Boaz Harrosh wrote:
> Boaz Harrosh wrote:
> > James Bottomley wrote:
> >> 
> >> There's actually a fourth option you haven't considered:
> >>
> >> Roll all the required sglist definitions (request_bufflen,
> >> request_buffer, use_sg and sglist_len) into the sgtable pools.
> >>
> > This is a grate Idea. Let me see if I understand what you mean.
> > ...
> > ...
> 
> Hi Dear James, list.
> I have worked on proposed solution (If anyone is interested see
> url blow)
> 
> Now it works and all but I hit some problems.
> What happens is that in 64 bit architectures, well in x86_64,
> the sizeof scatterlist structure is 32 bytes which means that
> we can only fit exactly 128 of them in a page. But together with
> the scsi_sg_table header we are down to 127. Also if we want
> good alignment/packing of other sized pools we want:
> static struct scsi_host_sg_pool scsi_sg_pools[] = {
> 	SP(7),
> 	SP(15),
> 	SP(31),
> 	SP(63),
> 	SP(127)
> };
> 
> now there are 2 issues with this:
> 1. Even if I do blk_queue_max_phys_segments(q, 127); I still get
>    requests for use_sg=128 which will crash the kernel.

That sounds like a serious issue, it should definitely not happen. Stuff
like that would bite other drivers as well, are you absolutely sure that
is happening? Power-of-2 bug in your code, or in the SCSI code?

> 2. If I do SPs of 7,15,31,63,128 or even 8,16,32,64,128 it will
>    boot and work but clearly it does not fit in one page. So either
>    my sata drivers and iSCSI, which I test, are not sensitive to
>    scatterlists fit one page. Or kernel gives me 2 contiguous pages?

The 1-page thing isn't a restriction as such, it's just an optimization.
The scatterlist allocated is purely a kernel entity, so you could do 4
contig pages and larger ios that way, if higher order allocations were
reliable.

But you are right in that we need to tweak the sg pool size so that it
ends up being a nice size, and not something that either spans a little
bit into a second page or doesn't fill a page nicely. On my x86-64 here,
a 128 segment sg table is exactly one page (looking at slabinfo). It
depends on the allocator whether that is just right, or just a little
too much due to management information.

> I do not see away out of this problem. I think that even with Jens's
> chaining of sg_tables 128 is a magic number that we don't want to cross

Label me skeptical of your findings so far :-)

> It leaves me with option 3 on the bidi front. I think it would be best
> to Just allocate another global mem_pool of scsi_data_buffer(s) just like
> we do for scsi_io_context. The uni scsi_data_buffer will be embedded in
> struct scsi_cmnd and the bidi one will be allocated from this pool.
> I am open to any suggestions.

Two statements on this affair:

- Any sg table size should work, provided that we can safely and
  reliably allocate it. We stay with 1 page max for that reason alone.

- The max sg number has no magic beyond staying within a single page for
  allocation reasons.

The scattertable really only exists as a temporary way to store and
setup transfer, until it's mapped to a driver private sg structure. That
is why the phys segments limit is purely a driver property, it has
nothing to do with the hardware or the hardware limits.

> If anyone wants to see I have done 2 versions of this work. One on top
> of Jens's sg-chaining work (ver 5). And one on top of Tomo's cleanup
> git. both can be found here:
>   http://www.bhalevy.com/open-osd/download/scsi_sg_table/

+#define scsi_for_each_sg(cmd, sg, nseg, __i)                   \
+       for (__i = 0, sg = scsi_sglist(cmd); __i < (nseg); __i++, (sg)++)

Hmm?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-16 17:53         ` Jens Axboe
@ 2007-05-16 18:06           ` James Bottomley
  2007-05-16 18:13             ` Jens Axboe
  2007-05-17  2:57           ` FUJITA Tomonori
  1 sibling, 1 reply; 43+ messages in thread
From: James Bottomley @ 2007-05-16 18:06 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Boaz Harrosh, FUJITA Tomonori, linux-scsi, bhalevy, hch, akpm,
	michaelc

On Wed, 2007-05-16 at 19:53 +0200, Jens Axboe wrote:
> The 1-page thing isn't a restriction as such, it's just an optimization.
> The scatterlist allocated is purely a kernel entity, so you could do 4
> contig pages and larger ios that way, if higher order allocations were
> reliable.
> 
> But you are right in that we need to tweak the sg pool size so that it
> ends up being a nice size, and not something that either spans a little
> bit into a second page or doesn't fill a page nicely. On my x86-64 here,
> a 128 segment sg table is exactly one page (looking at slabinfo). It
> depends on the allocator whether that is just right, or just a little
> too much due to management information.

Actually, if you look at the slab allocation algorithm (particularly
calculate_slab_order()) you'll find it's not as simplistic as you're
assuming ... what it actually does is try to allocate > 1 item in n
pages to reduce the leftovers.

Additionally, remember that turning on redzoning, which seems to be
quite popular nowadays, actually blows out the slab size calculations
anyway.

The bottom line is that it's better for us just to do exactly what we
need and let the allocation algorithms figure out how to do it
efficiently rather than trying to second guess them.

James



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-16 18:06           ` James Bottomley
@ 2007-05-16 18:13             ` Jens Axboe
  2007-05-17  8:46               ` Boaz Harrosh
  0 siblings, 1 reply; 43+ messages in thread
From: Jens Axboe @ 2007-05-16 18:13 UTC (permalink / raw)
  To: James Bottomley
  Cc: Boaz Harrosh, FUJITA Tomonori, linux-scsi, bhalevy, hch, akpm,
	michaelc

On Wed, May 16 2007, James Bottomley wrote:
> On Wed, 2007-05-16 at 19:53 +0200, Jens Axboe wrote:
> > The 1-page thing isn't a restriction as such, it's just an optimization.
> > The scatterlist allocated is purely a kernel entity, so you could do 4
> > contig pages and larger ios that way, if higher order allocations were
> > reliable.
> > 
> > But you are right in that we need to tweak the sg pool size so that it
> > ends up being a nice size, and not something that either spans a little
> > bit into a second page or doesn't fill a page nicely. On my x86-64 here,
> > a 128 segment sg table is exactly one page (looking at slabinfo). It
> > depends on the allocator whether that is just right, or just a little
> > too much due to management information.
> 
> Actually, if you look at the slab allocation algorithm (particularly
> calculate_slab_order()) you'll find it's not as simplistic as you're
> assuming ... what it actually does is try to allocate > 1 item in n
> pages to reduce the leftovers.

I'm not assuming anything, I was just being wary of having elements
that are exactly page sized if that would cause a bit of spill into a
second page. Don't tell me that PAGE_SIZE+10 (or whatever it might be)
would ever be an optimal allocation size.

> Additionally, remember that turning on redzoning, which seems to be
> quite popular nowadays, actually blows out the slab size calculations
> anyway.

Debugging will always throw these things out the window, we can't and
should not optimize for that. That goes for slab, and for lots of other
things.

> The bottom line is that it's better for us just to do exactly what we
> need and let the allocation algorithms figure out how to do it
> efficiently rather than trying to second guess them.

Partly true, it's also silly to just hardcode power-of-2 numbers without
ever bothering to look at what that results in (or even if it fits
normal use patterns).

We can easily be flexible, so it seems silly not to at least do a bit of
background research.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-16 17:53         ` Jens Axboe
  2007-05-16 18:06           ` James Bottomley
@ 2007-05-17  2:57           ` FUJITA Tomonori
  2007-05-17  5:48             ` Jens Axboe
  1 sibling, 1 reply; 43+ messages in thread
From: FUJITA Tomonori @ 2007-05-17  2:57 UTC (permalink / raw)
  To: jens.axboe
  Cc: bharrosh, James.Bottomley, fujita.tomonori, linux-scsi, bhalevy,
	hch, akpm, michaelc

From: Jens Axboe <jens.axboe@oracle.com>
Subject: Re: [PATCH v2] add bidi support for block pc requests
Date: Wed, 16 May 2007 19:53:22 +0200

> On Wed, May 16 2007, Boaz Harrosh wrote:
> > Boaz Harrosh wrote:
> > > James Bottomley wrote:
> > >> 
> > >> There's actually a fourth option you haven't considered:
> > >>
> > >> Roll all the required sglist definitions (request_bufflen,
> > >> request_buffer, use_sg and sglist_len) into the sgtable pools.
> > >>
> > > This is a great idea. Let me see if I understand what you mean.
> > > ...
> > > ...
> > 
> > Hi Dear James, list.
> > I have worked on proposed solution (If anyone is interested see
> > url below)
> > 
> > Now it works and all but I hit some problems.
> > What happens is that in 64 bit architectures, well in x86_64,
> > the sizeof scatterlist structure is 32 bytes which means that
> > we can only fit exactly 128 of them in a page. But together with
> > the scsi_sg_table header we are down to 127. Also if we want
> > good alignment/packing of other sized pools we want:
> > static struct scsi_host_sg_pool scsi_sg_pools[] = {
> > 	SP(7),
> > 	SP(15),
> > 	SP(31),
> > 	SP(63),
> > 	SP(127)
> > };
> > 
> > now there are 2 issues with this:
> > 1. Even if I do blk_queue_max_phys_segments(q, 127); I still get
> >    requests for use_sg=128 which will crash the kernel.
> 
> That sounds like a serious issue, it should definitely not happen. Stuff
> like that would bite other drivers as well, are you absolutely sure that
> is happening? Power-of-2 bug in your code, or in the SCSI code?

Boaz, how do you send requests to the scsi-ml, via fs, sg, or bsg?


> > If anyone wants to see I have done 2 versions of this work. One on top
> > of Jens's sg-chaining work (ver 5). And one on top of Tomo's cleanup
> > git. both can be found here:
> >   http://www.bhalevy.com/open-osd/download/scsi_sg_table/
> 
> +#define scsi_for_each_sg(cmd, sg, nseg, __i)                   \
> +       for (__i = 0, sg = scsi_sglist(cmd); __i < (nseg); __i++, (sg)++)
> 
> Hmm?

When for_each_sg is ready, we convert scsi_for_each_sg to use
for_each_sg. I finished the cleanups on more than 40 drivers on the
top of your patches.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-17  2:57           ` FUJITA Tomonori
@ 2007-05-17  5:48             ` Jens Axboe
  2007-05-17  5:59               ` FUJITA Tomonori
  0 siblings, 1 reply; 43+ messages in thread
From: Jens Axboe @ 2007-05-17  5:48 UTC (permalink / raw)
  To: FUJITA Tomonori
  Cc: bharrosh, James.Bottomley, linux-scsi, bhalevy, hch, akpm,
	michaelc

On Thu, May 17 2007, FUJITA Tomonori wrote:
> From: Jens Axboe <jens.axboe@oracle.com>
> Subject: Re: [PATCH v2] add bidi support for block pc requests
> Date: Wed, 16 May 2007 19:53:22 +0200
> 
> > On Wed, May 16 2007, Boaz Harrosh wrote:
> > > Boaz Harrosh wrote:
> > > > James Bottomley wrote:
> > > >> 
> > > >> There's actually a fourth option you haven't considered:
> > > >>
> > > >> Roll all the required sglist definitions (request_bufflen,
> > > >> request_buffer, use_sg and sglist_len) into the sgtable pools.
> > > >>
> > > > This is a great idea. Let me see if I understand what you mean.
> > > > ...
> > > > ...
> > > 
> > > Hi Dear James, list.
> > > I have worked on proposed solution (If anyone is interested see
> > > url below)
> > > 
> > > Now it works and all but I hit some problems.
> > > What happens is that in 64 bit architectures, well in x86_64,
> > > the sizeof scatterlist structure is 32 bytes which means that
> > > we can only fit exactly 128 of them in a page. But together with
> > > the scsi_sg_table header we are down to 127. Also if we want
> > > good alignment/packing of other sized pools we want:
> > > static struct scsi_host_sg_pool scsi_sg_pools[] = {
> > > 	SP(7),
> > > 	SP(15),
> > > 	SP(31),
> > > 	SP(63),
> > > 	SP(127)
> > > };
> > > 
> > > now there are 2 issues with this:
> > > 1. Even if I do blk_queue_max_phys_segments(q, 127); I still get
> > >    requests for use_sg=128 which will crash the kernel.
> > 
> > That sounds like a serious issue, it should definitely not happen. Stuff
> > like that would bite other drivers as well, are you absolutely sure that
> > is happening? Power-of-2 bug in your code, or in the SCSI code?
> 
> Boaz, how do you send requests to the scsi-ml, via fs, sg, or bsg?
> 
> 
> > > If anyone wants to see I have done 2 versions of this work. One on top
> > > of Jens's sg-chaining work (ver 5). And one on top of Tomo's cleanup
> > > git. both can be found here:
> > >   http://www.bhalevy.com/open-osd/download/scsi_sg_table/
> > 
> > +#define scsi_for_each_sg(cmd, sg, nseg, __i)                   \
> > +       for (__i = 0, sg = scsi_sglist(cmd); __i < (nseg); __i++, (sg)++)
> > 
> > Hmm?
> 
> When for_each_sg is ready, we convert scsi_for_each_sg to use
> for_each_sg. I finished the cleanups on more than 40 drivers on the
> top of your patches.

But this is from the patch that is based on my sglist branch, so it
looked somewhat odd.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-17  5:48             ` Jens Axboe
@ 2007-05-17  5:59               ` FUJITA Tomonori
  2007-05-17  8:49                 ` Boaz Harrosh
  0 siblings, 1 reply; 43+ messages in thread
From: FUJITA Tomonori @ 2007-05-17  5:59 UTC (permalink / raw)
  To: jens.axboe
  Cc: fujita.tomonori, bharrosh, James.Bottomley, linux-scsi, bhalevy,
	hch, akpm, michaelc

From: Jens Axboe <jens.axboe@oracle.com>
Subject: Re: [PATCH v2] add bidi support for block pc requests
Date: Thu, 17 May 2007 07:48:13 +0200

> On Thu, May 17 2007, FUJITA Tomonori wrote:
> > From: Jens Axboe <jens.axboe@oracle.com>
> > Subject: Re: [PATCH v2] add bidi support for block pc requests
> > Date: Wed, 16 May 2007 19:53:22 +0200
> > 
> > > On Wed, May 16 2007, Boaz Harrosh wrote:
> > > > Boaz Harrosh wrote:
> > > > > James Bottomley wrote:
> > > > >> 
> > > > >> There's actually a fourth option you haven't considered:
> > > > >>
> > > > >> Roll all the required sglist definitions (request_bufflen,
> > > > >> request_buffer, use_sg and sglist_len) into the sgtable pools.
> > > > >>
> > > > > This is a great idea. Let me see if I understand what you mean.
> > > > > ...
> > > > > ...
> > > > 
> > > > Hi Dear James, list.
> > > > I have worked on proposed solution (If anyone is interested see
> > > > url below)
> > > > 
> > > > Now it works and all but I hit some problems.
> > > > What happens is that in 64 bit architectures, well in x86_64,
> > > > the sizeof scatterlist structure is 32 bytes which means that
> > > > we can only fit exactly 128 of them in a page. But together with
> > > > the scsi_sg_table header we are down to 127. Also if we want
> > > > good alignment/packing of other sized pools we want:
> > > > static struct scsi_host_sg_pool scsi_sg_pools[] = {
> > > > 	SP(7),
> > > > 	SP(15),
> > > > 	SP(31),
> > > > 	SP(63),
> > > > 	SP(127)
> > > > };
> > > > 
> > > > now there are 2 issues with this:
> > > > 1. Even if I do blk_queue_max_phys_segments(q, 127); I still get
> > > >    requests for use_sg=128 which will crash the kernel.
> > > 
> > > That sounds like a serious issue, it should definitely not happen. Stuff
> > > like that would bite other drivers as well, are you absolutely sure that
> > > is happening? Power-of-2 bug in your code, or in the SCSI code?
> > 
> > Boaz, how do you send requests to the scsi-ml, via fs, sg, or bsg?
> > 
> > 
> > > > If anyone wants to see I have done 2 versions of this work. One on top
> > > > of Jens's sg-chaining work (ver 5). And one on top of Tomo's cleanup
> > > > git. both can be found here:
> > > >   http://www.bhalevy.com/open-osd/download/scsi_sg_table/
> > > 
> > > +#define scsi_for_each_sg(cmd, sg, nseg, __i)                   \
> > > +       for (__i = 0, sg = scsi_sglist(cmd); __i < (nseg); __i++, (sg)++)
> > > 
> > > Hmm?
> > 
> > When for_each_sg is ready, we convert scsi_for_each_sg to use
> > for_each_sg. I finished the cleanups on more than 40 drivers on the
> > top of your patches.
> 
> But this is from the patch that is based on my sglist branch, so it
> looked somewhat odd.

Oops, I didn't see the patch based on the sglist branch. Yeah, it's
odd though it isn't used, I guess.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-16 18:13             ` Jens Axboe
@ 2007-05-17  8:46               ` Boaz Harrosh
  0 siblings, 0 replies; 43+ messages in thread
From: Boaz Harrosh @ 2007-05-17  8:46 UTC (permalink / raw)
  To: Jens Axboe, James Bottomley
  Cc: FUJITA Tomonori, linux-scsi, bhalevy, hch, akpm, michaelc

Jens Axboe wrote:
> On Wed, May 16 2007, James Bottomley wrote:
>> On Wed, 2007-05-16 at 19:53 +0200, Jens Axboe wrote:
>>> The 1-page thing isn't a restriction as such, it's just an optimization.
>>> The scatterlist allocated is purely a kernel entity, so you could do 4
>>> contig pages and larger ios that way, if higher order allocations were
>>> reliable.
>>>
>>> But you are right in that we need to tweak the sg pool size so that it
>>> ends up being a nice size, and not something that either spans a little
>>> bit into a second page or doesn't fill a page nicely. On my x86-64 here,
>>> a 128 segment sg table is exactly one page (looking at slabinfo). It
>>> depends on the allocator whether that is just right, or just a little
>>> too much due to management information.
>> Actually, if you look at the slab allocation algorithm (particularly
>> calculate_slab_order()) you'll find it's not as simplistic as you're
>> assuming ... what it actually does is try to allocate > 1 item in n
>> pages to reduce the leftovers.
> 
> I'm not assuming anything, I was just being wary of having elements
> that are exactly page sized if that would cause a bit of spill into a
> second page. Don't tell me that PAGE_SIZE+10 (or whatever it might be)
> would ever be an optimal allocation size.
> 
>> Additionally, remember that turning on redzoning, which seems to be
>> quite popular nowadays, actually blows out the slab size calculations
>> anyway.
> 
> Debugging will always throw these things out the window, we can't and
> should not optimize for that. That goes for slab, and for lots of other
> things.
> 
>> The bottom line is that it's better for us just to do exactly what we
>> need and let the allocation algorithms figure out how to do it
>> efficiently rather than trying to second guess them.
> 
> Partly true, it's also silly to just hardcode power-of-2 numbers without
> ever bothering to look at what that results in (or even if it fits
> normal use patterns).
> 
> We can easily be flexible, so it seems silly not to at least do a bit of
> background research.
> 
The thing is that now everything fits like a glove. i386/32bit-arch
have 16 bytes scatterlist struct, 256 in a page. x86_64/64bit-arch 32
byte and 128 fit exactly in a page. If we do any code that throws this
off it will be a performance regression. Call it beginners luck, call
it someone spent a long night handcrafting it this way. Just that I
think the current system is perfect and we should not touch it. There
are other options for bidi. (just my $0.02)

Boaz


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-17  5:59               ` FUJITA Tomonori
@ 2007-05-17  8:49                 ` Boaz Harrosh
  2007-05-17 11:12                   ` FUJITA Tomonori
                                     ` (2 more replies)
  0 siblings, 3 replies; 43+ messages in thread
From: Boaz Harrosh @ 2007-05-17  8:49 UTC (permalink / raw)
  To: FUJITA Tomonori, jens.axboe, James.Bottomley
  Cc: linux-scsi, bhalevy, hch, akpm, michaelc

FUJITA Tomonori wrote:
> From: Jens Axboe <jens.axboe@oracle.com>
> Subject: Re: [PATCH v2] add bidi support for block pc requests
> Date: Thu, 17 May 2007 07:48:13 +0200
> 
>> On Thu, May 17 2007, FUJITA Tomonori wrote:
>>> From: Jens Axboe <jens.axboe@oracle.com>
>>> Subject: Re: [PATCH v2] add bidi support for block pc requests
>>> Date: Wed, 16 May 2007 19:53:22 +0200
>>>
>>>> On Wed, May 16 2007, Boaz Harrosh wrote:
>>>>> now there are 2 issues with this:
>>>>> 1. Even if I do blk_queue_max_phys_segments(q, 127); I still get
>>>>>    requests for use_sg=128 which will crash the kernel.
>>>> That sounds like a serious issue, it should definitely not happen. Stuff
>>>> like that would bite other drivers as well, are you absolutely sure that
>>>> is happening? Power-of-2 bug in your code, or in the SCSI code?
>>> Boaz, how do you send requests to the scsi-ml, via fs, sg, or bsg?

These are regular fs (ext3) requests during bootup. The machine will not
boot. (Usually from the read-ahead code)
Don't believe me? Look at the second patch over Tomo's cleanup.
If I define SCSI_MAX_SG_SEGMENTS to 127 it will crash even when I
did in code:
	blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
I suppose someone is looking at a different definition. Or there is
another call I need to do for this to work.

>>>
>>>
>>>>> If anyone wants to see I have done 2 versions of this work. One on top
>>>>> of Jens's sg-chaining work (ver 5). And one on top of Tomo's cleanup
>>>>> git. both can be found here:
>>>>>   http://www.bhalevy.com/open-osd/download/scsi_sg_table/
>>>> +#define scsi_for_each_sg(cmd, sg, nseg, __i)                   \
>>>> +       for (__i = 0, sg = scsi_sglist(cmd); __i < (nseg); __i++, (sg)++)
>>>>
Yes, leftovers from Tomo's branch. Sorry for the noise. This is in no way
finished work, just experimenting. Anyway, in this git tree no one is
using this macro. (git-pull of Linus tree, apply ver5, and this patch)
This branch will work fine because I have not touched the numbers yet,
just screwed allocations.
>>>> Hmm?
>>> When for_each_sg is ready, we convert scsi_for_each_sg to use
>>> for_each_sg. I finished the cleanups on more than 40 drivers on the
>>> top of your patches.
>> But this is from the patch that is based on my sglist branch, so it
>> looked somewhat odd.
> 
> Oops, I didn't see the patch based on the sglist branch. Yeah, it's
> odd though it isn't used, I guess.
> -
> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Boaz

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-17  8:49                 ` Boaz Harrosh
@ 2007-05-17 11:12                   ` FUJITA Tomonori
  2007-05-17 17:37                     ` Benny Halevy
  2007-05-24 16:37                     ` Boaz Harrosh
  2007-05-17 11:29                   ` FUJITA Tomonori
  2007-05-17 13:27                   ` James Bottomley
  2 siblings, 2 replies; 43+ messages in thread
From: FUJITA Tomonori @ 2007-05-17 11:12 UTC (permalink / raw)
  To: bharrosh
  Cc: fujita.tomonori, jens.axboe, James.Bottomley, linux-scsi, bhalevy,
	hch, akpm, michaelc

From: Boaz Harrosh <bharrosh@panasas.com>
Subject: Re: [PATCH v2] add bidi support for block pc requests
Date: Thu, 17 May 2007 11:49:37 +0300

> FUJITA Tomonori wrote:
> > From: Jens Axboe <jens.axboe@oracle.com>
> > Subject: Re: [PATCH v2] add bidi support for block pc requests
> > Date: Thu, 17 May 2007 07:48:13 +0200
> > 
> >> On Thu, May 17 2007, FUJITA Tomonori wrote:
> >>> From: Jens Axboe <jens.axboe@oracle.com>
> >>> Subject: Re: [PATCH v2] add bidi support for block pc requests
> >>> Date: Wed, 16 May 2007 19:53:22 +0200
> >>>
> >>>> On Wed, May 16 2007, Boaz Harrosh wrote:
> >>>>> now there are 2 issues with this:
> >>>>> 1. Even if I do blk_queue_max_phys_segments(q, 127); I still get
> >>>>>    requests for use_sg=128 which will crash the kernel.
> >>>> That sounds like a serious issue, it should definitely not happen. Stuff
> >>>> like that would bite other drivers as well, are you absolutely sure that
> >>>> is happening? Power-of-2 bug in your code, or in the SCSI code?
> >>> Boaz, how do you send requests to the scsi-ml, via fs, sg, or bsg?
> 
> These are regular fs (ext3) requests during bootup. The machine will not
> boot. (Usually from the read ahead code)
> Don't believe me look at the second patch Over Tomo's cleanup.
> If I define SCSI_MAX_SG_SEGMENTS to 127 it will crash even when I
> did in code:
> 	blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
> I suppose someone is looking at a different definition. Or there is
> another call I need to do for this to work.

I modified your patch a bit (sgtable allocation) and set
SCSI_MAX_SG_SEGMENTS to 127. My ppc64 box seems to work (with
iscsi_tcp and ipr drivers).

iscsi_tcp sets sg_tablesize to 255, but blk_queue_max_phys_segments
seems to work.

One thing that I found is:

+#define scsi_resid(cmd) ((cmd)->sg_table->resid)


This doesn't work for some drivers (at least ipr) since they set
cmd->resid even with commands without data transfer.

This patch is against my cleanup tree.

Boaz, btw, don't worry about scsi_tgt_lib.c on
scsi_alloc/free_sgtable. You can break it. I need to fix it for bidi anyway.


diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 70454b4..d8fb9c4 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -35,33 +35,34 @@ #define SG_MEMPOOL_SIZE		2
 
 struct scsi_host_sg_pool {
 	size_t		size;
-	char		*name; 
+	char		*name;
 	struct kmem_cache	*slab;
 	mempool_t	*pool;
 };
 
-#if (SCSI_MAX_PHYS_SEGMENTS < 32)
-#error SCSI_MAX_PHYS_SEGMENTS is too small
-#endif
+/*
+ * Should fit within a single page.
+ */
+
+enum { SCSI_MAX_SG_SEGMENTS = 127 };
+
+enum { SCSI_MAX_SG_SEGMENTS_CALC =
+	((PAGE_SIZE - sizeof(struct scsi_sg_table)) /
+	sizeof(struct scatterlist)) };
+
+#define SG_MEMPOOL_NR ARRAY_SIZE(scsi_sg_pools)
 
-#define SP(x) { x, "sgpool-" #x } 
+/*#define SG_MEMPOOL_NR (ARRAY_SIZE(scsi_sg_pools) - \
+                       (SCSI_MAX_SG_SEGMENTS < 254))*/
+
+#define SP(x) { x, "sgpool-" #x }
 static struct scsi_host_sg_pool scsi_sg_pools[] = {
-	SP(8),
-	SP(16),
-	SP(32),
-#if (SCSI_MAX_PHYS_SEGMENTS > 32)
-	SP(64),
-#if (SCSI_MAX_PHYS_SEGMENTS > 64)
-	SP(128),
-#if (SCSI_MAX_PHYS_SEGMENTS > 128)
-	SP(256),
-#if (SCSI_MAX_PHYS_SEGMENTS > 256)
-#error SCSI_MAX_PHYS_SEGMENTS is too large
-#endif
-#endif
-#endif
-#endif
-}; 	
+	SP(7),
+	SP(15),
+	SP(31),
+	SP(63),
+	SP(SCSI_MAX_SG_SEGMENTS)
+};
 #undef SP
 
 static void scsi_run_queue(struct request_queue *q);
@@ -701,60 +702,38 @@ static struct scsi_cmnd *scsi_end_reques
 	return NULL;
 }
 
-struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask)
+static void scsi_free_sgtable(struct scsi_sg_table *sgt)
 {
-	struct scsi_host_sg_pool *sgp;
-	struct scatterlist *sgl;
-
-	BUG_ON(!cmd->use_sg);
-
-	switch (cmd->use_sg) {
-	case 1 ... 8:
-		cmd->sglist_len = 0;
-		break;
-	case 9 ... 16:
-		cmd->sglist_len = 1;
-		break;
-	case 17 ... 32:
-		cmd->sglist_len = 2;
-		break;
-#if (SCSI_MAX_PHYS_SEGMENTS > 32)
-	case 33 ... 64:
-		cmd->sglist_len = 3;
-		break;
-#if (SCSI_MAX_PHYS_SEGMENTS > 64)
-	case 65 ... 128:
-		cmd->sglist_len = 4;
-		break;
-#if (SCSI_MAX_PHYS_SEGMENTS  > 128)
-	case 129 ... 256:
-		cmd->sglist_len = 5;
-		break;
-#endif
-#endif
-#endif
-	default:
-		return NULL;
-	}
+	printk("%s %d %d\n", __FUNCTION__, sgt->this_sg_count, SG_MEMPOOL_NR);
+	BUG_ON(sgt->this_sg_count >= SG_MEMPOOL_NR);
 
-	sgp = scsi_sg_pools + cmd->sglist_len;
-	sgl = mempool_alloc(sgp->pool, gfp_mask);
-	return sgl;
+	mempool_free(sgt, scsi_sg_pools[sgt->this_sg_count].pool);
 }
 
-EXPORT_SYMBOL(scsi_alloc_sgtable);
-
-void scsi_free_sgtable(struct scatterlist *sgl, int index)
+static struct scsi_sg_table *scsi_alloc_sgtable(int nseg, gfp_t gfp_mask)
 {
 	struct scsi_host_sg_pool *sgp;
+	struct scsi_sg_table *sgt;
+	unsigned int idx;
 
-	BUG_ON(index >= SG_MEMPOOL_NR);
+	for (idx = 0; idx < SG_MEMPOOL_NR; idx++)
+		if (scsi_sg_pools[idx].size >= nseg)
+			goto found;
 
-	sgp = scsi_sg_pools + index;
-	mempool_free(sgl, sgp->pool);
-}
+	printk(KERN_ERR "scsi: bad segment count=%d\n", nseg);
+	BUG();
+found:
+	sgp = scsi_sg_pools + idx;
 
-EXPORT_SYMBOL(scsi_free_sgtable);
+	sgt = mempool_alloc(sgp->pool, gfp_mask);
+	if (unlikely(!sgt))
+		return NULL;
+
+	memset(sgt, 0, SG_TABLE_SIZEOF(sgp->size));
+	sgt->this_sg_count = idx;
+	sgt->sg_count = nseg;
+	return sgt;
+}
 
 /*
  * Function:    scsi_release_buffers()
@@ -775,15 +754,15 @@ EXPORT_SYMBOL(scsi_free_sgtable);
  */
 static void scsi_release_buffers(struct scsi_cmnd *cmd)
 {
-	if (cmd->use_sg)
-		scsi_free_sgtable(cmd->request_buffer, cmd->sglist_len);
+	if (cmd->sg_table)
+		scsi_free_sgtable(cmd->sg_table);
 
-	/*
-	 * Zero these out.  They now point to freed memory, and it is
-	 * dangerous to hang onto the pointers.
-	 */
+	cmd->sg_table = NULL;
+
+	/*FIXME: make code backward compatible with old system */
 	cmd->request_buffer = NULL;
 	cmd->request_bufflen = 0;
+	cmd->use_sg = 0;
 }
 
 /*
@@ -817,13 +796,14 @@ static void scsi_release_buffers(struct
 void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 {
 	int result = cmd->result;
-	int this_count = cmd->request_bufflen;
+	int this_count = scsi_bufflen(cmd);
 	request_queue_t *q = cmd->device->request_queue;
 	struct request *req = cmd->request;
 	int clear_errors = 1;
 	struct scsi_sense_hdr sshdr;
 	int sense_valid = 0;
 	int sense_deferred = 0;
+	int resid = cmd->sg_table ? cmd->sg_table->resid : 0;
 
 	scsi_release_buffers(cmd);
 
@@ -849,7 +829,7 @@ void scsi_io_completion(struct scsi_cmnd
 				req->sense_len = len;
 			}
 		}
-		req->data_len = cmd->resid;
+		req->data_len = resid;
 	}
 
 	/*
@@ -859,7 +839,6 @@ void scsi_io_completion(struct scsi_cmnd
 	SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, "
 				      "%d bytes done.\n",
 				      req->nr_sectors, good_bytes));
-	SCSI_LOG_HLCOMPLETE(1, printk("use_sg is %d\n", cmd->use_sg));
 
 	if (clear_errors)
 		req->errors = 0;
@@ -991,44 +970,46 @@ EXPORT_SYMBOL(scsi_io_completion);
 static int scsi_init_io(struct scsi_cmnd *cmd)
 {
 	struct request     *req = cmd->request;
-	struct scatterlist *sgpnt;
 	int		   count;
+	struct scsi_sg_table *sgt;
 
-	/*
-	 * We used to not use scatter-gather for single segment request,
-	 * but now we do (it makes highmem I/O easier to support without
-	 * kmapping pages)
-	 */
-	cmd->use_sg = req->nr_phys_segments;
+	printk("%s %d %d %d\n", __FUNCTION__, __LINE__,
+	       req->nr_phys_segments, req->q->max_phys_segments);
 
 	/*
 	 * If sg table allocation fails, requeue request later.
 	 */
-	sgpnt = scsi_alloc_sgtable(cmd, GFP_ATOMIC);
-	if (unlikely(!sgpnt)) {
+	sgt = scsi_alloc_sgtable(req->nr_phys_segments, GFP_ATOMIC);
+	if (unlikely(!sgt)) {
 		scsi_unprep_request(req);
 		return BLKPREP_DEFER;
 	}
 
 	req->buffer = NULL;
-	cmd->request_buffer = (char *) sgpnt;
 	if (blk_pc_request(req))
-		cmd->request_bufflen = req->data_len;
+		sgt->length = req->data_len;
 	else
-		cmd->request_bufflen = req->nr_sectors << 9;
+		sgt->length = req->nr_sectors << 9;
 
-	/* 
+	cmd->sg_table = sgt;
+	/*
 	 * Next, walk the list, and fill in the addresses and sizes of
 	 * each segment.
 	 */
-	count = blk_rq_map_sg(req->q, req, cmd->request_buffer);
-	if (likely(count <= cmd->use_sg)) {
-		cmd->use_sg = count;
+	count = blk_rq_map_sg(req->q, req, sgt->sglist);
+	if (likely(count <= sgt->sg_count)) {
+		sgt->sg_count = count;
+
+		/*FIXME: make code backward compatible with old system */
+		cmd->request_buffer = sgt->sglist;
+		cmd->request_bufflen = sgt->length;
+		cmd->use_sg = sgt->sg_count;
+
 		return BLKPREP_OK;
 	}
 
 	printk(KERN_ERR "Incorrect number of segments after building list\n");
-	printk(KERN_ERR "counted %d, received %d\n", count, cmd->use_sg);
+	printk(KERN_ERR "counted %d, received %d\n", count, scsi_sg_count(cmd));
 	printk(KERN_ERR "req nr_sec %lu, cur_nr_sec %u\n", req->nr_sectors,
 			req->current_nr_sectors);
 
@@ -1084,7 +1065,7 @@ static void scsi_blk_pc_done(struct scsi
 	 * successfully. Since this is a REQ_BLOCK_PC command the
 	 * caller should check the request's errors value
 	 */
-	scsi_io_completion(cmd, cmd->request_bufflen);
+	scsi_io_completion(cmd, scsi_bufflen(cmd));
 }
 
 static int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
@@ -1113,9 +1094,7 @@ static int scsi_setup_blk_pc_cmnd(struct
 		BUG_ON(req->data_len);
 		BUG_ON(req->data);
 
-		cmd->request_bufflen = 0;
-		cmd->request_buffer = NULL;
-		cmd->use_sg = 0;
+		cmd->sg_table = NULL;
 		req->buffer = NULL;
 	}
 
@@ -1576,7 +1555,7 @@ struct request_queue *__scsi_alloc_queue
 		return NULL;
 
 	blk_queue_max_hw_segments(q, shost->sg_tablesize);
-	blk_queue_max_phys_segments(q, SCSI_MAX_PHYS_SEGMENTS);
+	blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
 	blk_queue_max_sectors(q, shost->max_sectors);
 	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
 	blk_queue_segment_boundary(q, shost->dma_boundary);
@@ -1667,12 +1646,18 @@ int __init scsi_init_queue(void)
 		return -ENOMEM;
 	}
 
+	printk(KERN_ERR
+		"SCSI: max_calc=%d page=%ld so_sgtable=%Zd so_scaterlist=%Zd\n",
+		SCSI_MAX_SG_SEGMENTS_CALC, PAGE_SIZE ,sizeof(struct scsi_sg_table),
+		sizeof(struct scatterlist)
+	);
+
 	for (i = 0; i < SG_MEMPOOL_NR; i++) {
 		struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
-		int size = sgp->size * sizeof(struct scatterlist);
+		int size = SG_TABLE_SIZEOF(sgp->size);
 
 		sgp->slab = kmem_cache_create(sgp->name, size, 0,
-				SLAB_HWCACHE_ALIGN, NULL, NULL);
+				0, NULL, NULL);
 		if (!sgp->slab) {
 			printk(KERN_ERR "SCSI: can't init sg slab %s\n",
 					sgp->name);
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index 996ff36..26f1dc0 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -11,6 +11,16 @@ struct scatterlist;
 struct Scsi_Host;
 struct scsi_device;
 
+struct scsi_sg_table {
+	unsigned length;
+	int resid;
+	short sg_count;
+	short this_sg_count;
+	struct scatterlist sglist[0];
+};
+
+#define SG_TABLE_SIZEOF(sg_count) ((sg_count)*sizeof(struct scatterlist) \
+                                   + sizeof(struct scsi_sg_table))
 
 /* embedded in scsi_cmnd */
 struct scsi_pointer {
@@ -68,10 +78,10 @@ #define MAX_COMMAND_SIZE	16
 
 	struct timer_list eh_timeout;	/* Used to time out the command. */
 	void *request_buffer;		/* Actual requested buffer */
+	struct scsi_sg_table *sg_table;
 
 	/* These elements define the operation we ultimately want to perform */
 	unsigned short use_sg;	/* Number of pieces of scatter-gather */
-	unsigned short sglist_len;	/* size of malloc'd scatter-gather list */
 
 	unsigned underflow;	/* Return error if less than
 				   this amount is transferred */
@@ -132,15 +142,12 @@ extern void *scsi_kmap_atomic_sg(struct
 				 size_t *offset, size_t *len);
 extern void scsi_kunmap_atomic_sg(void *virt);
 
-extern struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t);
-extern void scsi_free_sgtable(struct scatterlist *, int);
-
 extern int scsi_dma_map(struct scsi_cmnd *cmd);
 extern void scsi_dma_unmap(struct scsi_cmnd *cmd);
 
-#define scsi_sg_count(cmd) ((cmd)->use_sg)
-#define scsi_sglist(cmd) ((struct scatterlist *)(cmd)->request_buffer)
-#define scsi_bufflen(cmd) ((cmd)->request_bufflen)
+#define scsi_sg_count(cmd) ((cmd)->sg_table ? (cmd)->sg_table->sg_count : 0)
+#define scsi_sglist(cmd) ((cmd)->sg_table ? (cmd)->sg_table->sglist : NULL)
+#define scsi_bufflen(cmd) ((cmd)->sg_table ? (cmd)->sg_table->length : 0)
 #define scsi_resid(cmd) ((cmd)->resid)
 
 #define scsi_for_each_sg(cmd, sg, nseg, __i)			\




^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-17  8:49                 ` Boaz Harrosh
  2007-05-17 11:12                   ` FUJITA Tomonori
@ 2007-05-17 11:29                   ` FUJITA Tomonori
  2007-05-17 13:27                   ` James Bottomley
  2 siblings, 0 replies; 43+ messages in thread
From: FUJITA Tomonori @ 2007-05-17 11:29 UTC (permalink / raw)
  To: bharrosh
  Cc: fujita.tomonori, jens.axboe, James.Bottomley, linux-scsi, bhalevy,
	hch, akpm, michaelc

From: Boaz Harrosh <bharrosh@panasas.com>
Subject: Re: [PATCH v2] add bidi support for block pc requests
Date: Thu, 17 May 2007 11:49:37 +0300

> FUJITA Tomonori wrote:
> > From: Jens Axboe <jens.axboe@oracle.com>
> > Subject: Re: [PATCH v2] add bidi support for block pc requests
> > Date: Thu, 17 May 2007 07:48:13 +0200
> > 
> >> On Thu, May 17 2007, FUJITA Tomonori wrote:
> >>> From: Jens Axboe <jens.axboe@oracle.com>
> >>> Subject: Re: [PATCH v2] add bidi support for block pc requests
> >>> Date: Wed, 16 May 2007 19:53:22 +0200
> >>>
> >>>> On Wed, May 16 2007, Boaz Harrosh wrote:
> >>>>> now there are 2 issues with this:
> >>>>> 1. Even if I do blk_queue_max_phys_segments(q, 127); I still get
> >>>>>    requests for use_sg=128 which will crash the kernel.
> >>>> That sounds like a serious issue, it should definitely not happen. Stuff
> >>>> like that would bite other drivers as well, are you absolutely sure that
> >>>> is happening? Power-of-2 bug in your code, or in the SCSI code?
> >>> Boaz, how do you send requests to the scsi-ml, via fs, sg, or bsg?
> 
> These are regular fs (ext3) requests during bootup. The machine will not
> boot. (Usually from the read ahead code)
> Don't believe me look at the second patch Over Tomo's cleanup.
> If I define SCSI_MAX_SG_SEGMENTS to 127 it will crash even when I
> did in code:
> 	blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
> I suppose someone is looking at a different definition. Or there is
> another call I need to do for this to work.

ata_scsi_slave_config?

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-17  8:49                 ` Boaz Harrosh
  2007-05-17 11:12                   ` FUJITA Tomonori
  2007-05-17 11:29                   ` FUJITA Tomonori
@ 2007-05-17 13:27                   ` James Bottomley
  2007-05-17 14:00                     ` Boaz Harrosh
  2 siblings, 1 reply; 43+ messages in thread
From: James Bottomley @ 2007-05-17 13:27 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: FUJITA Tomonori, jens.axboe, linux-scsi, bhalevy, hch, akpm,
	michaelc

On Thu, 2007-05-17 at 11:49 +0300, Boaz Harrosh wrote:
> These are regular fs (ext3) requests during bootup. The machine will not
> boot. (Usually from the read ahead code)
> Don't believe me look at the second patch Over Tomo's cleanup.
> If I define SCSI_MAX_SG_SEGMENTS to 127 it will crash even when I
> did in code:
> 	blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
> I suppose someone is looking at a different definition. Or there is
> another call I need to do for this to work.

It would really help us if you showed the actual code for what you did
and where ... if this is wrong, we have bigger problems that quibbling
about bidirectional slab sizes.  The correct way to adjust this limit
artificially to 127 is:

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 1f5a07b..4a27841 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1576,7 +1576,7 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
 		return NULL;
 
 	blk_queue_max_hw_segments(q, shost->sg_tablesize);
-	blk_queue_max_phys_segments(q, SCSI_MAX_PHYS_SEGMENTS);
+	blk_queue_max_phys_segments(q, 127);
 	blk_queue_max_sectors(q, shost->max_sectors);
 	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
 	blk_queue_segment_boundary(q, shost->dma_boundary);

(It doesn't alter the allocation pools or anything else, just limits the
max phys segments of the queue).  The way to check that this limit is
being honoured is:

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 1f5a07b..ae42e4d 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1000,6 +1000,7 @@ static int scsi_init_io(struct scsi_cmnd *cmd)
 	 * kmapping pages)
 	 */
 	cmd->use_sg = req->nr_phys_segments;
+	WARN_ON(req->nr_phys_segments > 127);
 
 	/*
 	 * If sg table allocation fails, requeue request later.

James



^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-17 13:27                   ` James Bottomley
@ 2007-05-17 14:00                     ` Boaz Harrosh
  2007-05-17 14:11                       ` FUJITA Tomonori
  0 siblings, 1 reply; 43+ messages in thread
From: Boaz Harrosh @ 2007-05-17 14:00 UTC (permalink / raw)
  To: James Bottomley, FUJITA Tomonori
  Cc: jens.axboe, linux-scsi, bhalevy, hch, akpm, michaelc

James Bottomley wrote:
> On Thu, 2007-05-17 at 11:49 +0300, Boaz Harrosh wrote:
>> These are regular fs (ext3) requests during bootup. The machine will not
>> boot. (Usually from the read ahead code)
>> Don't believe me look at the second patch Over Tomo's cleanup.
>> If I define SCSI_MAX_SG_SEGMENTS to 127 it will crash even when I
>> did in code:
>> 	blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
>> I suppose someone is looking at a different definition. Or there is
>> another call I need to do for this to work.
> 
> It would really help us if you showed the actual code for what you did
> and where ... if this is wrong, we have bigger problems that quibbling
> about bidirectional slab sizes.  The correct way to adjust this limit
> artificially to 127 is:
> 
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 1f5a07b..4a27841 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1576,7 +1576,7 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
>  		return NULL;
>  
>  	blk_queue_max_hw_segments(q, shost->sg_tablesize);
> -	blk_queue_max_phys_segments(q, SCSI_MAX_PHYS_SEGMENTS);
> +	blk_queue_max_phys_segments(q, 127);
>  	blk_queue_max_sectors(q, shost->max_sectors);
>  	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
>  	blk_queue_segment_boundary(q, shost->dma_boundary);
> 
> (It doesn't alter the allocation pools or anything else, just limits the
> max phys segments of the queue).  The way to check that this limit is
> being honoured is:
> 
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 1f5a07b..ae42e4d 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1000,6 +1000,7 @@ static int scsi_init_io(struct scsi_cmnd *cmd)
>  	 * kmapping pages)
>  	 */
>  	cmd->use_sg = req->nr_phys_segments;
> +	WARN_ON(req->nr_phys_segments > 127);
>  
>  	/*
>  	 * If sg table allocation fails, requeue request later.
> 
> James
> 
> 
Yes, Tomo found it at ata_scsi_slave_config(). Attached below is the way I
fixed it. Now it works with 127.

(By the way Tomo, a printk like you did in scsi_init_io and
scsi_free_sgtable, 2 for every IO, or even 1 for every IO, will make
a SCSI booting PC like mine almost un-usable. Think of printk going
to log-file doing a printk...)

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index dd81fa7..de8c796 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -800,7 +800,9 @@ int ata_scsi_slave_config(struct scsi_device *sdev)

 	ata_scsi_sdev_config(sdev);

-	blk_queue_max_phys_segments(sdev->request_queue, LIBATA_MAX_PRD);
+	if ( sdev->request_queue->max_phys_segments > LIBATA_MAX_PRD )
+		blk_queue_max_phys_segments(sdev->request_queue,
+		                                       LIBATA_MAX_PRD);

 	sdev->manage_start_stop = 1;



^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-17 14:00                     ` Boaz Harrosh
@ 2007-05-17 14:11                       ` FUJITA Tomonori
  2007-05-17 15:49                         ` Boaz Harrosh
  0 siblings, 1 reply; 43+ messages in thread
From: FUJITA Tomonori @ 2007-05-17 14:11 UTC (permalink / raw)
  To: bharrosh
  Cc: James.Bottomley, fujita.tomonori, jens.axboe, linux-scsi, bhalevy,
	hch, akpm, michaelc

From: Boaz Harrosh <bharrosh@panasas.com>
Subject: Re: [PATCH v2] add bidi support for block pc requests
Date: Thu, 17 May 2007 17:00:21 +0300

> James Bottomley wrote:
> > On Thu, 2007-05-17 at 11:49 +0300, Boaz Harrosh wrote:
> >> These are regular fs (ext3) requests during bootup. The machine will not
> >> boot. (Usually from the read ahead code)
> >> Don't believe me look at the second patch Over Tomo's cleanup.
> >> If I define SCSI_MAX_SG_SEGMENTS to 127 it will crash even when I
> >> did in code:
> >> 	blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
> >> I suppose someone is looking at a different definition. Or there is
> >> another call I need to do for this to work.
> > 
> > It would really help us if you showed the actual code for what you did
> > and where ... if this is wrong, we have bigger problems that quibbling
> > about bidirectional slab sizes.  The correct way to adjust this limit
> > artificially to 127 is:
> > 
> > diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> > index 1f5a07b..4a27841 100644
> > --- a/drivers/scsi/scsi_lib.c
> > +++ b/drivers/scsi/scsi_lib.c
> > @@ -1576,7 +1576,7 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
> >  		return NULL;
> >  
> >  	blk_queue_max_hw_segments(q, shost->sg_tablesize);
> > -	blk_queue_max_phys_segments(q, SCSI_MAX_PHYS_SEGMENTS);
> > +	blk_queue_max_phys_segments(q, 127);
> >  	blk_queue_max_sectors(q, shost->max_sectors);
> >  	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
> >  	blk_queue_segment_boundary(q, shost->dma_boundary);
> > 
> > (It doesn't alter the allocation pools or anything else, just limits the
> > max phys segments of the queue).  The way to check that this limit is
> > being honoured is:
> > 
> > diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> > index 1f5a07b..ae42e4d 100644
> > --- a/drivers/scsi/scsi_lib.c
> > +++ b/drivers/scsi/scsi_lib.c
> > @@ -1000,6 +1000,7 @@ static int scsi_init_io(struct scsi_cmnd *cmd)
> >  	 * kmapping pages)
> >  	 */
> >  	cmd->use_sg = req->nr_phys_segments;
> > +	WARN_ON(req->nr_phys_segments > 127);
> >  
> >  	/*
> >  	 * If sg table allocation fails, requeue request later.
> > 
> > James
> > 
> > 
> Yes Tomo found it at ata_scsi_slave_config(). Attached below the way I
> fixed it. Now it works with 127.

I think that we can just remove blk_queue_max_phys_segments since the
ata drivers seem to set sg_tablesize to LIBATA_MAX_PRD.


> (By the way Tomo, a printk like you did in scsi_init_io and
> scsi_free_sgtable, 2 for every IO, or even 1 for every IO, will make
> a SCSI booting PC like mine almost un-usable. Think of printk going
> to log-file doing a printk...)

Oops, I forgot to remove it since my SCSI booting pSeries box works
with that. :)

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-17 14:11                       ` FUJITA Tomonori
@ 2007-05-17 15:49                         ` Boaz Harrosh
  2007-06-01 20:21                           ` Jeff Garzik
  0 siblings, 1 reply; 43+ messages in thread
From: Boaz Harrosh @ 2007-05-17 15:49 UTC (permalink / raw)
  To: FUJITA Tomonori
  Cc: James.Bottomley, jens.axboe, linux-scsi, bhalevy, hch, akpm,
	michaelc, Jeff Garzik

FUJITA Tomonori wrote:
> From: Boaz Harrosh <bharrosh@panasas.com>
> Subject: Re: [PATCH v2] add bidi support for block pc requests
> Date: Thu, 17 May 2007 17:00:21 +0300
> 
>> Yes Tomo found it at ata_scsi_slave_config(). Attached below the way I
>> fixed it. Now it works with 127.
> 
> I think that we can just remove blk_queue_max_phys_segments since the
> ata drivers seem to set sg_tablesize to LIBATA_MAX_PRD.
> 

Who should send a patch upstream? (I cc'ed Jeff Garzik)

Boaz

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index dd81fa7..3660f3e 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -800,8 +800,6 @@ int ata_scsi_slave_config(struct scsi_device *sdev)

 	ata_scsi_sdev_config(sdev);

-	blk_queue_max_phys_segments(sdev->request_queue, LIBATA_MAX_PRD);
-
 	sdev->manage_start_stop = 1;

 	if (dev)


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-17 11:12                   ` FUJITA Tomonori
@ 2007-05-17 17:37                     ` Benny Halevy
  2007-05-24 16:37                     ` Boaz Harrosh
  1 sibling, 0 replies; 43+ messages in thread
From: Benny Halevy @ 2007-05-17 17:37 UTC (permalink / raw)
  To: FUJITA Tomonori
  Cc: bharrosh, jens.axboe, James.Bottomley, linux-scsi, hch, akpm,
	michaelc

FUJITA Tomonori wrote:
> From: Boaz Harrosh <bharrosh@panasas.com>
> Subject: Re: [PATCH v2] add bidi support for block pc requests
> Date: Thu, 17 May 2007 11:49:37 +0300
> 
>> FUJITA Tomonori wrote:
>>> From: Jens Axboe <jens.axboe@oracle.com>
>>> Subject: Re: [PATCH v2] add bidi support for block pc requests
>>> Date: Thu, 17 May 2007 07:48:13 +0200
>>>
>>>> On Thu, May 17 2007, FUJITA Tomonori wrote:
>>>>> From: Jens Axboe <jens.axboe@oracle.com>
>>>>> Subject: Re: [PATCH v2] add bidi support for block pc requests
>>>>> Date: Wed, 16 May 2007 19:53:22 +0200
>>>>>
>>>>>> On Wed, May 16 2007, Boaz Harrosh wrote:
>>>>>>> now there are 2 issues with this:
>>>>>>> 1. Even if I do blk_queue_max_phys_segments(q, 127); I still get
>>>>>>>    requests for use_sg=128 which will crash the kernel.
>>>>>> That sounds like a serious issue, it should definitely not happen. Stuff
>>>>>> like that would bite other drivers as well, are you absolutely sure that
>>>>>> is happening? Power-of-2 bug in your code, or in the SCSI code?
>>>>> Boaz, how do you send requests to the scsi-ml, via fs, sg, or bsg?
>> These are regular fs (ext3) requests during bootup. The machine will not
>> boot. (Usually from the read ahead code)
>> Don't believe me look at the second patch Over Tomo's cleanup.
>> If I define SCSI_MAX_SG_SEGMENTS to 127 it will crash even when I
>> did in code:
>> 	blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
>> I suppose someone is looking at a different definition. Or there is
>> another call I need to do for this to work.
> 
> I modified your patch a bit (sgtable allocation) and set
> SCSI_MAX_SG_SEGMENTS to 127. My ppc64 box seems to work (with
> iscsi_tcp and ipr drivers).
> 
> iscsi_tcp sets sg_tablesize to 255, but blk_queue_max_phys_segments
> seems to work.
> 
> One thing that I found is:
> 
> +#define scsi_resid(cmd) ((cmd)->sg_table->resid)
> 
> 
> This doesn't work for some drivers (at least ipr) since they set
> cmd->resid even with commands without data transfer.

Hmm, since we need a residual count also for the bidi_in transfer
this problem is another reason for having the scsi_cmd_buff in struct
scsi_cmnd, allocate another one from a pool for the bidi case,
and point to the sglist in both cases rather than having a sg_table
header allocated along with the sg list.
Alternatively, having a pool for the no-data case might be another
possible solution, though it seems a bit odd to me.

<snip>

> -void scsi_free_sgtable(struct scatterlist *sgl, int index)
> +static struct scsi_sg_table *scsi_alloc_sgtable(int nseg, gfp_t gfp_mask)
>  {
>  	struct scsi_host_sg_pool *sgp;
> +	struct scsi_sg_table *sgt;
> +	unsigned int idx;
>  
> -	BUG_ON(index >= SG_MEMPOOL_NR);
> +	for (idx = 0; idx < SG_MEMPOOL_NR; idx++)
> +		if (scsi_sg_pools[idx].size >= nseg)
> +			goto found;

Tomo, I prefer the loop to be based on calculation as follows rather
than scanning the scsi_sg_pools table in order to minimize memory access
(and thrashing of the cpu data cache - each scsi_host_sg_pool takes a cache
row on x86_64)

+	for (i = 0, size = 8; i < SG_MEMPOOL_NR-1; i++, size <<= 1)
+		if (size > nents)
+			return i;

Benny

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-17 11:12                   ` FUJITA Tomonori
  2007-05-17 17:37                     ` Benny Halevy
@ 2007-05-24 16:37                     ` Boaz Harrosh
  2007-05-24 16:46                       ` Boaz Harrosh
  2007-05-24 16:47                       ` FUJITA Tomonori
  1 sibling, 2 replies; 43+ messages in thread
From: Boaz Harrosh @ 2007-05-24 16:37 UTC (permalink / raw)
  To: FUJITA Tomonori, James.Bottomley
  Cc: jens.axboe, linux-scsi, bhalevy, hch, akpm, michaelc

FUJITA Tomonori wrote:
>> FUJITA Tomonori wrote:
> 
> One thing that I found is:
> 
> +#define scsi_resid(cmd) ((cmd)->sg_table->resid)
> 
> 
> This doesn't work for some drivers (at least ipr) since they set
> cmd->resid even with commands without data transfer.
> 

James, Tomo.

the last accessor:
+#define scsi_resid(cmd) ((cmd)->resid)

used as an l-value in drivers does not serve our purpose, as seen by the test
implementation of scsi_sg_table. Now clearly this needs an accessor and it is a
bidi parameter (need 2 of them).

I would suggest doing a set_ inline accessor and using that in drivers:
+static inline void scsi_set_resid(struct scsi_cmnd *cmd, int resid)
+{
+	cmd->resid = resid;
+}

I would even suggest to make all accessors inlines and not macros just to make sure
they are not used as l-value and modified. Though I have not seen such use in
Tomo's patchset.

example:
+static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd)
+{
+	return cmd->request_bufflen;
+}

Boaz

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-24 16:37                     ` Boaz Harrosh
@ 2007-05-24 16:46                       ` Boaz Harrosh
  2007-05-24 16:47                       ` FUJITA Tomonori
  1 sibling, 0 replies; 43+ messages in thread
From: Boaz Harrosh @ 2007-05-24 16:46 UTC (permalink / raw)
  To: FUJITA Tomonori
  Cc: James.Bottomley, jens.axboe, linux-scsi, bhalevy, hch, akpm,
	michaelc

Boaz Harrosh wrote:
> FUJITA Tomonori wrote:
>>> FUJITA Tomonori wrote:
>> One thing that I found is:
>>
>> +#define scsi_resid(cmd) ((cmd)->sg_table->resid)
>>
>>
>> This doesn't work for some drivers (at least ipr) since they set
>> cmd->resid even with commands without data transfer.
>>
> 
> James, Tomo.
> 
> the last accessor:
> +#define scsi_resid(cmd) ((cmd)->resid)
> 
> used as an l-value in drivers does not serve our purpose, as seen by the test
> implementation of scsi_sg_table. Now clearly this needs an accessor and it is a
> bidi parameter (need 2 of them).
> 
> I would suggest doing a set_ inline accessor and using that in drivers:
> +static inline void scsi_set_resid(struct scsi_cmnd *cmd, resid)
> +{
> +	cmd->resid = resid;
> +}
> 
> I would even suggest to make all accessors inlines and not macros just to make sure
> they are not used as l-value and modified. Though I have not seen such use in
> Tomo's patchset.
> 
> example:
> +static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd)
> +{
> +	return cmd->request_bufflen;
> +}
> 
> Boaz
> -

I forgot to say: Tomo, I have started converting to the above, based on your
cleanup git-tree. Do you want me to send a patch?
(One thing I still don't have a good solution for: when having a git-tree like that
with a long patchset, how do you fix/update individual patches?)

Boaz


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-24 16:37                     ` Boaz Harrosh
  2007-05-24 16:46                       ` Boaz Harrosh
@ 2007-05-24 16:47                       ` FUJITA Tomonori
  2007-05-24 16:59                         ` Boaz Harrosh
  1 sibling, 1 reply; 43+ messages in thread
From: FUJITA Tomonori @ 2007-05-24 16:47 UTC (permalink / raw)
  To: bharrosh
  Cc: fujita.tomonori, James.Bottomley, jens.axboe, linux-scsi, bhalevy,
	hch, akpm, michaelc

From: Boaz Harrosh <bharrosh@panasas.com>
Subject: Re: [PATCH v2] add bidi support for block pc requests
Date: Thu, 24 May 2007 19:37:06 +0300

> FUJITA Tomonori wrote:
> >> FUJITA Tomonori wrote:
> > 
> > One thing that I found is:
> > 
> > +#define scsi_resid(cmd) ((cmd)->sg_table->resid)
> > 
> > 
> > This doesn't work for some drivers (at least ipr) since they set
> > cmd->resid even with commands without data transfer.
> > 
> 
> James, Tomo.
> 
> the last accessor:
> +#define scsi_resid(cmd) ((cmd)->resid)
> 
> used as an l-value in drivers does not serve our purpose, as seen by the test
> implementation of scsi_sg_table. Now clearly this needs an accessor and it is a
> bidi parameter (need 2 of them).

I thought that it would be better to fix several drivers (less than 10).

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-24 16:47                       ` FUJITA Tomonori
@ 2007-05-24 16:59                         ` Boaz Harrosh
  0 siblings, 0 replies; 43+ messages in thread
From: Boaz Harrosh @ 2007-05-24 16:59 UTC (permalink / raw)
  To: FUJITA Tomonori
  Cc: James.Bottomley, jens.axboe, linux-scsi, bhalevy, hch, akpm,
	michaelc

FUJITA Tomonori wrote:
> From: Boaz Harrosh <bharrosh@panasas.com>
> Subject: Re: [PATCH v2] add bidi support for block pc requests
> Date: Thu, 24 May 2007 19:37:06 +0300
> 
>> FUJITA Tomonori wrote:
>>>> FUJITA Tomonori wrote:
>>> One thing that I found is:
>>>
>>> +#define scsi_resid(cmd) ((cmd)->sg_table->resid)
>>>
>>>
>>> This doesn't work for some drivers (at least ipr) since they set
>>> cmd->resid even with commands without data transfer.
>>>
>> James, Tomo.
>>
>> the last accessor:
>> +#define scsi_resid(cmd) ((cmd)->resid)
>>
>> used as an l-value in drivers does not serve our purpose, as seen by the test
>> implementation of scsi_sg_table. Now clearly this needs an accessor and it is a
>> bidi parameter (need 2 of them).
> 
> I thought that it would be better to fix several drivers (less than 10).

I prefer inlines.

One - the programmer cannot make mistakes. Why give him the freedom to do
something he must not do?

two - if all/most drivers are doing:
if (scsi_sgl(cmd))
	scsi_resid(cmd) = 0;

Then will it not be better to do the if() inside the API?

Boaz


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-05-17 15:49                         ` Boaz Harrosh
@ 2007-06-01 20:21                           ` Jeff Garzik
  2007-06-03  7:45                             ` Boaz Harrosh
  0 siblings, 1 reply; 43+ messages in thread
From: Jeff Garzik @ 2007-06-01 20:21 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: FUJITA Tomonori, James.Bottomley, jens.axboe, linux-scsi, bhalevy,
	hch, akpm, michaelc

Boaz Harrosh wrote:
> FUJITA Tomonori wrote:
>> From: Boaz Harrosh <bharrosh@panasas.com>
>> Subject: Re: [PATCH v2] add bidi support for block pc requests
>> Date: Thu, 17 May 2007 17:00:21 +0300
>>
>>> Yes Tomo found it at ata_scsi_slave_config(). Attached below the way I
>>> fixed it. Now it works with 127.
>> I think that we can just remove blk_queue_max_phys_segments since the
>> ata drivers seem to set sg_tablesize to LIBATA_MAX_PRD.
>>
> 
> Who should send a patch upstream? (I cc'ed Jeff Garzik)
> 
> Boaz
> 
> diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
> index dd81fa7..3660f3e 100644
> --- a/drivers/ata/libata-scsi.c
> +++ b/drivers/ata/libata-scsi.c
> @@ -800,8 +800,6 @@ int ata_scsi_slave_config(struct scsi_device *sdev)
> 
>  	ata_scsi_sdev_config(sdev);
> 
> -	blk_queue_max_phys_segments(sdev->request_queue, LIBATA_MAX_PRD);
> -
>  	sdev->manage_start_stop = 1;

I don't mind the patch, but could someone refresh me as to the context?

Is there something wrong with the above code, or is it simply redundant 
to the scsi_host_template settings in each LLDD?

	Jeff




^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-06-01 20:21                           ` Jeff Garzik
@ 2007-06-03  7:45                             ` Boaz Harrosh
  2007-06-03 13:17                               ` James Bottomley
  0 siblings, 1 reply; 43+ messages in thread
From: Boaz Harrosh @ 2007-06-03  7:45 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: FUJITA Tomonori, James.Bottomley, jens.axboe, linux-scsi, bhalevy,
	hch, akpm, michaelc

Jeff Garzik wrote:
> Boaz Harrosh wrote:
>> FUJITA Tomonori wrote:
>>> From: Boaz Harrosh <bharrosh@panasas.com>
>>> Subject: Re: [PATCH v2] add bidi support for block pc requests
>>> Date: Thu, 17 May 2007 17:00:21 +0300
>>>
>>>> Yes Tomo found it at ata_scsi_slave_config(). Attached below the way I
>>>> fixed it. Now it works with 127.
>>> I think that we can just remove blk_queue_max_phys_segments since the
>>> ata drivers seem to set sg_tablesize to LIBATA_MAX_PRD.
>>>
>> Who should send a patch upstream? (I cc'ed Jeff Garzik)
>>
>> Boaz
>>
>> diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
>> index dd81fa7..3660f3e 100644
>> --- a/drivers/ata/libata-scsi.c
>> +++ b/drivers/ata/libata-scsi.c
>> @@ -800,8 +800,6 @@ int ata_scsi_slave_config(struct scsi_device *sdev)
>>
>>  	ata_scsi_sdev_config(sdev);
>>
>> -	blk_queue_max_phys_segments(sdev->request_queue, LIBATA_MAX_PRD);
>> -
>>  	sdev->manage_start_stop = 1;
> 
> I don't mind the patch, but could someone refresh me as to the context?
> 
> Is there something wrong with the above code, or is it simply redundant 
> to the scsi_host_template settings in each LLDD?
> 
> 	Jeff
> 
> 
> 

Hi Jeff
What happens is that if SCSI-ml sets a higher value than LIBATA_MAX_PRD (=128)
then everything is OK and libata-scsi will only see its LIBATA_MAX_PRD. But what
happens if SCSI-ml sets a lower value? It will then crash on an unexpectedly high sg
count. My first patch was an "if >" but Tomo said that it is redundant since
drivers do that already. So I guess it is your call. Can it be removed or do we need
a: if ( sdev->request_queue->max_phys_segments > LIBATA_MAX_PRD)

	Boaz




^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-06-03  7:45                             ` Boaz Harrosh
@ 2007-06-03 13:17                               ` James Bottomley
  2007-07-07 15:27                                 ` Jeff Garzik
  0 siblings, 1 reply; 43+ messages in thread
From: James Bottomley @ 2007-06-03 13:17 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Jeff Garzik, FUJITA Tomonori, jens.axboe, linux-scsi, bhalevy,
	hch, akpm, michaelc

On Sun, 2007-06-03 at 10:45 +0300, Boaz Harrosh wrote:
> Jeff Garzik wrote:
> > Boaz Harrosh wrote:
> >> FUJITA Tomonori wrote:
> >>> From: Boaz Harrosh <bharrosh@panasas.com>
> >>> Subject: Re: [PATCH v2] add bidi support for block pc requests
> >>> Date: Thu, 17 May 2007 17:00:21 +0300
> >>>
> >>>> Yes Tomo found it at ata_scsi_slave_config(). Attached below the way I
> >>>> fixed it. Now it works with 127.
> >>> I think that we can just remove blk_queue_max_phys_segments since the
> >>> ata drivers seem to set sg_tablesize to LIBATA_MAX_PRD.
> >>>
> >> Who should send a patch upstream? (I cc'ed Jeff Garzik)
> >>
> >> Boaz
> >>
> >> diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
> >> index dd81fa7..3660f3e 100644
> >> --- a/drivers/ata/libata-scsi.c
> >> +++ b/drivers/ata/libata-scsi.c
> >> @@ -800,8 +800,6 @@ int ata_scsi_slave_config(struct scsi_device *sdev)
> >>
> >>  	ata_scsi_sdev_config(sdev);
> >>
> >> -	blk_queue_max_phys_segments(sdev->request_queue, LIBATA_MAX_PRD);
> >> -
> >>  	sdev->manage_start_stop = 1;
> > 
> > I don't mind the patch, but could someone refresh me as to the context?
> > 
> > Is there something wrong with the above code, or is it simply redundant 
> > to the scsi_host_template settings in each LLDD?
> > 
> > 	Jeff
> > 
> > 
> > 
> 
> Hi Jeff
> What happens is that if SCSI-ml sets an higher value than LIBATA_MAX_PRD (=128)
> than every thing is OK and libata-scsi will only see its LIBATA_MAX_PRD. But what
> happens if SCSI-ml sets a lower value? It will than crash on unexpected high sg
> count. My first Patch was an "if >" but Tomo said that it is redundant since
> drivers do that already. So I guess it is your call. Can it be removed or we need
> a: if ( sdev->request_queue->max_phys_segments > LIBATA_MAX_PRD)

Ordinarily, in SCSI, phys_segments is the wrong thing for a driver to
set because what a driver wants to see is what comes out the other side
of dma_map_sg() (which is controlled by hw_segments) not what goes in.
However, I could see libata having an interest in the phys_segments if
the physical region descriptors are going to the device via PIO ... is
that what this is for?

James



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-06-03 13:17                               ` James Bottomley
@ 2007-07-07 15:27                                 ` Jeff Garzik
  2007-07-07 15:42                                   ` James Bottomley
  0 siblings, 1 reply; 43+ messages in thread
From: Jeff Garzik @ 2007-07-07 15:27 UTC (permalink / raw)
  To: James Bottomley
  Cc: Boaz Harrosh, FUJITA Tomonori, jens.axboe, linux-scsi, bhalevy,
	hch, akpm, michaelc

James Bottomley wrote:
> On Sun, 2007-06-03 at 10:45 +0300, Boaz Harrosh wrote:
>> Jeff Garzik wrote:
>>> Boaz Harrosh wrote:
>>>> FUJITA Tomonori wrote:
>>>>> From: Boaz Harrosh <bharrosh@panasas.com>
>>>>> Subject: Re: [PATCH v2] add bidi support for block pc requests
>>>>> Date: Thu, 17 May 2007 17:00:21 +0300
>>>>>
>>>>>> Yes Tomo found it at ata_scsi_slave_config(). Attached below the way I
>>>>>> fixed it. Now it works with 127.
>>>>> I think that we can just remove blk_queue_max_phys_segments since the
>>>>> ata drivers seem to set sg_tablesize to LIBATA_MAX_PRD.
>>>>>
>>>> Who should send a patch upstream? (I cc'ed Jeff Garzik)
>>>>
>>>> Boaz
>>>>
>>>> diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
>>>> index dd81fa7..3660f3e 100644
>>>> --- a/drivers/ata/libata-scsi.c
>>>> +++ b/drivers/ata/libata-scsi.c
>>>> @@ -800,8 +800,6 @@ int ata_scsi_slave_config(struct scsi_device *sdev)
>>>>
>>>>  	ata_scsi_sdev_config(sdev);
>>>>
>>>> -	blk_queue_max_phys_segments(sdev->request_queue, LIBATA_MAX_PRD);
>>>> -
>>>>  	sdev->manage_start_stop = 1;
>>> I don't mind the patch, but could someone refresh me as to the context?
>>>
>>> Is there something wrong with the above code, or is it simply redundant 
>>> to the scsi_host_template settings in each LLDD?
>>>
>>> 	Jeff
>>>
>>>
>>>
>> Hi Jeff
>> What happens is that if SCSI-ml sets an higher value than LIBATA_MAX_PRD (=128)
>> than every thing is OK and libata-scsi will only see its LIBATA_MAX_PRD. But what
>> happens if SCSI-ml sets a lower value? It will than crash on unexpected high sg
>> count. My first Patch was an "if >" but Tomo said that it is redundant since
>> drivers do that already. So I guess it is your call. Can it be removed or we need
>> a: if ( sdev->request_queue->max_phys_segments > LIBATA_MAX_PRD)
> 
> Ordinarily, in SCSI, phys_segments is the wrong thing for a driver to
> set because what a driver wants to see is what comes out the other side
> of dma_map_sg() (which is controlled by hw_segments) not what goes in.
> However, I could see libata having an interest in the phys_segments if
> the physical region descriptors are going to the device via PIO ... is
> that what this is for?

LIBATA_MAX_PRD is the maximum number of DMA scatter/gather elements 
permitted by the HBA's DMA engine, for a single ATA command.

The PIO code doesn't really care about segments.

	Jeff




^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-07-07 15:27                                 ` Jeff Garzik
@ 2007-07-07 15:42                                   ` James Bottomley
  2007-07-07 16:59                                     ` Jeff Garzik
  0 siblings, 1 reply; 43+ messages in thread
From: James Bottomley @ 2007-07-07 15:42 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Boaz Harrosh, FUJITA Tomonori, jens.axboe, linux-scsi, bhalevy,
	hch, akpm, michaelc

On Sat, 2007-07-07 at 11:27 -0400, Jeff Garzik wrote:
> James Bottomley wrote:
> > On Sun, 2007-06-03 at 10:45 +0300, Boaz Harrosh wrote:
> >> Jeff Garzik wrote:
> >>> Boaz Harrosh wrote:
> >>>> FUJITA Tomonori wrote:
> >>>>> From: Boaz Harrosh <bharrosh@panasas.com>
> >>>>> Subject: Re: [PATCH v2] add bidi support for block pc requests
> >>>>> Date: Thu, 17 May 2007 17:00:21 +0300
> >>>>>
> >>>>>> Yes Tomo found it at ata_scsi_slave_config(). Attached below the way I
> >>>>>> fixed it. Now it works with 127.
> >>>>> I think that we can just remove blk_queue_max_phys_segments since the
> >>>>> ata drivers seem to set sg_tablesize to LIBATA_MAX_PRD.
> >>>>>
> >>>> Who should send a patch upstream? (I cc'ed Jeff Garzik)
> >>>>
> >>>> Boaz
> >>>>
> >>>> diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
> >>>> index dd81fa7..3660f3e 100644
> >>>> --- a/drivers/ata/libata-scsi.c
> >>>> +++ b/drivers/ata/libata-scsi.c
> >>>> @@ -800,8 +800,6 @@ int ata_scsi_slave_config(struct scsi_device *sdev)
> >>>>
> >>>>  	ata_scsi_sdev_config(sdev);
> >>>>
> >>>> -	blk_queue_max_phys_segments(sdev->request_queue, LIBATA_MAX_PRD);
> >>>> -
> >>>>  	sdev->manage_start_stop = 1;
> >>> I don't mind the patch, but could someone refresh me as to the context?
> >>>
> >>> Is there something wrong with the above code, or is it simply redundant 
> >>> to the scsi_host_template settings in each LLDD?
> >>>
> >>> 	Jeff
> >>>
> >>>
> >>>
> >> Hi Jeff
> >> What happens is that if SCSI-ml sets an higher value than LIBATA_MAX_PRD (=128)
> >> than every thing is OK and libata-scsi will only see its LIBATA_MAX_PRD. But what
> >> happens if SCSI-ml sets a lower value? It will than crash on unexpected high sg
> >> count. My first Patch was an "if >" but Tomo said that it is redundant since
> >> drivers do that already. So I guess it is your call. Can it be removed or we need
> >> a: if ( sdev->request_queue->max_phys_segments > LIBATA_MAX_PRD)
> > 
> > Ordinarily, in SCSI, phys_segments is the wrong thing for a driver to
> > set because what a driver wants to see is what comes out the other side
> > of dma_map_sg() (which is controlled by hw_segments) not what goes in.
> > However, I could see libata having an interest in the phys_segments if
> > the physical region descriptors are going to the device via PIO ... is
> > that what this is for?
> 
> LIBATA_MAX_PRD is the maximum number of DMA scatter/gather elements 
> permitted by the HBA's DMA engine, for a single ATA command.

Then it's the wrong parameter you're setting:  phys_segments is what you
have going into a dma_map_sg() but hw_segments is what you end up after
it (mathematically, the mapping never splits segments, so hw_segments <=
phys_segments always, but it's still the wrong way to bound this); so if
you want to limit the SG elements in the HBA, you should set hw_segments
not phys_segments (which is what the sg_tablesize parameter of the host
structure corresponds to).

However, I suspect this has to do with our iommu problem, doesn't it?
The IOMMU may merge the segments beyond your 64k DMA boundary, so you
need to split them again ... in that case you need phys_segments
bounded, not hw_segments?  I really wish we could get this iommu
parameter problem fixed so we didn't have to have all of this confusing
code.

> The PIO code doesn't really care about segments.

OK ... I just thought it was PIO because the only time you should worry
about phys_segments is if you're not going to do a dma_map_sg() on the
scatterlist.

James



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2] add bidi support for block pc requests
  2007-07-07 15:42                                   ` James Bottomley
@ 2007-07-07 16:59                                     ` Jeff Garzik
  0 siblings, 0 replies; 43+ messages in thread
From: Jeff Garzik @ 2007-07-07 16:59 UTC (permalink / raw)
  To: James Bottomley
  Cc: Boaz Harrosh, FUJITA Tomonori, jens.axboe, linux-scsi, bhalevy,
	hch, akpm, michaelc

James Bottomley wrote:
> On Sat, 2007-07-07 at 11:27 -0400, Jeff Garzik wrote:
>> LIBATA_MAX_PRD is the maximum number of DMA scatter/gather elements 
>> permitted by the HBA's DMA engine, for a single ATA command.

> Then it's the wrong parameter you're setting:  phys_segments is what you
> have going into a dma_map_sg() but hw_segments is what you end up after
> it (mathematically, the mapping never splits segments, so hw_segments <=
> phys_segments always, but it's still the wrong way to bound this); so if
> you want to limit the SG elements in the HBA, you should set hw_segments
> not phys_segments (which is what the sg_tablesize parameter of the host
> structure corresponds to).

Honestly I'm betting the code is a result of paranoia^H^H^Hconservatism 
on my part:  setting every DMA and segment limit I can find, in the 
hopes that somehow, somewhere, that information will get filtered down 
to the IOMMU and whatnot.  :)

Given what you say above, and the fact that I set sg_tablesize, 
presumably Boaz's patch to remove phys_segment limiting in libata-scsi 
is the right thing to do.  Your knowledge in this area is most likely 
stronger than mine.


> However, I suspect this has to do with our iommu problem, doesn't it?
> The IOMMU may merge the segments beyond your 64k DMA boundary, so you
> need to split them again ... in that case you need phys_segments
> bounded, not hw_segments?

This is why I set every segment limitation I could find :)  And then Ben 
tells me IOMMU may ignore all of that anyway, and so I have code to 
split inside libata as well :)

I just know what IDE needs:  S/G ent shouldn't cross 64k boundary, nor 
exceed 64k in size.  The maximum number of S/G ents is actually a guess. 
  Hard data is tough to come by.  Some DMA engines will cycle through 
all of memory until they hit a stop bit.  Others have limits yet to be 
discovered.  :)

	Jeff



^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2007-07-07 16:59 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-05-08  2:25 [PATCH v2] add bidi support for block pc requests FUJITA Tomonori
2007-05-08 18:53 ` Boaz Harrosh
2007-05-08 20:01   ` James Bottomley
2007-05-09  7:46     ` Boaz Harrosh
2007-05-09 10:52       ` FUJITA Tomonori
2007-05-09 13:58         ` Boaz Harrosh
2007-05-09 14:54           ` FUJITA Tomonori
2007-05-09 14:55           ` James Bottomley
2007-05-09 16:54             ` Boaz Harrosh
2007-05-10  6:53               ` FUJITA Tomonori
2007-05-10  7:45                 ` FUJITA Tomonori
2007-05-10 12:37                   ` Boaz Harrosh
2007-05-10 13:01                     ` FUJITA Tomonori
2007-05-10 15:10                     ` Douglas Gilbert
2007-05-10 15:48                       ` Boaz Harrosh
2007-05-11 16:26                       ` James Bottomley
2007-05-16 17:29       ` Boaz Harrosh
2007-05-16 17:53         ` Jens Axboe
2007-05-16 18:06           ` James Bottomley
2007-05-16 18:13             ` Jens Axboe
2007-05-17  8:46               ` Boaz Harrosh
2007-05-17  2:57           ` FUJITA Tomonori
2007-05-17  5:48             ` Jens Axboe
2007-05-17  5:59               ` FUJITA Tomonori
2007-05-17  8:49                 ` Boaz Harrosh
2007-05-17 11:12                   ` FUJITA Tomonori
2007-05-17 17:37                     ` Benny Halevy
2007-05-24 16:37                     ` Boaz Harrosh
2007-05-24 16:46                       ` Boaz Harrosh
2007-05-24 16:47                       ` FUJITA Tomonori
2007-05-24 16:59                         ` Boaz Harrosh
2007-05-17 11:29                   ` FUJITA Tomonori
2007-05-17 13:27                   ` James Bottomley
2007-05-17 14:00                     ` Boaz Harrosh
2007-05-17 14:11                       ` FUJITA Tomonori
2007-05-17 15:49                         ` Boaz Harrosh
2007-06-01 20:21                           ` Jeff Garzik
2007-06-03  7:45                             ` Boaz Harrosh
2007-06-03 13:17                               ` James Bottomley
2007-07-07 15:27                                 ` Jeff Garzik
2007-07-07 15:42                                   ` James Bottomley
2007-07-07 16:59                                     ` Jeff Garzik
2007-05-09 10:49     ` FUJITA Tomonori

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).