* [PATCH V4] blk-mq: introduce BLK_STS_DEV_RESOURCE
@ 2018-01-29 20:33 Mike Snitzer
2018-01-29 21:22 ` Bart Van Assche
0 siblings, 1 reply; 11+ messages in thread
From: Mike Snitzer @ 2018-01-29 20:33 UTC (permalink / raw)
To: axboe; +Cc: linux-block, dm-devel, linux-scsi, linux-nvme
From: Ming Lei <ming.lei@redhat.com>
This status is returned from driver to block layer if device related
resource is unavailable, but driver can guarantee that IO dispatch
will be triggered in future when the resource is available.
Convert some drivers to return BLK_STS_DEV_RESOURCE. Also, if driver
returns BLK_STS_RESOURCE and SCHED_RESTART is set, rerun queue after
a delay (BLK_MQ_QUEUE_DELAY) to avoid IO stalls. BLK_MQ_QUEUE_DELAY is
3 ms because both scsi-mq and nvmefc are using that magic value.
Suggested-by: Jens Axboe <axboe@kernel.dk>
Tested-by: Laurence Oberman <loberman@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
V4:
- cleanup header and code comments
- rerun queue after BLK_MQ_QUEUE_DELAY (3ms) instead of 10ms
- eliminate nvmefc's queue rerun now that blk-mq does it
V3:
- fix typo, and improve documentation
- add tested-by tag
V2:
- add comments on the new introduced status
- patch style fix
- both are suggested by Christoph
block/blk-core.c | 1 +
block/blk-mq.c | 20 ++++++++++++++++----
drivers/block/null_blk.c | 2 +-
drivers/block/virtio_blk.c | 2 +-
drivers/block/xen-blkfront.c | 2 +-
drivers/md/dm-rq.c | 5 ++---
drivers/nvme/host/fc.c | 12 ++----------
drivers/scsi/scsi_lib.c | 6 +++---
include/linux/blk_types.h | 14 ++++++++++++++
9 files changed, 41 insertions(+), 23 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index cdae69be68e9..38279d4ae08b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -145,6 +145,7 @@ static const struct {
[BLK_STS_MEDIUM] = { -ENODATA, "critical medium" },
[BLK_STS_PROTECTION] = { -EILSEQ, "protection" },
[BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
+ [BLK_STS_DEV_RESOURCE] = { -ENOMEM, "device resource" },
[BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
/* device mapper special case, should not leak out: */
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 43e7449723e0..dd097ca5f1e9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1160,6 +1160,8 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
return true;
}
+#define BLK_MQ_QUEUE_DELAY 3 /* ms units */
+
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bool got_budget)
{
@@ -1167,6 +1169,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
struct request *rq, *nxt;
bool no_tag = false;
int errors, queued;
+ blk_status_t ret = BLK_STS_OK;
if (list_empty(list))
return false;
@@ -1179,7 +1182,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
errors = queued = 0;
do {
struct blk_mq_queue_data bd;
- blk_status_t ret;
rq = list_first_entry(list, struct request, queuelist);
if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
@@ -1224,7 +1226,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
}
ret = q->mq_ops->queue_rq(hctx, &bd);
- if (ret == BLK_STS_RESOURCE) {
+ if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
/*
* If an I/O scheduler has been configured and we got a
* driver tag for the next request already, free it
@@ -1255,6 +1257,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
* that is where we will continue on next queue run.
*/
if (!list_empty(list)) {
+ bool needs_restart;
+
spin_lock(&hctx->lock);
list_splice_init(list, &hctx->dispatch);
spin_unlock(&hctx->lock);
@@ -1278,10 +1282,17 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
* - Some but not all block drivers stop a queue before
* returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
* and dm-rq.
+ *
+ * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
+ * bit is set, run queue after BLK_MQ_QUEUE_DELAY (3ms)
+ * to avoid IO stalls that could otherwise occur if the
*/
- if (!blk_mq_sched_needs_restart(hctx) ||
+ needs_restart = blk_mq_sched_needs_restart(hctx);
+ if (!needs_restart ||
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true);
+ else if (needs_restart && (ret == BLK_STS_RESOURCE))
+ blk_mq_delay_run_hw_queue(hctx, BLK_MQ_QUEUE_DELAY);
}
return (queued + errors) != 0;
@@ -1762,6 +1773,7 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
*cookie = new_cookie;
break;
case BLK_STS_RESOURCE:
+ case BLK_STS_DEV_RESOURCE:
__blk_mq_requeue_request(rq);
break;
default:
@@ -1824,7 +1836,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
hctx_lock(hctx, &srcu_idx);
ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
- if (ret == BLK_STS_RESOURCE)
+ if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
blk_mq_sched_insert_request(rq, false, true, false);
else if (ret != BLK_STS_OK)
blk_mq_end_request(rq, ret);
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 5b94e530570c..4bc25fc4e73c 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -1230,7 +1230,7 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
return BLK_STS_OK;
} else
/* requeue request */
- return BLK_STS_RESOURCE;
+ return BLK_STS_DEV_RESOURCE;
}
}
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 68846897d213..79908e6ddbf2 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -276,7 +276,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
/* Out of mem doesn't actually happen, since we fall back
* to direct descriptors */
if (err == -ENOMEM || err == -ENOSPC)
- return BLK_STS_RESOURCE;
+ return BLK_STS_DEV_RESOURCE;
return BLK_STS_IOERR;
}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 891265acb10e..e126e4cac2ca 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -911,7 +911,7 @@ static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
out_busy:
blk_mq_stop_hw_queue(hctx);
spin_unlock_irqrestore(&rinfo->ring_lock, flags);
- return BLK_STS_RESOURCE;
+ return BLK_STS_DEV_RESOURCE;
}
static void blkif_complete_rq(struct request *rq)
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index b7d175e94a02..348a0cb6963a 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -404,7 +404,7 @@ static blk_status_t dm_dispatch_clone_request(struct request *clone, struct requ
clone->start_time = jiffies;
r = blk_insert_cloned_request(clone->q, clone);
- if (r != BLK_STS_OK && r != BLK_STS_RESOURCE)
+ if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE)
/* must complete clone in terms of original request */
dm_complete_request(rq, r);
return r;
@@ -496,7 +496,7 @@ static int map_request(struct dm_rq_target_io *tio)
trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
blk_rq_pos(rq));
ret = dm_dispatch_clone_request(clone, rq);
- if (ret == BLK_STS_RESOURCE) {
+ if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
blk_rq_unprep_clone(clone);
tio->ti->type->release_clone_rq(clone);
tio->clone = NULL;
@@ -769,7 +769,6 @@ static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
/* Undo dm_start_request() before requeuing */
rq_end_stats(md, rq);
rq_completed(md, rq_data_dir(rq), false);
- blk_mq_delay_run_hw_queue(hctx, 100/*ms*/);
return BLK_STS_RESOURCE;
}
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index b76ba4629e02..54e679541ad6 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -35,8 +35,6 @@ enum nvme_fc_queue_flags {
NVME_FC_Q_LIVE,
};
-#define NVMEFC_QUEUE_DELAY 3 /* ms units */
-
#define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */
struct nvme_fc_queue {
@@ -2231,7 +2229,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
* the target device is present
*/
if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
- goto busy;
+ return BLK_STS_RESOURCE;
if (!nvme_fc_ctrl_get(ctrl))
return BLK_STS_IOERR;
@@ -2311,16 +2309,10 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
ret != -EBUSY)
return BLK_STS_IOERR;
- goto busy;
+ return BLK_STS_RESOURCE;
}
return BLK_STS_OK;
-
-busy:
- if (!(op->flags & FCOP_FLAGS_AEN) && queue->hctx)
- blk_mq_delay_run_hw_queue(queue->hctx, NVMEFC_QUEUE_DELAY);
-
- return BLK_STS_RESOURCE;
}
static inline blk_status_t nvme_fc_is_ready(struct nvme_fc_queue *queue,
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d9ca1dfab154..55be2550c555 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2030,9 +2030,9 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
case BLK_STS_OK:
break;
case BLK_STS_RESOURCE:
- if (atomic_read(&sdev->device_busy) == 0 &&
- !scsi_device_blocked(sdev))
- blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY);
+ if (atomic_read(&sdev->device_busy) ||
+ scsi_device_blocked(sdev))
+ ret = BLK_STS_DEV_RESOURCE;
break;
default:
/*
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 2d973ac54b09..6a8ad60e7f09 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -39,6 +39,20 @@ typedef u8 __bitwise blk_status_t;
#define BLK_STS_AGAIN ((__force blk_status_t)12)
+/*
+ * BLK_STS_DEV_RESOURCE is returned from driver to block layer if device
+ * related resource is unavailable, but driver can guarantee that queue
+ * will be rerun in future once the resource is available (whereby
+ * dispatching requests).
+ *
+ * Difference with BLK_STS_RESOURCE:
+ * If driver isn't sure if the queue will be rerun once device resource
+ * is made available, please return BLK_STS_RESOURCE. For example: when
+ * memory allocation, DMA Mapping or other system resource allocation
+ * fails and IO can't be submitted to device.
+ */
+#define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13)
+
/**
* blk_path_error - returns true if error may be path related
* @error: status the request was completed with
--
2.15.0
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH V4] blk-mq: introduce BLK_STS_DEV_RESOURCE
2018-01-29 20:33 [PATCH V4] blk-mq: introduce BLK_STS_DEV_RESOURCE Mike Snitzer
@ 2018-01-29 21:22 ` Bart Van Assche
2018-01-29 21:44 ` Mike Snitzer
2018-01-30 3:16 ` Ming Lei
0 siblings, 2 replies; 11+ messages in thread
From: Bart Van Assche @ 2018-01-29 21:22 UTC (permalink / raw)
To: snitzer@redhat.com, axboe@kernel.dk
Cc: dm-devel@redhat.com, linux-scsi@vger.kernel.org,
linux-block@vger.kernel.org, linux-nvme@lists.infradead.org
T24gTW9uLCAyMDE4LTAxLTI5IGF0IDE1OjMzIC0wNTAwLCBNaWtlIFNuaXR6ZXIgd3JvdGU6DQo+
ICsJCSAqIElmIGRyaXZlciByZXR1cm5zIEJMS19TVFNfUkVTT1VSQ0UgYW5kIFNDSEVEX1JFU1RB
UlQNCj4gKwkJICogYml0IGlzIHNldCwgcnVuIHF1ZXVlIGFmdGVyIDEwbXMgdG8gYXZvaWQgSU8g
c3RhbGxzDQo+ICsJCSAqIHRoYXQgY291bGQgb3RoZXJ3aXNlIG9jY3VyIGlmIHRoZSBxdWV1ZSBp
cyBpZGxlLg0KPiAgCQkgKi8NCj4gLQkJaWYgKCFibGtfbXFfc2NoZWRfbmVlZHNfcmVzdGFydCho
Y3R4KSB8fA0KPiArCQluZWVkc19yZXN0YXJ0ID0gYmxrX21xX3NjaGVkX25lZWRzX3Jlc3RhcnQo
aGN0eCk7DQo+ICsJCWlmICghbmVlZHNfcmVzdGFydCB8fA0KPiAgCQkgICAgKG5vX3RhZyAmJiBs
aXN0X2VtcHR5X2NhcmVmdWwoJmhjdHgtPmRpc3BhdGNoX3dhaXQuZW50cnkpKSkNCj4gIAkJCWJs
a19tcV9ydW5faHdfcXVldWUoaGN0eCwgdHJ1ZSk7DQo+ICsJCWVsc2UgaWYgKG5lZWRzX3Jlc3Rh
cnQgJiYgKHJldCA9PSBCTEtfU1RTX1JFU09VUkNFKSkNCj4gKwkJCWJsa19tcV9kZWxheV9ydW5f
aHdfcXVldWUoaGN0eCwgQkxLX01RX1FVRVVFX0RFTEFZKTsNCj4gIAl9DQoNClRoZSBhYm92ZSBj
b2RlIG9ubHkgY2FsbHMgYmxrX21xX2RlbGF5X3J1bl9od19xdWV1ZSgpIGlmIHRoZSBmb2xsb3dp
bmcgY29uZGl0aW9uDQppcyBtZXQ6ICEoIW5lZWRzX3Jlc3RhcnQgfHwgKG5vX3RhZyAmJiBsaXN0
X2VtcHR5X2NhcmVmdWwoJmhjdHgtPmRpc3BhdGNoX3dhaXQuZW50cnkpKSkNCiYmIChuZWVkc19y
ZXN0YXJ0ICYmIChyZXQgPT0gQkxLX1NUU19SRVNPVVJDRSkpLiBUaGF0IGJvb2xlYW4gZXhwcmVz
c2lvbiBjYW4gYmUNCnNpbXBsaWZpZWQgaW50byB0aGUgZm9sbG93aW5nOiBuZWVkc19yZXN0YXJ0
ICYmIHJldCA9PSBCTEtfU1RTX1JFU09VUkNFICYmIA0KIShub190YWcgJiYgbGlzdF9lbXB0eV9j
YXJlZnVsKCZoY3R4LT5kaXNwYXRjaF93YWl0LmVudHJ5KSkuIEZyb20gYmxrLW1xLXNjaGVkLmg6
DQoNCnN0YXRpYyBpbmxpbmUgYm9vbCBibGtfbXFfc2NoZWRfbmVlZHNfcmVzdGFydChzdHJ1Y3Qg
YmxrX21xX2h3X2N0eCAqaGN0eCkNCnsNCglyZXR1cm4gdGVzdF9iaXQoQkxLX01RX1NfU0NIRURf
UkVTVEFSVCwgJmhjdHgtPnN0YXRlKTsNCn0NCg0KSW4gb3RoZXIgd29yZHMsIHRoZSBhYm92ZSBj
b2RlIHdpbGwgbm90IGNhbGwgYmxrX21xX2RlbGF5X3J1bl9od19xdWV1ZSgpIGlmDQpCTEtfTVFf
U19TQ0hFRF9SRVNUQVJUIGlzIGNsZWFyZWQgYWZ0ZXIgaXQgZ290IHNldCBhbmQgYmVmb3JlIHRo
ZSBhYm92ZSBjb2RlIGlzDQpyZWFjaGVkLiBUaGUgQkxLX01RX1NfU0NIRURfUkVTVEFSVCBiaXQg
Z2V0cyBjbGVhcmVkIGlmIGEgcmVxdWVzdCBjb21wbGV0ZXMNCmNvbmN1cnJlbnRseSB3aXRoIHRo
ZSBhYm92ZSBjb2RlLiBGcm9tIGJsay1tcS1zY2hlZC5jOg0KDQpzdGF0aWMgYm9vbCBibGtfbXFf
c2NoZWRfcmVzdGFydF9oY3R4KHN0cnVjdCBibGtfbXFfaHdfY3R4ICpoY3R4KQ0Kew0KCWlmICgh
dGVzdF9iaXQoQkxLX01RX1NfU0NIRURfUkVTVEFSVCwgJmhjdHgtPnN0YXRlKSkNCgkJcmV0dXJu
IGZhbHNlOw0KDQoJaWYgKGhjdHgtPmZsYWdzICYgQkxLX01RX0ZfVEFHX1NIQVJFRCkgew0KCQlz
dHJ1Y3QgcmVxdWVzdF9xdWV1ZSAqcSA9IGhjdHgtPnF1ZXVlOw0KDQoJCWlmICh0ZXN0X2FuZF9j
bGVhcl9iaXQoQkxLX01RX1NfU0NIRURfUkVTVEFSVCwgJmhjdHgtPnN0YXRlKSkNCgkJCWF0b21p
Y19kZWMoJnEtPnNoYXJlZF9oY3R4X3Jlc3RhcnQpOw0KCX0gZWxzZQ0KCQljbGVhcl9iaXQoQkxL
X01RX1NfU0NIRURfUkVTVEFSVCwgJmhjdHgtPnN0YXRlKTsNCg0KCXJldHVybiBibGtfbXFfcnVu
X2h3X3F1ZXVlKGhjdHgsIHRydWUpOw0KfQ0KDQpUaGUgYWJvdmUgYmxrX21xX3J1bl9od19xdWV1
ZSgpIGNhbGwgbWF5IGZpbmlzaCBlaXRoZXIgYmVmb3JlIG9yIGFmdGVyDQpibGtfbXFfZGlzcGF0
Y2hfcnFfbGlzdCgpIGV4YW1pbmVzIHRoZSBCTEtfTVFfU19TQ0hFRF9SRVNUQVJUIGZsYWcuDQoN
ClRoYXQncyB3aHkgSSB3cm90ZSBpbiBwcmV2aW91cyBlLW1haWxzIHRoYXQgdGhpcyBwYXRjaCBh
bmQgYWxzbyB0aGUgcHJldmlvdXMNCnZlcnNpb25zIG9mIHRoaXMgcGF0Y2ggY2hhbmdlIGEgc3lz
dGVtYXRpYyBjYWxsIG9mIGJsa19tcV9kZWxheV9ydW5faHdfcXVldWUoKQ0KaW50byBhIGNhbGwg
dGhhdCBtYXkgb3IgbWF5IG5vdCBoYXBwZW4sIGRlcGVuZGluZyBvbiB3aGV0aGVyIG9yIG5vdCBh
IHJlcXVlc3QNCmNvbXBsZXRlcyBjb25jdXJyZW50bHkgd2l0aCByZXF1ZXN0IHF1ZXVlaW5nLiBT
b3JyeSBidXQgSSB0aGluayB0aGF0IG1lYW5zDQp0aGF0IHRoZSBhYm92ZSBjaGFuZ2UgY29tYmlu
ZWQgd2l0aCBjaGFuZ2luZyBCTEtfU1RTX1JFU09VUkNFIGludG8NCkJMS19TVFNfREVWX1JFU09V
UkNFIGlzIHdyb25nIGFuZCB3aHkgSSBleHBlY3QgdGhhdCB0aGlzIHdpbGwgcmVzdWx0IGluDQpz
cG9yYWRpYyBxdWV1ZSBzdGFsbHMuIEFzIHlvdSBrbm93IHNwb3JhZGljIHF1ZXVlIHN0YWxscyBh
cmUgYW5ub3lpbmcgYW5kIGhhcmQNCnRvIGRlYnVnLg0KDQpQbGVudHkgb2YgZS1tYWlscyBoYXZl
IGFscmVhZHkgYmVlbiBleGNoYW5nZWQgYWJvdXQgdmVyc2lvbnMgb25lIHRvIGZvdXIgb2YgdGhp
cw0KcGF0Y2ggYnV0IHNvIGZhciBub2JvZHkgaGFzIGV2ZW4gdHJpZWQgdG8gY29udHJhZGljdCB0
aGUgYWJvdmUuDQoNCkJhcnQu
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH V4] blk-mq: introduce BLK_STS_DEV_RESOURCE
2018-01-29 21:22 ` Bart Van Assche
@ 2018-01-29 21:44 ` Mike Snitzer
2018-01-29 21:51 ` Bart Van Assche
2018-01-30 3:16 ` Ming Lei
1 sibling, 1 reply; 11+ messages in thread
From: Mike Snitzer @ 2018-01-29 21:44 UTC (permalink / raw)
To: Bart Van Assche
Cc: axboe@kernel.dk, linux-block@vger.kernel.org, dm-devel@redhat.com,
linux-nvme@lists.infradead.org, linux-scsi@vger.kernel.org
On Mon, Jan 29 2018 at 4:22pm -0500,
Bart Van Assche <Bart.VanAssche@wdc.com> wrote:
> On Mon, 2018-01-29 at 15:33 -0500, Mike Snitzer wrote:
> > + * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
> > + * bit is set, run queue after 10ms to avoid IO stalls
> > + * that could otherwise occur if the queue is idle.
> > */
> > - if (!blk_mq_sched_needs_restart(hctx) ||
> > + needs_restart = blk_mq_sched_needs_restart(hctx);
> > + if (!needs_restart ||
> > (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
> > blk_mq_run_hw_queue(hctx, true);
> > + else if (needs_restart && (ret == BLK_STS_RESOURCE))
> > + blk_mq_delay_run_hw_queue(hctx, BLK_MQ_QUEUE_DELAY);
> > }
>
> The above code only calls blk_mq_delay_run_hw_queue() if the following condition
> is met: !(!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
> && (needs_restart && (ret == BLK_STS_RESOURCE)). That boolean expression can be
> simplified into the following: needs_restart && ret == BLK_STS_RESOURCE &&
> !(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)). From blk-mq-sched.h:
>
> static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
> {
> return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
> }
>
> In other words, the above code will not call blk_mq_delay_run_hw_queue() if
> BLK_MQ_S_SCHED_RESTART is cleared after it got set and before the above code is
> reached. The BLK_MQ_S_SCHED_RESTART bit gets cleared if a request completes
> concurrently with the above code. From blk-mq-sched.c:
>
> static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
> {
> if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
> return false;
>
> if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
> struct request_queue *q = hctx->queue;
>
> if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
> atomic_dec(&q->shared_hctx_restart);
> } else
> clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
>
> return blk_mq_run_hw_queue(hctx, true);
> }
>
> The above blk_mq_run_hw_queue() call may finish either before or after
> blk_mq_dispatch_rq_list() examines the BLK_MQ_S_SCHED_RESTART flag.
But regardless of which wins the race, the queue will have been run.
Which is all we care about right?
> That's why I wrote in previous e-mails that this patch and also the previous
> versions of this patch change a systematic call of blk_mq_delay_run_hw_queue()
> into a call that may or may not happen, depending on whether or not a request
> completes concurrently with request queueing. Sorry but I think that means
> that the above change combined with changing BLK_STS_RESOURCE into
> BLK_STS_DEV_RESOURCE is wrong and why I expect that this will result in
> sporadic queue stalls. As you know sporadic queue stalls are annoying and hard
> to debug.
>
> Plenty of e-mails have already been exchanged about versions one to four of this
> patch but so far nobody has even tried to contradict the above.
Sure, but we aren't mind-readers. We're all very open to the idea that
you have noticed something we haven't. But until your very helpful
mail I'm replying to, you hadn't been specific about the race that
concerned you. Thanks for sharing.
I'll defer to Jens and/or Ming on whether the race you've raised is of
concern. As I said above, I'm inclined to think it doesn't matter who
wins the race given the queue will be run again. And when it does it
may get STS_RESOURCE again, and in that case it may then resort to
blk_mq_delay_run_hw_queue() in blk_mq_dispatch_rq_list().
Mike
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH V4] blk-mq: introduce BLK_STS_DEV_RESOURCE
2018-01-29 21:44 ` Mike Snitzer
@ 2018-01-29 21:51 ` Bart Van Assche
2018-01-29 22:14 ` Mike Snitzer
` (2 more replies)
0 siblings, 3 replies; 11+ messages in thread
From: Bart Van Assche @ 2018-01-29 21:51 UTC (permalink / raw)
To: snitzer@redhat.com
Cc: dm-devel@redhat.com, linux-scsi@vger.kernel.org,
linux-block@vger.kernel.org, linux-nvme@lists.infradead.org,
axboe@kernel.dk
T24gTW9uLCAyMDE4LTAxLTI5IGF0IDE2OjQ0IC0wNTAwLCBNaWtlIFNuaXR6ZXIgd3JvdGU6DQo+
IEJ1dCByZWdhcmRsZXNzIG9mIHdoaWNoIHdpbnMgdGhlIHJhY2UsIHRoZSBxdWV1ZSB3aWxsIGhh
dmUgYmVlbiBydW4uDQo+IFdoaWNoIGlzIGFsbCB3ZSBjYXJlIGFib3V0IHJpZ2h0Pw0KDQpSdW5u
aW5nIHRoZSBxdWV1ZSBpcyBub3Qgc3VmZmljaWVudC4gV2l0aCB0aGlzIHBhdGNoIGFwcGxpZWQg
aXQgY2FuIGhhcHBlbg0KdGhhdCB0aGUgYmxvY2sgZHJpdmVyIHJldHVybnMgQkxLX1NUU19ERVZf
UkVTT1VSQ0UsIHRoYXQgdGhlIHR3byBvciBtb3JlDQpjb25jdXJyZW50IHF1ZXVlIHJ1bnMgZmlu
aXNoIGJlZm9yZSBzdWZmaWNpZW50IGRldmljZSByZXNvdXJjZXMgYXJlIGF2YWlsYWJsZQ0KdG8g
ZXhlY3V0ZSB0aGUgcmVxdWVzdCBhbmQgdGhhdCBibGtfbXFfZGVsYXlfcnVuX2h3X3F1ZXVlKCkg
ZG9lcyBub3QgZ2V0DQpjYWxsZWQgYXQgYWxsLiBJZiBubyBvdGhlciBhY3Rpdml0eSB0cmlnZ2Vy
cyBhIHF1ZXVlIHJ1biwgZS5nLiByZXF1ZXN0DQpjb21wbGV0aW9uLCB0aGlzIHdpbGwgcmVzdWx0
IGluIGEgcXVldWUgc3RhbGwuDQoNCkJhcnQu
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH V4] blk-mq: introduce BLK_STS_DEV_RESOURCE
2018-01-29 21:51 ` Bart Van Assche
@ 2018-01-29 22:14 ` Mike Snitzer
2018-01-29 22:22 ` Bart Van Assche
2018-01-30 5:55 ` [dm-devel] " Ming Lei
2018-01-30 3:20 ` Ming Lei
2018-02-02 0:26 ` [dm-devel] " John Stoffel
2 siblings, 2 replies; 11+ messages in thread
From: Mike Snitzer @ 2018-01-29 22:14 UTC (permalink / raw)
To: Bart Van Assche
Cc: dm-devel@redhat.com, linux-scsi@vger.kernel.org,
linux-block@vger.kernel.org, linux-nvme@lists.infradead.org,
axboe@kernel.dk
On Mon, Jan 29 2018 at 4:51pm -0500,
Bart Van Assche <Bart.VanAssche@wdc.com> wrote:
> On Mon, 2018-01-29 at 16:44 -0500, Mike Snitzer wrote:
> > But regardless of which wins the race, the queue will have been run.
> > Which is all we care about right?
>
> Running the queue is not sufficient. With this patch applied it can happen
> that the block driver returns BLK_STS_DEV_RESOURCE, that the two or more
> concurrent queue runs finish before sufficient device resources are available
> to execute the request and that blk_mq_delay_run_hw_queue() does not get
> called at all. If no other activity triggers a queue run, e.g. request
> completion, this will result in a queue stall.
If BLK_STS_DEV_RESOURCE is returned then the driver doesn't need to rely
on a future queue run. IIUC, that is the entire premise of
BLK_STS_DEV_RESOURCE. If the driver had doubt about whether the
resource were going to be available in the future it should return
BLK_STS_RESOURCE.
That may seem like putting a lot on a driver developer (to decide
between the 2) but I'll again defer to Jens here. This was the approach
he was advocating be pursued.
In practice it is working. Would welcome you testing this V4.
Thanks,
Mike
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH V4] blk-mq: introduce BLK_STS_DEV_RESOURCE
2018-01-29 22:14 ` Mike Snitzer
@ 2018-01-29 22:22 ` Bart Van Assche
2018-01-30 5:55 ` [dm-devel] " Ming Lei
1 sibling, 0 replies; 11+ messages in thread
From: Bart Van Assche @ 2018-01-29 22:22 UTC (permalink / raw)
To: snitzer@redhat.com
Cc: dm-devel@redhat.com, linux-scsi@vger.kernel.org,
linux-block@vger.kernel.org, linux-nvme@lists.infradead.org,
axboe@kernel.dk
T24gTW9uLCAyMDE4LTAxLTI5IGF0IDE3OjE0IC0wNTAwLCBNaWtlIFNuaXR6ZXIgd3JvdGU6DQo+
IE9uIE1vbiwgSmFuIDI5IDIwMTggYXQgIDQ6NTFwbSAtMDUwMCwNCj4gQmFydCBWYW4gQXNzY2hl
IDxCYXJ0LlZhbkFzc2NoZUB3ZGMuY29tPiB3cm90ZToNCj4gDQo+ID4gT24gTW9uLCAyMDE4LTAx
LTI5IGF0IDE2OjQ0IC0wNTAwLCBNaWtlIFNuaXR6ZXIgd3JvdGU6DQo+ID4gPiBCdXQgcmVnYXJk
bGVzcyBvZiB3aGljaCB3aW5zIHRoZSByYWNlLCB0aGUgcXVldWUgd2lsbCBoYXZlIGJlZW4gcnVu
Lg0KPiA+ID4gV2hpY2ggaXMgYWxsIHdlIGNhcmUgYWJvdXQgcmlnaHQ/DQo+ID4gDQo+ID4gUnVu
bmluZyB0aGUgcXVldWUgaXMgbm90IHN1ZmZpY2llbnQuIFdpdGggdGhpcyBwYXRjaCBhcHBsaWVk
IGl0IGNhbiBoYXBwZW4NCj4gPiB0aGF0IHRoZSBibG9jayBkcml2ZXIgcmV0dXJucyBCTEtfU1RT
X0RFVl9SRVNPVVJDRSwgdGhhdCB0aGUgdHdvIG9yIG1vcmUNCj4gPiBjb25jdXJyZW50IHF1ZXVl
IHJ1bnMgZmluaXNoIGJlZm9yZSBzdWZmaWNpZW50IGRldmljZSByZXNvdXJjZXMgYXJlIGF2YWls
YWJsZQ0KPiA+IHRvIGV4ZWN1dGUgdGhlIHJlcXVlc3QgYW5kIHRoYXQgYmxrX21xX2RlbGF5X3J1
bl9od19xdWV1ZSgpIGRvZXMgbm90IGdldA0KPiA+IGNhbGxlZCBhdCBhbGwuIElmIG5vIG90aGVy
IGFjdGl2aXR5IHRyaWdnZXJzIGEgcXVldWUgcnVuLCBlLmcuIHJlcXVlc3QNCj4gPiBjb21wbGV0
aW9uLCB0aGlzIHdpbGwgcmVzdWx0IGluIGEgcXVldWUgc3RhbGwuDQo+IA0KPiBJZiBCTEtfU1RT
X0RFVl9SRVNPVVJDRSBpcyByZXR1cm5lZCB0aGVuIHRoZSBkcml2ZXIgZG9lc24ndCBuZWVkIHRv
IHJlbHkNCj4gb24gYSBmdXR1cmUgcXVldWUgcnVuLiAgSUlVQywgdGhhdCBpcyB0aGUgZW50aXJl
IHByZW1pc2Ugb2YNCj4gQkxLX1NUU19ERVZfUkVTT1VSQ0UuICBJZiB0aGUgZHJpdmVyIGhhZCBk
b3VidCBhYm91dCB3aGV0aGVyIHRoZQ0KPiByZXNvdXJjZSB3ZXJlIGdvaW5nIHRvIGJlIGF2YWls
YWJsZSBpbiB0aGUgZnV0dXJlIGl0IHNob3VsZCByZXR1cm4NCj4gQkxLX1NUU19SRVNPVVJDRS4N
Cj4gDQo+IFRoYXQgbWF5IHNlZW0gbGlrZSBwdXR0aW5nIGEgbG90IG9uIGEgZHJpdmVyIGRldmVs
b3BlciAodG8gZGVjaWRlDQo+IGJldHdlZW4gdGhlIDIpIGJ1dCBJJ2xsIGFnYWluIGRlZmVyIHRv
IEplbnMgaGVyZS4gIFRoaXMgd2FzIHRoZSBhcHByb2FjaA0KPiBoZSB3YXMgYWR2b2NhdGluZyBi
ZSBwdXJzdWVkLg0KDQpIZWxsbyBNaWtlLA0KDQpUaGVyZSB3YXMgYSB0eXBvIGluIG15IHJlcGx5
OiBJIHNob3VsZCBoYXZlIHJlZmVycmVkIHRvIEJMS19TVFNfUkVTT1VSQ0UNCmluc3RlYWQgb2Yg
QkxLX1NUU19ERVZfUkVTT1VSQ0UuIFRoZSBjb252ZW50aW9uIGZvciBCTEtfU1RTX0RFVl9SRVNP
VVJDRSBpcw0KbmFtZWx5IHRoYXQgaWYgLnF1ZXVlX3JxKCkgcmV0dXJucyB0aGF0IHN0YXR1cyBj
b2RlIHRoYXQgdGhlIGJsb2NrIGRyaXZlcg0KZ3VhcmFudGVlcyB0aGF0IHRoZSBxdWV1ZSB3aWxs
IGJlIHJlcnVuLg0KDQpCYXJ0Lg==
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH V4] blk-mq: introduce BLK_STS_DEV_RESOURCE
2018-01-29 21:22 ` Bart Van Assche
2018-01-29 21:44 ` Mike Snitzer
@ 2018-01-30 3:16 ` Ming Lei
1 sibling, 0 replies; 11+ messages in thread
From: Ming Lei @ 2018-01-30 3:16 UTC (permalink / raw)
To: Bart Van Assche
Cc: snitzer@redhat.com, axboe@kernel.dk, dm-devel@redhat.com,
linux-scsi@vger.kernel.org, linux-block@vger.kernel.org,
linux-nvme@lists.infradead.org
On Tue, Jan 30, 2018 at 5:22 AM, Bart Van Assche <Bart.VanAssche@wdc.com> wrote:
> On Mon, 2018-01-29 at 15:33 -0500, Mike Snitzer wrote:
>> + * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
>> + * bit is set, run queue after 10ms to avoid IO stalls
>> + * that could otherwise occur if the queue is idle.
>> */
>> - if (!blk_mq_sched_needs_restart(hctx) ||
>> + needs_restart = blk_mq_sched_needs_restart(hctx);
>> + if (!needs_restart ||
>> (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
>> blk_mq_run_hw_queue(hctx, true);
>> + else if (needs_restart && (ret == BLK_STS_RESOURCE))
>> + blk_mq_delay_run_hw_queue(hctx, BLK_MQ_QUEUE_DELAY);
>> }
>
> The above code only calls blk_mq_delay_run_hw_queue() if the following condition
> is met: !(!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
> && (needs_restart && (ret == BLK_STS_RESOURCE)). That boolean expression can be
> simplified into the following: needs_restart && ret == BLK_STS_RESOURCE &&
> !(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)). From blk-mq-sched.h:
No, the expression of (needs_restart && (ret == BLK_STS_RESOURCE)) clearly
expresses what we want to do.
When RESTART is working, and meantime BLK_STS_RESOURCE is returned
from driver, we need to avoid IO hang by blk_mq_delay_run_hw_queue().
>
> static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
> {
> return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
> }
>
> In other words, the above code will not call blk_mq_delay_run_hw_queue() if
> BLK_MQ_S_SCHED_RESTART is cleared after it got set and before the above code is
> reached. The BLK_MQ_S_SCHED_RESTART bit gets cleared if a request completes
> concurrently with the above code. From blk-mq-sched.c:
That won't be an issue, given any time the SCHED_RESTART is cleared, the queue
will be run again, so we needn't to worry about that.
>
> static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
> {
> if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
> return false;
>
> if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
> struct request_queue *q = hctx->queue;
>
> if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
> atomic_dec(&q->shared_hctx_restart);
> } else
> clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
>
> return blk_mq_run_hw_queue(hctx, true);
> }
>
> The above blk_mq_run_hw_queue() call may finish either before or after
If the above blk_mq_run_hw_queue() finishes before blk_mq_dispatch_rq_list()
examines the flag, blk_mq_dispatch_rq_list() will see the flag cleared, and run
queue again by the following code branch:
if (!blk_mq_sched_needs_restart(hctx) ||
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true);
If blk_mq_run_hw_queue() finishes after blk_mq_dispatch_rq_list() examines
the flag, this blk_mq_run_hw_queue() will see the new added request, and
still everything is fine. Even blk_mq_delay_run_hw_queue() can be started
too.
> blk_mq_dispatch_rq_list() examines the BLK_MQ_S_SCHED_RESTART flag.
>
> That's why I wrote in previous e-mails that this patch and also the previous
> versions of this patch change a systematic call of blk_mq_delay_run_hw_queue()
> into a call that may or may not happen, depending on whether or not a request
> completes concurrently with request queueing. Sorry but I think that means
> that the above change combined with changing BLK_STS_RESOURCE into
> BLK_STS_DEV_RESOURCE is wrong and why I expect that this will result in
> sporadic queue stalls. As you know sporadic queue stalls are annoying and hard
> to debug.
Now there is debugfs, it isn't difficult to deal with that if
reporter'd like to cowork.
>
> Plenty of e-mails have already been exchanged about versions one to four of this
> patch but so far nobody has even tried to contradict the above.
No, I don't see the issue, let's revisit the main change again:
+ needs_restart = blk_mq_sched_needs_restart(hctx);
+ if (!needs_restart ||
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true);
+ else if (needs_restart && (ret == BLK_STS_RESOURCE))
+ blk_mq_delay_run_hw_queue(hctx, 10);
If SCHED_RESTART isn't set, queue is run immediately, otherwise when
BLK_STS_RESOURCE is returned, we avoid IO hang by
blk_mq_delay_run_hw_queue(hctx, XXX).
And we don't cover BLK_STS_DEV_RESOURCE above because it is documented
clearly that BLK_STS_DEV_RESOURCE is only returned by driver iff queue will be
rerun in future if resource is available.
Is there any hole in above code?
Thanks,
Ming Lei
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH V4] blk-mq: introduce BLK_STS_DEV_RESOURCE
2018-01-29 21:51 ` Bart Van Assche
2018-01-29 22:14 ` Mike Snitzer
@ 2018-01-30 3:20 ` Ming Lei
2018-02-02 0:26 ` [dm-devel] " John Stoffel
2 siblings, 0 replies; 11+ messages in thread
From: Ming Lei @ 2018-01-30 3:20 UTC (permalink / raw)
To: Bart Van Assche
Cc: snitzer@redhat.com, dm-devel@redhat.com,
linux-scsi@vger.kernel.org, linux-block@vger.kernel.org,
linux-nvme@lists.infradead.org, axboe@kernel.dk
On Tue, Jan 30, 2018 at 5:51 AM, Bart Van Assche <Bart.VanAssche@wdc.com> wrote:
> On Mon, 2018-01-29 at 16:44 -0500, Mike Snitzer wrote:
>> But regardless of which wins the race, the queue will have been run.
>> Which is all we care about right?
>
> Running the queue is not sufficient. With this patch applied it can happen
> that the block driver returns BLK_STS_DEV_RESOURCE, that the two or more
> concurrent queue runs finish before sufficient device resources are available
> to execute the request and that blk_mq_delay_run_hw_queue() does not get
> called at all. If no other activity triggers a queue run, e.g. request
> completion, this will result in a queue stall.
Please see document of BLK_STS_DEV_RESOURCE:
+ * BLK_STS_DEV_RESOURCE is returned from driver to block layer if device
+ * related resource is unavailable, but driver can guarantee that queue
+ * will be rerun in future once the resource is available (whereby
+ * dispatching requests).
I have explained the SCSI's BLK_STS_DEV_RESOURCE conversion in
another thread, let's know if you have further concern:
https://marc.info/?l=linux-block&m=151727454815018&w=2
--
Ming Lei
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [dm-devel] [PATCH V4] blk-mq: introduce BLK_STS_DEV_RESOURCE
2018-01-29 22:14 ` Mike Snitzer
2018-01-29 22:22 ` Bart Van Assche
@ 2018-01-30 5:55 ` Ming Lei
1 sibling, 0 replies; 11+ messages in thread
From: Ming Lei @ 2018-01-30 5:55 UTC (permalink / raw)
To: Mike Snitzer
Cc: Bart Van Assche, linux-block@vger.kernel.org, axboe@kernel.dk,
dm-devel@redhat.com, linux-nvme@lists.infradead.org,
linux-scsi@vger.kernel.org
On Tue, Jan 30, 2018 at 6:14 AM, Mike Snitzer <snitzer@redhat.com> wrote:
> On Mon, Jan 29 2018 at 4:51pm -0500,
> Bart Van Assche <Bart.VanAssche@wdc.com> wrote:
>
>> On Mon, 2018-01-29 at 16:44 -0500, Mike Snitzer wrote:
>> > But regardless of which wins the race, the queue will have been run.
>> > Which is all we care about right?
>>
>> Running the queue is not sufficient. With this patch applied it can happen
>> that the block driver returns BLK_STS_DEV_RESOURCE, that the two or more
>> concurrent queue runs finish before sufficient device resources are available
>> to execute the request and that blk_mq_delay_run_hw_queue() does not get
>> called at all. If no other activity triggers a queue run, e.g. request
>> completion, this will result in a queue stall.
>
> If BLK_STS_DEV_RESOURCE is returned then the driver doesn't need to rely
> on a future queue run. IIUC, that is the entire premise of
> BLK_STS_DEV_RESOURCE. If the driver had doubt about whether the
> resource were going to be available in the future it should return
> BLK_STS_RESOURCE.
>
> That may seem like putting a lot on a driver developer (to decide
> between the 2) but I'll again defer to Jens here. This was the approach
> he was advocating be pursued.
Thinking about it further, maybe you can add the following description in V5,
which should make it much easier for driver developers to follow:
When any resource allocation fails, if the driver can make sure that there is
at least one in-flight IO, it is safe to return BLK_STS_DEV_RESOURCE to blk-mq;
that is exactly what scsi_queue_rq() is doing.
The theory is as follows:
1) driver returns BLK_STS_DEV_RESOURCE if driver figures out
there is any in-flight IO, in case of any resource allocation failure
2) If all these in-flight IOs complete before examining SCHED_RESTART in
blk_mq_dispatch_rq_list(), SCHED_RESTART must be cleared, so queue
is run immediately in this case by blk_mq_dispatch_rq_list();
3) if there is any in-flight IO after/when examining SCHED_RESTART in
blk_mq_dispatch_rq_list():
- if SCHED_RESTART isn't set, queue is run immediately as handled in 2)
- otherwise, this request will be dispatched after any in-flight IO is
completed via
blk_mq_sched_restart() since this request is added to hctx->dispatch already
And there are two invariants when the driver returns BLK_STS_DEV_RESOURCE
only if there are in-flight IOs:
1) SCHED_RESTART must be zero if there are no in-flight IOs
2) there has to be at least one IO in flight if SCHED_RESTART is read as 1
Thanks,
Ming
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [dm-devel] [PATCH V4] blk-mq: introduce BLK_STS_DEV_RESOURCE
2018-01-29 21:51 ` Bart Van Assche
2018-01-29 22:14 ` Mike Snitzer
2018-01-30 3:20 ` Ming Lei
@ 2018-02-02 0:26 ` John Stoffel
2018-02-02 0:53 ` Bart Van Assche
2 siblings, 1 reply; 11+ messages in thread
From: John Stoffel @ 2018-02-02 0:26 UTC (permalink / raw)
To: Bart Van Assche
Cc: snitzer@redhat.com, linux-block@vger.kernel.org, axboe@kernel.dk,
dm-devel@redhat.com, linux-nvme@lists.infradead.org,
linux-scsi@vger.kernel.org
>>>>> "Bart" == Bart Van Assche <Bart.VanAssche@wdc.com> writes:
Bart> On Mon, 2018-01-29 at 16:44 -0500, Mike Snitzer wrote:
>> But regardless of which wins the race, the queue will have been run.
>> Which is all we care about right?
Bart> Running the queue is not sufficient. With this patch applied it can happen
Bart> that the block driver returns BLK_STS_DEV_RESOURCE, that the two or more
Bart> concurrent queue runs finish before sufficient device resources are available
Bart> to execute the request and that blk_mq_delay_run_hw_queue() does not get
Bart> called at all. If no other activity triggers a queue run, e.g. request
Bart> completion, this will result in a queue stall.
Doesn't this argue that you really want some sort of completions to be
run in this case instead? Instead of busy looping or waiting for a
set amount of time, just fire off a callback to run once you have the
resources available, no?
I can't provide any code examples, I don't know the code base nearly
well enough.
John
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [dm-devel] [PATCH V4] blk-mq: introduce BLK_STS_DEV_RESOURCE
2018-02-02 0:26 ` [dm-devel] " John Stoffel
@ 2018-02-02 0:53 ` Bart Van Assche
0 siblings, 0 replies; 11+ messages in thread
From: Bart Van Assche @ 2018-02-02 0:53 UTC (permalink / raw)
To: john@stoffel.org
Cc: dm-devel@redhat.com, linux-scsi@vger.kernel.org,
linux-block@vger.kernel.org, linux-nvme@lists.infradead.org,
snitzer@redhat.com, axboe@kernel.dk
On Thu, 2018-02-01 at 19:26 -0500, John Stoffel wrote:
> Doesn't this argue that you really want some sort of completions to be
> run in this case instead?  Instead of busy looping or waiting for a
> set amount of time, just fire off a callback to run once you have the
> resources available, no?

Hello John,

Rerunning the queue when the resource we ran out of is available again would
be ideal. However, I'm not sure it is possible to implement this. If a driver
performs memory allocation while executing a request and kmalloc() returns
-ENOMEM then I don't know which mechanism should be used to trigger a queue
rerun when sufficient memory is available again.

Another example is the SCSI subsystem. If the .queuecommand() implementation
of a SCSI LLD returns SCSI_MLQUEUE_*_BUSY because a certain HBA condition
occurred and the HBA does not trigger an interrupt when that condition is
cleared, how to figure out whether or not a request can be executed other than
by retrying, which means rerunning the queue?

Thanks,

Bart.
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2018-02-02 0:53 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-01-29 20:33 [PATCH V4] blk-mq: introduce BLK_STS_DEV_RESOURCE Mike Snitzer
2018-01-29 21:22 ` Bart Van Assche
2018-01-29 21:44 ` Mike Snitzer
2018-01-29 21:51 ` Bart Van Assche
2018-01-29 22:14 ` Mike Snitzer
2018-01-29 22:22 ` Bart Van Assche
2018-01-30 5:55 ` [dm-devel] " Ming Lei
2018-01-30 3:20 ` Ming Lei
2018-02-02 0:26 ` [dm-devel] " John Stoffel
2018-02-02 0:53 ` Bart Van Assche
2018-01-30 3:16 ` Ming Lei
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox