Linux block layer
 help / color / mirror / Atom feed
* [PATCH 2/8] nowait aio: Return if cannot get hold of i_rwsem
From: Goldwyn Rodrigues @ 2017-03-15 21:51 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: jack, hch, linux-block, linux-btrfs, linux-ext4, linux-xfs, sagi,
	avi, axboe, linux-api, willy, Goldwyn Rodrigues
In-Reply-To: <20170315215107.5628-1-rgoldwyn@suse.de>

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

A failure to lock i_rwsem would mean there is I/O being performed
by another thread. So, let's bail.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 mm/filemap.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 1694623..e08f3b9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2982,7 +2982,12 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct inode *inode = file->f_mapping->host;
 	ssize_t ret;
 
-	inode_lock(inode);
+	if (!inode_trylock(inode)) {
+		/* Don't sleep on inode rwsem */
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			return -EAGAIN;
+		inode_lock(inode);
+	}
 	ret = generic_write_checks(iocb, from);
 	if (ret > 0)
 		ret = __generic_file_write_iter(iocb, from);
-- 
2.10.2

^ permalink raw reply related

* [PATCH 1/8] nowait aio: Introduce IOCB_RW_FLAG_NOWAIT
From: Goldwyn Rodrigues @ 2017-03-15 21:51 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: jack, hch, linux-block, linux-btrfs, linux-ext4, linux-xfs, sagi,
	avi, axboe, linux-api, willy, Goldwyn Rodrigues
In-Reply-To: <20170315215107.5628-1-rgoldwyn@suse.de>

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

This flag informs kernel to bail out if an AIO request will block
for reasons such as file allocations, or a writeback triggered,
or would block while allocating requests while performing
direct I/O.

Unfortunately, aio_flags is not checked for validity. If we
add the flags to aio_flags, it would break existing applications
which have it set to anything besides zero or IOCB_FLAG_RESFD.
So, we are using aio_reserved1 and renaming it to aio_rw_flags.

IOCB_RW_FLAG_NOWAIT is translated to IOCB_NOWAIT for
iocb->ki_flags.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/aio.c                     | 10 +++++++++-
 include/linux/fs.h           |  1 +
 include/uapi/linux/aio_abi.h |  9 ++++++++-
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index f52d925..41409ac 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1541,11 +1541,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	ssize_t ret;
 
 	/* enforce forwards compatibility on users */
-	if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
+	if (unlikely(iocb->aio_reserved2)) {
 		pr_debug("EINVAL: reserve field set\n");
 		return -EINVAL;
 	}
 
+	if (unlikely(iocb->aio_rw_flags & ~IOCB_RW_FLAG_NOWAIT)) {
+		pr_debug("EINVAL: aio_rw_flags set with incompatible flags\n");
+		return -EINVAL;
+	}
+
 	/* prevent overflows */
 	if (unlikely(
 	    (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
@@ -1586,6 +1591,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		req->common.ki_flags |= IOCB_EVENTFD;
 	}
 
+	if (iocb->aio_rw_flags & IOCB_RW_FLAG_NOWAIT)
+		req->common.ki_flags |= IOCB_NOWAIT;
+
 	ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
 	if (unlikely(ret)) {
 		pr_debug("EFAULT: aio_key\n");
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7251f7b..e8d9346 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -270,6 +270,7 @@ struct writeback_control;
 #define IOCB_DSYNC		(1 << 4)
 #define IOCB_SYNC		(1 << 5)
 #define IOCB_WRITE		(1 << 6)
+#define IOCB_NOWAIT		(1 << 7)
 
 struct kiocb {
 	struct file		*ki_filp;
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index bb2554f..6d98cbe 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -54,6 +54,13 @@ enum {
  */
 #define IOCB_FLAG_RESFD		(1 << 0)
 
+/*
+ * Flags for aio_rw_flags member of "struct iocb".
+ * IOCB_RW_FLAG_NOWAIT - Set if the user wants the iocb to fail if it
+ *			would block for operations such as disk allocation.
+ */
+#define IOCB_RW_FLAG_NOWAIT	(1 << 1)
+
 /* read() from /dev/aio returns these structures. */
 struct io_event {
 	__u64		data;		/* the data field from the iocb */
@@ -79,7 +86,7 @@ struct io_event {
 struct iocb {
 	/* these are internal to the kernel/libc. */
 	__u64	aio_data;	/* data to be returned in event's data */
-	__u32	PADDED(aio_key, aio_reserved1);
+	__u32	PADDED(aio_key, aio_rw_flags);
 				/* the kernel sets aio_key to the req # */
 
 	/* common fields */
-- 
2.10.2

^ permalink raw reply related

* [PATCH 0/8 v3] No wait AIO
From: Goldwyn Rodrigues @ 2017-03-15 21:50 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: jack, hch, linux-block, linux-btrfs, linux-ext4, linux-xfs, sagi,
	avi, axboe, linux-api, willy

Formerly known as non-blocking AIO.

This series adds nonblocking feature to asynchronous I/O writes.
io_submit() can be delayed because of a number of reason:
 - Block allocation for files
 - Data writebacks for direct I/O
 - Sleeping because of waiting to acquire i_rwsem
 - Congested block device

The goal of the patch series is to return -EAGAIN/-EWOULDBLOCK if
any of these conditions are met. This way userspace can push most
of the write()s to the kernel to the best of its ability to complete
and if it returns -EAGAIN, can defer it to another thread.

In order to enable this, IOCB_RW_FLAG_NOWAIT is introduced in
uapi/linux/aio_abi.h. If set for aio_rw_flags, it translates to
IOCB_NOWAIT for struct iocb, BIO_NOWAIT for bio and IOMAP_NOWAIT for
iomap. aio_rw_flags is a new flag replacing aio_reserved1. We could
not use aio_flags because it is not currently checked for invalidity
in the kernel.

This feature is provided for direct I/O of asynchronous I/O only. I have
tested it against xfs, ext4, and btrfs.

Changes since v1:
 + changed name from _NONBLOCKING to *_NOWAIT
 + filemap_range_has_page call moved to closer to (just before) calling filemap_write_and_wait_range().
 + BIO_NOWAIT limited to get_request()
 + XFS fixes 
	- included reflink 
	- use of xfs_ilock_nowait() instead of a XFS_IOLOCK_NONBLOCKING flag
	- Translate the flag through IOMAP_NOWAIT (iomap) to check for
	  block allocation for the file.
 + ext4 coding style

Changes since v2:
 + Using aio_reserved1 as aio_rw_flags instead of aio_flags
 + blk-mq support
 + xfs uptodate with kernel and reflink changes

-- 
Goldwyn

^ permalink raw reply

* Re: [PATCH 1/2] blk-mq: don't complete un-started request in timeout handler
From: Bart Van Assche @ 2017-03-15 21:35 UTC (permalink / raw)
  To: tom.leiming@gmail.com
  Cc: linux-kernel@vger.kernel.org, hch@infradead.org,
	linux-block@vger.kernel.org, yizhan@redhat.com, axboe@fb.com,
	stable@vger.kernel.org
In-Reply-To: <20170315162158.GA18768@ming.t460p>

On Thu, 2017-03-16 at 00:22 +0800, Ming Lei wrote:
> On Wed, Mar 15, 2017 at 03:36:31PM +0000, Bart Van Assche wrote:
> > Please have another look at __blk_mq_requeue_request(). In that functio=
n
> > the following code occurs: if (test_and_clear_bit(REQ_ATOM_STARTED,
> > &rq->atomic_flags)) { ... }
> >=20
> > I think the=A0REQ_ATOM_STARTED check in blk_mq_check_expired() races wi=
th the
> > test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags) call in
> > __blk_mq_requeue_request().
>=20
> OK, this race should only exist in case that the requeue happens after di=
spatch
> busy, because COMPLETE flag isn't set. And if the requeue is from io comp=
letion,
> no such race because COMPLETE flag is set.
>=20
> One solution I thought of is to call blk_mark_rq_complete() before requeu=
ing
> when dispatch busy happened, but that looks a bit silly. Another way is t=
o
> set STARTED flag just after .queue_rq returns BLK_MQ_RQ_QUEUE_OK, which l=
ooks
> reasonable too. Any comments on the 2nd solution?

Sorry but I don't think that would be sufficient. There are several other
scenarios that have not been mentioned above, e.g. that a tag gets freed an=
d
reused after the blk_mq_check_expired() call started and before that functi=
on
has had the chance to examine any members of struct request. What is needed=
 in
blk_mq_check_expired() is the following as a single atomic operation:
* Check whether REQ_ATOM_STARTED has been set.
* Check whether REQ_ATOM_COMPLETE has not yet been set.
* If both conditions have been met, set REQ_ATOM_COMPLETE.

I don't think there is another solution than using a single state variable =
to
represent the REQ_ATOM_STARTED and REQ_ATOM_COMPLETE states instead of two
independent bits. How about the patch below?

Thanks,

Bart.


[PATCH] blk-mq: Fix request state manipulation races

Use a single state variable instead of two separate bits
REQ_ATOM_STARTED and REQ_ATOM_COMPLETE. For blk-mq, make
__blk_mq_finish_request() perform the transition from "complete" to
"not complete" instead of blk_mq_start_request(). Make sure that
blk_mq_check_expired() uses a single atomic operation to test the
"started" and "complete" states and to set the "complete" state.
---
=A0block/blk-core.c=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0|=A0=A06 +++--
=A0block/blk-mq.c=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0| 67 ++++++++++++++=
+++-----------------------------
=A0block/blk-timeout.c=A0=A0=A0=A0=A0=A0=A0=A0|=A0=A02 +-
=A0block/blk.h=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0| 21 ++++++++=
++-----
=A0drivers/scsi/virtio_scsi.c |=A0=A02 +-
=A0include/linux/blkdev.h=A0=A0=A0=A0=A0|=A0=A01 +
=A06 files changed, 47 insertions(+), 52 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 0eeb99ef654f..dff857d2b86b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1305,7 +1305,7 @@ EXPORT_SYMBOL(blk_get_request);
=A0void blk_requeue_request(struct request_queue *q, struct request *rq)
=A0{
=A0	blk_delete_timer(rq);
-	blk_clear_rq_complete(rq);
+	atomic_set(&rq->state, REQ_NOT_STARTED);
=A0	trace_block_rq_requeue(q, rq);
=A0	wbt_requeue(q->rq_wb, &rq->issue_stat);
=A0
@@ -2477,7 +2477,9 @@ void blk_start_request(struct request *req)
=A0		wbt_issue(req->q->rq_wb, &req->issue_stat);
=A0	}
=A0
-	BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
+	WARN_ONCE(atomic_read(&req->state) !=3D REQ_NOT_STARTED,
+		=A0=A0"unexpected request state %d !=3D %d\n",
+		=A0=A0atomic_read(&req->state), REQ_NOT_STARTED);
=A0	blk_add_timer(req);
=A0}
=A0EXPORT_SYMBOL(blk_start_request);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 159187a28d66..fe73d5a1ffc3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -343,7 +343,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx=
, struct blk_mq_ctx *ctx,
=A0	wbt_done(q->rq_wb, &rq->issue_stat);
=A0	rq->rq_flags =3D 0;
=A0
-	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+	atomic_set(&rq->state, REQ_NOT_STARTED);
=A0	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
=A0	if (rq->tag !=3D -1)
=A0		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
@@ -479,7 +479,7 @@ EXPORT_SYMBOL(blk_mq_complete_request);
=A0
=A0int blk_mq_request_started(struct request *rq)
=A0{
-	return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+	return atomic_read(&rq->state) =3D=3D REQ_STARTED;
=A0}
=A0EXPORT_SYMBOL_GPL(blk_mq_request_started);
=A0
@@ -505,16 +505,10 @@ void blk_mq_start_request(struct request *rq)
=A0	=A0*/
=A0	smp_mb__before_atomic();
=A0
-	/*
-	=A0* Mark us as started and clear complete. Complete might have been
-	=A0* set if requeue raced with timeout, which then marked it as
-	=A0* complete. So be sure to clear complete again when we start
-	=A0* the request, otherwise we'll ignore the completion event.
-	=A0*/
-	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
-		set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-	if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
-		clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+	WARN_ONCE(atomic_read(&rq->state) !=3D REQ_NOT_STARTED,
+		=A0=A0"unexpected request state %d !=3D %d\n",
+		=A0=A0atomic_read(&rq->state), REQ_NOT_STARTED);
+	atomic_set(&rq->state, REQ_STARTED);
=A0
=A0	if (q->dma_drain_size && blk_rq_bytes(rq)) {
=A0		/*
@@ -530,12 +524,14 @@ EXPORT_SYMBOL(blk_mq_start_request);
=A0static void __blk_mq_requeue_request(struct request *rq)
=A0{
=A0	struct request_queue *q =3D rq->q;
+	enum rq_state prev;
=A0
=A0	trace_block_rq_requeue(q, rq);
=A0	wbt_requeue(q->rq_wb, &rq->issue_stat);
=A0	blk_mq_sched_requeue_request(rq);
=A0
-	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+	prev =3D atomic_xchg(&rq->state, REQ_NOT_STARTED);
+	if (prev !=3D REQ_NOT_STARTED) {
=A0		if (q->dma_drain_size && blk_rq_bytes(rq))
=A0			rq->nr_phys_segments--;
=A0	}
@@ -661,17 +657,7 @@ void blk_mq_rq_timed_out(struct request *req, bool res=
erved)
=A0	const struct blk_mq_ops *ops =3D req->q->mq_ops;
=A0	enum blk_eh_timer_return ret =3D BLK_EH_RESET_TIMER;
=A0
-	/*
-	=A0* We know that complete is set at this point. If STARTED isn't set
-	=A0* anymore, then the request isn't active and the "timeout" should
-	=A0* just be ignored. This can happen due to the bitflag ordering.
-	=A0* Timeout first checks if STARTED is set, and if it is, assumes
-	=A0* the request is active. But if we race with completion, then
-	=A0* we both flags will get cleared. So check here again, and ignore
-	=A0* a timeout event with a request that isn't active.
-	=A0*/
-	if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
-		return;
+	WARN_ON_ONCE(atomic_read(&req->state) !=3D REQ_COMPLETE);
=A0
=A0	if (ops->timeout)
=A0		ret =3D ops->timeout(req, reserved);
@@ -682,7 +668,7 @@ void blk_mq_rq_timed_out(struct request *req, bool rese=
rved)
=A0		break;
=A0	case BLK_EH_RESET_TIMER:
=A0		blk_add_timer(req);
-		blk_clear_rq_complete(req);
+		atomic_set(&req->state, REQ_STARTED);
=A0		break;
=A0	case BLK_EH_NOT_HANDLED:
=A0		break;
@@ -692,27 +678,24 @@ void blk_mq_rq_timed_out(struct request *req, bool re=
served)
=A0	}
=A0}
=A0
+/*
+ * Check whether or not a request has expired. This function can execute
+ * concurrently with other functions that change the request state. This c=
an
+ * result in returning a deadline (blk_mq_timeout_data.next) that occurs
+ * before a request times out. However, this is harmless because the next
+ * call of blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data) will
+ * yield the correct timeout, unless the same race occurs again.
+ */
=A0static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
=A0		struct request *rq, void *priv, bool reserved)
=A0{
=A0	struct blk_mq_timeout_data *data =3D priv;
=A0
-	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
-		/*
-		=A0* If a request wasn't started before the queue was
-		=A0* marked dying, kill it here or it'll go unnoticed.
-		=A0*/
-		if (unlikely(blk_queue_dying(rq->q))) {
-			rq->errors =3D -EIO;
-			blk_mq_end_request(rq, rq->errors);
-		}
-		return;
-	}
-
-	if (time_after_eq(jiffies, rq->deadline)) {
-		if (!blk_mark_rq_complete(rq))
-			blk_mq_rq_timed_out(rq, reserved);
-	} else if (!data->next_set || time_after(data->next, rq->deadline)) {
+	if (time_after_eq(jiffies, rq->deadline) &&
+	=A0=A0=A0=A0!blk_mark_rq_complete_if_started(rq)) {
+		blk_mq_rq_timed_out(rq, reserved);
+	} else if ((!data->next_set || time_after(data->next, rq->deadline)) &&
+		=A0=A0=A0blk_mq_request_started(rq)) {
=A0		data->next =3D rq->deadline;
=A0		data->next_set =3D 1;
=A0	}
@@ -2821,7 +2804,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_q=
ueue *q,
=A0
=A0	hrtimer_init_sleeper(&hs, current);
=A0	do {
-		if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+		if (atomic_read(&rq->state) =3D=3D REQ_COMPLETE)
=A0			break;
=A0		set_current_state(TASK_UNINTERRUPTIBLE);
=A0		hrtimer_start_expires(&hs.timer, mode);
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index a30441a200c0..9a8b44ebfb99 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -94,7 +94,7 @@ static void blk_rq_timed_out(struct request *req)
=A0		break;
=A0	case BLK_EH_RESET_TIMER:
=A0		blk_add_timer(req);
-		blk_clear_rq_complete(req);
+		atomic_set(&req->state, REQ_NOT_STARTED);
=A0		break;
=A0	case BLK_EH_NOT_HANDLED:
=A0		/*
diff --git a/block/blk.h b/block/blk.h
index d1ea4bd9b9a3..8af5fe21e85f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -115,23 +115,32 @@ void blk_account_io_done(struct request *req);
=A0 * Internal atomic flags for request handling
=A0 */
=A0enum rq_atomic_flags {
-	REQ_ATOM_COMPLETE =3D 0,
-	REQ_ATOM_STARTED,
=A0	REQ_ATOM_POLL_SLEPT,
=A0};
=A0
=A0/*
+ * Request states. Note: REQ_STARTED is only used by the blk-mq code.
+ */
+enum rq_state {
+	REQ_NOT_STARTED,
+	REQ_STARTED,
+	REQ_COMPLETE,
+};
+
+/*
=A0 * EH timer and IO completion will both attempt to 'grab' the request, m=
ake
- * sure that only one of them succeeds
+ * sure that only one of them succeeds. The return value 0 means that this
+ * function changed the request state from "not complete" into "complete".
=A0 */
=A0static inline int blk_mark_rq_complete(struct request *rq)
=A0{
-	return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+	return atomic_xchg(&rq->state, REQ_COMPLETE) =3D=3D REQ_COMPLETE;
=A0}
=A0
-static inline void blk_clear_rq_complete(struct request *rq)
+static inline int blk_mark_rq_complete_if_started(struct request *rq)
=A0{
-	clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+	return atomic_cmpxchg(&rq->state, REQ_STARTED, REQ_COMPLETE) !=3D
+		REQ_STARTED;
=A0}
=A0
=A0/*
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 939c47df73fa..136379097131 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -672,7 +672,7 @@ static int virtscsi_tmf(struct virtio_scsi *vscsi, stru=
ct virtio_scsi_cmd *cmd)
=A0	=A0*
=A0	=A0* In the abort case, sc->scsi_done will do nothing, because
=A0	=A0* the block layer must have detected a timeout and as a result
-	=A0* REQ_ATOM_COMPLETE has been set.
+	=A0* rq->state =3D=3D REQ_COMPLETED.
=A0	=A0*/
=A0	virtscsi_poll_requests(vscsi);
=A0
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5a7da607ca04..b286887b095e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -142,6 +142,7 @@ struct request {
=A0
=A0	int internal_tag;
=A0
+	atomic_t state;
=A0	unsigned long atomic_flags;
=A0
=A0	/* the following two fields are internal, NEVER access directly */
--=A0
2.12.0

^ permalink raw reply related

* Re: [PATCH 1/2] blk-mq: don't complete un-started request in timeout handler
From: Bart Van Assche @ 2017-03-15 21:34 UTC (permalink / raw)
  To: tom.leiming@gmail.com
  Cc: linux-kernel@vger.kernel.org, hch@infradead.org,
	linux-block@vger.kernel.org, yizhan@redhat.com, axboe@fb.com,
	stable@vger.kernel.org
In-Reply-To: <20170315121851.GA15807@ming.t460p>

On Wed, 2017-03-15 at 20:18 +0800, Ming Lei wrote:
> On Wed, Mar 15, 2017 at 12:07:37AM +0000, Bart Van Assche wrote:
> > Both the old and the new check look racy to me. The REQ_ATOM_STARTED bi=
t can
> > be changed concurrently by blk_mq_start_request(), __blk_mq_finish_requ=
est()
>=20
> blk_mq_start_request() and __blk_mq_finish_request() won't be run concurr=
ently.
>=20
> From view of __blk_mq_finish_request():
>=20
> 	- if it is run from merge queue io path(blk_mq_merge_queue_io()),
> 	blk_mq_start_request() can't be run at all, and the COMPLETE flag
> 	is kept as previous value(zero)
>=20
> 	- if it is run from normal complete path, COMPLETE flag is cleared
> 	before the req/tag is released to tag set.
>=20
> so there isn't race in blk_mq_start_request() vs. __blk_mq_finish_request=
()
> wrt. timeout.
>=20
> > or __blk_mq_requeue_request(). Another issue with this function is that=
 the
>=20
> __blk_mq_requeue_request() can be run from two pathes:
>=20
> 	- dispatch failure, in which case the req/tag isn't released to tag set
> =09
> 	- IO completion path, in which COMPLETE flag is cleared before requeue.
> =09
> so I can't see races with timeout in case of start rq vs. requeue rq.=20
>=20
> > request passed to this function can be reinitialized concurrently.

Hello Ming,

You misinterpret what I wrote. I was referring to manipulation of
REQ_ATOM_STARTED from different contexts and not to what you explained.

Bart.=

^ permalink raw reply

* [GIT PULL] Block fixes for 4.11-rc
From: Jens Axboe @ 2017-03-15 21:06 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org

Hi Linus,

Four small fixes for this cycle. This pull request contains:

- Followup fix from Neil for a fix that went in before -rc2, ensuring
  that we always see the full per-task bio_list.

- Fix for blk-mq-sched from me that ensures that we retain similar
  direct-to-issue behavior on running the queue.

- Fix from Sagi fixing a potential NULL pointer dereference in blk-mq on
  spurious CPU unplug.

- A memory leak fix in writeback from Tahsin, fixing a case where device
  removal of a mounted device can leak a struct wb_writeback_work.

Please pull!


  git://git.kernel.dk/linux-block.git for-linus


----------------------------------------------------------------
Jens Axboe (1):
      blk-mq-sched: don't run the queue async from blk_mq_try_issue_directly()

NeilBrown (1):
      blk: Ensure users for current->bio_list can see the full list.

Sagi Grimberg (1):
      blk-mq: Fix tagset reinit in the presence of cpu hot-unplug

Tahsin Erdogan (1):
      writeback: fix memory leak in wb_queue_work()

 block/bio.c         | 12 +++++++++---
 block/blk-core.c    | 30 ++++++++++++++++++------------
 block/blk-mq-tag.c  |  3 +++
 block/blk-mq.c      |  9 +++++----
 drivers/md/dm.c     | 29 ++++++++++++++++-------------
 drivers/md/raid10.c |  3 ++-
 fs/fs-writeback.c   | 35 +++++++++++++++++++++--------------
 7 files changed, 74 insertions(+), 47 deletions(-)

-- 
Jens Axboe

^ permalink raw reply

* Re: [PATCH RFC 10/14] block, bfq: add Early Queue Merge (EQM)
From: Jens Axboe @ 2017-03-15 21:01 UTC (permalink / raw)
  To: Paolo Valente
  Cc: Tejun Heo, Fabio Checconi, Arianna Avanzini, linux-block,
	Linux-Kernal, Ulf Hansson, Linus Walleij, broonie,
	Mauro Andreolini
In-Reply-To: <609A716A-F334-4A17-AA87-695CF789DFB2@linaro.org>

On 03/15/2017 11:02 AM, Paolo Valente wrote:
> 
>> Il giorno 15 mar 2017, alle ore 17:56, Jens Axboe <axboe@kernel.dk> ha scritto:
>>
>> On 03/04/2017 09:01 AM, Paolo Valente wrote:
>>> @@ -6330,7 +7012,41 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
>>>
>>> static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
>>> {
>>> -	struct bfq_queue *bfqq = RQ_BFQQ(rq);
>>> +	struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
>>> +
>>> +	/*
>>> +	 * An unplug may trigger a requeue of a request from the device
>>> +	 * driver: make sure we are in process context while trying to
>>> +	 * merge two bfq_queues.
>>> +	 */
>>> +	if (!in_interrupt()) {
>>
>> What's the reason for this?
> 
> None :(
> 
> Just pre-existing, working code that I did not update, sorry.

OK good, then that check can simply be killed.

-- 
Jens Axboe

^ permalink raw reply

* Re: [PATCH RFC 10/14] block, bfq: add Early Queue Merge (EQM)
From: Jens Axboe @ 2017-03-15 21:00 UTC (permalink / raw)
  To: Paolo Valente
  Cc: Tejun Heo, Fabio Checconi, Arianna Avanzini, linux-block,
	Linux-Kernal, Ulf Hansson, Linus Walleij, broonie,
	Mauro Andreolini
In-Reply-To: <2086B143-E8E3-43AD-B074-771378579081@linaro.org>

On 03/15/2017 10:59 AM, Paolo Valente wrote:
> 
>> Il giorno 15 mar 2017, alle ore 17:30, Jens Axboe <axboe@kernel.dk> ha scritto:
>>
>> On 03/15/2017 09:47 AM, Jens Axboe wrote:
>>> I think you understood me correctly. Currently I think the putting of
>>> the io context is somewhat of a mess. You have seemingly random places
>>> where you have to use special unlock functions, to ensure that you
>>> notice that some caller deeper down has set ->ioc_to_put. I took a quick
>>> look at it, and by far most of the cases can return an io_context to
>>> free quite easily. You can mark these functions __must_check to ensure
>>> that we don't drop an io_context, inadvertently. That's already a win
>>> over the random ->ioc_to_put store. And you can then get rid of
>>> bfq_unlock_put_ioc and it's irq variant as well.
>>>
>>> The places where you are already returning a value, like off dispatch
>>> for instance, you can just pass in a pointer to an io_context pointer.
>>>
>>> If you get this right, it'll be a lot less fragile and hacky than your
>>> current approach.
>>
>> Even just looking a little closer, you also find cases where you
>> potentially twice store ->ioc_to_put. That kind of mixup can't happen if
>> you return it properly.
>>
>> In __bfq_dispatch_request(), for instance. You call bfq_select_queue(),
>> and that in turn calls bfq_bfqq_expire(), which calls
>> __bfq_bfqq_expire() which can set ->ioc_to_put. But later on,
>> __bfq_dispatch_request() calls bfq_dispatch_rq_from_bfqq(), which in
>> turn calls bfq_bfqq_expire() that can also set ->ioc_to_put. There's no
>> "magic" bfq_unlock_and_put_ioc() in-between those. Maybe the former call
>> never sets ->ioc_to_put if it returns with bfqq == NULL? Hard to tell.
>>
>> Or __bfq_insert_request(), it calls bfq_add_request(), which may set
>> ->ioc_to_put through bfq_bfqq_handle_idle_busy_switch() ->
>> bfq_bfqq_expire(). And then from calling bfq_rq_enqueued() ->
>> bfq_bfqq_expire().
>>
> 
> I have checked that.  Basically, since a queue can't be expired twice,
> then it should never happen that ioc_to_put is set twice before being
> used.  Yet, I do agree that using a shared field and exploiting
> collateral effects makes code very complex and fragile (maybe even
> buggy if my speculative check is wrong).  Just, it has been the best
> solution I found, to avoid deferred work as you asked.  In fact, I
> still find quite heavy the alternative of passing a pointer to an ioc
> forth and back across seven or eight nested functions.

It's not heavy at all, I went through all of it this morning. It's not
super pretty either, since you end up passing back an io_context which
is seemingly unrelated to what the functions otherwise do. But that's
mostly a reflection of the implementation, not that it's a bad way to go
about this in general. The worst bits are the places where you want to
add a

	WARN_ON(ret != NULL);

between two calls that potentially both drop the ioc. In terms of
overhead, it's not heavy. Punting to a workqueue would be orders of
magnitude more expensive.

>> There might be more, but I think the above is plenty of evidence that
>> the current ->ioc_to_put solution is a bad hack, fragile, and already
>> has bugs.
>>
>> How often do you expect this putting of the io_context to happen?
> 
> Unfortunately often, as it must be done also every time the in-service
> queue is reset.  But, in this respect, are we sure that we do need to
> grab a reference to the ioc when we set a queue in service (as done in
> cfq, and copied into bfq)?  I mean, we have the hook exit_ioc for
> controlling the disappearing of an ioc.  Am I missing something here
> too?

No, in fact that'd be perfectly fine. It's easier for CFQ to just retain
the reference so we know it's not going away, but for your case, it
might in fact make more sense to simply be able to de-service a queue if
the process exits. And if you do that, we can drop all this passing back
of ioc (or ->ioc_to_put) craziness, without having to punt to a
workqueue either.

This will be more efficient too, since it'll be a much more rare
occurence.

-- 
Jens Axboe

^ permalink raw reply

* Re: [PATCH RFC 10/14] block, bfq: add Early Queue Merge (EQM)
From: Paolo Valente @ 2017-03-15 17:02 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tejun Heo, Fabio Checconi, Arianna Avanzini, linux-block,
	Linux-Kernal, Ulf Hansson, Linus Walleij, broonie,
	Mauro Andreolini
In-Reply-To: <64ef5694-d62d-e14c-ce80-a213c406bf22@kernel.dk>


> Il giorno 15 mar 2017, alle ore 17:56, Jens Axboe <axboe@kernel.dk> ha =
scritto:
>=20
> On 03/04/2017 09:01 AM, Paolo Valente wrote:
>> @@ -6330,7 +7012,41 @@ static void bfq_rq_enqueued(struct bfq_data =
*bfqd, struct bfq_queue *bfqq,
>>=20
>> static void __bfq_insert_request(struct bfq_data *bfqd, struct =
request *rq)
>> {
>> -	struct bfq_queue *bfqq =3D RQ_BFQQ(rq);
>> +	struct bfq_queue *bfqq =3D RQ_BFQQ(rq), *new_bfqq;
>> +
>> +	/*
>> +	 * An unplug may trigger a requeue of a request from the device
>> +	 * driver: make sure we are in process context while trying to
>> +	 * merge two bfq_queues.
>> +	 */
>> +	if (!in_interrupt()) {
>=20
> What's the reason for this?

None :(

Just pre-existing, working code that I did not update, sorry.

> Don't use in_interrupt() to guide any of
> your decision making here.
>=20

Of course, sorry for these silly mistakes.

Thanks,
Paolo

> --=20
> Jens Axboe
>=20

^ permalink raw reply

* Re: [PATCH RFC 10/14] block, bfq: add Early Queue Merge (EQM)
From: Paolo Valente @ 2017-03-15 16:59 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tejun Heo, Fabio Checconi, Arianna Avanzini, linux-block,
	Linux-Kernal, Ulf Hansson, Linus Walleij, broonie,
	Mauro Andreolini
In-Reply-To: <7b7c0d4c-c10d-1e69-4ea0-1ad05a4100a2@kernel.dk>


> Il giorno 15 mar 2017, alle ore 17:30, Jens Axboe <axboe@kernel.dk> ha =
scritto:
>=20
> On 03/15/2017 09:47 AM, Jens Axboe wrote:
>> I think you understood me correctly. Currently I think the putting of
>> the io context is somewhat of a mess. You have seemingly random =
places
>> where you have to use special unlock functions, to ensure that you
>> notice that some caller deeper down has set ->ioc_to_put. I took a =
quick
>> look at it, and by far most of the cases can return an io_context to
>> free quite easily. You can mark these functions __must_check to =
ensure
>> that we don't drop an io_context, inadvertently. That's already a win
>> over the random ->ioc_to_put store. And you can then get rid of
>> bfq_unlock_put_ioc and it's irq variant as well.
>>=20
>> The places where you are already returning a value, like off dispatch
>> for instance, you can just pass in a pointer to an io_context =
pointer.
>>=20
>> If you get this right, it'll be a lot less fragile and hacky than =
your
>> current approach.
>=20
> Even just looking a little closer, you also find cases where you
> potentially twice store ->ioc_to_put. That kind of mixup can't happen =
if
> you return it properly.
>=20
> In __bfq_dispatch_request(), for instance. You call =
bfq_select_queue(),
> and that in turn calls bfq_bfqq_expire(), which calls
> __bfq_bfqq_expire() which can set ->ioc_to_put. But later on,
> __bfq_dispatch_request() calls bfq_dispatch_rq_from_bfqq(), which in
> turn calls bfq_bfqq_expire() that can also set ->ioc_to_put. There's =
no
> "magic" bfq_unlock_and_put_ioc() in-between those. Maybe the former =
call
> never sets ->ioc_to_put if it returns with bfqq =3D=3D NULL? Hard to =
tell.
>=20
> Or __bfq_insert_request(), it calls bfq_add_request(), which may set
> ->ioc_to_put through bfq_bfqq_handle_idle_busy_switch() ->
> bfq_bfqq_expire(). And then from calling bfq_rq_enqueued() ->
> bfq_bfqq_expire().
>=20

I have checked that.  Basically, since a queue can't be expired twice,
then it should never happen that ioc_to_put is set twice before being
used.  Yet, I do agree that using a shared field and exploiting
collateral effects makes code very complex and fragile (maybe even
buggy if my speculative check is wrong).  Just, it has been the best
solution I found, to avoid deferred work as you asked.  In fact, I
still find quite heavy the alternative of passing a pointer to an ioc
forth and back across seven or eight nested functions.

> There might be more, but I think the above is plenty of evidence that
> the current ->ioc_to_put solution is a bad hack, fragile, and already
> has bugs.
>=20
> How often do you expect this putting of the io_context to happen?

Unfortunately often, as it must be done also every time the in-service
queue is reset.  But, in this respect, are we sure that we do need to
grab a reference to the ioc when we set a queue in service (as done in
cfq, and copied into bfq)?  I mean, we have the hook exit_ioc for
controlling the disappearing of an ioc.  Am I missing something here
too?

Thanks,
Paolo

> If
> it's not a very frequent occurence, maybe using a deferred workqueue =
to
> put it IS the right solution. As it currently stands, the code doesn't
> really work, and it's fragile. It can't be cleaned up without
> refactoring, since the call paths are all extremely intermingled.
>=20
> --=20
> Jens Axboe
>=20

^ permalink raw reply

* Re: [PATCH RFC 10/14] block, bfq: add Early Queue Merge (EQM)
From: Jens Axboe @ 2017-03-15 16:56 UTC (permalink / raw)
  To: Paolo Valente, Tejun Heo
  Cc: Fabio Checconi, Arianna Avanzini, linux-block, linux-kernel,
	ulf.hansson, linus.walleij, broonie, Mauro Andreolini
In-Reply-To: <20170304160131.57366-11-paolo.valente@linaro.org>

On 03/04/2017 09:01 AM, Paolo Valente wrote:
> @@ -6330,7 +7012,41 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
>  
>  static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
>  {
> -	struct bfq_queue *bfqq = RQ_BFQQ(rq);
> +	struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
> +
> +	/*
> +	 * An unplug may trigger a requeue of a request from the device
> +	 * driver: make sure we are in process context while trying to
> +	 * merge two bfq_queues.
> +	 */
> +	if (!in_interrupt()) {

What's the reason for this? Don't use in_interrupt() to guide any of
your decision making here.

-- 
Jens Axboe

^ permalink raw reply

* Re: [PATCH 1/2] blk-mq: don't complete un-started request in timeout handler
From: Ming Lei @ 2017-03-15 16:46 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-kernel@vger.kernel.org, hch@infradead.org,
	linux-block@vger.kernel.org, yizhan@redhat.com, axboe@fb.com,
	stable@vger.kernel.org
In-Reply-To: <20170315162158.GA18768@ming.t460p>

On Thu, Mar 16, 2017 at 12:22:01AM +0800, Ming Lei wrote:
> On Wed, Mar 15, 2017 at 03:36:31PM +0000, Bart Van Assche wrote:
> 
> OK, this race should only exist in case that the requeue happens after dispatch
> busy, because COMPLETE flag isn't set. And if the requeue is from io completion,
> no such race because COMPLETE flag is set.
> 
> One solution I thought of is to call blk_mark_rq_complete() before requeuing
> when dispatch busy happened, but that looks a bit silly. Another way is to
> set STARTED flag just after .queue_rq returns BLK_MQ_RQ_QUEUE_OK, which looks
> reasonable too. Any comments on the 2nd solution?

Actually it isn't possible to happen because rq->deadline is just set
in blk_mq_start_request() called from .queue_rq, and it won't trigger
timeout handling even STARTED is observed as true in blk_mq_check_expired()
because timeout period is often set as big enough. So it is still safe, isn't it?

But this situation should have been commented.

Thanks,
Ming

^ permalink raw reply

* Re: [PATCH RFC 10/14] block, bfq: add Early Queue Merge (EQM)
From: Jens Axboe @ 2017-03-15 16:30 UTC (permalink / raw)
  To: Paolo Valente
  Cc: Tejun Heo, Fabio Checconi, Arianna Avanzini, linux-block,
	linux-kernel, ulf.hansson, linus.walleij, broonie,
	Mauro Andreolini
In-Reply-To: <fe98e76a-c4c6-dcd6-93c6-55cbbbc1db76@kernel.dk>

On 03/15/2017 09:47 AM, Jens Axboe wrote:
> I think you understood me correctly. Currently I think the putting of
> the io context is somewhat of a mess. You have seemingly random places
> where you have to use special unlock functions, to ensure that you
> notice that some caller deeper down has set ->ioc_to_put. I took a quick
> look at it, and by far most of the cases can return an io_context to
> free quite easily. You can mark these functions __must_check to ensure
> that we don't drop an io_context, inadvertently. That's already a win
> over the random ->ioc_to_put store. And you can then get rid of
> bfq_unlock_put_ioc and it's irq variant as well.
> 
> The places where you are already returning a value, like off dispatch
> for instance, you can just pass in a pointer to an io_context pointer.
> 
> If you get this right, it'll be a lot less fragile and hacky than your
> current approach.

Even just looking a little closer, you also find cases where you
potentially twice store ->ioc_to_put. That kind of mixup can't happen if
you return it properly.

In __bfq_dispatch_request(), for instance. You call bfq_select_queue(),
and that in turn calls bfq_bfqq_expire(), which calls
__bfq_bfqq_expire() which can set ->ioc_to_put. But later on,
__bfq_dispatch_request() calls bfq_dispatch_rq_from_bfqq(), which in
turn calls bfq_bfqq_expire() that can also set ->ioc_to_put. There's no
"magic" bfq_unlock_and_put_ioc() in-between those. Maybe the former call
never sets ->ioc_to_put if it returns with bfqq == NULL? Hard to tell.

Or __bfq_insert_request(), it calls bfq_add_request(), which may set
->ioc_to_put through bfq_bfqq_handle_idle_busy_switch() ->
bfq_bfqq_expire(). And then from calling bfq_rq_enqueued() ->
bfq_bfqq_expire().

There might be more, but I think the above is plenty of evidence that
the current ->ioc_to_put solution is a bad hack, fragile, and already
has bugs.

How often do you expect this putting of the io_context to happen? If
it's not a very frequent occurence, maybe using a deferred workqueue to
put it IS the right solution. As it currently stands, the code doesn't
really work, and it's fragile. It can't be cleaned up without
refactoring, since the call paths are all extremely intermingled.

-- 
Jens Axboe

^ permalink raw reply

* Re: [PATCH 1/2] blk-mq: don't complete un-started request in timeout handler
From: Ming Lei @ 2017-03-15 16:22 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-kernel@vger.kernel.org, hch@infradead.org,
	linux-block@vger.kernel.org, yizhan@redhat.com, axboe@fb.com,
	stable@vger.kernel.org
In-Reply-To: <1489592177.2660.1.camel@sandisk.com>

On Wed, Mar 15, 2017 at 03:36:31PM +0000, Bart Van Assche wrote:
> On Wed, 2017-03-15 at 20:40 +0800, Ming Lei wrote:
> > On Wed, Mar 15, 2017 at 08:18:53PM +0800, Ming Lei wrote:
> > > On Wed, Mar 15, 2017 at 12:07:37AM +0000, Bart Van Assche wrote:
> > > 
> > > > or __blk_mq_requeue_request(). Another issue with this function is that the
> > > 
> > > __blk_mq_requeue_request() can be run from two pathes:
> > > 
> > > 	- dispatch failure, in which case the req/tag isn't released to tag set
> > > 	
> > > 	- IO completion path, in which COMPLETE flag is cleared before requeue.
> > > 	
> > > so I can't see races with timeout in case of start rq vs. requeue rq. 
> > 
> > Actually rq/tag won't be released to tag set if it will be requeued, so
> > the timeout race is nothing to do with requeue.
> 
> Hello Ming,
> 
> Please have another look at __blk_mq_requeue_request(). In that function
> the following code occurs: if (test_and_clear_bit(REQ_ATOM_STARTED,
> &rq->atomic_flags)) { ... }
> 
> I think the�REQ_ATOM_STARTED check in blk_mq_check_expired() races with the
> test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags) call in
> __blk_mq_requeue_request().

OK, this race should only exist in case that the requeue happens after dispatch
busy, because COMPLETE flag isn't set. And if the requeue is from io completion,
no such race because COMPLETE flag is set.

One solution I thought of is to call blk_mark_rq_complete() before requeuing
when dispatch busy happened, but that looks a bit silly. Another way is to
set STARTED flag just after .queue_rq returns BLK_MQ_RQ_QUEUE_OK, which looks
reasonable too. Any comments on the 2nd solution?


Thanks,
Ming

^ permalink raw reply

* Re: [PATCH RFC 10/14] block, bfq: add Early Queue Merge (EQM)
From: Jens Axboe @ 2017-03-15 15:47 UTC (permalink / raw)
  To: Paolo Valente
  Cc: Tejun Heo, Fabio Checconi, Arianna Avanzini, linux-block,
	linux-kernel, ulf.hansson, linus.walleij, broonie,
	Mauro Andreolini
In-Reply-To: <1D2EDF98-59BA-4A98-8251-9BEC4AC752C4@linaro.org>

On 03/15/2017 06:01 AM, Paolo Valente wrote:
> 
>> Il giorno 07 mar 2017, alle ore 18:44, Jens Axboe <axboe@kernel.dk> ha scritto:
>>
>> On 03/04/2017 09:01 AM, Paolo Valente wrote:
>>> @@ -560,6 +600,15 @@ struct bfq_data {
>>> 	struct bfq_io_cq *bio_bic;
>>> 	/* bfqq associated with the task issuing current bio for merging */
>>> 	struct bfq_queue *bio_bfqq;
>>> +
>>> +	/*
>>> +	 * io context to put right after bfqd->lock is released. This
>>> +	 * filed is used to perform put_io_context, when needed, to
>>> +	 * after the scheduler lock has been released, and thus
>>> +	 * prevent an ioc->lock from being possibly taken while the
>>> +	 * scheduler lock is being held.
>>> +	 */
>>> +	struct io_context *ioc_to_put;
>>> };
>>
>> The logic around this is nasty, effectively you end up having locking
>> around sections of code instea of structures, which is never a good
>> idea.
>>
>> The helper functions for unlocking and dropping the ioc add to the mess
>> as well.
>>
> 
> Hi Jens,
> fortunately I seem to have found and fixed the bug causing the failure
> your reported in one of your previous emails, so I've started addressing
> the issue you raise here.  But your suggestion below raised doubts
> that I was not able to solve.  So I'm bailing out and asking for help.

Great (on fixing that other bug).

>> Can't we simply pass back a pointer to an ioc to free? That should be
>> possible, given that we must have grabbed the bfqd lock ourselves
>> further up in the call chain. So we _know_ that we'll drop it later on.
>> If that wasn't the case, the existing logic wouldn't work.
>>
> 
> One of the two functions that discover that an ioc has to bee freed,
> namely __bfq_bfqd_reset_in_service, is invoked at the end of several
> relatively long chains of function invocations.  The heads of these
> chains take and release the scheduler lock.  One example is:
> 
> bfq_dispatch_request -> __bfq_dispatch_request -> bfq_select_queue -> bfq_bfqq_expire  -> __bfq_bfqq_expire -> __bfq_bfqd_reset_in_service
> 
> To implement your proposal, all the functions involved in these chains
> should be extended to pass back the ioc to put.  The resulting, heavy
> version of the code seems really unadvisable, and prone to errors when
> one modifies or adds some chain.
> 
> So I have certainly misunderstood something.  As usual, to help you
> help me more quickly, here is a summary of what I have understood on
> this matter.
> 
> 1.  For similar, if not exactly the same, lock-nesting issue related
> to io-context putting, deferred work is used.  Probably deferred work
> is used also for other reasons, but for sure it does solve this issue too.
> 
> 2.  My solution (which I'm not defending; I'm just trying to
> understand) solves the same issue as above: put the io
> context after the other lock is released.  But it solves it with no
> work-queueing overhead.  Instead of queueing work, it 'queues' the ioc
> to put, and puts it right after releasing the scheduler lock.
> 
> Where is my mistake?  And what is the correct interpretation of your
> proposal to pass back the pointer (instead of storing it in a field of
> the device data structure)?

I think you understood me correctly. Currently I think the putting of
the io context is somewhat of a mess. You have seemingly random places
where you have to use special unlock functions, to ensure that you
notice that some caller deeper down has set ->ioc_to_put. I took a quick
look at it, and by far most of the cases can return an io_context to
free quite easily. You can mark these functions __must_check to ensure
that we don't drop an io_context, inadvertently. That's already a win
over the random ->ioc_to_put store. And you can then get rid of
bfq_unlock_put_ioc and it's irq variant as well.

The places where you are already returning a value, like off dispatch
for instance, you can just pass in a pointer to an io_context pointer.

If you get this right, it'll be a lot less fragile and hacky than your
current approach.

I'd avoid having to do deferred put from a workqueue at all costs. This
is an _expensive_ operation.

-- 
Jens Axboe

^ permalink raw reply

* Re: [PATCH 1/2] blk-mq: don't complete un-started request in timeout handler
From: Bart Van Assche @ 2017-03-15 15:36 UTC (permalink / raw)
  To: tom.leiming@gmail.com
  Cc: linux-kernel@vger.kernel.org, hch@infradead.org,
	linux-block@vger.kernel.org, yizhan@redhat.com, axboe@fb.com,
	stable@vger.kernel.org
In-Reply-To: <20170315124024.GA16549@ming.t460p>

On Wed, 2017-03-15 at 20:40 +0800, Ming Lei wrote:
> On Wed, Mar 15, 2017 at 08:18:53PM +0800, Ming Lei wrote:
> > On Wed, Mar 15, 2017 at 12:07:37AM +0000, Bart Van Assche wrote:
> >=20
> > > or __blk_mq_requeue_request(). Another issue with this function is th=
at the
> >=20
> > __blk_mq_requeue_request() can be run from two pathes:
> >=20
> > 	- dispatch failure, in which case the req/tag isn't released to tag se=
t
> > =09
> > 	- IO completion path, in which COMPLETE flag is cleared before requeue=
.
> > =09
> > so I can't see races with timeout in case of start rq vs. requeue rq.=20
>=20
> Actually rq/tag won't be released to tag set if it will be requeued, so
> the timeout race is nothing to do with requeue.

Hello Ming,

Please have another look at __blk_mq_requeue_request(). In that function
the following code occurs: if (test_and_clear_bit(REQ_ATOM_STARTED,
&rq->atomic_flags)) { ... }

I think the=A0REQ_ATOM_STARTED check in blk_mq_check_expired() races with t=
he
test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags) call in
__blk_mq_requeue_request().

Bart.=

^ permalink raw reply

* Re: [PATCH 1/2] blk-mq: don't complete un-started request in timeout handler
From: Yi Zhang @ 2017-03-15 14:11 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-kernel, linux-block, Christoph Hellwig, stable
In-Reply-To: <1489064578-17305-3-git-send-email-tom.leiming@gmail.com>

Thanks Ming.

Tested-by: Yi Zhang <yizhan@redhat.com>

Best Regards,
  Yi Zhang


----- Original Message -----
From: "Ming Lei" <tom.leiming@gmail.com>
To: "Jens Axboe" <axboe@fb.com>, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, "Christoph Hellwig" <hch@infradead.org>
Cc: "Yi Zhang" <yizhan@redhat.com>, "Ming Lei" <tom.leiming@gmail.com>, stable@vger.kernel.org
Sent: Thursday, March 9, 2017 9:02:57 PM
Subject: [PATCH 1/2] blk-mq: don't complete un-started request in timeout handler

When iterating busy request in timeout handler,
if the STARTED flag of one request isn't set, that means
the request is being processed in block layer, and not
dispatched to low level driver yet.

In current implementation of blk_mq_check_expired(),
in case that the request queue becomes dying, un-started
requests are completed immediately. This way can cause
use-after-free issue[1][2] when doing I/O and removing&resetting
NVMe device.

This patch fixes several issues reported by Yi Zhang.

[1]. oops log 1
[  581.789754] ------------[ cut here ]------------
[  581.789758] kernel BUG at block/blk-mq.c:374!
[  581.789760] invalid opcode: 0000 [#1] SMP
[  581.789761] Modules linked in: vfat fat ipmi_ssif intel_rapl sb_edac
edac_core x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm nvme
irqbypass crct10dif_pclmul nvme_core crc32_pclmul ghash_clmulni_intel
intel_cstate ipmi_si mei_me ipmi_devintf intel_uncore sg ipmi_msghandler
intel_rapl_perf iTCO_wdt mei iTCO_vendor_support mxm_wmi lpc_ich dcdbas shpchp
pcspkr acpi_power_meter wmi nfsd auth_rpcgss nfs_acl lockd dm_multipath grace
sunrpc ip_tables xfs libcrc32c sd_mod mgag200 i2c_algo_bit drm_kms_helper
syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm ahci libahci
crc32c_intel tg3 libata megaraid_sas i2c_core ptp fjes pps_core dm_mirror
dm_region_hash dm_log dm_mod
[  581.789796] CPU: 1 PID: 1617 Comm: kworker/1:1H Not tainted 4.10.0.bz1420297+ #4
[  581.789797] Hardware name: Dell Inc. PowerEdge R730xd/072T6D, BIOS 2.2.5 09/06/2016
[  581.789804] Workqueue: kblockd blk_mq_timeout_work
[  581.789806] task: ffff8804721c8000 task.stack: ffffc90006ee4000
[  581.789809] RIP: 0010:blk_mq_end_request+0x58/0x70
[  581.789810] RSP: 0018:ffffc90006ee7d50 EFLAGS: 00010202
[  581.789811] RAX: 0000000000000001 RBX: ffff8802e4195340 RCX: ffff88028e2f4b88
[  581.789812] RDX: 0000000000001000 RSI: 0000000000001000 RDI: 0000000000000000
[  581.789813] RBP: ffffc90006ee7d60 R08: 0000000000000003 R09: ffff88028e2f4b00
[  581.789814] R10: 0000000000001000 R11: 0000000000000001 R12: 00000000fffffffb
[  581.789815] R13: ffff88042abe5780 R14: 000000000000002d R15: ffff88046fbdff80
[  581.789817] FS:  0000000000000000(0000) GS:ffff88047fc00000(0000) knlGS:0000000000000000
[  581.789818] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  581.789819] CR2: 00007f64f403a008 CR3: 000000014d078000 CR4: 00000000001406e0
[  581.789820] Call Trace:
[  581.789825]  blk_mq_check_expired+0x76/0x80
[  581.789828]  bt_iter+0x45/0x50
[  581.789830]  blk_mq_queue_tag_busy_iter+0xdd/0x1f0
[  581.789832]  ? blk_mq_rq_timed_out+0x70/0x70
[  581.789833]  ? blk_mq_rq_timed_out+0x70/0x70
[  581.789840]  ? __switch_to+0x140/0x450
[  581.789841]  blk_mq_timeout_work+0x88/0x170
[  581.789845]  process_one_work+0x165/0x410
[  581.789847]  worker_thread+0x137/0x4c0
[  581.789851]  kthread+0x101/0x140
[  581.789853]  ? rescuer_thread+0x3b0/0x3b0
[  581.789855]  ? kthread_park+0x90/0x90
[  581.789860]  ret_from_fork+0x2c/0x40
[  581.789861] Code: 48 85 c0 74 0d 44 89 e6 48 89 df ff d0 5b 41 5c 5d c3 48
8b bb 70 01 00 00 48 85 ff 75 0f 48 89 df e8 7d f0 ff ff 5b 41 5c 5d c3 <0f>
0b e8 71 f0 ff ff 90 eb e9 0f 1f 40 00 66 2e 0f 1f 84 00 00
[  581.789882] RIP: blk_mq_end_request+0x58/0x70 RSP: ffffc90006ee7d50
[  581.789889] ---[ end trace bcaf03d9a14a0a70 ]---

[2]. oops log2
[ 6984.857362] BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
[ 6984.857372] IP: nvme_queue_rq+0x6e6/0x8cd [nvme]
[ 6984.857373] PGD 0
[ 6984.857374]
[ 6984.857376] Oops: 0000 [#1] SMP
[ 6984.857379] Modules linked in: ipmi_ssif vfat fat intel_rapl sb_edac
edac_core x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm
irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel ipmi_si iTCO_wdt
iTCO_vendor_support mxm_wmi ipmi_devintf intel_cstate sg dcdbas intel_uncore
mei_me intel_rapl_perf mei pcspkr lpc_ich ipmi_msghandler shpchp
acpi_power_meter wmi nfsd auth_rpcgss dm_multipath nfs_acl lockd grace sunrpc
ip_tables xfs libcrc32c sd_mod mgag200 i2c_algo_bit drm_kms_helper syscopyarea
sysfillrect crc32c_intel sysimgblt fb_sys_fops ttm nvme drm nvme_core ahci
libahci i2c_core tg3 libata ptp megaraid_sas pps_core fjes dm_mirror
dm_region_hash dm_log dm_mod
[ 6984.857416] CPU: 7 PID: 1635 Comm: kworker/7:1H Not tainted
4.10.0-2.el7.bz1420297.x86_64 #1
[ 6984.857417] Hardware name: Dell Inc. PowerEdge R730xd/072T6D, BIOS 2.2.5 09/06/2016
[ 6984.857427] Workqueue: kblockd blk_mq_run_work_fn
[ 6984.857429] task: ffff880476e3da00 task.stack: ffffc90002e90000
[ 6984.857432] RIP: 0010:nvme_queue_rq+0x6e6/0x8cd [nvme]
[ 6984.857433] RSP: 0018:ffffc90002e93c50 EFLAGS: 00010246
[ 6984.857434] RAX: 0000000000000000 RBX: ffff880275646600 RCX: 0000000000001000
[ 6984.857435] RDX: 0000000000000fff RSI: 00000002fba2a000 RDI: ffff8804734e6950
[ 6984.857436] RBP: ffffc90002e93d30 R08: 0000000000002000 R09: 0000000000001000
[ 6984.857437] R10: 0000000000001000 R11: 0000000000000000 R12: ffff8804741d8000
[ 6984.857438] R13: 0000000000000040 R14: ffff880475649f80 R15: ffff8804734e6780
[ 6984.857439] FS:  0000000000000000(0000) GS:ffff88047fcc0000(0000) knlGS:0000000000000000
[ 6984.857440] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 6984.857442] CR2: 0000000000000010 CR3: 0000000001c09000 CR4: 00000000001406e0
[ 6984.857443] Call Trace:
[ 6984.857451]  ? mempool_free+0x2b/0x80
[ 6984.857455]  ? bio_free+0x4e/0x60
[ 6984.857459]  blk_mq_dispatch_rq_list+0xf5/0x230
[ 6984.857462]  blk_mq_process_rq_list+0x133/0x170
[ 6984.857465]  __blk_mq_run_hw_queue+0x8c/0xa0
[ 6984.857467]  blk_mq_run_work_fn+0x12/0x20
[ 6984.857473]  process_one_work+0x165/0x410
[ 6984.857475]  worker_thread+0x137/0x4c0
[ 6984.857478]  kthread+0x101/0x140
[ 6984.857480]  ? rescuer_thread+0x3b0/0x3b0
[ 6984.857481]  ? kthread_park+0x90/0x90
[ 6984.857489]  ret_from_fork+0x2c/0x40
[ 6984.857490] Code: 8b bd 70 ff ff ff 89 95 50 ff ff ff 89 8d 58 ff ff ff 44
89 95 60 ff ff ff e8 b7 dd 12 e1 8b 95 50 ff ff ff 48 89 85 68 ff ff ff <4c>
8b 48 10 44 8b 58 18 8b 8d 58 ff ff ff 44 8b 95 60 ff ff ff
[ 6984.857511] RIP: nvme_queue_rq+0x6e6/0x8cd [nvme] RSP: ffffc90002e93c50
[ 6984.857512] CR2: 0000000000000010
[ 6984.895359] ---[ end trace 2d7ceb528432bf83 ]---

Cc: stable@vger.kernel.org
Reported-by: Yi Zhang <yizhan@redhat.com>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 block/blk-mq.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 159187a28d66..0aff380099d5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -697,17 +697,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 {
 	struct blk_mq_timeout_data *data = priv;
 
-	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
-		/*
-		 * If a request wasn't started before the queue was
-		 * marked dying, kill it here or it'll go unnoticed.
-		 */
-		if (unlikely(blk_queue_dying(rq->q))) {
-			rq->errors = -EIO;
-			blk_mq_end_request(rq, rq->errors);
-		}
+	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 		return;
-	}
 
 	if (time_after_eq(jiffies, rq->deadline)) {
 		if (!blk_mark_rq_complete(rq))
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH 1/2] blk-mq: don't complete un-started request in timeout handler
From: Ming Lei @ 2017-03-15 12:40 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-kernel@vger.kernel.org, hch@infradead.org,
	linux-block@vger.kernel.org, axboe@fb.com, yizhan@redhat.com,
	stable@vger.kernel.org
In-Reply-To: <20170315121851.GA15807@ming.t460p>

On Wed, Mar 15, 2017 at 08:18:53PM +0800, Ming Lei wrote:
> On Wed, Mar 15, 2017 at 12:07:37AM +0000, Bart Van Assche wrote:
> 
> > or __blk_mq_requeue_request(). Another issue with this function is that the
> 
> __blk_mq_requeue_request() can be run from two pathes:
> 
> 	- dispatch failure, in which case the req/tag isn't released to tag set
> 	
> 	- IO completion path, in which COMPLETE flag is cleared before requeue.
> 	
> so I can't see races with timeout in case of start rq vs. requeue rq. 

Actually rq/tag won't be released to tag set if it will be requeued, so
the timeout race is nothing to do with requeue.

Thanks,
Ming

^ permalink raw reply

* Re: [PATCH 1/2] blk-mq: don't complete un-started request in timeout handler
From: Ming Lei @ 2017-03-15 12:18 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-kernel@vger.kernel.org, hch@infradead.org,
	linux-block@vger.kernel.org, axboe@fb.com, yizhan@redhat.com,
	stable@vger.kernel.org
In-Reply-To: <1489536441.2676.21.camel@sandisk.com>

On Wed, Mar 15, 2017 at 12:07:37AM +0000, Bart Van Assche wrote:
> On Thu, 2017-03-09 at 21:02 +0800, Ming Lei wrote:
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 159187a28d66..0aff380099d5 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -697,17 +697,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
> >  {
> >  	struct blk_mq_timeout_data *data = priv;
> >  
> > -	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
> > -		/*
> > -		 * If a request wasn't started before the queue was
> > -		 * marked dying, kill it here or it'll go unnoticed.
> > -		 */
> > -		if (unlikely(blk_queue_dying(rq->q))) {
> > -			rq->errors = -EIO;
> > -			blk_mq_end_request(rq, rq->errors);
> > -		}
> > +	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
> >  		return;
> > -	}
> >  
> >  	if (time_after_eq(jiffies, rq->deadline)) {
> >  		if (!blk_mark_rq_complete(rq))
> 
> Both the old and the new check look racy to me. The REQ_ATOM_STARTED bit can
> be changed concurrently by blk_mq_start_request(), __blk_mq_finish_request()

blk_mq_start_request() and __blk_mq_finish_request() won't be run concurrently.

>From view of __blk_mq_finish_request():

	- if it is run from merge queue io path(blk_mq_merge_queue_io()),
	blk_mq_start_request() can't be run at all, and the COMPLETE flag
	is kept as previous value(zero)

	- if it is run from normal complete path, COMPLETE flag is cleared
	before the req/tag is released to tag set.

so there isn't race in blk_mq_start_request() vs. __blk_mq_finish_request()
wrt. timeout.

> or __blk_mq_requeue_request(). Another issue with this function is that the

__blk_mq_requeue_request() can be run from two pathes:

	- dispatch failure, in which case the req/tag isn't released to tag set
	
	- IO completion path, in which COMPLETE flag is cleared before requeue.
	
so I can't see races with timeout in case of start rq vs. requeue rq. 

> request passed to this function can be reinitialized concurrently. Sorry but

Yes, that is possible, but rq->atomic_flags is kept, and in that case
when we handle timeout, COMPLETE is cleared before releasing the rq/tag to
tag set via blk_mark_rq_complete(), so we won't complete that req twice.



Thanks,
Ming

^ permalink raw reply

* Re: [PATCH RFC 10/14] block, bfq: add Early Queue Merge (EQM)
From: Paolo Valente @ 2017-03-15 12:01 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tejun Heo, Fabio Checconi, Arianna Avanzini, linux-block,
	linux-kernel, ulf.hansson, linus.walleij, broonie,
	Mauro Andreolini
In-Reply-To: <d37a7169-f279-8244-19f8-440dc1490893@kernel.dk>


> Il giorno 07 mar 2017, alle ore 18:44, Jens Axboe <axboe@kernel.dk> ha =
scritto:
>=20
> On 03/04/2017 09:01 AM, Paolo Valente wrote:
>> @@ -560,6 +600,15 @@ struct bfq_data {
>> 	struct bfq_io_cq *bio_bic;
>> 	/* bfqq associated with the task issuing current bio for merging =
*/
>> 	struct bfq_queue *bio_bfqq;
>> +
>> +	/*
>> +	 * io context to put right after bfqd->lock is released. This
>> +	 * filed is used to perform put_io_context, when needed, to
>> +	 * after the scheduler lock has been released, and thus
>> +	 * prevent an ioc->lock from being possibly taken while the
>> +	 * scheduler lock is being held.
>> +	 */
>> +	struct io_context *ioc_to_put;
>> };
>=20
> The logic around this is nasty, effectively you end up having locking
> around sections of code instea of structures, which is never a good
> idea.
>=20
> The helper functions for unlocking and dropping the ioc add to the =
mess
> as well.
>=20

Hi Jens,
fortunately I seem to have found and fixed the bug causing the failure
your reported in one of your previous emails, so I've started addressing
the issue you raise here.  But your suggestion below raised doubts
that I was not able to solve.  So I'm bailing out and asking for help.

> Can't we simply pass back a pointer to an ioc to free? That should be
> possible, given that we must have grabbed the bfqd lock ourselves
> further up in the call chain. So we _know_ that we'll drop it later =
on.
> If that wasn't the case, the existing logic wouldn't work.
>=20

One of the two functions that discover that an ioc has to bee freed,
namely __bfq_bfqd_reset_in_service, is invoked at the end of several
relatively long chains of function invocations.  The heads of these
chains take and release the scheduler lock.  One example is:

bfq_dispatch_request -> __bfq_dispatch_request -> bfq_select_queue -> =
bfq_bfqq_expire  -> __bfq_bfqq_expire -> __bfq_bfqd_reset_in_service

To implement your proposal, all the functions involved in these chains
should be extended to pass back the ioc to put.  The resulting, heavy
version of the code seems really unadvisable, and prone to errors when
one modifies or adds some chain.

So I have certainly misunderstood something.  As usual, to help you
help me more quickly, here is a summary of what I have understood on
this matter.

1.  For similar, if not exactly the same, lock-nesting issue related
to io-context putting, deferred work is used.  Probably deferred work
is used also for other reasons, but for sure it does solve this issue =
too.

2.  My solution (which I'm not defending; I'm just trying to
understand) solves the same issue as above: put the io
context after the other lock is released.  But it solves it with no
work-queueing overhead.  Instead of queueing work, it 'queues' the ioc
to put, and puts it right after releasing the scheduler lock.

Where is my mistake?  And what is the correct interpretation of your
proposal to pass back the pointer (instead of storing it in a field of
the device data structure)?

Thanks,
Paolo

> --=20
> Jens Axboe
>=20

^ permalink raw reply

* Re: [PATCH 1/2] blk-mq: don't complete un-started request in timeout handler
From: Bart Van Assche @ 2017-03-15  0:07 UTC (permalink / raw)
  To: linux-kernel@vger.kernel.org, hch@infradead.org,
	linux-block@vger.kernel.org, tom.leiming@gmail.com, axboe@fb.com
  Cc: yizhan@redhat.com, stable@vger.kernel.org
In-Reply-To: <1489064578-17305-3-git-send-email-tom.leiming@gmail.com>

On Thu, 2017-03-09 at 21:02 +0800, Ming Lei wrote:
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 159187a28d66..0aff380099d5 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -697,17 +697,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ct=
x *hctx,
>  {
>  	struct blk_mq_timeout_data *data =3D priv;
> =20
> -	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
> -		/*
> -		 * If a request wasn't started before the queue was
> -		 * marked dying, kill it here or it'll go unnoticed.
> -		 */
> -		if (unlikely(blk_queue_dying(rq->q))) {
> -			rq->errors =3D -EIO;
> -			blk_mq_end_request(rq, rq->errors);
> -		}
> +	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
>  		return;
> -	}
> =20
>  	if (time_after_eq(jiffies, rq->deadline)) {
>  		if (!blk_mark_rq_complete(rq))

Both the old and the new check look racy to me. The REQ_ATOM_STARTED bit ca=
n
be changed concurrently by blk_mq_start_request(), __blk_mq_finish_request(=
)
or __blk_mq_requeue_request(). Another issue with this function is that the
request passed to this function can be reinitialized concurrently. Sorry bu=
t
I'm not sure what the best way is to address these issues.

Bart.=

^ permalink raw reply

* Re: [PATCH rfc 06/10] IB/cq: Don't force IB_POLL_DIRECT poll context for ib_process_cq_direct
From: Bart Van Assche @ 2017-03-14 21:32 UTC (permalink / raw)
  To: linux-nvme@lists.infradead.org, linux-rdma@vger.kernel.org,
	linux-block@vger.kernel.org, target-devel@vger.kernel.org,
	sagi@grimberg.me
In-Reply-To: <f4ade02b-9d17-20ae-f910-47c718f2a5bd@grimberg.me>

On Mon, 2017-03-13 at 10:24 +0200, Sagi Grimberg wrote:
> > Before this patch
> > the completions from each CQ were processed sequentially. That's a big
> > change so I think this should be mentioned clearly in the header above
> > ib_process_cq_direct().
>=20
> Note that I now see that the cq lock is not sufficient for mutual
> exclusion here because we're referencing cq->wc array outside of it.
>=20
> There are three options I see here:
> 1. we'll need to allocate a different wc array for polling mode,
> perhaps a smaller one?
> 2. Export __ib_process_cq (in some form) with an option to pass in a wc
> array.
> 3. Simply not support non-selective polling but it seems like a shame...
>=20
> Any thoughts?

I doubt it is possible to come up with an algorithm that recognizes whether
or not two different ib_process_cq() calls are serialized. So the
ib_process_cq() caller will have to provide that information. How about add=
ing
an ib_wc array argument to ib_process_cq() and modifying __ib_process_cq()
such that it uses that array if specified and cq->wc otherwise?

Bart.=

^ permalink raw reply

* Re: [PATCH rfc 04/10] block: Add a non-selective polling interface
From: Bart Van Assche @ 2017-03-14 21:21 UTC (permalink / raw)
  To: linux-nvme@lists.infradead.org, linux-rdma@vger.kernel.org,
	linux-block@vger.kernel.org, target-devel@vger.kernel.org,
	sagi@grimberg.me
In-Reply-To: <b8124df3-bd09-2eb2-9899-3c9195605510@grimberg.me>

On Mon, 2017-03-13 at 10:15 +0200, Sagi Grimberg wrote:
> > Additionally, I think making the hardware context an argument of this f=
unction
> > instead of using blk_mq_map_queue(q, smp_processor_id()) would make thi=
s
> > function much more versatile.
>=20
> What do you mean? remember that the callers interface to the device is
> a request queue, it doesn't even know if its a blk-mq device. Can you
> explain in more details what you would like to see?

Have you considered to make the CPU ID an argument instead of using the
smp_processor_id() result?

Bart.=

^ permalink raw reply

* Re: [PATCH 0/4] block: callback-based statistics
From: Omar Sandoval @ 2017-03-14 21:07 UTC (permalink / raw)
  To: linux-block; +Cc: kernel-team
In-Reply-To: <cover.1489524591.git.osandov@fb.com>

On Tue, Mar 14, 2017 at 02:03:27PM -0700, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> This patchset generalizes the blk-stats infrastructure to allow users to
> register a callback to be called at a given time with the statistics of
> requests completed during that window. Writeback throttling and hybrid
> polling are converted to the new infrastructure. The ultimate goal is to
> use this infrastructure for the mq I/O scheduler.
> 
> The details are in patch 4, which is the actual conversion. Patches 1-3
> are preparation cleanups.

FYI, the diff of block/blk-stat.[ch] is a pain to read. It's easier to
just apply the patches and look at the whole files.

^ permalink raw reply

* [PATCH 4/4] blk-stats: convert to callback-based statistics reporting
From: Omar Sandoval @ 2017-03-14 21:03 UTC (permalink / raw)
  To: linux-block; +Cc: kernel-team
In-Reply-To: <cover.1489524591.git.osandov@fb.com>

From: Omar Sandoval <osandov@fb.com>

Currently, statistics are gathered in ~0.13s windows, and users grab the
statistics whenever they need them. This is not ideal for both in-tree
users:

1. Writeback throttling wants its own dynamically sized window of
   statistics. Since the blk-stats statistics are reset after every
   window and the wbt windows don't line up with the blk-stats windows,
   wbt doesn't see every I/O.
2. Polling currently grabs the statistics on every I/O. Again, depending
   on how the window lines up, we may miss some I/Os. It's also
   unnecessary overhead to get the statistics on every I/O; the hybrid
   polling heuristic would be just as happy with the statistics from the
   previous full window.

This reworks the blk-stats infrastructure to be callback-based: users
register a callback that they want called at a given time with all of
the statistics from the window during which the callback was active. The
callbacks are kept on an RCU list, and each callback has percpu stats
buffers. There will only be a few users, so the overhead on the I/O
completion side is low. The stats flushing is also simplified
considerably: since the timer function is responsible for clearing the
statistics, we don't have to worry about stale statistics.

wbt is a trivial conversion. After the conversion, the windowing problem
mentioned above is fixed.

For polling, we register an extra callback that caches the previous
window's statistics in the struct request_queue for the hybrid polling
heuristic to use.

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 block/blk-core.c          |   6 +-
 block/blk-mq.c            |  77 ++++++++----
 block/blk-mq.h            |   1 -
 block/blk-stat.c          | 300 +++++++++++++++++++++-------------------------
 block/blk-stat.h          |  98 ++++++++++++---
 block/blk-sysfs.c         |   5 +
 block/blk-wbt.c           |  48 +++-----
 block/blk-wbt.h           |   2 +-
 include/linux/blk_types.h |   3 -
 include/linux/blkdev.h    |  10 +-
 10 files changed, 307 insertions(+), 243 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 82425017c9b8..0f4ae118e220 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -852,6 +852,10 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
 
 int blk_init_allocated_queue(struct request_queue *q)
 {
+	q->stats = blk_alloc_queue_stats();
+	if (!q->stats)
+		return -ENOMEM;
+
 	q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size);
 	if (!q->fq)
 		return -ENOMEM;
@@ -2692,7 +2696,7 @@ void blk_finish_request(struct request *req, int error)
 	struct request_queue *q = req->q;
 
 	if (req->rq_flags & RQF_STATS)
-		blk_stat_add(&q->rq_stats[rq_data_dir(req)], req);
+		blk_stat_add(req);
 
 	if (req->rq_flags & RQF_QUEUED)
 		blk_queue_end_tag(q, req);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e16f8d420683..559f9b0f24a1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -39,6 +39,10 @@
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
 
+static void blk_mq_poll_stats_start(struct request_queue *q);
+static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb,
+				 struct blk_stats *stats);
+
 /*
  * Check if any of the ctx's have pending work in this hardware queue
  */
@@ -432,15 +436,8 @@ static void blk_mq_ipi_complete_request(struct request *rq)
 static void blk_mq_stat_add(struct request *rq)
 {
 	if (rq->rq_flags & RQF_STATS) {
-		/*
-		 * We could rq->mq_ctx here, but there's less of a risk
-		 * of races if we have the completion event add the stats
-		 * to the local software queue.
-		 */
-		struct blk_mq_ctx *ctx;
-
-		ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id());
-		blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq);
+		blk_mq_poll_stats_start(rq->q);
+		blk_stat_add(rq);
 	}
 }
 
@@ -2039,8 +2036,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 		spin_lock_init(&__ctx->lock);
 		INIT_LIST_HEAD(&__ctx->rq_list);
 		__ctx->queue = q;
-		blk_stat_init(&__ctx->stat[BLK_STAT_READ]);
-		blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]);
 
 		/* If the cpu isn't online, the cpu is mapped to first hctx */
 		if (!cpu_online(i))
@@ -2338,6 +2333,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
 
+	q->stats = blk_alloc_queue_stats();
+	if (!q->stats)
+		goto err_exit;
+
+	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, q);
+	if (!q->poll_cb)
+		goto err_exit;
+
 	q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
 	if (!q->queue_ctx)
 		goto err_exit;
@@ -2739,28 +2742,54 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
+/* Enable polling stats and return whether they were already enabled. */
+static bool blk_poll_stats_enable(struct request_queue *q)
+{
+	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
+	    test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
+		return true;
+	blk_stat_add_callback(q, q->poll_cb);
+	return false;
+}
+
+static void blk_mq_poll_stats_start(struct request_queue *q)
+{
+	/*
+	 * We don't arm the callback if polling stats are not enabled or the
+	 * callback has already been armed.
+	 */
+	if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
+	    timer_pending(&q->poll_cb->timer))
+		return;
+
+	blk_stat_arm_callback(q->poll_cb, jiffies + msecs_to_jiffies(100));
+}
+
+static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb,
+				 struct blk_stats *stats)
+{
+	struct request_queue *q = cb->data;
+
+	if (stats->read.nr_samples)
+		q->poll_stats.read = stats->read;
+	if (stats->write.nr_samples)
+		q->poll_stats.write = stats->write;
+}
+
 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
 				       struct blk_mq_hw_ctx *hctx,
 				       struct request *rq)
 {
-	struct blk_stats stats;
 	unsigned long ret = 0;
 
 	/*
 	 * If stats collection isn't on, don't sleep but turn it on for
 	 * future users
 	 */
-	if (!blk_stat_enable(q))
+	if (!blk_poll_stats_enable(q))
 		return 0;
 
 	/*
-	 * We don't have to do this once per IO, should optimize this
-	 * to just use the current window of stats until it changes
-	 */
-	memset(&stats, 0, sizeof(stats));
-	blk_hctx_stat_get(hctx, &stats);
-
-	/*
 	 * As an optimistic guess, use half of the mean service time
 	 * for this type of request. We can (and should) make this smarter.
 	 * For instance, if the completion latencies are tight, we can
@@ -2768,10 +2797,10 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
 	 * important on devices where the completion latencies are longer
 	 * than ~10 usec.
 	 */
-	if (req_op(rq) == REQ_OP_READ && stats.read.nr_samples)
-		ret = (stats.read.mean + 1) / 2;
-	else if (req_op(rq) == REQ_OP_WRITE && stats.write.nr_samples)
-		ret = (stats.write.mean + 1) / 2;
+	if (req_op(rq) == REQ_OP_READ && q->poll_stats.read.nr_samples)
+		ret = (q->poll_stats.read.mean + 1) / 2;
+	else if (req_op(rq) == REQ_OP_WRITE && q->poll_stats.write.nr_samples)
+		ret = (q->poll_stats.write.mean + 1) / 2;
 
 	return ret;
 }
diff --git a/block/blk-mq.h b/block/blk-mq.h
index b79f9a7d8cf6..8d49c06fc520 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -20,7 +20,6 @@ struct blk_mq_ctx {
 
 	/* incremented at completion time */
 	unsigned long		____cacheline_aligned_in_smp rq_completed[2];
-	struct blk_rq_stat	stat[2];
 
 	struct request_queue	*queue;
 	struct kobject		kobj;
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 40f6c90e432a..a71cfb35712d 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -4,11 +4,24 @@
  * Copyright (C) 2016 Jens Axboe
  */
 #include <linux/kernel.h>
+#include <linux/rculist.h>
 #include <linux/blk-mq.h>
 
 #include "blk-stat.h"
 #include "blk-mq.h"
 
+struct blk_queue_stats {
+	struct list_head callbacks;
+	spinlock_t lock;
+};
+
+static void blk_stat_init(struct blk_rq_stat *stat)
+{
+	stat->min = -1ULL;
+	stat->max = stat->nr_samples = stat->mean = 0;
+	stat->batch = stat->nr_batch = 0;
+}
+
 static void blk_stat_flush_batch(struct blk_rq_stat *stat)
 {
 	const s32 nr_batch = READ_ONCE(stat->nr_batch);
@@ -48,209 +61,164 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
 	dst->nr_samples += src->nr_samples;
 }
 
-static void blk_mq_stat_get(struct request_queue *q, struct blk_stats *stats)
+static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
 {
-	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
-	uint64_t latest = 0;
-	int i, j, nr;
-
-	blk_stat_init(&stats->read);
-	blk_stat_init(&stats->write);
-
-	nr = 0;
-	do {
-		uint64_t newest = 0;
-
-		queue_for_each_hw_ctx(q, hctx, i) {
-			hctx_for_each_ctx(hctx, ctx, j) {
-				blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]);
-				blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
-
-				if (!ctx->stat[BLK_STAT_READ].nr_samples &&
-				    !ctx->stat[BLK_STAT_WRITE].nr_samples)
-					continue;
-				if (ctx->stat[BLK_STAT_READ].time > newest)
-					newest = ctx->stat[BLK_STAT_READ].time;
-				if (ctx->stat[BLK_STAT_WRITE].time > newest)
-					newest = ctx->stat[BLK_STAT_WRITE].time;
-			}
-		}
+	stat->min = min(stat->min, value);
+	stat->max = max(stat->max, value);
 
-		/*
-		 * No samples
-		 */
-		if (!newest)
-			break;
-
-		if (newest > latest)
-			latest = newest;
-
-		queue_for_each_hw_ctx(q, hctx, i) {
-			hctx_for_each_ctx(hctx, ctx, j) {
-				if (ctx->stat[BLK_STAT_READ].time == newest) {
-					blk_stat_sum(&stats->read,
-						     &ctx->stat[BLK_STAT_READ]);
-					nr++;
-				}
-				if (ctx->stat[BLK_STAT_WRITE].time == newest) {
-					blk_stat_sum(&stats->write,
-						     &ctx->stat[BLK_STAT_WRITE]);
-					nr++;
-				}
-			}
-		}
-		/*
-		 * If we race on finding an entry, just loop back again.
-		 * Should be very rare.
-		 */
-	} while (!nr);
+	if (stat->batch + value < stat->batch ||
+	    stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
+		blk_stat_flush_batch(stat);
 
-	stats->read.time = stats->write.time = latest;
+	stat->batch += value;
+	stat->nr_batch++;
 }
 
-void blk_queue_stat_get(struct request_queue *q, struct blk_stats *stats)
+static struct blk_rq_stat *blk_stat_ddir(struct blk_stats __percpu *stats,
+					 int ddir)
 {
-	if (q->mq_ops)
-		blk_mq_stat_get(q, stats);
-	else {
-		blk_stat_flush_batch(&q->rq_stats[BLK_STAT_READ]);
-		blk_stat_flush_batch(&q->rq_stats[BLK_STAT_WRITE]);
-		memcpy(&stats->read, &q->rq_stats[BLK_STAT_READ],
-		       sizeof(struct blk_rq_stat));
-		memcpy(&stats->write, &q->rq_stats[BLK_STAT_WRITE],
-		       sizeof(struct blk_rq_stat));
-	}
+	if (ddir == READ)
+		return &this_cpu_ptr(stats)->read;
+	else
+		return &this_cpu_ptr(stats)->write;
 }
 
-void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_stats *stats)
+void blk_stat_add(struct request *rq)
 {
-	struct blk_mq_ctx *ctx;
-	unsigned int i, nr;
-
-	nr = 0;
-	do {
-		uint64_t newest = 0;
-
-		hctx_for_each_ctx(hctx, ctx, i) {
-			blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]);
-			blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
+	struct request_queue *q = rq->q;
+	struct blk_stat_callback *cb;
+	struct blk_rq_stat *stat;
+	int ddir = rq_data_dir(rq);
+	s64 now, value;
 
-			if (!ctx->stat[BLK_STAT_READ].nr_samples &&
-			    !ctx->stat[BLK_STAT_WRITE].nr_samples)
-				continue;
+	now = __blk_stat_time(ktime_to_ns(ktime_get()));
+	if (now < blk_stat_time(&rq->issue_stat))
+		return;
 
-			if (ctx->stat[BLK_STAT_READ].time > newest)
-				newest = ctx->stat[BLK_STAT_READ].time;
-			if (ctx->stat[BLK_STAT_WRITE].time > newest)
-				newest = ctx->stat[BLK_STAT_WRITE].time;
-		}
+	value = now - blk_stat_time(&rq->issue_stat);
 
-		if (!newest)
-			break;
-
-		hctx_for_each_ctx(hctx, ctx, i) {
-			if (ctx->stat[BLK_STAT_READ].time == newest) {
-				blk_stat_sum(&stats->read,
-					     &ctx->stat[BLK_STAT_READ]);
-				nr++;
-			}
-			if (ctx->stat[BLK_STAT_WRITE].time == newest) {
-				blk_stat_sum(&stats->write,
-					     &ctx->stat[BLK_STAT_WRITE]);
-				nr++;
-			}
+	rcu_read_lock();
+	list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
+		if (timer_pending(&cb->timer)) {
+			stat = blk_stat_ddir(cb->stats, ddir);
+			__blk_stat_add(stat, value);
 		}
-		/*
-		 * If we race on finding an entry, just loop back again.
-		 * Should be very rare, as the window is only updated
-		 * occasionally
-		 */
-	} while (!nr);
+	}
+	rcu_read_unlock();
 }
 
-static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
+static void blk_stat_timer_fn(unsigned long data)
 {
-	stat->min = -1ULL;
-	stat->max = stat->nr_samples = stat->mean = 0;
-	stat->batch = stat->nr_batch = 0;
-	stat->time = time_now & BLK_STAT_NSEC_MASK;
-}
+	struct blk_stat_callback *cb = (void *)data;
+	struct blk_stats stats;
+	int i;
 
-void blk_stat_init(struct blk_rq_stat *stat)
-{
-	__blk_stat_init(stat, ktime_to_ns(ktime_get()));
-}
+	blk_stat_init(&stats.read);
+	blk_stat_init(&stats.write);
 
-static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now)
-{
-	return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK);
+	for_each_online_cpu(i) {
+		struct blk_stats *cpu_stats;
+
+		cpu_stats = per_cpu_ptr(cb->stats, i);
+		blk_stat_sum(&stats.read, &cpu_stats->read);
+		blk_stat_sum(&stats.write, &cpu_stats->write);
+		blk_stat_init(&cpu_stats->read);
+		blk_stat_init(&cpu_stats->write);
+	}
+
+	cb->fn(cb, &stats);
 }
 
-bool blk_stat_is_current(struct blk_rq_stat *stat)
+struct blk_stat_callback *
+blk_stat_alloc_callback(void (*fn)(struct blk_stat_callback *, struct blk_stats *),
+			void *data)
 {
-	return __blk_stat_is_current(stat, ktime_to_ns(ktime_get()));
+	struct blk_stat_callback *cb;
+
+	cb = kmalloc(sizeof(*cb), GFP_KERNEL);
+	if (!cb)
+		return NULL;
+
+	cb->stats = alloc_percpu(struct blk_stats);
+	if (!cb->stats) {
+		kfree(cb);
+		return NULL;
+	}
+
+	cb->fn = fn;
+	cb->data = data;
+	setup_timer(&cb->timer, blk_stat_timer_fn, (unsigned long)cb);
+
+	return cb;
 }
+EXPORT_SYMBOL_GPL(blk_stat_alloc_callback);
 
-void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
+void blk_stat_add_callback(struct request_queue *q,
+			   struct blk_stat_callback *cb)
 {
-	s64 now, value;
+	int i;
 
-	now = __blk_stat_time(ktime_to_ns(ktime_get()));
-	if (now < blk_stat_time(&rq->issue_stat))
-		return;
+	for_each_possible_cpu(i) {
+		struct blk_stats *stats = per_cpu_ptr(cb->stats, i);
 
-	if (!__blk_stat_is_current(stat, now))
-		__blk_stat_init(stat, now);
+		blk_stat_init(&stats->read);
+		blk_stat_init(&stats->write);
+	}
 
-	value = now - blk_stat_time(&rq->issue_stat);
-	if (value > stat->max)
-		stat->max = value;
-	if (value < stat->min)
-		stat->min = value;
+	spin_lock(&q->stats->lock);
+	list_add_tail_rcu(&cb->list, &q->stats->callbacks);
+	set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+	spin_unlock(&q->stats->lock);
+}
+EXPORT_SYMBOL_GPL(blk_stat_add_callback);
 
-	if (stat->batch + value < stat->batch ||
-	    stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
-		blk_stat_flush_batch(stat);
+void blk_stat_remove_callback(struct request_queue *q,
+			      struct blk_stat_callback *cb)
+{
+	spin_lock(&q->stats->lock);
+	list_del_rcu(&cb->list);
+	if (list_empty(&q->stats->callbacks))
+		clear_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+	spin_unlock(&q->stats->lock);
 
-	stat->batch += value;
-	stat->nr_batch++;
+	del_timer_sync(&cb->timer);
 }
+EXPORT_SYMBOL_GPL(blk_stat_remove_callback);
 
-void blk_stat_clear(struct request_queue *q)
+static void blk_stat_free_callback_rcu(struct rcu_head *head)
 {
-	if (q->mq_ops) {
-		struct blk_mq_hw_ctx *hctx;
-		struct blk_mq_ctx *ctx;
-		int i, j;
-
-		queue_for_each_hw_ctx(q, hctx, i) {
-			hctx_for_each_ctx(hctx, ctx, j) {
-				blk_stat_init(&ctx->stat[BLK_STAT_READ]);
-				blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
-			}
-		}
-	} else {
-		blk_stat_init(&q->rq_stats[BLK_STAT_READ]);
-		blk_stat_init(&q->rq_stats[BLK_STAT_WRITE]);
-	}
+	struct blk_stat_callback *cb;
+
+	cb = container_of(head, struct blk_stat_callback, rcu);
+	free_percpu(cb->stats);
 }
 
-void blk_stat_set_issue_time(struct blk_issue_stat *stat)
+void blk_stat_free_callback(struct blk_stat_callback *cb)
 {
-	stat->time = (stat->time & BLK_STAT_MASK) |
-			(ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK);
+	call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
 }
+EXPORT_SYMBOL_GPL(blk_stat_free_callback);
 
-/*
- * Enable stat tracking, return whether it was enabled
- */
-bool blk_stat_enable(struct request_queue *q)
+struct blk_queue_stats *blk_alloc_queue_stats(void)
 {
-	if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
-		set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
-		return false;
-	}
+	struct blk_queue_stats *stats;
+
+	stats = kmalloc(sizeof(*stats), GFP_KERNEL);
+	if (!stats)
+		return NULL;
+
+	INIT_LIST_HEAD(&stats->callbacks);
+	spin_lock_init(&stats->lock);
+
+	return stats;
+}
+
+void blk_free_queue_stats(struct blk_queue_stats *stats)
+{
+	if (!stats)
+		return;
+
+	WARN_ON(!list_empty(&stats->callbacks));
 
-	return true;
+	kfree(stats);
 }
diff --git a/block/blk-stat.h b/block/blk-stat.h
index a24439aab710..fd6ba9183b17 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -1,11 +1,11 @@
 #ifndef BLK_STAT_H
 #define BLK_STAT_H
 
-/*
- * ~0.13s window as a power-of-2 (2^27 nsecs)
- */
-#define BLK_STAT_NSEC		134217728ULL
-#define BLK_STAT_NSEC_MASK	~(BLK_STAT_NSEC - 1)
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/ktime.h>
+#include <linux/rcupdate.h>
+#include <linux/timer.h>
 
 /*
  * Upper 3 bits can be used elsewhere
@@ -15,19 +15,27 @@
 #define BLK_STAT_TIME_MASK	((1ULL << BLK_STAT_SHIFT) - 1)
 #define BLK_STAT_MASK		~BLK_STAT_TIME_MASK
 
-enum {
-	BLK_STAT_READ	= 0,
-	BLK_STAT_WRITE,
+#define BLK_RQ_STAT_BATCH	64
+
+struct blk_stat_callback {
+	struct list_head list;
+	struct timer_list timer;
+	struct blk_stats __percpu *stats;
+	void (*fn)(struct blk_stat_callback *, struct blk_stats *);
+	void *data;
+	struct rcu_head rcu;
 };
 
-void blk_stat_add(struct blk_rq_stat *, struct request *);
-void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_stats *);
-void blk_queue_stat_get(struct request_queue *, struct blk_stats *);
-void blk_stat_clear(struct request_queue *);
-void blk_stat_init(struct blk_rq_stat *);
-bool blk_stat_is_current(struct blk_rq_stat *);
-void blk_stat_set_issue_time(struct blk_issue_stat *);
-bool blk_stat_enable(struct request_queue *);
+struct blk_queue_stats *blk_alloc_queue_stats(void);
+void blk_free_queue_stats(struct blk_queue_stats *);
+
+void blk_stat_add(struct request *);
+
+static inline void blk_stat_set_issue_time(struct blk_issue_stat *stat)
+{
+	stat->time = ((stat->time & BLK_STAT_MASK) |
+		      (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK));
+}
 
 static inline u64 __blk_stat_time(u64 time)
 {
@@ -39,4 +47,62 @@ static inline u64 blk_stat_time(struct blk_issue_stat *stat)
 	return __blk_stat_time(stat->time);
 }
 
+/**
+ * blk_stat_alloc_callback() - Allocate a block statistics callback.
+ * @fn: Callback function.
+ * @data: Value for the @data field of the &struct blk_stat_callback.
+ *
+ * Return: &struct blk_stat_callback on success or NULL on ENOMEM.
+ */
+struct blk_stat_callback *
+blk_stat_alloc_callback(void (*fn)(struct blk_stat_callback *, struct blk_stats *),
+			void *data);
+
+/**
+ * blk_stat_add_callback() - Add a block statistics callback to be run on a
+ * request queue.
+ * @q: The request queue.
+ * @cb: The callback.
+ *
+ * Note that a single &struct blk_stat_callback can only be added to a single
+ * &struct request_queue.
+ */
+void blk_stat_add_callback(struct request_queue *q,
+			   struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_remove_callback() - Remove a block statistics callback from a
+ * request queue.
+ * @q: The request queue.
+ * @cb: The callback.
+ *
+ * When this returns, the callback is not running on any CPUs and will not be
+ * called again unless readded.
+ */
+void blk_stat_remove_callback(struct request_queue *q,
+			      struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_free_callback() - Free a block statistics callback.
+ * @cb: The callback.
+ *
+ * @cb may be NULL, in which case this does nothing. If it is not NULL, @cb must
+ * not be associated with a request queue. I.e., if it was previously added with
+ * blk_stat_add_callback(), it must also have been removed since then with
+ * blk_stat_remove_callback().
+ */
+void blk_stat_free_callback(struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_arm_callback() - Set a block statistics callback to fire at a given
+ * time.
+ * @cb: The callback.
+ * @expires: The time (in jiffies) at which to run.
+ */
+static inline void blk_stat_arm_callback(struct blk_stat_callback *cb,
+					 unsigned long expires)
+{
+	mod_timer(&cb->timer, expires);
+}
+
 #endif
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 6e1cf622af21..fa831cb2fc30 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -785,6 +785,9 @@ static void blk_release_queue(struct kobject *kobj)
 		container_of(kobj, struct request_queue, kobj);
 
 	wbt_exit(q);
+	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
+		blk_stat_remove_callback(q, q->poll_cb);
+	blk_stat_free_callback(q->poll_cb);
 	bdi_put(q->backing_dev_info);
 	blkcg_exit_queue(q);
 
@@ -793,6 +796,8 @@ static void blk_release_queue(struct kobject *kobj)
 		elevator_exit(q->elevator);
 	}
 
+	blk_free_queue_stats(q->stats);
+
 	blk_exit_rl(&q->root_rl);
 
 	if (q->queue_tags)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5e81068f7c52..695bb92be82b 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -277,7 +277,7 @@ enum {
 	LAT_EXCEEDED,
 };
 
-static int __latency_exceeded(struct rq_wb *rwb, struct blk_stats *stats)
+static int latency_exceeded(struct rq_wb *rwb, struct blk_stats *stats)
 {
 	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
 	u64 thislat;
@@ -308,8 +308,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_stats *stats)
 		 * waited or still has writes in flights, consider us doing
 		 * just writes as well.
 		 */
-		if ((stats->write.nr_samples && blk_stat_is_current(&stats->read)) ||
-		    wb_recent_wait(rwb) || wbt_inflight(rwb))
+		if (stats->write.nr_samples || wb_recent_wait(rwb) ||
+		    wbt_inflight(rwb))
 			return LAT_UNKNOWN_WRITES;
 		return LAT_UNKNOWN;
 	}
@@ -329,14 +329,6 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_stats *stats)
 	return LAT_OK;
 }
 
-static int latency_exceeded(struct rq_wb *rwb)
-{
-	struct blk_stats stats;
-
-	blk_queue_stat_get(rwb->queue, &stats);
-	return __latency_exceeded(rwb, &stats);
-}
-
 static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
 {
 	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
@@ -355,7 +347,6 @@ static void scale_up(struct rq_wb *rwb)
 
 	rwb->scale_step--;
 	rwb->unknown_cnt = 0;
-	blk_stat_clear(rwb->queue);
 
 	rwb->scaled_max = calc_wb_limits(rwb);
 
@@ -385,7 +376,6 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle)
 
 	rwb->scaled_max = false;
 	rwb->unknown_cnt = 0;
-	blk_stat_clear(rwb->queue);
 	calc_wb_limits(rwb);
 	rwb_trace_step(rwb, "step down");
 }
@@ -412,16 +402,16 @@ static void rwb_arm_timer(struct rq_wb *rwb)
 	}
 
 	expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
-	mod_timer(&rwb->window_timer, expires);
+	blk_stat_arm_callback(rwb->cb, expires);
 }
 
-static void wb_timer_fn(unsigned long data)
+static void wb_timer_fn(struct blk_stat_callback *cb, struct blk_stats *stats)
 {
-	struct rq_wb *rwb = (struct rq_wb *) data;
+	struct rq_wb *rwb = cb->data;
 	unsigned int inflight = wbt_inflight(rwb);
 	int status;
 
-	status = latency_exceeded(rwb);
+	status = latency_exceeded(rwb, stats);
 
 	trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
 			inflight);
@@ -614,7 +604,7 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
 
 	__wbt_wait(rwb, bio->bi_opf, lock);
 
-	if (!timer_pending(&rwb->window_timer))
+	if (!timer_pending(&rwb->cb->timer))
 		rwb_arm_timer(rwb);
 
 	if (current_is_kswapd())
@@ -675,7 +665,7 @@ void wbt_disable_default(struct request_queue *q)
 	struct rq_wb *rwb = q->rq_wb;
 
 	if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) {
-		del_timer_sync(&rwb->window_timer);
+		blk_stat_remove_callback(q, rwb->cb);
 		rwb->win_nsec = rwb->min_lat_nsec = 0;
 		wbt_update_limits(rwb);
 	}
@@ -699,24 +689,23 @@ int wbt_init(struct request_queue *q)
 	struct rq_wb *rwb;
 	int i;
 
-	/*
-	 * For now, we depend on the stats window being larger than
-	 * our monitoring window. Ensure that this isn't inadvertently
-	 * violated.
-	 */
-	BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC);
 	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
 
 	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
 	if (!rwb)
 		return -ENOMEM;
 
+	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, rwb);
+	if (!rwb->cb) {
+		kfree(rwb);
+		return -ENOMEM;
+	}
+
 	for (i = 0; i < WBT_NUM_RWQ; i++) {
 		atomic_set(&rwb->rq_wait[i].inflight, 0);
 		init_waitqueue_head(&rwb->rq_wait[i].wait);
 	}
 
-	setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
 	rwb->wc = 1;
 	rwb->queue_depth = RWB_DEF_DEPTH;
 	rwb->last_comp = rwb->last_issue = jiffies;
@@ -726,10 +715,10 @@ int wbt_init(struct request_queue *q)
 	wbt_update_limits(rwb);
 
 	/*
-	 * Assign rwb, and turn on stats tracking for this queue
+	 * Assign rwb and add the stats callback.
 	 */
 	q->rq_wb = rwb;
-	blk_stat_enable(q);
+	blk_stat_add_callback(q, rwb->cb);
 
 	rwb->min_lat_nsec = wbt_default_latency_nsec(q);
 
@@ -744,7 +733,8 @@ void wbt_exit(struct request_queue *q)
 	struct rq_wb *rwb = q->rq_wb;
 
 	if (rwb) {
-		del_timer_sync(&rwb->window_timer);
+		blk_stat_remove_callback(q, rwb->cb);
+		blk_stat_free_callback(rwb->cb);
 		q->rq_wb = NULL;
 		kfree(rwb);
 	}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 65f1de519f67..591ff2f4b2ee 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -81,7 +81,7 @@ struct rq_wb {
 	u64 win_nsec;				/* default window size */
 	u64 cur_win_nsec;			/* current window size */
 
-	struct timer_list window_timer;
+	struct blk_stat_callback *cb;
 
 	s64 sync_issue;
 	void *sync_cookie;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 5e4c5380edf0..14789bdb3338 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -287,8 +287,6 @@ struct blk_issue_stat {
 	u64 time;
 };
 
-#define BLK_RQ_STAT_BATCH	64
-
 struct blk_rq_stat {
 	s64 mean;
 	u64 min;
@@ -296,7 +294,6 @@ struct blk_rq_stat {
 	s32 nr_samples;
 	s32 nr_batch;
 	u64 batch;
-	s64 time;
 };
 
 struct blk_stats {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5a7da607ca04..a093449adbb8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -40,6 +40,8 @@ struct blkcg_gq;
 struct blk_flush_queue;
 struct pr_ops;
 struct rq_wb;
+struct blk_queue_stats;
+struct blk_stat_callback;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -388,6 +390,7 @@ struct request_queue {
 	int			nr_rqs[2];	/* # allocated [a]sync rqs */
 	int			nr_rqs_elvpriv;	/* # allocated rqs w/ elvpriv */
 
+	struct blk_queue_stats	*stats;
 	struct rq_wb		*rq_wb;
 
 	/*
@@ -505,8 +508,6 @@ struct request_queue {
 	unsigned int		nr_sorted;
 	unsigned int		in_flight[2];
 
-	struct blk_rq_stat	rq_stats[2];
-
 	/*
 	 * Number of active block driver functions for which blk_drain_queue()
 	 * must wait. Must be incremented around functions that unlock the
@@ -516,6 +517,10 @@ struct request_queue {
 
 	unsigned int		rq_timeout;
 	int			poll_nsec;
+
+	struct blk_stat_callback	*poll_cb;
+	struct blk_stats	poll_stats;
+
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
 	struct list_head	timeout_list;
@@ -611,6 +616,7 @@ struct request_queue {
 #define QUEUE_FLAG_DAX         26	/* device supports DAX */
 #define QUEUE_FLAG_STATS       27	/* track rq completion times */
 #define QUEUE_FLAG_RESTART     28	/* queue needs restart at completion */
+#define QUEUE_FLAG_POLL_STATS  29	/* collecting stats for hybrid polling */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
-- 
2.12.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox