* [PATCHv2] sbitmap: fix batched wait_cnt accounting
@ 2022-08-24 20:14 Keith Busch
2022-08-25 6:09 ` Hannes Reinecke
2022-08-25 11:48 ` Pankaj Raghav
0 siblings, 2 replies; 7+ messages in thread
From: Keith Busch @ 2022-08-24 20:14 UTC (permalink / raw)
To: axboe, linux-block; +Cc: Keith Busch
From: Keith Busch <kbusch@kernel.org>
Batched completions can clear multiple bits, but we're only decrementing
the wait_cnt by one each time. This can cause waiters to never be woken,
stalling IO. Use the batched count instead.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=215679
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
v1->v2:
Use atomic_try_cmpxchg instead of direct subtraction to prevent a race
between two batched completions.
Remove 'wait_cnt < 0' condition since it's not possible anymore.
Repeat wake up while waiters exist and more bits are unaccounted.
block/blk-mq-tag.c | 2 +-
include/linux/sbitmap.h | 3 ++-
lib/sbitmap.c | 32 ++++++++++++++++++--------------
3 files changed, 21 insertions(+), 16 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 8e3b36d1cb57..9eb968e14d31 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -196,7 +196,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
* other allocations on previous queue won't be starved.
*/
if (bt != bt_prev)
- sbitmap_queue_wake_up(bt_prev);
+ sbitmap_queue_wake_up(bt_prev, 1);
ws = bt_wait_ptr(bt, data->hctx);
} while (1);
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 8f5a86e210b9..4d2d5205ab58 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -575,8 +575,9 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq);
* sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue
* on a &struct sbitmap_queue.
* @sbq: Bitmap queue to wake up.
+ * @nr: Number of bits cleared.
*/
-void sbitmap_queue_wake_up(struct sbitmap_queue *sbq);
+void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr);
/**
* sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 1f31147872e6..6fd3bc22c2e7 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -600,28 +600,33 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
return NULL;
}
-static bool __sbq_wake_up(struct sbitmap_queue *sbq)
+static bool __sbq_wake_up(struct sbitmap_queue *sbq, int *nr)
{
struct sbq_wait_state *ws;
- unsigned int wake_batch;
- int wait_cnt;
+ int wake_batch, wait_cnt, sub, cur;
ws = sbq_wake_ptr(sbq);
if (!ws)
return false;
- wait_cnt = atomic_dec_return(&ws->wait_cnt);
+ wake_batch = READ_ONCE(sbq->wake_batch);
+ do {
+ cur = atomic_read(&ws->wait_cnt);
+ sub = min3(wake_batch, *nr, cur);
+ wait_cnt = cur - sub;
+ } while (!atomic_try_cmpxchg(&ws->wait_cnt, &cur, wait_cnt));
+
+ *nr -= sub;
+
/*
* For concurrent callers of this, callers should call this function
* again to wakeup a new batch on a different 'ws'.
*/
- if (wait_cnt < 0 || !waitqueue_active(&ws->wait))
+ if (!waitqueue_active(&ws->wait))
return true;
if (wait_cnt > 0)
- return false;
-
- wake_batch = READ_ONCE(sbq->wake_batch);
+ return *nr > 0;
/*
* Wake up first in case that concurrent callers decrease wait_cnt
@@ -649,15 +654,14 @@ static bool __sbq_wake_up(struct sbitmap_queue *sbq)
sbq_index_atomic_inc(&sbq->wake_index);
atomic_set(&ws->wait_cnt, wake_batch);
- return false;
+ return *nr > 0;
}
-void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
+void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr)
{
- while (__sbq_wake_up(sbq))
+ while (__sbq_wake_up(sbq, &nr))
;
}
-EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag)
{
@@ -694,7 +698,7 @@ void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
atomic_long_andnot(mask, (atomic_long_t *) addr);
smp_mb__after_atomic();
- sbitmap_queue_wake_up(sbq);
+ sbitmap_queue_wake_up(sbq, nr_tags);
sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
tags[nr_tags - 1] - offset);
}
@@ -722,7 +726,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
* waiter. See the comment on waitqueue_active().
*/
smp_mb__after_atomic();
- sbitmap_queue_wake_up(sbq);
+ sbitmap_queue_wake_up(sbq, 1);
sbitmap_update_cpu_hint(&sbq->sb, cpu, nr);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
--
2.30.2
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCHv2] sbitmap: fix batched wait_cnt accounting
2022-08-24 20:14 [PATCHv2] sbitmap: fix batched wait_cnt accounting Keith Busch
@ 2022-08-25 6:09 ` Hannes Reinecke
2022-08-25 12:59 ` Keith Busch
2022-08-25 11:48 ` Pankaj Raghav
1 sibling, 1 reply; 7+ messages in thread
From: Hannes Reinecke @ 2022-08-25 6:09 UTC (permalink / raw)
To: Keith Busch, axboe, linux-block; +Cc: Keith Busch
On 8/24/22 22:14, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> Batched completions can clear multiple bits, but we're only decrementing
> the wait_cnt by one each time. This can cause waiters to never be woken,
> stalling IO. Use the batched count instead.
>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=215679
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
> v1->v2:
>
> Use atomic_try_cmpxchg instead of direct subtraction to prevent a race
> between two batched completions.
>
> Remove 'wait_cnt < 0' condition since it's not possible anymore.
>
> Repeat wake up while waiters exist and more bits are unaccounted.
>
> block/blk-mq-tag.c | 2 +-
> include/linux/sbitmap.h | 3 ++-
> lib/sbitmap.c | 32 ++++++++++++++++++--------------
> 3 files changed, 21 insertions(+), 16 deletions(-)
>
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 8e3b36d1cb57..9eb968e14d31 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -196,7 +196,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
> * other allocations on previous queue won't be starved.
> */
> if (bt != bt_prev)
> - sbitmap_queue_wake_up(bt_prev);
> + sbitmap_queue_wake_up(bt_prev, 1);
>
> ws = bt_wait_ptr(bt, data->hctx);
> } while (1);
> diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
> index 8f5a86e210b9..4d2d5205ab58 100644
> --- a/include/linux/sbitmap.h
> +++ b/include/linux/sbitmap.h
> @@ -575,8 +575,9 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq);
> * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue
> * on a &struct sbitmap_queue.
> * @sbq: Bitmap queue to wake up.
> + * @nr: Number of bits cleared.
> */
> -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq);
> +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr);
>
> /**
> * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct
> diff --git a/lib/sbitmap.c b/lib/sbitmap.c
> index 1f31147872e6..6fd3bc22c2e7 100644
> --- a/lib/sbitmap.c
> +++ b/lib/sbitmap.c
> @@ -600,28 +600,33 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
> return NULL;
> }
>
> -static bool __sbq_wake_up(struct sbitmap_queue *sbq)
> +static bool __sbq_wake_up(struct sbitmap_queue *sbq, int *nr)
Why is '*nr' a pointer? It's always used as a value, so what does
promoting it to a pointer buy you?
> {
> struct sbq_wait_state *ws;
> - unsigned int wake_batch;
> - int wait_cnt;
> + int wake_batch, wait_cnt, sub, cur;
>
> ws = sbq_wake_ptr(sbq);
> if (!ws)
> return false;
>
> - wait_cnt = atomic_dec_return(&ws->wait_cnt);
> + wake_batch = READ_ONCE(sbq->wake_batch);
> + do {
> + cur = atomic_read(&ws->wait_cnt);
> + sub = min3(wake_batch, *nr, cur);
> + wait_cnt = cur - sub;
> + } while (!atomic_try_cmpxchg(&ws->wait_cnt, &cur, wait_cnt));
> +
> + *nr -= sub;
> +
> /*
> * For concurrent callers of this, callers should call this function
> * again to wakeup a new batch on a different 'ws'.
> */
> - if (wait_cnt < 0 || !waitqueue_active(&ws->wait))
> + if (!waitqueue_active(&ws->wait))
> return true;
>
> if (wait_cnt > 0)
> - return false;
> -
> - wake_batch = READ_ONCE(sbq->wake_batch);
> + return *nr > 0;
>
> /*
> * Wake up first in case that concurrent callers decrease wait_cnt
> @@ -649,15 +654,14 @@ static bool __sbq_wake_up(struct sbitmap_queue *sbq)
> sbq_index_atomic_inc(&sbq->wake_index);
> atomic_set(&ws->wait_cnt, wake_batch);
>
> - return false;
> + return *nr > 0;
> }
>
> -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
> +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr)
> {
> - while (__sbq_wake_up(sbq))
> + while (__sbq_wake_up(sbq, &nr))
> ;
> }
> -EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
>
> static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag)
> {
> @@ -694,7 +698,7 @@ void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
> atomic_long_andnot(mask, (atomic_long_t *) addr);
>
> smp_mb__after_atomic();
> - sbitmap_queue_wake_up(sbq);
> + sbitmap_queue_wake_up(sbq, nr_tags);
> sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
> tags[nr_tags - 1] - offset);
> }
> @@ -722,7 +726,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
> * waiter. See the comment on waitqueue_active().
> */
> smp_mb__after_atomic();
> - sbitmap_queue_wake_up(sbq);
> + sbitmap_queue_wake_up(sbq, 1);
> sbitmap_update_cpu_hint(&sbq->sb, cpu, nr);
> }
> EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Ivo Totev, Andrew
Myers, Andrew McDonald, Martje Boudien Moerman
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCHv2] sbitmap: fix batched wait_cnt accounting
2022-08-24 20:14 [PATCHv2] sbitmap: fix batched wait_cnt accounting Keith Busch
2022-08-25 6:09 ` Hannes Reinecke
@ 2022-08-25 11:48 ` Pankaj Raghav
2022-08-25 11:55 ` Pankaj Raghav
1 sibling, 1 reply; 7+ messages in thread
From: Pankaj Raghav @ 2022-08-25 11:48 UTC (permalink / raw)
To: Keith Busch; +Cc: axboe, linux-block, Keith Busch, Pankaj Raghav
On Wed, Aug 24, 2022 at 01:14:40PM -0700, Keith Busch wrote:
>
> -static bool __sbq_wake_up(struct sbitmap_queue *sbq)
> +static bool __sbq_wake_up(struct sbitmap_queue *sbq, int *nr)
> {
> struct sbq_wait_state *ws;
> - unsigned int wake_batch;
> - int wait_cnt;
> + int wake_batch, wait_cnt, sub, cur;
>
> ws = sbq_wake_ptr(sbq);
> if (!ws)
> return false;
>
> - wait_cnt = atomic_dec_return(&ws->wait_cnt);
> + wake_batch = READ_ONCE(sbq->wake_batch);
> + do {
> + cur = atomic_read(&ws->wait_cnt);
I think the above statement is not needed if we use atomic_try_cmpxchg
as the old value is updated in that function itself.
https://docs.kernel.org/staging/index.html?highlight=atomic_try_cmpxchg#atomic-types
> + sub = min3(wake_batch, *nr, cur);
> + wait_cnt = cur - sub;
> + } while (!atomic_try_cmpxchg(&ws->wait_cnt, &cur, wait_cnt));
> +
> + *nr -= sub;
> +
> /*
--
Pankaj Raghav
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCHv2] sbitmap: fix batched wait_cnt accounting
2022-08-25 11:48 ` Pankaj Raghav
@ 2022-08-25 11:55 ` Pankaj Raghav
2022-08-25 13:05 ` Keith Busch
0 siblings, 1 reply; 7+ messages in thread
From: Pankaj Raghav @ 2022-08-25 11:55 UTC (permalink / raw)
To: Keith Busch; +Cc: axboe, linux-block, Keith Busch, Pankaj Raghav
On Thu, Aug 25, 2022 at 01:48:43PM +0200, Pankaj Raghav wrote:
> On Wed, Aug 24, 2022 at 01:14:40PM -0700, Keith Busch wrote:
> >
> > -static bool __sbq_wake_up(struct sbitmap_queue *sbq)
> > +static bool __sbq_wake_up(struct sbitmap_queue *sbq, int *nr)
> > {
> > struct sbq_wait_state *ws;
> > - unsigned int wake_batch;
> > - int wait_cnt;
> > + int wake_batch, wait_cnt, sub, cur;
> >
> > ws = sbq_wake_ptr(sbq);
> > if (!ws)
> > return false;
> >
> > - wait_cnt = atomic_dec_return(&ws->wait_cnt);
> > + wake_batch = READ_ONCE(sbq->wake_batch);
> > + do {
> > + cur = atomic_read(&ws->wait_cnt);
>
> I think the above statement is not needed if we use atomic_try_cmpxchg
> as the old value is updated in that function itself.
> https://docs.kernel.org/staging/index.html?highlight=atomic_try_cmpxchg#atomic-types
I mean after moving the cur = atomic_read(..) above the do statement:
wake_batch = READ_ONCE(sbq->wake_batch);
cur = atomic_read(&ws->wait_cnt);
do {
sub = min3(wake_batch, *nr, cur);
wait_cnt = cur - sub;
} while (!atomic_try_cmpxchg(&ws->wait_cnt, &cur, wait_cnt));
--
Pankaj Raghav
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCHv2] sbitmap: fix batched wait_cnt accounting
2022-08-25 6:09 ` Hannes Reinecke
@ 2022-08-25 12:59 ` Keith Busch
2022-08-25 13:29 ` Keith Busch
0 siblings, 1 reply; 7+ messages in thread
From: Keith Busch @ 2022-08-25 12:59 UTC (permalink / raw)
To: Hannes Reinecke; +Cc: Keith Busch, axboe, linux-block
On Thu, Aug 25, 2022 at 08:09:50AM +0200, Hannes Reinecke wrote:
> > +static bool __sbq_wake_up(struct sbitmap_queue *sbq, int *nr)
>
> Why is '*nr' a pointer? It's always used as a value, so what does promoting
> it to a pointer buy you?
Not quite, see below:
> > {
> > struct sbq_wait_state *ws;
> > - unsigned int wake_batch;
> > - int wait_cnt;
> > + int wake_batch, wait_cnt, sub, cur;
> > ws = sbq_wake_ptr(sbq);
> > if (!ws)
> > return false;
> > - wait_cnt = atomic_dec_return(&ws->wait_cnt);
> > + wake_batch = READ_ONCE(sbq->wake_batch);
> > + do {
> > + cur = atomic_read(&ws->wait_cnt);
> > + sub = min3(wake_batch, *nr, cur);
> > + wait_cnt = cur - sub;
> > + } while (!atomic_try_cmpxchg(&ws->wait_cnt, &cur, wait_cnt));
> > +
> > + *nr -= sub;
Dereferenced here to account for how many bits were considered for this wake
cycle.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCHv2] sbitmap: fix batched wait_cnt accounting
2022-08-25 11:55 ` Pankaj Raghav
@ 2022-08-25 13:05 ` Keith Busch
0 siblings, 0 replies; 7+ messages in thread
From: Keith Busch @ 2022-08-25 13:05 UTC (permalink / raw)
To: Pankaj Raghav; +Cc: Keith Busch, axboe, linux-block, Pankaj Raghav
On Thu, Aug 25, 2022 at 01:55:06PM +0200, Pankaj Raghav wrote:
> On Thu, Aug 25, 2022 at 01:48:43PM +0200, Pankaj Raghav wrote:
> > On Wed, Aug 24, 2022 at 01:14:40PM -0700, Keith Busch wrote:
> > >
> > > -static bool __sbq_wake_up(struct sbitmap_queue *sbq)
> > > +static bool __sbq_wake_up(struct sbitmap_queue *sbq, int *nr)
> > > {
> > > struct sbq_wait_state *ws;
> > > - unsigned int wake_batch;
> > > - int wait_cnt;
> > > + int wake_batch, wait_cnt, sub, cur;
> > >
> > > ws = sbq_wake_ptr(sbq);
> > > if (!ws)
> > > return false;
> > >
> > > - wait_cnt = atomic_dec_return(&ws->wait_cnt);
> > > + wake_batch = READ_ONCE(sbq->wake_batch);
> > > + do {
> > > + cur = atomic_read(&ws->wait_cnt);
> >
> > I think the above statement is not needed if we use atomic_try_cmpxchg
> > as the old value is updated in that function itself.
> > https://docs.kernel.org/staging/index.html?highlight=atomic_try_cmpxchg#atomic-types
>
> I mean after moving the cur = atomic_read(..) above the do statement:
>
> wake_batch = READ_ONCE(sbq->wake_batch);
> cur = atomic_read(&ws->wait_cnt);
> do {
> sub = min3(wake_batch, *nr, cur);
> wait_cnt = cur - sub;
> } while (!atomic_try_cmpxchg(&ws->wait_cnt, &cur, wait_cnt));
Yeah, that's a good change. I'll fix it up.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCHv2] sbitmap: fix batched wait_cnt accounting
2022-08-25 12:59 ` Keith Busch
@ 2022-08-25 13:29 ` Keith Busch
0 siblings, 0 replies; 7+ messages in thread
From: Keith Busch @ 2022-08-25 13:29 UTC (permalink / raw)
To: Hannes Reinecke; +Cc: Keith Busch, axboe, linux-block
On Thu, Aug 25, 2022 at 06:59:00AM -0600, Keith Busch wrote:
> On Thu, Aug 25, 2022 at 08:09:50AM +0200, Hannes Reinecke wrote:
> > > +static bool __sbq_wake_up(struct sbitmap_queue *sbq, int *nr)
> >
> > Why is '*nr' a pointer? It's always used as a value, so what does promoting
> > it to a pointer buy you?
>
> Not quite, see below:
>
> > > {
> > > struct sbq_wait_state *ws;
> > > - unsigned int wake_batch;
> > > - int wait_cnt;
> > > + int wake_batch, wait_cnt, sub, cur;
> > > ws = sbq_wake_ptr(sbq);
> > > if (!ws)
> > > return false;
> > > - wait_cnt = atomic_dec_return(&ws->wait_cnt);
> > > + wake_batch = READ_ONCE(sbq->wake_batch);
> > > + do {
> > > + cur = atomic_read(&ws->wait_cnt);
> > > + sub = min3(wake_batch, *nr, cur);
> > > + wait_cnt = cur - sub;
> > > + } while (!atomic_try_cmpxchg(&ws->wait_cnt, &cur, wait_cnt));
> > > +
> > > + *nr -= sub;
>
> Dereferenced here to account for how many bits were considered for this wake
> cycle.
Actually, I think we can just get rid of this accounting. Instead of calling
wake_up_nr() with wake_batch multiple times, we can call wake_up_nr() with
max(wake_batch, nr) just once, then all bits are accounted for in a single go.
The 'wake_batch' value is really just the lowest amount of tags to accumulate
before waking tag waiters; having more should be fine.
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2022-08-25 13:29 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-08-24 20:14 [PATCHv2] sbitmap: fix batched wait_cnt accounting Keith Busch
2022-08-25 6:09 ` Hannes Reinecke
2022-08-25 12:59 ` Keith Busch
2022-08-25 13:29 ` Keith Busch
2022-08-25 11:48 ` Pankaj Raghav
2022-08-25 11:55 ` Pankaj Raghav
2022-08-25 13:05 ` Keith Busch
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox