linux-block.vger.kernel.org archive mirror
* [PATCH 2/2] sbitmap: optimize wakeup check
  2018-11-30  2:09 [PATCHSET v3] sbitmap optimizations Jens Axboe
@ 2018-11-30  2:09 ` Jens Axboe
  0 siblings, 0 replies; 8+ messages in thread
From: Jens Axboe @ 2018-11-30  2:09 UTC (permalink / raw)
  To: linux-block, osandov; +Cc: Jens Axboe

Even if we have no waiters on any of the sbitmap_queue wait states, we
still have to loop over every entry to check. We do this for every IO, so
the cost adds up.

Shift a bit of the cost to the slow path, when we actually have waiters.
Wrap prepare_to_wait_exclusive() and finish_wait(), so we can maintain
an internal count of how many waiters are currently active. Then we can simply
check this count in sbq_wake_ptr() and not have to loop if we don't
have any sleepers.

Convert the two users of sbitmap with waiting, blk-mq-tag and iSCSI.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
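Not part of the patch proper: a standalone sketch of the idea in plain C11,
with hypothetical names, to show why the per-IO cost disappears. The wakeup
side only has to read one atomic counter before deciding whether the wait
states need scanning at all:

	#include <stdatomic.h>
	#include <stdbool.h>

	/* simplified stand-in for struct sbitmap_queue */
	struct sbq {
		atomic_int ws_active;	/* waiters currently queued */
	};

	/* hot path, run for every IO completion */
	static bool sbq_has_sleepers(struct sbq *sbq)
	{
		/* no sleepers: skip scanning the wait states entirely */
		return atomic_load(&sbq->ws_active) != 0;
	}

	/* slow path, only entered when a task must sleep for a free bit */
	static void sbq_prepare(struct sbq *sbq)
	{
		atomic_fetch_add(&sbq->ws_active, 1);
		/* ... then prepare_to_wait_exclusive() on a waitqueue ... */
	}

	static void sbq_finish(struct sbq *sbq)
	{
		/* ... finish_wait() on the waitqueue first ... */
		atomic_fetch_sub(&sbq->ws_active, 1);
	}
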
 block/blk-mq-tag.c                       | 13 ++++++-------
 drivers/target/iscsi/iscsi_target_util.c |  9 ++++++---
 include/linux/sbitmap.h                  | 19 +++++++++++++++++++
 lib/sbitmap.c                            | 21 +++++++++++++++++++++
 4 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 87bc5df72d48..4704046ce493 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -154,12 +154,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != -1)
 			break;
 
-		prepare_to_wait_exclusive(&ws->wait, &wait,
-						TASK_UNINTERRUPTIBLE);
+		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
 
 		tag = __blk_mq_get_tag(data, bt);
-		if (tag != -1)
+		if (tag != -1) {
+			sbitmap_finish_wait(bt, ws, &wait);
 			break;
+		}
 
 		if (data->ctx)
 			blk_mq_put_ctx(data->ctx);
@@ -167,6 +168,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		bt_prev = bt;
 		io_schedule();
 
+		sbitmap_finish_wait(bt, ws, &wait);
+
 		data->ctx = blk_mq_get_ctx(data->q);
 		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
 						data->ctx->cpu);
@@ -176,8 +179,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		else
 			bt = &tags->bitmap_tags;
 
-		finish_wait(&ws->wait, &wait);
-
 		/*
 		 * If destination hw queue is changed, fake wake up on
 		 * previous queue for compensating the wake up miss, so
@@ -192,8 +193,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	if (drop_ctx && data->ctx)
 		blk_mq_put_ctx(data->ctx);
 
-	finish_wait(&ws->wait, &wait);
-
 found_tag:
 	return tag + tag_offset;
 }
diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index 36b742932c72..37db1f80a219 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -152,22 +152,25 @@ static int iscsit_wait_for_tag(struct se_session *se_sess, int state, int *cpup)
 	int tag = -1;
 	DEFINE_WAIT(wait);
 	struct sbq_wait_state *ws;
+	struct sbitmap_queue *sbq;
 
 	if (state == TASK_RUNNING)
 		return tag;
 
-	ws = &se_sess->sess_tag_pool.ws[0];
+	sbq = &se_sess->sess_tag_pool;
+	ws = &sbq->ws[0];
 	for (;;) {
-		prepare_to_wait_exclusive(&ws->wait, &wait, state);
+		sbitmap_prepare_to_wait(sbq, ws, &wait, state);
 		if (signal_pending_state(state, current))
 			break;
 		tag = sbitmap_queue_get(&se_sess->sess_tag_pool, cpup);
 		if (tag >= 0)
 			break;
 		schedule();
+		sbitmap_finish_wait(sbq, ws, &wait);
 	}
+	sbitmap_finish_wait(sbq, ws, &wait);
 
-	finish_wait(&ws->wait, &wait);
 	return tag;
 }
 
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index cec685b89998..a65d5b67a611 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -130,6 +130,11 @@ struct sbitmap_queue {
 	 */
 	struct sbq_wait_state *ws;
 
+	/*
+	 * @ws_active: count of currently active ws waitqueues
+	 */
+	atomic_t ws_active;
+
 	/**
 	 * @round_robin: Allocate bits in strict round-robin order.
 	 */
@@ -549,4 +554,18 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq);
  */
 void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m);
 
+/*
+ * Wrapper around prepare_to_wait_exclusive(), which maintains some extra
+ * internal state.
+ */
+void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
+				struct sbq_wait_state *ws,
+				struct wait_queue_entry *wait, int state);
+
+/*
+ * Must be paired with sbitmap_prepare_to_wait().
+ */
+void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
+				struct wait_queue_entry *wait);
+
 #endif /* __LINUX_SCALE_BITMAP_H */
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 2316f53f3e1d..ba51f538011a 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -381,6 +381,7 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
 	sbq->min_shallow_depth = UINT_MAX;
 	sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
 	atomic_set(&sbq->wake_index, 0);
+	atomic_set(&sbq->ws_active, 0);
 
 	sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
 	if (!sbq->ws) {
@@ -496,6 +497,9 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
 {
 	int i, wake_index;
 
+	if (!atomic_read(&sbq->ws_active))
+		return NULL;
+
 	wake_index = atomic_read(&sbq->wake_index);
 	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
 		struct sbq_wait_state *ws = &sbq->ws[wake_index];
@@ -636,3 +640,20 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
 	seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth);
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_show);
+
+void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
+			     struct sbq_wait_state *ws,
+			     struct wait_queue_entry *wait, int state)
+{
+	atomic_inc(&sbq->ws_active);
+	prepare_to_wait_exclusive(&ws->wait, wait, state);
+}
+EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait);
+
+void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
+			 struct wait_queue_entry *wait)
+{
+	finish_wait(&ws->wait, wait);
+	atomic_dec(&sbq->ws_active);
+}
+EXPORT_SYMBOL_GPL(sbitmap_finish_wait);
-- 
2.17.1



* [PATCHSET v4] sbitmap optimizations
@ 2018-11-30 16:01 Jens Axboe
  2018-11-30 16:01 ` [PATCH 1/2] sbitmap: amortize cost of clearing bits Jens Axboe
  2018-11-30 16:01 ` [PATCH 2/2] sbitmap: optimize wakeup check Jens Axboe
  0 siblings, 2 replies; 8+ messages in thread
From: Jens Axboe @ 2018-11-30 16:01 UTC (permalink / raw)
  To: linux-block, osandov

This version tests out solid, and we're still seeing the same
improvements.

Changes:

- Lock the map index for the move. This eliminates the race completely,
  since it's now not possible to find ->cleared == 0 while a swap of
  bits is in progress. The previous version was fine for users that
  re-check after having added themselves to the waitqueue, but we
  have users that don't re-check after a failed allocation. This works
  for both.

- Add the new states to the blk-mq debugfs output.

- Wrap the waitqueue in an sbitmap waitqueue, so we can ensure that
  we account it properly. This means that any kind of prep+finish
  on the waitqueue will work fine, just like before.

-- 
Jens Axboe




* [PATCH 1/2] sbitmap: amortize cost of clearing bits
  2018-11-30 16:01 [PATCHSET v4] sbitmap optimizations Jens Axboe
@ 2018-11-30 16:01 ` Jens Axboe
  2018-11-30 20:03   ` Omar Sandoval
  2018-11-30 16:01 ` [PATCH 2/2] sbitmap: optimize wakeup check Jens Axboe
  1 sibling, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2018-11-30 16:01 UTC (permalink / raw)
  To: linux-block, osandov; +Cc: Jens Axboe

sbitmap maintains a set of words that we use to set and clear bits, with
each bit representing a tag for blk-mq. Even though we spread the bits
out and maintain a hint cache, one particular bit allocated will end up
being cleared in the exact same spot.

This introduces batched clearing of bits. Instead of clearing a given
bit, the same bit is set in a cleared/free mask instead. If we fail
allocating a bit from a given word, then we check the free mask, and
batch move those cleared bits at that time. This trades 64 atomic bitops
for 2 cmpxchg().

In a threaded poll test case, half the overhead of getting and clearing
tags is removed with this change. On another poll test case with a
single thread, performance is unchanged.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
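A rough sketch of the batching in portable C11 (hypothetical names; the
actual patch additionally holds the per-word swap_lock across the swap, and
operates on plain unsigned longs with cmpxchg()):

	#include <stdatomic.h>
	#include <stdbool.h>

	struct sb_word {
		_Atomic unsigned long word;	/* 1 = bit currently allocated */
		_Atomic unsigned long cleared;	/* freed bits parked here */
	};

	/* free path: one atomic OR into ->cleared instead of clearing ->word */
	static void deferred_clear_bit(struct sb_word *w, unsigned int bit)
	{
		atomic_fetch_or(&w->cleared, 1UL << bit);
	}

	/* alloc path, word looks full: fold the parked bits back in two swaps */
	static bool deferred_clear(struct sb_word *w)
	{
		unsigned long mask, val;

		/* 1) snapshot ->cleared and zero it in one atomic step */
		do {
			mask = atomic_load(&w->cleared);
			if (!mask)
				return false;
		} while (!atomic_compare_exchange_weak(&w->cleared, &mask, 0));

		/* 2) drop those bits from ->word, making them allocatable again */
		val = atomic_load(&w->word);
		while (!atomic_compare_exchange_weak(&w->word, &val, val & ~mask))
			;
		return true;
	}
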
 include/linux/sbitmap.h | 31 +++++++++++++---
 lib/sbitmap.c           | 80 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 100 insertions(+), 11 deletions(-)

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 804a50983ec5..07f117ee19dc 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -30,14 +30,24 @@ struct seq_file;
  */
 struct sbitmap_word {
 	/**
-	 * @word: The bitmap word itself.
+	 * @depth: Number of bits being used in @word/@cleared
 	 */
-	unsigned long word;
+	unsigned long depth;
 
 	/**
-	 * @depth: Number of bits being used in @word.
+	 * @word: word holding free bits
 	 */
-	unsigned long depth;
+	unsigned long word ____cacheline_aligned_in_smp;
+
+	/**
+	 * @cleared: word holding cleared bits
+	 */
+	unsigned long cleared ____cacheline_aligned_in_smp;
+
+	/**
+	 * @swap_lock: Held while swapping word <-> cleared
+	 */
+	spinlock_t swap_lock;
 } ____cacheline_aligned_in_smp;
 
 /**
@@ -310,6 +320,19 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
 	clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
 }
 
+/*
+ * This one is special, since it doesn't actually clear the bit, rather it
+ * sets the corresponding bit in the ->cleared mask instead. Paired with
+ * the caller doing sbitmap_deferred_clear() if a given index is full, which
+ * will clear the previously freed entries in the corresponding ->word.
+ */
+static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
+{
+	unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;
+
+	set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
+}
+
 static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb,
 					    unsigned int bitnr)
 {
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 45cab6bbc1c7..f6a9553617bd 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -59,6 +59,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 	for (i = 0; i < sb->map_nr; i++) {
 		sb->map[i].depth = min(depth, bits_per_word);
 		depth -= sb->map[i].depth;
+		spin_lock_init(&sb->map[i].swap_lock);
 	}
 	return 0;
 }
@@ -111,6 +112,57 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
 	return nr;
 }
 
+/*
+ * See if we have deferred clears that we can batch move
+ */
+static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
+{
+	unsigned long mask, val;
+	bool ret = false;
+
+	spin_lock(&sb->map[index].swap_lock);
+
+	if (!sb->map[index].cleared)
+		goto out_unlock;
+
+	/*
+	 * First get a stable cleared mask, setting the old mask to 0.
+	 */
+	do {
+		mask = sb->map[index].cleared;
+	} while (cmpxchg(&sb->map[index].cleared, mask, 0) != mask);
+
+	/*
+	 * Now clear the masked bits in our free word
+	 */
+	do {
+		val = sb->map[index].word;
+	} while (cmpxchg(&sb->map[index].word, val, val & ~mask) != val);
+
+	ret = true;
+out_unlock:
+	spin_unlock(&sb->map[index].swap_lock);
+	return ret;
+}
+
+static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
+				     unsigned int alloc_hint, bool round_robin)
+{
+	int nr;
+
+	do {
+		nr = __sbitmap_get_word(&sb->map[index].word,
+					sb->map[index].depth, alloc_hint,
+					!round_robin);
+		if (nr != -1)
+			break;
+		if (!sbitmap_deferred_clear(sb, index))
+			break;
+	} while (1);
+
+	return nr;
+}
+
 int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 {
 	unsigned int i, index;
@@ -129,9 +181,8 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 		alloc_hint = 0;
 
 	for (i = 0; i < sb->map_nr; i++) {
-		nr = __sbitmap_get_word(&sb->map[index].word,
-					sb->map[index].depth, alloc_hint,
-					!round_robin);
+		nr = sbitmap_find_bit_in_index(sb, index, alloc_hint,
+						round_robin);
 		if (nr != -1) {
 			nr += index << sb->shift;
 			break;
@@ -206,23 +257,37 @@ bool sbitmap_any_bit_clear(const struct sbitmap *sb)
 }
 EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear);
 
-unsigned int sbitmap_weight(const struct sbitmap *sb)
+static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
 {
 	unsigned int i, weight = 0;
 
 	for (i = 0; i < sb->map_nr; i++) {
 		const struct sbitmap_word *word = &sb->map[i];
 
-		weight += bitmap_weight(&word->word, word->depth);
+		if (set)
+			weight += bitmap_weight(&word->word, word->depth);
+		else
+			weight += bitmap_weight(&word->cleared, word->depth);
 	}
 	return weight;
 }
+
+unsigned int sbitmap_weight(const struct sbitmap *sb)
+{
+	return __sbitmap_weight(sb, true);
+}
 EXPORT_SYMBOL_GPL(sbitmap_weight);
 
+static unsigned int sbitmap_cleared(const struct sbitmap *sb)
+{
+	return __sbitmap_weight(sb, false);
+}
+
 void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
 {
 	seq_printf(m, "depth=%u\n", sb->depth);
-	seq_printf(m, "busy=%u\n", sbitmap_weight(sb));
+	seq_printf(m, "busy=%u\n", sbitmap_weight(sb) - sbitmap_cleared(sb));
+	seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb));
 	seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
 	seq_printf(m, "map_nr=%u\n", sb->map_nr);
 }
@@ -514,7 +579,8 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu)
 {
-	sbitmap_clear_bit_unlock(&sbq->sb, nr);
+	sbitmap_deferred_clear_bit(&sbq->sb, nr);
+
 	/*
 	 * Pairs with the memory barrier in set_current_state() to ensure the
 	 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
-- 
2.17.1



* [PATCH 2/2] sbitmap: optimize wakeup check
  2018-11-30 16:01 [PATCHSET v4] sbitmap optimizations Jens Axboe
  2018-11-30 16:01 ` [PATCH 1/2] sbitmap: amortize cost of clearing bits Jens Axboe
@ 2018-11-30 16:01 ` Jens Axboe
  2018-11-30 21:37   ` Omar Sandoval
  1 sibling, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2018-11-30 16:01 UTC (permalink / raw)
  To: linux-block, osandov; +Cc: Jens Axboe

Even if we have no waiters on any of the sbitmap_queue wait states, we
still have to loop over every entry to check. We do this for every IO, so
the cost adds up.

Shift a bit of the cost to the slow path, when we actually have waiters.
Wrap prepare_to_wait_exclusive() and finish_wait(), so we can maintain
an internal count of how many waiters are currently active. Then we can simply
check this count in sbq_wake_ptr() and not have to loop if we don't
have any sleepers.

Convert the two users of sbitmap with waiting, blk-mq-tag and iSCSI.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
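The difference from v3 is worth spelling out: the waiter accounting now
lives in struct sbq_wait itself, so any pairing of prepare/finish calls
(including a finish with no preceding prepare, as can happen when
blk_mq_get_tag() breaks out of its loop early) keeps ws_active balanced.
A hedged sketch of just that bookkeeping, with simplified names:

	#include <stdatomic.h>

	struct sbq {
		atomic_int ws_active;
	};

	struct sbq_wait {
		int accounted;	/* has this waiter been counted yet? */
		/* the real struct embeds a wait_queue_entry here */
	};

	static void sbq_prepare(struct sbq *sbq, struct sbq_wait *w)
	{
		if (!w->accounted) {	/* count each waiter at most once */
			atomic_fetch_add(&sbq->ws_active, 1);
			w->accounted = 1;
		}
		/* ... prepare_to_wait_exclusive() on the chosen waitqueue ... */
	}

	static void sbq_finish(struct sbq *sbq, struct sbq_wait *w)
	{
		/* ... finish_wait() on the waitqueue first ... */
		if (w->accounted) {	/* safe even without a matching prepare */
			atomic_fetch_sub(&sbq->ws_active, 1);
			w->accounted = 0;
		}
	}
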
 block/blk-mq-tag.c                       | 11 ++++----
 drivers/target/iscsi/iscsi_target_util.c | 12 +++++----
 include/linux/sbitmap.h                  | 34 ++++++++++++++++++++++++
 lib/sbitmap.c                            | 28 +++++++++++++++++++
 4 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 87bc5df72d48..2089c6c62f44 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -110,7 +110,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct sbitmap_queue *bt;
 	struct sbq_wait_state *ws;
-	DEFINE_WAIT(wait);
+	DEFINE_SBQ_WAIT(wait);
 	unsigned int tag_offset;
 	bool drop_ctx;
 	int tag;
@@ -154,8 +154,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != -1)
 			break;
 
-		prepare_to_wait_exclusive(&ws->wait, &wait,
-						TASK_UNINTERRUPTIBLE);
+		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
 
 		tag = __blk_mq_get_tag(data, bt);
 		if (tag != -1)
@@ -167,6 +166,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		bt_prev = bt;
 		io_schedule();
 
+		sbitmap_finish_wait(bt, ws, &wait);
+
 		data->ctx = blk_mq_get_ctx(data->q);
 		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
 						data->ctx->cpu);
@@ -176,8 +177,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		else
 			bt = &tags->bitmap_tags;
 
-		finish_wait(&ws->wait, &wait);
-
 		/*
 		 * If destination hw queue is changed, fake wake up on
 		 * previous queue for compensating the wake up miss, so
@@ -192,7 +191,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	if (drop_ctx && data->ctx)
 		blk_mq_put_ctx(data->ctx);
 
-	finish_wait(&ws->wait, &wait);
+	sbitmap_finish_wait(bt, ws, &wait);
 
 found_tag:
 	return tag + tag_offset;
diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index 36b742932c72..86987da86dd6 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -150,24 +150,26 @@ void iscsit_free_r2ts_from_list(struct iscsi_cmd *cmd)
 static int iscsit_wait_for_tag(struct se_session *se_sess, int state, int *cpup)
 {
 	int tag = -1;
-	DEFINE_WAIT(wait);
+	DEFINE_SBQ_WAIT(wait);
 	struct sbq_wait_state *ws;
+	struct sbitmap_queue *sbq;
 
 	if (state == TASK_RUNNING)
 		return tag;
 
-	ws = &se_sess->sess_tag_pool.ws[0];
+	sbq = &se_sess->sess_tag_pool;
+	ws = &sbq->ws[0];
 	for (;;) {
-		prepare_to_wait_exclusive(&ws->wait, &wait, state);
+		sbitmap_prepare_to_wait(sbq, ws, &wait, state);
 		if (signal_pending_state(state, current))
 			break;
-		tag = sbitmap_queue_get(&se_sess->sess_tag_pool, cpup);
+		tag = sbitmap_queue_get(sbq, cpup);
 		if (tag >= 0)
 			break;
 		schedule();
 	}
 
-	finish_wait(&ws->wait, &wait);
+	sbitmap_finish_wait(sbq, ws, &wait);
 	return tag;
 }
 
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 07f117ee19dc..f0f49bbb2617 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -135,6 +135,11 @@ struct sbitmap_queue {
 	 */
 	struct sbq_wait_state *ws;
 
+	/*
+	 * @ws_active: count of currently active ws waitqueues
+	 */
+	atomic_t ws_active;
+
 	/**
 	 * @round_robin: Allocate bits in strict round-robin order.
 	 */
@@ -554,4 +559,33 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq);
  */
 void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m);
 
+struct sbq_wait {
+	int accounted;
+	struct wait_queue_entry wait;
+};
+
+#define DEFINE_SBQ_WAIT(name)							\
+	struct sbq_wait name = {						\
+		.accounted = 0,							\
+		.wait = {							\
+			.private	= current,				\
+			.func		= autoremove_wake_function,		\
+			.entry		= LIST_HEAD_INIT((name).wait.entry),	\
+		}								\
+	}
+
+/*
+ * Wrapper around prepare_to_wait_exclusive(), which maintains some extra
+ * internal state.
+ */
+void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
+				struct sbq_wait_state *ws,
+				struct sbq_wait *sbq_wait, int state);
+
+/*
+ * Must be paired with sbitmap_prepare_to_wait().
+ */
+void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
+				struct sbq_wait *sbq_wait);
+
 #endif /* __LINUX_SCALE_BITMAP_H */
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index f6a9553617bd..90a295cc782c 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -395,6 +395,7 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
 	sbq->min_shallow_depth = UINT_MAX;
 	sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
 	atomic_set(&sbq->wake_index, 0);
+	atomic_set(&sbq->ws_active, 0);
 
 	sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
 	if (!sbq->ws) {
@@ -510,6 +511,9 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
 {
 	int i, wake_index;
 
+	if (!atomic_read(&sbq->ws_active))
+		return NULL;
+
 	wake_index = atomic_read(&sbq->wake_index);
 	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
 		struct sbq_wait_state *ws = &sbq->ws[wake_index];
@@ -635,6 +639,7 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
 
 	seq_printf(m, "wake_batch=%u\n", sbq->wake_batch);
 	seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index));
+	seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active));
 
 	seq_puts(m, "ws={\n");
 	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
@@ -650,3 +655,26 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
 	seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth);
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_show);
+
+void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
+			     struct sbq_wait_state *ws,
+			     struct sbq_wait *sbq_wait, int state)
+{
+	if (!sbq_wait->accounted) {
+		atomic_inc(&sbq->ws_active);
+		sbq_wait->accounted = 1;
+	}
+	prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state);
+}
+EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait);
+
+void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
+			 struct sbq_wait *sbq_wait)
+{
+	finish_wait(&ws->wait, &sbq_wait->wait);
+	if (sbq_wait->accounted) {
+		atomic_dec(&sbq->ws_active);
+		sbq_wait->accounted = 0;
+	}
+}
+EXPORT_SYMBOL_GPL(sbitmap_finish_wait);
-- 
2.17.1



* Re: [PATCH 1/2] sbitmap: amortize cost of clearing bits
  2018-11-30 16:01 ` [PATCH 1/2] sbitmap: amortize cost of clearing bits Jens Axboe
@ 2018-11-30 20:03   ` Omar Sandoval
  2018-11-30 20:10     ` Jens Axboe
  0 siblings, 1 reply; 8+ messages in thread
From: Omar Sandoval @ 2018-11-30 20:03 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block

On Fri, Nov 30, 2018 at 09:01:17AM -0700, Jens Axboe wrote:
> sbitmap maintains a set of words that we use to set and clear bits, with
> each bit representing a tag for blk-mq. Even though we spread the bits
> out and maintain a hint cache, one particular bit allocated will end up
> being cleared in the exact same spot.
> 
> This introduces batched clearing of bits. Instead of clearing a given
> bit, the same bit is set in a cleared/free mask instead. If we fail
> allocating a bit from a given word, then we check the free mask, and
> batch move those cleared bits at that time. This trades 64 atomic bitops
> for 2 cmpxchg().
> 
> In a threaded poll test case, half the overhead of getting and clearing
> tags is removed with this change. On another poll test case with a
> single thread, performance is unchanged.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  include/linux/sbitmap.h | 31 +++++++++++++---
>  lib/sbitmap.c           | 80 +++++++++++++++++++++++++++++++++++++----
>  2 files changed, 100 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
> index 804a50983ec5..07f117ee19dc 100644
> --- a/include/linux/sbitmap.h
> +++ b/include/linux/sbitmap.h
> @@ -30,14 +30,24 @@ struct seq_file;
>   */
>  struct sbitmap_word {
>  	/**
> -	 * @word: The bitmap word itself.
> +	 * @depth: Number of bits being used in @word/@cleared
>  	 */
> -	unsigned long word;
> +	unsigned long depth;
>  
>  	/**
> -	 * @depth: Number of bits being used in @word.
> +	 * @word: word holding free bits
>  	 */
> -	unsigned long depth;
> +	unsigned long word ____cacheline_aligned_in_smp;

Still splitting up word and depth in separate cachelines?

> +	/**
> +	 * @cleared: word holding cleared bits
> +	 */
> +	unsigned long cleared ____cacheline_aligned_in_smp;
> +
> +	/**
> +	 * @swap_lock: Held while swapping word <-> cleared
> +	 */
> +	spinlock_t swap_lock;
>  } ____cacheline_aligned_in_smp;
>  
>  /**
> @@ -310,6 +320,19 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
>  	clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
>  }
>  
> +/*
> + * This one is special, since it doesn't actually clear the bit, rather it
> + * sets the corresponding bit in the ->cleared mask instead. Paired with
> + * the caller doing sbitmap_deferred_clear() if a given index is full, which
> + * will clear the previously freed entries in the corresponding ->word.
> + */
> +static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
> +{
> +	unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;
> +
> +	set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
> +}
> +
>  static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb,
>  					    unsigned int bitnr)
>  {
> diff --git a/lib/sbitmap.c b/lib/sbitmap.c
> index 45cab6bbc1c7..f6a9553617bd 100644
> --- a/lib/sbitmap.c
> +++ b/lib/sbitmap.c
> @@ -59,6 +59,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
>  	for (i = 0; i < sb->map_nr; i++) {
>  		sb->map[i].depth = min(depth, bits_per_word);
>  		depth -= sb->map[i].depth;
> +		spin_lock_init(&sb->map[i].swap_lock);
>  	}
>  	return 0;
>  }
> @@ -111,6 +112,57 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
>  	return nr;
>  }
>  
> +/*
> + * See if we have deferred clears that we can batch move
> + */
> +static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
> +{
> +	unsigned long mask, val;
> +	bool ret = false;
> +
> +	spin_lock(&sb->map[index].swap_lock);
> +
> +	if (!sb->map[index].cleared)
> +		goto out_unlock;
> +
> +	/*
> +	 * First get a stable cleared mask, setting the old mask to 0.
> +	 */
> +	do {
> +		mask = sb->map[index].cleared;
> +	} while (cmpxchg(&sb->map[index].cleared, mask, 0) != mask);
> +
> +	/*
> +	 * Now clear the masked bits in our free word
> +	 */
> +	do {
> +		val = sb->map[index].word;
> +	} while (cmpxchg(&sb->map[index].word, val, val & ~mask) != val);
> +
> +	ret = true;
> +out_unlock:
> +	spin_unlock(&sb->map[index].swap_lock);
> +	return ret;
> +}

Okay, I couldn't find any holes in this one :)

> +static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
> +				     unsigned int alloc_hint, bool round_robin)
> +{
> +	int nr;
> +
> +	do {
> +		nr = __sbitmap_get_word(&sb->map[index].word,
> +					sb->map[index].depth, alloc_hint,
> +					!round_robin);
> +		if (nr != -1)
> +			break;
> +		if (!sbitmap_deferred_clear(sb, index))
> +			break;
> +	} while (1);
> +
> +	return nr;
> +}
> +
>  int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
>  {
>  	unsigned int i, index;
> @@ -129,9 +181,8 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
>  		alloc_hint = 0;
>  
>  	for (i = 0; i < sb->map_nr; i++) {
> -		nr = __sbitmap_get_word(&sb->map[index].word,
> -					sb->map[index].depth, alloc_hint,
> -					!round_robin);
> +		nr = sbitmap_find_bit_in_index(sb, index, alloc_hint,
> +						round_robin);
>  		if (nr != -1) {
>  			nr += index << sb->shift;
>  			break;
> @@ -206,23 +257,37 @@ bool sbitmap_any_bit_clear(const struct sbitmap *sb)
>  }
>  EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear);
>  
> -unsigned int sbitmap_weight(const struct sbitmap *sb)
> +static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
>  {
>  	unsigned int i, weight = 0;
>  
>  	for (i = 0; i < sb->map_nr; i++) {
>  		const struct sbitmap_word *word = &sb->map[i];
>  
> -		weight += bitmap_weight(&word->word, word->depth);
> +		if (set)
> +			weight += bitmap_weight(&word->word, word->depth);

Should probably do
			weight -= bitmap_weight(&word->cleared, word->depth);

too, right?

> +		else
> +			weight += bitmap_weight(&word->cleared, word->depth);
>  	}
>  	return weight;
>  }
> +
> +unsigned int sbitmap_weight(const struct sbitmap *sb)
> +{
> +	return __sbitmap_weight(sb, true);
> +}
>  EXPORT_SYMBOL_GPL(sbitmap_weight);
>  
> +static unsigned int sbitmap_cleared(const struct sbitmap *sb)
> +{
> +	return __sbitmap_weight(sb, false);
> +}
> +
>  void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
>  {
>  	seq_printf(m, "depth=%u\n", sb->depth);
> -	seq_printf(m, "busy=%u\n", sbitmap_weight(sb));
> +	seq_printf(m, "busy=%u\n", sbitmap_weight(sb) - sbitmap_cleared(sb));
> +	seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb));
>  	seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
>  	seq_printf(m, "map_nr=%u\n", sb->map_nr);
>  }
> @@ -514,7 +579,8 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
>  void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
>  			 unsigned int cpu)
>  {
> -	sbitmap_clear_bit_unlock(&sbq->sb, nr);
> +	sbitmap_deferred_clear_bit(&sbq->sb, nr);
> +
>  	/*
>  	 * Pairs with the memory barrier in set_current_state() to ensure the
>  	 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
> -- 
> 2.17.1
> 


* Re: [PATCH 1/2] sbitmap: amortize cost of clearing bits
  2018-11-30 20:03   ` Omar Sandoval
@ 2018-11-30 20:10     ` Jens Axboe
  2018-11-30 21:41       ` Omar Sandoval
  0 siblings, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2018-11-30 20:10 UTC (permalink / raw)
  To: Omar Sandoval; +Cc: linux-block

On 11/30/18 1:03 PM, Omar Sandoval wrote:
> On Fri, Nov 30, 2018 at 09:01:17AM -0700, Jens Axboe wrote:
>> sbitmap maintains a set of words that we use to set and clear bits, with
>> each bit representing a tag for blk-mq. Even though we spread the bits
>> out and maintain a hint cache, one particular bit allocated will end up
>> being cleared in the exact same spot.
>>
>> This introduces batched clearing of bits. Instead of clearing a given
>> bit, the same bit is set in a cleared/free mask instead. If we fail
>> allocating a bit from a given word, then we check the free mask, and
>> batch move those cleared bits at that time. This trades 64 atomic bitops
>> for 2 cmpxchg().
>>
>> In a threaded poll test case, half the overhead of getting and clearing
>> tags is removed with this change. On another poll test case with a
>> single thread, performance is unchanged.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>  include/linux/sbitmap.h | 31 +++++++++++++---
>>  lib/sbitmap.c           | 80 +++++++++++++++++++++++++++++++++++++----
>>  2 files changed, 100 insertions(+), 11 deletions(-)
>>
>> diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
>> index 804a50983ec5..07f117ee19dc 100644
>> --- a/include/linux/sbitmap.h
>> +++ b/include/linux/sbitmap.h
>> @@ -30,14 +30,24 @@ struct seq_file;
>>   */
>>  struct sbitmap_word {
>>  	/**
>> -	 * @word: The bitmap word itself.
>> +	 * @depth: Number of bits being used in @word/@cleared
>>  	 */
>> -	unsigned long word;
>> +	unsigned long depth;
>>  
>>  	/**
>> -	 * @depth: Number of bits being used in @word.
>> +	 * @word: word holding free bits
>>  	 */
>> -	unsigned long depth;
>> +	unsigned long word ____cacheline_aligned_in_smp;
> 
> Still splitting up word and depth in separate cachelines?

Yeah, I mentioned that in one of the other postings; there's still a
definite win to doing that.

> Okay, I couldn't find any holes in this one :)

Good to hear that :-)

>> -unsigned int sbitmap_weight(const struct sbitmap *sb)
>> +static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
>>  {
>>  	unsigned int i, weight = 0;
>>  
>>  	for (i = 0; i < sb->map_nr; i++) {
>>  		const struct sbitmap_word *word = &sb->map[i];
>>  
>> -		weight += bitmap_weight(&word->word, word->depth);
>> +		if (set)
>> +			weight += bitmap_weight(&word->word, word->depth);
> 
> Should probably do
> 			weight -= bitmap_weight(&word->cleared, word->depth);
> 
> too, right?

We only use these for the debugfs stuff; how about I just make it static
instead?


-- 
Jens Axboe



* Re: [PATCH 2/2] sbitmap: optimize wakeup check
  2018-11-30 16:01 ` [PATCH 2/2] sbitmap: optimize wakeup check Jens Axboe
@ 2018-11-30 21:37   ` Omar Sandoval
  0 siblings, 0 replies; 8+ messages in thread
From: Omar Sandoval @ 2018-11-30 21:37 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block

On Fri, Nov 30, 2018 at 09:01:18AM -0700, Jens Axboe wrote:
> Even if we have no waiters on any of the sbitmap_queue wait states, we
> still have to loop over every entry to check. We do this for every IO, so
> the cost adds up.
> 
> Shift a bit of the cost to the slow path, when we actually have waiters.
> Wrap prepare_to_wait_exclusive() and finish_wait(), so we can maintain
> an internal count of how many waiters are currently active. Then we can simply
> check this count in sbq_wake_ptr() and not have to loop if we don't
> have any sleepers.
> 
> Convert the two users of sbitmap with waiting, blk-mq-tag and iSCSI.

Reviewed-by: Omar Sandoval <osandov@fb.com>

> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  block/blk-mq-tag.c                       | 11 ++++----
>  drivers/target/iscsi/iscsi_target_util.c | 12 +++++----
>  include/linux/sbitmap.h                  | 34 ++++++++++++++++++++++++
>  lib/sbitmap.c                            | 28 +++++++++++++++++++
>  4 files changed, 74 insertions(+), 11 deletions(-)


* Re: [PATCH 1/2] sbitmap: amortize cost of clearing bits
  2018-11-30 20:10     ` Jens Axboe
@ 2018-11-30 21:41       ` Omar Sandoval
  0 siblings, 0 replies; 8+ messages in thread
From: Omar Sandoval @ 2018-11-30 21:41 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block

On Fri, Nov 30, 2018 at 01:10:47PM -0700, Jens Axboe wrote:
> On 11/30/18 1:03 PM, Omar Sandoval wrote:
> > On Fri, Nov 30, 2018 at 09:01:17AM -0700, Jens Axboe wrote:
> >> sbitmap maintains a set of words that we use to set and clear bits, with
> >> each bit representing a tag for blk-mq. Even though we spread the bits
> >> out and maintain a hint cache, one particular bit allocated will end up
> >> being cleared in the exact same spot.
> >>
> >> This introduces batched clearing of bits. Instead of clearing a given
> >> bit, the same bit is set in a cleared/free mask instead. If we fail
> >> allocating a bit from a given word, then we check the free mask, and
> >> batch move those cleared bits at that time. This trades 64 atomic bitops
> >> for 2 cmpxchg().
> >>
> >> In a threaded poll test case, half the overhead of getting and clearing
> >> tags is removed with this change. On another poll test case with a
> >> single thread, performance is unchanged.
> >>
> >> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> >> ---
> >>  include/linux/sbitmap.h | 31 +++++++++++++---
> >>  lib/sbitmap.c           | 80 +++++++++++++++++++++++++++++++++++++----
> >>  2 files changed, 100 insertions(+), 11 deletions(-)
> >>
> >> diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
> >> index 804a50983ec5..07f117ee19dc 100644
> >> --- a/include/linux/sbitmap.h
> >> +++ b/include/linux/sbitmap.h
> >> @@ -30,14 +30,24 @@ struct seq_file;
> >>   */
> >>  struct sbitmap_word {
> >>  	/**
> >> -	 * @word: The bitmap word itself.
> >> +	 * @depth: Number of bits being used in @word/@cleared
> >>  	 */
> >> -	unsigned long word;
> >> +	unsigned long depth;
> >>  
> >>  	/**
> >> -	 * @depth: Number of bits being used in @word.
> >> +	 * @word: word holding free bits
> >>  	 */
> >> -	unsigned long depth;
> >> +	unsigned long word ____cacheline_aligned_in_smp;
> > 
> > Still splitting up word and depth in separate cachelines?
> 
> Yeah, I mentioned that in one of the other postings, there's still a
> definite win to doing that.
> 
> > Okay, I couldn't find any holes in this one :)
> 
> Good to hear that :-)
> 
> >> -unsigned int sbitmap_weight(const struct sbitmap *sb)
> >> +static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
> >>  {
> >>  	unsigned int i, weight = 0;
> >>  
> >>  	for (i = 0; i < sb->map_nr; i++) {
> >>  		const struct sbitmap_word *word = &sb->map[i];
> >>  
> >> -		weight += bitmap_weight(&word->word, word->depth);
> >> +		if (set)
> >> +			weight += bitmap_weight(&word->word, word->depth);
> > 
> > Should probably do
> > 			weight -= bitmap_weight(&word->cleared, word->depth);
> > 
> > too, right?
> 
> We only use these for the debugfs stuff, how about I just make it static
> instead?

Yeah, with that,

Reviewed-by: Omar Sandoval <osandov@fb.com>

