public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [patch 0/4] blk-mq: use percpu_ida to manage tags
@ 2013-10-11  7:18 Shaohua Li
  2013-10-11  7:18 ` [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable Shaohua Li
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Shaohua Li @ 2013-10-11  7:18 UTC (permalink / raw)
  To: linux-kernel, axboe; +Cc: kmo

Hi, blk-mq and percpu_ida use a similar algorithm to manage tags. The difference
is when a cpu can't allocate tags, blk-mq will use ipi to purge remote cpu
cache, while percpu-ida directly purges remote cpu cache. In practice, the
percpu-ida approach is much faster when we can't allocate enough for percpu
cache.

This patch makes blk-mq use percpu_ida to manage tags, this avoids some
duplicate code and has better performance as I mentioned. To test the patches,
rebase blk-mq tree to latest kernel first.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable
  2013-10-11  7:18 [patch 0/4] blk-mq: use percpu_ida to manage tags Shaohua Li
@ 2013-10-11  7:18 ` Shaohua Li
  2013-10-11 20:31   ` Kent Overstreet
  2013-10-11  7:18 ` [patch 2/4] percpu_ida: add percpu_ida_for_each_free Shaohua Li
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 13+ messages in thread
From: Shaohua Li @ 2013-10-11  7:18 UTC (permalink / raw)
  To: linux-kernel, axboe; +Cc: kmo

[-- Attachment #1: 0001-percpu_ida-make-percpu_ida-percpu-size-batch-configu.patch --]
[-- Type: text/plain, Size: 4771 bytes --]

Make percpu_ida percpu size/batch configurable. The block-mq-tag will use it.

Signed-off-by: Shaohua Li <shli@fusionio.com>
---
 include/linux/percpu_ida.h |   18 +++++++++++++++++-
 lib/percpu_ida.c           |   28 +++++++++++-----------------
 2 files changed, 28 insertions(+), 18 deletions(-)

Index: master/include/linux/percpu_ida.h
===================================================================
--- master.orig/include/linux/percpu_ida.h	2013-10-11 12:14:44.472699999 +0800
+++ master/include/linux/percpu_ida.h	2013-10-11 12:14:44.472699999 +0800
@@ -16,6 +16,8 @@ struct percpu_ida {
 	 * percpu_ida_init()
 	 */
 	unsigned			nr_tags;
+	unsigned			percpu_max_size;
+	unsigned			percpu_batch_size;
 
 	struct percpu_ida_cpu __percpu	*tag_cpu;
 
@@ -51,10 +53,24 @@ struct percpu_ida {
 	} ____cacheline_aligned_in_smp;
 };
 
+/*
+ * Number of tags we move between the percpu freelist and the global freelist at
+ * a time
+ */
+#define IDA_DEFAULT_PCPU_BATCH_MOVE	32U
+/* Max size of percpu freelist, */
+#define IDA_DEFAULT_PCPU_SIZE	((IDA_DEFAULT_PCPU_BATCH_MOVE * 3) / 2)
+
 int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp);
 void percpu_ida_free(struct percpu_ida *pool, unsigned tag);
 
 void percpu_ida_destroy(struct percpu_ida *pool);
-int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags);
+int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,
+	unsigned long max_size, unsigned long batch_size);
+static inline int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
+{
+	return __percpu_ida_init(pool, nr_tags, IDA_DEFAULT_PCPU_SIZE,
+		IDA_DEFAULT_PCPU_BATCH_MOVE);
+}
 
 #endif /* __PERCPU_IDA_H__ */
Index: master/lib/percpu_ida.c
===================================================================
--- master.orig/lib/percpu_ida.c	2013-10-11 12:14:44.472699999 +0800
+++ master/lib/percpu_ida.c	2013-10-11 12:14:44.472699999 +0800
@@ -30,15 +30,6 @@
 #include <linux/spinlock.h>
 #include <linux/percpu_ida.h>
 
-/*
- * Number of tags we move between the percpu freelist and the global freelist at
- * a time
- */
-#define IDA_PCPU_BATCH_MOVE	32U
-
-/* Max size of percpu freelist, */
-#define IDA_PCPU_SIZE		((IDA_PCPU_BATCH_MOVE * 3) / 2)
-
 struct percpu_ida_cpu {
 	/*
 	 * Even though this is percpu, we need a lock for tag stealing by remote
@@ -78,7 +69,7 @@ static inline void steal_tags(struct per
 	struct percpu_ida_cpu *remote;
 
 	for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags);
-	     cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2;
+	     cpus_have_tags * pool->percpu_max_size > pool->nr_tags / 2;
 	     cpus_have_tags--) {
 		cpu = cpumask_next(cpu, &pool->cpus_have_tags);
 
@@ -123,7 +114,7 @@ static inline void alloc_global_tags(str
 {
 	move_tags(tags->freelist, &tags->nr_free,
 		  pool->freelist, &pool->nr_free,
-		  min(pool->nr_free, IDA_PCPU_BATCH_MOVE));
+		  min(pool->nr_free, pool->percpu_batch_size));
 }
 
 static inline unsigned alloc_local_tag(struct percpu_ida *pool,
@@ -245,17 +236,17 @@ void percpu_ida_free(struct percpu_ida *
 		wake_up(&pool->wait);
 	}
 
-	if (nr_free == IDA_PCPU_SIZE) {
+	if (nr_free == pool->percpu_max_size) {
 		spin_lock(&pool->lock);
 
 		/*
 		 * Global lock held and irqs disabled, don't need percpu
 		 * lock
 		 */
-		if (tags->nr_free == IDA_PCPU_SIZE) {
+		if (tags->nr_free == pool->percpu_max_size) {
 			move_tags(pool->freelist, &pool->nr_free,
 				  tags->freelist, &tags->nr_free,
-				  IDA_PCPU_BATCH_MOVE);
+				  pool->percpu_batch_size);
 
 			wake_up(&pool->wait);
 		}
@@ -292,7 +283,8 @@ EXPORT_SYMBOL_GPL(percpu_ida_destroy);
  * Allocation is percpu, but sharding is limited by nr_tags - for best
  * performance, the workload should not span more cpus than nr_tags / 128.
  */
-int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
+int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,
+	unsigned long max_size, unsigned long batch_size)
 {
 	unsigned i, cpu, order;
 
@@ -301,6 +293,8 @@ int percpu_ida_init(struct percpu_ida *p
 	init_waitqueue_head(&pool->wait);
 	spin_lock_init(&pool->lock);
 	pool->nr_tags = nr_tags;
+	pool->percpu_max_size = max_size;
+	pool->percpu_batch_size = batch_size;
 
 	/* Guard against overflow */
 	if (nr_tags > (unsigned) INT_MAX + 1) {
@@ -319,7 +313,7 @@ int percpu_ida_init(struct percpu_ida *p
 	pool->nr_free = nr_tags;
 
 	pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) +
-				       IDA_PCPU_SIZE * sizeof(unsigned),
+				       pool->percpu_max_size * sizeof(unsigned),
 				       sizeof(unsigned));
 	if (!pool->tag_cpu)
 		goto err;
@@ -332,4 +326,4 @@ err:
 	percpu_ida_destroy(pool);
 	return -ENOMEM;
 }
-EXPORT_SYMBOL_GPL(percpu_ida_init);
+EXPORT_SYMBOL_GPL(__percpu_ida_init);


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [patch 2/4] percpu_ida: add percpu_ida_for_each_free
  2013-10-11  7:18 [patch 0/4] blk-mq: use percpu_ida to manage tags Shaohua Li
  2013-10-11  7:18 ` [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable Shaohua Li
@ 2013-10-11  7:18 ` Shaohua Li
  2013-10-11 20:34   ` Kent Overstreet
  2013-10-11  7:18 ` [patch 3/4] percpu_ida: add an API to return free tags Shaohua Li
  2013-10-11  7:18 ` [patch 4/4] blk-mq: switch to percpu-ida for tag menagement Shaohua Li
  3 siblings, 1 reply; 13+ messages in thread
From: Shaohua Li @ 2013-10-11  7:18 UTC (permalink / raw)
  To: linux-kernel, axboe; +Cc: kmo

[-- Attachment #1: 0002-percpu_ida-add-percpu_ida_for_each_free.patch --]
[-- Type: text/plain, Size: 2176 bytes --]

Add a new API to iterate free ids. blk-mq-tag will use it.

Signed-off-by: Shaohua Li <shli@fusionio.com>
---
 include/linux/percpu_ida.h |    3 +++
 lib/percpu_ida.c           |   44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

Index: master/include/linux/percpu_ida.h
===================================================================
--- master.orig/include/linux/percpu_ida.h	2013-10-11 12:14:56.932543376 +0800
+++ master/include/linux/percpu_ida.h	2013-10-11 12:14:56.928543501 +0800
@@ -73,4 +73,7 @@ static inline int percpu_ida_init(struct
 		IDA_DEFAULT_PCPU_BATCH_MOVE);
 }
 
+int percpu_ida_for_each_free(struct percpu_ida *pool,
+	int (*fn)(int id, void *data), void *data);
+
 #endif /* __PERCPU_IDA_H__ */
Index: master/lib/percpu_ida.c
===================================================================
--- master.orig/lib/percpu_ida.c	2013-10-11 12:14:56.932543376 +0800
+++ master/lib/percpu_ida.c	2013-10-11 12:14:56.928543501 +0800
@@ -327,3 +327,47 @@ err:
 	return -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(__percpu_ida_init);
+
+/**
+ * percpu_ida_for_each_free - iterate free ids of a pool
+ * @pool: pool to iterate
+ * @fn: iterate callback function
+ * @data: parameter for @fn
+ *
+ * Note: this doesn't guarantee to strictly iterate over all free ids. Some
+ * free ids might be missed, some might be visited twice, and some of the
+ * visited ids might no longer be free by the time @fn sees them.
+ */
+int percpu_ida_for_each_free(struct percpu_ida *pool,
+	int (*fn)(int id, void *data), void *data)
+{
+	unsigned long flags;
+	struct percpu_ida_cpu *remote;
+	unsigned cpu, i, err = 0;
+
+	local_irq_save(flags);
+	for_each_possible_cpu(cpu) {
+		remote = per_cpu_ptr(pool->tag_cpu, cpu);
+		spin_lock(&remote->lock);
+		for (i = 0; i < remote->nr_free; i++) {
+			err = fn(remote->freelist[i], data);
+			if (err)
+				break;
+		}
+		spin_unlock(&remote->lock);
+		if (err)
+			goto out;
+	}
+
+	spin_lock(&pool->lock);
+	for (i = 0; i < pool->nr_free; i++) {
+		err = fn(pool->freelist[i], data);
+		if (err)
+			break;
+	}
+	spin_unlock(&pool->lock);
+out:
+	local_irq_restore(flags);
+	return err;
+}
+EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [patch 3/4] percpu_ida: add an API to return free tags
  2013-10-11  7:18 [patch 0/4] blk-mq: use percpu_ida to manage tags Shaohua Li
  2013-10-11  7:18 ` [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable Shaohua Li
  2013-10-11  7:18 ` [patch 2/4] percpu_ida: add percpu_ida_for_each_free Shaohua Li
@ 2013-10-11  7:18 ` Shaohua Li
  2013-10-11 20:35   ` Kent Overstreet
  2013-10-11  7:18 ` [patch 4/4] blk-mq: switch to percpu-ida for tag menagement Shaohua Li
  3 siblings, 1 reply; 13+ messages in thread
From: Shaohua Li @ 2013-10-11  7:18 UTC (permalink / raw)
  To: linux-kernel, axboe; +Cc: kmo

[-- Attachment #1: 0003-percpu_ida-add-an-API-to-return-free-tags.patch --]
[-- Type: text/plain, Size: 1585 bytes --]

add an API to return free tags, blk-mq-tag will use it

Signed-off-by: Shaohua Li <shli@fusionio.com>
---
 include/linux/percpu_ida.h |    1 +
 lib/percpu_ida.c           |   17 +++++++++++++++++
 2 files changed, 18 insertions(+)

Index: master/include/linux/percpu_ida.h
===================================================================
--- master.orig/include/linux/percpu_ida.h	2013-10-11 12:15:06.996416710 +0800
+++ master/include/linux/percpu_ida.h	2013-10-11 12:15:06.992416749 +0800
@@ -76,4 +76,5 @@ static inline int percpu_ida_init(struct
 int percpu_ida_for_each_free(struct percpu_ida *pool,
 	int (*fn)(int id, void *data), void *data);
 
+unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu);
 #endif /* __PERCPU_IDA_H__ */
Index: master/lib/percpu_ida.c
===================================================================
--- master.orig/lib/percpu_ida.c	2013-10-11 12:15:06.996416710 +0800
+++ master/lib/percpu_ida.c	2013-10-11 12:15:06.992416749 +0800
@@ -371,3 +371,20 @@ out:
 	return err;
 }
 EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
+
+/**
+ * percpu_ida_free_tags - return free tags number of a specific cpu or global pool
+ * @pool: pool related
+ * @cpu: specific cpu or global pool if @cpu == nr_cpu_ids
+ *
+ * Note: this just returns a snapshot of free tags number.
+ */
+unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu)
+{
+	struct percpu_ida_cpu *remote;
+	if (cpu == nr_cpu_ids)
+		return pool->nr_free;
+	remote = per_cpu_ptr(pool->tag_cpu, cpu);
+	return remote->nr_free;
+}
+EXPORT_SYMBOL_GPL(percpu_ida_free_tags);


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [patch 4/4] blk-mq: switch to percpu-ida for tag menagement
  2013-10-11  7:18 [patch 0/4] blk-mq: use percpu_ida to manage tags Shaohua Li
                   ` (2 preceding siblings ...)
  2013-10-11  7:18 ` [patch 3/4] percpu_ida: add an API to return free tags Shaohua Li
@ 2013-10-11  7:18 ` Shaohua Li
  2013-10-11 14:28   ` Jens Axboe
  3 siblings, 1 reply; 13+ messages in thread
From: Shaohua Li @ 2013-10-11  7:18 UTC (permalink / raw)
  To: linux-kernel, axboe; +Cc: kmo

[-- Attachment #1: 0004-blk-mq-switch-to-percpu-ida-for-tag-menagement.patch --]
[-- Type: text/plain, Size: 15120 bytes --]

Using percpu-ida to manage blk-mq tags. The percpu-ida has a similar algorithm
to the blk-mq-tag. The difference is when a cpu can't allocate tags
blk-mq-tag uses ipi to purge remote cpu cache and percpu-ida directly purges
remote cpu cache. In practice (testing null_blk), the percpu-ida approach is
much faster when total tags aren't enough.

Signed-off-by: Shaohua Li <shli@fusionio.com>
---
 block/blk-mq-tag.c |  470 +++++++----------------------------------------------
 1 file changed, 68 insertions(+), 402 deletions(-)

Index: master/block/blk-mq-tag.c
===================================================================
--- master.orig/block/blk-mq-tag.c	2013-10-11 12:15:17.200288445 +0800
+++ master/block/blk-mq-tag.c	2013-10-11 12:15:17.196288502 +0800
@@ -1,9 +1,6 @@
 #include <linux/kernel.h>
-#include <linux/threads.h>
 #include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
+#include <linux/percpu_ida.h>
 
 #include <linux/blk-mq.h>
 #include "blk.h"
@@ -11,291 +8,53 @@
 #include "blk-mq-tag.h"
 
 /*
- * Per-cpu cache entries
- */
-struct blk_mq_tag_map {
-	unsigned int nr_free;
-	unsigned int freelist[];
-};
-
-/*
  * Per tagged queue (tag address space) map
  */
 struct blk_mq_tags {
 	unsigned int nr_tags;
-	unsigned int reserved_tags;
-	unsigned int batch_move;
-	unsigned int max_cache;
-
-	struct {
-		spinlock_t lock;
-		unsigned int nr_free;
-		unsigned int *freelist;
-		unsigned int nr_reserved;
-		unsigned int *reservelist;
-		struct list_head wait;
-	} ____cacheline_aligned_in_smp;
-
-	struct blk_mq_tag_map __percpu *free_maps;
-
-	struct blk_mq_cpu_notifier cpu_notifier;
-};
+	unsigned int nr_reserved_tags;
+	unsigned int nr_batch_move;
+	unsigned int nr_max_cache;
 
-struct blk_mq_tag_wait {
-	struct list_head list;
-	struct task_struct *task;
+	struct percpu_ida free_tags;
+	struct percpu_ida reserved_tags;
 };
 
-#define DEFINE_TAG_WAIT(name)						\
-	struct blk_mq_tag_wait name = {					\
-		.list		= LIST_HEAD_INIT((name).list),		\
-		.task		= current,				\
-	}
-
-static unsigned int move_tags(unsigned int *dst, unsigned int *dst_nr,
-			      unsigned int *src, unsigned int *src_nr,
-			      unsigned int nr_to_move)
-{
-	nr_to_move = min(nr_to_move, *src_nr);
-	*src_nr -= nr_to_move;
-	memcpy(dst + *dst_nr, src + *src_nr, sizeof(int) * nr_to_move);
-	*dst_nr += nr_to_move;
-
-	return nr_to_move;
-}
-
-static void __wake_waiters(struct blk_mq_tags *tags, unsigned int nr)
-{
-	while (nr && !list_empty(&tags->wait)) {
-		struct blk_mq_tag_wait *waiter;
-
-		waiter = list_entry(tags->wait.next, struct blk_mq_tag_wait,
-					list);
-		list_del_init(&waiter->list);
-		wake_up_process(waiter->task);
-		nr--;
-	}
-}
-
-static void __blk_mq_tag_return(struct blk_mq_tags *tags,
-				struct blk_mq_tag_map *map, unsigned int nr)
-{
-	unsigned int waiters;
-
-	lockdep_assert_held(&tags->lock);
-
-	waiters = move_tags(tags->freelist, &tags->nr_free, map->freelist,
-				&map->nr_free, nr);
-	if (!list_empty(&tags->wait))
-		__wake_waiters(tags, waiters);
-}
-
-static void blk_mq_tag_return(struct blk_mq_tags *tags,
-			      struct blk_mq_tag_map *map, unsigned int nr)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&tags->lock, flags);
-	__blk_mq_tag_return(tags, map, nr);
-	spin_unlock_irqrestore(&tags->lock, flags);
-}
-
-#if NR_CPUS != 1
-static void prune_cache(void *data)
-{
-	struct blk_mq_tags *tags = data;
-	struct blk_mq_tag_map *map;
-
-	map = per_cpu_ptr(tags->free_maps, smp_processor_id());
-
-	spin_lock(&tags->lock);
-	__blk_mq_tag_return(tags, map, tags->batch_move);
-	spin_unlock(&tags->lock);
-}
-#endif
-
-static void ipi_local_caches(struct blk_mq_tags *tags, unsigned int this_cpu)
-{
-#if NR_CPUS != 1
-	cpumask_var_t ipi_mask;
-	unsigned int i, total;
-
-	/*
-	 * We could per-cpu cache this things, but overhead is probably not
-	 * large enough to care about it. If we fail, just punt to doing a
-	 * prune on all CPUs.
-	 */
-	if (!alloc_cpumask_var(&ipi_mask, GFP_ATOMIC)) {
-		smp_call_function(prune_cache, tags, 0);
-		return;
-	}
-
-	cpumask_clear(ipi_mask);
-
-	total = 0;
-	for_each_online_cpu(i) {
-		struct blk_mq_tag_map *map = per_cpu_ptr(tags->free_maps, i);
-
-		if (!map->nr_free)
-			continue;
-
-		total += map->nr_free;
-		cpumask_set_cpu(i, ipi_mask);
-
-		if (total > tags->batch_move)
-			break;
-	}
-
-	if (total) {
-		preempt_disable();
-		smp_call_function_many(ipi_mask, prune_cache, tags, 0);
-		preempt_enable();
-	}
-
-	free_cpumask_var(ipi_mask);
-#endif
-}
-
-/*
- * Wait on a free tag, move batch to map when we have it. Returns with
- * local CPU irq flags saved in 'flags'.
- */
-static void wait_on_tags(struct blk_mq_tags *tags, struct blk_mq_tag_map **map,
-			 unsigned long *flags)
-{
-	DEFINE_TAG_WAIT(wait);
-
-	do {
-		spin_lock_irqsave(&tags->lock, *flags);
-
-		__set_current_state(TASK_UNINTERRUPTIBLE);
-
-		if (list_empty(&wait.list))
-			list_add_tail(&wait.list, &tags->wait);
-
-		*map = this_cpu_ptr(tags->free_maps);
-		if ((*map)->nr_free || tags->nr_free) {
-			if (!(*map)->nr_free) {
-				move_tags((*map)->freelist, &(*map)->nr_free,
-						tags->freelist, &tags->nr_free,
-						tags->batch_move);
-			}
-
-			if (!list_empty(&wait.list))
-				list_del(&wait.list);
-
-			spin_unlock(&tags->lock);
-			break;
-		}
-
-		spin_unlock_irqrestore(&tags->lock, *flags);
-		ipi_local_caches(tags, raw_smp_processor_id());
-		io_schedule();
-	} while (1);
-
-	__set_current_state(TASK_RUNNING);
-}
-
 void blk_mq_wait_for_tags(struct blk_mq_tags *tags)
 {
-	struct blk_mq_tag_map *map;
-	unsigned long flags;
-
-	ipi_local_caches(tags, raw_smp_processor_id());
-	wait_on_tags(tags, &map, &flags);
-	local_irq_restore(flags);
+	int tag = blk_mq_get_tag(tags, __GFP_WAIT, false);
+	blk_mq_put_tag(tags, tag);
 }
 
 bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
 {
-	return !tags || tags->nr_free != 0;
+	return !tags ||
+		percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0;
 }
 
 static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp)
 {
-	struct blk_mq_tag_map *map;
-	unsigned int this_cpu;
-	unsigned long flags;
-	unsigned int tag;
-
-	local_irq_save(flags);
-	this_cpu = smp_processor_id();
-	map = per_cpu_ptr(tags->free_maps, this_cpu);
-
-	/*
-	 * Grab from local per-cpu cache, if we can
-	 */
-	do {
-		if (map->nr_free) {
-			map->nr_free--;
-			tag = map->freelist[map->nr_free];
-			local_irq_restore(flags);
-			return tag;
-		}
-
-		/*
-		 * Grab from device map, if we can
-		 */
-		if (tags->nr_free) {
-			spin_lock(&tags->lock);
-			move_tags(map->freelist, &map->nr_free, tags->freelist,
-					&tags->nr_free, tags->batch_move);
-			spin_unlock(&tags->lock);
-			continue;
-		}
-
-		local_irq_restore(flags);
-
-		if (!(gfp & __GFP_WAIT))
-			break;
-
-		ipi_local_caches(tags, this_cpu);
-
-		/*
-		 * All are busy, wait. Returns with irqs disabled again
-		 * and potentially new 'map' pointer.
-		 */
-		wait_on_tags(tags, &map, &flags);
-	} while (1);
+	int tag;
 
-	return BLK_MQ_TAG_FAIL;
+	tag = percpu_ida_alloc(&tags->free_tags, gfp);
+	if (tag < 0)
+		return BLK_MQ_TAG_FAIL;
+	return tag + tags->nr_reserved_tags;
 }
 
 static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
 					      gfp_t gfp)
 {
-	unsigned int tag = BLK_MQ_TAG_FAIL;
-	DEFINE_TAG_WAIT(wait);
+	int tag;
 
-	if (unlikely(!tags->reserved_tags)) {
+	if (unlikely(!tags->nr_reserved_tags)) {
 		WARN_ON_ONCE(1);
 		return BLK_MQ_TAG_FAIL;
 	}
 
-	do {
-		spin_lock_irq(&tags->lock);
-		if (tags->nr_reserved) {
-			tags->nr_reserved--;
-			tag = tags->reservelist[tags->nr_reserved];
-			break;
-		}
-
-		if (!(gfp & __GFP_WAIT))
-			break;
-
-		__set_current_state(TASK_UNINTERRUPTIBLE);
-
-		if (list_empty(&wait.list))
-			list_add_tail(&wait.list, &tags->wait);
-
-		spin_unlock_irq(&tags->lock);
-		io_schedule();
-	} while (1);
-
-	if (!list_empty(&wait.list))
-		list_del(&wait.list);
-
-	spin_unlock_irq(&tags->lock);
+	tag = percpu_ida_alloc(&tags->reserved_tags, gfp);
+	if (tag < 0)
+		return BLK_MQ_TAG_FAIL;
 	return tag;
 }
 
@@ -309,57 +68,38 @@ unsigned int blk_mq_get_tag(struct blk_m
 
 static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
 {
-	struct blk_mq_tag_map *map;
-	unsigned long flags;
-
 	BUG_ON(tag >= tags->nr_tags);
 
-	local_irq_save(flags);
-	map = this_cpu_ptr(tags->free_maps);
-
-	map->freelist[map->nr_free] = tag;
-	map->nr_free++;
-
-	if (map->nr_free >= tags->max_cache ||
-	    !list_empty_careful(&tags->wait)) {
-		spin_lock(&tags->lock);
-		__blk_mq_tag_return(tags, map, tags->batch_move);
-		spin_unlock(&tags->lock);
-	}
-
-	local_irq_restore(flags);
+	percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags);
 }
 
 static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
 				      unsigned int tag)
 {
-	unsigned long flags;
-
-	BUG_ON(tag >= tags->reserved_tags);
-
-	spin_lock_irqsave(&tags->lock, flags);
-	tags->reservelist[tags->nr_reserved] = tag;
-	tags->nr_reserved++;
+	BUG_ON(tag >= tags->nr_reserved_tags);
 
-	if (!list_empty(&tags->wait))
-		__wake_waiters(tags, 1);
-
-	spin_unlock_irqrestore(&tags->lock, flags);
+	percpu_ida_free(&tags->reserved_tags, tag);
 }
 
 void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
 {
-	if (tag >= tags->reserved_tags)
+	if (tag >= tags->nr_reserved_tags)
 		__blk_mq_put_tag(tags, tag);
 	else
 		__blk_mq_put_reserved_tag(tags, tag);
 }
 
+static int __blk_mq_tag_iter(int id, void *data)
+{
+	unsigned long *tag_map = data;
+	__set_bit(id, tag_map);
+	return 0;
+}
+
 void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
 			  void (*fn)(void *, unsigned long *), void *data)
 {
-	unsigned long flags, *tag_map;
-	unsigned int i, j;
+	unsigned long *tag_map;
 	size_t map_size;
 
 	map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG;
@@ -367,56 +107,21 @@ void blk_mq_tag_busy_iter(struct blk_mq_
 	if (!tag_map)
 		return;
 
-	local_irq_save(flags);
-
-	for_each_online_cpu(i) {
-		struct blk_mq_tag_map *map = per_cpu_ptr(tags->free_maps, i);
-
-		for (j = 0; j < map->nr_free; j++)
-			__set_bit(map->freelist[j], tag_map);
-	}
-
-	if (tags->nr_free || tags->nr_reserved) {
-		spin_lock(&tags->lock);
-
-		if (tags->nr_reserved)
-			for (j = 0; j < tags->nr_reserved; j++)
-				__set_bit(tags->reservelist[j], tag_map);
-
-		if (tags->nr_free)
-			for (j = 0; j < tags->nr_free; j++)
-				__set_bit(tags->freelist[j], tag_map);
-
-		spin_unlock(&tags->lock);
-	}
-
-	local_irq_restore(flags);
+	percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map);
+	if (tags->nr_reserved_tags)
+		percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter,
+			tag_map);
 
 	fn(data, tag_map);
 	kfree(tag_map);
 }
 
-static void blk_mq_tag_notify(void *data, unsigned long action,
-			      unsigned int cpu)
-{
-	/*
-	 * Move entries from this CPU to global pool
-	 */
-	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-		struct blk_mq_tags *tags = data;
-		struct blk_mq_tag_map *map = per_cpu_ptr(tags->free_maps, cpu);
-
-		if (map->nr_free)
-			blk_mq_tag_return(tags, map, map->nr_free);
-	}
-}
-
 struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 				     unsigned int reserved_tags, int node)
 {
 	unsigned int nr_tags, nr_cache;
 	struct blk_mq_tags *tags;
-	size_t map_size;
+	int ret;
 
 	if (total_tags > BLK_MQ_TAG_MAX) {
 		pr_err("blk-mq: tag depth too large\n");
@@ -435,104 +140,65 @@ struct blk_mq_tags *blk_mq_init_tags(uns
 	else if (nr_cache > BLK_MQ_TAG_CACHE_MAX)
 		nr_cache = BLK_MQ_TAG_CACHE_MAX;
 
-	map_size = sizeof(struct blk_mq_tag_map) + nr_cache * sizeof(int);
-	tags->free_maps = __alloc_percpu(map_size, sizeof(void *));
-	if (!tags->free_maps)
-		goto err_free_maps;
-
-	tags->freelist = kmalloc_node(sizeof(int) * nr_tags, GFP_KERNEL, node);
-	if (!tags->freelist)
-		goto err_freelist;
-
-	if (reserved_tags) {
-		tags->reservelist = kmalloc_node(sizeof(int) * reserved_tags,
-							GFP_KERNEL, node);
-		if (!tags->reservelist)
-			goto err_reservelist;
-	}
-
-	spin_lock_init(&tags->lock);
-	INIT_LIST_HEAD(&tags->wait);
 	tags->nr_tags = total_tags;
-	tags->reserved_tags = reserved_tags;
-	tags->max_cache = nr_cache;
-	tags->batch_move = max(1u, nr_cache / 2);
-
-	/*
-	 * Reserved tags are first
-	 */
-	if (reserved_tags) {
-		tags->nr_reserved = 0;
-		while (reserved_tags--) {
-			tags->reservelist[tags->nr_reserved] =
-							tags->nr_reserved;
-			tags->nr_reserved++;
-		}
-	}
+	tags->nr_reserved_tags = reserved_tags;
+	tags->nr_max_cache = nr_cache;
+	tags->nr_batch_move = max(1u, nr_cache / 2);
+
+	ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags -
+				tags->nr_reserved_tags,
+				tags->nr_max_cache,
+				tags->nr_batch_move);
+	if (ret)
+		goto err_free_tags;
 
-	/*
-	 * Rest of the tags start at the queue list
-	 */
-	tags->nr_free = 0;
-	while (nr_tags--) {
-		tags->freelist[tags->nr_free] = tags->nr_free +
-							tags->nr_reserved;
-		tags->nr_free++;
+	if (reserved_tags) {
+		/*
+		 * With max_cache and batch set to 1, the allocator falls back to
+		 * no caching. It's fine if reserved tag allocation is slow.
+		 */
+		ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags,
+				1, 1);
+		if (ret)
+			goto err_reserved_tags;
 	}
 
-	blk_mq_init_cpu_notifier(&tags->cpu_notifier, blk_mq_tag_notify, tags);
-	blk_mq_register_cpu_notifier(&tags->cpu_notifier);
 	return tags;
 
-err_reservelist:
-	kfree(tags->freelist);
-err_freelist:
-	free_percpu(tags->free_maps);
-err_free_maps:
+err_reserved_tags:
+	percpu_ida_destroy(&tags->free_tags);
+err_free_tags:
 	kfree(tags);
 	return NULL;
 }
 
 void blk_mq_free_tags(struct blk_mq_tags *tags)
 {
-	blk_mq_unregister_cpu_notifier(&tags->cpu_notifier);
-	free_percpu(tags->free_maps);
-	kfree(tags->freelist);
-	kfree(tags->reservelist);
+	percpu_ida_destroy(&tags->free_tags);
+	percpu_ida_destroy(&tags->reserved_tags);
 	kfree(tags);
 }
 
 ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
 {
 	char *orig_page = page;
-	unsigned long flags;
-	struct list_head *tmp;
-	int waiters;
 	int cpu;
 
 	if (!tags)
 		return 0;
 
-	spin_lock_irqsave(&tags->lock, flags);
-
 	page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u,"
-			" max_cache=%u\n", tags->nr_tags, tags->reserved_tags,
-			tags->batch_move, tags->max_cache);
-
-	waiters = 0;
-	list_for_each(tmp, &tags->wait)
-		waiters++;
-
-	page += sprintf(page, "nr_free=%u, nr_reserved=%u, waiters=%u\n",
-			tags->nr_free, tags->nr_reserved, waiters);
+			" max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags,
+			tags->nr_batch_move, tags->nr_max_cache);
 
-	for_each_online_cpu(cpu) {
-		struct blk_mq_tag_map *map = per_cpu_ptr(tags->free_maps, cpu);
+	page += sprintf(page, "nr_free=%u, nr_reserved=%u\n",
+			percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids),
+			percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids));
 
+	for_each_possible_cpu(cpu) {
 		page += sprintf(page, "  cpu%02u: nr_free=%u\n", cpu,
-					map->nr_free);
+				percpu_ida_free_tags(&tags->free_tags, cpu));
 	}
 
-	spin_unlock_irqrestore(&tags->lock, flags);
 	return page - orig_page;
 }


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 4/4] blk-mq: switch to percpu-ida for tag menagement
  2013-10-11  7:18 ` [patch 4/4] blk-mq: switch to percpu-ida for tag menagement Shaohua Li
@ 2013-10-11 14:28   ` Jens Axboe
  2013-10-12  0:49     ` Shaohua Li
  0 siblings, 1 reply; 13+ messages in thread
From: Jens Axboe @ 2013-10-11 14:28 UTC (permalink / raw)
  To: Shaohua Li, linux-kernel; +Cc: kmo

On 10/11/2013 01:18 AM, Shaohua Li wrote:
> Using percpu-ida to manage blk-mq tags. the percpu-ida has similar algorithm
> like the blk-mq-tag. The difference is when a cpu can't allocate tags
> blk-mq-tag uses ipi to purge remote cpu cache and percpu-ida directly purges
> remote cpu cache. In practice (testing null_blk), the percpu-ida approach is
> much faster when total tags aren't enough.

I'm not surprised it's a lot faster for the pathological case of needing
to prune tags, the IPI isn't near ideal for that. I'm assuming the
general performance is the same for the non-full case?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable
  2013-10-11  7:18 ` [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable Shaohua Li
@ 2013-10-11 20:31   ` Kent Overstreet
  2013-10-12  0:52     ` Shaohua Li
  0 siblings, 1 reply; 13+ messages in thread
From: Kent Overstreet @ 2013-10-11 20:31 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-kernel, axboe

On Fri, Oct 11, 2013 at 03:18:03PM +0800, Shaohua Li wrote:
> Make percpu_ida percpu size/batch configurable. The block-mq-tag will use it.

Can you explain the justification for this? Performance...?

> Signed-off-by: Shaohua Li <shli@fusionio.com>
> ---
>  include/linux/percpu_ida.h |   18 +++++++++++++++++-
>  lib/percpu_ida.c           |   28 +++++++++++-----------------
>  2 files changed, 28 insertions(+), 18 deletions(-)
> 
> Index: master/include/linux/percpu_ida.h
> ===================================================================
> --- master.orig/include/linux/percpu_ida.h	2013-10-11 12:14:44.472699999 +0800
> +++ master/include/linux/percpu_ida.h	2013-10-11 12:14:44.472699999 +0800
> @@ -16,6 +16,8 @@ struct percpu_ida {
>  	 * percpu_ida_init()
>  	 */
>  	unsigned			nr_tags;
> +	unsigned			percpu_max_size;
> +	unsigned			percpu_batch_size;
>  
>  	struct percpu_ida_cpu __percpu	*tag_cpu;
>  
> @@ -51,10 +53,24 @@ struct percpu_ida {
>  	} ____cacheline_aligned_in_smp;
>  };
>  
> +/*
> + * Number of tags we move between the percpu freelist and the global freelist at
> + * a time
> + */
> +#define IDA_DEFAULT_PCPU_BATCH_MOVE	32U
> +/* Max size of percpu freelist, */
> +#define IDA_DEFAULT_PCPU_SIZE	((IDA_DEFAULT_PCPU_BATCH_MOVE * 3) / 2)
> +
>  int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp);
>  void percpu_ida_free(struct percpu_ida *pool, unsigned tag);
>  
>  void percpu_ida_destroy(struct percpu_ida *pool);
> -int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags);
> +int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,
> +	unsigned long max_size, unsigned long batch_size);
> +static inline int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
> +{
> +	return __percpu_ida_init(pool, nr_tags, IDA_DEFAULT_PCPU_SIZE,
> +		IDA_DEFAULT_PCPU_BATCH_MOVE);
> +}
>  
>  #endif /* __PERCPU_IDA_H__ */
> Index: master/lib/percpu_ida.c
> ===================================================================
> --- master.orig/lib/percpu_ida.c	2013-10-11 12:14:44.472699999 +0800
> +++ master/lib/percpu_ida.c	2013-10-11 12:14:44.472699999 +0800
> @@ -30,15 +30,6 @@
>  #include <linux/spinlock.h>
>  #include <linux/percpu_ida.h>
>  
> -/*
> - * Number of tags we move between the percpu freelist and the global freelist at
> - * a time
> - */
> -#define IDA_PCPU_BATCH_MOVE	32U
> -
> -/* Max size of percpu freelist, */
> -#define IDA_PCPU_SIZE		((IDA_PCPU_BATCH_MOVE * 3) / 2)
> -
>  struct percpu_ida_cpu {
>  	/*
>  	 * Even though this is percpu, we need a lock for tag stealing by remote
> @@ -78,7 +69,7 @@ static inline void steal_tags(struct per
>  	struct percpu_ida_cpu *remote;
>  
>  	for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags);
> -	     cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2;
> +	     cpus_have_tags * pool->percpu_max_size > pool->nr_tags / 2;
>  	     cpus_have_tags--) {
>  		cpu = cpumask_next(cpu, &pool->cpus_have_tags);
>  
> @@ -123,7 +114,7 @@ static inline void alloc_global_tags(str
>  {
>  	move_tags(tags->freelist, &tags->nr_free,
>  		  pool->freelist, &pool->nr_free,
> -		  min(pool->nr_free, IDA_PCPU_BATCH_MOVE));
> +		  min(pool->nr_free, pool->percpu_batch_size));
>  }
>  
>  static inline unsigned alloc_local_tag(struct percpu_ida *pool,
> @@ -245,17 +236,17 @@ void percpu_ida_free(struct percpu_ida *
>  		wake_up(&pool->wait);
>  	}
>  
> -	if (nr_free == IDA_PCPU_SIZE) {
> +	if (nr_free == pool->percpu_max_size) {
>  		spin_lock(&pool->lock);
>  
>  		/*
>  		 * Global lock held and irqs disabled, don't need percpu
>  		 * lock
>  		 */
> -		if (tags->nr_free == IDA_PCPU_SIZE) {
> +		if (tags->nr_free == pool->percpu_max_size) {
>  			move_tags(pool->freelist, &pool->nr_free,
>  				  tags->freelist, &tags->nr_free,
> -				  IDA_PCPU_BATCH_MOVE);
> +				  pool->percpu_batch_size);
>  
>  			wake_up(&pool->wait);
>  		}
> @@ -292,7 +283,8 @@ EXPORT_SYMBOL_GPL(percpu_ida_destroy);
>   * Allocation is percpu, but sharding is limited by nr_tags - for best
>   * performance, the workload should not span more cpus than nr_tags / 128.
>   */
> -int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
> +int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,
> +	unsigned long max_size, unsigned long batch_size)
>  {
>  	unsigned i, cpu, order;
>  
> @@ -301,6 +293,8 @@ int percpu_ida_init(struct percpu_ida *p
>  	init_waitqueue_head(&pool->wait);
>  	spin_lock_init(&pool->lock);
>  	pool->nr_tags = nr_tags;
> +	pool->percpu_max_size = max_size;
> +	pool->percpu_batch_size = batch_size;
>  
>  	/* Guard against overflow */
>  	if (nr_tags > (unsigned) INT_MAX + 1) {
> @@ -319,7 +313,7 @@ int percpu_ida_init(struct percpu_ida *p
>  	pool->nr_free = nr_tags;
>  
>  	pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) +
> -				       IDA_PCPU_SIZE * sizeof(unsigned),
> +				       pool->percpu_max_size * sizeof(unsigned),
>  				       sizeof(unsigned));
>  	if (!pool->tag_cpu)
>  		goto err;
> @@ -332,4 +326,4 @@ err:
>  	percpu_ida_destroy(pool);
>  	return -ENOMEM;
>  }
> -EXPORT_SYMBOL_GPL(percpu_ida_init);
> +EXPORT_SYMBOL_GPL(__percpu_ida_init);
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 2/4] percpu_ida: add percpu_ida_for_each_free
  2013-10-11  7:18 ` [patch 2/4] percpu_ida: add percpu_ida_for_each_free Shaohua Li
@ 2013-10-11 20:34   ` Kent Overstreet
  0 siblings, 0 replies; 13+ messages in thread
From: Kent Overstreet @ 2013-10-11 20:34 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-kernel, axboe

On Fri, Oct 11, 2013 at 03:18:04PM +0800, Shaohua Li wrote:
> Add a new API to iterate free ids. blk-mq-tag will use it.
> 
> Signed-off-by: Shaohua Li <shli@fusionio.com>
> ---
>  include/linux/percpu_ida.h |    3 +++
>  lib/percpu_ida.c           |   44 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 47 insertions(+)
> 
> Index: master/include/linux/percpu_ida.h
> ===================================================================
> --- master.orig/include/linux/percpu_ida.h	2013-10-11 12:14:56.932543376 +0800
> +++ master/include/linux/percpu_ida.h	2013-10-11 12:14:56.928543501 +0800
> @@ -73,4 +73,7 @@ static inline int percpu_ida_init(struct
>  		IDA_DEFAULT_PCPU_BATCH_MOVE);
>  }
>  
> +int percpu_ida_for_each_free(struct percpu_ida *pool,
> +	int (*fn)(int id, void *data), void *data);
> +
>  #endif /* __PERCPU_IDA_H__ */
> Index: master/lib/percpu_ida.c
> ===================================================================
> --- master.orig/lib/percpu_ida.c	2013-10-11 12:14:56.932543376 +0800
> +++ master/lib/percpu_ida.c	2013-10-11 12:14:56.928543501 +0800
> @@ -327,3 +327,47 @@ err:
>  	return -ENOMEM;
>  }
>  EXPORT_SYMBOL_GPL(__percpu_ida_init);
> +
> +/**
> + * percpu_ida_for_each_free - iterate free ids of a pool
> + * @pool: pool to iterate
> + * @fn: iteration callback function
> + * @data: parameter for @fn
> + *
> + * Note, this doesn't guarantee to strictly iterate all free ids. Some free
> + * ids might be missed, some might be iterated more than once, and some might
> + * not actually be free when iterated.
> + */
> +int percpu_ida_for_each_free(struct percpu_ida *pool,
> +	int (*fn)(int id, void *data), void *data)

I'd prefer to make the id parameter unsigned - and use a typedef for the
function pointer argument - but other than that, looks reasonable to me.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 3/4] percpu_ida: add an API to return free tags
  2013-10-11  7:18 ` [patch 3/4] percpu_ida: add an API to return free tags Shaohua Li
@ 2013-10-11 20:35   ` Kent Overstreet
  2013-10-12  1:02     ` Shaohua Li
  0 siblings, 1 reply; 13+ messages in thread
From: Kent Overstreet @ 2013-10-11 20:35 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-kernel, axboe

On Fri, Oct 11, 2013 at 03:18:05PM +0800, Shaohua Li wrote:
> add an API to return free tags, blk-mq-tag will use it

Can you explain how this is going to be used? Seems like something that
could be prone to misuse, try and convince me there isn't a better way
to do what it's for.

> 
> Signed-off-by: Shaohua Li <shli@fusionio.com>
> ---
>  include/linux/percpu_ida.h |    1 +
>  lib/percpu_ida.c           |   17 +++++++++++++++++
>  2 files changed, 18 insertions(+)
> 
> Index: master/include/linux/percpu_ida.h
> ===================================================================
> --- master.orig/include/linux/percpu_ida.h	2013-10-11 12:15:06.996416710 +0800
> +++ master/include/linux/percpu_ida.h	2013-10-11 12:15:06.992416749 +0800
> @@ -76,4 +76,5 @@ static inline int percpu_ida_init(struct
>  int percpu_ida_for_each_free(struct percpu_ida *pool,
>  	int (*fn)(int id, void *data), void *data);
>  
> +unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu);
>  #endif /* __PERCPU_IDA_H__ */
> Index: master/lib/percpu_ida.c
> ===================================================================
> --- master.orig/lib/percpu_ida.c	2013-10-11 12:15:06.996416710 +0800
> +++ master/lib/percpu_ida.c	2013-10-11 12:15:06.992416749 +0800
> @@ -371,3 +371,20 @@ out:
>  	return err;
>  }
>  EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
> +
> +/**
> + * percpu_ida_free_tags - return the number of free tags of a specific cpu or the global pool
> + * @pool: pool related
> + * @cpu: specific cpu or global pool if @cpu == nr_cpu_ids
> + *
> + * Note: this just returns a snapshot of the number of free tags.
> + */
> +unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu)
> +{
> +	struct percpu_ida_cpu *remote;
> +	if (cpu == nr_cpu_ids)
> +		return pool->nr_free;
> +	remote = per_cpu_ptr(pool->tag_cpu, cpu);
> +	return remote->nr_free;
> +}
> +EXPORT_SYMBOL_GPL(percpu_ida_free_tags);
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 4/4] blk-mq: switch to percpu-ida for tag menagement
  2013-10-11 14:28   ` Jens Axboe
@ 2013-10-12  0:49     ` Shaohua Li
  2013-10-13 18:26       ` Jens Axboe
  0 siblings, 1 reply; 13+ messages in thread
From: Shaohua Li @ 2013-10-12  0:49 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, kmo

On Fri, Oct 11, 2013 at 08:28:54AM -0600, Jens Axboe wrote:
> On 10/11/2013 01:18 AM, Shaohua Li wrote:
> > Using percpu-ida to manage blk-mq tags. the percpu-ida has similar algorithm
> > like the blk-mq-tag. The difference is when a cpu can't allocate tags
> > blk-mq-tag uses ipi to purge remote cpu cache and percpu-ida directly purges
> > remote cpu cache. In practice (testing null_blk), the percpu-ida approach is
> > much faster when total tags aren't enough.
> 
> I'm not surprised it's a lot faster the the pathological case of needing
> to prune tags, the IPI isn't near ideal for that. I'm assuming the
> general performance is the same for the non-full case?

Yep. My test is done on a 2-socket machine, with 12 processes spread across the
2 sockets, so any lock contention or IPI should be stressed heavily. Testing is
done with null-blk.

hw_queue_depth	nopatch iops	patch iops
64		~800k/s		~1470k/s
2048		~4470k/s	~4340k/s

In the 2048 case, perf doesn't show any percpu-ida function as hot (none uses
> 1% cpu time), so the small difference should just be measurement drift. So
yes, the general performance is the same.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable
  2013-10-11 20:31   ` Kent Overstreet
@ 2013-10-12  0:52     ` Shaohua Li
  0 siblings, 0 replies; 13+ messages in thread
From: Shaohua Li @ 2013-10-12  0:52 UTC (permalink / raw)
  To: Kent Overstreet; +Cc: linux-kernel, axboe

On Fri, Oct 11, 2013 at 01:31:52PM -0700, Kent Overstreet wrote:
> On Fri, Oct 11, 2013 at 03:18:03PM +0800, Shaohua Li wrote:
> > Make percpu_ida percpu size/batch configurable. The block-mq-tag will use it.
> 
> Can you explain the justification for this? Performance...?

The performance of using percpu_ida for tag management? Please see the patch 4
thread.  If you're asking for the justification of making the size/batch
configurable, the answer is that we might not have enough tags, and the default
cache size/batch is too aggressive.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 3/4] percpu_ida: add an API to return free tags
  2013-10-11 20:35   ` Kent Overstreet
@ 2013-10-12  1:02     ` Shaohua Li
  0 siblings, 0 replies; 13+ messages in thread
From: Shaohua Li @ 2013-10-12  1:02 UTC (permalink / raw)
  To: Kent Overstreet; +Cc: linux-kernel, axboe

On Fri, Oct 11, 2013 at 01:35:36PM -0700, Kent Overstreet wrote:
> On Fri, Oct 11, 2013 at 03:18:05PM +0800, Shaohua Li wrote:
> > add an API to return free tags, blk-mq-tag will use it
> 
> Can you explain how this is going to be used? Seems like something that
> could be prone to misuse, try and convince me there isn't a better way
> to do what it's for.

There are two usages. One is for sysfs info output, which can be used for
diagnosis. The other is that blk-mq wants to determine whether it's possible
for a request to be queued. Neither requires precise data. Yes, callers should
be aware that the returned data isn't very precise.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 4/4] blk-mq: switch to percpu-ida for tag menagement
  2013-10-12  0:49     ` Shaohua Li
@ 2013-10-13 18:26       ` Jens Axboe
  0 siblings, 0 replies; 13+ messages in thread
From: Jens Axboe @ 2013-10-13 18:26 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-kernel, kmo

On Sat, Oct 12 2013, Shaohua Li wrote:
> On Fri, Oct 11, 2013 at 08:28:54AM -0600, Jens Axboe wrote:
> > On 10/11/2013 01:18 AM, Shaohua Li wrote:
> > > Using percpu-ida to manage blk-mq tags. the percpu-ida has similar algorithm
> > > like the blk-mq-tag. The difference is when a cpu can't allocate tags
> > > blk-mq-tag uses ipi to purge remote cpu cache and percpu-ida directly purges
> > > remote cpu cache. In practice (testing null_blk), the percpu-ida approach is
> > > much faster when total tags aren't enough.
> > 
> > I'm not surprised it's a lot faster the the pathological case of needing
> > to prune tags, the IPI isn't near ideal for that. I'm assuming the
> > general performance is the same for the non-full case?
> 
> Yep. My test is done in a 2 sockets machine, 12 process cross the 2 sockets. So
> if there is lock contention or ipi, should be stressed heavily. Testing is done
> for null-blk.
> 
> hw_queue_depth	nopatch iops	patch iops
> 64		~800k/s		~1470k/s
> 2048		~4470k/s	~4340k/s
> 
> In the 2048 case, perf doesn't should any percpu-ida function is hot (no one
> use > 1% cpu time), so the small difference should be drift. So yes, the
> general performance is the same.

Yep, that definitely looks a lot prettier. Thanks! I've had this on the
TODO for a while, since the ida got fixed up.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2013-10-13 18:26 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-10-11  7:18 [patch 0/4] blk-mq: use percpu_ida to manage tags Shaohua Li
2013-10-11  7:18 ` [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable Shaohua Li
2013-10-11 20:31   ` Kent Overstreet
2013-10-12  0:52     ` Shaohua Li
2013-10-11  7:18 ` [patch 2/4] percpu_ida: add percpu_ida_for_each_free Shaohua Li
2013-10-11 20:34   ` Kent Overstreet
2013-10-11  7:18 ` [patch 3/4] percpu_ida: add an API to return free tags Shaohua Li
2013-10-11 20:35   ` Kent Overstreet
2013-10-12  1:02     ` Shaohua Li
2013-10-11  7:18 ` [patch 4/4] blk-mq: switch to percpu-ida for tag menagement Shaohua Li
2013-10-11 14:28   ` Jens Axboe
2013-10-12  0:49     ` Shaohua Li
2013-10-13 18:26       ` Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox