* [patch 0/4] blk-mq: use percpu_ida to manage tags
@ 2013-10-11 7:18 Shaohua Li
2013-10-11 7:18 ` [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable Shaohua Li
` (3 more replies)
0 siblings, 4 replies; 13+ messages in thread
From: Shaohua Li @ 2013-10-11 7:18 UTC (permalink / raw)
To: linux-kernel, axboe; +Cc: kmo
Hi, blk-mq and percpu_ida use similar algorithm to manage tags. The difference
is when a cpu can't allocate tags, blk-mq will use ipi to purge remote cpu
cache, while percpu-ida directly purges remote cpu cache. In practice, the
percpu-ida approach is much faster when we can't allocate enough for percpu
cache.
This patch makes blk-mq use percpu_ida to manage tags, this avoids some
duplicate code and has better performance as I mentioned. To test the patches,
rebase blk-mq tree to latest kernel first.
Thanks,
Shaohua
^ permalink raw reply [flat|nested] 13+ messages in thread
* [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable
2013-10-11 7:18 [patch 0/4] blk-mq: use percpu_ida to manage tags Shaohua Li
@ 2013-10-11 7:18 ` Shaohua Li
2013-10-11 20:31 ` Kent Overstreet
2013-10-11 7:18 ` [patch 2/4] percpu_ida: add percpu_ida_for_each_free Shaohua Li
` (2 subsequent siblings)
3 siblings, 1 reply; 13+ messages in thread
From: Shaohua Li @ 2013-10-11 7:18 UTC (permalink / raw)
To: linux-kernel, axboe; +Cc: kmo
[-- Attachment #1: 0001-percpu_ida-make-percpu_ida-percpu-size-batch-configu.patch --]
[-- Type: text/plain, Size: 4771 bytes --]
Make percpu_ida percpu size/batch configurable. The blk-mq-tag will use it.
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
include/linux/percpu_ida.h | 18 +++++++++++++++++-
lib/percpu_ida.c | 28 +++++++++++-----------------
2 files changed, 28 insertions(+), 18 deletions(-)
Index: master/include/linux/percpu_ida.h
===================================================================
--- master.orig/include/linux/percpu_ida.h 2013-10-11 12:14:44.472699999 +0800
+++ master/include/linux/percpu_ida.h 2013-10-11 12:14:44.472699999 +0800
@@ -16,6 +16,8 @@ struct percpu_ida {
* percpu_ida_init()
*/
unsigned nr_tags;
+ unsigned percpu_max_size;
+ unsigned percpu_batch_size;
struct percpu_ida_cpu __percpu *tag_cpu;
@@ -51,10 +53,24 @@ struct percpu_ida {
} ____cacheline_aligned_in_smp;
};
+/*
+ * Number of tags we move between the percpu freelist and the global freelist at
+ * a time
+ */
+#define IDA_DEFAULT_PCPU_BATCH_MOVE 32U
+/* Max size of percpu freelist, */
+#define IDA_DEFAULT_PCPU_SIZE ((IDA_DEFAULT_PCPU_BATCH_MOVE * 3) / 2)
+
int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp);
void percpu_ida_free(struct percpu_ida *pool, unsigned tag);
void percpu_ida_destroy(struct percpu_ida *pool);
-int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags);
+int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,
+ unsigned long max_size, unsigned long batch_size);
+static inline int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
+{
+ return __percpu_ida_init(pool, nr_tags, IDA_DEFAULT_PCPU_SIZE,
+ IDA_DEFAULT_PCPU_BATCH_MOVE);
+}
#endif /* __PERCPU_IDA_H__ */
Index: master/lib/percpu_ida.c
===================================================================
--- master.orig/lib/percpu_ida.c 2013-10-11 12:14:44.472699999 +0800
+++ master/lib/percpu_ida.c 2013-10-11 12:14:44.472699999 +0800
@@ -30,15 +30,6 @@
#include <linux/spinlock.h>
#include <linux/percpu_ida.h>
-/*
- * Number of tags we move between the percpu freelist and the global freelist at
- * a time
- */
-#define IDA_PCPU_BATCH_MOVE 32U
-
-/* Max size of percpu freelist, */
-#define IDA_PCPU_SIZE ((IDA_PCPU_BATCH_MOVE * 3) / 2)
-
struct percpu_ida_cpu {
/*
* Even though this is percpu, we need a lock for tag stealing by remote
@@ -78,7 +69,7 @@ static inline void steal_tags(struct per
struct percpu_ida_cpu *remote;
for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags);
- cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2;
+ cpus_have_tags * pool->percpu_max_size > pool->nr_tags / 2;
cpus_have_tags--) {
cpu = cpumask_next(cpu, &pool->cpus_have_tags);
@@ -123,7 +114,7 @@ static inline void alloc_global_tags(str
{
move_tags(tags->freelist, &tags->nr_free,
pool->freelist, &pool->nr_free,
- min(pool->nr_free, IDA_PCPU_BATCH_MOVE));
+ min(pool->nr_free, pool->percpu_batch_size));
}
static inline unsigned alloc_local_tag(struct percpu_ida *pool,
@@ -245,17 +236,17 @@ void percpu_ida_free(struct percpu_ida *
wake_up(&pool->wait);
}
- if (nr_free == IDA_PCPU_SIZE) {
+ if (nr_free == pool->percpu_max_size) {
spin_lock(&pool->lock);
/*
* Global lock held and irqs disabled, don't need percpu
* lock
*/
- if (tags->nr_free == IDA_PCPU_SIZE) {
+ if (tags->nr_free == pool->percpu_max_size) {
move_tags(pool->freelist, &pool->nr_free,
tags->freelist, &tags->nr_free,
- IDA_PCPU_BATCH_MOVE);
+ pool->percpu_batch_size);
wake_up(&pool->wait);
}
@@ -292,7 +283,8 @@ EXPORT_SYMBOL_GPL(percpu_ida_destroy);
* Allocation is percpu, but sharding is limited by nr_tags - for best
* performance, the workload should not span more cpus than nr_tags / 128.
*/
-int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
+int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,
+ unsigned long max_size, unsigned long batch_size)
{
unsigned i, cpu, order;
@@ -301,6 +293,8 @@ int percpu_ida_init(struct percpu_ida *p
init_waitqueue_head(&pool->wait);
spin_lock_init(&pool->lock);
pool->nr_tags = nr_tags;
+ pool->percpu_max_size = max_size;
+ pool->percpu_batch_size = batch_size;
/* Guard against overflow */
if (nr_tags > (unsigned) INT_MAX + 1) {
@@ -319,7 +313,7 @@ int percpu_ida_init(struct percpu_ida *p
pool->nr_free = nr_tags;
pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) +
- IDA_PCPU_SIZE * sizeof(unsigned),
+ pool->percpu_max_size * sizeof(unsigned),
sizeof(unsigned));
if (!pool->tag_cpu)
goto err;
@@ -332,4 +326,4 @@ err:
percpu_ida_destroy(pool);
return -ENOMEM;
}
-EXPORT_SYMBOL_GPL(percpu_ida_init);
+EXPORT_SYMBOL_GPL(__percpu_ida_init);
^ permalink raw reply [flat|nested] 13+ messages in thread
* [patch 2/4] percpu_ida: add percpu_ida_for_each_free
2013-10-11 7:18 [patch 0/4] blk-mq: use percpu_ida to manage tags Shaohua Li
2013-10-11 7:18 ` [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable Shaohua Li
@ 2013-10-11 7:18 ` Shaohua Li
2013-10-11 20:34 ` Kent Overstreet
2013-10-11 7:18 ` [patch 3/4] percpu_ida: add an API to return free tags Shaohua Li
2013-10-11 7:18 ` [patch 4/4] blk-mq: switch to percpu-ida for tag management Shaohua Li
3 siblings, 1 reply; 13+ messages in thread
From: Shaohua Li @ 2013-10-11 7:18 UTC (permalink / raw)
To: linux-kernel, axboe; +Cc: kmo
[-- Attachment #1: 0002-percpu_ida-add-percpu_ida_for_each_free.patch --]
[-- Type: text/plain, Size: 2176 bytes --]
Add a new API to iterate free ids. blk-mq-tag will use it.
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
include/linux/percpu_ida.h | 3 +++
lib/percpu_ida.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 47 insertions(+)
Index: master/include/linux/percpu_ida.h
===================================================================
--- master.orig/include/linux/percpu_ida.h 2013-10-11 12:14:56.932543376 +0800
+++ master/include/linux/percpu_ida.h 2013-10-11 12:14:56.928543501 +0800
@@ -73,4 +73,7 @@ static inline int percpu_ida_init(struct
IDA_DEFAULT_PCPU_BATCH_MOVE);
}
+int percpu_ida_for_each_free(struct percpu_ida *pool,
+ int (*fn)(int id, void *data), void *data);
+
#endif /* __PERCPU_IDA_H__ */
Index: master/lib/percpu_ida.c
===================================================================
--- master.orig/lib/percpu_ida.c 2013-10-11 12:14:56.932543376 +0800
+++ master/lib/percpu_ida.c 2013-10-11 12:14:56.928543501 +0800
@@ -327,3 +327,47 @@ err:
return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__percpu_ida_init);
+
+/**
+ * percpu_ida_for_each_free - iterate free ids of a pool
+ * @pool: pool to iterate
+ * @fn: iterate callback function
+ * @data: parameter for @fn
+ *
+ * Note, this doesn't guarantee to iterate all free ids strictly. Some free
+ * ids might be missed, some might be iterated twice, and some might
+ * not be free when iterated.
+ */
+int percpu_ida_for_each_free(struct percpu_ida *pool,
+ int (*fn)(int id, void *data), void *data)
+{
+ unsigned long flags;
+ struct percpu_ida_cpu *remote;
+ unsigned cpu, i, err = 0;
+
+ local_irq_save(flags);
+ for_each_possible_cpu(cpu) {
+ remote = per_cpu_ptr(pool->tag_cpu, cpu);
+ spin_lock(&remote->lock);
+ for (i = 0; i < remote->nr_free; i++) {
+ err = fn(remote->freelist[i], data);
+ if (err)
+ break;
+ }
+ spin_unlock(&remote->lock);
+ if (err)
+ goto out;
+ }
+
+ spin_lock(&pool->lock);
+ for (i = 0; i < pool->nr_free; i++) {
+ err = fn(pool->freelist[i], data);
+ if (err)
+ break;
+ }
+ spin_unlock(&pool->lock);
+out:
+ local_irq_restore(flags);
+ return err;
+}
+EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
^ permalink raw reply [flat|nested] 13+ messages in thread
* [patch 3/4] percpu_ida: add an API to return free tags
2013-10-11 7:18 [patch 0/4] blk-mq: use percpu_ida to manage tags Shaohua Li
2013-10-11 7:18 ` [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable Shaohua Li
2013-10-11 7:18 ` [patch 2/4] percpu_ida: add percpu_ida_for_each_free Shaohua Li
@ 2013-10-11 7:18 ` Shaohua Li
2013-10-11 20:35 ` Kent Overstreet
2013-10-11 7:18 ` [patch 4/4] blk-mq: switch to percpu-ida for tag management Shaohua Li
3 siblings, 1 reply; 13+ messages in thread
From: Shaohua Li @ 2013-10-11 7:18 UTC (permalink / raw)
To: linux-kernel, axboe; +Cc: kmo
[-- Attachment #1: 0003-percpu_ida-add-an-API-to-return-free-tags.patch --]
[-- Type: text/plain, Size: 1585 bytes --]
add an API to return free tags, blk-mq-tag will use it
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
include/linux/percpu_ida.h | 1 +
lib/percpu_ida.c | 17 +++++++++++++++++
2 files changed, 18 insertions(+)
Index: master/include/linux/percpu_ida.h
===================================================================
--- master.orig/include/linux/percpu_ida.h 2013-10-11 12:15:06.996416710 +0800
+++ master/include/linux/percpu_ida.h 2013-10-11 12:15:06.992416749 +0800
@@ -76,4 +76,5 @@ static inline int percpu_ida_init(struct
int percpu_ida_for_each_free(struct percpu_ida *pool,
int (*fn)(int id, void *data), void *data);
+unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu);
#endif /* __PERCPU_IDA_H__ */
Index: master/lib/percpu_ida.c
===================================================================
--- master.orig/lib/percpu_ida.c 2013-10-11 12:15:06.996416710 +0800
+++ master/lib/percpu_ida.c 2013-10-11 12:15:06.992416749 +0800
@@ -371,3 +371,20 @@ out:
return err;
}
EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
+
+/**
+ * percpu_ida_free_tags - return the number of free tags of a specific cpu or the global pool
+ * @pool: pool related
+ * @cpu: specific cpu or global pool if @cpu == nr_cpu_ids
+ *
+ * Note: this just returns a snapshot of free tags number.
+ */
+unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu)
+{
+ struct percpu_ida_cpu *remote;
+ if (cpu == nr_cpu_ids)
+ return pool->nr_free;
+ remote = per_cpu_ptr(pool->tag_cpu, cpu);
+ return remote->nr_free;
+}
+EXPORT_SYMBOL_GPL(percpu_ida_free_tags);
^ permalink raw reply [flat|nested] 13+ messages in thread
* [patch 4/4] blk-mq: switch to percpu-ida for tag management
2013-10-11 7:18 [patch 0/4] blk-mq: use percpu_ida to manage tags Shaohua Li
` (2 preceding siblings ...)
2013-10-11 7:18 ` [patch 3/4] percpu_ida: add an API to return free tags Shaohua Li
@ 2013-10-11 7:18 ` Shaohua Li
2013-10-11 14:28 ` Jens Axboe
3 siblings, 1 reply; 13+ messages in thread
From: Shaohua Li @ 2013-10-11 7:18 UTC (permalink / raw)
To: linux-kernel, axboe; +Cc: kmo
[-- Attachment #1: 0004-blk-mq-switch-to-percpu-ida-for-tag-menagement.patch --]
[-- Type: text/plain, Size: 15120 bytes --]
Using percpu-ida to manage blk-mq tags. The percpu-ida has a similar algorithm
to the blk-mq-tag. The difference is when a cpu can't allocate tags
blk-mq-tag uses ipi to purge remote cpu cache and percpu-ida directly purges
remote cpu cache. In practice (testing null_blk), the percpu-ida approach is
much faster when total tags aren't enough.
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
block/blk-mq-tag.c | 470 +++++++----------------------------------------------
1 file changed, 68 insertions(+), 402 deletions(-)
Index: master/block/blk-mq-tag.c
===================================================================
--- master.orig/block/blk-mq-tag.c 2013-10-11 12:15:17.200288445 +0800
+++ master/block/blk-mq-tag.c 2013-10-11 12:15:17.196288502 +0800
@@ -1,9 +1,6 @@
#include <linux/kernel.h>
-#include <linux/threads.h>
#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
+#include <linux/percpu_ida.h>
#include <linux/blk-mq.h>
#include "blk.h"
@@ -11,291 +8,53 @@
#include "blk-mq-tag.h"
/*
- * Per-cpu cache entries
- */
-struct blk_mq_tag_map {
- unsigned int nr_free;
- unsigned int freelist[];
-};
-
-/*
* Per tagged queue (tag address space) map
*/
struct blk_mq_tags {
unsigned int nr_tags;
- unsigned int reserved_tags;
- unsigned int batch_move;
- unsigned int max_cache;
-
- struct {
- spinlock_t lock;
- unsigned int nr_free;
- unsigned int *freelist;
- unsigned int nr_reserved;
- unsigned int *reservelist;
- struct list_head wait;
- } ____cacheline_aligned_in_smp;
-
- struct blk_mq_tag_map __percpu *free_maps;
-
- struct blk_mq_cpu_notifier cpu_notifier;
-};
+ unsigned int nr_reserved_tags;
+ unsigned int nr_batch_move;
+ unsigned int nr_max_cache;
-struct blk_mq_tag_wait {
- struct list_head list;
- struct task_struct *task;
+ struct percpu_ida free_tags;
+ struct percpu_ida reserved_tags;
};
-#define DEFINE_TAG_WAIT(name) \
- struct blk_mq_tag_wait name = { \
- .list = LIST_HEAD_INIT((name).list), \
- .task = current, \
- }
-
-static unsigned int move_tags(unsigned int *dst, unsigned int *dst_nr,
- unsigned int *src, unsigned int *src_nr,
- unsigned int nr_to_move)
-{
- nr_to_move = min(nr_to_move, *src_nr);
- *src_nr -= nr_to_move;
- memcpy(dst + *dst_nr, src + *src_nr, sizeof(int) * nr_to_move);
- *dst_nr += nr_to_move;
-
- return nr_to_move;
-}
-
-static void __wake_waiters(struct blk_mq_tags *tags, unsigned int nr)
-{
- while (nr && !list_empty(&tags->wait)) {
- struct blk_mq_tag_wait *waiter;
-
- waiter = list_entry(tags->wait.next, struct blk_mq_tag_wait,
- list);
- list_del_init(&waiter->list);
- wake_up_process(waiter->task);
- nr--;
- }
-}
-
-static void __blk_mq_tag_return(struct blk_mq_tags *tags,
- struct blk_mq_tag_map *map, unsigned int nr)
-{
- unsigned int waiters;
-
- lockdep_assert_held(&tags->lock);
-
- waiters = move_tags(tags->freelist, &tags->nr_free, map->freelist,
- &map->nr_free, nr);
- if (!list_empty(&tags->wait))
- __wake_waiters(tags, waiters);
-}
-
-static void blk_mq_tag_return(struct blk_mq_tags *tags,
- struct blk_mq_tag_map *map, unsigned int nr)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&tags->lock, flags);
- __blk_mq_tag_return(tags, map, nr);
- spin_unlock_irqrestore(&tags->lock, flags);
-}
-
-#if NR_CPUS != 1
-static void prune_cache(void *data)
-{
- struct blk_mq_tags *tags = data;
- struct blk_mq_tag_map *map;
-
- map = per_cpu_ptr(tags->free_maps, smp_processor_id());
-
- spin_lock(&tags->lock);
- __blk_mq_tag_return(tags, map, tags->batch_move);
- spin_unlock(&tags->lock);
-}
-#endif
-
-static void ipi_local_caches(struct blk_mq_tags *tags, unsigned int this_cpu)
-{
-#if NR_CPUS != 1
- cpumask_var_t ipi_mask;
- unsigned int i, total;
-
- /*
- * We could per-cpu cache this things, but overhead is probably not
- * large enough to care about it. If we fail, just punt to doing a
- * prune on all CPUs.
- */
- if (!alloc_cpumask_var(&ipi_mask, GFP_ATOMIC)) {
- smp_call_function(prune_cache, tags, 0);
- return;
- }
-
- cpumask_clear(ipi_mask);
-
- total = 0;
- for_each_online_cpu(i) {
- struct blk_mq_tag_map *map = per_cpu_ptr(tags->free_maps, i);
-
- if (!map->nr_free)
- continue;
-
- total += map->nr_free;
- cpumask_set_cpu(i, ipi_mask);
-
- if (total > tags->batch_move)
- break;
- }
-
- if (total) {
- preempt_disable();
- smp_call_function_many(ipi_mask, prune_cache, tags, 0);
- preempt_enable();
- }
-
- free_cpumask_var(ipi_mask);
-#endif
-}
-
-/*
- * Wait on a free tag, move batch to map when we have it. Returns with
- * local CPU irq flags saved in 'flags'.
- */
-static void wait_on_tags(struct blk_mq_tags *tags, struct blk_mq_tag_map **map,
- unsigned long *flags)
-{
- DEFINE_TAG_WAIT(wait);
-
- do {
- spin_lock_irqsave(&tags->lock, *flags);
-
- __set_current_state(TASK_UNINTERRUPTIBLE);
-
- if (list_empty(&wait.list))
- list_add_tail(&wait.list, &tags->wait);
-
- *map = this_cpu_ptr(tags->free_maps);
- if ((*map)->nr_free || tags->nr_free) {
- if (!(*map)->nr_free) {
- move_tags((*map)->freelist, &(*map)->nr_free,
- tags->freelist, &tags->nr_free,
- tags->batch_move);
- }
-
- if (!list_empty(&wait.list))
- list_del(&wait.list);
-
- spin_unlock(&tags->lock);
- break;
- }
-
- spin_unlock_irqrestore(&tags->lock, *flags);
- ipi_local_caches(tags, raw_smp_processor_id());
- io_schedule();
- } while (1);
-
- __set_current_state(TASK_RUNNING);
-}
-
void blk_mq_wait_for_tags(struct blk_mq_tags *tags)
{
- struct blk_mq_tag_map *map;
- unsigned long flags;
-
- ipi_local_caches(tags, raw_smp_processor_id());
- wait_on_tags(tags, &map, &flags);
- local_irq_restore(flags);
+ int tag = blk_mq_get_tag(tags, __GFP_WAIT, false);
+ blk_mq_put_tag(tags, tag);
}
bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
{
- return !tags || tags->nr_free != 0;
+ return !tags ||
+ percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0;
}
static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp)
{
- struct blk_mq_tag_map *map;
- unsigned int this_cpu;
- unsigned long flags;
- unsigned int tag;
-
- local_irq_save(flags);
- this_cpu = smp_processor_id();
- map = per_cpu_ptr(tags->free_maps, this_cpu);
-
- /*
- * Grab from local per-cpu cache, if we can
- */
- do {
- if (map->nr_free) {
- map->nr_free--;
- tag = map->freelist[map->nr_free];
- local_irq_restore(flags);
- return tag;
- }
-
- /*
- * Grab from device map, if we can
- */
- if (tags->nr_free) {
- spin_lock(&tags->lock);
- move_tags(map->freelist, &map->nr_free, tags->freelist,
- &tags->nr_free, tags->batch_move);
- spin_unlock(&tags->lock);
- continue;
- }
-
- local_irq_restore(flags);
-
- if (!(gfp & __GFP_WAIT))
- break;
-
- ipi_local_caches(tags, this_cpu);
-
- /*
- * All are busy, wait. Returns with irqs disabled again
- * and potentially new 'map' pointer.
- */
- wait_on_tags(tags, &map, &flags);
- } while (1);
+ int tag;
- return BLK_MQ_TAG_FAIL;
+ tag = percpu_ida_alloc(&tags->free_tags, gfp);
+ if (tag < 0)
+ return BLK_MQ_TAG_FAIL;
+ return tag + tags->nr_reserved_tags;
}
static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
gfp_t gfp)
{
- unsigned int tag = BLK_MQ_TAG_FAIL;
- DEFINE_TAG_WAIT(wait);
+ int tag;
- if (unlikely(!tags->reserved_tags)) {
+ if (unlikely(!tags->nr_reserved_tags)) {
WARN_ON_ONCE(1);
return BLK_MQ_TAG_FAIL;
}
- do {
- spin_lock_irq(&tags->lock);
- if (tags->nr_reserved) {
- tags->nr_reserved--;
- tag = tags->reservelist[tags->nr_reserved];
- break;
- }
-
- if (!(gfp & __GFP_WAIT))
- break;
-
- __set_current_state(TASK_UNINTERRUPTIBLE);
-
- if (list_empty(&wait.list))
- list_add_tail(&wait.list, &tags->wait);
-
- spin_unlock_irq(&tags->lock);
- io_schedule();
- } while (1);
-
- if (!list_empty(&wait.list))
- list_del(&wait.list);
-
- spin_unlock_irq(&tags->lock);
+ tag = percpu_ida_alloc(&tags->reserved_tags, gfp);
+ if (tag < 0)
+ return BLK_MQ_TAG_FAIL;
return tag;
}
@@ -309,57 +68,38 @@ unsigned int blk_mq_get_tag(struct blk_m
static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
{
- struct blk_mq_tag_map *map;
- unsigned long flags;
-
BUG_ON(tag >= tags->nr_tags);
- local_irq_save(flags);
- map = this_cpu_ptr(tags->free_maps);
-
- map->freelist[map->nr_free] = tag;
- map->nr_free++;
-
- if (map->nr_free >= tags->max_cache ||
- !list_empty_careful(&tags->wait)) {
- spin_lock(&tags->lock);
- __blk_mq_tag_return(tags, map, tags->batch_move);
- spin_unlock(&tags->lock);
- }
-
- local_irq_restore(flags);
+ percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags);
}
static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
unsigned int tag)
{
- unsigned long flags;
-
- BUG_ON(tag >= tags->reserved_tags);
-
- spin_lock_irqsave(&tags->lock, flags);
- tags->reservelist[tags->nr_reserved] = tag;
- tags->nr_reserved++;
+ BUG_ON(tag >= tags->nr_reserved_tags);
- if (!list_empty(&tags->wait))
- __wake_waiters(tags, 1);
-
- spin_unlock_irqrestore(&tags->lock, flags);
+ percpu_ida_free(&tags->reserved_tags, tag);
}
void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
{
- if (tag >= tags->reserved_tags)
+ if (tag >= tags->nr_reserved_tags)
__blk_mq_put_tag(tags, tag);
else
__blk_mq_put_reserved_tag(tags, tag);
}
+static int __blk_mq_tag_iter(int id, void *data)
+{
+ unsigned long *tag_map = data;
+ __set_bit(id, tag_map);
+ return 0;
+}
+
void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
void (*fn)(void *, unsigned long *), void *data)
{
- unsigned long flags, *tag_map;
- unsigned int i, j;
+ unsigned long *tag_map;
size_t map_size;
map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG;
@@ -367,56 +107,21 @@ void blk_mq_tag_busy_iter(struct blk_mq_
if (!tag_map)
return;
- local_irq_save(flags);
-
- for_each_online_cpu(i) {
- struct blk_mq_tag_map *map = per_cpu_ptr(tags->free_maps, i);
-
- for (j = 0; j < map->nr_free; j++)
- __set_bit(map->freelist[j], tag_map);
- }
-
- if (tags->nr_free || tags->nr_reserved) {
- spin_lock(&tags->lock);
-
- if (tags->nr_reserved)
- for (j = 0; j < tags->nr_reserved; j++)
- __set_bit(tags->reservelist[j], tag_map);
-
- if (tags->nr_free)
- for (j = 0; j < tags->nr_free; j++)
- __set_bit(tags->freelist[j], tag_map);
-
- spin_unlock(&tags->lock);
- }
-
- local_irq_restore(flags);
+ percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map);
+ if (tags->nr_reserved_tags)
+ percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter,
+ tag_map);
fn(data, tag_map);
kfree(tag_map);
}
-static void blk_mq_tag_notify(void *data, unsigned long action,
- unsigned int cpu)
-{
- /*
- * Move entries from this CPU to global pool
- */
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- struct blk_mq_tags *tags = data;
- struct blk_mq_tag_map *map = per_cpu_ptr(tags->free_maps, cpu);
-
- if (map->nr_free)
- blk_mq_tag_return(tags, map, map->nr_free);
- }
-}
-
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
unsigned int reserved_tags, int node)
{
unsigned int nr_tags, nr_cache;
struct blk_mq_tags *tags;
- size_t map_size;
+ int ret;
if (total_tags > BLK_MQ_TAG_MAX) {
pr_err("blk-mq: tag depth too large\n");
@@ -435,104 +140,65 @@ struct blk_mq_tags *blk_mq_init_tags(uns
else if (nr_cache > BLK_MQ_TAG_CACHE_MAX)
nr_cache = BLK_MQ_TAG_CACHE_MAX;
- map_size = sizeof(struct blk_mq_tag_map) + nr_cache * sizeof(int);
- tags->free_maps = __alloc_percpu(map_size, sizeof(void *));
- if (!tags->free_maps)
- goto err_free_maps;
-
- tags->freelist = kmalloc_node(sizeof(int) * nr_tags, GFP_KERNEL, node);
- if (!tags->freelist)
- goto err_freelist;
-
- if (reserved_tags) {
- tags->reservelist = kmalloc_node(sizeof(int) * reserved_tags,
- GFP_KERNEL, node);
- if (!tags->reservelist)
- goto err_reservelist;
- }
-
- spin_lock_init(&tags->lock);
- INIT_LIST_HEAD(&tags->wait);
tags->nr_tags = total_tags;
- tags->reserved_tags = reserved_tags;
- tags->max_cache = nr_cache;
- tags->batch_move = max(1u, nr_cache / 2);
-
- /*
- * Reserved tags are first
- */
- if (reserved_tags) {
- tags->nr_reserved = 0;
- while (reserved_tags--) {
- tags->reservelist[tags->nr_reserved] =
- tags->nr_reserved;
- tags->nr_reserved++;
- }
- }
+ tags->nr_reserved_tags = reserved_tags;
+ tags->nr_max_cache = nr_cache;
+ tags->nr_batch_move = max(1u, nr_cache / 2);
+
+ ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags -
+ tags->nr_reserved_tags,
+ tags->nr_max_cache,
+ tags->nr_batch_move);
+ if (ret)
+ goto err_free_tags;
- /*
- * Rest of the tags start at the queue list
- */
- tags->nr_free = 0;
- while (nr_tags--) {
- tags->freelist[tags->nr_free] = tags->nr_free +
- tags->nr_reserved;
- tags->nr_free++;
+ if (reserved_tags) {
+ /*
+ * With max_cache and batch set to 1, the allocator falls back to
+ * no caching. It's fine that reserved tags allocation is slow.
+ */
+ ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags,
+ 1, 1);
+ if (ret)
+ goto err_reserved_tags;
}
- blk_mq_init_cpu_notifier(&tags->cpu_notifier, blk_mq_tag_notify, tags);
- blk_mq_register_cpu_notifier(&tags->cpu_notifier);
return tags;
-err_reservelist:
- kfree(tags->freelist);
-err_freelist:
- free_percpu(tags->free_maps);
-err_free_maps:
+err_reserved_tags:
+ percpu_ida_destroy(&tags->free_tags);
+err_free_tags:
kfree(tags);
return NULL;
}
void blk_mq_free_tags(struct blk_mq_tags *tags)
{
- blk_mq_unregister_cpu_notifier(&tags->cpu_notifier);
- free_percpu(tags->free_maps);
- kfree(tags->freelist);
- kfree(tags->reservelist);
+ percpu_ida_destroy(&tags->free_tags);
+ percpu_ida_destroy(&tags->reserved_tags);
kfree(tags);
}
ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
{
char *orig_page = page;
- unsigned long flags;
- struct list_head *tmp;
- int waiters;
int cpu;
if (!tags)
return 0;
- spin_lock_irqsave(&tags->lock, flags);
-
page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u,"
- " max_cache=%u\n", tags->nr_tags, tags->reserved_tags,
- tags->batch_move, tags->max_cache);
-
- waiters = 0;
- list_for_each(tmp, &tags->wait)
- waiters++;
-
- page += sprintf(page, "nr_free=%u, nr_reserved=%u, waiters=%u\n",
- tags->nr_free, tags->nr_reserved, waiters);
+ " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags,
+ tags->nr_batch_move, tags->nr_max_cache);
- for_each_online_cpu(cpu) {
- struct blk_mq_tag_map *map = per_cpu_ptr(tags->free_maps, cpu);
+ page += sprintf(page, "nr_free=%u, nr_reserved=%u\n",
+ percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids),
+ percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids));
+ for_each_possible_cpu(cpu) {
page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu,
- map->nr_free);
+ percpu_ida_free_tags(&tags->free_tags, cpu));
}
- spin_unlock_irqrestore(&tags->lock, flags);
return page - orig_page;
}
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [patch 4/4] blk-mq: switch to percpu-ida for tag management
2013-10-11 7:18 ` [patch 4/4] blk-mq: switch to percpu-ida for tag management Shaohua Li
@ 2013-10-11 14:28 ` Jens Axboe
2013-10-12 0:49 ` Shaohua Li
0 siblings, 1 reply; 13+ messages in thread
From: Jens Axboe @ 2013-10-11 14:28 UTC (permalink / raw)
To: Shaohua Li, linux-kernel; +Cc: kmo
On 10/11/2013 01:18 AM, Shaohua Li wrote:
> Using percpu-ida to manage blk-mq tags. the percpu-ida has similar algorithm
> like the blk-mq-tag. The difference is when a cpu can't allocate tags
> blk-mq-tag uses ipi to purge remote cpu cache and percpu-ida directly purges
> remote cpu cache. In practice (testing null_blk), the percpu-ida approach is
> much faster when total tags aren't enough.
I'm not surprised it's a lot faster for the pathological case of needing
to prune tags, the IPI isn't near ideal for that. I'm assuming the
general performance is the same for the non-full case?
--
Jens Axboe
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable
2013-10-11 7:18 ` [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable Shaohua Li
@ 2013-10-11 20:31 ` Kent Overstreet
2013-10-12 0:52 ` Shaohua Li
0 siblings, 1 reply; 13+ messages in thread
From: Kent Overstreet @ 2013-10-11 20:31 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-kernel, axboe
On Fri, Oct 11, 2013 at 03:18:03PM +0800, Shaohua Li wrote:
> Make percpu_ida percpu size/batch configurable. The block-mq-tag will use it.
Can you explain the justification for this? Performance...?
> Signed-off-by: Shaohua Li <shli@fusionio.com>
> ---
> include/linux/percpu_ida.h | 18 +++++++++++++++++-
> lib/percpu_ida.c | 28 +++++++++++-----------------
> 2 files changed, 28 insertions(+), 18 deletions(-)
>
> Index: master/include/linux/percpu_ida.h
> ===================================================================
> --- master.orig/include/linux/percpu_ida.h 2013-10-11 12:14:44.472699999 +0800
> +++ master/include/linux/percpu_ida.h 2013-10-11 12:14:44.472699999 +0800
> @@ -16,6 +16,8 @@ struct percpu_ida {
> * percpu_ida_init()
> */
> unsigned nr_tags;
> + unsigned percpu_max_size;
> + unsigned percpu_batch_size;
>
> struct percpu_ida_cpu __percpu *tag_cpu;
>
> @@ -51,10 +53,24 @@ struct percpu_ida {
> } ____cacheline_aligned_in_smp;
> };
>
> +/*
> + * Number of tags we move between the percpu freelist and the global freelist at
> + * a time
> + */
> +#define IDA_DEFAULT_PCPU_BATCH_MOVE 32U
> +/* Max size of percpu freelist, */
> +#define IDA_DEFAULT_PCPU_SIZE ((IDA_DEFAULT_PCPU_BATCH_MOVE * 3) / 2)
> +
> int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp);
> void percpu_ida_free(struct percpu_ida *pool, unsigned tag);
>
> void percpu_ida_destroy(struct percpu_ida *pool);
> -int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags);
> +int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,
> + unsigned long max_size, unsigned long batch_size);
> +static inline int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
> +{
> + return __percpu_ida_init(pool, nr_tags, IDA_DEFAULT_PCPU_SIZE,
> + IDA_DEFAULT_PCPU_BATCH_MOVE);
> +}
>
> #endif /* __PERCPU_IDA_H__ */
> Index: master/lib/percpu_ida.c
> ===================================================================
> --- master.orig/lib/percpu_ida.c 2013-10-11 12:14:44.472699999 +0800
> +++ master/lib/percpu_ida.c 2013-10-11 12:14:44.472699999 +0800
> @@ -30,15 +30,6 @@
> #include <linux/spinlock.h>
> #include <linux/percpu_ida.h>
>
> -/*
> - * Number of tags we move between the percpu freelist and the global freelist at
> - * a time
> - */
> -#define IDA_PCPU_BATCH_MOVE 32U
> -
> -/* Max size of percpu freelist, */
> -#define IDA_PCPU_SIZE ((IDA_PCPU_BATCH_MOVE * 3) / 2)
> -
> struct percpu_ida_cpu {
> /*
> * Even though this is percpu, we need a lock for tag stealing by remote
> @@ -78,7 +69,7 @@ static inline void steal_tags(struct per
> struct percpu_ida_cpu *remote;
>
> for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags);
> - cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2;
> + cpus_have_tags * pool->percpu_max_size > pool->nr_tags / 2;
> cpus_have_tags--) {
> cpu = cpumask_next(cpu, &pool->cpus_have_tags);
>
> @@ -123,7 +114,7 @@ static inline void alloc_global_tags(str
> {
> move_tags(tags->freelist, &tags->nr_free,
> pool->freelist, &pool->nr_free,
> - min(pool->nr_free, IDA_PCPU_BATCH_MOVE));
> + min(pool->nr_free, pool->percpu_batch_size));
> }
>
> static inline unsigned alloc_local_tag(struct percpu_ida *pool,
> @@ -245,17 +236,17 @@ void percpu_ida_free(struct percpu_ida *
> wake_up(&pool->wait);
> }
>
> - if (nr_free == IDA_PCPU_SIZE) {
> + if (nr_free == pool->percpu_max_size) {
> spin_lock(&pool->lock);
>
> /*
> * Global lock held and irqs disabled, don't need percpu
> * lock
> */
> - if (tags->nr_free == IDA_PCPU_SIZE) {
> + if (tags->nr_free == pool->percpu_max_size) {
> move_tags(pool->freelist, &pool->nr_free,
> tags->freelist, &tags->nr_free,
> - IDA_PCPU_BATCH_MOVE);
> + pool->percpu_batch_size);
>
> wake_up(&pool->wait);
> }
> @@ -292,7 +283,8 @@ EXPORT_SYMBOL_GPL(percpu_ida_destroy);
> * Allocation is percpu, but sharding is limited by nr_tags - for best
> * performance, the workload should not span more cpus than nr_tags / 128.
> */
> -int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
> +int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,
> + unsigned long max_size, unsigned long batch_size)
> {
> unsigned i, cpu, order;
>
> @@ -301,6 +293,8 @@ int percpu_ida_init(struct percpu_ida *p
> init_waitqueue_head(&pool->wait);
> spin_lock_init(&pool->lock);
> pool->nr_tags = nr_tags;
> + pool->percpu_max_size = max_size;
> + pool->percpu_batch_size = batch_size;
>
> /* Guard against overflow */
> if (nr_tags > (unsigned) INT_MAX + 1) {
> @@ -319,7 +313,7 @@ int percpu_ida_init(struct percpu_ida *p
> pool->nr_free = nr_tags;
>
> pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) +
> - IDA_PCPU_SIZE * sizeof(unsigned),
> + pool->percpu_max_size * sizeof(unsigned),
> sizeof(unsigned));
> if (!pool->tag_cpu)
> goto err;
> @@ -332,4 +326,4 @@ err:
> percpu_ida_destroy(pool);
> return -ENOMEM;
> }
> -EXPORT_SYMBOL_GPL(percpu_ida_init);
> +EXPORT_SYMBOL_GPL(__percpu_ida_init);
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [patch 2/4] percpu_ida: add percpu_ida_for_each_free
2013-10-11 7:18 ` [patch 2/4] percpu_ida: add percpu_ida_for_each_free Shaohua Li
@ 2013-10-11 20:34 ` Kent Overstreet
0 siblings, 0 replies; 13+ messages in thread
From: Kent Overstreet @ 2013-10-11 20:34 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-kernel, axboe
On Fri, Oct 11, 2013 at 03:18:04PM +0800, Shaohua Li wrote:
> Add a new API to iterate free ids. blk-mq-tag will use it.
>
> Signed-off-by: Shaohua Li <shli@fusionio.com>
> ---
> include/linux/percpu_ida.h | 3 +++
> lib/percpu_ida.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 47 insertions(+)
>
> Index: master/include/linux/percpu_ida.h
> ===================================================================
> --- master.orig/include/linux/percpu_ida.h 2013-10-11 12:14:56.932543376 +0800
> +++ master/include/linux/percpu_ida.h 2013-10-11 12:14:56.928543501 +0800
> @@ -73,4 +73,7 @@ static inline int percpu_ida_init(struct
> IDA_DEFAULT_PCPU_BATCH_MOVE);
> }
>
> +int percpu_ida_for_each_free(struct percpu_ida *pool,
> + int (*fn)(int id, void *data), void *data);
> +
> #endif /* __PERCPU_IDA_H__ */
> Index: master/lib/percpu_ida.c
> ===================================================================
> --- master.orig/lib/percpu_ida.c 2013-10-11 12:14:56.932543376 +0800
> +++ master/lib/percpu_ida.c 2013-10-11 12:14:56.928543501 +0800
> @@ -327,3 +327,47 @@ err:
> return -ENOMEM;
> }
> EXPORT_SYMBOL_GPL(__percpu_ida_init);
> +
> +/**
> + * percpu_ida_for_each_free - iterate free ids of a pool
> + * @pool: pool to iterate
> + * @fn: iterate callback function
> + * @data: parameter for @fn
> + *
> + * Note, this doesn't guarantee to iterate all free ids strictly. Some free
> + * ids might be missed, some might be iterated twice, and some might
> + * not be free when iterated.
> + */
> +int percpu_ida_for_each_free(struct percpu_ida *pool,
> + int (*fn)(int id, void *data), void *data)
I'd prefer to make the id parameter unsigned - and use a typedef for the
function pointer argument - but other than that, looks reasonable to me.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [patch 3/4] percpu_ida: add an API to return free tags
2013-10-11 7:18 ` [patch 3/4] percpu_ida: add an API to return free tags Shaohua Li
@ 2013-10-11 20:35 ` Kent Overstreet
2013-10-12 1:02 ` Shaohua Li
0 siblings, 1 reply; 13+ messages in thread
From: Kent Overstreet @ 2013-10-11 20:35 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-kernel, axboe
On Fri, Oct 11, 2013 at 03:18:05PM +0800, Shaohua Li wrote:
> add an API to return free tags, blk-mq-tag will use it
Can you explain how this is going to be used? Seems like something that
could be prone to misuse, try and convince me there isn't a better way
to do what it's for.
>
> Signed-off-by: Shaohua Li <shli@fusionio.com>
> ---
> include/linux/percpu_ida.h | 1 +
> lib/percpu_ida.c | 17 +++++++++++++++++
> 2 files changed, 18 insertions(+)
>
> Index: master/include/linux/percpu_ida.h
> ===================================================================
> --- master.orig/include/linux/percpu_ida.h 2013-10-11 12:15:06.996416710 +0800
> +++ master/include/linux/percpu_ida.h 2013-10-11 12:15:06.992416749 +0800
> @@ -76,4 +76,5 @@ static inline int percpu_ida_init(struct
> int percpu_ida_for_each_free(struct percpu_ida *pool,
> int (*fn)(int id, void *data), void *data);
>
> +unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu);
> #endif /* __PERCPU_IDA_H__ */
> Index: master/lib/percpu_ida.c
> ===================================================================
> --- master.orig/lib/percpu_ida.c 2013-10-11 12:15:06.996416710 +0800
> +++ master/lib/percpu_ida.c 2013-10-11 12:15:06.992416749 +0800
> @@ -371,3 +371,20 @@ out:
> return err;
> }
> EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
> +
> +/**
> + * percpu_ida_free_tags - return the number of free tags of a specific cpu or the global pool
> + * @pool: the pool to query
> + * @cpu: specific cpu, or the global pool if @cpu == nr_cpu_ids
> + *
> + * Note: this just returns a snapshot of the number of free tags.
> + */
> +unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu)
> +{
> + struct percpu_ida_cpu *remote;
> + if (cpu == nr_cpu_ids)
> + return pool->nr_free;
> + remote = per_cpu_ptr(pool->tag_cpu, cpu);
> + return remote->nr_free;
> +}
> +EXPORT_SYMBOL_GPL(percpu_ida_free_tags);
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [patch 4/4] blk-mq: switch to percpu-ida for tag management
2013-10-11 14:28 ` Jens Axboe
@ 2013-10-12 0:49 ` Shaohua Li
2013-10-13 18:26 ` Jens Axboe
0 siblings, 1 reply; 13+ messages in thread
From: Shaohua Li @ 2013-10-12 0:49 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-kernel, kmo
On Fri, Oct 11, 2013 at 08:28:54AM -0600, Jens Axboe wrote:
> On 10/11/2013 01:18 AM, Shaohua Li wrote:
> > Using percpu-ida to manage blk-mq tags. The percpu-ida has a similar algorithm
> > to the blk-mq-tag. The difference is when a cpu can't allocate tags
> > blk-mq-tag uses ipi to purge remote cpu cache and percpu-ida directly purges
> > remote cpu cache. In practice (testing null_blk), the percpu-ida approach is
> > much faster when total tags aren't enough.
>
> I'm not surprised it's a lot faster in the pathological case of needing
> to prune tags, the IPI isn't near ideal for that. I'm assuming the
> general performance is the same for the non-full case?
Yep. My test is done in a 2-socket machine, 12 processes across the 2 sockets. So
if there is lock contention or ipi, it should be stressed heavily. Testing is done
for null-blk.
hw_queue_depth nopatch iops patch iops
64 ~800k/s ~1470k/s
2048 ~4470k/s ~4340k/s
In the 2048 case, perf doesn't show any percpu-ida function is hot (none
uses > 1% cpu time), so the small difference should be drift. So yes, the
general performance is the same.
Thanks,
Shaohua
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable
2013-10-11 20:31 ` Kent Overstreet
@ 2013-10-12 0:52 ` Shaohua Li
0 siblings, 0 replies; 13+ messages in thread
From: Shaohua Li @ 2013-10-12 0:52 UTC (permalink / raw)
To: Kent Overstreet; +Cc: linux-kernel, axboe
On Fri, Oct 11, 2013 at 01:31:52PM -0700, Kent Overstreet wrote:
> On Fri, Oct 11, 2013 at 03:18:03PM +0800, Shaohua Li wrote:
> > Make percpu_ida percpu size/batch configurable. The block-mq-tag will use it.
>
> Can you explain the justification for this? Performance...?
The performance using percpu_ida for tag management? Please see the patch 4
thread. If you ask the justification of making the size/batch configurable,
the answer is we might not have enough tags, and the default cache size/batch
is too aggressive.
Thanks,
Shaohua
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [patch 3/4] percpu_ida: add an API to return free tags
2013-10-11 20:35 ` Kent Overstreet
@ 2013-10-12 1:02 ` Shaohua Li
0 siblings, 0 replies; 13+ messages in thread
From: Shaohua Li @ 2013-10-12 1:02 UTC (permalink / raw)
To: Kent Overstreet; +Cc: linux-kernel, axboe
On Fri, Oct 11, 2013 at 01:35:36PM -0700, Kent Overstreet wrote:
> On Fri, Oct 11, 2013 at 03:18:05PM +0800, Shaohua Li wrote:
> > add an API to return free tags, blk-mq-tag will use it
>
> Can you explain how this is going to be used? Seems like something that
> could be prone to misuse, try and convince me there isn't a better way
> to do what it's for.
There are two usages. One is for sysfs info output, which can be used for
diagnosis. The other one is blk-mq wants to determine if it's possible a
request can be queued. Neither requires precise data. Yes, caller should be
aware returned data isn't very precise.
Thanks,
Shaohua
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [patch 4/4] blk-mq: switch to percpu-ida for tag management
2013-10-12 0:49 ` Shaohua Li
@ 2013-10-13 18:26 ` Jens Axboe
0 siblings, 0 replies; 13+ messages in thread
From: Jens Axboe @ 2013-10-13 18:26 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-kernel, kmo
On Sat, Oct 12 2013, Shaohua Li wrote:
> On Fri, Oct 11, 2013 at 08:28:54AM -0600, Jens Axboe wrote:
> > On 10/11/2013 01:18 AM, Shaohua Li wrote:
> > > Using percpu-ida to manage blk-mq tags. The percpu-ida has a similar algorithm
> > > to the blk-mq-tag. The difference is when a cpu can't allocate tags
> > > blk-mq-tag uses ipi to purge remote cpu cache and percpu-ida directly purges
> > > remote cpu cache. In practice (testing null_blk), the percpu-ida approach is
> > > much faster when total tags aren't enough.
> >
> > I'm not surprised it's a lot faster in the pathological case of needing
> > to prune tags, the IPI isn't near ideal for that. I'm assuming the
> > general performance is the same for the non-full case?
>
> Yep. My test is done in a 2-socket machine, 12 processes across the 2 sockets. So
> if there is lock contention or ipi, it should be stressed heavily. Testing is done
> for null-blk.
>
> hw_queue_depth nopatch iops patch iops
> 64 ~800k/s ~1470k/s
> 2048 ~4470k/s ~4340k/s
>
> In the 2048 case, perf doesn't show any percpu-ida function is hot (none
> uses > 1% cpu time), so the small difference should be drift. So yes, the
> general performance is the same.
Yep, that definitely looks a lot prettier. Thanks! I've had this on the
TODO for a while, since the ida got fixed up.
--
Jens Axboe
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2013-10-13 18:26 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-10-11 7:18 [patch 0/4] blk-mq: use percpu_ida to manage tags Shaohua Li
2013-10-11 7:18 ` [patch 1/4] percpu_ida: make percpu_ida percpu size/batch configurable Shaohua Li
2013-10-11 20:31 ` Kent Overstreet
2013-10-12 0:52 ` Shaohua Li
2013-10-11 7:18 ` [patch 2/4] percpu_ida: add percpu_ida_for_each_free Shaohua Li
2013-10-11 20:34 ` Kent Overstreet
2013-10-11 7:18 ` [patch 3/4] percpu_ida: add an API to return free tags Shaohua Li
2013-10-11 20:35 ` Kent Overstreet
2013-10-12 1:02 ` Shaohua Li
2013-10-11 7:18 ` [patch 4/4] blk-mq: switch to percpu-ida for tag management Shaohua Li
2013-10-11 14:28 ` Jens Axboe
2013-10-12 0:49 ` Shaohua Li
2013-10-13 18:26 ` Jens Axboe
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox