* [patch] reduce IPI noise due to /dev/cdrom open/close
@ 2006-07-03 15:33 Jes Sorensen
2006-07-03 15:37 ` Milton Miller
` (2 more replies)
0 siblings, 3 replies; 23+ messages in thread
From: Jes Sorensen @ 2006-07-03 15:33 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Andrew Morton, Alexander Viro, linux-kernel
Hi,
Certain applications cause a lot of IPI noise due to them constantly
doing open/close on /dev/cdrom. hald is a particularly annoying case
of this. However since every distribution insists on shipping it, it's
one of those that are hard to get rid of :(
Anyway, this patch reduces the IPI noise by keeping a cpumask of CPUs
which have items in the bh lru and only flushing on the relevant
CPUs. On systems with larger CPU counts it's quite normal that only a
few CPUs are actively doing block IO, so spewing IPIs everywhere to
flush this is unnecessary.
I also switched the code to use schedule_on_each_cpu() as suggested by
Andrew, and I made the API there more flexible by introducing
schedule_on_each_cpu_mask().
Cheers,
Jes
Introduce more flexible schedule_on_each_cpu_mask() API allowing one
to specify a CPU mask to schedule on and implement
schedule_on_each_cpu_mask() on top of it.
Use a cpumask to keep track of which CPUs have items in the per CPU
buffer_head lru. Let invalidate_bh_lrus() use the new cpumask API to
limit the number of cross-CPU calls when a block device is
open/closed.
This significantly reduces IPI noise on large CPU number systems which
are running hald.
Signed-off-by: Jes Sorensen <jes@sgi.com>
---
fs/buffer.c | 23 +++++++++++++++++++----
include/linux/workqueue.h | 3 +++
kernel/workqueue.c | 31 +++++++++++++++++++++++++++----
3 files changed, 49 insertions(+), 8 deletions(-)
Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c
+++ linux-2.6/fs/buffer.c
@@ -1323,6 +1323,7 @@ struct bh_lru {
};
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
+static cpumask_t lru_in_use;
#ifdef CONFIG_SMP
#define bh_lru_lock() local_irq_disable()
@@ -1352,9 +1353,14 @@ static void bh_lru_install(struct buffer
lru = &__get_cpu_var(bh_lrus);
if (lru->bhs[0] != bh) {
struct buffer_head *bhs[BH_LRU_SIZE];
- int in;
- int out = 0;
+ int in, out, cpu;
+ cpu = raw_smp_processor_id();
+ /* Test first to avoid cache lines bouncing around */
+ if (!cpu_isset(cpu, lru_in_use))
+ cpu_set(cpu, lru_in_use);
+
+ out = 0;
get_bh(bh);
bhs[out++] = bh;
for (in = 0; in < BH_LRU_SIZE; in++) {
@@ -1500,19 +1506,28 @@ EXPORT_SYMBOL(__bread);
*/
static void invalidate_bh_lru(void *arg)
{
- struct bh_lru *b = &get_cpu_var(bh_lrus);
+ struct bh_lru *b;
int i;
+ local_irq_disable();
+ b = &get_cpu_var(bh_lrus);
for (i = 0; i < BH_LRU_SIZE; i++) {
brelse(b->bhs[i]);
b->bhs[i] = NULL;
}
put_cpu_var(bh_lrus);
+ local_irq_enable();
}
static void invalidate_bh_lrus(void)
{
- on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
+ /*
+ * Need to hand down a copy of the mask or we wouldn't be run
+ * anywhere due to the original mask being cleared
+ */
+ cpumask_t mask = lru_in_use;
+ cpus_clear(lru_in_use);
+ schedule_on_each_cpu_mask(invalidate_bh_lru, NULL, mask);
}
void set_bh_page(struct buffer_head *bh,
Index: linux-2.6/include/linux/workqueue.h
===================================================================
--- linux-2.6.orig/include/linux/workqueue.h
+++ linux-2.6/include/linux/workqueue.h
@@ -8,6 +8,7 @@
#include <linux/timer.h>
#include <linux/linkage.h>
#include <linux/bitops.h>
+#include <linux/cpumask.h>
struct workqueue_struct;
@@ -70,6 +71,8 @@ extern int FASTCALL(schedule_delayed_wor
extern int schedule_delayed_work_on(int cpu, struct work_struct *work, unsigned long delay);
extern int schedule_on_each_cpu(void (*func)(void *info), void *info);
+extern int schedule_on_each_cpu_mask(void (*func)(void *info),
+ void *info, cpumask_t mask);
extern void flush_scheduled_work(void);
extern int current_is_keventd(void);
extern int keventd_up(void);
Index: linux-2.6/kernel/workqueue.c
===================================================================
--- linux-2.6.orig/kernel/workqueue.c
+++ linux-2.6/kernel/workqueue.c
@@ -429,9 +429,11 @@ int schedule_delayed_work_on(int cpu,
}
/**
- * schedule_on_each_cpu - call a function on each online CPU from keventd
+ * schedule_on_each_cpu_mask - call a function on each online CPU in the
+ * mask from keventd
* @func: the function to call
* @info: a pointer to pass to func()
+ * @mask: a cpumask_t of CPUs to schedule on
*
* Returns zero on success.
* Returns -ve errno on failure.
@@ -440,7 +442,8 @@ int schedule_delayed_work_on(int cpu,
*
* schedule_on_each_cpu() is very slow.
*/
-int schedule_on_each_cpu(void (*func)(void *info), void *info)
+int
+schedule_on_each_cpu_mask(void (*func)(void *info), void *info, cpumask_t mask)
{
int cpu;
struct work_struct *works;
@@ -451,14 +454,34 @@ int schedule_on_each_cpu(void (*func)(vo
for_each_online_cpu(cpu) {
INIT_WORK(per_cpu_ptr(works, cpu), func, info);
- __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
- per_cpu_ptr(works, cpu));
+ if (cpu_isset(cpu, mask))
+ __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
+ per_cpu_ptr(works, cpu));
}
flush_workqueue(keventd_wq);
free_percpu(works);
return 0;
}
+/**
+ * schedule_on_each_cpu_mask - call a function on each online CPU from keventd
+ *
+ * @func: the function to call
+ * @info: a pointer to pass to func()
+ *
+ * Returns zero on success.
+ * Returns -ve errno on failure.
+ *
+ * Appears to be racy against CPU hotplug.
+ *
+ * schedule_on_each_cpu() is very slow.
+ */
+int
+schedule_on_each_cpu(void (*func)(void *info), void *info)
+{
+ return schedule_on_each_cpu_mask(func, info, CPU_MASK_ALL);
+}
+
void flush_scheduled_work(void)
{
flush_workqueue(keventd_wq);
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH] simplify bh_lru_install
2006-07-03 15:33 [patch] reduce IPI noise due to /dev/cdrom open/close Jes Sorensen
2006-07-03 15:37 ` Milton Miller
@ 2006-07-03 15:37 ` Milton Miller
2006-07-03 15:37 ` Milton Miller
2006-07-04 5:32 ` [patch] reduce IPI noise due to /dev/cdrom open/close Keith Owens
2 siblings, 1 reply; 23+ messages in thread
From: Milton Miller @ 2006-07-03 15:37 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-kernel, Andrew Morton, Jes Sorensen
While looking at Jes' patch "reduce IPI noise due to /dev/cdrom open/close"
I took a look at what the bh_lru code was doing.
* The LRU management algorithm is dopey-but-simple. Sorry.
Umm.. yes.
That in and out loop caused me to stare a bit.
lru_lookup_install, aka adding to lru cache, is building a second
array on the stack and then calling memcpy at the end to replace
the data. Sure its in cache, but we already did all the loads
and stores once.
The in and out arrays are always the same length, and we only allow
an entry in once. So we know that we either push all down, and the
last one is the evictee, or we find the entry in there previously
and free that slot, leaving a copyloop.
But I'm sure the compiler can't determine that, because it doesn't
know about the insert-uniquely assertion.
It also has a special case for when it's already at the top. But since
we did a lookup before __find_get_block_slow, that is surely a rare
special case.
Maybe it was designed to be called on every lookup.
The lookup case does a move to top pulling the previous entries down
one by one to the former spot, and inserting at the top, which is
sane. Why not do a push down here?
And while here, we can stop if we hit a NULL entry too, we will not
have any stragglers underneath.
Here is a totally untested patch. Well, it compiles. In the for
loop next is also assigned to bh for the bh = NULL case the compiler
pointed out.
Signed-off-by: Milton Miller <miltonm@bga.com>
--- linux-2.6.17/fs/buffer.c.orig 2006-07-04 00:04:16.000000000 -0400
+++ linux-2.6.17/fs/buffer.c 2006-07-04 00:52:01.000000000 -0400
@@ -1347,41 +1347,35 @@ static inline void check_irqs_on(void)
*/
static void bh_lru_install(struct buffer_head *bh)
{
- struct buffer_head *evictee = NULL;
+ struct buffer_head *next, *prev;
struct bh_lru *lru;
+ int i;
check_irqs_on();
bh_lru_lock();
lru = &__get_cpu_var(bh_lrus);
- if (lru->bhs[0] != bh) {
- struct buffer_head *bhs[BH_LRU_SIZE];
- int in;
- int out = 0;
- get_bh(bh);
- bhs[out++] = bh;
- for (in = 0; in < BH_LRU_SIZE; in++) {
- struct buffer_head *bh2 = lru->bhs[in];
-
- if (bh2 == bh) {
- __brelse(bh2);
- } else {
- if (out >= BH_LRU_SIZE) {
- BUG_ON(evictee != NULL);
- evictee = bh2;
- } else {
- bhs[out++] = bh2;
- }
- }
- }
- while (out < BH_LRU_SIZE)
- bhs[out++] = NULL;
- memcpy(lru->bhs, bhs, sizeof(bhs));
+ /* Push down, looking for duplicate as we go. last is freed */
+ for (i = 0, next = prev = bh; i < BH_LRU_SIZE && prev;
+ i++, prev = next) {
+ next = lru->bhs[i];
+ if (unlikely(next == bh))
+ break;
+ lru->bhs[i] = prev;
+ }
+
+#ifdef DEBUG
+ /* ++i to avoid finding the entry we know */
+ for (++i; i < BH_LRU_SIZE; i++) {
+ BUG_ON(lru->bhs[i] == bh);
+ if (!next)
+ BUG_ON(lru->bhs[i]);
}
+#endif
+
bh_lru_unlock();
- if (evictee)
- __brelse(evictee);
+ brelse(next);
}
/*
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-03 15:33 [patch] reduce IPI noise due to /dev/cdrom open/close Jes Sorensen
@ 2006-07-03 15:37 ` Milton Miller
2006-07-04 7:47 ` Jes Sorensen
2006-07-03 15:37 ` [PATCH] simplify bh_lru_install Milton Miller
2006-07-04 5:32 ` [patch] reduce IPI noise due to /dev/cdrom open/close Keith Owens
2 siblings, 1 reply; 23+ messages in thread
From: Milton Miller @ 2006-07-03 15:37 UTC (permalink / raw)
To: Jes Sorensen, Jens Axboe; +Cc: linux-kernel, Andrew Morton
On Mon Jul 03 2006 - 11:37:43 EST, Jes Sorensen wrote:
> Certain applications cause a lot of IPI noise due to them constantly
> doing open/close on /dev/cdrom. hald is a particularly annoying case
> of this. However since every distribution insists on shipping it, it's
> one of those that are hard to get rid of :(
>
> Anyway, this patch reduces the IPI noise by keeping a cpumask of CPUs
> which have items in the bh lru and only flushing on the relevant
> CPUs. On systems with larger CPU counts it's quite normal that only a
> few CPUs are actively doing block IO, so spewing IPIs everywhere to
> flush this is unnecessary.
>
Ok we are optimizing for the low but not zero traffic case ...
> + cpu = raw_smp_processor_id();
> + /* Test first to avoid cache lines bouncing around */
> + if (!cpu_isset(cpu, lru_in_use))
> + cpu_set(cpu, lru_in_use);
Being set when the local array is filled, under local locking only, but it's
a set_bit and ok.
> static void invalidate_bh_lrus(void)
> {
> - on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
> + /*
> + * Need to hand down a copy of the mask or we wouldn't be run
> + * anywhere due to the original mask being cleared
> + */
> + cpumask_t mask = lru_in_use;
> + cpus_clear(lru_in_use);
> + schedule_on_each_cpu_mask(invalidate_bh_lru, NULL, mask);
> }
But that is totally racy!
Another cpu could set its bit between the assignment to mask and
the call to cpus_clear.
Which means we end up with cpus holding a bh in their lru but no
idea which ones.
Unfortunately clearing the bit in the callback means we pass the cpu
mask around twice (once to clear, and later to set as we start
freeing bhs again).
Although that is probably not much worse than scanning other cpus'
per-cpu data for NULL (and I would probably just scan 8 pointers
rather than add another per-cpu something is cached flag).
I don't like the idea of invalidate_bdev (say due to openers going
to zero) running against one device causing a buffer to be left
artificially busy on another device, causing a page to be left
around.
If you want to cut down on the cache line passing, then putting
the cpu mask in the bdev (for "I have cached a bh on this bdev
sometime") might be useful. You could even do a bdev specific
lru kill, but then we get into the next topic.
And now for the "what is that code doing?" part of this review:
* The LRU management algorithm is dopey-but-simple. Sorry.
Umm.. yes.
(but time for a new subject).
milton
[sorry for the whitespace munging]
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH] simplify bh_lru_install
2006-07-03 15:37 ` [PATCH] simplify bh_lru_install Milton Miller
@ 2006-07-03 15:37 ` Milton Miller
0 siblings, 0 replies; 23+ messages in thread
From: Milton Miller @ 2006-07-03 15:37 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-kernel, Andrew Morton, Jes Sorensen
Oops, previous version forgot to do get_bh(). Still not tested.
While looking at Jes' patch "reduce IPI noise due to /dev/cdrom open/close"
I took a look at what the bh_lru code was doing.
* The LRU management algorithm is dopey-but-simple. Sorry.
Umm.. yes.
That in and out loop caused me to stare a bit.
lru_lookup_install, aka adding to lru cache, is building a second
array on the stack and then calling memcpy at the end to replace
the data. Sure its in cache, but we already did all the loads
and stores once.
The in and out arrays are always the same length, and we only allow
an entry in once. So we know that we either push all down, and the
last one is the evictee, or we find the entry in there previously
and free that slot, leaving a copyloop.
But I'm sure the compiler can't determine that, because it doesn't
know about the insert-uniquely assertion.
It also has a special case for when it's already at the top. But since
we did a lookup before __find_get_block_slow, that is surely a rare
special case.
Maybe it was designed to be called on every lookup.
The lookup case does a move to top pulling the previous entries down
one by one to the former spot, and inserting at the top, which is
sane. Why not do a push down here?
And while here, we can stop if we hit a NULL entry too, we will not
have any stragglers underneath.
Here is a totally untested patch. Well, it compiles. In the for
loop next is also assigned to bh for the bh = NULL case the compiler
pointed out.
Signed-off-by: Milton Miller <miltonm@bga.com>
--- linux-2.6.17/fs/buffer.c.orig 2006-07-04 00:04:16.000000000 -0400
+++ linux-2.6.17/fs/buffer.c 2006-07-05 01:50:27.000000000 -0400
@@ -1347,41 +1347,36 @@ static inline void check_irqs_on(void)
*/
static void bh_lru_install(struct buffer_head *bh)
{
- struct buffer_head *evictee = NULL;
+ struct buffer_head *next, *prev;
struct bh_lru *lru;
+ int i;
check_irqs_on();
bh_lru_lock();
lru = &__get_cpu_var(bh_lrus);
- if (lru->bhs[0] != bh) {
- struct buffer_head *bhs[BH_LRU_SIZE];
- int in;
- int out = 0;
- get_bh(bh);
- bhs[out++] = bh;
- for (in = 0; in < BH_LRU_SIZE; in++) {
- struct buffer_head *bh2 = lru->bhs[in];
-
- if (bh2 == bh) {
- __brelse(bh2);
- } else {
- if (out >= BH_LRU_SIZE) {
- BUG_ON(evictee != NULL);
- evictee = bh2;
- } else {
- bhs[out++] = bh2;
- }
- }
- }
- while (out < BH_LRU_SIZE)
- bhs[out++] = NULL;
- memcpy(lru->bhs, bhs, sizeof(bhs));
+ get_bh(bh);
+ /* Push down, looking for duplicate as we go. last is freed */
+ for (i = 0, next = prev = bh; i < BH_LRU_SIZE && prev;
+ i++, prev = next) {
+ next = lru->bhs[i];
+ if (unlikely(next == bh))
+ break;
+ lru->bhs[i] = prev;
+ }
+
+#ifdef DEBUG
+ /* ++i to avoid finding the entry we know */
+ for (++i; i < BH_LRU_SIZE; i++) {
+ BUG_ON(lru->bhs[i] == bh);
+ if (!next)
+ BUG_ON(lru->bhs[i]);
}
+#endif
+
bh_lru_unlock();
- if (evictee)
- __brelse(evictee);
+ brelse(next);
}
/*
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-03 15:33 [patch] reduce IPI noise due to /dev/cdrom open/close Jes Sorensen
2006-07-03 15:37 ` Milton Miller
2006-07-03 15:37 ` [PATCH] simplify bh_lru_install Milton Miller
@ 2006-07-04 5:32 ` Keith Owens
2006-07-04 6:41 ` Andrew Morton
2006-07-04 7:49 ` Jes Sorensen
2 siblings, 2 replies; 23+ messages in thread
From: Keith Owens @ 2006-07-04 5:32 UTC (permalink / raw)
To: Jes Sorensen; +Cc: Linus Torvalds, Andrew Morton, Alexander Viro, linux-kernel
Jes Sorensen (on 03 Jul 2006 11:33:54 -0400) wrote:
>Anyway, this patch reduces the IPI noise by keeping a cpumask of CPUs
>which have items in the bh lru and only flushing on the relevant
>CPUs. On systems with larger CPU counts it's quite normal that only a
>few CPUs are actively doing block IO, so spewing IPIs everywhere to
>flush this is unnecessary.
>
>Index: linux-2.6/fs/buffer.c
>===================================================================
>--- linux-2.6.orig/fs/buffer.c
>+++ linux-2.6/fs/buffer.c
>@@ -1323,6 +1323,7 @@ struct bh_lru {
> };
>
> static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
>+static cpumask_t lru_in_use;
>
> #ifdef CONFIG_SMP
> #define bh_lru_lock() local_irq_disable()
>@@ -1352,9 +1353,14 @@ static void bh_lru_install(struct buffer
> lru = &__get_cpu_var(bh_lrus);
> if (lru->bhs[0] != bh) {
> struct buffer_head *bhs[BH_LRU_SIZE];
>- int in;
>- int out = 0;
>+ int in, out, cpu;
>
>+ cpu = raw_smp_processor_id();
Why raw_smp_processor_id? That normally indicates code that wants a
lazy cpu number, but this code requires the exact cpu number, IMHO
using raw_smp_processor_id is confusing. smp_processor_id can safely
be used here, bh_lru_lock has disabled irq or preempt.
>+ /* Test first to avoid cache lines bouncing around */
>+ if (!cpu_isset(cpu, lru_in_use))
>+ cpu_set(cpu, lru_in_use);
>+
>+ out = 0;
> get_bh(bh);
> bhs[out++] = bh;
> for (in = 0; in < BH_LRU_SIZE; in++) {
>@@ -1500,19 +1506,28 @@ EXPORT_SYMBOL(__bread);
> */
> static void invalidate_bh_lru(void *arg)
> {
>- struct bh_lru *b = &get_cpu_var(bh_lrus);
>+ struct bh_lru *b;
> int i;
>
>+ local_irq_disable();
>+ b = &get_cpu_var(bh_lrus);
> for (i = 0; i < BH_LRU_SIZE; i++) {
> brelse(b->bhs[i]);
> b->bhs[i] = NULL;
> }
> put_cpu_var(bh_lrus);
>+ local_irq_enable();
> }
>
> static void invalidate_bh_lrus(void)
> {
>- on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
>+ /*
>+ * Need to hand down a copy of the mask or we wouldn't be run
>+ * anywhere due to the original mask being cleared
>+ */
>+ cpumask_t mask = lru_in_use;
>+ cpus_clear(lru_in_use);
>+ schedule_on_each_cpu_mask(invalidate_bh_lru, NULL, mask);
> }
Racy? Start with an empty lru_in_use.
Cpu A Cpu B
invalidate_bh_lrus()
mask = lru_in_use;
preempted
block I/O
bh_lru_install()
cpu_set(cpu, lru_in_use);
resume
cpus_clear(lru_in_use);
schedule_on_each_cpu_mask() - does not send IPI to cpu B
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 5:32 ` [patch] reduce IPI noise due to /dev/cdrom open/close Keith Owens
@ 2006-07-04 6:41 ` Andrew Morton
2006-07-04 7:51 ` Jes Sorensen
` (3 more replies)
2006-07-04 7:49 ` Jes Sorensen
1 sibling, 4 replies; 23+ messages in thread
From: Andrew Morton @ 2006-07-04 6:41 UTC (permalink / raw)
To: Keith Owens; +Cc: jes, torvalds, viro, linux-kernel
On Tue, 04 Jul 2006 15:32:19 +1000
Keith Owens <kaos@sgi.com> wrote:
> Jes Sorensen (on 03 Jul 2006 11:33:54 -0400) wrote:
> >Anyway, this patch reduces the IPI noise by keeping a cpumask of CPUs
> >which have items in the bh lru and only flushing on the relevant
> >CPUs. On systems with larger CPU counts it's quite normal that only a
> >few CPUs are actively doing block IO, so spewing IPIs everywhere to
> >flush this is unnecessary.
> >
> >Index: linux-2.6/fs/buffer.c
> >===================================================================
> >--- linux-2.6.orig/fs/buffer.c
> >+++ linux-2.6/fs/buffer.c
> >@@ -1323,6 +1323,7 @@ struct bh_lru {
> > };
> >
> > static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
> >+static cpumask_t lru_in_use;
> >
> > #ifdef CONFIG_SMP
> > #define bh_lru_lock() local_irq_disable()
> >@@ -1352,9 +1353,14 @@ static void bh_lru_install(struct buffer
> > lru = &__get_cpu_var(bh_lrus);
> > if (lru->bhs[0] != bh) {
> > struct buffer_head *bhs[BH_LRU_SIZE];
> >- int in;
> >- int out = 0;
> >+ int in, out, cpu;
> >
> >+ cpu = raw_smp_processor_id();
>
> Why raw_smp_processor_id? That normally indicates code that wants a
> lazy cpu number, but this code requires the exact cpu number, IMHO
> using raw_smp_processor_id is confusing. smp_processor_id can safely
> be used here, bh_lru_lock has disabled irq or preempt.
I expect raw_smp_processor_id() is used here as a microoptimisation -
avoid a might_sleep() which obviously will never trigger.
But I think it'd be better to do just a single raw_smp_processor_id() for
this entire function:
static void bh_lru_install(struct buffer_head *bh)
{
struct buffer_head *evictee = NULL;
struct bh_lru *lru;
+ int cpu;
check_irqs_on();
bh_lru_lock();
+ cpu = raw_smp_processor_id();
- lru = &__get_cpu_var(bh_lrus);
+ lru = per_cpu(bh_lrus, cpu);
etcetera.
> >
> > static void invalidate_bh_lrus(void)
> > {
> >- on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
> >+ /*
> >+ * Need to hand down a copy of the mask or we wouldn't be run
> >+ * anywhere due to the original mask being cleared
> >+ */
> >+ cpumask_t mask = lru_in_use;
> >+ cpus_clear(lru_in_use);
> >+ schedule_on_each_cpu_mask(invalidate_bh_lru, NULL, mask);
> > }
>
> Racy? Start with an empty lru_in_use.
>
> Cpu A Cpu B
> invalidate_bh_lrus()
> mask = lru_in_use;
> preempted
> block I/O
> bh_lru_install()
> cpu_set(cpu, lru_in_use);
> resume
> cpus_clear(lru_in_use);
> schedule_on_each_cpu_mask() - does not send IPI to cpu B
Yup. I think we can fix that by doing a single cpu_clear() on each CPU
just prior to that CPU clearing out its array, in invalidate_bh_lru().
There's a possibility of course that new bh's will get installed somewhere,
but higher-level code must ensure that those bh's do not belong to the
device which we're trying to clean up.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-03 15:37 ` Milton Miller
@ 2006-07-04 7:47 ` Jes Sorensen
2006-07-04 7:53 ` Arjan van de Ven
2006-07-04 8:42 ` Milton Miller
0 siblings, 2 replies; 23+ messages in thread
From: Jes Sorensen @ 2006-07-04 7:47 UTC (permalink / raw)
To: Milton Miller; +Cc: Jens Axboe, linux-kernel, Andrew Morton
Milton Miller wrote:
> On Mon Jul 03 2006 - 11:37:43 EST, Jes Sorensen wrote:
>> Anyway, this patch reduces the IPI noise by keeping a cpumask of CPUs
>> which have items in the bh lru and only flushing on the relevant
>> CPUs. On systems with larger CPU counts it's quite normal that only a
>> few CPUs are actively doing block IO, so spewing IPIs everywhere to
>> flush this is unnecessary.
> Ok we are optimizing for the low but not zero traffic case ...
Well yes and no. $#@$#@* hald will do the open/close stupidity a
couple of times per second. On a 128 CPU system that's quite a lot of
IPI traffic, resulting in measurable noise if you run a benchmark.
Remember that the IPIs are synchronous so you have to wait for them to
hit across the system :(
>> static void invalidate_bh_lrus(void)
>> {
>> - on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
>> + /*
>> + * Need to hand down a copy of the mask or we wouldn't be run
>> + * anywhere due to the original mask being cleared
>> + */
>> + cpumask_t mask = lru_in_use;
>> + cpus_clear(lru_in_use);
>> + schedule_on_each_cpu_mask(invalidate_bh_lru, NULL, mask);
>> }
>
> But that is totally racy!
>
> Another cpu could set its bit between the assignment to mask and
> the call to cpus_clear.
>
> Which means we end up with cpus holding a bh in their lru but no
> idea which ones.
The idea I wrote the code under was that we are really only concerned to
make sure all bh's related to the device causing the invalidate are hit.
It doesn't matter that there are bh's in the lru from other devices.
> Unfornately clearing the bit in the callback means we pass the cpu
> mask around twice (once to clear, and later to set as we start
> freeing bhs again).
Doing it in the callback also means each CPU has to go back and hit the
mask remotely causing a lot of cache line ping pong effects, especially
as they'll be doing it at roughly the same time. This is why I
explicitly did the if (!test_bit) set_bit() optimization.
> Although that is probably not much worse than scanning other cpus'
> per-cpu data for NULL (and I would probably just scan 8 pointers
> rather than add another per-cpu something is cached flag).
8 pointers * NR_CPUS - that can be a lot of cachelines you have to pull
in from remote :(
> I don't like the idea of invalidate_bdev (say due to openers going
> to zero) running against one device causing a buffer to be left
> artificially busy on another device, causing a page to be left
> around.
>
> If you want to cut down on the cache line passing, then putting
> the cpu mask in the bdev (for "I have cached a bh on this bdev
> sometime") might be useful. You could even do a bdev specific
> lru kill, but then we get into the next topic.
That could be an interesting way out.
Cheers,
Jes
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 5:32 ` [patch] reduce IPI noise due to /dev/cdrom open/close Keith Owens
2006-07-04 6:41 ` Andrew Morton
@ 2006-07-04 7:49 ` Jes Sorensen
2006-07-04 8:04 ` Andrew Morton
1 sibling, 1 reply; 23+ messages in thread
From: Jes Sorensen @ 2006-07-04 7:49 UTC (permalink / raw)
To: Keith Owens; +Cc: Linus Torvalds, Andrew Morton, Alexander Viro, linux-kernel
Keith Owens wrote:
>> static void invalidate_bh_lrus(void)
>> {
>> - on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
>> + /*
>> + * Need to hand down a copy of the mask or we wouldn't be run
>> + * anywhere due to the original mask being cleared
>> + */
>> + cpumask_t mask = lru_in_use;
>> + cpus_clear(lru_in_use);
>> + schedule_on_each_cpu_mask(invalidate_bh_lru, NULL, mask);
>> }
>
> Racy? Start with an empty lru_in_use.
>
> Cpu A Cpu B
> invalidate_bh_lrus()
> mask = lru_in_use;
> preempted
> block I/O
> bh_lru_install()
> cpu_set(cpu, lru_in_use);
> resume
> cpus_clear(lru_in_use);
> schedule_on_each_cpu_mask() - does not send IPI to cpu B
I guess the real question is whether the device is still valid for new
IO by the time we hit the invalidate function. If not, and that was my,
possibly flawed, assumption, then it's not an issue. Whatever bh's are
left in the lrus from other devices will be handled on the next hit.
Cheers,
Jes
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 6:41 ` Andrew Morton
@ 2006-07-04 7:51 ` Jes Sorensen
2006-07-04 9:13 ` Milton Miller
` (2 subsequent siblings)
3 siblings, 0 replies; 23+ messages in thread
From: Jes Sorensen @ 2006-07-04 7:51 UTC (permalink / raw)
To: Andrew Morton; +Cc: Keith Owens, torvalds, viro, linux-kernel
Andrew Morton wrote:
> On Tue, 04 Jul 2006 15:32:19 +1000
> I expect raw_smp_processor_id() is used here as a a microoptimisation -
> avoid a might_sleep() which obviously will never trigger.
>
> But I think it'd be better to do just a single raw_smp_processor_id() for
> this entire function:
I agree, should have done that instead.
>> Racy? Start with an empty lru_in_use.
>>
>> Cpu A Cpu B
>> invalidate_bh_lrus()
>> mask = lru_in_use;
>> preempted
>> block I/O
>> bh_lru_install()
>> cpu_set(cpu, lru_in_use);
>> resume
>> cpus_clear(lru_in_use);
>> schedule_on_each_cpu_mask() - does not send IPI to cpu B
>
> Yup. I think we can fix that by doing a single cpu_clear() on each CPU
> just prior to that CPU clearing out its array, in invalidate_bh_lru().
I deliberately tried to avoid that to avoid the bitmask turning into a
bouncing cacheline.
> There's a possibility of course that new bh's will get installed somewhere,
> but higher-level code must ensure that those bh's do not belong to the
> device which we're trying to clean up.
Cheers,
Jes
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 7:47 ` Jes Sorensen
@ 2006-07-04 7:53 ` Arjan van de Ven
2006-07-04 8:12 ` Jes Sorensen
2006-07-04 8:42 ` Milton Miller
1 sibling, 1 reply; 23+ messages in thread
From: Arjan van de Ven @ 2006-07-04 7:53 UTC (permalink / raw)
To: Jes Sorensen; +Cc: Milton Miller, Jens Axboe, linux-kernel, Andrew Morton
On Tue, 2006-07-04 at 09:47 +0200, Jes Sorensen wrote:
> Milton Miller wrote:
> > On Mon Jul 03 2006 - 11:37:43 EST, Jes Sorensen wrote:
> >> Anyway, this patch reduces the IPI noise by keeping a cpumask of CPUs
> >> which have items in the bh lru and only flushing on the relevant
> >> CPUs. On systems with larger CPU counts it's quite normal that only a
> >> few CPUs are actively doing block IO, so spewing IPIs everywhere to
> >> flush this is unnecessary.
>
> > Ok we are optimizing for the low but not zero traffic case ...
>
> Well yes and no. $#@$#@* hald will do the open/close stupidity a
> couple of times per second. On a 128 CPU system thats quite a lot of
> IPI traffic, resulting in measurable noise if you run a benchmark.
> Remember that the IPIs are synchronous so you have to wait for them to
> hit across the system :
can you get hald fixed? That sounds important anyway... stupid userspace
isn't going to be good no matter what, and the question is how much crap
we need to do in the kernel to compensate for stupid userspace...
especially if such userspace is open source and CAN be fixed...
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 7:49 ` Jes Sorensen
@ 2006-07-04 8:04 ` Andrew Morton
0 siblings, 0 replies; 23+ messages in thread
From: Andrew Morton @ 2006-07-04 8:04 UTC (permalink / raw)
To: Jes Sorensen; +Cc: kaos, torvalds, viro, linux-kernel
On Tue, 04 Jul 2006 09:49:14 +0200
Jes Sorensen <jes@sgi.com> wrote:
> Keith Owens wrote:
> >> static void invalidate_bh_lrus(void)
> >> {
> >> - on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
> >> + /*
> >> + * Need to hand down a copy of the mask or we wouldn't be run
> >> + * anywhere due to the original mask being cleared
> >> + */
> >> + cpumask_t mask = lru_in_use;
> >> + cpus_clear(lru_in_use);
> >> + schedule_on_each_cpu_mask(invalidate_bh_lru, NULL, mask);
> >> }
> >
> > Racy? Start with an empty lru_in_use.
> >
> > Cpu A Cpu B
> > invalidate_bh_lrus()
> > mask = lru_in_use;
> > preempted
> > block I/O
> > bh_lru_install()
> > cpu_set(cpu, lru_in_use);
> > resume
> > cpus_clear(lru_in_use);
> > schedule_on_each_cpu_mask() - does not send IPI to cpu B
>
> I guess the real question is whether the device is still valid for new
> IO by the time we hit the invalidate function. If not, and that was my,
> possibly flawed, assumption, then it's not an issue. Whatever bh's are
> left in the lrus from other devices will be handled on the next hit.
>
The problem is that we can actually lose bits in invalidate_bh_lrus().
CPU0 CPU1
mask = lru_in_use;
cpu_set(lru_in_use, 1);
cpus_clear(lru_in_use);
Now we have a stray bh which nobody knows about.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 7:53 ` Arjan van de Ven
@ 2006-07-04 8:12 ` Jes Sorensen
2006-07-04 8:23 ` Arjan van de Ven
2006-07-04 13:02 ` Helge Hafting
0 siblings, 2 replies; 23+ messages in thread
From: Jes Sorensen @ 2006-07-04 8:12 UTC (permalink / raw)
To: Arjan van de Ven; +Cc: Milton Miller, Jens Axboe, linux-kernel, Andrew Morton
Arjan van de Ven wrote:
> On Tue, 2006-07-04 at 09:47 +0200, Jes Sorensen wrote:
>> Well yes and no. $#@$#@* hald will do the open/close stupidity a
>> couple of times per second. On a 128 CPU system thats quite a lot of
>> IPI traffic, resulting in measurable noise if you run a benchmark.
>> Remember that the IPIs are synchronous so you have to wait for them to
>> hit across the system :
>
> can you get hald fixed? That sounds important anyway... stupid userspace
> isn't going to be good no matter what, and the question is how much crap
> we need to do in the kernel to compensate for stupid userspace...
> especially if such userspace is open source and CAN be fixed...
I'd like to, I don't know how feasible it is though :( The distros make
it a priority to run all the GUI stuff that makes Linux look like
windows as much as they can, which includes autodetecting when users
insert their latest audio CD so they can launch the mp3 ripper
automatically ....
Guess the question is, is there a way we can detect when media has been
inserted without doing open/close on the device constantly? It's not
something I have looked at in detail, so I dunno if there's a sensible
way to handle it.
The other part of it is that I do think it's undesirable that a user
space app can cause so much kernel IPI noise by simply doing open/close
on a device.
Anyway, I do agree, we should look at fixing both problems.
Cheers,
Jes
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 8:12 ` Jes Sorensen
@ 2006-07-04 8:23 ` Arjan van de Ven
2006-07-04 8:33 ` Jes Sorensen
2006-07-04 13:02 ` Helge Hafting
1 sibling, 1 reply; 23+ messages in thread
From: Arjan van de Ven @ 2006-07-04 8:23 UTC (permalink / raw)
To: Jes Sorensen; +Cc: Milton Miller, Jens Axboe, linux-kernel, Andrew Morton
On Tue, 2006-07-04 at 10:12 +0200, Jes Sorensen wrote:
>
> Guess the question is, is there a way we can detect when media has
> been
> inserted without doing open/close on the device constantly?
they could just keep the device open.... at least until media is
inserted
also they should poll at most every 10 seconds; anything more frequent
is just braindead...
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 8:23 ` Arjan van de Ven
@ 2006-07-04 8:33 ` Jes Sorensen
0 siblings, 0 replies; 23+ messages in thread
From: Jes Sorensen @ 2006-07-04 8:33 UTC (permalink / raw)
To: Arjan van de Ven; +Cc: Milton Miller, Jens Axboe, linux-kernel, Andrew Morton
Arjan van de Ven wrote:
> On Tue, 2006-07-04 at 10:12 +0200, Jes Sorensen wrote:
>> Guess the question is, is there a way we can detect when media has
>> been
>> inserted without doing open/close on the device constantly?
>
> they could just keep the device open.... at least until media is
> inserted
Yup, I will see how much it requires to make it do that.
> also they should poll at most every 10 seconds; anything more frequent
> is just braindead...
But of course, then your camera's picture card isn't detected within a
microsecond of your inserting it into the cardreader! Really can't have
that!
Cheers,
Jes
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 7:47 ` Jes Sorensen
2006-07-04 7:53 ` Arjan van de Ven
@ 2006-07-04 8:42 ` Milton Miller
2006-07-04 8:59 ` Jes Sorensen
1 sibling, 1 reply; 23+ messages in thread
From: Milton Miller @ 2006-07-04 8:42 UTC (permalink / raw)
To: Jes Sorensen; +Cc: LKML, Jens Axboe, Andrew Morton
On Jul 4, 2006, at 2:47 AM, Jes Sorensen wrote:
> Milton Miller wrote:
>> On Mon Jul 03 2006 - 11:37:43 EST, Jes Sorensen wrote:
>>
>> But that is totally racy!
>>
>> Another cpu could set its bit between the assignment to mask and
>> the call to cpus_clear.
>>
>> Which means we end up with cpus holding a bh in their lru but no
>> idea which ones.
>
> The idea I wrote the code under was that we are really only concerned
> to
> make sure all bh's related to the device causing the invalidate are
> hit.
> It doesn't matter that there are bh's in the lru from other devices.
>
And that is fine as far as it goes. The problem is that an unrelated
device might be hit by this operation. For example, hal is running
on cpu 0, so the fsck gets run on cpu 1 and hits this race. It
finishes, and now hal is back to sleep, and cpu power saving balancing
says run the mount on cpu 0. The file system tries to change the block
size, which calls kill_bdev which calls invalidate_bdev, but we forgot
that cpu 1 had a bh, so the bh doesn't free, and the page is left in
the page cache with the wrong size buffer. POOOF there goes the
filesystem.
Oh, and there isn't any safety check there, is there?
>> Unfortunately clearing the bit in the callback means we pass the cpu
>> mask around twice (once to clear, and later to set as we start
>> freeing bhs again).
>
> Doing it in the callback also means each CPU has to go back and hit the
> mask remotely causing a lot of cache line ping pong effects, especially
> as they'll be doing it at roughly the same time. This is why I
> explicitly did the if (!test_bit) set_bit() optimization.
>
>> Although that is probably not much worse than scanning other cpus'
>> per-cpu data for NULL (and I would probably just scan 8 pointers
>> rather than add another per-cpu something is cached flag).
>
> 8 pointers * NR_CPUS - that can be a lot of cachelines you have to pull
> in from remote :(
8 pointers in one array, hmm... 8*8 = 64 consecutive bytes, that is
half a cache line in my arch. (And on 32 bit archs, 8*4 = 32 bytes,
typically 1 full cache line). Add an alignment constraint if you like.
I was comparing to fetching a per-cpu variable saying we had any
entries; I'd say the difference is the time to do 8 loads once you have
the line vs the chance the other cpu is actively storing and stealing
it back before you finish. I'd be really surprised if it was stolen
twice.
>> I don't like the idea of invalidate_bdev (say due to openers going
>> to zero) running against one device causing a buffer to be left
>> artificially busy on another device, causing a page to be left
>> around.
>>
>> If you want to cut down on the cache line passing, then putting
>> the cpu mask in the bdev (for "I have cached a bh on this bdev
>> sometime") might be useful. You could even do a bdev specific
>> lru kill, but then we get into the next topic.
>
> That could be an interesting way out.
>
Another approach is to age the cache. Only clear the bit during the IPI
call if you had nothing there on the last N calls (N being some
combination of absolute time and number of invalidate calls).
milton
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 8:42 ` Milton Miller
@ 2006-07-04 8:59 ` Jes Sorensen
2006-07-04 9:30 ` Milton Miller
0 siblings, 1 reply; 23+ messages in thread
From: Jes Sorensen @ 2006-07-04 8:59 UTC (permalink / raw)
To: Milton Miller; +Cc: LKML, Jens Axboe, Andrew Morton
Milton Miller wrote:
> On Jul 4, 2006, at 2:47 AM, Jes Sorensen wrote:
>> The idea I wrote the code under was that we are really only concerned to
>> make sure all bh's related to the device causing the invalidate are hit.
>> It doesn't matter that there are bh's in the lru from other devices.
>
> And that is fine as far as it goes. The problem is that an unrelated
> device might be hit by this operation. For example, hal is running
> on cpu 0, so the fsck gets run on cpu 1 and hits this race. It
> finishes, and now hal is back to sleep, and cpu power saving balancing
> says run the mount on cpu 0. The file system tries to change the block
> size, which calls kill_bdev which calls invalidate_bdev, but we forgot
> that cpu 1 had a bh, so the bh doesn't free, and the page is left in the
> page cache with the wrong size buffer. POOOF there goes the filesystem.
Hrmpf, maybe it's back to putting the mask into the bdev then.
>> 8 pointers * NR_CPUS - that can be a lot of cachelines you have to pull
>> in from remote :(
>
> 8 pointers in one array, hmm... 8*8 = 64 consecutive bytes, that is
> half a cache line in my arch. (And on 32 bit archs, 8*4 = 32 bytes,
> typically 1 full cache line). Add an alignment constraint if you like.
> I was comparing to fetching a per-cpu variable saying we had any
> entries; I'd say the difference is the time to do 8 loads once you have
> the line vs the chance the other cpu is actively storing and stealing it
> back before you finish. I'd be really surprised if it was stolen twice.
The problem is that if you throw the IPI onto multiple CPUs they will
all try and hit this array at the same time. Besides, on some platforms
this would be 64 * 8 * 8 (or even bigger) - scalability quickly goes
out the window, especially as it's to be fetched from another node :( I
know these are the extreme in today's world, but it's becoming more of
an issue with all the multi-core stuff we're going to see.
>>> If you want to cut down on the cache line passing, then putting
>>> the cpu mask in the bdev (for "I have cached a bh on this bdev
>>> sometime") might be useful. You could even do a bdev specific
>>> lru kill, but then we get into the next topic.
>>
>> That could be an interesting way out.
>
> Another approach is to age the cache. Only clear the bit during the IPI
> call if you had nothing there on the last N calls (N being some
> combination of absolute time and number of invalidate calls).
I guess if there's nothing in the LRU you don't get the call and the
number of cross node clears are limited, but it seems to get worse
exponentially with the size of the machine, especially if it's busy
doing IO, which is what worries me :(
Next, I'll look for my asbestos suit and take a peek at the hald source.
Cheers,
Jes
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 6:41 ` Andrew Morton
2006-07-04 7:51 ` Jes Sorensen
@ 2006-07-04 9:13 ` Milton Miller
2006-07-04 17:33 ` Nick Piggin
2006-07-05 0:10 ` Paul Mackerras
3 siblings, 0 replies; 23+ messages in thread
From: Milton Miller @ 2006-07-04 9:13 UTC (permalink / raw)
To: Andrew Morton, Jes Sorensen; +Cc: Anton Blanchard, LKML, Paul Mackerras
On Jul 4, 2006, at 1:39 AM, Andrew Morton wrote:
>
> I expect raw_smp_processor_id() is used here as a microoptimisation -
> avoid a might_sleep() which obviously will never trigger.
>
> But I think it'd be better to do just a single raw_smp_processor_id()
> for this entire function:
>
> static void bh_lru_install(struct buffer_head *bh)
> {
> struct buffer_head *evictee = NULL;
> struct bh_lru *lru;
> + int cpu;
>
> check_irqs_on();
> bh_lru_lock();
> + cpu = raw_smp_processor_id();
> - lru = &__get_cpu_var(bh_lrus);
> + lru = &per_cpu(bh_lrus, cpu);
>
The problem with this style is that it is a disoptimization for
architectures that hold their per-cpu data offset in a register, put the
smp_processor_id in the per-cpu data (or similar) and per_cpu data
offsets in a global lookup.
Do we need a new macro? (what is that gcc macro function syntax?)
#ifdef PER_CPU_IS_SLOW
#define my_cpu_var(bh_lrus, cpu) \
({ BUG_ON(cpu != smp_processor_id()); &__get_cpu_var(bh_lrus); })
#else
#define my_cpu_var(bh_lrus, cpu) \
({ BUG_ON(cpu != smp_processor_id()); per_cpu(bh_lrus, cpu); })
#endif
(and yes, the BUG_ON would be for debug checking).
milton
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 8:59 ` Jes Sorensen
@ 2006-07-04 9:30 ` Milton Miller
0 siblings, 0 replies; 23+ messages in thread
From: Milton Miller @ 2006-07-04 9:30 UTC (permalink / raw)
To: Jes Sorensen; +Cc: LKML, Jens Axboe, Andrew Morton
On Jul 4, 2006, at 3:59 AM, Jes Sorensen wrote:
> Milton Miller wrote:
>> On Jul 4, 2006, at 2:47 AM, Jes Sorensen wrote:
>>> The idea I wrote the code under was that we are really only
>>> concerned to
>>> make sure all bh's related to the device causing the invalidate are
>>> hit.
>>> It doesn't matter that there are bh's in the lru from other devices.
>>
>> And that is fine as far as it goes. The problem is that an unrelated
>> device might be be hit by this operation.
... [ fs POOF scenario ]
> Hrmpf, maybe it's back to putting the mask into the bdev then.
>
>>> 8 pointers * NR_CPUS - that can be a lot of cachelines you have to
>>> pull
>>> in from remote :(
>>
>> 8 pointers in one array, hmm... 8*8 = 64 consecutive bytes, that is
>> half a cache line in my arch. (And on 32 bit archs, 8*4 = 32 bytes,
>> typically 1 full cache line). Add an alignment constraint if you
>> like.
>> I was comparing to fetching a per-cpu variable saying we had any
>> entries; I'd say the difference is the time to do 8 loads once you
>> have
>> the line vs the chance the other cpu is actively storing and stealing
>> it
>> back before you finish. I'd be really surprised if it was stolen
>> twice.
>
> The problem is that if you throw the IPI onto multiple CPUs they will
> all try and hit this array at the same time. Besides, on some platforms
> this would be 64 * 8 * 8 (or even bigger) - scalability quickly goes
> out the window, especially as it's to be fetched from another node :( I
> know these are the extreme in today's world, but it's becoming more of
> an issue with all the multi-core stuff we're going to see.
Huh? The cpus have their own local array, you scan the remote to see
if you need to wake that guy up. Sort of like need_resched/polling
flags. Ok, they all have to pull it back from shared with the caller
to exclusive/modified at once. Of course they had to do that with
whatever method you used to tell them what to do also.
The trade off is bouncing one line with the mask of who might have
something relevant in a bit vector (two bounces if you clear always,
shared if you do it lazy) vs pulling a line per cpu when you need to
check, that will be actively pulled back every bh lookup.
Maybe you shouldn't push the work until some time expires. Just let
wait a bit for the eventd to kick in first. After all, you would just
be slowing down the thread trying to invalidate a dead device.
>>>> If you want to cut down on the cache line passing, then putting
>>>> the cpu mask in the bdev (for "I have cached a bh on this bdev
>>>> sometime") might be useful. You could even do a bdev specific
>>>> lru kill, but then we get into the next topic.
>>>
>>> That could be an interesting way out.
>>
>> Another approach is to age the cache. Only clear the bit during the
>> IPI
>> call if you had nothing there on the last N calls (N being some
>> combination of absolute time and number of invalidate calls).
>
> I guess if there's nothing in the LRU you don't get the call and the
> number of cross node clears are limited, but it seems to get worse
> exponentially with the size of the machine, especially if it's busy
> doing IO, which is what worries me :(
>
If you are doing that much bh based IO then you will have to issue
the IPI, even with the cpu mask. Unless you go the per-bdev cpu_mask.
> Next, I'll look for my asbestos suit and take a peek at the hald
> source.
* miltonm stands well back
later,
milton
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 8:12 ` Jes Sorensen
2006-07-04 8:23 ` Arjan van de Ven
@ 2006-07-04 13:02 ` Helge Hafting
1 sibling, 0 replies; 23+ messages in thread
From: Helge Hafting @ 2006-07-04 13:02 UTC (permalink / raw)
To: Jes Sorensen
Cc: Arjan van de Ven, Milton Miller, Jens Axboe, linux-kernel,
Andrew Morton
On Tue, Jul 04, 2006 at 10:12:49AM +0200, Jes Sorensen wrote:
> Arjan van de Ven wrote:
> > On Tue, 2006-07-04 at 09:47 +0200, Jes Sorensen wrote:
> >> Well yes and no. $#@$#@* hald will do the open/close stupidity a
> >> couple of times per second. On a 128 CPU system thats quite a lot of
> >> IPI traffic, resulting in measurable noise if you run a benchmark.
> >> Remember that the IPIs are synchronous so you have to wait for them to
> >> hit across the system :
> >
> > can you get hald fixed? That sounds important anyway... stupid userspace
> > isn't going to be good no matter what, and the question is how much crap
> > we need to do in the kernel to compensate for stupid userspace...
> > especially if such userspace is open source and CAN be fixed...
>
> I'd like to, I don't know how feasible it is though :( The distros make
> it a priority to run all the GUI stuff that makes Linux look like
> windows as much as they can, which includes autodetecting when users
> insert their latest audio CD so they can launch the mp3 ripper
> automatically ....
>
> Guess the question is, is there a way we can detect when media has been
> inserted without doing open/close on the device constantly? It's not
> something I have looked at in detail, so I dunno if there's a sensible
> way to handle it.
I always believed that it was possible, but not on all cdroms.
If this was supported, people with lots of cpus could be told to
get some of the sane cdroms for their big boxes.
One solution is to have the kernel do this kind of polling itself,
in the device drivers for removeable media.
Then it can simply notify userspace when something happens,
and userspace can decide to play music, mount a fs, or whatever
people want to happen.
>
> The other part of it is that I do think it's undesirable that a user
> space app can cause so much kernel IPI noise by simply doing open/close
> on a device.
Sure, a DOS waiting to happen. If they can poll twice a second, whats
to stop them from trying to poll a hundred times a second, or in
a busy loop?
Helge Hafting
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 6:41 ` Andrew Morton
2006-07-04 7:51 ` Jes Sorensen
2006-07-04 9:13 ` Milton Miller
@ 2006-07-04 17:33 ` Nick Piggin
2006-07-05 7:30 ` Jes Sorensen
2006-07-05 0:10 ` Paul Mackerras
3 siblings, 1 reply; 23+ messages in thread
From: Nick Piggin @ 2006-07-04 17:33 UTC (permalink / raw)
To: Andrew Morton; +Cc: Keith Owens, jes, torvalds, viro, linux-kernel
Andrew Morton wrote:
> On Tue, 04 Jul 2006 15:32:19 +1000
> Keith Owens <kaos@sgi.com> wrote:
>>Why raw_smp_processor_id? That normally indicates code that wants a
>>lazy cpu number, but this code requires the exact cpu number, IMHO
>>using raw_smp_processor_id is confusing. smp_processor_id can safely
>>be used here, bh_lru_lock has disabled irq or preempt.
>
>
> I expect raw_smp_processor_id() is used here as a microoptimisation -
> avoid a might_sleep() which obviously will never trigger.
A microoptimisation because they've turned on DEBUG_PREEMPT and found
that smp_processor_id slows down? ;) Wouldn't it be better to just stick
to the normal rules (ie. what Keith said)?
It may be obvious in this case (though that doesn't help people who make
obvious mistakes, or mismerge patches) but this just seems like a nasty
precedent to set (or has it already been?).
>
> But I think it'd be better to do just a single raw_smp_processor_id() for
> this entire function:
>
> static void bh_lru_install(struct buffer_head *bh)
> {
> struct buffer_head *evictee = NULL;
> struct bh_lru *lru;
> + int cpu;
>
> check_irqs_on();
> bh_lru_lock();
> + cpu = raw_smp_processor_id();
> - lru = &__get_cpu_var(bh_lrus);
> + lru = &per_cpu(bh_lrus, cpu);
>
> etcetera.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 6:41 ` Andrew Morton
` (2 preceding siblings ...)
2006-07-04 17:33 ` Nick Piggin
@ 2006-07-05 0:10 ` Paul Mackerras
3 siblings, 0 replies; 23+ messages in thread
From: Paul Mackerras @ 2006-07-05 0:10 UTC (permalink / raw)
To: Andrew Morton; +Cc: Keith Owens, jes, torvalds, viro, linux-kernel
Andrew Morton writes:
> But I think it'd be better to do just a single raw_smp_processor_id() for
> this entire function:
>
> static void bh_lru_install(struct buffer_head *bh)
> {
> struct buffer_head *evictee = NULL;
> struct bh_lru *lru;
> + int cpu;
>
> check_irqs_on();
> bh_lru_lock();
> + cpu = raw_smp_processor_id();
> - lru = &__get_cpu_var(bh_lrus);
> + lru = &per_cpu(bh_lrus, cpu);
Better still, use the __raw_get_cpu_var() that I introduced.
Paul.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-04 17:33 ` Nick Piggin
@ 2006-07-05 7:30 ` Jes Sorensen
2006-07-05 18:26 ` Nick Piggin
0 siblings, 1 reply; 23+ messages in thread
From: Jes Sorensen @ 2006-07-05 7:30 UTC (permalink / raw)
To: Nick Piggin; +Cc: Andrew Morton, Keith Owens, torvalds, viro, linux-kernel
Nick Piggin wrote:
> Andrew Morton wrote:
>> I expect raw_smp_processor_id() is used here as a microoptimisation -
>> avoid a might_sleep() which obviously will never trigger.
>
> A microoptimisation because they've turned on DEBUG_PREEMPT and found
> that smp_processor_id slows down? ;) Wouldn't it be better to just stick
> to the normal rules (ie. what Keith said)?
>
> It may be obvious in this case (though that doesn't help people who make
> obvious mistakes, or mismerge patches) but this just seems like a nasty
> precedent to set (or has it already been?).
I suspect the real reason here is that there's now so many ways to get
the processor ID that I cannot keep track of which one to use. Paul's
mention of __raw_get_cpu_var() just confuses me even more.
So if anyone can give me a conclusive answer of which one to use, I'm
happy to go there.
Granted I have a bias to avoid anything involving the preempt crap, but
thats just me :)
Cheers,
Jes
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] reduce IPI noise due to /dev/cdrom open/close
2006-07-05 7:30 ` Jes Sorensen
@ 2006-07-05 18:26 ` Nick Piggin
0 siblings, 0 replies; 23+ messages in thread
From: Nick Piggin @ 2006-07-05 18:26 UTC (permalink / raw)
To: Jes Sorensen; +Cc: Andrew Morton, Keith Owens, torvalds, viro, linux-kernel
Jes Sorensen wrote:
> Nick Piggin wrote:
>
>>Andrew Morton wrote:
>>
>>>I expect raw_smp_processor_id() is used here as a microoptimisation -
>>>avoid a might_sleep() which obviously will never trigger.
>>
>>A microoptimisation because they've turned on DEBUG_PREEMPT and found
>>that smp_processor_id slows down? ;) Wouldn't it be better to just stick
>>to the normal rules (ie. what Keith said)?
>>
>>It may be obvious in this case (though that doesn't help people who make
>>obvious mistakes, or mismerge patches) but this just seems like a nasty
>>precedent to set (or has it already been?).
>
>
> I suspect the real reason here is that there's now so many ways to get
> the processor ID that I cannot keep track of which one to use. Paul's
> mention of __raw_get_cpu_var() just confuses me even more.
>
> So if anyone can give me a conclusive answer of which one to use, I'm
> happy to go there.
>
> Granted I have a bias to avoid anything involving the preempt crap, but
> thats just me :)
Use smp_processor_id() unless you explicitly want a lazy CPU number,
and in that case use the raw_ version. Turning off preempt or preempt
debug options does the rest for you.
If you're just using the number to feed into per_cpu, then use the
appropriate get_cpu variant.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
^ permalink raw reply [flat|nested] 23+ messages in thread
end of thread, other threads:[~2006-07-05 18:26 UTC | newest]
Thread overview: 23+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-07-03 15:33 [patch] reduce IPI noise due to /dev/cdrom open/close Jes Sorensen
2006-07-03 15:37 ` Milton Miller
2006-07-04 7:47 ` Jes Sorensen
2006-07-04 7:53 ` Arjan van de Ven
2006-07-04 8:12 ` Jes Sorensen
2006-07-04 8:23 ` Arjan van de Ven
2006-07-04 8:33 ` Jes Sorensen
2006-07-04 13:02 ` Helge Hafting
2006-07-04 8:42 ` Milton Miller
2006-07-04 8:59 ` Jes Sorensen
2006-07-04 9:30 ` Milton Miller
2006-07-03 15:37 ` [PATCH] simplfy bh_lru_install Milton Miller
2006-07-03 15:37 ` Milton Miller
2006-07-04 5:32 ` [patch] reduce IPI noise due to /dev/cdrom open/close Keith Owens
2006-07-04 6:41 ` Andrew Morton
2006-07-04 7:51 ` Jes Sorensen
2006-07-04 9:13 ` Milton Miller
2006-07-04 17:33 ` Nick Piggin
2006-07-05 7:30 ` Jes Sorensen
2006-07-05 18:26 ` Nick Piggin
2006-07-05 0:10 ` Paul Mackerras
2006-07-04 7:49 ` Jes Sorensen
2006-07-04 8:04 ` Andrew Morton
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox