* [PATCH] modules: Use a better scheme for refcounting
@ 2008-05-15 20:40 Eric Dumazet
2008-05-15 21:57 ` Andi Kleen
2008-05-16 0:09 ` Rusty Russell
0 siblings, 2 replies; 14+ messages in thread
From: Eric Dumazet @ 2008-05-15 20:40 UTC (permalink / raw)
To: Andrew Morton; +Cc: Rusty Russell, linux kernel, Mike Travis
[-- Attachment #1: Type: text/plain, Size: 1003 bytes --]
Current refcounting for modules (done if CONFIG_MODULE_UNLOAD=y)
is using a lot of memory.
Each 'struct module' contains an [NR_CPUS] array of full cache lines.
This patch uses existing infrastructure (percpu_modalloc() &
percpu_modfree())
to allocate percpu space for the refcount storage.
Instead of wasting NR_CPUS*128 bytes (on i386), we now use
num_possible_cpus*sizeof(local_t) bytes.
On a typical distro, where NR_CPUS=8, shipping 2000 modules, we reduce
size of module files by about 2 Mbytes. (1Kb per module)
Instead of having all refcounters in the same memory node - with TLB misses
because of vmalloc() - this new implementation permits to have better
NUMA properties, since each CPU will use storage on its preferred node,
thanks to percpu storage.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
include/linux/module.h | 25 ++++++++++++++++---------
kernel/module.c | 37 +++++++++++++++++++++++++++----------
2 files changed, 43 insertions(+), 19 deletions(-)
[-- Attachment #2: module.patch --]
[-- Type: text/plain, Size: 4856 bytes --]
diff --git a/include/linux/module.h b/include/linux/module.h
index 3e03b1a..72db614 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -217,11 +217,6 @@ void *__symbol_get_gpl(const char *symbol);
#endif
-struct module_ref
-{
- local_t count;
-} ____cacheline_aligned;
-
enum module_state
{
MODULE_STATE_LIVE,
@@ -307,8 +302,11 @@ struct module
#ifdef CONFIG_MODULE_UNLOAD
/* Reference counts */
- struct module_ref ref[NR_CPUS];
-
+#ifdef CONFIG_SMP
+ char *refptr;
+#else
+ local_t ref;
+#endif
/* What modules depend on me? */
struct list_head modules_which_use_me;
@@ -378,13 +376,22 @@ void __symbol_put(const char *symbol);
#define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x)
void symbol_put_addr(void *addr);
+static inline local_t *__module_ref_addr(struct module *mod, int cpu)
+{
+#ifdef CONFIG_SMP
+ return (local_t *) (mod->refptr + per_cpu_offset(cpu));
+#else
+ return &mod->ref;
+#endif
+}
+
/* Sometimes we know we already have a refcount, and it's easier not
to handle the error case (which only happens with rmmod --wait). */
static inline void __module_get(struct module *module)
{
if (module) {
BUG_ON(module_refcount(module) == 0);
- local_inc(&module->ref[get_cpu()].count);
+ local_inc(__module_ref_addr(module, get_cpu()));
put_cpu();
}
}
@@ -396,7 +403,7 @@ static inline int try_module_get(struct module *module)
if (module) {
unsigned int cpu = get_cpu();
if (likely(module_is_live(module)))
- local_inc(&module->ref[cpu].count);
+ local_inc(__module_ref_addr(module, cpu));
else
ret = 0;
put_cpu();
diff --git a/kernel/module.c b/kernel/module.c
index f5e9491..e0e8712 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -522,13 +522,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
/* Init the unload section of the module. */
static void module_unload_init(struct module *mod)
{
- unsigned int i;
+ int cpu;
INIT_LIST_HEAD(&mod->modules_which_use_me);
- for (i = 0; i < NR_CPUS; i++)
- local_set(&mod->ref[i].count, 0);
+ for_each_possible_cpu(cpu)
+ local_set(__module_ref_addr(mod, cpu), 0);
/* Hold reference count during initialization. */
- local_set(&mod->ref[raw_smp_processor_id()].count, 1);
+ local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
/* Backwards compatibility macros put refcount during init. */
mod->waiter = current;
}
@@ -659,10 +659,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
unsigned int module_refcount(struct module *mod)
{
- unsigned int i, total = 0;
+ unsigned int total = 0;
+ int cpu;
- for (i = 0; i < NR_CPUS; i++)
- total += local_read(&mod->ref[i].count);
+ for_each_possible_cpu(cpu)
+ total += local_read(__module_ref_addr(mod, cpu));
return total;
}
EXPORT_SYMBOL(module_refcount);
@@ -824,7 +825,7 @@ void module_put(struct module *module)
{
if (module) {
unsigned int cpu = get_cpu();
- local_dec(&module->ref[cpu].count);
+ local_dec(__module_ref_addr(module, cpu));
/* Maybe they're waiting for us to drop reference? */
if (unlikely(!module_is_live(module)))
wake_up_process(module->waiter);
@@ -1392,7 +1393,10 @@ static void free_module(struct module *mod)
kfree(mod->args);
if (mod->percpu)
percpu_modfree(mod->percpu);
-
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ if (mod->refptr)
+ percpu_modfree(mod->refptr);
+#endif
/* Free lock-classes: */
lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -1760,6 +1764,9 @@ static struct module *load_module(void __user *umod,
unsigned int markersstringsindex;
struct module *mod;
long err = 0;
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ void *refptr = NULL;
+#endif
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
struct exception_table_entry *extable;
mm_segment_t old_fs;
@@ -1904,6 +1911,12 @@ static struct module *load_module(void __user *umod,
if (err < 0)
goto free_mod;
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ refptr = percpu_modalloc(sizeof(local_t), sizeof(local_t), mod->name);
+ if (!refptr)
+ goto free_mod;
+ mod->refptr = refptr;
+#endif
if (pcpuindex) {
/* We have a special allocation for this section. */
percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
@@ -1911,7 +1924,7 @@ static struct module *load_module(void __user *umod,
mod->name);
if (!percpu) {
err = -ENOMEM;
- goto free_mod;
+ goto free_percpu;
}
sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
mod->percpu = percpu;
@@ -2164,6 +2177,10 @@ static struct module *load_module(void __user *umod,
free_percpu:
if (percpu)
percpu_modfree(percpu);
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ if (refptr)
+ percpu_modfree(refptr);
+#endif
free_mod:
kfree(args);
free_hdr:
^ permalink raw reply related [flat|nested] 14+ messages in thread* Re: [PATCH] modules: Use a better scheme for refcounting
2008-05-15 20:40 [PATCH] modules: Use a better scheme for refcounting Eric Dumazet
@ 2008-05-15 21:57 ` Andi Kleen
2008-05-16 4:44 ` Eric Dumazet
2008-05-16 0:09 ` Rusty Russell
1 sibling, 1 reply; 14+ messages in thread
From: Andi Kleen @ 2008-05-15 21:57 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Andrew Morton, Rusty Russell, linux kernel, Mike Travis
Eric Dumazet <dada1@cosmosbay.com> writes:
> Instead of wasting NR_CPUS*128 bytes (on i386), we now use
> num_possible_cpus*sizeof(local_t) bytes.
>
> On a typical distro, where NR_CPUS=8,
More typical would be NR_CPUS=128, with NR_CPUS=four digits
when Mike Travis et.al. are finished
> shiping 2000 modules, we reduce
Surely only the loaded modules count? Perhaps 20-30.
But it's a cool improvement. Very nice.
-Andi
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] modules: Use a better scheme for refcounting
2008-05-15 21:57 ` Andi Kleen
@ 2008-05-16 4:44 ` Eric Dumazet
0 siblings, 0 replies; 14+ messages in thread
From: Eric Dumazet @ 2008-05-16 4:44 UTC (permalink / raw)
To: Andi Kleen; +Cc: Andrew Morton, Rusty Russell, linux kernel, Mike Travis
Andi Kleen a écrit :
> Eric Dumazet <dada1@cosmosbay.com> writes:
>
>> Instead of wasting NR_CPUS*128 bytes (on i386), we now use
>> num_possible_cpus*sizeof(local_t) bytes.
>>
>> On a typical distro, where NR_CPUS=8,
>>
>
> More typical would be NR_CPUS=128, with NR_CPUS=four digits
> when Mike Travis et.al. are finished
>
>
Yes. With NR_CPUS=4096, we save about half a megabyte per module. (!!!)
>> shiping 2000 modules, we reduce
>>
>
> Surely only the loaded modules count? Perhaps 20-30.
>
>
Well, I should have stated that this saving also takes place in the
module disk file.
(The "struct module" is included in it, in the
".gnu.linkonce.this_module" section)
Here on my distro, around 2000 modules are shipped.
> But it's a cool improvement. Very nice.
>
Thanks
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] modules: Use a better scheme for refcounting
2008-05-15 20:40 [PATCH] modules: Use a better scheme for refcounting Eric Dumazet
2008-05-15 21:57 ` Andi Kleen
@ 2008-05-16 0:09 ` Rusty Russell
2008-05-16 5:29 ` Eric Dumazet
1 sibling, 1 reply; 14+ messages in thread
From: Rusty Russell @ 2008-05-16 0:09 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Andrew Morton, linux kernel, Mike Travis
On Friday 16 May 2008 06:40:37 Eric Dumazet wrote:
> Current refcounting for modules (done if CONFIG_MODULE_UNLOAD=y)
> is using a lot of memory.
Hi Eric,
I like this patch! The plan was always to create a proper dynamic per-cpu
allocator which used the normal per-cpu offsets, but I think module refcounts
are worthwhile as a special case.
Any chance I can ask you look at the issue of full dynamic per-cpu
allocation? The problem of allocating memory which is laid out precisely
as the original per-cpu alloc is vexing on NUMA, and probably requires
reserving virtual address space and remapping into it, but the rewards
would be maximally-efficient per-cpu accessors, and getting rid of that
boutique allocator in module.c.
Only minor commentry on the patch itself:
> +#ifdef CONFIG_SMP
> + char *refptr;
> +#else
void * would seem more natural here (love those gcc'isms)
> +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
> + void *refptr = NULL;
> +#endif
Looks like you can skip this if you assign to mod->refptr directly below:
> +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
> + refptr = percpu_modalloc(sizeof(local_t), sizeof(local_t), mod->name);
> + if (!refptr)
> + goto free_mod;
> + mod->refptr = refptr;
> +#endif
And finally:
> +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
> + if (refptr)
> + percpu_modfree(refptr);
> +#endif
This if (refptr) seems redundant.
Thanks!
Rusty.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] modules: Use a better scheme for refcounting
2008-05-16 0:09 ` Rusty Russell
@ 2008-05-16 5:29 ` Eric Dumazet
2008-05-16 13:41 ` Mike Travis
0 siblings, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2008-05-16 5:29 UTC (permalink / raw)
To: Rusty Russell; +Cc: Andrew Morton, linux kernel, Mike Travis
[-- Attachment #1: Type: text/plain, Size: 3585 bytes --]
Rusty Russell a écrit :
> On Friday 16 May 2008 06:40:37 Eric Dumazet wrote:
>
>> Current refcounting for modules (done if CONFIG_MODULE_UNLOAD=y)
>> is using a lot of memory.
>>
>
> Hi Eric,
>
> I like this patch! The plan was always to create a proper dynamic per-cpu
> allocator which used the normal per-cpu offsets, but I think module refcounts
> are worthwhile as a special case.
>
> Any chance I can ask you look at the issue of full dynamic per-cpu
> allocation? The problem of allocating memory which is laid out precisely
> as the original per-cpu alloc is vexing on NUMA, and probably requires
> reserving virtual address space and remapping into it, but the rewards
> would be maximally-efficient per-cpu accessors, and getting rid of that
> boutique allocator in module.c.
>
>
You mean using alloc_percpu() ? Problem is that current implementation
is expensive, since it is using
an extra array of pointers (struct percpu_data). On x86_64, that means
at least a 200% space increase
over the solution of using 4 bytes in the static percpu zone. We
probably can change this to dynamic
per-cpu as soon as Mike or Christopher finish their work on new dynamic
per-cpu implementation ?
> Only minor commentry on the patch itself:
>
>
>> +#ifdef CONFIG_SMP
>> + char *refptr;
>> +#else
>>
>
> void * would seem more natural here (love those gcc'isms)
>
>
Yes, sure.
>> +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
>> + void *refptr = NULL;
>> +#endif
>>
>
> Looks like you can skip this if you assign to mod->refptr directly below:
>
>
>> +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
>> + refptr = percpu_modalloc(sizeof(local_t), sizeof(local_t), mod->name);
>> + if (!refptr)
>> + goto free_mod;
>> + mod->refptr = refptr;
>> +#endif
>>
>
>
Unfortunately no. I tried it and failed.
This is the problem that at freeing time, we cannot anymore use mod->refptr,
since we just unmaped mod, using vfree().
You had the same problem with percpu, and had to use a temporary variable on
the stack :)
> And finally:
>
>
>> +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
>> + if (refptr)
>> + percpu_modfree(refptr);
>> +#endif
>>
>
> This if (refptr) seems redundant.
>
>
Yes, thanks a lot. Please find an updated patch.
> Thanks!
> Rusty.
>
>
>
[PATCH] modules: Use a better scheme for refcounting
Current refcounting for modules (done if CONFIG_MODULE_UNLOAD=y)
is using a lot of memory (and disk space)
Each 'struct module' contains an [NR_CPUS] array of full cache lines.
This patch uses existing infrastructure (percpu_modalloc() &
percpu_modfree())
to allocate percpu space for the refcount storage.
Instead of wasting NR_CPUS*128 bytes (on i386), we now use
num_possible_cpus*sizeof(local_t) bytes.
On a typical distro, where NR_CPUS=8, shipping 2000 modules, we reduce
size of module files by about 2 Mbytes. (1Kb per module)
Instead of having all refcounters in the same memory node - with TLB misses
because of vmalloc() - this new implementation permits to have better
NUMA properties, since each CPU will use storage on its preferred node,
thanks to percpu storage.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
include/linux/module.h | 25 +++++++++++++++---------
kernel/module.c | 40 +++++++++++++++++++++++++++++----------
2 files changed, 46 insertions(+), 19 deletions(-)
[-- Attachment #2: module.patch --]
[-- Type: text/plain, Size: 4871 bytes --]
diff --git a/include/linux/module.h b/include/linux/module.h
index 3e03b1a..dfd36ed 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -217,11 +217,6 @@ void *__symbol_get_gpl(const char *symbol);
#endif
-struct module_ref
-{
- local_t count;
-} ____cacheline_aligned;
-
enum module_state
{
MODULE_STATE_LIVE,
@@ -307,8 +302,11 @@ struct module
#ifdef CONFIG_MODULE_UNLOAD
/* Reference counts */
- struct module_ref ref[NR_CPUS];
-
+#ifdef CONFIG_SMP
+ void *refptr;
+#else
+ local_t ref;
+#endif
/* What modules depend on me? */
struct list_head modules_which_use_me;
@@ -378,13 +376,22 @@ void __symbol_put(const char *symbol);
#define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x)
void symbol_put_addr(void *addr);
+static inline local_t *__module_ref_addr(struct module *mod, int cpu)
+{
+#ifdef CONFIG_SMP
+ return (local_t *) (mod->refptr + per_cpu_offset(cpu));
+#else
+ return &mod->ref;
+#endif
+}
+
/* Sometimes we know we already have a refcount, and it's easier not
to handle the error case (which only happens with rmmod --wait). */
static inline void __module_get(struct module *module)
{
if (module) {
BUG_ON(module_refcount(module) == 0);
- local_inc(&module->ref[get_cpu()].count);
+ local_inc(__module_ref_addr(module, get_cpu()));
put_cpu();
}
}
@@ -396,7 +403,7 @@ static inline int try_module_get(struct module *module)
if (module) {
unsigned int cpu = get_cpu();
if (likely(module_is_live(module)))
- local_inc(&module->ref[cpu].count);
+ local_inc(__module_ref_addr(module, cpu));
else
ret = 0;
put_cpu();
diff --git a/kernel/module.c b/kernel/module.c
index f5e9491..e82b2d9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -522,13 +522,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
/* Init the unload section of the module. */
static void module_unload_init(struct module *mod)
{
- unsigned int i;
+ int cpu;
INIT_LIST_HEAD(&mod->modules_which_use_me);
- for (i = 0; i < NR_CPUS; i++)
- local_set(&mod->ref[i].count, 0);
+ for_each_possible_cpu(cpu)
+ local_set(__module_ref_addr(mod, cpu), 0);
/* Hold reference count during initialization. */
- local_set(&mod->ref[raw_smp_processor_id()].count, 1);
+ local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
/* Backwards compatibility macros put refcount during init. */
mod->waiter = current;
}
@@ -659,10 +659,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
unsigned int module_refcount(struct module *mod)
{
- unsigned int i, total = 0;
+ unsigned int total = 0;
+ int cpu;
- for (i = 0; i < NR_CPUS; i++)
- total += local_read(&mod->ref[i].count);
+ for_each_possible_cpu(cpu)
+ total += local_read(__module_ref_addr(mod, cpu));
return total;
}
EXPORT_SYMBOL(module_refcount);
@@ -824,7 +825,7 @@ void module_put(struct module *module)
{
if (module) {
unsigned int cpu = get_cpu();
- local_dec(&module->ref[cpu].count);
+ local_dec(__module_ref_addr(module, cpu));
/* Maybe they're waiting for us to drop reference? */
if (unlikely(!module_is_live(module)))
wake_up_process(module->waiter);
@@ -1392,7 +1393,9 @@ static void free_module(struct module *mod)
kfree(mod->args);
if (mod->percpu)
percpu_modfree(mod->percpu);
-
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ percpu_modfree(mod->refptr);
+#endif
/* Free lock-classes: */
lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -1760,6 +1763,9 @@ static struct module *load_module(void __user *umod,
unsigned int markersstringsindex;
struct module *mod;
long err = 0;
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ void *refptr = NULL;
+#endif
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
struct exception_table_entry *extable;
mm_segment_t old_fs;
@@ -1904,6 +1910,16 @@ static struct module *load_module(void __user *umod,
if (err < 0)
goto free_mod;
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ refptr = percpu_modalloc(sizeof(local_t),
+ sizeof(local_t),
+ mod->name);
+ if (!refptr) {
+ err = -ENOMEM;
+ goto free_mod;
+ }
+ mod->refptr = refptr;
+#endif
if (pcpuindex) {
/* We have a special allocation for this section. */
percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
@@ -1911,7 +1927,7 @@ static struct module *load_module(void __user *umod,
mod->name);
if (!percpu) {
err = -ENOMEM;
- goto free_mod;
+ goto free_refptr;
}
sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
mod->percpu = percpu;
@@ -2164,6 +2180,10 @@ static struct module *load_module(void __user *umod,
free_percpu:
if (percpu)
percpu_modfree(percpu);
+ free_refptr:
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ percpu_modfree(refptr);
+#endif
free_mod:
kfree(args);
free_hdr:
^ permalink raw reply related [flat|nested] 14+ messages in thread* Re: [PATCH] modules: Use a better scheme for refcounting
2008-05-16 5:29 ` Eric Dumazet
@ 2008-05-16 13:41 ` Mike Travis
2008-05-17 5:33 ` Rusty Russell
0 siblings, 1 reply; 14+ messages in thread
From: Mike Travis @ 2008-05-16 13:41 UTC (permalink / raw)
To: Eric Dumazet
Cc: Rusty Russell, Andrew Morton, linux kernel, Christoph Lameter
Eric Dumazet wrote:
> Rusty Russell a écrit :
...
>>
>> Hi Eric,
>>
>> I like this patch! The plan was always to create a proper dynamic
>> per-cpu
>> allocator which used the normal per-cpu offsets, but I think module
>> refcounts
>> are worthwhile as a special case.
>>
>> Any chance I can ask you look at the issue of full dynamic per-cpu
>> allocation? The problem of allocating memory which is laid out precisely
>> as the original per-cpu alloc is vexing on NUMA, and probably requires
>> reserving virtual address space and remapping into it, but the rewards
>> would be maximally-efficient per-cpu accessors, and getting rid of that
>> boutique allocator in module.c.
>>
>>
> You mean using alloc_percpu() ? Problem is that current implementation
> is expensive, since it is using
> an extra array of pointers (struct percpu_data). On x86_64, that means
> at least a 200% space increase
> over the solution of using 4 bytes in the static percpu zone. We
> probably can change this to dynamic
> per-cpu as soon as Mike or Christopher finish their work on new dynamic
> per-cpu implementation ?
Yes, the zero-based percpu variables followed by the cpu_alloc patch should
provide this and shrink the code quite well, including in some cases removing
locking requirements (because the resultant instructions will be atomic.)
Thanks,
Mike
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] modules: Use a better scheme for refcounting
2008-05-16 13:41 ` Mike Travis
@ 2008-05-17 5:33 ` Rusty Russell
2008-05-17 7:36 ` Eric Dumazet
2008-05-19 16:39 ` Christoph Lameter
0 siblings, 2 replies; 14+ messages in thread
From: Rusty Russell @ 2008-05-17 5:33 UTC (permalink / raw)
To: Mike Travis; +Cc: Eric Dumazet, Andrew Morton, linux kernel, Christoph Lameter
On Friday 16 May 2008 23:41:16 Mike Travis wrote:
> Eric Dumazet wrote:
> > Rusty Russell a écrit :
> >> Any chance I can ask you look at the issue of full dynamic per-cpu
> >> allocation?
> >
> > You mean using alloc_percpu() ? Problem is that current implementation
> > is expensive,
I mean rewriting alloc_percpu :)
> > We probably can change this to dynamic per-cpu as soon as Mike or
> > Christopher finish their work on new dynamic per-cpu implementation ?
>
> Yes, the zero-based percpu variables followed by the cpu_alloc patch should
> provide this and shrink the code quite well, including in some cases
> removing locking requirements (because the resultant instructions will be
> atomic.)
Ah, I hadn't realized that Mike was already working on this. Mike, have you
published patches already?
Thanks,
Rusty.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] modules: Use a better scheme for refcounting
2008-05-17 5:33 ` Rusty Russell
@ 2008-05-17 7:36 ` Eric Dumazet
2008-05-18 14:31 ` Rusty Russell
2008-05-19 16:41 ` Christoph Lameter
2008-05-19 16:39 ` Christoph Lameter
1 sibling, 2 replies; 14+ messages in thread
From: Eric Dumazet @ 2008-05-17 7:36 UTC (permalink / raw)
To: Rusty Russell; +Cc: Mike Travis, Andrew Morton, linux kernel, Christoph Lameter
Rusty Russell a écrit :
> On Friday 16 May 2008 23:41:16 Mike Travis wrote:
>
>> Eric Dumazet wrote:
>>
>>> Rusty Russell a écrit :
>>>
>>>> Any chance I can ask you look at the issue of full dynamic per-cpu
>>>> allocation?
>>>>
>>> You mean using alloc_percpu() ? Problem is that current implementation
>>> is expensive,
>>>
>
> I mean rewriting alloc_percpu :)
>
>
>>> We probably can change this to dynamic per-cpu as soon as Mike or
>>> Christopher finish their work on new dynamic per-cpu implementation ?
>>>
>> Yes, the zero-based percpu variables followed by the cpu_alloc patch should
>> provide this and shrink the code quite well, including in some cases
>> removing locking requirements (because the resultant instructions will be
>> atomic.)
>>
>
> Ah, I hadn't realized that Mike was already working on this. Mike, have you
> published patches already?
>
>
Christoph Lameter made good work last year and apparently the path is to :
1) Put pda at the begining of percpu section.
2) Relocate percpu variables to begin at zero
3) Use %gs (or %fs) register to address pda AND percpu section.
4) Reserve an appropriate large virtual zone into percpu section so that
it can fullfill :
- percpu "static" allocations needed by module loader.
- per_cpu "dynamic" allocations
(No more need for an array of pointers to find the address of dynamic
per_cpu variable)
- Eventually allocates PAGES dynamically into this zone to satisfy any
size of percpu needs.
(But with a limit of say 256 MB per cpu on x86_64)
That's a lot of work, and apparently Mike and Christoph are doing it.
Some pointers to previous work:
http://groups.google.pl/group/linux.kernel/browse_thread/thread/f2ff6901ca6ae9fc/b2ed3b7f3612a157?lnk=raot
http://kerneltrap.org/mailarchive/linux-kernel/2008/2/1/683104
http://linux.derkeiler.com/Mailing-Lists/Kernel/2008-01/msg09052.html
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] modules: Use a better scheme for refcounting
2008-05-17 7:36 ` Eric Dumazet
@ 2008-05-18 14:31 ` Rusty Russell
2008-05-19 16:42 ` Christoph Lameter
2008-05-19 16:41 ` Christoph Lameter
1 sibling, 1 reply; 14+ messages in thread
From: Rusty Russell @ 2008-05-18 14:31 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Mike Travis, Andrew Morton, linux kernel, Christoph Lameter
On Saturday 17 May 2008 17:36:21 Eric Dumazet wrote:
> Christoph Lameter made good work last year and apparently the path is to :
>
> 1) Put pda at the begining of percpu section.
Well, we killed the pda on i386: it's basically a limited arch-specific percpu
section. Unfortunately, ISTR that doing so kills -fstack-protector on x86-64
(gcc was explicitly patched to support the kernel doing this, but it nailed
us to a fixed offset off %fs). But the PDA is a redundant idea which should
be removed.
> 2) Relocate percpu variables to begin at zero
> 3) Use %gs (or %fs) register to address pda AND percpu section.
Thanks for the prod Eric. ISTR acking those patches; Christoph, what happened
to them?
Cheers,
Rusty.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] modules: Use a better scheme for refcounting
2008-05-17 7:36 ` Eric Dumazet
2008-05-18 14:31 ` Rusty Russell
@ 2008-05-19 16:41 ` Christoph Lameter
2008-05-19 18:04 ` Mike Travis
1 sibling, 1 reply; 14+ messages in thread
From: Christoph Lameter @ 2008-05-19 16:41 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Rusty Russell, Mike Travis, Andrew Morton, linux kernel
On Sat, 17 May 2008, Eric Dumazet wrote:
> Thats a lot of work and apparently Mike and Christoph are doing it.
Indeed. Thanks for the support. I hope Mike gets the stuff out soon. I
know he was working on one issue that cause the zero based stuff to break.
Maybe it would help if he would post what he has so far?
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] modules: Use a better scheme for refcounting
2008-05-19 16:41 ` Christoph Lameter
@ 2008-05-19 18:04 ` Mike Travis
0 siblings, 0 replies; 14+ messages in thread
From: Mike Travis @ 2008-05-19 18:04 UTC (permalink / raw)
To: Christoph Lameter
Cc: Eric Dumazet, Rusty Russell, Andrew Morton, linux kernel
Christoph Lameter wrote:
> On Sat, 17 May 2008, Eric Dumazet wrote:
>
>> Thats a lot of work and apparently Mike and Christoph are doing it.
>
> Indeed. Thanks for the support. I hope Mike gets the stuff out soon. I
> know he was working on one issue that cause the zero based stuff to break.
> Maybe it would help if he would post what he has so far?
I'm re-basing it on the x86/latest tree as sched/latest was failing my tests
last week on both the AMD and Intel boxes. As soon as I re-verify everything
I'll send it in.
Thanks,
Mike
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] modules: Use a better scheme for refcounting
2008-05-17 5:33 ` Rusty Russell
2008-05-17 7:36 ` Eric Dumazet
@ 2008-05-19 16:39 ` Christoph Lameter
1 sibling, 0 replies; 14+ messages in thread
From: Christoph Lameter @ 2008-05-19 16:39 UTC (permalink / raw)
To: Rusty Russell; +Cc: Mike Travis, Eric Dumazet, Andrew Morton, linux kernel
On Sat, 17 May 2008, Rusty Russell wrote:
> > > You mean using alloc_percpu() ? Problem is that current implementation
> > > is expensive,
>
> I mean rewriting alloc_percpu :)
cpu_alloc is going to replace that completely.
>
> > > We probably can change this to dynamic per-cpu as soon as Mike or
> > > Christopher finish their work on new dynamic per-cpu implementation ?
> >
> > Yes, the zero-based percpu variables followed by the cpu_alloc patch should
> > provide this and shrink the code quite well, including in some cases
> > removing locking requirements (because the resultant instructions will be
> > atomic.)
>
> Ah, I hadn't realized that Mike was already working on this. Mike, have you
> published patches already?
He is reworking my patches. He can use some encouragement though...
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH] modules: Use a better scheme for refcounting
@ 2009-02-03 3:01 Rusty Russell
0 siblings, 0 replies; 14+ messages in thread
From: Rusty Russell @ 2009-02-03 3:01 UTC (permalink / raw)
To: Linus Torvalds; +Cc: linux-kernel, Eric Biederman
From: Eric Dumazet <dada1@cosmosbay.com>
(Linus, please apply. This has become critical with NR_CPUS 4096: we
currently use an astonishing 1GB additional space for 2000 modules.)
Current refcounting for modules (done if CONFIG_MODULE_UNLOAD=y)
is using a lot of memory.
Each 'struct module' contains an [NR_CPUS] array of full cache lines.
This patch uses existing infrastructure (percpu_modalloc() &
percpu_modfree()) to allocate percpu space for the refcount storage.
Instead of wasting NR_CPUS*128 bytes (on i386), we now use
nr_cpu_ids*sizeof(local_t) bytes.
On a typical distro, where NR_CPUS=8, shipping 2000 modules, we reduce
size of module files by about 2 Mbytes. (1Kb per module)
Instead of having all refcounters in the same memory node - with TLB misses
because of vmalloc() - this new implementation permits to have better
NUMA properties, since each CPU will use storage on its preferred node,
thanks to percpu storage.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
include/linux/module.h | 25 ++++++++++++++++---------
kernel/module.c | 35 +++++++++++++++++++++++++----------
2 files changed, 41 insertions(+), 19 deletions(-)
diff --git a/include/linux/module.h b/include/linux/module.h
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -219,11 +219,6 @@ void *__symbol_get_gpl(const char *symbo
#endif
-struct module_ref
-{
- local_t count;
-} ____cacheline_aligned;
-
enum module_state
{
MODULE_STATE_LIVE,
@@ -344,8 +339,11 @@ struct module
/* Destruction function. */
void (*exit)(void);
- /* Reference counts */
- struct module_ref ref[NR_CPUS];
+#ifdef CONFIG_SMP
+ char *refptr;
+#else
+ local_t ref;
+#endif
#endif
};
#ifndef MODULE_ARCH_INIT
@@ -395,13 +393,22 @@ void __symbol_put(const char *symbol);
#define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x)
void symbol_put_addr(void *addr);
+static inline local_t *__module_ref_addr(struct module *mod, int cpu)
+{
+#ifdef CONFIG_SMP
+ return (local_t *) (mod->refptr + per_cpu_offset(cpu));
+#else
+ return &mod->ref;
+#endif
+}
+
/* Sometimes we know we already have a refcount, and it's easier not
to handle the error case (which only happens with rmmod --wait). */
static inline void __module_get(struct module *module)
{
if (module) {
BUG_ON(module_refcount(module) == 0);
- local_inc(&module->ref[get_cpu()].count);
+ local_inc(__module_ref_addr(module, get_cpu()));
put_cpu();
}
}
@@ -413,7 +420,7 @@ static inline int try_module_get(struct
if (module) {
unsigned int cpu = get_cpu();
if (likely(module_is_live(module)))
- local_inc(&module->ref[cpu].count);
+ local_inc(__module_ref_addr(module, cpu));
else
ret = 0;
put_cpu();
diff --git a/kernel/module.c b/kernel/module.c
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -573,13 +573,13 @@ static char last_unloaded_module[MODULE_
/* Init the unload section of the module. */
static void module_unload_init(struct module *mod)
{
- unsigned int i;
+ int cpu;
INIT_LIST_HEAD(&mod->modules_which_use_me);
- for (i = 0; i < NR_CPUS; i++)
- local_set(&mod->ref[i].count, 0);
+ for_each_possible_cpu(cpu)
+ local_set(__module_ref_addr(mod, cpu), 0);
/* Hold reference count during initialization. */
- local_set(&mod->ref[raw_smp_processor_id()].count, 1);
+ local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
/* Backwards compatibility macros put refcount during init. */
mod->waiter = current;
}
@@ -717,10 +717,11 @@ static int try_stop_module(struct module
unsigned int module_refcount(struct module *mod)
{
- unsigned int i, total = 0;
+ unsigned int total = 0;
+ int cpu;
- for (i = 0; i < NR_CPUS; i++)
- total += local_read(&mod->ref[i].count);
+ for_each_possible_cpu(cpu)
+ total += local_read(__module_ref_addr(mod, cpu));
return total;
}
EXPORT_SYMBOL(module_refcount);
@@ -894,7 +895,7 @@ void module_put(struct module *module)
{
if (module) {
unsigned int cpu = get_cpu();
- local_dec(&module->ref[cpu].count);
+ local_dec(__module_ref_addr(module, cpu));
/* Maybe they're waiting for us to drop reference? */
if (unlikely(!module_is_live(module)))
wake_up_process(module->waiter);
@@ -1464,7 +1465,10 @@ static void free_module(struct module *m
kfree(mod->args);
if (mod->percpu)
percpu_modfree(mod->percpu);
-
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ if (mod->refptr)
+ percpu_modfree(mod->refptr);
+#endif
/* Free lock-classes: */
lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -2011,6 +2015,14 @@ static noinline struct module *load_modu
if (err < 0)
goto free_mod;
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
+ mod->name);
+ if (!mod->refptr) {
+ err = -ENOMEM;
+ goto free_mod;
+ }
+#endif
if (pcpuindex) {
/* We have a special allocation for this section. */
percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
@@ -2018,7 +2030,7 @@ static noinline struct module *load_modu
mod->name);
if (!percpu) {
err = -ENOMEM;
- goto free_mod;
+ goto free_percpu;
}
sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
mod->percpu = percpu;
@@ -2282,6 +2294,9 @@ static noinline struct module *load_modu
free_percpu:
if (percpu)
percpu_modfree(percpu);
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ percpu_modfree(mod->refptr);
+#endif
free_mod:
kfree(args);
free_hdr:
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2009-02-03 3:01 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-05-15 20:40 [PATCH] modules: Use a better scheme for refcounting Eric Dumazet
2008-05-15 21:57 ` Andi Kleen
2008-05-16 4:44 ` Eric Dumazet
2008-05-16 0:09 ` Rusty Russell
2008-05-16 5:29 ` Eric Dumazet
2008-05-16 13:41 ` Mike Travis
2008-05-17 5:33 ` Rusty Russell
2008-05-17 7:36 ` Eric Dumazet
2008-05-18 14:31 ` Rusty Russell
2008-05-19 16:42 ` Christoph Lameter
2008-05-19 16:41 ` Christoph Lameter
2008-05-19 18:04 ` Mike Travis
2008-05-19 16:39 ` Christoph Lameter
-- strict thread matches above, loose matches on Subject: below --
2009-02-03 3:01 Rusty Russell
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox