public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] modules: Use a better scheme for refcounting
@ 2008-05-15 20:40 Eric Dumazet
  2008-05-15 21:57 ` Andi Kleen
  2008-05-16  0:09 ` Rusty Russell
  0 siblings, 2 replies; 14+ messages in thread
From: Eric Dumazet @ 2008-05-15 20:40 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Rusty Russell, linux kernel, Mike Travis

[-- Attachment #1: Type: text/plain, Size: 1003 bytes --]

Current refcounting for modules (done if CONFIG_MODULE_UNLOAD=y)
is using a lot of memory.

Each 'struct module' contains an [NR_CPUS] array of full cache lines.

This patch uses existing infrastructure (percpu_modalloc() & 
percpu_modfree())
to allocate percpu space for the refcount storage.

Instead of wasting NR_CPUS*128 bytes (on i386), we now use
num_possible_cpus*sizeof(local_t) bytes.

On a typical distro, where NR_CPUS=8, shipping 2000 modules, we reduce
size of module files by about 2 Mbytes. (1Kb per module)

Instead of having all refcounters in the same memory node - with TLB misses
because of vmalloc() - this new implementation permits to have better
NUMA properties, since each  CPU will use storage on its preferred node,
thanks to percpu storage.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/linux/module.h |   25 ++++++++++++++++---------
 kernel/module.c        |   37 +++++++++++++++++++++++++++----------
 2 files changed, 43 insertions(+), 19 deletions(-)


[-- Attachment #2: module.patch --]
[-- Type: text/plain, Size: 4856 bytes --]

diff --git a/include/linux/module.h b/include/linux/module.h
index 3e03b1a..72db614 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -217,11 +217,6 @@ void *__symbol_get_gpl(const char *symbol);
 
 #endif
 
-struct module_ref
-{
-	local_t count;
-} ____cacheline_aligned;
-
 enum module_state
 {
 	MODULE_STATE_LIVE,
@@ -307,8 +302,11 @@ struct module
 
 #ifdef CONFIG_MODULE_UNLOAD
 	/* Reference counts */
-	struct module_ref ref[NR_CPUS];
-
+#ifdef CONFIG_SMP
+	char *refptr;
+#else
+	local_t ref;
+#endif
 	/* What modules depend on me? */
 	struct list_head modules_which_use_me;
 
@@ -378,13 +376,22 @@ void __symbol_put(const char *symbol);
 #define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x)
 void symbol_put_addr(void *addr);
 
+static inline local_t *__module_ref_addr(struct module *mod, int cpu)
+{
+#ifdef CONFIG_SMP
+	return (local_t *) (mod->refptr + per_cpu_offset(cpu));
+#else
+	return &mod->ref;
+#endif
+}
+
 /* Sometimes we know we already have a refcount, and it's easier not
    to handle the error case (which only happens with rmmod --wait). */
 static inline void __module_get(struct module *module)
 {
 	if (module) {
 		BUG_ON(module_refcount(module) == 0);
-		local_inc(&module->ref[get_cpu()].count);
+		local_inc(__module_ref_addr(module, get_cpu()));
 		put_cpu();
 	}
 }
@@ -396,7 +403,7 @@ static inline int try_module_get(struct module *module)
 	if (module) {
 		unsigned int cpu = get_cpu();
 		if (likely(module_is_live(module)))
-			local_inc(&module->ref[cpu].count);
+			local_inc(__module_ref_addr(module, cpu));
 		else
 			ret = 0;
 		put_cpu();
diff --git a/kernel/module.c b/kernel/module.c
index f5e9491..e0e8712 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -522,13 +522,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
 {
-	unsigned int i;
+	int cpu;
 
 	INIT_LIST_HEAD(&mod->modules_which_use_me);
-	for (i = 0; i < NR_CPUS; i++)
-		local_set(&mod->ref[i].count, 0);
+	for_each_possible_cpu(cpu)
+		local_set(__module_ref_addr(mod, cpu), 0);
 	/* Hold reference count during initialization. */
-	local_set(&mod->ref[raw_smp_processor_id()].count, 1);
+	local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
 	/* Backwards compatibility macros put refcount during init. */
 	mod->waiter = current;
 }
@@ -659,10 +659,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
 
 unsigned int module_refcount(struct module *mod)
 {
-	unsigned int i, total = 0;
+	unsigned int total = 0;
+	int cpu;
 
-	for (i = 0; i < NR_CPUS; i++)
-		total += local_read(&mod->ref[i].count);
+	for_each_possible_cpu(cpu)
+		total += local_read(__module_ref_addr(mod, cpu));
 	return total;
 }
 EXPORT_SYMBOL(module_refcount);
@@ -824,7 +825,7 @@ void module_put(struct module *module)
 {
 	if (module) {
 		unsigned int cpu = get_cpu();
-		local_dec(&module->ref[cpu].count);
+		local_dec(__module_ref_addr(module, cpu));
 		/* Maybe they're waiting for us to drop reference? */
 		if (unlikely(!module_is_live(module)))
 			wake_up_process(module->waiter);
@@ -1392,7 +1393,10 @@ static void free_module(struct module *mod)
 	kfree(mod->args);
 	if (mod->percpu)
 		percpu_modfree(mod->percpu);
-
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	if (mod->refptr)
+		percpu_modfree(mod->refptr);
+#endif
 	/* Free lock-classes: */
 	lockdep_free_key_range(mod->module_core, mod->core_size);
 
@@ -1760,6 +1764,9 @@ static struct module *load_module(void __user *umod,
 	unsigned int markersstringsindex;
 	struct module *mod;
 	long err = 0;
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	void *refptr = NULL;
+#endif
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
 	struct exception_table_entry *extable;
 	mm_segment_t old_fs;
@@ -1904,6 +1911,12 @@ static struct module *load_module(void __user *umod,
 	if (err < 0)
 		goto free_mod;
 
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	refptr = percpu_modalloc(sizeof(local_t), sizeof(local_t), mod->name);
+	if (!refptr)
+		goto free_mod;
+	mod->refptr = refptr;
+#endif
 	if (pcpuindex) {
 		/* We have a special allocation for this section. */
 		percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
@@ -1911,7 +1924,7 @@ static struct module *load_module(void __user *umod,
 					 mod->name);
 		if (!percpu) {
 			err = -ENOMEM;
-			goto free_mod;
+			goto free_percpu;
 		}
 		sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
 		mod->percpu = percpu;
@@ -2164,6 +2177,10 @@ static struct module *load_module(void __user *umod,
  free_percpu:
 	if (percpu)
 		percpu_modfree(percpu);
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	if (refptr)
+		percpu_modfree(refptr);
+#endif
  free_mod:
 	kfree(args);
  free_hdr:

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] modules: Use a better scheme for refcounting
  2008-05-15 20:40 [PATCH] modules: Use a better scheme for refcounting Eric Dumazet
@ 2008-05-15 21:57 ` Andi Kleen
  2008-05-16  4:44   ` Eric Dumazet
  2008-05-16  0:09 ` Rusty Russell
  1 sibling, 1 reply; 14+ messages in thread
From: Andi Kleen @ 2008-05-15 21:57 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Andrew Morton, Rusty Russell, linux kernel, Mike Travis

Eric Dumazet <dada1@cosmosbay.com> writes:
> Instead of wasting NR_CPUS*128 bytes (on i386), we now use
> num_possible_cpus*sizeof(local_t) bytes.
>
> On a typical distro, where NR_CPUS=8, 

More typical would be NR_CPUS=128, with NR_CPUS=four digits 
when Mike Travis et al. are finished

> shiping 2000 modules, we reduce

Surely only the loaded modules count? Perhaps 20-30.

But it's a cool improvement. Very nice.

-Andi

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] modules: Use a better scheme for refcounting
  2008-05-15 20:40 [PATCH] modules: Use a better scheme for refcounting Eric Dumazet
  2008-05-15 21:57 ` Andi Kleen
@ 2008-05-16  0:09 ` Rusty Russell
  2008-05-16  5:29   ` Eric Dumazet
  1 sibling, 1 reply; 14+ messages in thread
From: Rusty Russell @ 2008-05-16  0:09 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Andrew Morton, linux kernel, Mike Travis

On Friday 16 May 2008 06:40:37 Eric Dumazet wrote:
> Current refcounting for modules (done if CONFIG_MODULE_UNLOAD=y)
> is using a lot of memory.

Hi Eric,

   I like this patch!  The plan was always to create a proper dynamic per-cpu
allocator which used the normal per-cpu offsets, but I think module refcounts
are worthwhile as a special case.

   Any chance I can ask you look at the issue of full dynamic per-cpu
allocation?  The problem of allocating memory which is laid out precisely
as the original per-cpu alloc is vexing on NUMA, and probably requires
reserving virtual address space and remapping into it, but the rewards
would be maximally-efficient per-cpu accessors, and getting rid of that
boutique allocator in module.c.

Only minor commentry on the patch itself:

> +#ifdef CONFIG_SMP
> +       char *refptr;
> +#else

void * would seem more natural here (love those gcc'isms)

> +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
> +       void *refptr = NULL;
> +#endif

Looks like you can skip this if you assign to mod->refptr directly below:

> +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
> +       refptr = percpu_modalloc(sizeof(local_t), sizeof(local_t), mod->name);
> +       if (!refptr)
> +               goto free_mod;
> +       mod->refptr = refptr;
> +#endif

And finally:

> +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
> +       if (refptr)
> +               percpu_modfree(refptr);
> +#endif

This if (refptr) seems redundant.

Thanks!
Rusty.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] modules: Use a better scheme for refcounting
  2008-05-15 21:57 ` Andi Kleen
@ 2008-05-16  4:44   ` Eric Dumazet
  0 siblings, 0 replies; 14+ messages in thread
From: Eric Dumazet @ 2008-05-16  4:44 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andrew Morton, Rusty Russell, linux kernel, Mike Travis

Andi Kleen a écrit :
> Eric Dumazet <dada1@cosmosbay.com> writes:
>   
>> Instead of wasting NR_CPUS*128 bytes (on i386), we now use
>> num_possible_cpus*sizeof(local_t) bytes.
>>
>> On a typical distro, where NR_CPUS=8, 
>>     
>
> More typical would be NR_CPUS=128, with NR_CPUS=four digits 
> when Mike Travis et.al. are finished
>
>   
Yes. With NR_CPUS=4096, we save about half a megabyte per module. (!!!)

>> shiping 2000 modules, we reduce
>>     
>
> Surely only the loaded modules count? Perhaps 20-30.
>
>   
Well, I should have stated that this saving also takes place in the 
module disk file.
(The "struct module" is included in it, in the 
".gnu.linkonce.this_module" section)
Here on my distro, around 2000 modules are shipped.
> But it's a cool improvement. Very nice.
>   
Thanks





^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] modules: Use a better scheme for refcounting
  2008-05-16  0:09 ` Rusty Russell
@ 2008-05-16  5:29   ` Eric Dumazet
  2008-05-16 13:41     ` Mike Travis
  0 siblings, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2008-05-16  5:29 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Andrew Morton, linux kernel, Mike Travis

[-- Attachment #1: Type: text/plain, Size: 3585 bytes --]

Rusty Russell a écrit :
> On Friday 16 May 2008 06:40:37 Eric Dumazet wrote:
>   
>> Current refcounting for modules (done if CONFIG_MODULE_UNLOAD=y)
>> is using a lot of memory.
>>     
>
> Hi Eric,
>
>    I like this patch!  The plan was always to create a proper dynamic per-cpu
> allocator which used the normal per-cpu offsets, but I think module refcounts
> are worthwhile as a special case.
>
>    Any chance I can ask you look at the issue of full dynamic per-cpu
> allocation?  The problem of allocating memory which is laid out precisely
> as the original per-cpu alloc is vexing on NUMA, and probably requires
> reserving virtual address space and remapping into it, but the rewards
> would be maximally-efficient per-cpu accessors, and getting rid of that
> boutique allocator in module.c.
>
>   
You mean using alloc_percpu() ? Problem is that current implementation 
is expensive, since it is using
an extra array of pointers (struct percpu_data). On x86_64, that means 
at least a 200% space increase
over the solution of using 4 bytes in the static percpu zone. We 
probably can change this to dynamic
per-cpu as soon as Mike or Christopher finish their work on new dynamic 
per-cpu implementation ?
> Only minor commentry on the patch itself:
>
>   
>> +#ifdef CONFIG_SMP
>> +       char *refptr;
>> +#else
>>     
>
> void * would seem more natural here (love those gcc'isms)
>
>   
Yes, sure.
>> +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
>> +       void *refptr = NULL;
>> +#endif
>>     
>
> Looks like you can skip this if you assign to mod->refptr directly below:
>
>   
>> +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
>> +       refptr = percpu_modalloc(sizeof(local_t), sizeof(local_t), mod->name);
>> +       if (!refptr)
>> +               goto free_mod;
>> +       mod->refptr = refptr;
>> +#endif
>>     
>
>   
Unfortunately no. I tried it and failed.
This is the problem that at freeing time, we cannot anymore use mod->refptr,
since we just unmaped mod, using vfree().
You had the same problem with percpu, and had to use a temporary variable on 
the stack :)
> And finally:
>
>   
>> +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
>> +       if (refptr)
>> +               percpu_modfree(refptr);
>> +#endif
>>     
>
> This if (refptr) seems redundant.
>
>   
Yes, thanks a lot. Please find an updated patch.
> Thanks!
> Rusty.
>
>
>   
[PATCH] modules: Use a better scheme for refcounting

Current refcounting for modules (done if CONFIG_MODULE_UNLOAD=y)
is using a lot of memory (and disk space)

Each 'struct module' contains an [NR_CPUS] array of full cache lines.

This patch uses existing infrastructure (percpu_modalloc() & 
percpu_modfree())
to allocate percpu space for the refcount storage.

Instead of wasting NR_CPUS*128 bytes (on i386), we now use
num_possible_cpus*sizeof(local_t) bytes.

On a typical distro, where NR_CPUS=8, shipping 2000 modules, we reduce
size of module files by about 2 Mbytes. (1Kb per module)

Instead of having all refcounters in the same memory node - with TLB misses
because of vmalloc() - this new implementation permits to have better
NUMA properties, since each  CPU will use storage on its preferred node,
thanks to percpu storage.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/linux/module.h |   25 +++++++++++++++---------
 kernel/module.c        |   40 +++++++++++++++++++++++++++++----------
 2 files changed, 46 insertions(+), 19 deletions(-)


[-- Attachment #2: module.patch --]
[-- Type: text/plain, Size: 4871 bytes --]

diff --git a/include/linux/module.h b/include/linux/module.h
index 3e03b1a..dfd36ed 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -217,11 +217,6 @@ void *__symbol_get_gpl(const char *symbol);
 
 #endif
 
-struct module_ref
-{
-	local_t count;
-} ____cacheline_aligned;
-
 enum module_state
 {
 	MODULE_STATE_LIVE,
@@ -307,8 +302,11 @@ struct module
 
 #ifdef CONFIG_MODULE_UNLOAD
 	/* Reference counts */
-	struct module_ref ref[NR_CPUS];
-
+#ifdef CONFIG_SMP
+	void *refptr;
+#else
+	local_t ref;
+#endif
 	/* What modules depend on me? */
 	struct list_head modules_which_use_me;
 
@@ -378,13 +376,22 @@ void __symbol_put(const char *symbol);
 #define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x)
 void symbol_put_addr(void *addr);
 
+static inline local_t *__module_ref_addr(struct module *mod, int cpu)
+{
+#ifdef CONFIG_SMP
+	return (local_t *) (mod->refptr + per_cpu_offset(cpu));
+#else
+	return &mod->ref;
+#endif
+}
+
 /* Sometimes we know we already have a refcount, and it's easier not
    to handle the error case (which only happens with rmmod --wait). */
 static inline void __module_get(struct module *module)
 {
 	if (module) {
 		BUG_ON(module_refcount(module) == 0);
-		local_inc(&module->ref[get_cpu()].count);
+		local_inc(__module_ref_addr(module, get_cpu()));
 		put_cpu();
 	}
 }
@@ -396,7 +403,7 @@ static inline int try_module_get(struct module *module)
 	if (module) {
 		unsigned int cpu = get_cpu();
 		if (likely(module_is_live(module)))
-			local_inc(&module->ref[cpu].count);
+			local_inc(__module_ref_addr(module, cpu));
 		else
 			ret = 0;
 		put_cpu();
diff --git a/kernel/module.c b/kernel/module.c
index f5e9491..e82b2d9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -522,13 +522,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
 {
-	unsigned int i;
+	int cpu;
 
 	INIT_LIST_HEAD(&mod->modules_which_use_me);
-	for (i = 0; i < NR_CPUS; i++)
-		local_set(&mod->ref[i].count, 0);
+	for_each_possible_cpu(cpu)
+		local_set(__module_ref_addr(mod, cpu), 0);
 	/* Hold reference count during initialization. */
-	local_set(&mod->ref[raw_smp_processor_id()].count, 1);
+	local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
 	/* Backwards compatibility macros put refcount during init. */
 	mod->waiter = current;
 }
@@ -659,10 +659,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
 
 unsigned int module_refcount(struct module *mod)
 {
-	unsigned int i, total = 0;
+	unsigned int total = 0;
+	int cpu;
 
-	for (i = 0; i < NR_CPUS; i++)
-		total += local_read(&mod->ref[i].count);
+	for_each_possible_cpu(cpu)
+		total += local_read(__module_ref_addr(mod, cpu));
 	return total;
 }
 EXPORT_SYMBOL(module_refcount);
@@ -824,7 +825,7 @@ void module_put(struct module *module)
 {
 	if (module) {
 		unsigned int cpu = get_cpu();
-		local_dec(&module->ref[cpu].count);
+		local_dec(__module_ref_addr(module, cpu));
 		/* Maybe they're waiting for us to drop reference? */
 		if (unlikely(!module_is_live(module)))
 			wake_up_process(module->waiter);
@@ -1392,7 +1393,9 @@ static void free_module(struct module *mod)
 	kfree(mod->args);
 	if (mod->percpu)
 		percpu_modfree(mod->percpu);
-
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	percpu_modfree(mod->refptr);
+#endif
 	/* Free lock-classes: */
 	lockdep_free_key_range(mod->module_core, mod->core_size);
 
@@ -1760,6 +1763,9 @@ static struct module *load_module(void __user *umod,
 	unsigned int markersstringsindex;
 	struct module *mod;
 	long err = 0;
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	void *refptr = NULL;
+#endif
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
 	struct exception_table_entry *extable;
 	mm_segment_t old_fs;
@@ -1904,6 +1910,16 @@ static struct module *load_module(void __user *umod,
 	if (err < 0)
 		goto free_mod;
 
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	refptr = percpu_modalloc(sizeof(local_t),
+				 sizeof(local_t),
+				 mod->name);
+	if (!refptr) {
+		err = -ENOMEM;
+		goto free_mod;
+	}
+	mod->refptr = refptr;
+#endif
 	if (pcpuindex) {
 		/* We have a special allocation for this section. */
 		percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
@@ -1911,7 +1927,7 @@ static struct module *load_module(void __user *umod,
 					 mod->name);
 		if (!percpu) {
 			err = -ENOMEM;
-			goto free_mod;
+			goto free_refptr;
 		}
 		sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
 		mod->percpu = percpu;
@@ -2164,6 +2180,10 @@ static struct module *load_module(void __user *umod,
  free_percpu:
 	if (percpu)
 		percpu_modfree(percpu);
+ free_refptr:
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	percpu_modfree(refptr);
+#endif
  free_mod:
 	kfree(args);
  free_hdr:

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] modules: Use a better scheme for refcounting
  2008-05-16  5:29   ` Eric Dumazet
@ 2008-05-16 13:41     ` Mike Travis
  2008-05-17  5:33       ` Rusty Russell
  0 siblings, 1 reply; 14+ messages in thread
From: Mike Travis @ 2008-05-16 13:41 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Rusty Russell, Andrew Morton, linux kernel, Christoph Lameter

Eric Dumazet wrote:
> Rusty Russell a écrit :
...
>>
>> Hi Eric,
>>
>>    I like this patch!  The plan was always to create a proper dynamic
>> per-cpu
>> allocator which used the normal per-cpu offsets, but I think module
>> refcounts
>> are worthwhile as a special case.
>>
>>    Any chance I can ask you look at the issue of full dynamic per-cpu
>> allocation?  The problem of allocating memory which is laid out precisely
>> as the original per-cpu alloc is vexing on NUMA, and probably requires
>> reserving virtual address space and remapping into it, but the rewards
>> would be maximally-efficient per-cpu accessors, and getting rid of that
>> boutique allocator in module.c.
>>
>>   
> You mean using alloc_percpu() ? Problem is that current implementation
> is expensive, since it is using
> an extra array of pointers (struct percpu_data). On x86_64, that means
> at least a 200% space increase
> over the solution of using 4 bytes in the static percpu zone. We
> probably can change this to dynamic
> per-cpu as soon as Mike or Christopher finish their work on new dynamic
> per-cpu implementation ?


Yes, the zero-based percpu variables followed by the cpu_alloc patch should
provide this and shrink the code quite well, including in some cases removing
locking requirements (because the resultant instructions will be atomic.)

Thanks,
Mike

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] modules: Use a better scheme for refcounting
  2008-05-16 13:41     ` Mike Travis
@ 2008-05-17  5:33       ` Rusty Russell
  2008-05-17  7:36         ` Eric Dumazet
  2008-05-19 16:39         ` Christoph Lameter
  0 siblings, 2 replies; 14+ messages in thread
From: Rusty Russell @ 2008-05-17  5:33 UTC (permalink / raw)
  To: Mike Travis; +Cc: Eric Dumazet, Andrew Morton, linux kernel, Christoph Lameter

On Friday 16 May 2008 23:41:16 Mike Travis wrote:
> Eric Dumazet wrote:
> > Rusty Russell a écrit :
> >>    Any chance I can ask you look at the issue of full dynamic per-cpu
> >> allocation?
> >
> > You mean using alloc_percpu() ? Problem is that current implementation
> > is expensive,

I mean rewriting alloc_percpu :)

> > We probably can change this to dynamic per-cpu as soon as Mike or
> > Christopher finish their work on new dynamic per-cpu implementation ?
>
> Yes, the zero-based percpu variables followed by the cpu_alloc patch should
> provide this and shrink the code quite well, including in some cases
> removing locking requirements (because the resultant instructions will be
> atomic.)

Ah, I hadn't realized that Mike was already working on this.  Mike, have you 
published patches already?

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] modules: Use a better scheme for refcounting
  2008-05-17  5:33       ` Rusty Russell
@ 2008-05-17  7:36         ` Eric Dumazet
  2008-05-18 14:31           ` Rusty Russell
  2008-05-19 16:41           ` Christoph Lameter
  2008-05-19 16:39         ` Christoph Lameter
  1 sibling, 2 replies; 14+ messages in thread
From: Eric Dumazet @ 2008-05-17  7:36 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Mike Travis, Andrew Morton, linux kernel, Christoph Lameter

Rusty Russell a écrit :
> On Friday 16 May 2008 23:41:16 Mike Travis wrote:
>   
>> Eric Dumazet wrote:
>>     
>>> Rusty Russell a écrit :
>>>       
>>>>    Any chance I can ask you look at the issue of full dynamic per-cpu
>>>> allocation?
>>>>         
>>> You mean using alloc_percpu() ? Problem is that current implementation
>>> is expensive,
>>>       
>
> I mean rewriting alloc_percpu :)
>
>   
>>> We probably can change this to dynamic per-cpu as soon as Mike or
>>> Christopher finish their work on new dynamic per-cpu implementation ?
>>>       
>> Yes, the zero-based percpu variables followed by the cpu_alloc patch should
>> provide this and shrink the code quite well, including in some cases
>> removing locking requirements (because the resultant instructions will be
>> atomic.)
>>     
>
> Ah, I hadn't realized that Mike was already working on this.  Mike, have you 
> published patches already?
>
>   
Christoph Lameter made good work last year and apparently the path is to :

1) Put pda at the beginning of percpu section.
2) Relocate percpu variables to begin at zero
3) Use %gs (or %fs) register to address pda AND percpu section.
4) Reserve an appropriately large virtual zone into percpu section so that 
it can fulfill:
- percpu "static" allocations needed by module loader.
- per_cpu "dynamic" allocations
  (No more need for an array of pointers to find the address of dynamic 
per_cpu variable)
- Eventually allocates PAGES dynamically into this zone to satisfy any 
size of percpu needs.
(But with a limit of say 256 MB per cpu on x86_64)

Thats a lot of work and apparently Mike and Christoph are doing it.

Some pointers to previous work:

http://groups.google.pl/group/linux.kernel/browse_thread/thread/f2ff6901ca6ae9fc/b2ed3b7f3612a157?lnk=raot

http://kerneltrap.org/mailarchive/linux-kernel/2008/2/1/683104

http://linux.derkeiler.com/Mailing-Lists/Kernel/2008-01/msg09052.html





^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] modules: Use a better scheme for refcounting
  2008-05-17  7:36         ` Eric Dumazet
@ 2008-05-18 14:31           ` Rusty Russell
  2008-05-19 16:42             ` Christoph Lameter
  2008-05-19 16:41           ` Christoph Lameter
  1 sibling, 1 reply; 14+ messages in thread
From: Rusty Russell @ 2008-05-18 14:31 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Mike Travis, Andrew Morton, linux kernel, Christoph Lameter

On Saturday 17 May 2008 17:36:21 Eric Dumazet wrote:
> Christoph Lameter made good work last year and apparently the path is to :
>
> 1) Put pda at the begining of percpu section.

Well, we killed the pda on i386: it's basically a limited arch-specific percpu 
section.  Unfortunately, ISTR that doing so kills -fstack-protector on x86-64 
(gcc was explicitly patched to support the kernel doing this, but it nailed 
us to a fixed offset off %fs).  But the PDA is a redundant idea which should 
be removed.

> 2) Relocate percpu variables to begin at zero
> 3) Use %gs (or %fs) register to address pda AND percpu section.

Thanks for the prod Eric.  ISTR acking those patches; Christoph, what happened 
to them?

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] modules: Use a better scheme for refcounting
  2008-05-17  5:33       ` Rusty Russell
  2008-05-17  7:36         ` Eric Dumazet
@ 2008-05-19 16:39         ` Christoph Lameter
  1 sibling, 0 replies; 14+ messages in thread
From: Christoph Lameter @ 2008-05-19 16:39 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Mike Travis, Eric Dumazet, Andrew Morton, linux kernel

On Sat, 17 May 2008, Rusty Russell wrote:

> > > You mean using alloc_percpu() ? Problem is that current implementation
> > > is expensive,
> 
> I mean rewriting alloc_percpu :)

cpu_alloc is going to replace that completely.
> 
> > > We probably can change this to dynamic per-cpu as soon as Mike or
> > > Christopher finish their work on new dynamic per-cpu implementation ?
> >
> > Yes, the zero-based percpu variables followed by the cpu_alloc patch should
> > provide this and shrink the code quite well, including in some cases
> > removing locking requirements (because the resultant instructions will be
> > atomic.)
> 
> Ah, I hadn't realized that Mike was already working on this.  Mike, have you 
> published patches already?

He is reworking my patches. He can use some encouragement though...


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] modules: Use a better scheme for refcounting
  2008-05-17  7:36         ` Eric Dumazet
  2008-05-18 14:31           ` Rusty Russell
@ 2008-05-19 16:41           ` Christoph Lameter
  2008-05-19 18:04             ` Mike Travis
  1 sibling, 1 reply; 14+ messages in thread
From: Christoph Lameter @ 2008-05-19 16:41 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Rusty Russell, Mike Travis, Andrew Morton, linux kernel

On Sat, 17 May 2008, Eric Dumazet wrote:

> Thats a lot of work and apparently Mike and Christoph are doing it.

Indeed. Thanks for the support. I hope Mike gets the stuff out soon. I 
know he was working on one issue that caused the zero-based stuff to break. 
Maybe it would help if he would post what he has so far?


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] modules: Use a better scheme for refcounting
  2008-05-18 14:31           ` Rusty Russell
@ 2008-05-19 16:42             ` Christoph Lameter
  0 siblings, 0 replies; 14+ messages in thread
From: Christoph Lameter @ 2008-05-19 16:42 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Eric Dumazet, Mike Travis, Andrew Morton, linux kernel

On Mon, 19 May 2008, Rusty Russell wrote:

> Thanks for the prod Eric.  ISTR acking those patches; Christoph, what happened 
> to them?

Mike is handling them.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] modules: Use a better scheme for refcounting
  2008-05-19 16:41           ` Christoph Lameter
@ 2008-05-19 18:04             ` Mike Travis
  0 siblings, 0 replies; 14+ messages in thread
From: Mike Travis @ 2008-05-19 18:04 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Eric Dumazet, Rusty Russell, Andrew Morton, linux kernel

Christoph Lameter wrote:
> On Sat, 17 May 2008, Eric Dumazet wrote:
> 
>> Thats a lot of work and apparently Mike and Christoph are doing it.
> 
> Indeed. Thanks for the support. I hope Mike gets the stuff out soon. I 
> know he was working on one issue that cause the zero based stuff to break. 
> Maybe it would help if he would post what he has so far?

I'm re-basing it on the x86/latest tree as sched/latest was failing my tests
last week on both the AMD and Intel boxes.  As soon as I re-verify everything
I'll send it in.

Thanks,
Mike

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] modules: Use a better scheme for refcounting
@ 2009-02-03  3:01 Rusty Russell
  0 siblings, 0 replies; 14+ messages in thread
From: Rusty Russell @ 2009-02-03  3:01 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, Eric Biederman

From: Eric Dumazet <dada1@cosmosbay.com>

(Linus, please apply.  This has become critical with NR_CPUS 4096: we
currently use an astonishing 1GB additional space for 2000 modules.)

Current refcounting for modules (done if CONFIG_MODULE_UNLOAD=y)
is using a lot of memory.

Each 'struct module' contains an [NR_CPUS] array of full cache lines.

This patch uses existing infrastructure (percpu_modalloc() &
percpu_modfree()) to allocate percpu space for the refcount storage.

Instead of wasting NR_CPUS*128 bytes (on i386), we now use
nr_cpu_ids*sizeof(local_t) bytes.

On a typical distro, where NR_CPUS=8, shipping 2000 modules, we reduce
size of module files by about 2 Mbytes. (1Kb per module)

Instead of having all refcounters in the same memory node - with TLB misses
because of vmalloc() - this new implementation permits to have better
NUMA properties, since each  CPU will use storage on its preferred node,
thanks to percpu storage.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h |   25 ++++++++++++++++---------
 kernel/module.c        |   35 +++++++++++++++++++++++++----------
 2 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/include/linux/module.h b/include/linux/module.h
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -219,11 +219,6 @@ void *__symbol_get_gpl(const char *symbo
 
 #endif
 
-struct module_ref
-{
-	local_t count;
-} ____cacheline_aligned;
-
 enum module_state
 {
 	MODULE_STATE_LIVE,
@@ -344,8 +339,11 @@ struct module
 	/* Destruction function. */
 	void (*exit)(void);
 
-	/* Reference counts */
-	struct module_ref ref[NR_CPUS];
+#ifdef CONFIG_SMP
+	char *refptr;
+#else
+	local_t ref;
+#endif
 #endif
 };
 #ifndef MODULE_ARCH_INIT
@@ -395,13 +393,22 @@ void __symbol_put(const char *symbol);
 #define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x)
 void symbol_put_addr(void *addr);
 
+static inline local_t *__module_ref_addr(struct module *mod, int cpu)
+{
+#ifdef CONFIG_SMP
+	return (local_t *) (mod->refptr + per_cpu_offset(cpu));
+#else
+	return &mod->ref;
+#endif
+}
+
 /* Sometimes we know we already have a refcount, and it's easier not
    to handle the error case (which only happens with rmmod --wait). */
 static inline void __module_get(struct module *module)
 {
 	if (module) {
 		BUG_ON(module_refcount(module) == 0);
-		local_inc(&module->ref[get_cpu()].count);
+		local_inc(__module_ref_addr(module, get_cpu()));
 		put_cpu();
 	}
 }
@@ -413,7 +420,7 @@ static inline int try_module_get(struct 
 	if (module) {
 		unsigned int cpu = get_cpu();
 		if (likely(module_is_live(module)))
-			local_inc(&module->ref[cpu].count);
+			local_inc(__module_ref_addr(module, cpu));
 		else
 			ret = 0;
 		put_cpu();
diff --git a/kernel/module.c b/kernel/module.c
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -573,13 +573,13 @@ static char last_unloaded_module[MODULE_
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
 {
-	unsigned int i;
+	int cpu;
 
 	INIT_LIST_HEAD(&mod->modules_which_use_me);
-	for (i = 0; i < NR_CPUS; i++)
-		local_set(&mod->ref[i].count, 0);
+	for_each_possible_cpu(cpu)
+		local_set(__module_ref_addr(mod, cpu), 0);
 	/* Hold reference count during initialization. */
-	local_set(&mod->ref[raw_smp_processor_id()].count, 1);
+	local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
 	/* Backwards compatibility macros put refcount during init. */
 	mod->waiter = current;
 }
@@ -717,10 +717,11 @@ static int try_stop_module(struct module
 
 unsigned int module_refcount(struct module *mod)
 {
-	unsigned int i, total = 0;
+	unsigned int total = 0;
+	int cpu;
 
-	for (i = 0; i < NR_CPUS; i++)
-		total += local_read(&mod->ref[i].count);
+	for_each_possible_cpu(cpu)
+		total += local_read(__module_ref_addr(mod, cpu));
 	return total;
 }
 EXPORT_SYMBOL(module_refcount);
@@ -894,7 +895,7 @@ void module_put(struct module *module)
 {
 	if (module) {
 		unsigned int cpu = get_cpu();
-		local_dec(&module->ref[cpu].count);
+		local_dec(__module_ref_addr(module, cpu));
 		/* Maybe they're waiting for us to drop reference? */
 		if (unlikely(!module_is_live(module)))
 			wake_up_process(module->waiter);
@@ -1464,7 +1465,10 @@ static void free_module(struct module *m
 	kfree(mod->args);
 	if (mod->percpu)
 		percpu_modfree(mod->percpu);
-
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	if (mod->refptr)
+		percpu_modfree(mod->refptr);
+#endif
 	/* Free lock-classes: */
 	lockdep_free_key_range(mod->module_core, mod->core_size);
 
@@ -2011,6 +2015,14 @@ static noinline struct module *load_modu
 	if (err < 0)
 		goto free_mod;
 
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
+				      mod->name);
+	if (!mod->refptr) {
+		err = -ENOMEM;
+		goto free_mod;
+	}
+#endif
 	if (pcpuindex) {
 		/* We have a special allocation for this section. */
 		percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
@@ -2018,7 +2030,7 @@ static noinline struct module *load_modu
 					 mod->name);
 		if (!percpu) {
 			err = -ENOMEM;
-			goto free_mod;
+			goto free_percpu;
 		}
 		sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
 		mod->percpu = percpu;
@@ -2282,6 +2294,9 @@ static noinline struct module *load_modu
  free_percpu:
 	if (percpu)
 		percpu_modfree(percpu);
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	percpu_modfree(mod->refptr);
+#endif
  free_mod:
 	kfree(args);
  free_hdr:

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2009-02-03  3:01 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-05-15 20:40 [PATCH] modules: Use a better scheme for refcounting Eric Dumazet
2008-05-15 21:57 ` Andi Kleen
2008-05-16  4:44   ` Eric Dumazet
2008-05-16  0:09 ` Rusty Russell
2008-05-16  5:29   ` Eric Dumazet
2008-05-16 13:41     ` Mike Travis
2008-05-17  5:33       ` Rusty Russell
2008-05-17  7:36         ` Eric Dumazet
2008-05-18 14:31           ` Rusty Russell
2008-05-19 16:42             ` Christoph Lameter
2008-05-19 16:41           ` Christoph Lameter
2008-05-19 18:04             ` Mike Travis
2008-05-19 16:39         ` Christoph Lameter
  -- strict thread matches above, loose matches on Subject: below --
2009-02-03  3:01 Rusty Russell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox