From: Manfred Spraul <manfred@colorfullife.com>
To: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Andrew Morton <akpm@osdl.org>, linux-mm@kvack.org
Subject: Re: slab fragmentation ?
Date: Sat, 09 Oct 2004 16:28:02 +0200 [thread overview]
Message-ID: <4167F572.8050900@colorfullife.com> (raw)
In-Reply-To: <1097074688.12861.182.camel@dyn318077bld.beaverton.ibm.com>
[-- Attachment #1: Type: text/plain, Size: 922 bytes --]
Badari Pulavarty wrote:
>Am I missing something fundamental here ?
>
>
>
No, you are right. The current implementation is just wrong(tm).
Attached is a patch - partially tested.
Description:
kmem_cache_alloc_node allocates memory from a particular node. The patch
fixes two problems with the current implementation:
- for !CONFIG_NUMA, kmem_cache_alloc_node is identical to kmalloc. The
patch implements kmem_cache_alloc_node as an alias to kmalloc for
!CONFIG_NUMA. Right now, the special node aware code runs even on
non-NUMA systems.
- checks the slab lists instead of allocating a new slab for every
allocation. This reduces the internal fragmentation.
Badri - could you test the patch?
Andrew, please do not merge the patch yet: it contains a severe bug: if
a node doesn't contain any memory, then it livelocks because the loop
never finds a suitable slab. I must think about that case.
--
Manfred
[-- Attachment #2: patch-nodealloc --]
[-- Type: text/plain, Size: 7579 bytes --]
// $Header$
// Kernel Version:
// VERSION = 2
// PATCHLEVEL = 6
// SUBLEVEL = 9
// EXTRAVERSION = -rc2-mm3
--- 2.6/include/linux/slab.h 2004-10-02 19:30:20.000000000 +0200
+++ build-2.6/include/linux/slab.h 2004-10-09 16:15:00.127026991 +0200
@@ -61,7 +61,14 @@
extern int kmem_cache_destroy(kmem_cache_t *);
extern int kmem_cache_shrink(kmem_cache_t *);
extern void *kmem_cache_alloc(kmem_cache_t *, int);
+#if CONFIG_NUMA
extern void *kmem_cache_alloc_node(kmem_cache_t *, int);
+#else
+static inline void *kmem_cache_alloc_node(kmem_cache_t *cachep, int node)
+{
+ return kmem_cache_alloc(cachep, GFP_KERNEL);
+}
+#endif
extern void kmem_cache_free(kmem_cache_t *, void *);
extern unsigned int kmem_cache_size(kmem_cache_t *);
--- 2.6/mm/slab.c 2004-10-02 19:30:21.000000000 +0200
+++ build-2.6/mm/slab.c 2004-10-09 16:14:30.779497578 +0200
@@ -327,6 +327,7 @@
unsigned long reaped;
unsigned long errors;
unsigned long max_freeable;
+ unsigned long node_allocs;
atomic_t allochit;
atomic_t allocmiss;
atomic_t freehit;
@@ -361,6 +362,7 @@
(x)->high_mark = (x)->num_active; \
} while (0)
#define STATS_INC_ERR(x) ((x)->errors++)
+#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
#define STATS_SET_FREEABLE(x, i) \
do { if ((x)->max_freeable < i) \
(x)->max_freeable = i; \
@@ -378,6 +380,7 @@
#define STATS_INC_REAPED(x) do { } while (0)
#define STATS_SET_HIGH(x) do { } while (0)
#define STATS_INC_ERR(x) do { } while (0)
+#define STATS_INC_NODEALLOCS(x) do { } while (0)
#define STATS_SET_FREEABLE(x, i) \
do { } while (0)
@@ -1747,7 +1750,7 @@
* Grow (by 1) the number of slabs within a cache. This is called by
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
-static int cache_grow (kmem_cache_t * cachep, int flags)
+static int cache_grow (kmem_cache_t * cachep, int flags, int nodeid)
{
struct slab *slabp;
void *objp;
@@ -1798,7 +1801,7 @@
/* Get mem for the objs. */
- if (!(objp = kmem_getpages(cachep, flags, -1)))
+ if (!(objp = kmem_getpages(cachep, flags, nodeid)))
goto failed;
/* Get slab management. */
@@ -2032,7 +2035,7 @@
if (unlikely(!ac->avail)) {
int x;
- x = cache_grow(cachep, flags);
+ x = cache_grow(cachep, flags, -1);
// cache_grow can reenable interrupts, then ac could change.
ac = ac_data(cachep);
@@ -2313,6 +2316,7 @@
return 0;
}
+#if CONFIG_NUMA
/**
* kmem_cache_alloc_node - Allocate an object on the specified node
* @cachep: The cache to allocate from.
@@ -2325,69 +2329,78 @@
*/
void *kmem_cache_alloc_node(kmem_cache_t *cachep, int nodeid)
{
- size_t offset;
void *objp;
- struct slab *slabp;
- kmem_bufctl_t next;
-
- /* The main algorithms are not node aware, thus we have to cheat:
- * We bypass all caches and allocate a new slab.
- * The following code is a streamlined copy of cache_grow().
- */
-
- /* Get colour for the slab, and update the next value. */
- spin_lock_irq(&cachep->spinlock);
- offset = cachep->colour_next;
- cachep->colour_next++;
- if (cachep->colour_next >= cachep->colour)
- cachep->colour_next = 0;
- offset *= cachep->colour_off;
- spin_unlock_irq(&cachep->spinlock);
-
- /* Get mem for the objs. */
- if (!(objp = kmem_getpages(cachep, GFP_KERNEL, nodeid)))
- goto failed;
- /* Get slab management. */
- if (!(slabp = alloc_slabmgmt(cachep, objp, offset, GFP_KERNEL)))
- goto opps1;
-
- set_slab_attr(cachep, slabp, objp);
- cache_init_objs(cachep, slabp, SLAB_CTOR_CONSTRUCTOR);
+ for (;;) {
+ struct slab *slabp;
+ struct list_head *q;
+ kmem_bufctl_t next;
- /* The first object is ours: */
- objp = slabp->s_mem + slabp->free*cachep->objsize;
- slabp->inuse++;
- next = slab_bufctl(slabp)[slabp->free];
-#if DEBUG
- slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
-#endif
- slabp->free = next;
+ objp = NULL;
+ check_irq_on();
+ spin_lock_irq(&cachep->spinlock);
+ /* walk through all partial and empty slab and find one
+ * from the right node */
+ list_for_each(q,&cachep->lists.slabs_partial) {
+ slabp = list_entry(q, struct slab, list);
+
+ if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid)
+ goto got_slabp;
+ }
+ list_for_each(q, &cachep->lists.slabs_free) {
+ slabp = list_entry(q, struct slab, list);
+
+ if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid) {
+got_slabp:
+ /* found one: allocate object */
+ check_slabp(cachep, slabp);
+ check_spinlock_acquired(cachep);
+
+ STATS_INC_ALLOCED(cachep);
+ STATS_INC_ACTIVE(cachep);
+ STATS_SET_HIGH(cachep);
+ STATS_INC_NODEALLOCS(cachep);
+
+ objp = slabp->s_mem + slabp->free*cachep->objsize;
+
+ slabp->inuse++;
+ next = slab_bufctl(slabp)[slabp->free];
+#if DEBUG
+ slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+#endif
+ slabp->free = next;
+ check_slabp(cachep, slabp);
+
+ /* move slabp to correct slabp list: */
+ list_del(&slabp->list);
+ if (slabp->free == BUFCTL_END)
+ list_add(&slabp->list, &cachep->lists.slabs_full);
+ else
+ list_add(&slabp->list, &cachep->lists.slabs_partial);
+
+ list3_data(cachep)->free_objects--;
+ spin_unlock_irq(&cachep->spinlock);
+ goto alloc_done;
+ }
+ }
+ spin_unlock_irq(&cachep->spinlock);
- /* add the remaining objects into the cache */
- spin_lock_irq(&cachep->spinlock);
- check_slabp(cachep, slabp);
- STATS_INC_GROWN(cachep);
- /* Make slab active. */
- if (slabp->free == BUFCTL_END) {
- list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_full));
- } else {
- list_add_tail(&slabp->list,
- &(list3_data(cachep)->slabs_partial));
- list3_data(cachep)->free_objects += cachep->num-1;
+ local_irq_disable();
+ if (!cache_grow(cachep, GFP_KERNEL, nodeid)) {
+ local_irq_enable();
+ return NULL;
+ }
+ local_irq_enable();
}
- spin_unlock_irq(&cachep->spinlock);
+alloc_done:
objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp,
__builtin_return_address(0));
return objp;
-opps1:
- kmem_freepages(cachep, objp);
-failed:
- return NULL;
-
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
+#endif
+
/**
* kmalloc - allocate memory
* @size: how many bytes of memory are required.
@@ -2813,15 +2826,16 @@
* without _too_ many complaints.
*/
#if STATS
- seq_puts(m, "slabinfo - version: 2.0 (statistics)\n");
+ seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
#else
- seq_puts(m, "slabinfo - version: 2.0\n");
+ seq_puts(m, "slabinfo - version: 2.1\n");
#endif
seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
seq_puts(m, " : tunables <batchcount> <limit> <sharedfactor>");
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#if STATS
- seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <freelimit>");
+ seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>"
+ " <error> <maxfreeable> <freelimit> <nodeallocs>");
seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
seq_putc(m, '\n');
@@ -2912,10 +2926,11 @@
unsigned long errors = cachep->errors;
unsigned long max_freeable = cachep->max_freeable;
unsigned long free_limit = cachep->free_limit;
+ unsigned long node_allocs = cachep->node_allocs;
- seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu",
+ seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu",
allocs, high, grown, reaped, errors,
- max_freeable, free_limit);
+ max_freeable, free_limit, node_allocs);
}
/* cpu stats */
{
prev parent reply other threads:[~2004-10-09 14:28 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2004-09-29 23:36 slab fragmentation ? Badari Pulavarty
2004-09-30 3:41 ` Andrew Morton
2004-09-30 4:52 ` badari
2004-09-30 14:49 ` Martin J. Bligh
2004-09-30 14:48 ` Badari Pulavarty
2004-10-03 6:04 ` Manfred Spraul
2004-10-04 15:51 ` Badari Pulavarty
2004-10-04 16:08 ` Manfred Spraul
2004-10-04 17:37 ` Badari Pulavarty
2004-10-05 14:46 ` Badari Pulavarty
2004-10-05 17:58 ` Manfred Spraul
2004-10-05 18:27 ` Badari Pulavarty
2004-10-05 18:49 ` Manfred Spraul
2004-10-05 18:47 ` Badari Pulavarty
2004-10-05 21:13 ` Badari Pulavarty
2004-10-05 22:11 ` Chen, Kenneth W
2004-10-05 22:18 ` Chen, Kenneth W
2004-10-06 14:58 ` Badari Pulavarty
2004-10-09 14:28 ` Manfred Spraul [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4167F572.8050900@colorfullife.com \
--to=manfred@colorfullife.com \
--cc=akpm@osdl.org \
--cc=linux-mm@kvack.org \
--cc=pbadari@us.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.