* [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
@ 2007-03-21 9:22 Eric Dumazet
2007-03-21 12:21 ` Pekka Enberg
` (3 more replies)
0 siblings, 4 replies; 26+ messages in thread
From: Eric Dumazet @ 2007-03-21 9:22 UTC (permalink / raw)
To: Andi Kleen, Andrew Morton, Christoph Lameter; +Cc: linux kernel
In order to avoid a cache miss in kmem_cache_free() on NUMA and reduce hot path length, we could exploit the following common facts.
1) MAX_NUMNODES <= 64
2) alignment of 'struct kmem_cache *' can be >= 64
The following patch changes the page->lru.next to contain not only the 'struct kmem_cache *' pointer, but also the nodeid in the low order bits.
This also reduces sizeof(struct slab) by 8 bytes on 64bits arches.
This reduces sizeof(struct slab) on all platforms (UP, or SMP)
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
diff --git a/mm/slab.c b/mm/slab.c
index abf46ae..d2f7299 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -210,6 +210,16 @@ #define BUFCTL_FREE (((kmem_bufctl_t)(~0
#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2)
#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
+#ifdef CONFIG_NUMA
+#if MAX_NUMNODES <= 64
+/* we can use low order bits of page->lru.next to store nodeids */
+# define KEEP_NODEID_IN_PAGE
+#else
+/* too many nodes, we need a field in 'struct slab' */
+# define KEEP_NODEID_IN_SLAB
+#endif
+#endif
+
/*
* struct slab
*
@@ -223,7 +233,9 @@ struct slab {
void *s_mem; /* including colour offset */
unsigned int inuse; /* num of objs active in slab */
kmem_bufctl_t free;
+#ifdef KEEP_NODEID_IN_SLAB
unsigned short nodeid;
+#endif
};
/*
@@ -585,9 +597,18 @@ static int slab_break_gfp_order = BREAK_
* allocator. These are used to find the slab an obj belongs to. With kfree(),
* these are used to find the cache which an obj belongs to.
*/
-static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
+static inline void page_set_cache_slab_nodeid(struct page *page,
+ struct kmem_cache *cache, struct slab *slab, int nodeid)
{
+ page->lru.prev = (struct list_head *)slab;
+#ifdef KEEP_NODEID_IN_PAGE
+ page->lru.next = (struct list_head *)((long)cache + nodeid);
+#else
page->lru.next = (struct list_head *)cache;
+#endif
+#ifdef KEEP_NODEID_IN_SLAB
+ slab->nodeid = nodeid;
+#endif
}
static inline struct kmem_cache *page_get_cache(struct page *page)
@@ -595,12 +616,11 @@ static inline struct kmem_cache *page_ge
if (unlikely(PageCompound(page)))
page = (struct page *)page_private(page);
BUG_ON(!PageSlab(page));
+#ifdef KEEP_NODEID_IN_PAGE
+ return (struct kmem_cache *)((long)page->lru.next & ~63);
+#else
return (struct kmem_cache *)page->lru.next;
-}
-
-static inline void page_set_slab(struct page *page, struct slab *slab)
-{
- page->lru.prev = (struct list_head *)slab;
+#endif
}
static inline struct slab *page_get_slab(struct page *page)
@@ -617,6 +637,18 @@ static inline struct kmem_cache *virt_to
return page_get_cache(page);
}
+#ifdef CONFIG_NUMA
+static inline int virt_to_nodeid(const void *obj)
+{
+ struct page *page = virt_to_page(obj);
+#ifdef KEEP_NODEID_IN_SLAB
+ return page_get_slab(page)->nodeid;
+#else
+ return (long)page->lru.next & 63;
+#endif
+}
+#endif
+
static inline struct slab *virt_to_slab(const void *obj)
{
struct page *page = virt_to_page(obj);
@@ -1134,8 +1166,7 @@ static void drain_alien_cache(struct kme
static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
- struct slab *slabp = virt_to_slab(objp);
- int nodeid = slabp->nodeid;
+ int nodeid = virt_to_nodeid(objp);
struct kmem_list3 *l3;
struct array_cache *alien = NULL;
int node;
@@ -1146,7 +1177,7 @@ static inline int cache_free_alien(struc
* Make sure we are not freeing a object from another node to the array
* cache on this cpu.
*/
- if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches))
+ if (likely(nodeid == node) || unlikely(!use_alien_caches))
return 0;
l3 = cachep->nodelists[node];
@@ -1437,8 +1468,14 @@ void __init kmem_cache_init(void)
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE];
+#ifdef KEEP_NODEID_IN_PAGE
+ /* kmem_cache addresses must be multiple of 64 */
+ cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
+ max(64, cache_line_size()));
+#else
cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
cache_line_size());
+#endif
cache_cache.reciprocal_buffer_size =
reciprocal_value(cache_cache.buffer_size);
@@ -2588,7 +2625,6 @@ static struct slab *alloc_slabmgmt(struc
slabp->inuse = 0;
slabp->colouroff = colour_off;
slabp->s_mem = objp + colour_off;
- slabp->nodeid = nodeid;
return slabp;
}
@@ -2699,7 +2735,7 @@ #endif
* virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
*/
static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
- void *addr)
+ void *addr, int nodeid)
{
int nr_pages;
struct page *page;
@@ -2711,8 +2747,7 @@ static void slab_map_pages(struct kmem_c
nr_pages <<= cache->gfporder;
do {
- page_set_cache(page, cache);
- page_set_slab(page, slab);
+ page_set_cache_slab_nodeid(page, cache, slab, nodeid);
page++;
} while (--nr_pages);
}
@@ -2787,8 +2822,7 @@ static int cache_grow(struct kmem_cache
if (!slabp)
goto opps1;
- slabp->nodeid = nodeid;
- slab_map_pages(cachep, slabp, objp);
+ slab_map_pages(cachep, slabp, objp, nodeid);
cache_init_objs(cachep, slabp, ctor_flags);
^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 9:22 [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab Eric Dumazet
@ 2007-03-21 12:21 ` Pekka Enberg
2007-03-21 14:42 ` Christoph Lameter
2007-03-21 12:27 ` [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab Pekka Enberg
` (2 subsequent siblings)
3 siblings, 1 reply; 26+ messages in thread
From: Pekka Enberg @ 2007-03-21 12:21 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Andi Kleen, Andrew Morton, Christoph Lameter, linux kernel
On 3/21/07, Eric Dumazet <dada1@cosmosbay.com> wrote:
> In order to avoid a cache miss in kmem_cache_free() on NUMA and reduce hot path
> length, we could exploit the following common facts.
It would be nice if you could cc me for slab patches.
On 3/21/07, Eric Dumazet <dada1@cosmosbay.com> wrote:
> -static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
> +static inline void page_set_cache_slab_nodeid(struct page *page,
> + struct kmem_cache *cache, struct slab *slab, int nodeid)
> {
> + page->lru.prev = (struct list_head *)slab;
> +#ifdef KEEP_NODEID_IN_PAGE
> + page->lru.next = (struct list_head *)((long)cache + nodeid);
> +#else
> page->lru.next = (struct list_head *)cache;
> +#endif
> +#ifdef KEEP_NODEID_IN_SLAB
> + slab->nodeid = nodeid;
> +#endif
Can we please have a slab_get_nid() and slab_set_nid() instead which
reduces the need for #ifdefs?
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 9:22 [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab Eric Dumazet
2007-03-21 12:21 ` Pekka Enberg
@ 2007-03-21 12:27 ` Pekka Enberg
2007-03-21 14:41 ` Christoph Lameter
2007-03-21 16:08 ` Andi Kleen
3 siblings, 0 replies; 26+ messages in thread
From: Pekka Enberg @ 2007-03-21 12:27 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Andi Kleen, Andrew Morton, Christoph Lameter, linux kernel
On 3/21/07, Eric Dumazet <dada1@cosmosbay.com> wrote:
> +#ifdef KEEP_NODEID_IN_PAGE
> + /* kmem_cache addresses must be multiple of 64 */
> + cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
> + max(64, cache_line_size()));
> +#else
> cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
> cache_line_size());
> +#endif
Please introduce cache_cache_align() to hide the #ifdeffing.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 9:22 [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab Eric Dumazet
2007-03-21 12:21 ` Pekka Enberg
2007-03-21 12:27 ` [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab Pekka Enberg
@ 2007-03-21 14:41 ` Christoph Lameter
2007-03-21 15:36 ` Eric Dumazet
2007-03-21 16:08 ` Andi Kleen
3 siblings, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2007-03-21 14:41 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Andi Kleen, Andrew Morton, linux kernel
On Wed, 21 Mar 2007, Eric Dumazet wrote:
> In order to avoid a cache miss in kmem_cache_free() on NUMA and reduce hot path length, we could exploit the following common facts.
>
> 1) MAX_NUMNODES <= 64
MAX_NUMNODES == 1024 on IA64 and may be on x86_64 in the future.
> 2) alignment of 'struct kmem_cache *' can be >= 64
>
> The following patch changes the page->lru.next to contain not only the
> 'struct kmem_cache *' pointer, but also the nodeid in the low order
> bits.
Nack.
I would suggest you audit slab and make sure that we never fall back and
always enter a slab on the node of page_to_nid(page). If
all allocations use GFP_THISNODE then you should be safe and then we can
remove the nodeid from the slab structure and simply use the available
node information in the page struct.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 12:21 ` Pekka Enberg
@ 2007-03-21 14:42 ` Christoph Lameter
2007-03-21 15:03 ` Pekka Enberg
2007-03-21 15:42 ` Pekka J Enberg
0 siblings, 2 replies; 26+ messages in thread
From: Christoph Lameter @ 2007-03-21 14:42 UTC (permalink / raw)
To: Pekka Enberg; +Cc: Eric Dumazet, Andi Kleen, Andrew Morton, linux kernel
On Wed, 21 Mar 2007, Pekka Enberg wrote:
> Can we please have a slab_get_nid() and slab_set_nid() instead which
> reduces the need for #ifdefs?
None of that please. The page flags already contain a node number that is
accessible via page_to_nid(). Just make sure that we never put a slab onto
the wrong node.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 14:42 ` Christoph Lameter
@ 2007-03-21 15:03 ` Pekka Enberg
2007-03-21 15:43 ` Christoph Lameter
2007-03-21 15:42 ` Pekka J Enberg
1 sibling, 1 reply; 26+ messages in thread
From: Pekka Enberg @ 2007-03-21 15:03 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Eric Dumazet, Andi Kleen, Andrew Morton, linux kernel
On 3/21/07, Christoph Lameter <christoph@lameter.com> wrote:
> None of that please. The page flags already contain a node number that is
> accessible via page_to_nid(). Just make sure that we never put a slab onto
> the wrong node.
Sounds good. Do you know for a fact that we don't or are you just
afraid we have a bug?
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 14:41 ` Christoph Lameter
@ 2007-03-21 15:36 ` Eric Dumazet
2007-03-21 15:44 ` Christoph Lameter
2007-03-21 15:47 ` Pekka Enberg
0 siblings, 2 replies; 26+ messages in thread
From: Eric Dumazet @ 2007-03-21 15:36 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Andi Kleen, Andrew Morton, linux kernel
On Wed, 21 Mar 2007 07:41:46 -0700 (PDT)
Christoph Lameter <christoph@lameter.com> wrote:
> On Wed, 21 Mar 2007, Eric Dumazet wrote:
>
> > In order to avoid a cache miss in kmem_cache_free() on NUMA and reduce hot path length, we could exploit the following common facts.
> >
> > 1) MAX_NUMNODES <= 64
>
> MAX_NUMNODES == 1024 on IA64 and may be on x86_64 in the future.
So what ? I am very happy for you Christoph.
Average linux machines (ie more than 99.99 % on this planet) are using MAX_NUMNODES <= 64
If you read my patch, your lovely 1024 nodes machines will run as before : No change at all for big machines.
>
> > 2) alignment of 'struct kmem_cache *' can be >= 64
> >
> > The following patch changes the page->lru.next to contain not only the
> > 'struct kmem_cache *' pointer, but also the nodeid in the low order
> > bits.
>
> Nack.
>
> I would suggest you audit slab and make sure that we never fall back and
> always enter a slab on the node of page_to_nid(page). If
> all allocations use GFP_THISNODE then you should be safe and then we can
> remove the nodeid from the slab structure and simply use the available
> node information in the page struct.
>
Last time I checked 'struct page', they was no nodeid in it.
I cooked this patch because you told me it was possible to have a 'slab nodeid' different than page_to_nid(obj). Now you want me to prove the contrary. Me confused.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 14:42 ` Christoph Lameter
2007-03-21 15:03 ` Pekka Enberg
@ 2007-03-21 15:42 ` Pekka J Enberg
2007-03-21 15:49 ` Eric Dumazet
2007-03-21 16:09 ` Andi Kleen
1 sibling, 2 replies; 26+ messages in thread
From: Pekka J Enberg @ 2007-03-21 15:42 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Eric Dumazet, Andi Kleen, Andrew Morton, linux kernel
On Wed, 21 Mar 2007, Christoph Lameter wrote:
> None of that please. The page flags already contain a node number that is
> accessible via page_to_nid(). Just make sure that we never put a slab onto
> the wrong node.
Oh well, here's a patch to find out. (I don't have a NUMA box.)
Pekka
From: Pekka Enberg <penberg@cs.helsinki.fi>
This adds a virt_to_nodeid() for looking up the NUMA node ID for a
slab object. The current implementation uses slab->nodeid but adds a
WARN_ON to catch cases where page nid differs. Eventually, we can
change virt_to_nodeid() to use page_to_nid after which slab->nodeid
can be removed.
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
mm/slab.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
Index: 2.6/mm/slab.c
===================================================================
--- 2.6.orig/mm/slab.c 2007-03-21 16:58:53.000000000 +0200
+++ 2.6/mm/slab.c 2007-03-21 16:59:26.000000000 +0200
@@ -623,6 +623,14 @@ static inline struct slab *virt_to_slab(
return page_get_slab(page);
}
+static inline unsigned short virt_to_nodeid(const void *obj)
+{
+ struct page *page = virt_to_page(obj);
+ struct slab *slab = page_get_slab(page);
+ WARN_ON(slab->nodeid != page_to_nid(page));
+ return slab->nodeid;
+}
+
static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
unsigned int idx)
{
@@ -1134,8 +1142,7 @@ int i = 0;
static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
- struct slab *slabp = virt_to_slab(objp);
- int nodeid = slabp->nodeid;
+ int nodeid = virt_to_nodeid(objp);
struct kmem_list3 *l3;
struct array_cache *alien = NULL;
int node;
@@ -1146,7 +1153,7 @@ static inline int cache_free_alien(struc
* Make sure we are not freeing a object from another node to the array
* cache on this cpu.
*/
- if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches))
+ if (likely(nodeid == node) || unlikely(!use_alien_caches))
return 0;
l3 = cachep->nodelists[node];
@@ -2665,7 +2672,7 @@ static void *slab_get_obj(struct kmem_ca
next = slab_bufctl(slabp)[slabp->free];
#if DEBUG
slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
- WARN_ON(slabp->nodeid != nodeid);
+ WARN_ON(virt_to_nodeid(objp) != nodeid);
#endif
slabp->free = next;
@@ -2679,7 +2686,7 @@ static void slab_put_obj(struct kmem_cac
#if DEBUG
/* Verify that the slab belongs to the intended node */
- WARN_ON(slabp->nodeid != nodeid);
+ WARN_ON(virt_to_nodeid(objp) != nodeid);
if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
printk(KERN_ERR "slab: double free detected in cache "
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 15:03 ` Pekka Enberg
@ 2007-03-21 15:43 ` Christoph Lameter
0 siblings, 0 replies; 26+ messages in thread
From: Christoph Lameter @ 2007-03-21 15:43 UTC (permalink / raw)
To: Pekka Enberg; +Cc: Eric Dumazet, Andi Kleen, Andrew Morton, linux kernel
On Wed, 21 Mar 2007, Pekka Enberg wrote:
> On 3/21/07, Christoph Lameter <christoph@lameter.com> wrote:
> > None of that please. The page flags already contain a node number that is
> > accessible via page_to_nid(). Just make sure that we never put a slab onto
> > the wrong node.
>
> Sounds good. Do you know for a fact that we don't or are you just
> afraid we have a bug?
I tried to clean it up during the GFP_THISNODE work but there was some
issue on powerpc where we needed fallback for bootstrap. There may be more
trouble there. Checking the allocs where we do not use GFP_THISNODE should
get you there.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 15:36 ` Eric Dumazet
@ 2007-03-21 15:44 ` Christoph Lameter
2007-03-21 15:58 ` Eric Dumazet
2007-03-21 15:47 ` Pekka Enberg
1 sibling, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2007-03-21 15:44 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Andi Kleen, Andrew Morton, linux kernel
On Wed, 21 Mar 2007, Eric Dumazet wrote:
> On Wed, 21 Mar 2007 07:41:46 -0700 (PDT)
> Christoph Lameter <christoph@lameter.com> wrote:
>
> > On Wed, 21 Mar 2007, Eric Dumazet wrote:
> >
> > > In order to avoid a cache miss in kmem_cache_free() on NUMA and reduce hot path length, we could exploit the following common facts.
> > >
> > > 1) MAX_NUMNODES <= 64
> >
> > MAX_NUMNODES == 1024 on IA64 and may be on x86_64 in the future.
>
> So what ? I am very happy for you Christoph.
You wanted to exploit that MAX_NUMNODES <= 64?
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 15:36 ` Eric Dumazet
2007-03-21 15:44 ` Christoph Lameter
@ 2007-03-21 15:47 ` Pekka Enberg
2007-03-21 15:52 ` Eric Dumazet
1 sibling, 1 reply; 26+ messages in thread
From: Pekka Enberg @ 2007-03-21 15:47 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Christoph Lameter, Andi Kleen, Andrew Morton, linux kernel
On 3/21/07, Eric Dumazet <dada1@cosmosbay.com> wrote:
> Last time I checked 'struct page', they was no nodeid in it.
Hmm, page_to_nid() in include/linx/mm.h doesn't agree with you:
#ifdef NODE_NOT_IN_PAGE_FLAGS
extern int page_to_nid(struct page *page);
#else
static inline int page_to_nid(struct page *page)
{
return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
}
#endif
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 15:42 ` Pekka J Enberg
@ 2007-03-21 15:49 ` Eric Dumazet
2007-03-21 15:55 ` Pekka J Enberg
2007-03-21 16:09 ` Andi Kleen
1 sibling, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2007-03-21 15:49 UTC (permalink / raw)
To: Pekka J Enberg; +Cc: Christoph Lameter, Andi Kleen, Andrew Morton, linux kernel
On Wed, 21 Mar 2007 17:42:22 +0200 (EET)
Pekka J Enberg <penberg@cs.helsinki.fi> wrote:
> On Wed, 21 Mar 2007, Christoph Lameter wrote:
> > None of that please. The page flags already contain a node number that is
> > accessible via page_to_nid(). Just make sure that we never put a slab onto
> > the wrong node.
>
> Oh well, here's a patch to find out. (I don't have a NUMA box.)
>
This wont work, because of use_alien_caches knob.
If use_alien_caches is set to 0, your WARN_ON will trigger many false alarms
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 15:47 ` Pekka Enberg
@ 2007-03-21 15:52 ` Eric Dumazet
0 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2007-03-21 15:52 UTC (permalink / raw)
To: Pekka Enberg; +Cc: Christoph Lameter, Andi Kleen, Andrew Morton, linux kernel
On Wed, 21 Mar 2007 17:47:13 +0200
"Pekka Enberg" <penberg@cs.helsinki.fi> wrote:
> On 3/21/07, Eric Dumazet <dada1@cosmosbay.com> wrote:
> > Last time I checked 'struct page', they was no nodeid in it.
>
> Hmm, page_to_nid() in include/linx/mm.h doesn't agree with you:
>
> #ifdef NODE_NOT_IN_PAGE_FLAGS
> extern int page_to_nid(struct page *page);
> #else
> static inline int page_to_nid(struct page *page)
> {
> return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
> }
> #endif
>
Correct, but still another ifdef NODE_NOT_IN_PAGE_FLAGS
Anyway the correct path is to check GFP_THISNODE stuff, because we can
even avoid access to 'struct page' to get the nodeid, at least on x86_64 NUMA,
where struct memnode can provide the information with less cache footprint.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 15:49 ` Eric Dumazet
@ 2007-03-21 15:55 ` Pekka J Enberg
0 siblings, 0 replies; 26+ messages in thread
From: Pekka J Enberg @ 2007-03-21 15:55 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Christoph Lameter, Andi Kleen, Andrew Morton, linux kernel
On Wed, 21 Mar 2007, Eric Dumazet wrote:
> This wont work, because of use_alien_caches knob.
>
> If use_alien_caches is set to 0, your WARN_ON will trigger many false
> alarms
How is that? The mapping should always be the same regardless of whether
we use alien caches or not.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 15:44 ` Christoph Lameter
@ 2007-03-21 15:58 ` Eric Dumazet
0 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2007-03-21 15:58 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Andi Kleen, Andrew Morton, linux kernel
On Wed, 21 Mar 2007 08:44:53 -0700 (PDT)
Christoph Lameter <christoph@lameter.com> wrote:
> You wanted to exploit that MAX_NUMNODES <= 64?
Maybe my english is not perfect. I promise I improve it eventually.
My patch was for unlucky guys that have MAX_NUMNODES <= 64.
For others, the preprocessor gave the old code.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 9:22 [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab Eric Dumazet
` (2 preceding siblings ...)
2007-03-21 14:41 ` Christoph Lameter
@ 2007-03-21 16:08 ` Andi Kleen
3 siblings, 0 replies; 26+ messages in thread
From: Andi Kleen @ 2007-03-21 16:08 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Andi Kleen, Andrew Morton, Christoph Lameter, linux kernel
On Wed, Mar 21, 2007 at 10:22:31AM +0100, Eric Dumazet wrote:
> In order to avoid a cache miss in kmem_cache_free() on NUMA and reduce hot path length, we could exploit the following common facts.
>
> 1) MAX_NUMNODES <= 64
>
> 2) alignment of 'struct kmem_cache *' can be >= 64
>
> The following patch changes the page->lru.next to contain not only the 'struct kmem_cache *' pointer, but also the nodeid in the low order bits.
>
It might be cleaner to just put it into flags on 64bit. There is enough
space there and we don't really care about 32bit NUMA machines.
-Andi
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 15:42 ` Pekka J Enberg
2007-03-21 15:49 ` Eric Dumazet
@ 2007-03-21 16:09 ` Andi Kleen
2007-03-21 16:19 ` Pekka Enberg
` (2 more replies)
1 sibling, 3 replies; 26+ messages in thread
From: Andi Kleen @ 2007-03-21 16:09 UTC (permalink / raw)
To: Pekka J Enberg
Cc: Christoph Lameter, Eric Dumazet, Andi Kleen, Andrew Morton,
linux kernel
On Wed, Mar 21, 2007 at 05:42:22PM +0200, Pekka J Enberg wrote:
> On Wed, 21 Mar 2007, Christoph Lameter wrote:
> > None of that please. The page flags already contain a node number that is
> > accessible via page_to_nid(). Just make sure that we never put a slab onto
> > the wrong node.
>
> Oh well, here's a patch to find out. (I don't have a NUMA box.)
You can always use numa emulation on x86-64. Boot with numa=fake=NODES
-Andi
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 16:09 ` Andi Kleen
@ 2007-03-21 16:19 ` Pekka Enberg
2007-03-21 16:27 ` Christoph Lameter
2007-03-23 11:15 ` [RFC] NUMA : could we introduce virt_to_nid() ? Eric Dumazet
2 siblings, 0 replies; 26+ messages in thread
From: Pekka Enberg @ 2007-03-21 16:19 UTC (permalink / raw)
To: andi
Cc: Christoph Lameter, Eric Dumazet, Andi Kleen, Andrew Morton,
linux kernel
On 3/21/2007, "Andi Kleen" <andi@firstfloor.org> wrote:
> You can always use numa emulation on x86-64. Boot with
> numa=fake=NODES
Yes, I know, but I don't have a x86-64 box either =)
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab
2007-03-21 16:09 ` Andi Kleen
2007-03-21 16:19 ` Pekka Enberg
@ 2007-03-21 16:27 ` Christoph Lameter
2007-03-23 11:15 ` [RFC] NUMA : could we introduce virt_to_nid() ? Eric Dumazet
2 siblings, 0 replies; 26+ messages in thread
From: Christoph Lameter @ 2007-03-21 16:27 UTC (permalink / raw)
To: Andi Kleen; +Cc: Pekka J Enberg, Eric Dumazet, Andrew Morton, linux kernel
On Wed, 21 Mar 2007, Andi Kleen wrote:
> On Wed, Mar 21, 2007 at 05:42:22PM +0200, Pekka J Enberg wrote:
> > On Wed, 21 Mar 2007, Christoph Lameter wrote:
> > > None of that please. The page flags already contain a node number that is
> > > accessible via page_to_nid(). Just make sure that we never put a slab onto
> > > the wrong node.
> >
> > Oh well, here's a patch to find out. (I don't have a NUMA box.)
>
> You can always use numa emulation on x86-64. Boot with numa=fake=NODES
This wont help you to debug strange fallback scenarios on unusual NUMA
platforms.
^ permalink raw reply [flat|nested] 26+ messages in thread
* [RFC] NUMA : could we introduce virt_to_nid() ?
2007-03-21 16:09 ` Andi Kleen
2007-03-21 16:19 ` Pekka Enberg
2007-03-21 16:27 ` Christoph Lameter
@ 2007-03-23 11:15 ` Eric Dumazet
2007-03-23 12:48 ` Pekka Enberg
` (2 more replies)
2 siblings, 3 replies; 26+ messages in thread
From: Eric Dumazet @ 2007-03-23 11:15 UTC (permalink / raw)
To: Andi Kleen; +Cc: Pekka J Enberg, Christoph Lameter, Andrew Morton, linux kernel
Hi Andi
Checking Christoph quicklist implementation, I found the same cache miss in free() than SLAB has.
/* common implementation *
int virt_to_nid(const void *addr)
{
struct page *page = virt_to_page(addr);
return page_to_nid(page);
}
On some platforms (x86_64 for example), could we have a better implementation, not accessing struct page, but using phys_to_nid() ?
Thank you
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC] NUMA : could we introduce virt_to_nid() ?
2007-03-23 11:15 ` [RFC] NUMA : could we introduce virt_to_nid() ? Eric Dumazet
@ 2007-03-23 12:48 ` Pekka Enberg
2007-03-23 13:53 ` Eric Dumazet
2007-03-23 14:16 ` Andi Kleen
2007-03-23 14:50 ` Christoph Lameter
2 siblings, 1 reply; 26+ messages in thread
From: Pekka Enberg @ 2007-03-23 12:48 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Andi Kleen, Christoph Lameter, Andrew Morton, linux kernel
On 3/23/07, Eric Dumazet <dada1@cosmosbay.com> wrote:
> Checking Christoph quicklist implementation, I found the same cache miss in
> free() than SLAB has.
>
> /* common implementation *
> int virt_to_nid(const void *addr)
> {
> struct page *page = virt_to_page(addr);
> return page_to_nid(page);
> }
>
> On some platforms (x86_64 for example), could we have a better implementation,
> not accessing struct page, but using phys_to_nid() ?
Sounds good to me. At least cache_free_alien() in mm/slab.c to should
be converted to use it.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC] NUMA : could we introduce virt_to_nid() ?
2007-03-23 12:48 ` Pekka Enberg
@ 2007-03-23 13:53 ` Eric Dumazet
0 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2007-03-23 13:53 UTC (permalink / raw)
To: Pekka Enberg; +Cc: Andi Kleen, Christoph Lameter, Andrew Morton, linux kernel
On Fri, 23 Mar 2007 14:48:24 +0200
"Pekka Enberg" <penberg@cs.helsinki.fi> wrote:
> On 3/23/07, Eric Dumazet <dada1@cosmosbay.com> wrote:
> > Checking Christoph quicklist implementation, I found the same cache miss in
> > free() than SLAB has.
> >
> > /* common implementation *
> > int virt_to_nid(const void *addr)
> > {
> > struct page *page = virt_to_page(addr);
> > return page_to_nid(page);
> > }
> >
> > On some platforms (x86_64 for example), could we have a better implementation,
> > not accessing struct page, but using phys_to_nid() ?
>
> Sounds good to me. At least cache_free_alien() in mm/slab.c to should
> be converted to use it.
>
Not yet :(
Because in slab we currently need virt_to_slab(addr)->nodeid, not page_to_nid(virt_to_page(addr))
It might be different, according to Christoph
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC] NUMA : could we introduce virt_to_nid() ?
2007-03-23 11:15 ` [RFC] NUMA : could we introduce virt_to_nid() ? Eric Dumazet
2007-03-23 12:48 ` Pekka Enberg
@ 2007-03-23 14:16 ` Andi Kleen
2007-03-23 14:50 ` Christoph Lameter
2 siblings, 0 replies; 26+ messages in thread
From: Andi Kleen @ 2007-03-23 14:16 UTC (permalink / raw)
To: Eric Dumazet
Cc: Andi Kleen, Pekka J Enberg, Christoph Lameter, Andrew Morton,
linux kernel
On Fri, Mar 23, 2007 at 12:15:12PM +0100, Eric Dumazet wrote:
> Hi Andi
>
> Checking Christoph quicklist implementation, I found the same cache miss in free() than SLAB has.
>
> /* common implementation *
> int virt_to_nid(const void *addr)
> {
> struct page *page = virt_to_page(addr);
> return page_to_nid(page);
> }
>
> On some platforms (x86_64 for example), could we have a better implementation, not accessing struct page, but using phys_to_nid() ?
Sure why not. Please send a patch.
-Andi
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC] NUMA : could we introduce virt_to_nid() ?
2007-03-23 11:15 ` [RFC] NUMA : could we introduce virt_to_nid() ? Eric Dumazet
2007-03-23 12:48 ` Pekka Enberg
2007-03-23 14:16 ` Andi Kleen
@ 2007-03-23 14:50 ` Christoph Lameter
2007-03-23 16:22 ` Eric Dumazet
2 siblings, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2007-03-23 14:50 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Andi Kleen, Pekka J Enberg, Andrew Morton, linux kernel
On Fri, 23 Mar 2007, Eric Dumazet wrote:
> Checking Christoph quicklist implementation, I found the same cache miss in free() than SLAB has.
>
> /* common implementation *
> int virt_to_nid(const void *addr)
> {
> struct page *page = virt_to_page(addr);
> return page_to_nid(page);
> }
>
> On some platforms (x86_64 for example), could we have a better
> implementation, not accessing struct page, but using phys_to_nid() ?
This is going to pollute the caches since we then use multiple ways to
determine the node of a page. Its better to stay with the same approach
for all pages. The page struct is used for many other purposes as well.
Its likely to be in the cpu cache.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC] NUMA : could we introduce virt_to_nid() ?
2007-03-23 14:50 ` Christoph Lameter
@ 2007-03-23 16:22 ` Eric Dumazet
2007-03-23 17:19 ` Christoph Lameter
0 siblings, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2007-03-23 16:22 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Andi Kleen, Pekka J Enberg, Andrew Morton, linux kernel
On Fri, 23 Mar 2007 07:50:28 -0700 (PDT)
Christoph Lameter <christoph@lameter.com> wrote:
> On Fri, 23 Mar 2007, Eric Dumazet wrote:
>
> > Checking Christoph quicklist implementation, I found the same cache miss in free() than SLAB has.
> >
> > /* common implementation *
> > int virt_to_nid(const void *addr)
> > {
> > struct page *page = virt_to_page(addr);
> > return page_to_nid(page);
> > }
> >
> > On some platforms (x86_64 for example), could we have a better
> > implementation, not accessing struct page, but using phys_to_nid() ?
>
> This is going to pollute the caches since we then use multiple ways to
> determine the node of a page. Its better to stay with the same approach
> for all pages. The page struct is used for many other purposes as well.
> Its likely to be in the cpu cache.
>
Sorry ? page structs are not in cpu cache at all.
phys_to_nid() on my 16 GB x86_64 machine uses one single cache line, shared by all pages.
This single cache line is cache hot yes, not "struct page"s ...
And on this machine, thats about 224 Mbytes of 'struct pages'
You carefully commented your alloc() function saying it is touching two cache lines.
But you omited to say that free() function needs 3 cache lines if CONFIG_NUMA
For SLAB use, page struct is needed because we use lru.{next|prev} to store slab/cachep pointers, but for a pure page allocator, unless I misread your patch, we dont need it, if virt_to_nid() can do its job without it.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC] NUMA : could we introduce virt_to_nid() ?
2007-03-23 16:22 ` Eric Dumazet
@ 2007-03-23 17:19 ` Christoph Lameter
0 siblings, 0 replies; 26+ messages in thread
From: Christoph Lameter @ 2007-03-23 17:19 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Andi Kleen, Pekka J Enberg, Andrew Morton, linux kernel
On Fri, 23 Mar 2007, Eric Dumazet wrote:
> Sorry ? page structs are not in cpu cache at all.
They are if they are in any way handled by the VM.
> You carefully commented your alloc() function saying it is touching two cache lines.
> But you omited to say that free() function needs 3 cache lines if CONFIG_NUMA
Yes it needs the third cacheline to check the node of the page in the NUMA
case.
> For SLAB use, page struct is needed because we use lru.{next|prev} to
> store slab/cachep pointers, but for a pure page allocator, unless I
> misread your patch, we dont need it, if virt_to_nid() can do its job
> without it.
The page allocator is only handling page_structs.
^ permalink raw reply [flat|nested] 26+ messages in thread
end of thread, other threads:[~2007-03-23 17:19 UTC | newest]
Thread overview: 26+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-03-21 9:22 [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab Eric Dumazet
2007-03-21 12:21 ` Pekka Enberg
2007-03-21 14:42 ` Christoph Lameter
2007-03-21 15:03 ` Pekka Enberg
2007-03-21 15:43 ` Christoph Lameter
2007-03-21 15:42 ` Pekka J Enberg
2007-03-21 15:49 ` Eric Dumazet
2007-03-21 15:55 ` Pekka J Enberg
2007-03-21 16:09 ` Andi Kleen
2007-03-21 16:19 ` Pekka Enberg
2007-03-21 16:27 ` Christoph Lameter
2007-03-23 11:15 ` [RFC] NUMA : could we introduce virt_to_nid() ? Eric Dumazet
2007-03-23 12:48 ` Pekka Enberg
2007-03-23 13:53 ` Eric Dumazet
2007-03-23 14:16 ` Andi Kleen
2007-03-23 14:50 ` Christoph Lameter
2007-03-23 16:22 ` Eric Dumazet
2007-03-23 17:19 ` Christoph Lameter
2007-03-21 12:27 ` [RFC, PATCH] SLAB : [NUMA] keep nodeid in struct page instead of struct slab Pekka Enberg
2007-03-21 14:41 ` Christoph Lameter
2007-03-21 15:36 ` Eric Dumazet
2007-03-21 15:44 ` Christoph Lameter
2007-03-21 15:58 ` Eric Dumazet
2007-03-21 15:47 ` Pekka Enberg
2007-03-21 15:52 ` Eric Dumazet
2007-03-21 16:08 ` Andi Kleen
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox