[PATCH] numa slab, rediffed against 2.5.46

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Manfred Spraul <manfred@colorfullife.com>
To: linux-kernel@vger.kernel.org, lse-tech@lists.sourceforge.net
Subject: [PATCH] numa slab, rediffed against 2.5.46
Date: Sat, 09 Nov 2002 18:51:44 +0100	[thread overview]
Message-ID: <3DCD4B30.2090204@colorfullife.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 1051 bytes --]

Attached is my numa aware slab allocator, rediffed against 2.5.46.
It makes the objects that are returned from kmem_cache_alloc strictly 
node local. Unfortunately this means that kmem_cache_free must return 
objects to the home node, which is expensive. (The return is batched, 
but it's still expensive.)

I'm not sure that the patch will improve the performance - benchmarks 
are now needed.

TODO:
- implement ptr_to_nodeid() for all archs. The current implementation is 
a dummy, to test the code on non-NUMA systems.
- switch from MAX_NUMNODES to numnodes - Anton proposed that.
- improve the handling of nodes without cpus or without memory.
- add a kmem_cache_alloc_fromnode() function
- replace the kmem_list3 array with an array of pointers, and allocate 
the storage from the right node.
- allocate the head arrays from the node that is local to the cpu that 
accesses the head array.
- check for regressions - I was careful not to undo any cleanups that 
happened between 2.5.42 and 46, but it's possible that I missed some.

--
    Manfred

[-- Attachment #2: patch-slab-numa --]
[-- Type: text/plain, Size: 31984 bytes --]

--- 2.5/mm/slab.c	2002-11-09 00:45:37.000000000 +0100
+++ build-2.5/mm/slab.c	2002-11-09 15:25:05.000000000 +0100
@@ -10,6 +10,8 @@
  *
  * Cleanup, make the head arrays unconditional, preparation for NUMA
  * 	(c) 2002 Manfred Spraul
+ * Initial NUMA implementation
+ *      (c) 2002 Manfred Spraul
  *
  * An implementation of the Slab Allocator as described in outline in;
  *	UNIX Internals: The New Frontiers by Uresh Vahalia
@@ -85,6 +87,29 @@
 #include	<asm/uaccess.h>
 
 /*
+ * Enable the NUMA mode for slab
+ * This is a separate define from CONFIG_DISCONTIGMEM, because it only
+ * applies if ZONE_NORMAL allocations are possible on all zones.
+ * TODO:
+ * - move ptr_to_nodeid into include/asm-
+ * - make the cache structures themselves node local
+ * - is it possible to use the cpu alloc interface?
+ * - the behaviour is bad if get_free_pages returns
+ *   memory from another node:
+ *   The page is used just for one refill, then left on the
+ *   other node's partial list.
+ *   Is that acceptable?
+ * - determine the optimal placement for the cache spinlock:
+ *   node local or global?
+ * - which additional statistic counters would be interesting?
+ * - disable object return for the hopeless caches [journal head,
+ *   buffer head, dentry - we'll trash cachelines anyway]
+ */
+#define CONFIG_SLAB_NUMA
+#undef MAX_NUMNODES
+#define MAX_NUMNODES 4
+ 
+/*
  * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
  *		  SLAB_RED_ZONE & SLAB_POISON.
  *		  0 for faster, smaller code (especially in the critical paths).
@@ -174,6 +199,10 @@
  *
  * The limit is stored in the per-cpu structure to reduce the data cache
  * footprint.
+ * On NUMA systems, 2 per-cpu structures exist: one for the current
+ * node, one for wrong node free calls.
+ * Memory from the wrong node is never returned by alloc, it's returned
+ * to the home node as soon as the cpu cache is filled
  *
  */
 struct array_cache {
@@ -183,8 +212,17 @@
 	unsigned int touched;
 };
 
+struct cpucache_wrapper {
+	struct array_cache *native;
+#ifdef CONFIG_SLAB_NUMA
+	struct array_cache *alien;
+#endif
+};
 /* bootstrap: The caches do not work without cpuarrays anymore,
  * but the cpuarrays are allocated from the generic caches...
+ *
+ * sizeof(struct arraycache_init) must be <= the size of the first
+ * 	kmalloc general cache, otherwise the bootstrap will crash.
  */
 #define BOOT_CPUCACHE_ENTRIES	1
 struct arraycache_init {
@@ -206,20 +244,31 @@
 	unsigned long	free_objects;
 	int		free_touched;
 	unsigned long	next_reap;
+#if STATS
+	unsigned long		num_allocations;
+
+	unsigned long		grown;
+	unsigned long		high_mark;
+	unsigned long		num_active;
+#endif
 };
 
-#define LIST3_INIT(parent) \
-	{ \
-		.slabs_full	= LIST_HEAD_INIT(parent.slabs_full), \
-		.slabs_partial	= LIST_HEAD_INIT(parent.slabs_partial), \
-		.slabs_free	= LIST_HEAD_INIT(parent.slabs_free) \
-	}
-#define list3_data(cachep) \
-	(&(cachep)->lists)
+#if STATS
+#define	STATS_INC_GROWN(x)	((x)->grown++)
+#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
+#define	STATS_INC_ACTIVE(x)	do { (x)->num_active++; \
+				     if ((x)->num_active > (x)->high_mark) \
+					(x)->high_mark = (x)->num_active; \
+				} while (0)
+#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
+#else
+#define	STATS_INC_GROWN(x)	do { } while (0)
+#define	STATS_INC_ALLOCED(x)	do { } while (0)
+#define	STATS_INC_ACTIVE(x)	do { } while (0)
 
-/* NUMA: per-node */
-#define list3_data_ptr(cachep, ptr) \
-		list3_data(cachep)
+#define	STATS_DEC_ACTIVE(x)	do { } while (0)
+
+#endif
 
 /*
  * kmem_cache_t
@@ -229,12 +278,11 @@
 	
 struct kmem_cache_s {
 /* 1) per-cpu data, touched during every alloc/free */
-	struct array_cache	*array[NR_CPUS];
+	struct cpucache_wrapper	cpudata[NR_CPUS];
 	unsigned int		batchcount;
 	unsigned int		limit;
 /* 2) touched by every alloc & free from the backend */
-	struct kmem_list3	lists;
-	/* NUMA: kmem_3list_t	*nodelists[NR_NODES] */
+	struct kmem_list3	lists[MAX_NUMNODES];	/* NUMA: pointers would be better */
 	unsigned int		objsize;
 	unsigned int	 	flags;	/* constant flags */
 	unsigned int		num;	/* # of objs per slab */
@@ -252,7 +300,6 @@
 	unsigned int		colour_off;	/* colour offset */
 	unsigned int		colour_next;	/* cache colouring */
 	kmem_cache_t		*slabp_cache;
-	unsigned int		dflags;		/* dynamic flags */
 
 	/* constructor func */
 	void (*ctor)(void *, kmem_cache_t *, unsigned long);
@@ -266,17 +313,15 @@
 
 /* 5) statistics */
 #if STATS
-	unsigned long		num_active;
-	unsigned long		num_allocations;
-	unsigned long		high_mark;
-	unsigned long		grown;
-	unsigned long		reaped;
-	unsigned long 		errors;
-	unsigned long		max_freeable;
-	atomic_t		allochit;
-	atomic_t		allocmiss;
-	atomic_t		freehit;
-	atomic_t		freemiss;
+	atomic_t	errors;
+
+	atomic_t	allochit[NR_CPUS];
+	atomic_t	allocmiss[NR_CPUS];
+	atomic_t	freehit[NR_CPUS];
+	atomic_t	freemiss[NR_CPUS];
+#ifdef CONFIG_SLAB_NUMA
+	atomic_t	foreign[NR_CPUS];
+#endif
 #endif
 };
 
@@ -296,39 +341,21 @@
 #define REAPTIMEOUT_LIST3	(4*HZ)
 
 #if STATS
-#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
-#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
-#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
-#define	STATS_INC_GROWN(x)	((x)->grown++)
-#define	STATS_INC_REAPED(x)	((x)->reaped++)
-#define	STATS_SET_HIGH(x)	do { if ((x)->num_active > (x)->high_mark) \
-					(x)->high_mark = (x)->num_active; \
-				} while (0)
-#define	STATS_INC_ERR(x)	((x)->errors++)
-#define	STATS_SET_FREEABLE(x, i) \
-				do { if ((x)->max_freeable < i) \
-					(x)->max_freeable = i; \
-				} while (0)
+#define	STATS_INC_ERR(x)	atomic_inc(&(x)->errors)
 
-#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
-#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
-#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
-#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
+#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit[smp_processor_id()])
+#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss[smp_processor_id()])
+#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit[smp_processor_id()])
+#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss[smp_processor_id()])
+#define STATS_INC_FOREIGN(x)	atomic_inc(&(x)->foreign[smp_processor_id()])
 #else
-#define	STATS_INC_ACTIVE(x)	do { } while (0)
-#define	STATS_DEC_ACTIVE(x)	do { } while (0)
-#define	STATS_INC_ALLOCED(x)	do { } while (0)
-#define	STATS_INC_GROWN(x)	do { } while (0)
-#define	STATS_INC_REAPED(x)	do { } while (0)
-#define	STATS_SET_HIGH(x)	do { } while (0)
-#define	STATS_INC_ERR(x)	do { } while (0)
-#define	STATS_SET_FREEABLE(x, i) \
-				do { } while (0)
+#define STATS_INC_ERR(x)	do { } while (0)
 
 #define STATS_INC_ALLOCHIT(x)	do { } while (0)
 #define STATS_INC_ALLOCMISS(x)	do { } while (0)
 #define STATS_INC_FREEHIT(x)	do { } while (0)
 #define STATS_INC_FREEMISS(x)	do { } while (0)
+#define STATS_INC_FOREIGN(x)	do { } while (0)	
 #endif
 
 #if DEBUG
@@ -436,8 +463,6 @@
 
 /* internal cache of cache description objs */
 static kmem_cache_t cache_cache = {
-	.lists		= LIST3_INIT(cache_cache.lists),
-	.array		= { [0] = &initarray_cache.cache },
 	.batchcount	= 1,
 	.limit		= BOOT_CPUCACHE_ENTRIES,
 	.objsize	= sizeof(kmem_cache_t),
@@ -514,6 +539,23 @@
 	}
 }
 
+static struct array_cache *alloc_acdata(int limit, int batchcount)
+{
+	int memsize;
+	struct array_cache *nc;
+
+	memsize = sizeof(void*)*limit+sizeof(struct array_cache);
+	nc = kmalloc(memsize, GFP_KERNEL);
+	if (!nc)
+		return NULL;
+	nc->avail = 0;
+	nc->limit = limit;
+	nc->batchcount = batchcount;
+	nc->touched = 0;
+
+	return nc;
+}
+
 /*
  * Note: if someone calls kmem_cache_alloc() on the new
  * cpu before the cpuup callback had a chance to allocate
@@ -531,25 +573,27 @@
 	case CPU_UP_PREPARE:
 		down(&cache_chain_sem);
 		list_for_each(p, &cache_chain) {
-			int memsize;
 			struct array_cache *nc;
 
 			kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next);
-			memsize = sizeof(void*)*cachep->limit+sizeof(struct array_cache);
-			nc = kmalloc(memsize, GFP_KERNEL);
+			nc = alloc_acdata(cachep->limit, cachep->batchcount);
 			if (!nc)
 				goto bad;
-			nc->avail = 0;
-			nc->limit = cachep->limit;
-			nc->batchcount = cachep->batchcount;
-			nc->touched = 0;
 
 			spin_lock_irq(&cachep->spinlock);
-			cachep->array[cpu] = nc;
+			cachep->cpudata[cpu].native = nc;
 			cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
 						+ cachep->num;
 			spin_unlock_irq(&cachep->spinlock);
+#ifdef CONFIG_SLAB_NUMA
+			nc = alloc_acdata(cachep->limit, cachep->limit);
+			if (!nc)
+				goto bad;
 
+			spin_lock_irq(&cachep->spinlock);
+			cachep->cpudata[cpu].alien = nc;
+			spin_unlock_irq(&cachep->spinlock);
+#endif
 		}
 		up(&cache_chain_sem);
 		break;
@@ -564,9 +608,14 @@
 			struct array_cache *nc;
 			kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next);
 
-			nc = cachep->array[cpu];
-			cachep->array[cpu] = NULL;
+			nc = cachep->cpudata[cpu].native;
+			cachep->cpudata[cpu].native = NULL;
 			kfree(nc);
+#ifdef CONFIG_SLAB_NUMA
+			nc = cachep->cpudata[cpu].alien;
+			cachep->cpudata[cpu].alien = NULL;
+			kfree(nc);
+#endif
 		}
 		up(&cache_chain_sem);
 		break;
@@ -584,20 +633,74 @@
 	return (void**)(ac+1);
 }
 
-static inline struct array_cache *ac_data(kmem_cache_t *cachep)
+/*
+ * Helper functions/macros to access the per-cpu
+ * and per-node structures
+ */
+
+#define ac_data(cachep) \
+	((cachep)->cpudata[smp_processor_id()].native)
+
+#define list3_data(cachep) \
+	(&(cachep)->lists[__cpu_to_node(smp_processor_id())])
+
+#ifdef CONFIG_SLAB_NUMA
+/*
+ * NUMA: check where ptr points, and select the appropriate storage
+ * 	for the object.
+ */
+/* FIXME - this function must be somewhere in include/asm- */
+static inline int ptr_to_node(void *obj)
 {
-	return cachep->array[smp_processor_id()];
+	return (((unsigned long)obj)/4/1024/1024)%MAX_NUMNODES;
 }
 
+static inline struct array_cache * ac_data_ptr(kmem_cache_t *cachep, void *objp)
+{
+	if (ptr_to_node(objp) == __cpu_to_node(smp_processor_id()))
+		return cachep->cpudata[smp_processor_id()].native;
+	STATS_INC_FOREIGN(cachep);
+	return cachep->cpudata[smp_processor_id()].alien;
+}
+#define DEFINE_NUMALIST_PTR(x)	\
+	struct kmem_list3 *x
+
+#define set_numalist_ptr(x, cachep, objp) \
+		do { x = &cachep->lists[ptr_to_node(objp)]; } while(0)
+#define set_numalist_cur(x, cachep) \
+		do { x = &cachep->lists[__cpu_to_node(smp_processor_id())]; } while(0)
+#define access_numalist_ptr(cachep, x) \
+		(x)
+
+#else
+
+#define ac_data_ptr(cachep, ptr)	 ac_data(cachep)
+
+#define DEFINE_NUMALIST_PTR(x)	
+#define set_numalist_ptr(x, cachep, objp)	do { } while(0)
+#define set_numalist_cur(x, cachep)	 	do { } while(0)
+
+#define access_numalist_ptr(cachep, x)	 	(&(cachep->lists[0]))
+
+#endif
+
 /* Initialisation - setup the `cache' cache. */
 void __init kmem_cache_init(void)
 {
 	size_t left_over;
+	int i;
 
 	init_MUTEX(&cache_chain_sem);
 	INIT_LIST_HEAD(&cache_chain);
 	list_add(&cache_cache.next, &cache_chain);
 
+	for (i=0;i<MAX_NUMNODES;i++) {
+		INIT_LIST_HEAD(&cache_cache.lists[i].slabs_full);
+		INIT_LIST_HEAD(&cache_cache.lists[i].slabs_partial);
+		INIT_LIST_HEAD(&cache_cache.lists[i].slabs_free);
+	}
+	ac_data(&cache_cache) = &initarray_cache.cache;
+
 	cache_estimate(0, cache_cache.objsize, 0,
 			&left_over, &cache_cache.num);
 	if (!cache_cache.num)
@@ -657,20 +760,33 @@
 	 */
 	{
 		void * ptr;
+#ifdef CONFIG_SLAB_NUMA
+		void * ptr2;
+#endif
 		
-		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+		ptr = alloc_acdata(1, 1);
+#ifdef CONFIG_SLAB_NUMA
+		ptr2 = alloc_acdata(1, 1);
+#endif
 		local_irq_disable();
-		BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
-		memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
-		cache_cache.array[smp_processor_id()] = ptr;
+		BUG_ON(cache_cache.cpudata[smp_processor_id()].native != &initarray_cache.cache);
+		cache_cache.cpudata[smp_processor_id()].native = ptr;
+#ifdef CONFIG_SLAB_NUMA
+		cache_cache.cpudata[smp_processor_id()].alien = ptr2;
+#endif
 		local_irq_enable();
 	
-		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+		ptr = alloc_acdata(1, 1);
+#ifdef CONFIG_SLAB_NUMA
+		ptr2 = alloc_acdata(1, 1);
+#endif
 		local_irq_disable();
-		BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
-		memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
-				sizeof(struct arraycache_init));
-		malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
+		BUG_ON(malloc_sizes[0].cs_cachep->cpudata[smp_processor_id()].native !=
+				&initarray_generic.cache);
+		malloc_sizes[0].cs_cachep->cpudata[smp_processor_id()].native = ptr;
+#ifdef CONFIG_SLAB_NUMA
+		malloc_sizes[0].cs_cachep->cpudata[smp_processor_id()].alien = ptr2;
+#endif
 		local_irq_enable();
 	}
 }
@@ -850,6 +966,7 @@
 	const char *func_nm = KERN_ERR "kmem_create: ";
 	size_t left_over, align, slab_size;
 	kmem_cache_t *cachep = NULL;
+	int i;
 
 	/*
 	 * Sanity checks... these are all serious usage bugs.
@@ -1000,10 +1117,11 @@
 		cachep->gfpflags |= GFP_DMA;
 	spin_lock_init(&cachep->spinlock);
 	cachep->objsize = size;
-	/* NUMA */
-	INIT_LIST_HEAD(&cachep->lists.slabs_full);
-	INIT_LIST_HEAD(&cachep->lists.slabs_partial);
-	INIT_LIST_HEAD(&cachep->lists.slabs_free);
+	for (i=0;i<MAX_NUMNODES;i++) {
+		INIT_LIST_HEAD(&cachep->lists[i].slabs_full);
+		INIT_LIST_HEAD(&cachep->lists[i].slabs_partial);
+		INIT_LIST_HEAD(&cachep->lists[i].slabs_free);
+	}
 
 	if (flags & CFLGS_OFF_SLAB)
 		cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
@@ -1019,24 +1137,26 @@
 			 * the cache that's used by kmalloc(24), otherwise
 			 * the creation of further caches will BUG().
 			 */
-			cachep->array[smp_processor_id()] = &initarray_generic.cache;
+			ac_data(cachep) = &initarray_generic.cache;
 			g_cpucache_up = PARTIAL;
 		} else {
-			cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL);
+			ac_data(cachep) = alloc_acdata(1,1);
+#ifdef CONFIG_SLAB_NUMA
+			cachep->cpudata[smp_processor_id()].alien =
+					alloc_acdata(1,1);
+#endif
 		}
-		BUG_ON(!ac_data(cachep));
-		ac_data(cachep)->avail = 0;
-		ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
-		ac_data(cachep)->batchcount = 1;
-		ac_data(cachep)->touched = 0;
 		cachep->batchcount = 1;
 		cachep->limit = BOOT_CPUCACHE_ENTRIES;
 		cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
 					+ cachep->num;
 	} 
 
-	cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 +
-					((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+	for (i=0;i< MAX_NUMNODES;i++) {
+		cachep->lists[i].next_reap = jiffies + REAPTIMEOUT_LIST3 +
+					((unsigned long)cachep)%REAPTIMEOUT_LIST3 +
+					i*HZ/10;
+	}
 
 	/* Need the semaphore to access the chain. */
 	down(&cache_chain_sem);
@@ -1128,38 +1248,41 @@
 }
 
 
-/* NUMA shrink all list3s */
 static int __cache_shrink(kmem_cache_t *cachep)
 {
 	struct slab *slabp;
 	int ret;
+	int i;
 
 	drain_cpu_caches(cachep);
 
 	check_irq_on();
 	spin_lock_irq(&cachep->spinlock);
 
-	for(;;) {
-		struct list_head *p;
+	ret = 0;
+	for (i=0;i<MAX_NUMNODES;i++) {
+		for(;;) {
+			struct list_head *p;
 
-		p = cachep->lists.slabs_free.prev;
-		if (p == &cachep->lists.slabs_free)
-			break;
+			p = cachep->lists[i].slabs_free.prev;
+			if (p == &cachep->lists[i].slabs_free)
+				break;
 
-		slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list);
+			slabp = list_entry(cachep->lists[i].slabs_free.prev, struct slab, list);
 #if DEBUG
-		if (slabp->inuse)
-			BUG();
+			if (slabp->inuse)
+				BUG();
 #endif
-		list_del(&slabp->list);
+			list_del(&slabp->list);
 
-		cachep->lists.free_objects -= cachep->num;
-		spin_unlock_irq(&cachep->spinlock);
-		slab_destroy(cachep, slabp);
-		spin_lock_irq(&cachep->spinlock);
+			cachep->lists[i].free_objects -= cachep->num;
+			spin_unlock_irq(&cachep->spinlock);
+			slab_destroy(cachep, slabp);
+			spin_lock_irq(&cachep->spinlock);
+		}
+		ret |= !list_empty(&cachep->lists[i].slabs_full);
+		ret |= !list_empty(&cachep->lists[i].slabs_partial);
 	}
-	ret = !list_empty(&cachep->lists.slabs_full) ||
-		!list_empty(&cachep->lists.slabs_partial);
 	spin_unlock_irq(&cachep->spinlock);
 	return ret;
 }
@@ -1217,9 +1340,12 @@
 	}
 	{
 		int i;
-		for (i = 0; i < NR_CPUS; i++)
-			kfree(cachep->array[i]);
-		/* NUMA: free the list3 structures */
+		for (i = 0; i < NR_CPUS; i++) {
+			kfree(cachep->cpudata[i].native);
+#ifdef CONFIG_SLAB_NUMA
+			kfree(cachep->cpudata[i].alien);
+#endif
+		}
 	}
 	kmem_cache_free(&cache_cache, cachep);
 
@@ -1316,7 +1442,7 @@
  * Grow (by 1) the number of slabs within a cache.  This is called by
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
-static int cache_grow (kmem_cache_t * cachep, int flags)
+static struct kmem_list3 *cache_grow (kmem_cache_t * cachep, int flags)
 {
 	struct slab	*slabp;
 	struct page	*page;
@@ -1324,6 +1450,7 @@
 	size_t		 offset;
 	unsigned int	 i, local_flags;
 	unsigned long	 ctor_flags;
+	DEFINE_NUMALIST_PTR(l3);
 
 	/* Be lazy and only check for valid flags here,
  	 * keeping it out of the critical path in kmem_cache_alloc().
@@ -1394,15 +1521,17 @@
 	spin_lock(&cachep->spinlock);
 
 	/* Make slab active. */
-	list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free));
-	STATS_INC_GROWN(cachep);
-	list3_data(cachep)->free_objects += cachep->num;
+	set_numalist_ptr(l3, cachep, slabp->s_mem);
+	list_add_tail(&slabp->list, &(access_numalist_ptr(cachep, l3)->slabs_free));
+	STATS_INC_GROWN(access_numalist_ptr(cachep, l3));
+	access_numalist_ptr(cachep, l3)->free_objects += cachep->num;
 	spin_unlock(&cachep->spinlock);
-	return 1;
+	return access_numalist_ptr(cachep, l3);
 opps1:
 	kmem_freepages(cachep, objp);
 failed:
-	return 0;
+	STATS_INC_ERR(cachep);
+	return NULL;
 }
 
 /*
@@ -1502,25 +1631,6 @@
 #endif
 }
 
-static inline void * cache_alloc_one_tail (kmem_cache_t *cachep,
-						struct slab *slabp)
-{
-	void *objp;
-
-	check_spinlock_acquired(cachep);
-
-	STATS_INC_ALLOCED(cachep);
-	STATS_INC_ACTIVE(cachep);
-	STATS_SET_HIGH(cachep);
-
-	/* get obj pointer */
-	slabp->inuse++;
-	objp = slabp->s_mem + slabp->free*cachep->objsize;
-	slabp->free=slab_bufctl(slabp)[slabp->free];
-
-	return objp;
-}
-
 static inline void cache_alloc_listfixup(struct kmem_list3 *l3, struct slab *slabp)
 {
 	list_del(&slabp->list);
@@ -1539,6 +1649,7 @@
 
 	check_irq_off();
 	ac = ac_data(cachep);
+	l3 = list3_data(cachep);
 retry:
 	batchcount = ac->batchcount;
 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -1548,7 +1659,6 @@
 		 */
 		batchcount = BATCHREFILL_LIMIT;
 	}
-	l3 = list3_data(cachep);
 
 	BUG_ON(ac->avail > 0);
 	spin_lock(&cachep->spinlock);
@@ -1566,9 +1676,16 @@
 
 		slabp = list_entry(entry, struct slab, list);
 		check_slabp(cachep, slabp);
-		while (slabp->inuse < cachep->num && batchcount--)
+		while (slabp->inuse < cachep->num && batchcount--) {
+			STATS_INC_ALLOCED(l3);
+			STATS_INC_ACTIVE(l3);
+
+			slabp->inuse++;
+			/* get obj pointer */
 			ac_entry(ac)[ac->avail++] =
-				cache_alloc_one_tail(cachep, slabp);
+					slabp->s_mem + slabp->free*cachep->objsize;
+			slabp->free=slab_bufctl(slabp)[slabp->free];
+		}
 		check_slabp(cachep, slabp);
 		cache_alloc_listfixup(l3, slabp);
 	}
@@ -1578,12 +1695,11 @@
 	spin_unlock(&cachep->spinlock);
 
 	if (unlikely(!ac->avail)) {
-		int x;
-		x = cache_grow(cachep, flags);
+		l3 = cache_grow(cachep, flags);
 		
 		// cache_grow can reenable interrupts, then ac could change.
 		ac = ac_data(cachep);
-		if (!x && ac->avail == 0)	// no objects in sight? abort
+		if (!l3 && ac->avail == 0)	// no objects in sight? abort
 			return NULL;
 
 		if (!ac->avail)		// objects refilled by interrupt?
@@ -1654,51 +1770,48 @@
 	return objp;
 }
 
-/* 
- * NUMA: different approach needed if the spinlock is moved into
- * the l3 structure
- */
-
-static inline void
-__free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
+static inline void __free_block (kmem_cache_t* cachep, void** objpp, int len)
 {
-	int i;
-
 	check_irq_off();
 	spin_lock(&cachep->spinlock);
+#ifndef CONFIG_SLAB_NUMA
+	cachep->lists[0].free_objects += len;
+#endif
 
-	/* NUMA: move add into loop */
-	cachep->lists.free_objects += nr_objects;
-
-	for (i = 0; i < nr_objects; i++) {
-		void *objp = objpp[i];
-		struct slab *slabp;
-		unsigned int objnr;
+	for ( ; len > 0; len--, objpp++) {
+		struct slab* slabp;
+		void *objp = *objpp;
+		DEFINE_NUMALIST_PTR(l3);
 
 		slabp = GET_PAGE_SLAB(virt_to_page(objp));
 		list_del(&slabp->list);
-		objnr = (objp - slabp->s_mem) / cachep->objsize;
-		slab_bufctl(slabp)[objnr] = slabp->free;
-		slabp->free = objnr;
-		STATS_DEC_ACTIVE(cachep);
-		slabp->inuse--;
+		{
+			unsigned int objnr = (objp-slabp->s_mem)/cachep->objsize;
 
+			slab_bufctl(slabp)[objnr] = slabp->free;
+			slabp->free = objnr;
+		}
+	
+		set_numalist_ptr(l3, cachep, objp);
+		STATS_DEC_ACTIVE(access_numalist_ptr(cachep, l3));
+#ifdef CONFIG_SLAB_NUMA
+		l3->free_objects++;
+#endif
 		/* fixup slab chains */
-		if (slabp->inuse == 0) {
-			if (cachep->lists.free_objects > cachep->free_limit) {
-				cachep->lists.free_objects -= cachep->num;
+		if (unlikely(!--slabp->inuse)) {
+			if (access_numalist_ptr(cachep, l3)->free_objects > cachep->free_limit) {
+				access_numalist_ptr(cachep, l3)->free_objects -= cachep->num;
 				slab_destroy(cachep, slabp);
 			} else {
 				list_add(&slabp->list,
-				&list3_data_ptr(cachep, objp)->slabs_free);
+						&(access_numalist_ptr(cachep, l3)->slabs_free));
 			}
 		} else {
 			/* Unconditionally move a slab to the end of the
 			 * partial list on free - maximum time for the
 			 * other objects to be freed, too.
 			 */
-			list_add_tail(&slabp->list,
-				&list3_data_ptr(cachep, objp)->slabs_partial);
+			list_add_tail(&slabp->list, &(access_numalist_ptr(cachep, l3)->slabs_partial));
 		}
 	}
 	spin_unlock(&cachep->spinlock);
@@ -1720,26 +1833,6 @@
 	check_irq_off();
 	__free_block(cachep, &ac_entry(ac)[0], batchcount);
 
-#if STATS
-	{
-		int i = 0;
-		struct list_head *p;
-
-		spin_lock(&cachep->spinlock);
-		p = list3_data(cachep)->slabs_free.next;
-		while (p != &(list3_data(cachep)->slabs_free)) {
-			struct slab *slabp;
-
-			slabp = list_entry(p, struct slab, list);
-			BUG_ON(slabp->inuse);
-
-			i++;
-			p = p->next;
-		}
-		STATS_SET_FREEABLE(cachep, i);
-		spin_unlock(&cachep->spinlock);
-	}
-#endif
 	ac->avail -= batchcount;
 	memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount],
 			sizeof(void*)*ac->avail);
@@ -1754,7 +1847,7 @@
  */
 static inline void __cache_free (kmem_cache_t *cachep, void* objp)
 {
-	struct array_cache *ac = ac_data(cachep);
+	struct array_cache *ac = ac_data_ptr(cachep, objp);
 
 	check_irq_off();
 	objp = cache_free_debugcheck(cachep, objp);
@@ -1890,6 +1983,9 @@
 struct ccupdate_struct {
 	kmem_cache_t *cachep;
 	struct array_cache *new[NR_CPUS];
+#ifdef CONFIG_SLAB_NUMA
+	struct array_cache *new_alien[NR_CPUS];
+#endif
 };
 
 static void do_ccupdate_local(void *info)
@@ -1898,10 +1994,15 @@
 	struct array_cache *old;
 
 	check_irq_off();
-	old = ac_data(new->cachep);
-	
-	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
+	old = new->cachep->cpudata[smp_processor_id()].native;
+	new->cachep->cpudata[smp_processor_id()].native = new->new[smp_processor_id()];
 	new->new[smp_processor_id()] = old;
+
+#ifdef CONFIG_SLAB_NUMA
+	old = new->cachep->cpudata[smp_processor_id()].alien;
+	new->cachep->cpudata[smp_processor_id()].alien = new->new_alien[smp_processor_id()];
+	new->new_alien[smp_processor_id()] = old;
+#endif
 }
 
 
@@ -1909,22 +2010,22 @@
 {
 	struct ccupdate_struct new;
 	int i;
+	int ret;
 
 	memset(&new.new,0,sizeof(new.new));
 	for (i = 0; i < NR_CPUS; i++) {
-		struct array_cache *ccnew;
-
-		ccnew = kmalloc(sizeof(void*)*limit+
-				sizeof(struct array_cache), GFP_KERNEL);
-		if (!ccnew) {
-			for (i--; i >= 0; i--) kfree(new.new[i]);
-			return -ENOMEM;
-		}
-		ccnew->avail = 0;
-		ccnew->limit = limit;
-		ccnew->batchcount = batchcount;
-		ccnew->touched = 0;
-		new.new[i] = ccnew;
+		new.new[i] = alloc_acdata(limit, batchcount);
+		if (!new.new[i]) {
+			ret = -ENOMEM;
+			goto out;
+		}
+#ifdef CONFIG_SLAB_NUMA
+		new.new_alien[i] = alloc_acdata(limit, limit);
+		if (!new.new_alien[i]) {
+			ret = -ENOMEM;
+			goto out;
+		}
+#endif
 	}
 	new.cachep = cachep;
 
@@ -1936,17 +2037,30 @@
 	cachep->limit = limit;
 	cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num;
 	spin_unlock_irq(&cachep->spinlock);
-
+	
+	ret = 0;
+out:
 	for (i = 0; i < NR_CPUS; i++) {
-		struct array_cache *ccold = new.new[i];
-		if (!ccold)
-			continue;
-		local_irq_disable();
-		free_block(cachep, ac_entry(ccold), ccold->avail);
-		local_irq_enable();
-		kfree(ccold);
+		struct array_cache* ccold;
+		
+		ccold = new.new[i];
+		if (ccold) {
+			local_irq_disable();
+			free_block(cachep, ac_entry(ccold), ccold->avail);
+			local_irq_enable();
+			kfree(ccold);
+		}
+#ifdef CONFIG_SLAB_NUMA
+		ccold = new.new_alien[i];
+		if (ccold) {
+			local_irq_disable();
+			free_block(cachep, ac_entry(ccold), ccold->avail);
+			local_irq_enable();
+			kfree(ccold);
+		}
+#endif
 	}
-	return 0;
+	return ret;
 }
 
 
@@ -1998,6 +2112,7 @@
 		int tofree;
 		struct array_cache *ac;
 		struct slab *slabp;
+		DEFINE_NUMALIST_PTR(l3);
 
 		searchp = list_entry(walk, kmem_cache_t, next);
 
@@ -2019,36 +2134,41 @@
 			memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
 					sizeof(void*)*ac->avail);
 		}
-		if(time_after(searchp->lists.next_reap, jiffies))
+#ifdef CONFIG_SLAB_NUMA
+		ac = searchp->cpudata[smp_processor_id()].alien;
+		free_block(searchp, ac_entry(ac), ac->avail);
+		ac->avail = 0;
+#endif
+		set_numalist_cur(l3, searchp);
+		if(time_after(access_numalist_ptr(searchp, l3)->next_reap, jiffies))
 			goto next_irqon;
 
 		spin_lock(&searchp->spinlock);
-		if(time_after(searchp->lists.next_reap, jiffies)) {
+		if(time_after(access_numalist_ptr(searchp, l3)->next_reap, jiffies)) {
 			goto next_unlock;
 		}
-		searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3;
-		if (searchp->lists.free_touched) {
-			searchp->lists.free_touched = 0;
+		access_numalist_ptr(searchp, l3)->next_reap = jiffies + REAPTIMEOUT_LIST3;
+		if (access_numalist_ptr(searchp, l3)->free_touched) {
+			access_numalist_ptr(searchp, l3)->free_touched = 0;
 			goto next_unlock;
 		}
 
 		tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num);
 		do {
-			p = list3_data(searchp)->slabs_free.next;
-			if (p == &(list3_data(searchp)->slabs_free))
+			p = access_numalist_ptr(searchp, l3)->slabs_free.next;
+			if (p == &(access_numalist_ptr(searchp, l3)->slabs_free))
 				break;
 
 			slabp = list_entry(p, struct slab, list);
 			BUG_ON(slabp->inuse);
 			list_del(&slabp->list);
-			STATS_INC_REAPED(searchp);
 
 			/* Safe to drop the lock. The slab is no longer
 			 * linked to the cache.
 			 * searchp cannot disappear, we hold
 			 * cache_chain_lock
 			 */
-			searchp->lists.free_objects -= searchp->num;
+			access_numalist_ptr(searchp, l3)->free_objects -= searchp->num;
 			spin_unlock_irq(&searchp->spinlock);
 			slab_destroy(searchp, slabp);
 			spin_lock_irq(&searchp->spinlock);
@@ -2075,7 +2195,7 @@
 	struct timer_list *rt = &reap_timers[cpu];
 
 	cache_reap();
-	mod_timer(rt, jiffies + REAPTIMEOUT_CPUC + cpu);
+	mod_timer(rt, jiffies + REAPTIMEOUT_CPUC);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -2116,19 +2236,16 @@
 {
 	kmem_cache_t *cachep = p;
 	struct list_head *q;
-	struct slab	*slabp;
-	unsigned long	active_objs;
-	unsigned long	num_objs;
-	unsigned long	active_slabs = 0;
-	unsigned long	num_slabs;
+	struct slab		*slabp;
 	const char *name; 
+	int i;
 
 	if (p == (void*)1) {
 		/*
 		 * Output format version, so at least we can change it
 		 * without _too_ many complaints.
 		 */
-		seq_puts(m, "slabinfo - version: 1.2"
+		seq_puts(m, "slabinfo - version: 2.0"
 #if STATS
 				" (statistics)"
 #endif
@@ -2136,33 +2253,7 @@
 		return 0;
 	}
 
-	check_irq_on();
-	spin_lock_irq(&cachep->spinlock);
-	active_objs = 0;
-	num_slabs = 0;
-	list_for_each(q,&cachep->lists.slabs_full) {
-		slabp = list_entry(q, struct slab, list);
-		if (slabp->inuse != cachep->num)
-			BUG();
-		active_objs += cachep->num;
-		active_slabs++;
-	}
-	list_for_each(q,&cachep->lists.slabs_partial) {
-		slabp = list_entry(q, struct slab, list);
-		BUG_ON(slabp->inuse == cachep->num || !slabp->inuse);
-		active_objs += slabp->inuse;
-		active_slabs++;
-	}
-	list_for_each(q,&cachep->lists.slabs_free) {
-		slabp = list_entry(q, struct slab, list);
-		if (slabp->inuse)
-			BUG();
-		num_slabs++;
-	}
-	num_slabs+=active_slabs;
-	num_objs = num_slabs*cachep->num;
-	BUG_ON(num_objs - active_objs != cachep->lists.free_objects);
-
+	/* line 1: global stats */
 	name = cachep->name; 
 	{
 	char tmp; 
@@ -2175,33 +2266,76 @@
 	set_fs(old_fs);
 	} 	
 
-	seq_printf(m, "%-17s %6lu %6lu %6u %4lu %4lu %4u",
-		name, active_objs, num_objs, cachep->objsize,
-		active_slabs, num_slabs, (1<<cachep->gfporder));
+	seq_printf(m, "%-17s : %6u %6u %4u 0x%04x %6u %4u %4u",
+		name, cachep->objsize, cachep->num, (1<<cachep->gfporder),
+		cachep->flags, cachep->free_limit, cachep->limit, cachep->batchcount);
+#if STATS
+	seq_printf(m, " %4u", atomic_read(&cachep->errors));
+#endif
+
+	seq_putc(m, '\n');
+
+
+	check_irq_on();
+	/* block 2: list3 data */
+	spin_lock_irq(&cachep->spinlock);
+	for (i=0;i<MAX_NUMNODES;i++) {
+		struct kmem_list3 *l3 = &cachep->lists[i];
+		unsigned long	active_objs = 0;
+		unsigned long	num_objs = 0;
+		unsigned long	active_slabs = 0;
+		unsigned long	num_slabs = 0;
+
+		list_for_each(q,&l3->slabs_full) {
+			slabp = list_entry(q, struct slab, list);
+			if (slabp->inuse != cachep->num)
+				BUG();
+			active_objs += cachep->num;
+			active_slabs++;
+		}
+		list_for_each(q,&l3->slabs_partial) {
+			slabp = list_entry(q, struct slab, list);
+			BUG_ON(slabp->inuse == cachep->num || !slabp->inuse);
+			active_objs += slabp->inuse;
+			active_slabs++;
+		}
+		list_for_each(q,&l3->slabs_free) {
+			slabp = list_entry(q, struct slab, list);
+			if (slabp->inuse)
+				BUG();
+			num_slabs++;
+		}
+		num_slabs+=active_slabs;
+		num_objs = num_slabs*cachep->num;
+
+		BUG_ON(num_objs - active_objs != l3->free_objects);
+		seq_printf(m, "# Node %2u         : %6lu %6lu %8lu %8lu",
+			i, active_slabs, num_slabs, active_objs, num_objs);
+#if STATS
+		BUG_ON(active_objs != l3->num_active);
 
-	seq_printf(m, " : %4u %4u", cachep->limit, cachep->batchcount);
+		seq_printf(m, " %8lu %8lu %6lu", l3->num_allocations, 
+					l3->high_mark, l3->grown);
+#endif
+		seq_putc(m, '\n');
+	}
+	/* block 3: array data */
 #if STATS
-	{	// list3 stats
-		unsigned long high = cachep->high_mark;
-		unsigned long allocs = cachep->num_allocations;
-		unsigned long grown = cachep->grown;
-		unsigned long reaped = cachep->reaped;
-		unsigned long errors = cachep->errors;
-		unsigned long max_freeable = cachep->max_freeable;
-		unsigned long free_limit = cachep->free_limit;
-
-		seq_printf(m, " : %6lu %7lu %5lu %4lu %4lu %4lu %4lu",
-				high, allocs, grown, reaped, errors, 
-				max_freeable, free_limit);
-	}
-	{	// cpucache stats
-		unsigned long allochit = atomic_read(&cachep->allochit);
-		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
-		unsigned long freehit = atomic_read(&cachep->freehit);
-		unsigned long freemiss = atomic_read(&cachep->freemiss);
+	for (i=0;i<NR_CPUS;i++) {
+		if (!cpu_online(i))
+			continue;
 
-		seq_printf(m, " : %6lu %6lu %6lu %6lu",
-				allochit, allocmiss, freehit, freemiss);
+		seq_printf(m, "# Cpu %2i          : %6u %6u %6u %6u",
+				i, 
+				atomic_read(&cachep->allochit[i]),
+				atomic_read(&cachep->allocmiss[i]),
+				atomic_read(&cachep->freehit[i]),
+				atomic_read(&cachep->freemiss[i]));
+#ifdef CONFIG_SLAB_NUMA
+		seq_printf(m, " %6u",
+				atomic_read(&cachep->foreign[i]));
+#endif
+		seq_putc(m, '\n');
 	}
 #endif
 	spin_unlock_irq(&cachep->spinlock);

next             reply	other threads:[~2002-11-09 17:47 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2002-11-09 17:51 Manfred Spraul [this message]
2002-11-09 22:51 ` [Lse-tech] [PATCH] numa slab, rediffed against 2.5.46 Martin J. Bligh

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=3DCD4B30.2090204@colorfullife.com \
    --to=manfred@colorfullife.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lse-tech@lists.sourceforge.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.