--- 2.5/mm/slab.c Tue Oct 8 22:18:52 2002 +++ build-2.5/mm/slab.c Wed Oct 9 00:03:37 2002 @@ -85,6 +85,21 @@ #include /* + * Enable the NUMA mode for slab + * This is a separate define from CONFIG_DISCONTIGMEM, because it only + * applies if ZONE_NORMAL allocations are possible on all zones. + * TODO: + * - the behaviour is extremely bad if get_free_pages returns returns + * memory from the another node. This must be improved. + * - make the cache structures themselves node local + * - determine the optimal placement for the chache spinlock: + * node local or global? + * - add NUMA specific statistic counters, otherwise we'll + * never find that out. + */ +#define CONFIG_SLAB_NUMA + +/* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, * SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). @@ -189,6 +204,12 @@ unsigned int touched; } cpucache_t; +struct cpucache_wrapper { + struct cpucache_s *native; +#ifdef CONFIG_SLAB_NUMA + struct cpucache_s *alien; +#endif +}; /* bootstrap: The caches do not work without cpuarrays anymore, * but the cpuarrays are allocated from the generic caches... */ @@ -200,14 +221,20 @@ #define cc_entry(cpucache) \ ((void **)(((cpucache_t*)(cpucache))+1)) + + +#ifdef CONFIG_SLAB_NUMA + #define cc_data(cachep) \ - ((cachep)->cpudata[smp_processor_id()]) -/* - * NUMA: check if 'ptr' points into the current node, - * use the alternate cpudata cache if wrong - */ + ((cachep)->cpudata[smp_processor_id()].native) + +#else +#define cc_data(cachep) \ + ((cachep)->cpudata[smp_processor_id()].native) + #define cc_data_ptr(cachep, ptr) \ cc_data(cachep) +#endif /* * The slab lists of all objects. @@ -231,12 +258,25 @@ .slabs_partial = LIST_HEAD_INIT(parent.slabs_partial), \ .slabs_free = LIST_HEAD_INIT(parent.slabs_free) \ } + +#ifdef CONFIG_SLAB_NUMA +#define list3_data(cachep) \ + (&(cachep)->lists[__cpu_to_node(smp_processor_id())]) +#else #define list3_data(cachep) \ - (&(cachep)->lists) + (&(cachep)->lists[0]) + +#define DEFINE_NUMALIST_PTR(x) +#define set_numalist_ptr(x, cachep, objp) \ + do { } while(0) -/* NUMA: per-node */ -#define list3_data_ptr(cachep, ptr) \ - list3_data(cachep) +#define set_numalist_cur(x, cachep) \ + do { } while(0) + +#define access_numalist_ptr(cachep, x) \ + (&(cachep->lists[0])) + +#endif /* * kmem_cache_t @@ -246,13 +286,12 @@ struct kmem_cache_s { /* 1) per-cpu data, touched during every alloc/free */ - cpucache_t *cpudata[NR_CPUS]; - /* NUMA: cpucache_t *cpudata_othernode[NR_CPUS]; */ + + struct cpucache_wrapper cpudata[NR_CPUS]; unsigned int batchcount; unsigned int limit; /* 2) touched by every alloc & free from the backend */ - struct kmem_list3 lists; - /* NUMA: kmem_3list_t *nodelists[NR_NODES] */ + struct kmem_list3 lists[MAX_NUMNODES]; /* NUMA: pointers would be better */ unsigned int objsize; unsigned int flags; /* constant flags */ unsigned int num; /* # of objs per slab */ @@ -446,8 +485,6 @@ /* internal cache of cache description objs */ static kmem_cache_t cache_cache = { - .lists = LIST3_INIT(cache_cache.lists), - .cpudata = { [0] = &cpuarray_cache.cache }, .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .objsize = sizeof(kmem_cache_t), @@ -590,11 +627,26 @@ nc->touched = 0; spin_lock_irq(&cachep->spinlock); - cachep->cpudata[cpu] = nc; + cachep->cpudata[cpu].native = nc; cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num; spin_unlock_irq(&cachep->spinlock); +#ifdef CONFIG_NUMASLAB + memsize = sizeof(void*)*cachep->limit+sizeof(cpucache_t); + nc = kmalloc(memsize, GFP_KERNEL); + if (!nc) + goto bad; + nc->avail = 0; + nc->limit = cachep->limit; + nc->batchcount = cachep->limit; + nc->touched = 0; + spin_lock_irq(&cachep->spinlock); + cachep->cpudata[cpu].alien = nc; + cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + + cachep->num; + spin_unlock_irq(&cachep->spinlock); +#endif } if (g_cpucache_up == FULL) @@ -610,15 +662,51 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; +#ifdef CONFIG_SLAB_NUMA +/* + * NUMA: check where ptr points, and select the appropriate storage + * for the object. + */ + +static inline int ptr_to_node(void *obj) +{ + return 0; /* FIXME */ +} +static inline struct cpucache_s * cc_data_ptr(kmem_cache_t *cachep, void *objp) +{ + if (ptr_to_node(objp) == __cpu_to_node(smp_processor_id())) + return cachep->cpudata[smp_processor_id()].native; + return cachep->cpudata[smp_processor_id()].alien; +} +#define DEFINE_NUMALIST_PTR(x) \ + struct kmem_list3 *x + +#define set_numalist_ptr(x, cachep, objp) \ + do { x = &cachep->lists[ptr_to_node(objp)]; } while(0) +#define set_numalist_cur(x, cachep) \ + do { x = &cachep->lists[__cpu_to_node(smp_processor_id())]; } while(0) +#define access_numalist_ptr(cachep, x) \ + (x) + +#endif + /* Initialisation - setup the `cache' cache. */ void __init kmem_cache_init(void) { size_t left_over; + int i; init_MUTEX(&cache_chain_sem); INIT_LIST_HEAD(&cache_chain); list_add(&cache_cache.next, &cache_chain); + for (i=0;icpudata[0].native != + &cpuarray_generic.cache); + memcpy(ptr, cache_sizes[0].cs_cachep->cpudata[0].native, sizeof(struct cpucache_int)); - cc_data(cache_sizes[0].cs_cachep) = ptr; + cache_sizes[0].cs_cachep->cpudata[0].native = ptr; +#ifdef CONFIG_SLAB_NUMA + memcpy(ptr2, ptr, sizeof(struct cpucache_int)); + cache_sizes[0].cs_cachep->cpudata[0].alien = ptr2; +#endif local_irq_enable(); } } @@ -871,6 +977,7 @@ const char *func_nm = KERN_ERR "kmem_create: "; size_t left_over, align, slab_size; kmem_cache_t *cachep = NULL; + int i; /* * Sanity checks... these are all serious usage bugs. @@ -1021,10 +1128,11 @@ cachep->gfpflags |= GFP_DMA; spin_lock_init(&cachep->spinlock); cachep->objsize = size; - /* NUMA */ - INIT_LIST_HEAD(&cachep->lists.slabs_full); - INIT_LIST_HEAD(&cachep->lists.slabs_partial); - INIT_LIST_HEAD(&cachep->lists.slabs_free); + for (i=0;ilists[i].slabs_full); + INIT_LIST_HEAD(&cachep->lists[i].slabs_partial); + INIT_LIST_HEAD(&cachep->lists[i].slabs_free); + } if (flags & CFLGS_OFF_SLAB) cachep->slabp_cache = kmem_find_general_cachep(slab_size,0); @@ -1044,20 +1152,35 @@ g_cpucache_up = PARTIAL; } else { cc_data(cachep) = kmalloc(sizeof(struct cpucache_int),GFP_KERNEL); +#ifdef CONFIG_SLAB_NUMA + cachep->cpudata[smp_processor_id()].alien = + kmalloc(sizeof(struct cpucache_int),GFP_KERNEL); +#endif } BUG_ON(!cc_data(cachep)); cc_data(cachep)->avail = 0; cc_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; cc_data(cachep)->batchcount = 1; cc_data(cachep)->touched = 0; +#ifdef CONFIG_SLAB_NUMA + if (cachep->cpudata[smp_processor_id()].alien) { + cachep->cpudata[smp_processor_id()].alien->avail = 0; + cachep->cpudata[smp_processor_id()].alien->limit = BOOT_CPUCACHE_ENTRIES; + cachep->cpudata[smp_processor_id()].alien->batchcount = 1; + cachep->cpudata[smp_processor_id()].alien->touched = 0; + } +#endif cachep->batchcount = 1; cachep->limit = BOOT_CPUCACHE_ENTRIES; cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num; } - cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep)%REAPTIMEOUT_LIST3; + for (i=0;i< MAX_NUMNODES;i++) { + cachep->lists[i].next_reap = jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep)%REAPTIMEOUT_LIST3 + + i*HZ/10; + } /* Need the semaphore to access the chain. */ down(&cache_chain_sem); @@ -1151,38 +1274,41 @@ } -/* NUMA shrink all list3s */ static int __cache_shrink(kmem_cache_t *cachep) { slab_t *slabp; int ret; + int i; drain_cpu_caches(cachep); check_irq_on(); spin_lock_irq(&cachep->spinlock); - for(;;) { - struct list_head *p; + ret = 0; + for (i=0;ilists.slabs_free.prev; - if (p == &cachep->lists.slabs_free) - break; + p = cachep->lists[i].slabs_free.prev; + if (p == &cachep->lists[i].slabs_free) + break; - slabp = list_entry(cachep->lists.slabs_free.prev, slab_t, list); + slabp = list_entry(cachep->lists[i].slabs_free.prev, slab_t, list); #if DEBUG - if (slabp->inuse) - BUG(); + if (slabp->inuse) + BUG(); #endif - list_del(&slabp->list); + list_del(&slabp->list); - cachep->lists.free_objects -= cachep->num; - spin_unlock_irq(&cachep->spinlock); - slab_destroy(cachep, slabp); - spin_lock_irq(&cachep->spinlock); + cachep->lists[i].free_objects -= cachep->num; + spin_unlock_irq(&cachep->spinlock); + slab_destroy(cachep, slabp); + spin_lock_irq(&cachep->spinlock); + } + ret |= !list_empty(&cachep->lists[i].slabs_full); + ret |= !list_empty(&cachep->lists[i].slabs_partial); } - ret = !list_empty(&cachep->lists.slabs_full) || - !list_empty(&cachep->lists.slabs_partial); spin_unlock_irq(&cachep->spinlock); return ret; } @@ -1242,9 +1368,12 @@ } { int i; - for (i = 0; i < NR_CPUS; i++) - kfree(cachep->cpudata[i]); - /* NUMA: free the list3 structures */ + for (i = 0; i < NR_CPUS; i++) { + kfree(cachep->cpudata[i].native); +#ifdef CONFIG_SLAB_NUMA + kfree(cachep->cpudata[i].alien); +#endif + } } kmem_cache_free(&cache_cache, cachep); @@ -1674,21 +1803,18 @@ return objp; } -/* - * NUMA: different approach needed if the spinlock is moved into - * the l3 structure - */ - static inline void __free_block (kmem_cache_t* cachep, void** objpp, int len) { check_irq_off(); spin_lock(&cachep->spinlock); - /* NUMA: move add into loop */ - cachep->lists.free_objects += len; +#ifndef CONFIG_SLAB_NUMA + cachep->lists[0].free_objects += len; +#endif for ( ; len > 0; len--, objpp++) { slab_t* slabp; void *objp = *objpp; + DEFINE_NUMALIST_PTR(l3); slabp = GET_PAGE_SLAB(virt_to_page(objp)); list_del(&slabp->list); @@ -1700,21 +1826,25 @@ } STATS_DEC_ACTIVE(cachep); + set_numalist_ptr(l3, cachep, objp); +#ifdef CONFIG_SLAB_NUMA + l3->free_objects++; +#endif /* fixup slab chains */ if (unlikely(!--slabp->inuse)) { - if (cachep->lists.free_objects > cachep->free_limit) { - cachep->lists.free_objects -= cachep->num; + if (access_numalist_ptr(cachep, l3)->free_objects > cachep->free_limit) { + access_numalist_ptr(cachep, l3)->free_objects -= cachep->num; slab_destroy(cachep, slabp); } else { list_add(&slabp->list, - &list3_data_ptr(cachep, objp)->slabs_free); + &(access_numalist_ptr(cachep, l3)->slabs_free)); } } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. */ - list_add_tail(&slabp->list, &list3_data_ptr(cachep, objp)->slabs_partial); + list_add_tail(&slabp->list, &(access_numalist_ptr(cachep, l3)->slabs_partial)); } } spin_unlock(&cachep->spinlock); @@ -1906,6 +2036,9 @@ struct ccupdate_struct { kmem_cache_t *cachep; cpucache_t *new[NR_CPUS]; +#ifdef CONFIG_SLAB_NUMA + cpucache_t *new_alien[NR_CPUS]; +#endif }; static void do_ccupdate_local(void *info) @@ -1914,10 +2047,15 @@ cpucache_t *old; check_irq_off(); - old = cc_data(new->cachep); - - cc_data(new->cachep) = new->new[smp_processor_id()]; + old = new->cachep->cpudata[smp_processor_id()].native; + new->cachep->cpudata[smp_processor_id()].native = new->new[smp_processor_id()]; new->new[smp_processor_id()] = old; + +#ifdef CONFIG_SLAB_NUMA + old = new->cachep->cpudata[smp_processor_id()].alien; + new->cachep->cpudata[smp_processor_id()].alien = new->new_alien[smp_processor_id()]; + new->new_alien[smp_processor_id()] = old; +#endif } @@ -1941,6 +2079,19 @@ ccnew->batchcount = batchcount; ccnew->touched = 0; new.new[i] = ccnew; +#ifdef CONFIG_SLAB_NUMA + ccnew = kmalloc(sizeof(void*)*limit+ + sizeof(cpucache_t), GFP_KERNEL); + if (!ccnew) { + for (i--; i >= 0; i--) kfree(new.new[i]); + return -ENOMEM; + } + ccnew->avail = 0; + ccnew->limit = limit; + ccnew->batchcount = limit; + ccnew->touched = 0; + new.new_alien[i] = ccnew; +#endif } new.cachep = cachep; @@ -1954,13 +2105,22 @@ spin_unlock_irq(&cachep->spinlock); for (i = 0; i < NR_CPUS; i++) { - cpucache_t* ccold = new.new[i]; - if (!ccold) - continue; - local_irq_disable(); - free_block(cachep, cc_entry(ccold), ccold->avail); - local_irq_enable(); - kfree(ccold); + cpucache_t* ccold; + + ccold = new.new[i]; + if (ccold) { + local_irq_disable(); + free_block(cachep, cc_entry(ccold), ccold->avail); + local_irq_enable(); + kfree(ccold); + } + ccold = new.new_alien[i]; + if (ccold) { + local_irq_disable(); + free_block(cachep, cc_entry(ccold), ccold->avail); + local_irq_enable(); + kfree(ccold); + } } return 0; } @@ -2010,6 +2170,7 @@ int tofree; cpucache_t *cc; slab_t *slabp; + DEFINE_NUMALIST_PTR(l3); searchp = list_entry(walk, kmem_cache_t, next); @@ -2031,16 +2192,22 @@ memmove(&cc_entry(cc)[0], &cc_entry(cc)[tofree], sizeof(void*)*cc->avail); } - if(time_after(searchp->lists.next_reap, jiffies)) +#ifdef CONFIG_SLAB_NUMA + cc = searchp->cpudata[smp_processor_id()].alien; + free_block(searchp, cc_entry(cc), cc->avail); + cc->avail = 0; +#endif + set_numalist_cur(l3, searchp); + if(time_after(access_numalist_ptr(searchp, l3)->next_reap, jiffies)) goto next_irqon; spin_lock(&searchp->spinlock); - if(time_after(searchp->lists.next_reap, jiffies)) { + if(time_after(access_numalist_ptr(searchp, l3)->next_reap, jiffies)) { goto next_unlock; } - searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3; - if (searchp->lists.free_touched) { - searchp->lists.free_touched = 0; + access_numalist_ptr(searchp, l3)->next_reap = jiffies + REAPTIMEOUT_LIST3; + if (access_numalist_ptr(searchp, l3)->free_touched) { + access_numalist_ptr(searchp, l3)->free_touched = 0; goto next_unlock; } @@ -2060,7 +2227,7 @@ * searchp cannot disappear, we hold * cache_chain_lock */ - searchp->lists.free_objects -= searchp->num; + access_numalist_ptr(searchp, l3)->free_objects -= searchp->num; spin_unlock_irq(&searchp->spinlock); slab_destroy(searchp, slabp); spin_lock_irq(&searchp->spinlock); @@ -2128,6 +2295,7 @@ unsigned long active_slabs = 0; unsigned long num_slabs; const char *name; + int i; if (p == (void*)1) { /* @@ -2146,28 +2314,32 @@ spin_lock_irq(&cachep->spinlock); active_objs = 0; num_slabs = 0; - list_for_each(q,&cachep->lists.slabs_full) { - slabp = list_entry(q, slab_t, list); - if (slabp->inuse != cachep->num) - BUG(); - active_objs += cachep->num; - active_slabs++; - } - list_for_each(q,&cachep->lists.slabs_partial) { - slabp = list_entry(q, slab_t, list); - BUG_ON(slabp->inuse == cachep->num || !slabp->inuse); - active_objs += slabp->inuse; - active_slabs++; - } - list_for_each(q,&cachep->lists.slabs_free) { - slabp = list_entry(q, slab_t, list); - if (slabp->inuse) - BUG(); - num_slabs++; + for (i=0;ilists[i].slabs_full) { + slabp = list_entry(q, slab_t, list); + if (slabp->inuse != cachep->num) + BUG(); + active_objs += cachep->num; + active_slabs++; + } + list_for_each(q,&cachep->lists[i].slabs_partial) { + slabp = list_entry(q, slab_t, list); + BUG_ON(slabp->inuse == cachep->num || !slabp->inuse); + active_objs += slabp->inuse; + active_slabs++; + } + list_for_each(q,&cachep->lists[i].slabs_free) { + slabp = list_entry(q, slab_t, list); + if (slabp->inuse) + BUG(); + num_slabs++; + } } num_slabs+=active_slabs; num_objs = num_slabs*cachep->num; - BUG_ON(num_objs - active_objs != cachep->lists.free_objects); +#ifndef CONFIG_SLAB_NUMA + BUG_ON(num_objs - active_objs != cachep->lists[0].free_objects); +#endif name = cachep->name; {