Subject: Re: [patch v2] slab: add memory hotplug support
From: Pekka Enberg
To: David Rientjes
Cc: Nick Piggin, Andi Kleen, Christoph Lameter, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, haicheng.li@intel.com, KAMEZAWA Hiroyuki
Date: Tue, 30 Mar 2010 12:01:40 +0300
Message-ID: <84144f021003300201x563c72vb41cc9de359cc7d0@mail.gmail.com>
In-Reply-To:
References: <20100226155755.GE16335@basil.fritz.box> <20100305062002.GV8653@laptop>
	<20100309134633.GM8653@laptop>

On Sun, Mar 28, 2010 at 5:40 AM, David Rientjes wrote:
> Slab lacks any memory hotplug support for nodes that are hotplugged
> without cpus being hotplugged.  This is possible at least on x86
> CONFIG_MEMORY_HOTPLUG_SPARSE kernels where SRAT entries are marked
> ACPI_SRAT_MEM_HOT_PLUGGABLE and the regions of RAM represent a separate
> node.  It can also be done manually by writing the start address to
> /sys/devices/system/memory/probe for kernels that have
> CONFIG_ARCH_MEMORY_PROBE set, which is how this patch was tested, and
> then onlining the new memory region.
>
> When a node is hotadded, a nodelist for that node is allocated and
> initialized for each slab cache.  If this isn't completed due to a lack
> of memory, the hotadd is aborted: we have a reasonable expectation that
> kmalloc_node(nid) will work for all caches if nid is online and memory is
> available.
>
> Since nodelists must be allocated and initialized prior to the new node's
> memory actually being online, the struct kmem_list3 is allocated off-node
> due to kmalloc_node()'s fallback.
>
> When an entire node would be offlined, its nodelists are subsequently
> drained.  If slab objects still exist and cannot be freed, the offline is
> aborted.  It is still possible, however, that objects will be allocated
> between this drain and page isolation, so the offline may still fail.
>
> Signed-off-by: David Rientjes

Nick, Christoph, let's make a deal: you ACK, I merge. How does that sound
to you?
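
For anyone who wants to reproduce the test described in the changelog, the
manual flow boils down to two sysfs writes. Below is a minimal user-space
sketch (not part of the patch), assuming CONFIG_ARCH_MEMORY_PROBE is set, a
128MB memory block size, and a made-up physical address of 0x100000000, which
maps to block index 32:

#include <stdio.h>

int main(void)
{
	FILE *f;

	/* Probe a new memory section at a hypothetical physical address. */
	f = fopen("/sys/devices/system/memory/probe", "w");
	if (!f)
		return 1;
	fprintf(f, "0x100000000\n");
	fclose(f);

	/*
	 * Online the corresponding memory block.  The block index is the
	 * physical address divided by the block size reported in
	 * /sys/devices/system/memory/block_size_bytes (128MB assumed here).
	 */
	f = fopen("/sys/devices/system/memory/memory32/state", "w");
	if (!f)
		return 1;
	fprintf(f, "online\n");
	fclose(f);
	return 0;
}

The MEM_GOING_ONLINE notification fires before the "online" write completes,
which is the window in which the patch below allocates the new nodelists.
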
> ---
>  mm/slab.c |  157 ++++++++++++++++++++++++++++++++++++++++++++++++------------
>  1 files changed, 125 insertions(+), 32 deletions(-)
>
> diff --git a/mm/slab.c b/mm/slab.c
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -115,6 +115,7 @@
>  #include <linux/reciprocal_div.h>
>  #include <linux/debugobjects.h>
>  #include <linux/kmemcheck.h>
> +#include <linux/memory.h>
>
>  #include <asm/cacheflush.h>
>  #include <asm/tlbflush.h>
> @@ -1102,6 +1103,52 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
>  }
>  #endif
>
> +/*
> + * Allocates and initializes nodelists for a node on each slab cache, used for
> + * either memory or cpu hotplug.  If memory is being hot-added, the kmem_list3
> + * will be allocated off-node since memory is not yet online for the new node.
> + * When hotplugging memory or a cpu, existing nodelists are not replaced if
> + * already in use.
> + *
> + * Must hold cache_chain_mutex.
> + */
> +static int init_cache_nodelists_node(int node)
> +{
> +       struct kmem_cache *cachep;
> +       struct kmem_list3 *l3;
> +       const int memsize = sizeof(struct kmem_list3);
> +
> +       list_for_each_entry(cachep, &cache_chain, next) {
> +               /*
> +                * Set up the size64 kmemlist for cpu before we can
> +                * begin anything. Make sure some other cpu on this
> +                * node has not already allocated this
> +                */
> +               if (!cachep->nodelists[node]) {
> +                       l3 = kmalloc_node(memsize, GFP_KERNEL, node);
> +                       if (!l3)
> +                               return -ENOMEM;
> +                       kmem_list3_init(l3);
> +                       l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
> +                           ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
> +
> +                       /*
> +                        * The l3s don't come and go as CPUs come and
> +                        * go.  cache_chain_mutex is sufficient
> +                        * protection here.
> +                        */
> +                       cachep->nodelists[node] = l3;
> +               }
> +
> +               spin_lock_irq(&cachep->nodelists[node]->list_lock);
> +               cachep->nodelists[node]->free_limit =
> +                       (1 + nr_cpus_node(node)) *
> +                       cachep->batchcount + cachep->num;
> +               spin_unlock_irq(&cachep->nodelists[node]->list_lock);
> +       }
> +       return 0;
> +}
> +
>  static void __cpuinit cpuup_canceled(long cpu)
>  {
>        struct kmem_cache *cachep;
> @@ -1172,7 +1219,7 @@ static int __cpuinit cpuup_prepare(long cpu)
>        struct kmem_cache *cachep;
>        struct kmem_list3 *l3 = NULL;
>        int node = cpu_to_node(cpu);
> -       const int memsize = sizeof(struct kmem_list3);
> +       int err;
>
>        /*
>         * We need to do this right in the beginning since
> @@ -1180,35 +1227,9 @@ static int __cpuinit cpuup_prepare(long cpu)
>         * kmalloc_node allows us to add the slab to the right
>         * kmem_list3 and not this cpu's kmem_list3
>         */
> -
> -       list_for_each_entry(cachep, &cache_chain, next) {
> -               /*
> -                * Set up the size64 kmemlist for cpu before we can
> -                * begin anything. Make sure some other cpu on this
> -                * node has not already allocated this
> -                */
> -               if (!cachep->nodelists[node]) {
> -                       l3 = kmalloc_node(memsize, GFP_KERNEL, node);
> -                       if (!l3)
> -                               goto bad;
> -                       kmem_list3_init(l3);
> -                       l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
> -                           ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
> -
> -                       /*
> -                        * The l3s don't come and go as CPUs come and
> -                        * go.  cache_chain_mutex is sufficient
> -                        * protection here.
> -                        */
> -                       cachep->nodelists[node] = l3;
> -               }
> -
> -               spin_lock_irq(&cachep->nodelists[node]->list_lock);
> -               cachep->nodelists[node]->free_limit =
> -                       (1 + nr_cpus_node(node)) *
> -                       cachep->batchcount + cachep->num;
> -               spin_unlock_irq(&cachep->nodelists[node]->list_lock);
> -       }
> +       err = init_cache_nodelists_node(node);
> +       if (err < 0)
> +               goto bad;
>
>        /*
>         * Now we can go ahead with allocating the shared arrays and
> @@ -1331,11 +1352,75 @@ static struct notifier_block __cpuinitdata cpucache_notifier = {
>        &cpuup_callback, NULL, 0
>  };
>
> +#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
> +/*
> + * Drains freelist for a node on each slab cache, used for memory hot-remove.
> + * Returns -EBUSY if all objects cannot be drained so that the node is not
> + * removed.
> + *
> + * Must hold cache_chain_mutex.
> + */
> +static int __meminit drain_cache_nodelists_node(int node)
> +{
> +       struct kmem_cache *cachep;
> +       int ret = 0;
> +
> +       list_for_each_entry(cachep, &cache_chain, next) {
> +               struct kmem_list3 *l3;
> +
> +               l3 = cachep->nodelists[node];
> +               if (!l3)
> +                       continue;
> +
> +               drain_freelist(cachep, l3, l3->free_objects);
> +
> +               if (!list_empty(&l3->slabs_full) ||
> +                   !list_empty(&l3->slabs_partial)) {
> +                       ret = -EBUSY;
> +                       break;
> +               }
> +       }
> +       return ret;
> +}
> +
> +static int __meminit slab_memory_callback(struct notifier_block *self,
> +                                       unsigned long action, void *arg)
> +{
> +       struct memory_notify *mnb = arg;
> +       int ret = 0;
> +       int nid;
> +
> +       nid = mnb->status_change_nid;
> +       if (nid < 0)
> +               goto out;
> +
> +       switch (action) {
> +       case MEM_GOING_ONLINE:
> +               mutex_lock(&cache_chain_mutex);
> +               ret = init_cache_nodelists_node(nid);
> +               mutex_unlock(&cache_chain_mutex);
> +               break;
> +       case MEM_GOING_OFFLINE:
> +               mutex_lock(&cache_chain_mutex);
> +               ret = drain_cache_nodelists_node(nid);
> +               mutex_unlock(&cache_chain_mutex);
> +               break;
> +       case MEM_ONLINE:
> +       case MEM_OFFLINE:
> +       case MEM_CANCEL_ONLINE:
> +       case MEM_CANCEL_OFFLINE:
> +               break;
> +       }
> +out:
> +       return ret ? notifier_from_errno(ret) : NOTIFY_OK;
> +}
> +#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
> +
>  /*
>  * swap the static kmem_list3 with kmalloced memory
>  */
> -static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
> -                       int nodeid)
> +static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
> +                               int nodeid)
>  {
>        struct kmem_list3 *ptr;
>
> @@ -1580,6 +1665,14 @@ void __init kmem_cache_init_late(void)
>         */
>        register_cpu_notifier(&cpucache_notifier);
>
> +#ifdef CONFIG_NUMA
> +       /*
> +        * Register a memory hotplug callback that initializes and frees
> +        * nodelists.
> +        */
> +       hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
> +#endif
> +
>        /*
>         * The reap timers are started later, with a module init call: That part
>         * of the kernel is not yet operational.
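
For readers who have not used the memory hotplug notifier API before,
slab_memory_callback() above follows the standard pattern. Here is a
stripped-down, illustrative sketch of that pattern; the function names are
made up, only the <linux/memory.h> notifier API itself is real:

#include <linux/init.h>
#include <linux/memory.h>
#include <linux/notifier.h>

static int example_mem_callback(struct notifier_block *self,
				unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	int ret = 0;

	/* Nothing to do unless a whole node is appearing or disappearing. */
	if (mn->status_change_nid < 0)
		return NOTIFY_OK;

	switch (action) {
	case MEM_GOING_ONLINE:
		/* Allocate per-node state here; an error aborts the hot-add. */
		break;
	case MEM_GOING_OFFLINE:
		/* Drain per-node state here; -EBUSY aborts the offline. */
		break;
	default:
		break;
	}
	/* Errors are propagated back to the memory hotplug core. */
	return ret ? notifier_from_errno(ret) : NOTIFY_OK;
}

static int __init example_register(void)
{
	/* Priority 0 is arbitrary; the patch uses SLAB_CALLBACK_PRI. */
	hotplug_memory_notifier(example_mem_callback, 0);
	return 0;
}

The key point, as in the patch, is that returning notifier_from_errno() from
MEM_GOING_ONLINE or MEM_GOING_OFFLINE makes the hotplug core abort the
operation.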