Re: [RFC PATCH] fork: Free per-cpu cached vmalloc'ed thread stacks with

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Uladzislau Rezki <urezki@gmail.com>
To: "Isaac J. Manjarres" <isaacm@codeaurora.org>
Cc: "Isaac J. Manjarres" <isaacm@codeaurora.org>,
	linux-mm.kvack.org@wittgenstein, linux-kernel@vger.kernel.org,
	akpm@linux-foundation.org, mingo@kernel.org,
	peterz@infradead.org, ebiederm@xmission.com, esyr@redhat.com,
	tglx@linutronix.de, christian@kellner.me, areber@redhat.com,
	shakeelb@google.com, cyphar@cyphar.com, psodagud@codeaurora.org,
	pratikp@codeaurora.org
Subject: Re: [RFC PATCH] fork: Free per-cpu cached vmalloc'ed thread stacks with
Date: Mon, 7 Sep 2020 16:21:31 +0200	[thread overview]
Message-ID: <20200907142131.GA16844@pc636> (raw)
In-Reply-To: <20200907084538.qz5g4mikznwxr67g@wittgenstein>

On Mon, Sep 07, 2020 at 10:45:38AM +0200, Christian Brauner wrote:
> On Sat, Sep 05, 2020 at 12:12:29AM +0000, Isaac J. Manjarres wrote:
> > The per-cpu cached vmalloc'ed stacks are currently freed in the
> > CPU hotplug teardown path by the free_vm_stack_cache() callback,
> > which invokes vfree(), which may result in purging the list of
> > lazily freed vmap areas.
> > 
> > Purging all of the lazily freed vmap areas can take a long time
> > when the list of vmap areas is large. This is problematic, as
> > free_vm_stack_cache() is invoked prior to the offline CPU's timers
> > being migrated. This is not desirable as it can lead to timer
> > migration delays in the CPU hotplug teardown path, and timer callbacks
> > will be invoked long after the timer has expired.
> > 
> > For example, on a system that has only one online CPU (CPU 1) that is
> > running a heavy workload, and another CPU that is being offlined,
> > the online CPU will invoke free_vm_stack_cache() to free the cached
> > vmalloc'ed stacks for the CPU being offlined. When there are 2702
> > vmap areas that total to 13498 pages, free_vm_stack_cache() takes
> > over 2 seconds to execute:
> > 
> > [001]   399.335808: cpuhp_enter: cpu: 0005 target:   0 step:  67 (free_vm_stack_cache)
> > 
> > /* The first vmap area to be freed */
> > [001]   399.337157: __purge_vmap_area_lazy: [0:2702] 0xffffffc033da8000 - 0xffffffc033dad000 (5 : 13498)
> > 
> > /* After two seconds */
> > [001]   401.528010: __purge_vmap_area_lazy: [1563:2702] 0xffffffc02fe10000 - 0xffffffc02fe15000 (5 : 5765)
> > 
> > Instead of freeing the per-cpu cached vmalloc'ed stacks synchronously
> > with respect to the CPU hotplug teardown state machine, free them
> > asynchronously to help move along the CPU hotplug teardown state machine
> > quickly.
> > 
> > Signed-off-by: Isaac J. Manjarres <isaacm@codeaurora.org>
> > ---
> 
> This needs reviews and acks from the good folks over at mm/.
> 
> >  kernel/fork.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 4d32190..68346a0 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -202,7 +202,7 @@ static int free_vm_stack_cache(unsigned int cpu)
> >  		if (!vm_stack)
> >  			continue;
> >  
> > -		vfree(vm_stack->addr);
> > +		vfree_atomic(vm_stack->addr);
> >  		cached_vm_stacks[i] = NULL;
> >  	}
> >  
Could you please also test below patch?

<snip>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index be4724b916b3..cef7a9768f01 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -419,6 +419,10 @@ static LLIST_HEAD(vmap_purge_list);
 static struct rb_root vmap_area_root = RB_ROOT;
 static bool vmap_initialized __read_mostly;

+static struct rb_root purge_vmap_area_root = RB_ROOT;
+static LIST_HEAD(purge_vmap_area_list);
+static DEFINE_SPINLOCK(purge_vmap_area_lock);
+
 /*
  * This kmem_cache is used for vmap_area objects. Instead of
  * allocating from slab we reuse an object from this cache to
@@ -743,7 +747,8 @@ insert_vmap_area_augment(struct vmap_area *va,
  */
 static __always_inline struct vmap_area *
 merge_or_add_vmap_area(struct vmap_area *va,
-       struct rb_root *root, struct list_head *head)
+       struct rb_root *root, struct list_head *head,
+       bool update)
 {
        struct vmap_area *sibling;
        struct list_head *next;
@@ -825,7 +830,9 @@ merge_or_add_vmap_area(struct vmap_area *va,
        /*
         * Last step is to check and update the tree.
         */
-       augment_tree_propagate_from(va);
+       if (update)
+               augment_tree_propagate_from(va);
+
        return va;
 }

@@ -1140,7 +1147,7 @@ static void free_vmap_area(struct vmap_area *va)
         * Insert/Merge it back to the free tree/list.
         */
        spin_lock(&free_vmap_area_lock);
-       merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
+       merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list, true);
        spin_unlock(&free_vmap_area_lock);
 }
 
@@ -1328,32 +1335,33 @@ void set_iounmap_nonlazy(void)
 static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 {
        unsigned long resched_threshold;
-       struct llist_node *valist;
-       struct vmap_area *va;
-       struct vmap_area *n_va;
+       struct list_head local_pure_list;
+       struct vmap_area *va, *n_va;
 
        lockdep_assert_held(&vmap_purge_lock);
 
-       valist = llist_del_all(&vmap_purge_list);
-       if (unlikely(valist == NULL))
+       spin_lock(&purge_vmap_area_lock);
+       purge_vmap_area_root = RB_ROOT;
+       list_replace_init(&purge_vmap_area_list, &local_pure_list);
+       spin_unlock(&purge_vmap_area_lock);
+
+       if (unlikely(list_empty(&local_pure_list)))
                return false;

-       /*
-        * TODO: to calculate a flush range without looping.
-        * The list can be up to lazy_max_pages() elements.
-        */
-       llist_for_each_entry(va, valist, purge_list) {
-               if (va->va_start < start)
-                       start = va->va_start;
-               if (va->va_end > end)
-                       end = va->va_end;
-       }
+
+       start = min(start,
+                       list_first_entry(&local_pure_list,
+                               struct vmap_area, list)->va_start);
+
+       end = max(end,
+                       list_last_entry(&local_pure_list,
+                               struct vmap_area, list)->va_end);
 
        flush_tlb_kernel_range(start, end);
        resched_threshold = lazy_max_pages() << 1;
 
        spin_lock(&free_vmap_area_lock);
-       llist_for_each_entry_safe(va, n_va, valist, purge_list) {
+       list_for_each_entry_safe(va, n_va, &local_pure_list, list) {
                unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
                unsigned long orig_start = va->va_start;
                unsigned long orig_end = va->va_end;
@@ -1364,7 +1372,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
                 * anything.
                 */
                va = merge_or_add_vmap_area(va, &free_vmap_area_root,
-                                           &free_vmap_area_list);
+                               &free_vmap_area_list, true);
 
                if (!va)
                        continue;
@@ -1421,9 +1429,19 @@ static void free_vmap_area_noflush(struct vmap_area *va)
        nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
                                PAGE_SHIFT, &vmap_lazy_nr);
 
-       /* After this point, we may free va at any time */
-       llist_add(&va->purge_list, &vmap_purge_list);
+       /*
+        * But this time the node has been removed from the busy
+        * data structures therefore, merge or place it to the purge
+        * tree/list.
+        */
+       spin_lock(&purge_vmap_area_lock);
+       merge_or_add_vmap_area(va,
+               &purge_vmap_area_root, &purge_vmap_area_list, false);
+       spin_unlock(&purge_vmap_area_lock);
 
+       /*
+        * After this point, we may free va at any time.
+        */
        if (unlikely(nr_lazy > lazy_max_pages()))
                try_purge_vmap_area_lazy();
 }
@@ -3352,7 +3370,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                orig_start = vas[area]->va_start;
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
-                                           &free_vmap_area_list);
+                               &free_vmap_area_list, true);
                if (va)
                        kasan_release_vmalloc(orig_start, orig_end,
                                va->va_start, va->va_end);
@@ -3402,7 +3420,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                orig_start = vas[area]->va_start;
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
-                                           &free_vmap_area_list);
+                               &free_vmap_area_list, true);
                if (va)
                        kasan_release_vmalloc(orig_start, orig_end,
                                va->va_start, va->va_end);
<snip>

Thank you!

--
Vlad Rezki

next prev parent reply	other threads:[~2020-09-07 16:32 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <010101745b9b7825-f4d1db4b-83fb-450e-9391-82aad77a1bd6-000000@us-west-2.amazonses.com>
2020-09-07  8:45 ` [RFC PATCH] fork: Free per-cpu cached vmalloc'ed thread stacks with Christian Brauner
2020-09-07 14:21   ` Uladzislau Rezki [this message]
     [not found] <010101745b9b7830-0392d16f-4ee3-4b0e-afc1-51ebb71d0cb3-000000@us-west-2.amazonses.com>
2020-09-14  3:37 ` Andrew Morton
2020-09-05  0:11 Isaac J. Manjarres

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:be4724b916b dfblob:cef7a9768f0 )
 OR (
bs:"Re: [RFC PATCH] fork: Free per-cpu cached vmalloc'ed thread stacks with" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200907142131.GA16844@pc636 \
    --to=urezki@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=areber@redhat.com \
    --cc=christian@kellner.me \
    --cc=cyphar@cyphar.com \
    --cc=ebiederm@xmission.com \
    --cc=esyr@redhat.com \
    --cc=isaacm@codeaurora.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm.kvack.org@wittgenstein \
    --cc=mingo@kernel.org \
    --cc=peterz@infradead.org \
    --cc=pratikp@codeaurora.org \
    --cc=psodagud@codeaurora.org \
    --cc=shakeelb@google.com \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.