xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH v2] xen: free_domheap_pages: delay page scrub to idle loop
@ 2014-05-20  2:15 Bob Liu
  2014-05-20  8:20 ` Jan Beulich
  2014-05-20 13:56 ` Konrad Rzeszutek Wilk
  0 siblings, 2 replies; 5+ messages in thread
From: Bob Liu @ 2014-05-20  2:15 UTC (permalink / raw)
  To: xen-devel; +Cc: keir, ian.campbell, andrew.cooper3, jbeulich

Because of page scrub, it's very slow to destroy a domain with large
memory.
It took around 10 minutes to destroy a guest with nearly 1 TB of memory.

[root@ca-test111 ~]# time xm des 5
real    10m51.582s
user    0m0.115s
sys     0m0.039s
[root@ca-test111 ~]#

There are two ways to improve this situation.
1. Delay the page scrub in free_domheap_pages(), so that 'xl destroy xxx' can
return earlier.

2. But the real scrub time doesn't get improved a lot; we should consider putting
the scrub job on all idle cpus in parallel. An obvious solution is to add pages to
a global list during free_domheap_pages(), and then whenever a cpu enters
idle_loop() it will try to isolate a page and scrub/free it.
Unfortunately this solution didn't work as expected in my testing, because
introducing a global list also means we need a lock to protect that list.
The cost is too heavy!

So I use a percpu scrub page list in this patch; the tradeoff is that we may not
use all idle cpus. It depends on which cpu free_domheap_pages() runs on.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
---
 xen/arch/x86/domain.c   |    1 +
 xen/common/page_alloc.c |   32 +++++++++++++++++++++++++++++++-
 xen/include/xen/mm.h    |    1 +
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 6fddd4c..f3f1260 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -119,6 +119,7 @@ static void idle_loop(void)
         (*pm_idle)();
         do_tasklet();
         do_softirq();
+        scrub_free_pages();
     }
 }
 
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 601319c..b2a0fc5 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -79,6 +79,8 @@ PAGE_LIST_HEAD(page_offlined_list);
 /* Broken page list, protected by heap_lock. */
 PAGE_LIST_HEAD(page_broken_list);
 
+DEFINE_PER_CPU(struct page_list_head , page_scrub_list);
+
 /*************************
  * BOOT-TIME ALLOCATOR
  */
@@ -633,6 +635,9 @@ static struct page_info *alloc_heap_pages(
                     goto found;
         } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
 
+        if ( scrub_free_pages() )
+            continue;
+
         if ( memflags & MEMF_exact_node )
             goto not_found;
 
@@ -1417,6 +1422,23 @@ void free_xenheap_pages(void *v, unsigned int order)
 #endif
 
 
+unsigned long scrub_free_pages(void)
+{
+    struct page_info *pg;
+    unsigned long nr_scrubed = 0;
+
+    /* Scrub around 400M memory every time */
+    while ( nr_scrubed < 100000 )
+    {
+        pg = page_list_remove_head( &this_cpu(page_scrub_list) );
+        if (!pg)
+            break;
+        scrub_one_page(pg);
+        free_heap_pages(pg, 0);
+        nr_scrubed++;
+    }
+    return nr_scrubed;
+}
 
 /*************************
  * DOMAIN-HEAP SUB-ALLOCATOR
@@ -1564,8 +1586,15 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
          * domain has died we assume responsibility for erasure.
          */
         if ( unlikely(d->is_dying) )
+        {
+            /*
+             * Add page to page_scrub_list to speed up domain destroy, those
+             * pages will be zeroed later by scrub_page_tasklet.
+             */
             for ( i = 0; i < (1 << order); i++ )
-                scrub_one_page(&pg[i]);
+                page_list_add_tail( &pg[i], &this_cpu(page_scrub_list) );
+            goto out;
+        }
 
         free_heap_pages(pg, order);
     }
@@ -1583,6 +1612,7 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
         drop_dom_ref = 0;
     }
 
+out:
     if ( drop_dom_ref )
         put_domain(d);
 }
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index b183189..3560335 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -355,6 +355,7 @@ static inline unsigned int get_order_from_pages(unsigned long nr_pages)
 }
 
 void scrub_one_page(struct page_info *);
+unsigned long scrub_free_pages(void);
 
 int xenmem_add_to_physmap_one(struct domain *d, unsigned int space,
                               domid_t foreign_domid,
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [RFC PATCH v2] xen: free_domheap_pages: delay page scrub to idle loop
  2014-05-20  2:15 [RFC PATCH v2] xen: free_domheap_pages: delay page scrub to idle loop Bob Liu
@ 2014-05-20  8:20 ` Jan Beulich
  2014-05-20  8:47   ` Bob Liu
  2014-05-20 13:56 ` Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 5+ messages in thread
From: Jan Beulich @ 2014-05-20  8:20 UTC (permalink / raw)
  To: Bob Liu; +Cc: keir, ian.campbell, andrew.cooper3, xen-devel

>>> On 20.05.14 at 04:15, <lliubbo@gmail.com> wrote:
> So I use a percpu scrub page list in this patch, the tradeoff is we may not 
> use all idle cpus. It depends on free_domheap_pages() runs on which cpu.

So this means that until all memory can be used again, a completely
arbitrary amount of time can pass, as it depends on the freeing CPU
being idle long enough (many minutes according to your observations)
- even if all the rest of the system was idle.

I see the problem with the lock contention, but I think an at least
slightly more sophisticated solution is going to be needed.

> @@ -633,6 +635,9 @@ static struct page_info *alloc_heap_pages(
>                      goto found;
>          } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
>  
> +        if ( scrub_free_pages() )
> +            continue;

This will recover memory only from the current CPU's list - the larger
the system, the less likely that this will turn up anything. Furthermore
you're creating an almost unbounded loop here - for order > 0 the
ability of scrub_pages() to make memory available doesn't mean that
on the next iteration the loop wouldn't come back here.

> @@ -1417,6 +1422,23 @@ void free_xenheap_pages(void *v, unsigned int order)
>  #endif
>  
>  
> +unsigned long scrub_free_pages(void)

The return value and the local variable below could easily be
unsigned int.

> +{
> +    struct page_info *pg;
> +    unsigned long nr_scrubed = 0;
> +
> +    /* Scrub around 400M memory every time */
> +    while ( nr_scrubed < 100000 )

Without explanation such a hard coded number wouldn't be acceptable
in any case. How long does it take to scrub 400Mb on a _slow_ system?
I hope you realize that the amount of work you do here affects the
wakeup time of a vCPU supposed to run on the given CPU.

> @@ -1564,8 +1586,15 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
>           * domain has died we assume responsibility for erasure.
>           */
>          if ( unlikely(d->is_dying) )
> +        {
> +            /*
> +             * Add page to page_scrub_list to speed up domain destroy, those
> +             * pages will be zeroed later by scrub_page_tasklet.
> +             */
>              for ( i = 0; i < (1 << order); i++ )
> -                scrub_one_page(&pg[i]);
> +                page_list_add_tail( &pg[i], &this_cpu(page_scrub_list) );
> +            goto out;
> +        }

If done this way, I see no reason why you couldn't add the page in one
chunk to the list (i.e. even if order > 0), by making use of PFN_ORDER()
to communicate the order to the scrubbing routine.

But having sent a v2 patch without the conceptional questions being
sorted out I consider kind of odd anyway. I.e. before sending another
version I think you need to
- explain that the latency gain here outweighs the performance effects
  on other guests,
- explain why alternative approaches (like the suggested flagging of the
  pages as needing scrubbing during freeing, and doing the scrubbing in
  the background as well as on the allocation path) are worse, or at least
  not better than this one (this model may even allow avoiding the
  scrubbing altogether for hypervisor internal allocations).

Jan

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC PATCH v2] xen: free_domheap_pages: delay page scrub to idle loop
  2014-05-20  8:20 ` Jan Beulich
@ 2014-05-20  8:47   ` Bob Liu
  2014-05-20  9:46     ` Jan Beulich
  0 siblings, 1 reply; 5+ messages in thread
From: Bob Liu @ 2014-05-20  8:47 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Bob Liu, keir, ian.campbell, andrew.cooper3, xen-devel


On 05/20/2014 04:20 PM, Jan Beulich wrote:
>>>> On 20.05.14 at 04:15, <lliubbo@gmail.com> wrote:
>> So I use a percpu scrub page list in this patch, the tradeoff is we may not 
>> use all idle cpus. It depends on free_domheap_pages() runs on which cpu.
> 
> So this means the time until all memory can be used again a completely
> arbitrary amount of time can pass, as it depends on the freeing CPU
> to be idle long enough (many minutes according to your observations)
> - even if all the rest of the system was idle.
> 
> I see the problem with the lock contention, but I think an at least
> slightly more sophisticated solution is going to be needed.
> 
>> @@ -633,6 +635,9 @@ static struct page_info *alloc_heap_pages(
>>                      goto found;
>>          } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
>>  
>> +        if ( scrub_free_pages() )
>> +            continue;
> 
> This will recover memory only from the current CPU's list - the larger
> the system, the less likely that this will turn up anything. Furthermore
> you're creating an almost unbounded loop here - for order > 0 the
> ability of scrub_pages() to make memory available doesn't mean that
> on the next iteration the loop wouldn't come back here.
> 
>> @@ -1417,6 +1422,23 @@ void free_xenheap_pages(void *v, unsigned int order)
>>  #endif
>>  
>>  
>> +unsigned long scrub_free_pages(void)
> 
> The return value and the local variable below could easily be
> unsigned int.
> 
>> +{
>> +    struct page_info *pg;
>> +    unsigned long nr_scrubed = 0;
>> +
>> +    /* Scrub around 400M memory every time */
>> +    while ( nr_scrubed < 100000 )
> 
> Without explanation such a hard coded number wouldn't be acceptable
> in any case. How long does it take to scrub 400Mb on a _slow_ system?
> I hope you realize that the amount of work you do here affects the
> wakeup time of a vCPU supposed to run on the given CPU.
> 
>> @@ -1564,8 +1586,15 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
>>           * domain has died we assume responsibility for erasure.
>>           */
>>          if ( unlikely(d->is_dying) )
>> +        {
>> +            /*
>> +             * Add page to page_scrub_list to speed up domain destroy, those
>> +             * pages will be zeroed later by scrub_page_tasklet.
>> +             */
>>              for ( i = 0; i < (1 << order); i++ )
>> -                scrub_one_page(&pg[i]);
>> +                page_list_add_tail( &pg[i], &this_cpu(page_scrub_list) );
>> +            goto out;
>> +        }
> 
> If done this way, I see no reason why you couldn't add the page in one
> chunk to the list (i.e. even if order > 0), by making use of PFN_ORDER()
> to communicate the order to the scrubbing routine.
> 
> But having sent a v2 patch without the conceptional questions being

Sorry, I also realised this version is immature.

> sorted out I consider kind of odd anyway. I.e. before sending another
> version I think you need to
> - explain that the latency gain here outweighs the performance effects
>   on other guests,
> - explain why alternative approaches (like the suggested flagging of the
>   pages as needing scrubbing during freeing, and doing the scrubbing in

Could you expand a bit more on how to use the idle cpus to do the
scrubbing in the background without impacting other guests?
I don't have a good solution in mind yet.

>   the background as well as on the allocation path) are worse, or at least

Do you mean we can delay the page scrubbing to alloc_heap_pages()?

free_domheap_pages()
{
    if ( unlikely(d->is_dying) )
    {
        //set page flag to need scrubbing, and then free
        free_heap_pages(pg);
    }
}

alloc_heap_pages()
{
    for ( ; ; )
    {
        if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
            goto found;
    }

found:
    if (page tagged with need scrubbing)
        scrub_one_page(pg);
}

Thanks,
-Bob

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC PATCH v2] xen: free_domheap_pages: delay page scrub to idle loop
  2014-05-20  8:47   ` Bob Liu
@ 2014-05-20  9:46     ` Jan Beulich
  0 siblings, 0 replies; 5+ messages in thread
From: Jan Beulich @ 2014-05-20  9:46 UTC (permalink / raw)
  To: Bob Liu; +Cc: Bob Liu, keir, ian.campbell, andrew.cooper3, xen-devel

>>> On 20.05.14 at 10:47, <bob.liu@oracle.com> wrote:
> On 05/20/2014 04:20 PM, Jan Beulich wrote:
>> - explain why alternative approaches (like the suggested flagging of the
>>   pages as needing scrubbing during freeing, and doing the scrubbing in
> 
> Could you expand a bit more on how to use the idle cpus to do the
> scrubbing in background without impact other guests?

The only thing that needs specific consideration is the desire to
not have everyone contend on a global lock. Beyond that it
ought to be mostly mechanical: You'd need a quick way for CPUs
to find pages that need scrubbing (e.g. by adding a second link
field to struct page_info, even if this may need careful field
re-organization), and you need to deal with the race between
idle and allocation based scrubbing.

Ideally we would also take NUMA information into account here,
i.e. prefer scrubbing pages on CPUs close to the memory.

>>   the background as well as on the allocation path) are worse, or at least
> 
> Do you mean we can delay the page scrubbing to alloc_heap_pages()?
> 
> free_domheap_pages()
> {
>     if ( unlikely(d->is_dying) )
>     {
>         //set page flag to need scrubbing, and then free
>         free_heap_pages(pg);
>     }
> }
> 
> alloc_heap_pages()
> {
>     for ( ; ; )
>     {
>         if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
>             goto found;
>     }
> 
> found:
>     if (page tagged with need scrubbing)
>         scrub_one_page(pg);
> }

Yes, along those lines, but of course with taking order > 0 into account.
That may involve deferring the merging of chunks until after scrubbing
was done, or having an efficient way to know from a higher order chunk
which lower order parts of it still need to be scrubbed.

Jan

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC PATCH v2] xen: free_domheap_pages: delay page scrub to idle loop
  2014-05-20  2:15 [RFC PATCH v2] xen: free_domheap_pages: delay page scrub to idle loop Bob Liu
  2014-05-20  8:20 ` Jan Beulich
@ 2014-05-20 13:56 ` Konrad Rzeszutek Wilk
  1 sibling, 0 replies; 5+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-05-20 13:56 UTC (permalink / raw)
  To: Bob Liu; +Cc: keir, ian.campbell, andrew.cooper3, jbeulich, xen-devel

On Tue, May 20, 2014 at 10:15:31AM +0800, Bob Liu wrote:
> Because of page scrub, it's very slow to destroy a domain with large
> memory.
> It took around 10 minutes when destroy a guest of nearly 1 TB of memory.
> 
> [root@ca-test111 ~]# time xm des 5
> real    10m51.582s
> user    0m0.115s
> sys     0m0.039s
> [root@ca-test111 ~]#
> 
> There are two meanings to improve this situation.
> 1. Delay the page scrub in free_domheap_pages(), so that 'xl destroy xxx' can
> return earlier.
> 
> 2. But the real scrub time doesn't get improved a lot, we should consider put the
> scrub job on all idle cpus in parallel. An obvious solution is add page to
> a global list during free_domheap_pages(), and then whenever a cpu enter
> idle_loop() it will try to isolate a page and scrub/free it.
> Unfortunately this solution didn't work as expected in my testing, because
> introduce a global list which also means we need a lock to protect that list.
> The cost is too heavy!

You can introduce a per-cpu list which does not need a global lock.

The problem is with insertion of items in it - that would require an IPI
which would then need to take the global lock and populate the local CPU
list.

Interestingly, this is what I had been working on to convert the
tasklets in per-cpu tasklets.

> 
> So I use a percpu scrub page list in this patch, the tradeoff is we may not use
> all idle cpus. It depends on free_domheap_pages() runs on which cpu.
> 
> Signed-off-by: Bob Liu <bob.liu@oracle.com>
> ---
>  xen/arch/x86/domain.c   |    1 +
>  xen/common/page_alloc.c |   32 +++++++++++++++++++++++++++++++-
>  xen/include/xen/mm.h    |    1 +
>  3 files changed, 33 insertions(+), 1 deletion(-)
> 
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index 6fddd4c..f3f1260 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -119,6 +119,7 @@ static void idle_loop(void)
>          (*pm_idle)();

I would actually do it _right_ before we call the pm_idle. As in
right before we go in C states instead of after we have been woken up -
as that means:
 a) the timer expires, so a guest needs to be woken up - and we do not
    want to take use its timeslice to scrub some other guest memory.
 b). an interrupt that needs to be processed. (do_IRQ gets called,
    does it stuff, setups the right softirq bits and then exits which
    resumes).

>          do_tasklet();
>          do_softirq();
> +        scrub_free_pages();
>      }
>  }
>  
> diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
> index 601319c..b2a0fc5 100644
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -79,6 +79,8 @@ PAGE_LIST_HEAD(page_offlined_list);
>  /* Broken page list, protected by heap_lock. */
>  PAGE_LIST_HEAD(page_broken_list);
>  
> +DEFINE_PER_CPU(struct page_list_head , page_scrub_list);
> +
>  /*************************
>   * BOOT-TIME ALLOCATOR
>   */
> @@ -633,6 +635,9 @@ static struct page_info *alloc_heap_pages(
>                      goto found;
>          } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
>  
> +        if ( scrub_free_pages() )
> +            continue;
> +
>          if ( memflags & MEMF_exact_node )
>              goto not_found;
>  
> @@ -1417,6 +1422,23 @@ void free_xenheap_pages(void *v, unsigned int order)
>  #endif
>  
>  
> +unsigned long scrub_free_pages(void)
> +{
> +    struct page_info *pg;
> +    unsigned long nr_scrubed = 0;
> +
> +    /* Scrub around 400M memory every time */

Could you mention why 400M? 
> +    while ( nr_scrubed < 100000 )
> +    {
> +        pg = page_list_remove_head( &this_cpu(page_scrub_list) );
> +        if (!pg)
> +            break;
> +        scrub_one_page(pg);
> +        free_heap_pages(pg, 0);
> +        nr_scrubed++;
> +    }
> +    return nr_scrubed;
> +}
>  
>  /*************************
>   * DOMAIN-HEAP SUB-ALLOCATOR
> @@ -1564,8 +1586,15 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
>           * domain has died we assume responsibility for erasure.
>           */
>          if ( unlikely(d->is_dying) )
> +        {
> +            /*
> +             * Add page to page_scrub_list to speed up domain destroy, those
> +             * pages will be zeroed later by scrub_page_tasklet.
> +             */
>              for ( i = 0; i < (1 << order); i++ )
> -                scrub_one_page(&pg[i]);
> +                page_list_add_tail( &pg[i], &this_cpu(page_scrub_list) );
> +            goto out;
> +        }
>  
>          free_heap_pages(pg, order);
>      }
> @@ -1583,6 +1612,7 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
>          drop_dom_ref = 0;
>      }
>  
> +out:
>      if ( drop_dom_ref )
>          put_domain(d);
>  }
> diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
> index b183189..3560335 100644
> --- a/xen/include/xen/mm.h
> +++ b/xen/include/xen/mm.h
> @@ -355,6 +355,7 @@ static inline unsigned int get_order_from_pages(unsigned long nr_pages)
>  }
>  
>  void scrub_one_page(struct page_info *);
> +unsigned long scrub_free_pages(void);
>  
>  int xenmem_add_to_physmap_one(struct domain *d, unsigned int space,
>                                domid_t foreign_domid,
> -- 
> 1.7.10.4
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2014-05-20 13:56 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-05-20  2:15 [RFC PATCH v2] xen: free_domheap_pages: delay page scrub to idle loop Bob Liu
2014-05-20  8:20 ` Jan Beulich
2014-05-20  8:47   ` Bob Liu
2014-05-20  9:46     ` Jan Beulich
2014-05-20 13:56 ` Konrad Rzeszutek Wilk

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).