* [PATCH 1/2] xen: introduce a no lock version function of free_heap_pages @ 2014-06-10 12:18 Bob Liu 2014-06-10 12:18 ` [PATCH 2/2] xen: spread page scrubbing across all idle CPU Bob Liu 2014-06-10 12:32 ` [PATCH 1/2] xen: introduce a no lock version function of free_heap_pages Andrew Cooper 0 siblings, 2 replies; 13+ messages in thread From: Bob Liu @ 2014-06-10 12:18 UTC (permalink / raw) To: xen-devel; +Cc: keir, ian.campbell, andrew.cooper3, jbeulich Function free_heap_pages() needs to get the heap_lock every time. This lock may become a problem when there are many CPUs trying to free pages in parallel. This patch introduces a no lock version function __free_heap_pages(), it can be used to free a batch of pages after grabbing the heap_lock. Signed-off-by: Bob Liu <bob.liu@oracle.com> --- xen/common/page_alloc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c index 601319c..56826b4 100644 --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -808,8 +808,8 @@ static int reserve_offlined_page(struct page_info *head) return count; } -/* Free 2^@order set of pages. */ -static void free_heap_pages( +/* No lock version, the caller must hold heap_lock */ +static void __free_heap_pages( struct page_info *pg, unsigned int order) { unsigned long mask, mfn = page_to_mfn(pg); @@ -819,8 +819,6 @@ static void free_heap_pages( ASSERT(order <= MAX_ORDER); ASSERT(node >= 0); - spin_lock(&heap_lock); - for ( i = 0; i < (1 << order); i++ ) { /* @@ -894,7 +892,14 @@ static void free_heap_pages( if ( tainted ) reserve_offlined_page(pg); +} +/* Free 2^@order set of pages. */ +static void free_heap_pages( + struct page_info *pg, unsigned int order) +{ + spin_lock(&heap_lock); + __free_heap_pages(pg, order); spin_unlock(&heap_lock); } -- 1.7.10.4 ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 2/2] xen: spread page scrubbing across all idle CPU 2014-06-10 12:18 [PATCH 1/2] xen: introduce a no lock version function of free_heap_pages Bob Liu @ 2014-06-10 12:18 ` Bob Liu 2014-06-10 12:42 ` Andrew Cooper 2014-06-10 14:12 ` Jan Beulich 2014-06-10 12:32 ` [PATCH 1/2] xen: introduce a no lock version function of free_heap_pages Andrew Cooper 1 sibling, 2 replies; 13+ messages in thread From: Bob Liu @ 2014-06-10 12:18 UTC (permalink / raw) To: xen-devel; +Cc: keir, ian.campbell, andrew.cooper3, jbeulich Because of page scrubbing, it's very slow to destroy a domain with large memory. It takes around 10 minutes when destroy a guest of nearly 1 TB of memory. Besides the CPU running 'xl destory', this patch try to accelerate the scrubbing time by using all idle CPUs to do the scrubbing work in parallel. Compared with other approaches, this way has a minimum effect to the whole system. The disadvantage is if all CPUs are not idle, the scrubbing time can't be reduced. On a 60 CPUs system, the 'xl destroy' time of a 30G guest reduced from 15s to 8s. Signed-off-by: Bob Liu <bob.liu@oracle.com> --- xen/arch/x86/domain.c | 1 + xen/common/domain.c | 4 +++ xen/common/page_alloc.c | 73 +++++++++++++++++++++++++++++++++++++++++++++-- xen/include/xen/mm.h | 2 ++ 4 files changed, 78 insertions(+), 2 deletions(-) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 6fddd4c..ffa9b91 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -116,6 +116,7 @@ static void idle_loop(void) { if ( cpu_is_offline(smp_processor_id()) ) play_dead(); + scrub_free_pages(0); (*pm_idle)(); do_tasklet(); do_softirq(); diff --git a/xen/common/domain.c b/xen/common/domain.c index 4291e29..7a8c6bf 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -593,6 +593,10 @@ int domain_kill(struct domain *d) BUG_ON(rc != -EAGAIN); break; } + + tasklet_init(&global_scrub_tasklet, scrub_free_pages, 1); + tasklet_schedule_on_cpu(&global_scrub_tasklet, smp_processor_id()); + for_each_vcpu ( d, v ) unmap_vcpu_info(v); d->is_dying = DOMDYING_dead; diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c index 56826b4..d51926d 100644 --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -79,6 +79,14 @@ PAGE_LIST_HEAD(page_offlined_list); /* Broken page list, protected by heap_lock. */ PAGE_LIST_HEAD(page_broken_list); +/* Global scrub page list, protected by scrub_lock */ +PAGE_LIST_HEAD(global_scrub_list); +DEFINE_SPINLOCK(scrub_lock); +struct tasklet global_scrub_tasklet; + +DEFINE_PER_CPU(struct page_list_head, scrub_list); +DEFINE_PER_CPU(struct page_list_head, scrubbed_list); + /************************* * BOOT-TIME ALLOCATOR */ @@ -1569,9 +1577,13 @@ void free_domheap_pages(struct page_info *pg, unsigned int order) * domain has died we assume responsibility for erasure. */ if ( unlikely(d->is_dying) ) + { + spin_lock(&scrub_lock); for ( i = 0; i < (1 << order); i++ ) - scrub_one_page(&pg[i]); - + page_list_add_tail(&pg[i], &global_scrub_list); + spin_unlock(&scrub_lock); + goto out; + } free_heap_pages(pg, order); } else if ( unlikely(d == dom_cow) ) @@ -1588,6 +1600,7 @@ void free_domheap_pages(struct page_info *pg, unsigned int order) drop_dom_ref = 0; } +out: if ( drop_dom_ref ) put_domain(d); } @@ -1680,6 +1693,62 @@ void scrub_one_page(struct page_info *pg) unmap_domain_page(p); } +#define SCRUB_BATCH 1024 +void scrub_free_pages(unsigned long is_tasklet) +{ + struct page_info *pg; + unsigned int i, cpu, empty = 0; + + cpu = smp_processor_id(); + do + { + if ( page_list_empty(&this_cpu(scrub_list)) ) + { + /* Delist a batch of pages from global scrub list */ + spin_lock(&scrub_lock); + for( i = 0; i < SCRUB_BATCH; i++ ) + { + pg = page_list_remove_head(&global_scrub_list); + if ( !pg ) + { + empty = 1; + break; + } + page_list_add_tail(pg, &this_cpu(scrub_list)); + } + spin_unlock(&scrub_lock); + } + + /* Scrub percpu list */ + while ( !page_list_empty(&this_cpu(scrub_list)) ) + { + pg = page_list_remove_head(&this_cpu(scrub_list)); + ASSERT(pg); + scrub_one_page(pg); + page_list_add_tail(pg, &this_cpu(scrubbed_list)); + } + + /* Free percpu scrubbed list */ + if ( !page_list_empty(&this_cpu(scrubbed_list)) ) + { + spin_lock(&heap_lock); + while ( !page_list_empty(&this_cpu(scrubbed_list)) ) + { + pg = page_list_remove_head(&this_cpu(scrubbed_list)); + __free_heap_pages(pg, 0); + } + spin_unlock(&heap_lock); + } + + /* Global list is empty */ + if (empty) + return; + } while ( !softirq_pending(cpu) ); + + if( is_tasklet ) + tasklet_schedule_on_cpu(&global_scrub_tasklet, cpu); +} + static void dump_heap(unsigned char key) { s_time_t now = NOW(); diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h index b183189..29821c4 100644 --- a/xen/include/xen/mm.h +++ b/xen/include/xen/mm.h @@ -78,6 +78,8 @@ int query_page_offline(unsigned long mfn, uint32_t *status); unsigned long total_free_pages(void); void scrub_heap_pages(void); +void scrub_free_pages(unsigned long is_tasklet); +extern struct tasklet global_scrub_tasklet; int assign_pages( struct domain *d, -- 1.7.10.4 ^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH 2/2] xen: spread page scrubbing across all idle CPU 2014-06-10 12:18 ` [PATCH 2/2] xen: spread page scrubbing across all idle CPU Bob Liu @ 2014-06-10 12:42 ` Andrew Cooper 2014-06-10 14:12 ` Jan Beulich 1 sibling, 0 replies; 13+ messages in thread From: Andrew Cooper @ 2014-06-10 12:42 UTC (permalink / raw) To: Bob Liu; +Cc: keir, ian.campbell, jbeulich, xen-devel On 10/06/14 13:18, Bob Liu wrote: > Because of page scrubbing, it's very slow to destroy a domain with large memory. > It takes around 10 minutes when destroy a guest of nearly 1 TB of memory. > > Besides the CPU running 'xl destory', this patch try to accelerate the scrubbing destroy > time by using all idle CPUs to do the scrubbing work in parallel. > > Compared with other approaches, this way has a minimum effect to the whole > system. The disadvantage is if all CPUs are not idle, the scrubbing time can't > be reduced. > > On a 60 CPUs system, the 'xl destroy' time of a 30G guest reduced from 15s to > 8s. > > Signed-off-by: Bob Liu <bob.liu@oracle.com> > --- > xen/arch/x86/domain.c | 1 + > xen/common/domain.c | 4 +++ > xen/common/page_alloc.c | 73 +++++++++++++++++++++++++++++++++++++++++++++-- > xen/include/xen/mm.h | 2 ++ > 4 files changed, 78 insertions(+), 2 deletions(-) > > diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c > index 6fddd4c..ffa9b91 100644 > --- a/xen/arch/x86/domain.c > +++ b/xen/arch/x86/domain.c > @@ -116,6 +116,7 @@ static void idle_loop(void) > { > if ( cpu_is_offline(smp_processor_id()) ) > play_dead(); > + scrub_free_pages(0); > (*pm_idle)(); > do_tasklet(); > do_softirq(); > diff --git a/xen/common/domain.c b/xen/common/domain.c > index 4291e29..7a8c6bf 100644 > --- a/xen/common/domain.c > +++ b/xen/common/domain.c > @@ -593,6 +593,10 @@ int domain_kill(struct domain *d) > BUG_ON(rc != -EAGAIN); > break; > } > + > + tasklet_init(&global_scrub_tasklet, scrub_free_pages, 1); > + tasklet_schedule_on_cpu(&global_scrub_tasklet, smp_processor_id()); > + Doesn't this result in re-init()ing the global scrub tasklet if domain_kill() is called on two domains at the same time? > for_each_vcpu ( d, v ) > unmap_vcpu_info(v); > d->is_dying = DOMDYING_dead; > diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c > index 56826b4..d51926d 100644 > --- a/xen/common/page_alloc.c > +++ b/xen/common/page_alloc.c > @@ -79,6 +79,14 @@ PAGE_LIST_HEAD(page_offlined_list); > /* Broken page list, protected by heap_lock. */ > PAGE_LIST_HEAD(page_broken_list); > > +/* Global scrub page list, protected by scrub_lock */ > +PAGE_LIST_HEAD(global_scrub_list); > +DEFINE_SPINLOCK(scrub_lock); > +struct tasklet global_scrub_tasklet; > + > +DEFINE_PER_CPU(struct page_list_head, scrub_list); > +DEFINE_PER_CPU(struct page_list_head, scrubbed_list); > + > /************************* > * BOOT-TIME ALLOCATOR > */ > @@ -1569,9 +1577,13 @@ void free_domheap_pages(struct page_info *pg, unsigned int order) > * domain has died we assume responsibility for erasure. > */ > if ( unlikely(d->is_dying) ) > + { > + spin_lock(&scrub_lock); > for ( i = 0; i < (1 << order); i++ ) > - scrub_one_page(&pg[i]); > - > + page_list_add_tail(&pg[i], &global_scrub_list); > + spin_unlock(&scrub_lock); > + goto out; > + } > free_heap_pages(pg, order); > } > else if ( unlikely(d == dom_cow) ) > @@ -1588,6 +1600,7 @@ void free_domheap_pages(struct page_info *pg, unsigned int order) > drop_dom_ref = 0; > } > > +out: > if ( drop_dom_ref ) > put_domain(d); > } > @@ -1680,6 +1693,62 @@ void scrub_one_page(struct page_info *pg) > unmap_domain_page(p); > } > > +#define SCRUB_BATCH 1024 > +void scrub_free_pages(unsigned long is_tasklet) > +{ > + struct page_info *pg; > + unsigned int i, cpu, empty = 0; bool_t empty. cpu can be initialised when declared. > + > + cpu = smp_processor_id(); > + do > + { > + if ( page_list_empty(&this_cpu(scrub_list)) ) Repeatedly using this_cpu() is bad, due to the hidden RELOC_HIDE. Instead, use something like this: struct *page_list_head this_scrub_list = &this_cpu(scrub_list) > + { > + /* Delist a batch of pages from global scrub list */ > + spin_lock(&scrub_lock); > + for( i = 0; i < SCRUB_BATCH; i++ ) > + { > + pg = page_list_remove_head(&global_scrub_list); > + if ( !pg ) > + { > + empty = 1; > + break; > + } > + page_list_add_tail(pg, &this_cpu(scrub_list)); > + } > + spin_unlock(&scrub_lock); > + } > + > + /* Scrub percpu list */ > + while ( !page_list_empty(&this_cpu(scrub_list)) ) > + { > + pg = page_list_remove_head(&this_cpu(scrub_list)); > + ASSERT(pg); > + scrub_one_page(pg); > + page_list_add_tail(pg, &this_cpu(scrubbed_list)); > + } > + > + /* Free percpu scrubbed list */ > + if ( !page_list_empty(&this_cpu(scrubbed_list)) ) > + { > + spin_lock(&heap_lock); > + while ( !page_list_empty(&this_cpu(scrubbed_list)) ) > + { > + pg = page_list_remove_head(&this_cpu(scrubbed_list)); > + __free_heap_pages(pg, 0); > + } > + spin_unlock(&heap_lock); > + } > + > + /* Global list is empty */ > + if (empty) Spaces inside brackets. ~Andrew > + return; > + } while ( !softirq_pending(cpu) ); > + > + if( is_tasklet ) > + tasklet_schedule_on_cpu(&global_scrub_tasklet, cpu); > +} > + > static void dump_heap(unsigned char key) > { > s_time_t now = NOW(); > diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h > index b183189..29821c4 100644 > --- a/xen/include/xen/mm.h > +++ b/xen/include/xen/mm.h > @@ -78,6 +78,8 @@ int query_page_offline(unsigned long mfn, uint32_t *status); > unsigned long total_free_pages(void); > > void scrub_heap_pages(void); > +void scrub_free_pages(unsigned long is_tasklet); > +extern struct tasklet global_scrub_tasklet; > > int assign_pages( > struct domain *d, ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 2/2] xen: spread page scrubbing across all idle CPU 2014-06-10 12:18 ` [PATCH 2/2] xen: spread page scrubbing across all idle CPU Bob Liu 2014-06-10 12:42 ` Andrew Cooper @ 2014-06-10 14:12 ` Jan Beulich 2014-06-11 2:51 ` Bob Liu 1 sibling, 1 reply; 13+ messages in thread From: Jan Beulich @ 2014-06-10 14:12 UTC (permalink / raw) To: Bob Liu; +Cc: keir, ian.campbell, andrew.cooper3, xen-devel >>> On 10.06.14 at 14:18, <lliubbo@gmail.com> wrote: > --- a/xen/common/domain.c > +++ b/xen/common/domain.c > @@ -593,6 +593,10 @@ int domain_kill(struct domain *d) > BUG_ON(rc != -EAGAIN); > break; > } > + > + tasklet_init(&global_scrub_tasklet, scrub_free_pages, 1); > + tasklet_schedule_on_cpu(&global_scrub_tasklet, smp_processor_id()); Apart from this being a single global that you can't validly re-use this way (as already pointed out by Andrew), it also remains unclear why you want this scheduled on the current CPU rather than just anywhere. > --- a/xen/common/page_alloc.c > +++ b/xen/common/page_alloc.c > @@ -79,6 +79,14 @@ PAGE_LIST_HEAD(page_offlined_list); > /* Broken page list, protected by heap_lock. */ > PAGE_LIST_HEAD(page_broken_list); > > +/* Global scrub page list, protected by scrub_lock */ > +PAGE_LIST_HEAD(global_scrub_list); > +DEFINE_SPINLOCK(scrub_lock); > +struct tasklet global_scrub_tasklet; > + > +DEFINE_PER_CPU(struct page_list_head, scrub_list); > +DEFINE_PER_CPU(struct page_list_head, scrubbed_list); All static I suppose? > @@ -1680,6 +1693,62 @@ void scrub_one_page(struct page_info *pg) > unmap_domain_page(p); > } > > +#define SCRUB_BATCH 1024 > +void scrub_free_pages(unsigned long is_tasklet) > +{ > + struct page_info *pg; > + unsigned int i, cpu, empty = 0; > + > + cpu = smp_processor_id(); > + do > + { > + if ( page_list_empty(&this_cpu(scrub_list)) ) > + { > + /* Delist a batch of pages from global scrub list */ > + spin_lock(&scrub_lock); > + for( i = 0; i < SCRUB_BATCH; i++ ) > + { > + pg = page_list_remove_head(&global_scrub_list); > + if ( !pg ) > + { > + empty = 1; > + break; > + } > + page_list_add_tail(pg, &this_cpu(scrub_list)); > + } > + spin_unlock(&scrub_lock); > + } > + > + /* Scrub percpu list */ > + while ( !page_list_empty(&this_cpu(scrub_list)) ) > + { > + pg = page_list_remove_head(&this_cpu(scrub_list)); > + ASSERT(pg); > + scrub_one_page(pg); > + page_list_add_tail(pg, &this_cpu(scrubbed_list)); > + } So you scrub up to 1k pages at a time here - how long does that take? Did you take into consideration how much higher a wakeup latency this may cause when the invocation comes from the idle vCPU? > + /* Free percpu scrubbed list */ > + if ( !page_list_empty(&this_cpu(scrubbed_list)) ) > + { > + spin_lock(&heap_lock); > + while ( !page_list_empty(&this_cpu(scrubbed_list)) ) > + { > + pg = page_list_remove_head(&this_cpu(scrubbed_list)); > + __free_heap_pages(pg, 0); > + } > + spin_unlock(&heap_lock); > + } > + > + /* Global list is empty */ > + if (empty) > + return; > + } while ( !softirq_pending(cpu) ); > + > + if( is_tasklet ) > + tasklet_schedule_on_cpu(&global_scrub_tasklet, cpu); So you re-schedule this tasklet immediately - while this may be acceptable inside the hypervisor, did you consider the effect this will have on the guest (normally Dom0)? Its respective vCPU won't get run _at all_ until you're done scrubbing. In the end, this being an improvement by about 50% according to the numbers you gave, I'm not convinced the downsides of this are worth the gain. Jan ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 2/2] xen: spread page scrubbing across all idle CPU 2014-06-10 14:12 ` Jan Beulich @ 2014-06-11 2:51 ` Bob Liu 2014-06-11 8:13 ` Jan Beulich 0 siblings, 1 reply; 13+ messages in thread From: Bob Liu @ 2014-06-11 2:51 UTC (permalink / raw) To: Jan Beulich; +Cc: Bob Liu, keir, ian.campbell, andrew.cooper3, xen-devel On 06/10/2014 10:12 PM, Jan Beulich wrote: >>>> On 10.06.14 at 14:18, <lliubbo@gmail.com> wrote: >> --- a/xen/common/domain.c >> +++ b/xen/common/domain.c >> @@ -593,6 +593,10 @@ int domain_kill(struct domain *d) >> BUG_ON(rc != -EAGAIN); >> break; >> } >> + >> + tasklet_init(&global_scrub_tasklet, scrub_free_pages, 1); >> + tasklet_schedule_on_cpu(&global_scrub_tasklet, smp_processor_id()); > > Apart from this being a single global that you can't validly re-use > this way (as already pointed out by Andrew), it also remains > unclear why you want this scheduled on the current CPU rather > than just anywhere. > The reason is the same as you noticed later, if also schedule this tasklet on other CPUs then their respective vCPU won't get run at all until the scrubbing is done. So I use idle_loop() instead of tasklet on other CPUs. >> --- a/xen/common/page_alloc.c >> +++ b/xen/common/page_alloc.c >> @@ -79,6 +79,14 @@ PAGE_LIST_HEAD(page_offlined_list); >> /* Broken page list, protected by heap_lock. */ >> PAGE_LIST_HEAD(page_broken_list); >> >> +/* Global scrub page list, protected by scrub_lock */ >> +PAGE_LIST_HEAD(global_scrub_list); >> +DEFINE_SPINLOCK(scrub_lock); >> +struct tasklet global_scrub_tasklet; >> + >> +DEFINE_PER_CPU(struct page_list_head, scrub_list); >> +DEFINE_PER_CPU(struct page_list_head, scrubbed_list); > > All static I suppose? > >> @@ -1680,6 +1693,62 @@ void scrub_one_page(struct page_info *pg) >> unmap_domain_page(p); >> } >> >> +#define SCRUB_BATCH 1024 >> +void scrub_free_pages(unsigned long is_tasklet) >> +{ >> + struct page_info *pg; >> + unsigned int i, cpu, empty = 0; >> + >> + cpu = smp_processor_id(); >> + do >> + { >> + if ( page_list_empty(&this_cpu(scrub_list)) ) >> + { >> + /* Delist a batch of pages from global scrub list */ >> + spin_lock(&scrub_lock); >> + for( i = 0; i < SCRUB_BATCH; i++ ) >> + { >> + pg = page_list_remove_head(&global_scrub_list); >> + if ( !pg ) >> + { >> + empty = 1; >> + break; >> + } >> + page_list_add_tail(pg, &this_cpu(scrub_list)); >> + } >> + spin_unlock(&scrub_lock); >> + } >> + >> + /* Scrub percpu list */ >> + while ( !page_list_empty(&this_cpu(scrub_list)) ) >> + { >> + pg = page_list_remove_head(&this_cpu(scrub_list)); >> + ASSERT(pg); >> + scrub_one_page(pg); >> + page_list_add_tail(pg, &this_cpu(scrubbed_list)); >> + } > > So you scrub up to 1k pages at a time here - how long does that take? > Did you take into consideration how much higher a wakeup latency this > may cause when the invocation comes from the idle vCPU? > An extra softirq_pending(cpu) check can be added in this while loop, I think it's no more a problem if scrubbing only 1 page every time. >> + /* Free percpu scrubbed list */ >> + if ( !page_list_empty(&this_cpu(scrubbed_list)) ) >> + { >> + spin_lock(&heap_lock); >> + while ( !page_list_empty(&this_cpu(scrubbed_list)) ) >> + { >> + pg = page_list_remove_head(&this_cpu(scrubbed_list)); >> + __free_heap_pages(pg, 0); >> + } >> + spin_unlock(&heap_lock); >> + } >> + >> + /* Global list is empty */ >> + if (empty) >> + return; >> + } while ( !softirq_pending(cpu) ); >> + >> + if( is_tasklet ) >> + tasklet_schedule_on_cpu(&global_scrub_tasklet, cpu); > > So you re-schedule this tasklet immediately - while this may be > acceptable inside the hypervisor, did you consider the effect this > will have on the guest (normally Dom0)? Its respective vCPU won't > get run _at all_ until you're done scrubbing. > Yes, that's a problem. I don't have any better idea right now. What I'm trying is doing the scrubbing on current CPU as well as on all idle vcpus in parallel. I also considered your suggestion about doing the scrubbing in the background as well as on the allocation path. But I think it's more unacceptable for users to get blocked randomly for a uncertain time when allocating a large mount of memory. That's why I still chose the sync way that once 'xl destroy' return all memory are scrubbed. > In the end, this being an improvement by about 50% according to > the numbers you gave, I'm not convinced the downsides of this are > worth the gain. > > Jan > -- Regards, -Bob ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 2/2] xen: spread page scrubbing across all idle CPU 2014-06-11 2:51 ` Bob Liu @ 2014-06-11 8:13 ` Jan Beulich 2014-06-11 10:13 ` George Dunlap 0 siblings, 1 reply; 13+ messages in thread From: Jan Beulich @ 2014-06-11 8:13 UTC (permalink / raw) To: Bob Liu; +Cc: Bob Liu, keir, ian.campbell, andrew.cooper3, xen-devel >>> On 11.06.14 at 04:51, <bob.liu@oracle.com> wrote: > On 06/10/2014 10:12 PM, Jan Beulich wrote: >>>>> On 10.06.14 at 14:18, <lliubbo@gmail.com> wrote: >>> + if( is_tasklet ) >>> + tasklet_schedule_on_cpu(&global_scrub_tasklet, cpu); >> >> So you re-schedule this tasklet immediately - while this may be >> acceptable inside the hypervisor, did you consider the effect this >> will have on the guest (normally Dom0)? Its respective vCPU won't >> get run _at all_ until you're done scrubbing. >> > > Yes, that's a problem. I don't have any better idea right now. > > What I'm trying is doing the scrubbing on current CPU as well as on all > idle vcpus in parallel. > I also considered your suggestion about doing the scrubbing in the > background as well as on the allocation path. But I think it's more > unacceptable for users to get blocked randomly for a uncertain time when > allocating a large mount of memory. > That's why I still chose the sync way that once 'xl destroy' return all > memory are scrubbed. But I hope you realize that in the current shape, with the shortcomings pointed out un-addressed, there's no way for this to go in. Jan ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 2/2] xen: spread page scrubbing across all idle CPU 2014-06-11 8:13 ` Jan Beulich @ 2014-06-11 10:13 ` George Dunlap 2014-06-11 10:22 ` Jan Beulich 2014-06-11 10:36 ` Bob Liu 0 siblings, 2 replies; 13+ messages in thread From: George Dunlap @ 2014-06-11 10:13 UTC (permalink / raw) To: Jan Beulich; +Cc: Bob Liu, Keir Fraser, Ian Campbell, Andrew Cooper, xen-devel On Wed, Jun 11, 2014 at 9:13 AM, Jan Beulich <JBeulich@suse.com> wrote: >>>> On 11.06.14 at 04:51, <bob.liu@oracle.com> wrote: >> On 06/10/2014 10:12 PM, Jan Beulich wrote: >>>>>> On 10.06.14 at 14:18, <lliubbo@gmail.com> wrote: >>>> + if( is_tasklet ) >>>> + tasklet_schedule_on_cpu(&global_scrub_tasklet, cpu); >>> >>> So you re-schedule this tasklet immediately - while this may be >>> acceptable inside the hypervisor, did you consider the effect this >>> will have on the guest (normally Dom0)? Its respective vCPU won't >>> get run _at all_ until you're done scrubbing. >>> >> >> Yes, that's a problem. I don't have any better idea right now. >> >> What I'm trying is doing the scrubbing on current CPU as well as on all >> idle vcpus in parallel. >> I also considered your suggestion about doing the scrubbing in the >> background as well as on the allocation path. But I think it's more >> unacceptable for users to get blocked randomly for a uncertain time when >> allocating a large mount of memory. >> That's why I still chose the sync way that once 'xl destroy' return all >> memory are scrubbed. > > But I hope you realize that in the current shape, with the shortcomings > pointed out un-addressed, there's no way for this to go in. Would it make more sense to do something like the following: * Have a "clean" freelist and a "dirty" freelist * When destroying a domain, simply move pages to the dirty freelist * Have idle vcpus scrub the dirty freelist before going to sleep - ...and wake up idle vcpus to do some scrubbing when adding pages to the dirty freelist * In alloc_domheap_pages(): - If there are pages on the "clean" freelist, allocate them - If there are no pages on the "clean" freelist but there are on the "dirty" freelist, scrub pages from the "dirty" freelist synchronously. -George ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 2/2] xen: spread page scrubbing across all idle CPU 2014-06-11 10:13 ` George Dunlap @ 2014-06-11 10:22 ` Jan Beulich 2014-06-11 10:36 ` Bob Liu 1 sibling, 0 replies; 13+ messages in thread From: Jan Beulich @ 2014-06-11 10:22 UTC (permalink / raw) To: George Dunlap Cc: Bob Liu, Keir Fraser, Ian Campbell, Andrew Cooper, xen-devel >>> On 11.06.14 at 12:13, <George.Dunlap@eu.citrix.com> wrote: > Would it make more sense to do something like the following: > * Have a "clean" freelist and a "dirty" freelist > * When destroying a domain, simply move pages to the dirty freelist > * Have idle vcpus scrub the dirty freelist before going to sleep > - ...and wake up idle vcpus to do some scrubbing when adding pages to > the dirty freelist > * In alloc_domheap_pages(): > - If there are pages on the "clean" freelist, allocate them > - If there are no pages on the "clean" freelist but there are on the > "dirty" freelist, scrub pages from the "dirty" freelist synchronously. Yes, that's the model I suggested weeks ago. Jan ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 2/2] xen: spread page scrubbing across all idle CPU 2014-06-11 10:13 ` George Dunlap 2014-06-11 10:22 ` Jan Beulich @ 2014-06-11 10:36 ` Bob Liu 2014-06-11 10:45 ` George Dunlap 2014-06-11 10:48 ` Andrew Cooper 1 sibling, 2 replies; 13+ messages in thread From: Bob Liu @ 2014-06-11 10:36 UTC (permalink / raw) To: George Dunlap Cc: Bob Liu, Keir Fraser, Ian Campbell, Andrew Cooper, Jan Beulich, xen-devel On 06/11/2014 06:13 PM, George Dunlap wrote: > On Wed, Jun 11, 2014 at 9:13 AM, Jan Beulich <JBeulich@suse.com> wrote: >>>>> On 11.06.14 at 04:51, <bob.liu@oracle.com> wrote: >>> On 06/10/2014 10:12 PM, Jan Beulich wrote: >>>>>>> On 10.06.14 at 14:18, <lliubbo@gmail.com> wrote: >>>>> + if( is_tasklet ) >>>>> + tasklet_schedule_on_cpu(&global_scrub_tasklet, cpu); >>>> >>>> So you re-schedule this tasklet immediately - while this may be >>>> acceptable inside the hypervisor, did you consider the effect this >>>> will have on the guest (normally Dom0)? Its respective vCPU won't >>>> get run _at all_ until you're done scrubbing. >>>> >>> >>> Yes, that's a problem. I don't have any better idea right now. >>> >>> What I'm trying is doing the scrubbing on current CPU as well as on all >>> idle vcpus in parallel. >>> I also considered your suggestion about doing the scrubbing in the >>> background as well as on the allocation path. But I think it's more >>> unacceptable for users to get blocked randomly for a uncertain time when >>> allocating a large mount of memory. >>> That's why I still chose the sync way that once 'xl destroy' return all >>> memory are scrubbed. >> >> But I hope you realize that in the current shape, with the shortcomings >> pointed out un-addressed, there's no way for this to go in. > > Would it make more sense to do something like the following: > * Have a "clean" freelist and a "dirty" freelist > * When destroying a domain, simply move pages to the dirty freelist > * Have idle vcpus scrub the dirty freelist before going to sleep > - ...and wake up idle vcpus to do some scrubbing when adding pages to > the dirty freelist > * In alloc_domheap_pages(): > - If there are pages on the "clean" freelist, allocate them > - If there are no pages on the "clean" freelist but there are on the > "dirty" freelist, scrub pages from the "dirty" freelist synchronously. > Thank you very much for your suggestion, it's similar as Jan suggested. My concern of this approach is in some bad situation the allocation path may be blocked for a long time waiting for scrubbing "dirty" freelist. What the users see is it's much faster to destroy a domain but may slower to create an new one(or slow down other routines need to alloc large memory). If you and Jan think it's fine I can make a patch towards this way. -- Regards, -Bob ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 2/2] xen: spread page scrubbing across all idle CPU 2014-06-11 10:36 ` Bob Liu @ 2014-06-11 10:45 ` George Dunlap 2014-06-11 11:17 ` Bob Liu 2014-06-11 10:48 ` Andrew Cooper 1 sibling, 1 reply; 13+ messages in thread From: George Dunlap @ 2014-06-11 10:45 UTC (permalink / raw) To: Bob Liu Cc: Bob Liu, Keir Fraser, Ian Campbell, Andrew Cooper, Jan Beulich, xen-devel On Wed, Jun 11, 2014 at 11:36 AM, Bob Liu <bob.liu@oracle.com> wrote: > > On 06/11/2014 06:13 PM, George Dunlap wrote: >> On Wed, Jun 11, 2014 at 9:13 AM, Jan Beulich <JBeulich@suse.com> wrote: >>>>>> On 11.06.14 at 04:51, <bob.liu@oracle.com> wrote: >>>> On 06/10/2014 10:12 PM, Jan Beulich wrote: >>>>>>>> On 10.06.14 at 14:18, <lliubbo@gmail.com> wrote: >>>>>> + if( is_tasklet ) >>>>>> + tasklet_schedule_on_cpu(&global_scrub_tasklet, cpu); >>>>> >>>>> So you re-schedule this tasklet immediately - while this may be >>>>> acceptable inside the hypervisor, did you consider the effect this >>>>> will have on the guest (normally Dom0)? Its respective vCPU won't >>>>> get run _at all_ until you're done scrubbing. >>>>> >>>> >>>> Yes, that's a problem. I don't have any better idea right now. >>>> >>>> What I'm trying is doing the scrubbing on current CPU as well as on all >>>> idle vcpus in parallel. >>>> I also considered your suggestion about doing the scrubbing in the >>>> background as well as on the allocation path. But I think it's more >>>> unacceptable for users to get blocked randomly for a uncertain time when >>>> allocating a large mount of memory. >>>> That's why I still chose the sync way that once 'xl destroy' return all >>>> memory are scrubbed. >>> >>> But I hope you realize that in the current shape, with the shortcomings >>> pointed out un-addressed, there's no way for this to go in. >> >> Would it make more sense to do something like the following: >> * Have a "clean" freelist and a "dirty" freelist >> * When destroying a domain, simply move pages to the dirty freelist >> * Have idle vcpus scrub the dirty freelist before going to sleep >> - ...and wake up idle vcpus to do some scrubbing when adding pages to >> the dirty freelist >> * In alloc_domheap_pages(): >> - If there are pages on the "clean" freelist, allocate them >> - If there are no pages on the "clean" freelist but there are on the >> "dirty" freelist, scrub pages from the "dirty" freelist synchronously. >> > > Thank you very much for your suggestion, it's similar as Jan suggested. > > My concern of this approach is in some bad situation the allocation path > may be blocked for a long time waiting for scrubbing "dirty" freelist. > > What the users see is it's much faster to destroy a domain but may > slower to create an new one(or slow down other routines need to alloc > large memory). Yes, so on a system with near 100% memory usage, a reboot of a large VM would mean short destroy, long start-up, rather than long destroy, short start-up. End-to-end it's basically the same; but on a system with less memory usage, both operations are significantly faster. The "other routines" argument I'm not sure makes sense either -- after all, these other routines might have to wait for the domain clean-up to finish anyway. It might make sense at some point to think about prioritizing page scrubbing when memory is particularly low -- maybe spending 5-10% of the CPU time doing scrubbing. But that's an additional feature we can explore later. -George ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 2/2] xen: spread page scrubbing across all idle CPU 2014-06-11 10:45 ` George Dunlap @ 2014-06-11 11:17 ` Bob Liu 0 siblings, 0 replies; 13+ messages in thread From: Bob Liu @ 2014-06-11 11:17 UTC (permalink / raw) To: George Dunlap Cc: Bob Liu, Keir Fraser, Ian Campbell, Andrew Cooper, Jan Beulich, xen-devel On 06/11/2014 06:45 PM, George Dunlap wrote: > On Wed, Jun 11, 2014 at 11:36 AM, Bob Liu <bob.liu@oracle.com> wrote: >> >> On 06/11/2014 06:13 PM, George Dunlap wrote: >>> On Wed, Jun 11, 2014 at 9:13 AM, Jan Beulich <JBeulich@suse.com> wrote: >>>>>>> On 11.06.14 at 04:51, <bob.liu@oracle.com> wrote: >>>>> On 06/10/2014 10:12 PM, Jan Beulich wrote: >>>>>>>>> On 10.06.14 at 14:18, <lliubbo@gmail.com> wrote: >>>>>>> + if( is_tasklet ) >>>>>>> + tasklet_schedule_on_cpu(&global_scrub_tasklet, cpu); >>>>>> >>>>>> So you re-schedule this tasklet immediately - while this may be >>>>>> acceptable inside the hypervisor, did you consider the effect this >>>>>> will have on the guest (normally Dom0)? Its respective vCPU won't >>>>>> get run _at all_ until you're done scrubbing. >>>>>> >>>>> >>>>> Yes, that's a problem. I don't have any better idea right now. >>>>> >>>>> What I'm trying is doing the scrubbing on current CPU as well as on all >>>>> idle vcpus in parallel. >>>>> I also considered your suggestion about doing the scrubbing in the >>>>> background as well as on the allocation path. But I think it's more >>>>> unacceptable for users to get blocked randomly for a uncertain time when >>>>> allocating a large mount of memory. >>>>> That's why I still chose the sync way that once 'xl destroy' return all >>>>> memory are scrubbed. >>>> >>>> But I hope you realize that in the current shape, with the shortcomings >>>> pointed out un-addressed, there's no way for this to go in. >>> >>> Would it make more sense to do something like the following: >>> * Have a "clean" freelist and a "dirty" freelist >>> * When destroying a domain, simply move pages to the dirty freelist >>> * Have idle vcpus scrub the dirty freelist before going to sleep >>> - ...and wake up idle vcpus to do some scrubbing when adding pages to >>> the dirty freelist >>> * In alloc_domheap_pages(): >>> - If there are pages on the "clean" freelist, allocate them >>> - If there are no pages on the "clean" freelist but there are on the >>> "dirty" freelist, scrub pages from the "dirty" freelist synchronously. >>> >> >> Thank you very much for your suggestion, it's similar as Jan suggested. >> >> My concern of this approach is in some bad situation the allocation path >> may be blocked for a long time waiting for scrubbing "dirty" freelist. >> >> What the users see is it's much faster to destroy a domain but may >> slower to create an new one(or slow down other routines need to alloc >> large memory). > > Yes, so on a system with near 100% memory usage, a reboot of a large > VM would mean short destroy, long start-up, rather than long destroy, > short start-up. End-to-end it's basically the same; but on a system > with less memory usage, both operations are significantly faster. > Okay, I'll follow your suggestion. And thanks everyone! -- Regards, -Bob ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 2/2] xen: spread page scrubbing across all idle CPU 2014-06-11 10:36 ` Bob Liu 2014-06-11 10:45 ` George Dunlap @ 2014-06-11 10:48 ` Andrew Cooper 1 sibling, 0 replies; 13+ messages in thread From: Andrew Cooper @ 2014-06-11 10:48 UTC (permalink / raw) To: Bob Liu Cc: Bob Liu, Keir Fraser, Ian Campbell, George Dunlap, Jan Beulich, xen-devel On 11/06/14 11:36, Bob Liu wrote: > On 06/11/2014 06:13 PM, George Dunlap wrote: >> On Wed, Jun 11, 2014 at 9:13 AM, Jan Beulich <JBeulich@suse.com> wrote: >>>>>> On 11.06.14 at 04:51, <bob.liu@oracle.com> wrote: >>>> On 06/10/2014 10:12 PM, Jan Beulich wrote: >>>>>>>> On 10.06.14 at 14:18, <lliubbo@gmail.com> wrote: >>>>>> + if( is_tasklet ) >>>>>> + tasklet_schedule_on_cpu(&global_scrub_tasklet, cpu); >>>>> So you re-schedule this tasklet immediately - while this may be >>>>> acceptable inside the hypervisor, did you consider the effect this >>>>> will have on the guest (normally Dom0)? Its respective vCPU won't >>>>> get run _at all_ until you're done scrubbing. >>>>> >>>> Yes, that's a problem. I don't have any better idea right now. >>>> >>>> What I'm trying is doing the scrubbing on current CPU as well as on all >>>> idle vcpus in parallel. >>>> I also considered your suggestion about doing the scrubbing in the >>>> background as well as on the allocation path. But I think it's more >>>> unacceptable for users to get blocked randomly for a uncertain time when >>>> allocating a large mount of memory. >>>> That's why I still chose the sync way that once 'xl destroy' return all >>>> memory are scrubbed. >>> But I hope you realize that in the current shape, with the shortcomings >>> pointed out un-addressed, there's no way for this to go in. >> Would it make more sense to do something like the following: >> * Have a "clean" freelist and a "dirty" freelist >> * When destroying a domain, simply move pages to the dirty freelist >> * Have idle vcpus scrub the dirty freelist before going to sleep >> - ...and wake up idle vcpus to do some scrubbing when adding pages to >> the dirty freelist >> * In alloc_domheap_pages(): >> - If there are pages on the "clean" freelist, allocate them >> - If there are no pages on the "clean" freelist but there are on the >> "dirty" freelist, scrub pages from the "dirty" freelist synchronously. >> > Thank you very much for your suggestion, it's similar as Jan suggested. > > My concern of this approach is in some bad situation the allocation path > may be blocked for a long time waiting for scrubbing "dirty" freelist. > > What the users see is it's much faster to destroy a domain but may > slower to create an new one(or slow down other routines need to alloc > large memory). > > If you and Jan think it's fine I can make a patch towards this way. > One way or another the pages need to be scrubbed. All you are doing is juggling when this time hit gets taken. I would agree that having the toolstack take the time hit as part of repeated domain_kill() hypercalls is antisocial, especially as the toolstack/dom0 has far more important things to be doing than being blocked in Xen. I too would agree with Jan's suggestion and think it is the best solution to the problem at hand. ~Andrew ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 1/2] xen: introduce a no lock version function of free_heap_pages 2014-06-10 12:18 [PATCH 1/2] xen: introduce a no lock version function of free_heap_pages Bob Liu 2014-06-10 12:18 ` [PATCH 2/2] xen: spread page scrubbing across all idle CPU Bob Liu @ 2014-06-10 12:32 ` Andrew Cooper 1 sibling, 0 replies; 13+ messages in thread From: Andrew Cooper @ 2014-06-10 12:32 UTC (permalink / raw) To: Bob Liu; +Cc: keir, ian.campbell, jbeulich, xen-devel On 10/06/14 13:18, Bob Liu wrote: > Function free_heap_pages() needs to get the heap_lock every time. This lock > may become a problem when there are many CPUs trying to free pages in parallel. > > This patch introduces a no lock version function __free_heap_pages(), it can be > used to free a batch of pages after grabbing the heap_lock. > > Signed-off-by: Bob Liu <bob.liu@oracle.com> > --- > xen/common/page_alloc.c | 13 +++++++++---- > 1 file changed, 9 insertions(+), 4 deletions(-) > > diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c > index 601319c..56826b4 100644 > --- a/xen/common/page_alloc.c > +++ b/xen/common/page_alloc.c > @@ -808,8 +808,8 @@ static int reserve_offlined_page(struct page_info *head) > return count; > } > > -/* Free 2^@order set of pages. */ > -static void free_heap_pages( > +/* No lock version, the caller must hold heap_lock */ > +static void __free_heap_pages( > struct page_info *pg, unsigned int order) > { > unsigned long mask, mfn = page_to_mfn(pg); > @@ -819,8 +819,6 @@ static void free_heap_pages( > ASSERT(order <= MAX_ORDER); > ASSERT(node >= 0); Probably best to insert an ASSERT(spin_is_locked(&heap_lock)) here. ~Andrew > > - spin_lock(&heap_lock); > - > for ( i = 0; i < (1 << order); i++ ) > { > /* > @@ -894,7 +892,14 @@ static void free_heap_pages( > > if ( tainted ) > reserve_offlined_page(pg); > +} > > +/* Free 2^@order set of pages. */ > +static void free_heap_pages( > + struct page_info *pg, unsigned int order) > +{ > + spin_lock(&heap_lock); > + __free_heap_pages(pg, order); > spin_unlock(&heap_lock); > } > ^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2014-06-11 11:18 UTC | newest] Thread overview: 13+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2014-06-10 12:18 [PATCH 1/2] xen: introduce a no lock version function of free_heap_pages Bob Liu 2014-06-10 12:18 ` [PATCH 2/2] xen: spread page scrubbing across all idle CPU Bob Liu 2014-06-10 12:42 ` Andrew Cooper 2014-06-10 14:12 ` Jan Beulich 2014-06-11 2:51 ` Bob Liu 2014-06-11 8:13 ` Jan Beulich 2014-06-11 10:13 ` George Dunlap 2014-06-11 10:22 ` Jan Beulich 2014-06-11 10:36 ` Bob Liu 2014-06-11 10:45 ` George Dunlap 2014-06-11 11:17 ` Bob Liu 2014-06-11 10:48 ` Andrew Cooper 2014-06-10 12:32 ` [PATCH 1/2] xen: introduce a no lock version function of free_heap_pages Andrew Cooper
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.