From: Bob Liu <bob.liu@oracle.com>
Subject: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
Date: Mon, 30 Jun 2014 21:39:44 +0800
Message-ID: <1404135584-29206-3-git-send-email-bob.liu@oracle.com>
In-Reply-To: <1404135584-29206-1-git-send-email-bob.liu@oracle.com>
References: <1404135584-29206-1-git-send-email-bob.liu@oracle.com>
To: xen-devel@lists.xenproject.org
Cc: keir@xen.org, ian.campbell@citrix.com, George.Dunlap@eu.citrix.com,
    andrew.cooper3@citrix.com, JBeulich@suse.com

Scrub pages that have been marked PGC_need_scrub from the idle vcpu loop.
In case of heavy lock contention, use two percpu lists:
 - Delist a batch of pages from the _heap[] free page list onto a percpu
   scrub list.
 - Scrub the pages on this percpu list and add them to another percpu
   free list.
 - Free those clean pages back to _heap[], merging with other chunks if
   needed.

v2:
 * Avoid having two hyperthreads within the same core both doing scrubbing.
 * Limit the number of pages delisted each time to (1 << SCRUB_BATCH_ORDER).

Signed-off-by: Bob Liu <bob.liu@oracle.com>
---
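Notes (illustration only, not part of the patch): the per-cpu flow described
above can be modelled by the small standalone C program below. The names
(heap_dirty, cpu_scrub_list, cpu_free_list, delist_batch, ...) are invented
for the example, and locking, NUMA nodes and page orders are deliberately
left out; the program only mirrors the three steps of delist / scrub /
free-back. A second, equally simplified sketch of the
order > SCRUB_BATCH_ORDER splitting path follows the diff.

  /* Standalone model of the per-cpu scrub flow (NOT Xen code). */
  #include <stdio.h>

  struct page { int dirty; struct page *next; };

  static struct page *heap_dirty;     /* stands in for pages with PGC_need_scrub */
  static struct page *cpu_scrub_list; /* percpu list of pages to scrub */
  static struct page *cpu_free_list;  /* percpu list of already-clean pages */
  static struct page *heap_clean;     /* scrubbed pages returned to the heap */

  /* Step 1: delist a batch of dirty pages (heap lock held in the real code). */
  static void delist_batch(int batch)
  {
      while ( batch-- && heap_dirty )
      {
          struct page *pg = heap_dirty;
          heap_dirty = pg->next;
          pg->next = cpu_scrub_list;
          cpu_scrub_list = pg;
      }
  }

  /* Step 2: scrub without holding the heap lock. */
  static void scrub_batch(void)
  {
      while ( cpu_scrub_list )
      {
          struct page *pg = cpu_scrub_list;
          cpu_scrub_list = pg->next;
          pg->dirty = 0;                /* scrub_one_page() in the real code */
          pg->next = cpu_free_list;
          cpu_free_list = pg;
      }
  }

  /* Step 3: give the clean pages back in one short locked section. */
  static void free_batch(void)
  {
      while ( cpu_free_list )
      {
          struct page *pg = cpu_free_list;
          cpu_free_list = pg->next;
          pg->next = heap_clean;
          heap_clean = pg;
      }
  }

  int main(void)
  {
      struct page pages[8];
      int clean = 0;

      for ( int i = 0; i < 8; i++ )
      {
          pages[i].dirty = 1;
          pages[i].next = heap_dirty;
          heap_dirty = &pages[i];
      }
      delist_batch(4);
      scrub_batch();
      free_batch();
      for ( struct page *pg = heap_clean; pg; pg = pg->next )
          clean += !pg->dirty;
      printf("clean pages returned to heap: %d\n", clean); /* prints 4 */
      return 0;
  }

Compiled with any C99 compiler and run, it reports 4 clean pages back on the
heap, matching the batch size passed to delist_batch().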
 xen/arch/arm/domain.c   |    1 +
 xen/arch/x86/domain.c   |    1 +
 xen/common/page_alloc.c |  130 +++++++++++++++++++++++++++++++++++++++++++++++
 xen/include/xen/mm.h    |    1 +
 4 files changed, 133 insertions(+)

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 04d0cd0..b6bc3ac 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -44,6 +44,7 @@ void idle_loop(void)
         if ( cpu_is_offline(smp_processor_id()) )
             stop_cpu();
 
+        scrub_free_pages();
         local_irq_disable();
         if ( cpu_is_haltable(smp_processor_id()) )
         {
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index e896210..e8d4fe7 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -116,6 +116,7 @@ static void idle_loop(void)
     {
         if ( cpu_is_offline(smp_processor_id()) )
             play_dead();
+        scrub_free_pages();
         (*pm_idle)();
         do_tasklet();
         do_softirq();
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index ab293c8..6ab1d1d 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -86,6 +86,12 @@ PAGE_LIST_HEAD(page_offlined_list);
 /* Broken page list, protected by heap_lock. */
 PAGE_LIST_HEAD(page_broken_list);
 
+/* A rough flag to indicate whether a node have need_scrub pages */
+static bool_t node_need_scrub[MAX_NUMNODES];
+static DEFINE_PER_CPU(bool_t, is_scrubbing);
+static DEFINE_PER_CPU(struct page_list_head, scrub_list_cpu);
+static DEFINE_PER_CPU(struct page_list_head, free_list_cpu);
+
 /*************************
  * BOOT-TIME ALLOCATOR
  */
@@ -948,6 +954,7 @@ static void free_heap_pages(
     {
         if ( !tainted )
         {
+            node_need_scrub[node] = 1;
             for ( i = 0; i < (1 << order); i++ )
                 pg[i].count_info |= PGC_need_scrub;
         }
@@ -1525,7 +1532,130 @@ void __init scrub_heap_pages(void)
     setup_low_mem_virq();
 }
 
+#define SCRUB_BATCH_ORDER 12
+static void __scrub_free_pages(unsigned int node, unsigned int cpu)
+{
+    struct page_info *pg, *tmp;
+    unsigned int i;
+    int order;
+    struct page_list_head *local_scrub_list = &this_cpu(scrub_list_cpu);
+    struct page_list_head *local_free_list = &this_cpu(free_list_cpu);
+
+    /* Scrub percpu list */
+    while ( !page_list_empty(local_scrub_list) )
+    {
+        pg = page_list_remove_head(local_scrub_list);
+        order = PFN_ORDER(pg);
+        ASSERT( pg && order <= SCRUB_BATCH_ORDER );
+        for ( i = 0; i < (1 << order); i++ )
+        {
+            ASSERT( test_bit(_PGC_need_scrub, &pg[i].count_info) );
+            scrub_one_page(&pg[i]);
+        }
+        page_list_add_tail(pg, local_free_list);
+        if ( softirq_pending(cpu) )
+            return;
+    }
+
+    /* free percpu free list */
+    if ( !page_list_empty(local_free_list) )
+    {
+        spin_lock(&heap_lock);
+        page_list_for_each_safe( pg, tmp, local_free_list )
+        {
+            order = PFN_ORDER(pg);
+            page_list_del(pg, local_free_list);
+            for ( i = 0; i < (1 << order); i++ )
+            {
+                pg[i].count_info |= PGC_state_free;
+                pg[i].count_info &= ~PGC_need_scrub;
+            }
+            merge_free_trunks(pg, order, node, page_to_zone(pg), 0);
+        }
+        spin_unlock(&heap_lock);
+    }
+}
+
+void scrub_free_pages(void)
+{
+    int order;
+    struct page_info *pg, *tmp;
+    unsigned int i, zone, nr_delisted = 0;
+    unsigned int cpu = smp_processor_id();
+    unsigned int node = cpu_to_node(cpu);
+    struct page_list_head *local_scrub_list = &this_cpu(scrub_list_cpu);
+
+    /* Return if our sibling already started scrubbing */
+    for_each_cpu( i, per_cpu(cpu_sibling_mask,cpu) )
+        if ( per_cpu(is_scrubbing, i) )
+            return;
+    this_cpu(is_scrubbing) = 1;
+
+    while ( !softirq_pending(cpu) )
+    {
+        if ( !node_need_scrub[node] )
+        {
+            /* Free local per cpu list before we exit */
+            __scrub_free_pages(node, cpu);
+            goto out;
+        }
+
+        /* Delist a batch of pages from global scrub list */
+        if ( page_list_empty(local_scrub_list) )
+        {
+            spin_lock(&heap_lock);
+            for ( zone = 0; zone < NR_ZONES; zone++ )
+            {
+                for ( order = MAX_ORDER; order >= 0; order-- )
+                {
+                    page_list_for_each_safe( pg, tmp, &heap(node, zone, order) )
+                    {
+                        if ( !test_bit(_PGC_need_scrub, &(pg->count_info)) )
+                            continue;
+
+                        page_list_del( pg, &heap(node, zone, order) );
+                        if ( order > SCRUB_BATCH_ORDER)
+                        {
+                            /* putback extra pages */
+                            i = order;
+                            while ( i != SCRUB_BATCH_ORDER )
+                            {
+                                PFN_ORDER(pg) = --i;
+                                page_list_add_tail(pg, &heap(node, zone, i));
+                                pg += 1 << i;
+                            }
+                            PFN_ORDER(pg) = SCRUB_BATCH_ORDER;
+                        }
+
+                        for ( i = 0; i < (1 << PFN_ORDER(pg)); i++ )
+                        {
+                            ASSERT( test_bit(_PGC_need_scrub, &pg[i].count_info) );
+                            ASSERT( !test_bit(_PGC_broken, &pg[i].count_info) );
+                            mark_page_offline(&pg[i], 0);
+                        }
+                        page_list_add_tail(pg, local_scrub_list);
+                        nr_delisted += ( 1 << PFN_ORDER(pg) );
+                        if ( nr_delisted >= (1 << SCRUB_BATCH_ORDER) )
+                        {
+                            nr_delisted = 0;
+                            spin_unlock(&heap_lock);
+                            goto start_scrub;
+                        }
+                    }
+                }
+            }
+
+            node_need_scrub[node] = 0;
+            spin_unlock(&heap_lock);
+        }
+ start_scrub:
+        __scrub_free_pages(node, cpu);
+    }
+
+ out:
+    this_cpu(is_scrubbing) = 0;
+}
 
 /*************************
  * XEN-HEAP SUB-ALLOCATOR
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index b183189..1fa8c3d 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -78,6 +78,7 @@ int query_page_offline(unsigned long mfn, uint32_t *status);
 unsigned long total_free_pages(void);
 
 void scrub_heap_pages(void);
+void scrub_free_pages(void);
 
 int assign_pages(
     struct domain *d,
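One more illustration (again not part of the patch): when scrub_free_pages()
finds a free chunk larger than the batch size, the inner while loop above
returns the leading halves of the chunk to the heap and keeps only the tail
2^SCRUB_BATCH_ORDER pages for scrubbing. The standalone model below uses
made-up pfn numbers and printf in place of the real heap lists, and shrinks
SCRUB_BATCH_ORDER to 3 so the output stays short.

  /* Standalone model of the "putback extra pages" split (NOT Xen code). */
  #include <stdio.h>

  #define SCRUB_BATCH_ORDER 3           /* 12 in the patch */

  int main(void)
  {
      unsigned int order = 6;           /* pretend we found a 2^6-page chunk */
      unsigned long pfn = 0;            /* start of the chunk */
      unsigned int i = order;

      while ( i != SCRUB_BATCH_ORDER )
      {
          --i;
          /* The patch does: PFN_ORDER(pg) = i; page_list_add_tail(pg, ...); */
          printf("put back pfns %lu..%lu as an order-%u chunk\n",
                 pfn, pfn + (1UL << i) - 1, i);
          pfn += 1UL << i;              /* pg += 1 << i in the patch */
      }
      printf("keep pfns %lu..%lu (order %u) for scrubbing\n",
             pfn, pfn + (1UL << SCRUB_BATCH_ORDER) - 1, SCRUB_BATCH_ORDER);
      return 0;
  }

Run with a plain C compiler, it prints the put-back ranges at orders 5, 4
and 3 followed by the kept order-3 range, which is the same split the patch
performs before scrubbing an oversized chunk.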
-- 
1.7.10.4