All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ingo Molnar <mingo@kernel.org>
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Paul Turner <pjt@google.com>,
	Lee Schermerhorn <Lee.Schermerhorn@hp.com>,
	Christoph Lameter <cl@linux.com>, Rik van Riel <riel@redhat.com>,
	Mel Gorman <mgorman@suse.de>,
	Andrew Morton <akpm@linux-foundation.org>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Hugh Dickins <hughd@google.com>
Subject: [PATCH 09/10] sched: Add convergence strength based adaptive NUMA page fault rate
Date: Fri, 30 Nov 2012 20:58:40 +0100	[thread overview]
Message-ID: <1354305521-11583-10-git-send-email-mingo@kernel.org> (raw)
In-Reply-To: <1354305521-11583-1-git-send-email-mingo@kernel.org>

Mel Gorman reported that the NUMA code is system-time intense even
after a workload has converged.

To remedy this, turn sched_numa_scan_size into a range:

   sched_numa_scan_size_min        [default:  32 MB]
   sched_numa_scan_size_max        [default: 512 MB]

As workloads converge, so does their scanning activity get reduced.
If they unconverge again - for example because system load changes,
then their scanning will pick up again.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  3 ++-
 kernel/sched/fair.c   | 57 +++++++++++++++++++++++++++++++++++++++++++--------
 kernel/sysctl.c       | 11 ++++++++--
 3 files changed, 59 insertions(+), 12 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5b2cf2e..ce834e7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2057,7 +2057,8 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 extern unsigned int sysctl_sched_numa_scan_delay;
 extern unsigned int sysctl_sched_numa_scan_period_min;
 extern unsigned int sysctl_sched_numa_scan_period_max;
-extern unsigned int sysctl_sched_numa_scan_size;
+extern unsigned int sysctl_sched_numa_scan_size_min;
+extern unsigned int sysctl_sched_numa_scan_size_max;
 extern unsigned int sysctl_sched_numa_settle_count;
 
 #ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 10cbfa3..9262692 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -805,15 +805,17 @@ static unsigned long task_h_load(struct task_struct *p);
 /*
  * Scan @scan_size MB every @scan_period after an initial @scan_delay.
  */
-unsigned int sysctl_sched_numa_scan_delay = 1000;	/* ms */
-unsigned int sysctl_sched_numa_scan_period_min = 100;	/* ms */
-unsigned int sysctl_sched_numa_scan_period_max = 100*16;/* ms */
-unsigned int sysctl_sched_numa_scan_size = 256;		/* MB */
+unsigned int sysctl_sched_numa_scan_delay	__read_mostly = 1000;	/* ms */
+unsigned int sysctl_sched_numa_scan_period_min	__read_mostly = 100;	/* ms */
+unsigned int sysctl_sched_numa_scan_period_max	__read_mostly = 100*16;	/* ms */
+
+unsigned int sysctl_sched_numa_scan_size_min	__read_mostly =  32;	/* MB */
+unsigned int sysctl_sched_numa_scan_size_max	__read_mostly = 512;	/* MB */
 
 /*
  * Wait for the 2-sample stuff to settle before migrating again
  */
-unsigned int sysctl_sched_numa_settle_count = 2;
+unsigned int sysctl_sched_numa_settle_count	__read_mostly = 2;
 
 static int task_ideal_cpu(struct task_struct *p)
 {
@@ -2077,9 +2079,15 @@ static void task_numa_placement_tick(struct task_struct *p)
 			p->numa_faults[idx_oldnode] = 0;
 		}
 		sched_setnuma(p, ideal_node, shared);
+		/*
+		 * We changed a node, start scanning more frequently again
+		 * to map out the working set:
+		 */
+		p->numa_scan_period = sysctl_sched_numa_scan_period_min;
 	} else {
 		/* node unchanged, back off: */
-		p->numa_scan_period = min(p->numa_scan_period * 2, sysctl_sched_numa_scan_period_max);
+		p->numa_scan_period = min(p->numa_scan_period*2,
+						sysctl_sched_numa_scan_period_max);
 	}
 
 	this_cpu = task_cpu(p);
@@ -2238,6 +2246,7 @@ void task_numa_scan_work(struct callback_head *work)
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
 	struct vm_area_struct *vma;
+	long pages_min, pages_max;
 
 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_scan_work));
 
@@ -2260,10 +2269,40 @@ void task_numa_scan_work(struct callback_head *work)
 	current->numa_scan_period += jiffies_to_msecs(2);
 
 	start0 = start = end = mm->numa_scan_offset;
-	pages_total = sysctl_sched_numa_scan_size;
-	pages_total <<= 20 - PAGE_SHIFT; /* MB in pages */
-	if (!pages_total)
+
+	pages_max = sysctl_sched_numa_scan_size_max;
+	pages_max <<= 20 - PAGE_SHIFT; /* MB in pages */
+	if (!pages_max)
+		return;
+
+	pages_min = sysctl_sched_numa_scan_size_min;
+	pages_min <<= 20 - PAGE_SHIFT; /* MB in pages */
+	if (!pages_min)
+		return;
+
+	if (WARN_ON_ONCE(p->convergence_strength < 0 || p->convergence_strength > 1024))
 		return;
+	if (WARN_ON_ONCE(pages_min > pages_max))
+		return;
+
+	/*
+	 * Convergence strength is a number in the range of
+	 * 0 ... 1024.
+	 *
+	 * As tasks converge, scale down our scanning to the minimum
+	 * of the allowed range. Shortly after they get unsettled
+	 * (because the workload changes or the system is loaded
+	 * differently), scanning revs up again.
+	 *
+	 * The important thing is that when the system is in an
+	 * equilibrium, we do the minimum amount of scanning.
+	 */
+
+	pages_total = pages_min;
+	pages_total += (pages_max - pages_min)*(1024-p->convergence_strength)/1024;
+
+	pages_total = max(pages_min, pages_total);
+	pages_total = min(pages_max, pages_total);
 
 	sum_pages_scanned = 0;
 	pages_left = pages_total;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6d2fe5b..b6ddfae 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -374,8 +374,15 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
-		.procname	= "sched_numa_scan_size_mb",
-		.data		= &sysctl_sched_numa_scan_size,
+		.procname	= "sched_numa_scan_size_min_mb",
+		.data		= &sysctl_sched_numa_scan_size_min,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sched_numa_scan_size_max_mb",
+		.data		= &sysctl_sched_numa_scan_size_max,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-- 
1.7.11.7

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

WARNING: multiple messages have this Message-ID (diff)
From: Ingo Molnar <mingo@kernel.org>
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Paul Turner <pjt@google.com>,
	Lee Schermerhorn <Lee.Schermerhorn@hp.com>,
	Christoph Lameter <cl@linux.com>, Rik van Riel <riel@redhat.com>,
	Mel Gorman <mgorman@suse.de>,
	Andrew Morton <akpm@linux-foundation.org>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Hugh Dickins <hughd@google.com>
Subject: [PATCH 09/10] sched: Add convergence strength based adaptive NUMA page fault rate
Date: Fri, 30 Nov 2012 20:58:40 +0100	[thread overview]
Message-ID: <1354305521-11583-10-git-send-email-mingo@kernel.org> (raw)
In-Reply-To: <1354305521-11583-1-git-send-email-mingo@kernel.org>

Mel Gorman reported that the NUMA code is system-time intense even
after a workload has converged.

To remedy this, turn sched_numa_scan_size into a range:

   sched_numa_scan_size_min        [default:  32 MB]
   sched_numa_scan_size_max        [default: 512 MB]

As workloads converge, so does their scanning activity get reduced.
If they unconverge again - for example because system load changes,
then their scanning will pick up again.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  3 ++-
 kernel/sched/fair.c   | 57 +++++++++++++++++++++++++++++++++++++++++++--------
 kernel/sysctl.c       | 11 ++++++++--
 3 files changed, 59 insertions(+), 12 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5b2cf2e..ce834e7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2057,7 +2057,8 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 extern unsigned int sysctl_sched_numa_scan_delay;
 extern unsigned int sysctl_sched_numa_scan_period_min;
 extern unsigned int sysctl_sched_numa_scan_period_max;
-extern unsigned int sysctl_sched_numa_scan_size;
+extern unsigned int sysctl_sched_numa_scan_size_min;
+extern unsigned int sysctl_sched_numa_scan_size_max;
 extern unsigned int sysctl_sched_numa_settle_count;
 
 #ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 10cbfa3..9262692 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -805,15 +805,17 @@ static unsigned long task_h_load(struct task_struct *p);
 /*
  * Scan @scan_size MB every @scan_period after an initial @scan_delay.
  */
-unsigned int sysctl_sched_numa_scan_delay = 1000;	/* ms */
-unsigned int sysctl_sched_numa_scan_period_min = 100;	/* ms */
-unsigned int sysctl_sched_numa_scan_period_max = 100*16;/* ms */
-unsigned int sysctl_sched_numa_scan_size = 256;		/* MB */
+unsigned int sysctl_sched_numa_scan_delay	__read_mostly = 1000;	/* ms */
+unsigned int sysctl_sched_numa_scan_period_min	__read_mostly = 100;	/* ms */
+unsigned int sysctl_sched_numa_scan_period_max	__read_mostly = 100*16;	/* ms */
+
+unsigned int sysctl_sched_numa_scan_size_min	__read_mostly =  32;	/* MB */
+unsigned int sysctl_sched_numa_scan_size_max	__read_mostly = 512;	/* MB */
 
 /*
  * Wait for the 2-sample stuff to settle before migrating again
  */
-unsigned int sysctl_sched_numa_settle_count = 2;
+unsigned int sysctl_sched_numa_settle_count	__read_mostly = 2;
 
 static int task_ideal_cpu(struct task_struct *p)
 {
@@ -2077,9 +2079,15 @@ static void task_numa_placement_tick(struct task_struct *p)
 			p->numa_faults[idx_oldnode] = 0;
 		}
 		sched_setnuma(p, ideal_node, shared);
+		/*
+		 * We changed a node, start scanning more frequently again
+		 * to map out the working set:
+		 */
+		p->numa_scan_period = sysctl_sched_numa_scan_period_min;
 	} else {
 		/* node unchanged, back off: */
-		p->numa_scan_period = min(p->numa_scan_period * 2, sysctl_sched_numa_scan_period_max);
+		p->numa_scan_period = min(p->numa_scan_period*2,
+						sysctl_sched_numa_scan_period_max);
 	}
 
 	this_cpu = task_cpu(p);
@@ -2238,6 +2246,7 @@ void task_numa_scan_work(struct callback_head *work)
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
 	struct vm_area_struct *vma;
+	long pages_min, pages_max;
 
 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_scan_work));
 
@@ -2260,10 +2269,40 @@ void task_numa_scan_work(struct callback_head *work)
 	current->numa_scan_period += jiffies_to_msecs(2);
 
 	start0 = start = end = mm->numa_scan_offset;
-	pages_total = sysctl_sched_numa_scan_size;
-	pages_total <<= 20 - PAGE_SHIFT; /* MB in pages */
-	if (!pages_total)
+
+	pages_max = sysctl_sched_numa_scan_size_max;
+	pages_max <<= 20 - PAGE_SHIFT; /* MB in pages */
+	if (!pages_max)
+		return;
+
+	pages_min = sysctl_sched_numa_scan_size_min;
+	pages_min <<= 20 - PAGE_SHIFT; /* MB in pages */
+	if (!pages_min)
+		return;
+
+	if (WARN_ON_ONCE(p->convergence_strength < 0 || p->convergence_strength > 1024))
 		return;
+	if (WARN_ON_ONCE(pages_min > pages_max))
+		return;
+
+	/*
+	 * Convergence strength is a number in the range of
+	 * 0 ... 1024.
+	 *
+	 * As tasks converge, scale down our scanning to the minimum
+	 * of the allowed range. Shortly after they get unsettled
+	 * (because the workload changes or the system is loaded
+	 * differently), scanning revs up again.
+	 *
+	 * The important thing is that when the system is in an
+	 * equilibrium, we do the minimum amount of scanning.
+	 */
+
+	pages_total = pages_min;
+	pages_total += (pages_max - pages_min)*(1024-p->convergence_strength)/1024;
+
+	pages_total = max(pages_min, pages_total);
+	pages_total = min(pages_max, pages_total);
 
 	sum_pages_scanned = 0;
 	pages_left = pages_total;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6d2fe5b..b6ddfae 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -374,8 +374,15 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
-		.procname	= "sched_numa_scan_size_mb",
-		.data		= &sysctl_sched_numa_scan_size,
+		.procname	= "sched_numa_scan_size_min_mb",
+		.data		= &sysctl_sched_numa_scan_size_min,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sched_numa_scan_size_max_mb",
+		.data		= &sysctl_sched_numa_scan_size_max,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-- 
1.7.11.7


  parent reply	other threads:[~2012-11-30 19:59 UTC|newest]

Thread overview: 78+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-11-30 19:58 [PATCH 00/10] Latest numa/core release, v18 Ingo Molnar
2012-11-30 19:58 ` Ingo Molnar
2012-11-30 19:58 ` [PATCH 01/10] sched: Add "task flipping" support Ingo Molnar
2012-11-30 19:58   ` Ingo Molnar
2012-11-30 19:58 ` [PATCH 02/10] sched: Move the NUMA placement logic to a worklet Ingo Molnar
2012-11-30 19:58   ` Ingo Molnar
2012-11-30 19:58 ` [PATCH 03/10] numa, mempolicy: Improve CONFIG_NUMA_BALANCING=y OOM behavior Ingo Molnar
2012-11-30 19:58   ` Ingo Molnar
2012-11-30 19:58 ` [PATCH 04/10] mm, numa: Turn 4K pte NUMA faults into effective hugepage ones Ingo Molnar
2012-11-30 19:58   ` Ingo Molnar
2012-11-30 19:58 ` [PATCH 05/10] sched: Introduce directed NUMA convergence Ingo Molnar
2012-11-30 19:58   ` Ingo Molnar
2012-11-30 19:58 ` [PATCH 06/10] sched: Remove statistical NUMA scheduling Ingo Molnar
2012-11-30 19:58   ` Ingo Molnar
2012-11-30 19:58 ` [PATCH 07/10] sched: Track quality and strength of convergence Ingo Molnar
2012-11-30 19:58   ` Ingo Molnar
2012-11-30 19:58 ` [PATCH 08/10] sched: Converge NUMA migrations Ingo Molnar
2012-11-30 19:58   ` Ingo Molnar
2012-11-30 19:58 ` Ingo Molnar [this message]
2012-11-30 19:58   ` [PATCH 09/10] sched: Add convergence strength based adaptive NUMA page fault rate Ingo Molnar
2012-11-30 19:58 ` [PATCH 10/10] sched: Refine the 'shared tasks' memory interleaving logic Ingo Molnar
2012-11-30 19:58   ` Ingo Molnar
2012-11-30 20:37 ` [PATCH 00/10] Latest numa/core release, v18 Linus Torvalds
2012-11-30 20:37   ` Linus Torvalds
2012-12-01  9:49   ` [RFC PATCH] mm/migration: Don't lock anon vmas in rmap_walk_anon() Ingo Molnar
2012-12-01  9:49     ` Ingo Molnar
2012-12-01 12:26     ` [RFC PATCH] mm/migration: Remove anon vma locking from try_to_unmap() use Ingo Molnar
2012-12-01 12:26       ` Ingo Molnar
2012-12-01 18:38       ` Linus Torvalds
2012-12-01 18:38         ` Linus Torvalds
2012-12-01 18:41         ` Ingo Molnar
2012-12-01 18:41           ` Ingo Molnar
2012-12-01 18:50           ` Linus Torvalds
2012-12-01 18:50             ` Linus Torvalds
2012-12-01 20:10             ` [PATCH 1/2] mm/rmap: Convert the struct anon_vma::mutex to an rwsem Ingo Molnar
2012-12-01 20:10               ` Ingo Molnar
2012-12-01 20:19               ` Rik van Riel
2012-12-01 20:19                 ` Rik van Riel
2012-12-02 15:10                 ` Ingo Molnar
2012-12-02 15:10                   ` Ingo Molnar
2012-12-03 13:59               ` Mel Gorman
2012-12-03 13:59                 ` Mel Gorman
2012-12-01 20:15             ` [PATCH 2/2] mm/migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable Ingo Molnar
2012-12-01 20:15               ` Ingo Molnar
2012-12-01 20:33               ` Rik van Riel
2012-12-01 20:33                 ` Rik van Riel
2012-12-02 15:12                 ` [PATCH 2/2, v2] " Ingo Molnar
2012-12-02 15:12                   ` Ingo Molnar
2012-12-02 17:53                   ` Rik van Riel
2012-12-02 17:53                     ` Rik van Riel
2012-12-04 14:42                   ` Michel Lespinasse
2012-12-04 14:42                     ` Michel Lespinasse
2012-12-05  2:59                   ` Michel Lespinasse
2012-12-05  2:59                     ` Michel Lespinasse
2012-12-03 14:17               ` [PATCH 2/2] " Mel Gorman
2012-12-03 14:17                 ` Mel Gorman
2012-12-04 14:37                 ` Michel Lespinasse
2012-12-04 14:37                   ` Michel Lespinasse
2012-12-04 18:17                   ` Mel Gorman
2012-12-04 18:17                     ` Mel Gorman
2012-12-01 18:55         ` [RFC PATCH] mm/migration: Remove anon vma locking from try_to_unmap() use Rik van Riel
2012-12-01 18:55           ` Rik van Riel
2012-12-01 16:19     ` [RFC PATCH] mm/migration: Don't lock anon vmas in rmap_walk_anon() Rik van Riel
2012-12-01 16:19       ` Rik van Riel
2012-12-01 17:55     ` Linus Torvalds
2012-12-01 17:55       ` Linus Torvalds
2012-12-01 18:30       ` Ingo Molnar
2012-12-01 18:30         ` Ingo Molnar
2012-12-03 13:41   ` [PATCH 00/10] Latest numa/core release, v18 Mel Gorman
2012-12-03 13:41     ` Mel Gorman
2012-12-04 17:30     ` Thomas Gleixner
2012-12-04 17:30       ` Thomas Gleixner
2012-12-03 10:43 ` Mel Gorman
2012-12-03 10:43   ` Mel Gorman
2012-12-03 11:32 ` Mel Gorman
2012-12-03 11:32   ` Mel Gorman
2012-12-04 22:49 ` Mel Gorman
2012-12-04 22:49   ` Mel Gorman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1354305521-11583-10-git-send-email-mingo@kernel.org \
    --to=mingo@kernel.org \
    --cc=Lee.Schermerhorn@hp.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=aarcange@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=cl@linux.com \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mgorman@suse.de \
    --cc=pjt@google.com \
    --cc=riel@redhat.com \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.