[PATCH 41/52] sched: Move the NUMA placement logic to a worklet

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Ingo Molnar <mingo@kernel.org>
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Paul Turner <pjt@google.com>,
	Lee Schermerhorn <Lee.Schermerhorn@hp.com>,
	Christoph Lameter <cl@linux.com>, Rik van Riel <riel@redhat.com>,
	Mel Gorman <mgorman@suse.de>,
	Andrew Morton <akpm@linux-foundation.org>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Hugh Dickins <hughd@google.com>
Subject: [PATCH 41/52] sched: Move the NUMA placement logic to a worklet
Date: Sun,  2 Dec 2012 19:43:33 +0100	[thread overview]
Message-ID: <1354473824-19229-42-git-send-email-mingo@kernel.org> (raw)
In-Reply-To: <1354473824-19229-1-git-send-email-mingo@kernel.org>

As an implementation detail, to be able to do directed task placement
we have to change how task_numa_fault() interfaces with the scheduler:
instead of the placement logic being executed directly from the fault
path we now trigger a worklet, similarly to how we do the NUMA
hinting fault work.

This moves placement into process context and allows the execution of the
directed task-flipping code via sched_rebalance_to().

This further decouples the NUMA hinting fault engine from
the actual NUMA placement logic.

[ Also move __sched_fork() out of the preempt-disabled get_cpu()
  section: it now does a GFP_KERNEL allocation, which may sleep. ]

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |   3 +-
 kernel/sched/core.c   |  25 ++++++++-
 kernel/sched/fair.c   | 153 +++++++++++++++++++++++++++++++-------------------
 kernel/sched/sched.h  |   5 ++
 4 files changed, 126 insertions(+), 60 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 696492e..ce9ccd7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1512,7 +1512,8 @@ struct task_struct {
 	unsigned long numa_weight;
 	unsigned long *numa_faults;
 	unsigned long *numa_faults_curr;
-	struct callback_head numa_work;
+	struct callback_head numa_scan_work;
+	struct callback_head numa_placement_work;
 
 	struct task_struct *shared_buddy, *shared_buddy_curr;
 	unsigned long shared_buddy_faults, shared_buddy_faults_curr;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cad6c89..05d4e1d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -39,6 +39,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/debug_locks.h>
 #include <linux/perf_event.h>
+#include <linux/task_work.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
@@ -1558,7 +1559,6 @@ static void __sched_fork(struct task_struct *p)
 	p->numa_migrate_seq = 2;
 	p->numa_faults = NULL;
 	p->numa_scan_period = sysctl_sched_numa_scan_delay;
-	p->numa_work.next = &p->numa_work;
 
 	p->shared_buddy = NULL;
 	p->shared_buddy_faults = 0;
@@ -1570,6 +1570,25 @@ static void __sched_fork(struct task_struct *p)
 	p->numa_policy.v.preferred_node = 0;
 	p->numa_policy.v.nodes = node_online_map;
 
+	init_task_work(&p->numa_scan_work, task_numa_scan_work);
+	p->numa_scan_work.next = &p->numa_scan_work;
+
+	init_task_work(&p->numa_placement_work, task_numa_placement_work);
+	p->numa_placement_work.next = &p->numa_placement_work;
+
+	if (p->mm) {
+		int entries = 2*nr_node_ids;
+		int size = sizeof(*p->numa_faults) * entries;
+
+		/*
+		 * For efficiency reasons we allocate ->numa_faults[]
+		 * and ->numa_faults_curr[] at once and split the
+		 * buffer we get. They are separate otherwise.
+		 */
+		p->numa_faults = kzalloc(2*size, GFP_KERNEL);
+		if (p->numa_faults)
+			p->numa_faults_curr = p->numa_faults + entries;
+	}
 #endif /* CONFIG_NUMA_BALANCING */
 }
 
@@ -1579,9 +1598,11 @@ static void __sched_fork(struct task_struct *p)
 void sched_fork(struct task_struct *p)
 {
 	unsigned long flags;
-	int cpu = get_cpu();
+	int cpu;
 
 	__sched_fork(p);
+
+	cpu = get_cpu();
 	/*
 	 * We mark the process as running here. This guarantees that
 	 * nobody will actually run it, and a signal or other external
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f0d3876..fda1b63 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1063,19 +1063,18 @@ clear_buddy:
 	p->ideal_cpu_curr		= -1;
 }
 
-static void task_numa_placement(struct task_struct *p)
+/*
+ * Called every couple of hundred milliseconds in the task's
+ * execution life-time, this function decides whether to
+ * change placement parameters:
+ */
+static void task_numa_placement_tick(struct task_struct *p)
 {
-	int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
 	unsigned long total[2] = { 0, 0 };
 	unsigned long faults, max_faults = 0;
 	int node, priv, shared, max_node = -1;
 	int this_node;
 
-	if (p->numa_scan_seq == seq)
-		return;
-
-	p->numa_scan_seq = seq;
-
 	/*
 	 * Update the fault average with the result of the latest
 	 * scan:
@@ -1279,44 +1278,25 @@ void task_numa_fault(int node, int last_cpu, int pages)
 	int priv = (task_cpu(p) == last_cpu);
 	int idx = 2*node + priv;
 
-	WARN_ON_ONCE(last_cpu < 0 || node < 0);
-
-	if (unlikely(!p->numa_faults)) {
-		int entries = 2*nr_node_ids;
-		int size = sizeof(*p->numa_faults) * entries;
-
-		p->numa_faults = kzalloc(2*size, GFP_KERNEL);
-		if (!p->numa_faults)
-			return;
-		/*
-		 * For efficiency reasons we allocate ->numa_faults[]
-		 * and ->numa_faults_curr[] at once and split the
-		 * buffer we get. They are separate otherwise.
-		 */
-		p->numa_faults_curr = p->numa_faults + entries;
-	}
+	WARN_ON_ONCE(last_cpu == -1 || node == -1);
+	BUG_ON(!p->numa_faults);
 
 	p->numa_faults_curr[idx] += pages;
 	shared_fault_tick(p, node, last_cpu, pages);
-	task_numa_placement(p);
 }
 
 /*
  * The expensive part of numa migration is done from task_work context.
  * Triggered from task_tick_numa().
  */
-void task_numa_work(struct callback_head *work)
+void task_numa_placement_work(struct callback_head *work)
 {
-	long pages_total, pages_left, pages_changed;
-	unsigned long migrate, next_scan, now = jiffies;
-	unsigned long start0, start, end;
 	struct task_struct *p = current;
-	struct mm_struct *mm = p->mm;
-	struct vm_area_struct *vma;
 
-	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_placement_work));
 
 	work->next = work; /* protect against double add */
+
 	/*
 	 * Who cares about NUMA placement when they're dying.
 	 *
@@ -1328,6 +1308,29 @@ void task_numa_work(struct callback_head *work)
 	if (p->flags & PF_EXITING)
 		return;
 
+	task_numa_placement_tick(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_scan_work(struct callback_head *work)
+{
+	long pages_total, pages_left, pages_changed;
+	unsigned long migrate, next_scan, now = jiffies;
+	unsigned long start0, start, end;
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+	struct vm_area_struct *vma;
+
+	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_scan_work));
+
+	work->next = work; /* protect against double add */
+
+	if (p->flags & PF_EXITING)
+		return;
+
 	/*
 	 * Enforce maximal scan/migration frequency..
 	 */
@@ -1383,15 +1386,12 @@ out:
 /*
  * Drive the periodic memory faults..
  */
-void task_tick_numa(struct rq *rq, struct task_struct *curr)
+static void task_tick_numa_scan(struct rq *rq, struct task_struct *curr)
 {
-	struct callback_head *work = &curr->numa_work;
+	struct callback_head *work = &curr->numa_scan_work;
 	u64 period, now;
 
-	/*
-	 * We don't care about NUMA placement if we don't have memory.
-	 */
-	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+	if (work->next != work)
 		return;
 
 	/*
@@ -1403,28 +1403,67 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	now = curr->se.sum_exec_runtime;
 	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
 
-	if (now - curr->node_stamp > period) {
-		curr->node_stamp += period;
-		curr->numa_scan_period = sysctl_sched_numa_scan_period_min;
+	if (now - curr->node_stamp <= period)
+		return;
 
-		/*
-		 * We are comparing runtime to wall clock time here, which
-		 * puts a maximum scan frequency limit on the task work.
-		 *
-		 * This, together with the limits in task_numa_work() filters
-		 * us from over-sampling if there are many threads: if all
-		 * threads happen to come in at the same time we don't create a
-		 * spike in overhead.
-		 *
-		 * We also avoid multiple threads scanning at once in parallel to
-		 * each other.
-		 */
-		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
-			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
-			task_work_add(curr, work, true);
-		}
-	}
+	curr->node_stamp += period;
+	curr->numa_scan_period = sysctl_sched_numa_scan_period_min;
+
+	/*
+	 * We are comparing runtime to wall clock time here, which
+	 * puts a maximum scan frequency limit on the task work.
+	 *
+	 * This, together with the limits in task_numa_scan_work() filters
+	 * us from over-sampling if there are many threads: if all
+	 * threads happen to come in at the same time we don't create a
+	 * spike in overhead.
+	 *
+	 * We also avoid multiple threads scanning at once in parallel to
+	 * each other.
+	 */
+	if (time_before(jiffies, curr->mm->numa_next_scan))
+		return;
+
+	task_work_add(curr, work, true);
 }
+
+/*
+ * Drive the placement logic:
+ */
+static void task_tick_numa_placement(struct rq *rq, struct task_struct *curr)
+{
+	struct callback_head *work = &curr->numa_placement_work;
+	int seq;
+
+	if (work->next != work)
+		return;
+
+	/*
+	 * Check whether we should run task_numa_placement_tick(),
+	 * and if yes, activate the worklet:
+	 */
+	seq = ACCESS_ONCE(curr->mm->numa_scan_seq);
+
+	if (curr->numa_scan_seq == seq)
+		return;
+
+	curr->numa_scan_seq = seq;
+	task_work_add(curr, work, true);
+}
+
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+	/*
+	 * We don't care about NUMA placement if we don't have memory
+	 * or are exiting:
+	 */
+	if (!curr->mm || (curr->flags & PF_EXITING))
+		return;
+
+	task_tick_numa_scan(rq, curr);
+	task_tick_numa_placement(rq, curr);
+}
+
 #else /* !CONFIG_NUMA_BALANCING: */
 #ifdef CONFIG_SMP
 static inline int task_ideal_cpu(struct task_struct *p)				{ return -1; }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c4d15fd..f46405e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1261,6 +1261,11 @@ static inline u64 irq_time_read(int cpu)
 #endif /* CONFIG_64BIT */
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_scan_work(struct callback_head *work);
+extern void task_numa_placement_work(struct callback_head *work);
+#endif
+
 #ifdef CONFIG_SMP
 extern void sched_rebalance_to(int dest_cpu, int flip_tasks);
 #else
-- 
1.7.11.7

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

WARNING: multiple messages have this Message-ID (diff)

From: Ingo Molnar <mingo@kernel.org>
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Paul Turner <pjt@google.com>,
	Lee Schermerhorn <Lee.Schermerhorn@hp.com>,
	Christoph Lameter <cl@linux.com>, Rik van Riel <riel@redhat.com>,
	Mel Gorman <mgorman@suse.de>,
	Andrew Morton <akpm@linux-foundation.org>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Hugh Dickins <hughd@google.com>
Subject: [PATCH 41/52] sched: Move the NUMA placement logic to a worklet
Date: Sun,  2 Dec 2012 19:43:33 +0100	[thread overview]
Message-ID: <1354473824-19229-42-git-send-email-mingo@kernel.org> (raw)
In-Reply-To: <1354473824-19229-1-git-send-email-mingo@kernel.org>

As an implementation detail, to be able to do directed task placement
we have to change how task_numa_fault() interfaces with the scheduler:
instead of the placement logic being executed directly from the fault
path we now trigger a worklet, similarly to how we do the NUMA
hinting fault work.

This moves placement into process context and allows the execution of the
directed task-flipping code via sched_rebalance_to().

This further decouples the NUMA hinting fault engine from
the actual NUMA placement logic.

[ Also move __sched_fork() out of the preempt-disabled get_cpu()
  section: it now does a GFP_KERNEL allocation, which may sleep. ]

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |   3 +-
 kernel/sched/core.c   |  25 ++++++++-
 kernel/sched/fair.c   | 153 +++++++++++++++++++++++++++++++-------------------
 kernel/sched/sched.h  |   5 ++
 4 files changed, 126 insertions(+), 60 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 696492e..ce9ccd7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1512,7 +1512,8 @@ struct task_struct {
 	unsigned long numa_weight;
 	unsigned long *numa_faults;
 	unsigned long *numa_faults_curr;
-	struct callback_head numa_work;
+	struct callback_head numa_scan_work;
+	struct callback_head numa_placement_work;
 
 	struct task_struct *shared_buddy, *shared_buddy_curr;
 	unsigned long shared_buddy_faults, shared_buddy_faults_curr;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cad6c89..05d4e1d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -39,6 +39,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/debug_locks.h>
 #include <linux/perf_event.h>
+#include <linux/task_work.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
@@ -1558,7 +1559,6 @@ static void __sched_fork(struct task_struct *p)
 	p->numa_migrate_seq = 2;
 	p->numa_faults = NULL;
 	p->numa_scan_period = sysctl_sched_numa_scan_delay;
-	p->numa_work.next = &p->numa_work;
 
 	p->shared_buddy = NULL;
 	p->shared_buddy_faults = 0;
@@ -1570,6 +1570,25 @@ static void __sched_fork(struct task_struct *p)
 	p->numa_policy.v.preferred_node = 0;
 	p->numa_policy.v.nodes = node_online_map;
 
+	init_task_work(&p->numa_scan_work, task_numa_scan_work);
+	p->numa_scan_work.next = &p->numa_scan_work;
+
+	init_task_work(&p->numa_placement_work, task_numa_placement_work);
+	p->numa_placement_work.next = &p->numa_placement_work;
+
+	if (p->mm) {
+		int entries = 2*nr_node_ids;
+		int size = sizeof(*p->numa_faults) * entries;
+
+		/*
+		 * For efficiency reasons we allocate ->numa_faults[]
+		 * and ->numa_faults_curr[] at once and split the
+		 * buffer we get. They are separate otherwise.
+		 */
+		p->numa_faults = kzalloc(2*size, GFP_KERNEL);
+		if (p->numa_faults)
+			p->numa_faults_curr = p->numa_faults + entries;
+	}
 #endif /* CONFIG_NUMA_BALANCING */
 }
 
@@ -1579,9 +1598,11 @@ static void __sched_fork(struct task_struct *p)
 void sched_fork(struct task_struct *p)
 {
 	unsigned long flags;
-	int cpu = get_cpu();
+	int cpu;
 
 	__sched_fork(p);
+
+	cpu = get_cpu();
 	/*
 	 * We mark the process as running here. This guarantees that
 	 * nobody will actually run it, and a signal or other external
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f0d3876..fda1b63 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1063,19 +1063,18 @@ clear_buddy:
 	p->ideal_cpu_curr		= -1;
 }
 
-static void task_numa_placement(struct task_struct *p)
+/*
+ * Called every couple of hundred milliseconds in the task's
+ * execution life-time, this function decides whether to
+ * change placement parameters:
+ */
+static void task_numa_placement_tick(struct task_struct *p)
 {
-	int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
 	unsigned long total[2] = { 0, 0 };
 	unsigned long faults, max_faults = 0;
 	int node, priv, shared, max_node = -1;
 	int this_node;
 
-	if (p->numa_scan_seq == seq)
-		return;
-
-	p->numa_scan_seq = seq;
-
 	/*
 	 * Update the fault average with the result of the latest
 	 * scan:
@@ -1279,44 +1278,25 @@ void task_numa_fault(int node, int last_cpu, int pages)
 	int priv = (task_cpu(p) == last_cpu);
 	int idx = 2*node + priv;
 
-	WARN_ON_ONCE(last_cpu < 0 || node < 0);
-
-	if (unlikely(!p->numa_faults)) {
-		int entries = 2*nr_node_ids;
-		int size = sizeof(*p->numa_faults) * entries;
-
-		p->numa_faults = kzalloc(2*size, GFP_KERNEL);
-		if (!p->numa_faults)
-			return;
-		/*
-		 * For efficiency reasons we allocate ->numa_faults[]
-		 * and ->numa_faults_curr[] at once and split the
-		 * buffer we get. They are separate otherwise.
-		 */
-		p->numa_faults_curr = p->numa_faults + entries;
-	}
+	WARN_ON_ONCE(last_cpu == -1 || node == -1);
+	BUG_ON(!p->numa_faults);
 
 	p->numa_faults_curr[idx] += pages;
 	shared_fault_tick(p, node, last_cpu, pages);
-	task_numa_placement(p);
 }
 
 /*
  * The expensive part of numa migration is done from task_work context.
  * Triggered from task_tick_numa().
  */
-void task_numa_work(struct callback_head *work)
+void task_numa_placement_work(struct callback_head *work)
 {
-	long pages_total, pages_left, pages_changed;
-	unsigned long migrate, next_scan, now = jiffies;
-	unsigned long start0, start, end;
 	struct task_struct *p = current;
-	struct mm_struct *mm = p->mm;
-	struct vm_area_struct *vma;
 
-	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_placement_work));
 
 	work->next = work; /* protect against double add */
+
 	/*
 	 * Who cares about NUMA placement when they're dying.
 	 *
@@ -1328,6 +1308,29 @@ void task_numa_work(struct callback_head *work)
 	if (p->flags & PF_EXITING)
 		return;
 
+	task_numa_placement_tick(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_scan_work(struct callback_head *work)
+{
+	long pages_total, pages_left, pages_changed;
+	unsigned long migrate, next_scan, now = jiffies;
+	unsigned long start0, start, end;
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+	struct vm_area_struct *vma;
+
+	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_scan_work));
+
+	work->next = work; /* protect against double add */
+
+	if (p->flags & PF_EXITING)
+		return;
+
 	/*
 	 * Enforce maximal scan/migration frequency..
 	 */
@@ -1383,15 +1386,12 @@ out:
 /*
  * Drive the periodic memory faults..
  */
-void task_tick_numa(struct rq *rq, struct task_struct *curr)
+static void task_tick_numa_scan(struct rq *rq, struct task_struct *curr)
 {
-	struct callback_head *work = &curr->numa_work;
+	struct callback_head *work = &curr->numa_scan_work;
 	u64 period, now;
 
-	/*
-	 * We don't care about NUMA placement if we don't have memory.
-	 */
-	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+	if (work->next != work)
 		return;
 
 	/*
@@ -1403,28 +1403,67 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	now = curr->se.sum_exec_runtime;
 	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
 
-	if (now - curr->node_stamp > period) {
-		curr->node_stamp += period;
-		curr->numa_scan_period = sysctl_sched_numa_scan_period_min;
+	if (now - curr->node_stamp <= period)
+		return;
 
-		/*
-		 * We are comparing runtime to wall clock time here, which
-		 * puts a maximum scan frequency limit on the task work.
-		 *
-		 * This, together with the limits in task_numa_work() filters
-		 * us from over-sampling if there are many threads: if all
-		 * threads happen to come in at the same time we don't create a
-		 * spike in overhead.
-		 *
-		 * We also avoid multiple threads scanning at once in parallel to
-		 * each other.
-		 */
-		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
-			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
-			task_work_add(curr, work, true);
-		}
-	}
+	curr->node_stamp += period;
+	curr->numa_scan_period = sysctl_sched_numa_scan_period_min;
+
+	/*
+	 * We are comparing runtime to wall clock time here, which
+	 * puts a maximum scan frequency limit on the task work.
+	 *
+	 * This, together with the limits in task_numa_scan_work() filters
+	 * us from over-sampling if there are many threads: if all
+	 * threads happen to come in at the same time we don't create a
+	 * spike in overhead.
+	 *
+	 * We also avoid multiple threads scanning at once in parallel to
+	 * each other.
+	 */
+	if (time_before(jiffies, curr->mm->numa_next_scan))
+		return;
+
+	task_work_add(curr, work, true);
 }
+
+/*
+ * Drive the placement logic:
+ */
+static void task_tick_numa_placement(struct rq *rq, struct task_struct *curr)
+{
+	struct callback_head *work = &curr->numa_placement_work;
+	int seq;
+
+	if (work->next != work)
+		return;
+
+	/*
+	 * Check whether we should run task_numa_placement_tick(),
+	 * and if yes, activate the worklet:
+	 */
+	seq = ACCESS_ONCE(curr->mm->numa_scan_seq);
+
+	if (curr->numa_scan_seq == seq)
+		return;
+
+	curr->numa_scan_seq = seq;
+	task_work_add(curr, work, true);
+}
+
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+	/*
+	 * We don't care about NUMA placement if we don't have memory
+	 * or are exiting:
+	 */
+	if (!curr->mm || (curr->flags & PF_EXITING))
+		return;
+
+	task_tick_numa_scan(rq, curr);
+	task_tick_numa_placement(rq, curr);
+}
+
 #else /* !CONFIG_NUMA_BALANCING: */
 #ifdef CONFIG_SMP
 static inline int task_ideal_cpu(struct task_struct *p)				{ return -1; }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c4d15fd..f46405e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1261,6 +1261,11 @@ static inline u64 irq_time_read(int cpu)
 #endif /* CONFIG_64BIT */
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_scan_work(struct callback_head *work);
+extern void task_numa_placement_work(struct callback_head *work);
+#endif
+
 #ifdef CONFIG_SMP
 extern void sched_rebalance_to(int dest_cpu, int flip_tasks);
 #else
-- 
1.7.11.7

next prev parent reply	other threads:[~2012-12-02 18:45 UTC|newest]

Thread overview: 126+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-12-02 18:42 [PATCH 00/52] RFC: Unified NUMA balancing tree, v1 Ingo Molnar
2012-12-02 18:42 ` Ingo Molnar
2012-12-02 18:42 ` [PATCH 01/52] mm/compaction: Move migration fail/success stats to migrate.c Ingo Molnar
2012-12-02 18:42   ` Ingo Molnar
2012-12-02 18:42 ` [PATCH 02/52] mm/compaction: Add scanned and isolated counters for compaction Ingo Molnar
2012-12-02 18:42   ` Ingo Molnar
2012-12-02 18:42 ` [PATCH 03/52] mm/migrate: Add a tracepoint for migrate_pages Ingo Molnar
2012-12-02 18:42   ` Ingo Molnar
2012-12-02 18:42 ` [PATCH 04/52] mm/numa: define _PAGE_NUMA Ingo Molnar
2012-12-02 18:42   ` Ingo Molnar
2012-12-02 18:42 ` [PATCH 05/52] mm/numa: Add pte_numa() and pmd_numa() Ingo Molnar
2012-12-02 18:42   ` Ingo Molnar
2012-12-02 18:42 ` [PATCH 06/52] mm/numa: Support NUMA hinting page faults from gup/gup_fast Ingo Molnar
2012-12-02 18:42   ` Ingo Molnar
2012-12-02 18:42 ` [PATCH 07/52] mm/numa: split_huge_page: transfer the NUMA type from the pmd to the pte Ingo Molnar
2012-12-02 18:42   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 08/52] mm/numa: Create basic numa page hinting infrastructure Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 09/52] mm/mempolicy: Make MPOL_LOCAL a real policy Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 10/52] mm/mempolicy: Add MPOL_MF_NOOP Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 11/52] mm/mempolicy: Check for misplaced page Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 12/52] mm/migrate: Introduce migrate_misplaced_page() Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 13/52] mm/mempolicy: Use _PAGE_NUMA to migrate pages Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 14/52] mm/mempolicy: Add MPOL_MF_LAZY Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 15/52] mm/mempolicy: Implement change_prot_numa() in terms of change_protection() Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 16/52] mm/mempolicy: Hide MPOL_NOOP and MPOL_MF_LAZY from userspace for now Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 17/52] mm/numa: Add pte updates, hinting and migration stats Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 18/52] mm/numa: Migrate on reference policy Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-03 15:44   ` Mel Gorman
2012-12-03 15:44     ` Mel Gorman
2012-12-02 18:43 ` [PATCH 19/52] sched, numa, mm: Add last_cpu to page flags Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 20/52] mm, numa: Implement migrate-on-fault lazy NUMA strategy for regular and THP pages Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-05  0:55   ` David Rientjes
2012-12-05  0:55     ` David Rientjes
2012-12-05  9:43     ` Mel Gorman
2012-12-05  9:43       ` Mel Gorman
2012-12-02 18:43 ` [PATCH 21/52] sched: Make find_busiest_queue() a method Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 22/52] sched, numa, mm: Add credits for NUMA placement Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 23/52] sched, numa, mm: Describe the NUMA scheduling problem formally Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 24/52] sched: Add adaptive NUMA affinity support Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 25/52] sched, numa: Improve the CONFIG_NUMA_BALANCING help text Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 26/52] sched: Implement constant, per task Working Set Sampling (WSS) rate Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 27/52] sched, numa, mm: Count WS scanning against present PTEs, not virtual memory ranges Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 28/52] sched: Implement slow start for working set sampling Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 29/52] sched: Implement NUMA scanning backoff Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-03 19:55   ` Rik van Riel
2012-12-03 19:55     ` Rik van Riel
2012-12-02 18:43 ` [PATCH 30/52] sched: Improve convergence Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 31/52] sched: Introduce staged average NUMA faults Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 32/52] sched: Track groups of shared tasks Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-03 22:46   ` Rik van Riel
2012-12-03 22:46     ` Rik van Riel
2012-12-02 18:43 ` [PATCH 33/52] sched: Use the best-buddy 'ideal cpu' in balancing decisions Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 34/52] sched: Average the fault stats longer Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 35/52] sched: Use the ideal CPU to drive active balancing Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 36/52] sched: Add hysteresis to p->numa_shared Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 37/52] sched, numa, mm: Interleave shared tasks Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 38/52] sched, mm, mempolicy: Add per task mempolicy Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 39/52] sched: Track shared task's node groups and interleave their memory allocations Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 40/52] sched: Add "task flipping" support Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` Ingo Molnar [this message]
2012-12-02 18:43   ` [PATCH 41/52] sched: Move the NUMA placement logic to a worklet Ingo Molnar
2012-12-02 18:43 ` [PATCH 42/52] numa, mempolicy: Improve CONFIG_NUMA_BALANCING=y OOM behavior Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 43/52] sched: Introduce directed NUMA convergence Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 44/52] sched: Remove statistical NUMA scheduling Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 45/52] sched: Track quality and strength of convergence Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 46/52] sched: Converge NUMA migrations Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 47/52] sched: Add convergence strength based adaptive NUMA page fault rate Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 48/52] sched: Refine the 'shared tasks' memory interleaving logic Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 49/52] mm/rmap: Convert the struct anon_vma::mutex to an rwsem Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-04 14:43   ` Michel Lespinasse
2012-12-04 14:43     ` Michel Lespinasse
2012-12-02 18:43 ` [PATCH 50/52] mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 51/52] sched: Exclude pinned tasks from the NUMA-balancing logic Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-02 18:43 ` [PATCH 52/52] sched: Add RSS filter to NUMA-balancing Ingo Molnar
2012-12-02 18:43   ` Ingo Molnar
2012-12-03  5:09 ` [PATCH 00/52] RFC: Unified NUMA balancing tree, v1 Ingo Molnar
2012-12-03  5:09   ` Ingo Molnar
2012-12-03  9:25   ` [GIT] Unified NUMA balancing tree, v2 Ingo Molnar
2012-12-03  9:25     ` Ingo Molnar
2012-12-03 15:52 ` [PATCH 00/52] RFC: Unified NUMA balancing tree, v1 Rik van Riel
2012-12-03 15:52   ` Rik van Riel
2012-12-03 17:11   ` Ingo Molnar
2012-12-03 17:11     ` Ingo Molnar

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:696492e dfblob:ce9ccd7 dfblob:cad6c89 dfblob:05d4e1d
dfblob:f0d3876 dfblob:fda1b63 dfblob:c4d15fd dfblob:f46405e
dfblob:696492e dfblob:ce9ccd7 dfblob:cad6c89 dfblob:05d4e1d
dfblob:f0d3876 dfblob:fda1b63 dfblob:c4d15fd dfblob:f46405e )
 OR (
bs:"[PATCH 41/52] sched: Move the NUMA placement logic to a worklet" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1354473824-19229-42-git-send-email-mingo@kernel.org \
    --to=mingo@kernel.org \
    --cc=Lee.Schermerhorn@hp.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=aarcange@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=cl@linux.com \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mgorman@suse.de \
    --cc=pjt@google.com \
    --cc=riel@redhat.com \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.