From: Mel Gorman <mgorman@suse.de>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>,
Andrea Arcangeli <aarcange@redhat.com>,
Ingo Molnar <mingo@kernel.org>
Cc: Rik van Riel <riel@redhat.com>,
Johannes Weiner <hannes@cmpxchg.org>,
Hugh Dickins <hughd@google.com>,
Thomas Gleixner <tglx@linutronix.de>,
Linus Torvalds <torvalds@linux-foundation.org>,
Andrew Morton <akpm@linux-foundation.org>,
Linux-MM <linux-mm@kvack.org>,
LKML <linux-kernel@vger.kernel.org>, Mel Gorman <mgorman@suse.de>
Subject: [PATCH 39/43] sched: numa: Introduce per-mm and per-task structures
Date: Fri, 16 Nov 2012 11:22:49 +0000 [thread overview]
Message-ID: <1353064973-26082-40-git-send-email-mgorman@suse.de> (raw)
In-Reply-To: <1353064973-26082-1-git-send-email-mgorman@suse.de>
NOTE: This is heavily based on "autonuma: CPU follows memory algorithm"
and "autonuma: mm_autonuma and task_autonuma data structures"
At the most basic level, any placement policy is going to make some
sort of smart decision based on per-mm and per-task statistics. This
patch simply introduces the structures with basic fault statistics
that can be expaned upon or replaced later. It may be that a placement
policy can approximate without needing both structures in which case
they can be safely deleted later while still having a comparison point
to ensure the approximation is accurate.
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
include/linux/mm_types.h | 26 ++++++++++++++++++++++++++
include/linux/sched.h | 18 ++++++++++++++++++
kernel/fork.c | 18 ++++++++++++++++++
kernel/sched/core.c | 3 +++
kernel/sched/fair.c | 25 ++++++++++++++++++++++++-
kernel/sched/sched.h | 14 ++++++++++++++
6 files changed, 103 insertions(+), 1 deletion(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6b478ff..9588a91 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -312,6 +312,29 @@ struct mm_rss_stat {
atomic_long_t count[NR_MM_COUNTERS];
};
+#ifdef CONFIG_BALANCE_NUMA
+/*
+ * Per-mm structure that contains the NUMA memory placement statistics
+ * generated by pte_numa faults.
+ */
+struct mm_balancenuma {
+ /*
+ * Number of pages that will trigger NUMA faults for this mm. Total
+ * decays each time whether the home node should change to keep
+ * track only of recent events
+ */
+ unsigned long mm_numa_fault_tot;
+
+ /*
+ * Number of pages that will trigger NUMA faults for each [nid].
+ * Also decays.
+ */
+ unsigned long mm_numa_fault[0];
+
+ /* do not add more variables here, the above array size is dynamic */
+};
+#endif /* CONFIG_BALANCE_NUMA */
+
struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
struct rb_root mm_rb;
@@ -415,6 +438,9 @@ struct mm_struct {
/* numa_scan_seq prevents two threads setting pte_numa */
int numa_scan_seq;
+
+ /* this is used by the scheduler and the page allocator */
+ struct mm_balancenuma *mm_balancenuma;
#endif
struct uprobes_state uprobes_state;
};
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1cccfc3..7b6625a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1188,6 +1188,23 @@ enum perf_event_task_context {
perf_nr_task_contexts,
};
+#ifdef CONFIG_BALANCE_NUMA
+/*
+ * Per-task structure that contains the NUMA memory placement statistics
+ * generated by pte_numa faults. This structure is dynamically allocated
+ * when the first pte_numa fault is handled.
+ */
+struct task_balancenuma {
+ /* Total number of eligible pages that triggered NUMA faults */
+ unsigned long task_numa_fault_tot;
+
+ /* Number of pages that triggered NUMA faults for each [nid] */
+ unsigned long task_numa_fault[0];
+
+ /* do not add more variables here, the above array size is dynamic */
+};
+#endif /* CONFIG_BALANCE_NUMA */
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
@@ -1488,6 +1505,7 @@ struct task_struct {
unsigned int numa_scan_period;
u64 node_stamp; /* migration stamp */
struct callback_head numa_work;
+ struct task_balancenuma *task_balancenuma;
#endif /* CONFIG_BALANCE_NUMA */
struct rcu_head rcu;
diff --git a/kernel/fork.c b/kernel/fork.c
index 8b20ab7..c8752f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -525,6 +525,20 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}
+#ifdef CONFIG_BALANCE_NUMA
+static inline void free_mm_balancenuma(struct mm_struct *mm)
+{
+ if (mm->mm_balancenuma)
+ kfree(mm->mm_balancenuma);
+
+ mm->mm_balancenuma = NULL;
+}
+#else
+static inline void free_mm_balancenuma(struct mm_struct *mm)
+{
+}
+#endif
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
{
atomic_set(&mm->mm_users, 1);
@@ -539,6 +553,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
spin_lock_init(&mm->page_table_lock);
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
+ mm->mm_balancenuma = NULL;
mm_init_aio(mm);
mm_init_owner(mm, p);
@@ -548,6 +563,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
return mm;
}
+ free_mm_balancenuma(mm);
free_mm(mm);
return NULL;
}
@@ -597,6 +613,7 @@ void __mmdrop(struct mm_struct *mm)
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
+ free_mm_balancenuma(mm);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -854,6 +871,7 @@ fail_nocontext:
* If init_new_context() failed, we cannot use mmput() to free the mm
* because it calls destroy_context()
*/
+ free_mm_balancenuma(mm);
mm_free_pgd(mm);
free_mm(mm);
return NULL;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d9fc26..9472d5d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1543,6 +1543,7 @@ static void __sched_fork(struct task_struct *p)
p->node_stamp = 0ULL;
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+ p->task_balancenuma = NULL;
p->numa_scan_period = sysctl_balance_numa_scan_delay;
p->numa_work.next = &p->numa_work;
#endif /* CONFIG_BALANCE_NUMA */
@@ -1787,6 +1788,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
+ free_task_balancenuma(prev);
+
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 219158f..98c621c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -854,7 +854,30 @@ void task_numa_fault(int node, int pages)
{
struct task_struct *p = current;
- /* FIXME: Allocate task-specific structure for placement policy here */
+ if (!p->task_balancenuma) {
+ int size = sizeof(struct task_balancenuma) +
+ (sizeof(unsigned long) * nr_node_ids);
+ p->task_balancenuma = kzalloc(size, GFP_KERNEL);
+ if (!p->task_balancenuma)
+ return;
+ }
+
+ if (!p->mm->mm_balancenuma) {
+ int size = sizeof(struct mm_balancenuma) +
+ (sizeof(unsigned long) * nr_node_ids);
+ p->mm->mm_balancenuma = kzalloc(size, GFP_KERNEL);
+ if (!p->mm->mm_balancenuma) {
+ kfree(p->task_balancenuma);
+ p->task_balancenuma = NULL;
+ return;
+ }
+ }
+
+ /* Record fault statistics */
+ p->task_balancenuma->task_numa_fault_tot++;
+ p->task_balancenuma->task_numa_fault[node]++;
+ p->mm->mm_balancenuma->mm_numa_fault_tot++;
+ p->mm->mm_balancenuma->mm_numa_fault[node]++;
/*
* Assume that as faults occur that pages are getting properly placed
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3f0e5a1..92df3d4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -502,6 +502,20 @@ DECLARE_PER_CPU(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() (&__raw_get_cpu_var(runqueues))
+
+#ifdef CONFIG_BALANCE_NUMA
+static inline void free_task_balancenuma(struct task_struct *p)
+{
+ if (p->task_balancenuma)
+ kfree(p->task_balancenuma);
+ p->task_balancenuma = NULL;
+}
+#else
+static inline void free_task_balancenuma(struct task_struct *p)
+{
+}
+#endif /* CONFIG_BALANCE_NUMA */
+
#ifdef CONFIG_SMP
#define rcu_dereference_check_sched_domain(p) \
--
1.7.9.2
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2012-11-16 11:23 UTC|newest]
Thread overview: 62+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-11-16 11:22 [RFC PATCH 00/43] Automatic NUMA Balancing V3 Mel Gorman
2012-11-16 11:22 ` [PATCH 01/43] mm: compaction: Move migration fail/success stats to migrate.c Mel Gorman
2012-11-16 11:22 ` [PATCH 02/43] mm: migrate: Add a tracepoint for migrate_pages Mel Gorman
2012-11-16 11:22 ` [PATCH 03/43] mm: compaction: Add scanned and isolated counters for compaction Mel Gorman
2012-11-16 11:22 ` [PATCH 04/43] mm: numa: define _PAGE_NUMA Mel Gorman
2012-11-16 11:22 ` [PATCH 05/43] mm: numa: pte_numa() and pmd_numa() Mel Gorman
2012-11-16 11:22 ` [PATCH 06/43] mm: numa: Make pte_numa() and pmd_numa() a generic implementation Mel Gorman
2012-11-16 14:09 ` Rik van Riel
2012-11-16 14:41 ` Mel Gorman
2012-11-16 15:32 ` Linus Torvalds
2012-11-16 16:08 ` Ingo Molnar
2012-11-16 16:56 ` Mel Gorman
2012-11-16 17:12 ` Ingo Molnar
2012-11-16 17:48 ` Mel Gorman
2012-11-16 18:04 ` Ingo Molnar
2012-11-16 18:55 ` Mel Gorman
2012-11-16 17:26 ` Rik van Riel
2012-11-16 17:37 ` Ingo Molnar
2012-11-16 18:44 ` Mel Gorman
2012-11-16 16:19 ` Mel Gorman
2012-11-16 11:22 ` [PATCH 07/43] mm: numa: Support NUMA hinting page faults from gup/gup_fast Mel Gorman
2012-11-16 14:09 ` Rik van Riel
2012-11-16 11:22 ` [PATCH 08/43] mm: numa: split_huge_page: transfer the NUMA type from the pmd to the pte Mel Gorman
2012-11-16 11:22 ` [PATCH 09/43] mm: numa: Create basic numa page hinting infrastructure Mel Gorman
2012-11-16 11:22 ` [PATCH 10/43] mm: mempolicy: Make MPOL_LOCAL a real policy Mel Gorman
2012-11-16 11:22 ` [PATCH 11/43] mm: mempolicy: Add MPOL_MF_NOOP Mel Gorman
2012-11-16 11:22 ` [PATCH 12/43] mm: mempolicy: Check for misplaced page Mel Gorman
2012-11-16 11:22 ` [PATCH 13/43] mm: migrate: Introduce migrate_misplaced_page() Mel Gorman
2012-11-19 19:44 ` [tip:numa/core] mm/migration: Improve migrate_misplaced_page() tip-bot for Mel Gorman
2012-11-16 11:22 ` [PATCH 14/43] mm: mempolicy: Use _PAGE_NUMA to migrate pages Mel Gorman
2012-11-16 16:08 ` Rik van Riel
2012-11-16 11:22 ` [PATCH 15/43] mm: mempolicy: Add MPOL_MF_LAZY Mel Gorman
2012-11-16 11:22 ` [PATCH 16/43] mm: mempolicy: Hide MPOL_NOOP and MPOL_MF_LAZY from userspace for now Mel Gorman
2012-11-16 16:22 ` Rik van Riel
2012-11-16 11:22 ` [PATCH 17/43] sched, mm, x86: Add the ARCH_SUPPORTS_NUMA_BALANCING flag Mel Gorman
2012-11-16 11:22 ` [PATCH 18/43] mm: numa: Add fault driven placement and migration Mel Gorman
2012-11-16 11:22 ` [PATCH 19/43] mm: numa: Avoid double faulting after migrating misplaced page Mel Gorman
2012-11-16 11:22 ` [PATCH 20/43] mm: sched: numa: Implement constant, per task Working Set Sampling (WSS) rate Mel Gorman
2012-11-16 11:22 ` [PATCH 21/43] sched, numa, mm: Count WS scanning against present PTEs, not virtual memory ranges Mel Gorman
2012-11-16 11:22 ` [PATCH 22/43] mm: sched: numa: Implement slow start for working set sampling Mel Gorman
2012-11-16 11:22 ` [PATCH 23/43] mm: numa: Add pte updates, hinting and migration stats Mel Gorman
2012-11-16 11:22 ` [PATCH 24/43] mm: numa: Migrate on reference policy Mel Gorman
2012-11-16 11:22 ` [PATCH 25/43] mm: numa: Migrate pages handled during a pmd_numa hinting fault Mel Gorman
2012-11-16 11:22 ` [PATCH 26/43] mm: numa: Only mark a PMD pmd_numa if the pages are all on the same node Mel Gorman
2012-11-16 11:22 ` [PATCH 27/43] mm: numa: Structures for Migrate On Fault per NUMA migration rate limiting Mel Gorman
2012-11-16 11:22 ` [PATCH 28/43] mm: numa: Rate limit the amount of memory that is migrated between nodes Mel Gorman
2012-11-16 11:22 ` [PATCH 29/43] mm: numa: Rate limit setting of pte_numa if node is saturated Mel Gorman
2012-11-16 11:22 ` [PATCH 30/43] sched: numa: Slowly increase the scanning period as NUMA faults are handled Mel Gorman
2012-11-16 11:22 ` [PATCH 31/43] mm: numa: Introduce last_nid to the page frame Mel Gorman
2012-11-16 11:22 ` [PATCH 32/43] mm: numa: Use a two-stage filter to restrict pages being migrated for unlikely task<->node relationships Mel Gorman
2012-11-16 11:22 ` [PATCH 33/43] x86: mm: only do a local tlb flush in ptep_set_access_flags() Mel Gorman
2012-11-16 11:22 ` [PATCH 34/43] x86: mm: drop TLB flush from ptep_set_access_flags Mel Gorman
2012-11-16 11:22 ` [PATCH 35/43] mm,generic: only flush the local TLB in ptep_set_access_flags Mel Gorman
2012-11-16 11:22 ` [PATCH 36/43] sched: numa: Introduce tsk_home_node() Mel Gorman
2012-11-16 11:22 ` [PATCH 37/43] sched: numa: Make find_busiest_queue() a method Mel Gorman
2012-11-16 11:22 ` [PATCH 38/43] sched: numa: Implement home-node awareness Mel Gorman
2012-11-16 11:22 ` Mel Gorman [this message]
2012-11-16 11:22 ` [PATCH 40/43] sched: numa: CPU follows memory Mel Gorman
2012-11-16 11:22 ` [PATCH 41/43] sched: numa: Rename mempolicy to HOME Mel Gorman
2012-11-16 11:22 ` [PATCH 42/43] sched: numa: Consider only one CPU per node for CPU-follows-memory Mel Gorman
2012-11-16 11:22 ` [PATCH 43/43] sched: numa: Increase and decrease a tasks scanning period based on task fault statistics Mel Gorman
2012-11-16 14:56 ` [RFC PATCH 00/43] Automatic NUMA Balancing V3 Mel Gorman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1353064973-26082-40-git-send-email-mgorman@suse.de \
--to=mgorman@suse.de \
--cc=a.p.zijlstra@chello.nl \
--cc=aarcange@redhat.com \
--cc=akpm@linux-foundation.org \
--cc=hannes@cmpxchg.org \
--cc=hughd@google.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mingo@kernel.org \
--cc=riel@redhat.com \
--cc=tglx@linutronix.de \
--cc=torvalds@linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).