From: Mel Gorman <mgorman@suse.de>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Rik van Riel <riel@redhat.com>,
Andrea Arcangeli <aarcange@redhat.com>,
Johannes Weiner <hannes@cmpxchg.org>,
Thomas Gleixner <tglx@linutronix.de>,
Linus Torvalds <torvalds@linux-foundation.org>,
Andrew Morton <akpm@linux-foundation.org>,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
Ingo Molnar <mingo@kernel.org>
Subject: Re: [PATCH 26/31] sched, numa, mm: Add fault driven placement and migration policy
Date: Thu, 1 Nov 2012 15:40:28 +0000 [thread overview]
Message-ID: <20121101154027.GC3888@suse.de> (raw)
In-Reply-To: <20121025124834.467791319@chello.nl>
On Thu, Oct 25, 2012 at 02:16:43PM +0200, Peter Zijlstra wrote:
> As per the problem/design document Documentation/scheduler/numa-problem.txt
> implement 3ac & 4.
>
> ( A pure 3a was found too unstable, I did briefly try 3bc
> but found no significant improvement. )
>
> Implement a per-task memory placement scheme relying on a regular
> PROT_NONE 'migration' fault to scan the memory space of the process,
> and using a two-stage migration scheme to reduce the influence of
> unlikely usage relations.
>
> It relies on the assumption that the compute part is tied to a
> particular task and builds a task<->page relation set to model the
> compute<->data relation.
>
> In the previous patch we made memory migrate towards where the task
> is running, here we select the node on which most memory is located
> as the preferred node to run on.
>
> This creates a feed-back control loop between trying to schedule a
> task on a node and migrating memory towards the node the task is
> scheduled on.
>
Ok.
> Suggested-by: Andrea Arcangeli <aarcange@redhat.com>
> Suggested-by: Rik van Riel <riel@redhat.com>
> Fixes-by: David Rientjes <rientjes@google.com>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Signed-off-by: Ingo Molnar <mingo@kernel.org>
> ---
> include/linux/mm_types.h | 4 +
> include/linux/sched.h | 35 +++++++--
> kernel/sched/core.c | 16 ++++
> kernel/sched/fair.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++
> kernel/sched/features.h | 1
> kernel/sched/sched.h | 31 +++++---
> kernel/sysctl.c | 31 +++++++-
> mm/huge_memory.c | 7 +
> mm/memory.c | 4 -
> 9 files changed, 282 insertions(+), 22 deletions(-)
> Index: tip/include/linux/mm_types.h
> ===================================================================
> --- tip.orig/include/linux/mm_types.h
> +++ tip/include/linux/mm_types.h
> @@ -403,6 +403,10 @@ struct mm_struct {
> #ifdef CONFIG_CPUMASK_OFFSTACK
> struct cpumask cpumask_allocation;
> #endif
> +#ifdef CONFIG_SCHED_NUMA
> + unsigned long numa_next_scan;
comment.
> + int numa_scan_seq;
comment! At least the other one is easy to guess. This thing looks like
it's preventing multiple threads in a process address space from scanning
and updating PTEs at the same time. Effectively it's a type of barrier,
but without a comment I'm not sure if what it's doing is what you expect
it to be doing or something else entirely.
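Something like the below is roughly what I'd expect those comments to
say, assuming I've read task_numa_work() and task_numa_placement()
correctly; the wording is mine, so treat it as a guess at the intent
rather than documentation:

#ifdef CONFIG_SCHED_NUMA
	/*
	 * numa_next_scan: jiffies timestamp before which no new PTE scan
	 * pass should start; task_numa_work() advances it with cmpxchg so
	 * that only one thread in the address space starts each pass.
	 */
	unsigned long numa_next_scan;
	/*
	 * numa_scan_seq: incremented once per scan pass; each task compares
	 * it against a private copy so placement runs at most once per
	 * pass per task.
	 */
	int numa_scan_seq;
#endif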
> +#endif
> struct uprobes_state uprobes_state;
> };
>
> Index: tip/include/linux/sched.h
> ===================================================================
> --- tip.orig/include/linux/sched.h
> +++ tip/include/linux/sched.h
> @@ -1481,9 +1481,16 @@ struct task_struct {
> short pref_node_fork;
> #endif
> #ifdef CONFIG_SCHED_NUMA
> - int node;
> + int node; /* task home node */
> + int numa_scan_seq;
> + int numa_migrate_seq;
> + unsigned int numa_scan_period;
> + u64 node_stamp; /* migration stamp */
> unsigned long numa_contrib;
> -#endif
> + unsigned long *numa_faults;
> + struct callback_head numa_work;
> +#endif /* CONFIG_SCHED_NUMA */
> +
> struct rcu_head rcu;
>
> /*
> @@ -1558,15 +1565,24 @@ struct task_struct {
> /* Future-safe accessor for struct task_struct's cpus_allowed. */
> #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
>
> +#ifdef CONFIG_SCHED_NUMA
> static inline int tsk_home_node(struct task_struct *p)
> {
> -#ifdef CONFIG_SCHED_NUMA
> return p->node;
> +}
> +
> +extern void task_numa_fault(int node, int pages);
> #else
> +static inline int tsk_home_node(struct task_struct *p)
> +{
> return -1;
> -#endif
> }
>
> +static inline void task_numa_fault(int node, int pages)
> +{
> +}
> +#endif /* CONFIG_SCHED_NUMA */
> +
> /*
> * Priority of a process goes from 0..MAX_PRIO-1, valid RT
> * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
> @@ -2004,6 +2020,10 @@ enum sched_tunable_scaling {
> };
> extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
>
> +extern unsigned int sysctl_sched_numa_scan_period_min;
> +extern unsigned int sysctl_sched_numa_scan_period_max;
> +extern unsigned int sysctl_sched_numa_settle_count;
> +
> #ifdef CONFIG_SCHED_DEBUG
> extern unsigned int sysctl_sched_migration_cost;
> extern unsigned int sysctl_sched_nr_migrate;
> @@ -2014,18 +2034,17 @@ extern unsigned int sysctl_sched_shares_
> int sched_proc_update_handler(struct ctl_table *table, int write,
> void __user *buffer, size_t *length,
> loff_t *ppos);
> -#endif
> -#ifdef CONFIG_SCHED_DEBUG
> +
> static inline unsigned int get_sysctl_timer_migration(void)
> {
> return sysctl_timer_migration;
> }
> -#else
> +#else /* CONFIG_SCHED_DEBUG */
> static inline unsigned int get_sysctl_timer_migration(void)
> {
> return 1;
> }
> -#endif
> +#endif /* CONFIG_SCHED_DEBUG */
> extern unsigned int sysctl_sched_rt_period;
> extern int sysctl_sched_rt_runtime;
>
> Index: tip/kernel/sched/core.c
> ===================================================================
> --- tip.orig/kernel/sched/core.c
> +++ tip/kernel/sched/core.c
> @@ -1533,6 +1533,21 @@ static void __sched_fork(struct task_str
> #ifdef CONFIG_PREEMPT_NOTIFIERS
> INIT_HLIST_HEAD(&p->preempt_notifiers);
> #endif
> +
> +#ifdef CONFIG_SCHED_NUMA
> + if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
> + p->mm->numa_next_scan = jiffies;
> + p->mm->numa_scan_seq = 0;
> + }
> +
> + p->node = -1;
> + p->node_stamp = 0ULL;
> + p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
> + p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
> + p->numa_faults = NULL;
> + p->numa_scan_period = sysctl_sched_numa_scan_period_min;
> + p->numa_work.next = &p->numa_work;
> +#endif /* CONFIG_SCHED_NUMA */
> }
>
> /*
> @@ -1774,6 +1789,7 @@ static void finish_task_switch(struct rq
> if (mm)
> mmdrop(mm);
> if (unlikely(prev_state == TASK_DEAD)) {
> + task_numa_free(prev);
> /*
> * Remove function-return probe instances associated with this
> * task and put them back on the free list.
> Index: tip/kernel/sched/fair.c
> ===================================================================
> --- tip.orig/kernel/sched/fair.c
> +++ tip/kernel/sched/fair.c
> @@ -27,6 +27,8 @@
> #include <linux/profile.h>
> #include <linux/interrupt.h>
> #include <linux/random.h>
> +#include <linux/mempolicy.h>
> +#include <linux/task_work.h>
>
> #include <trace/events/sched.h>
>
> @@ -775,6 +777,21 @@ update_stats_curr_start(struct cfs_rq *c
>
> /**************************************************
> * Scheduling class numa methods.
> + *
> + * The purpose of the NUMA bits is to maintain compute (task) and data
> + * (memory) locality. We try and achieve this by making tasks stick to
> + * a particular node (their home node) but if fairness mandates they run
> + * elsewhere for long enough, we let the memory follow them.
> + *
> + * Tasks start out with their home-node unset (-1); this effectively means
> + * they act !NUMA until we've established the task is busy enough to bother
> + * with placement.
> + *
> + * We keep a home-node per task and use periodic fault scans to try and
> + * establish a task<->page relation. This assumes the task<->page relation
> + * is a compute<->data relation; this is false for things like virt. and
> + * n:m threading solutions but it's the best we can do given the
> + * information we have.
> */
>
> #ifdef CONFIG_SMP
> @@ -805,6 +822,157 @@ static void account_numa_dequeue(struct
> } else
> rq->onnode_running--;
> }
> +
> +/*
> + * numa task sample period in ms: 5s
> + */
> +unsigned int sysctl_sched_numa_scan_period_min = 5000;
> +unsigned int sysctl_sched_numa_scan_period_max = 5000*16;
> +
> +/*
> + * Wait for the 2-sample stuff to settle before migrating again
> + */
> +unsigned int sysctl_sched_numa_settle_count = 2;
> +
> +static void task_numa_placement(struct task_struct *p)
> +{
> + unsigned long faults, max_faults = 0;
> + int node, max_node = -1;
> + int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
> +
> + if (p->numa_scan_seq == seq)
> + return;
> +
> + p->numa_scan_seq = seq;
> +
> + for (node = 0; node < nr_node_ids; node++) {
> + faults = p->numa_faults[node];
> +
> + if (faults > max_faults) {
> + max_faults = faults;
> + max_node = node;
> + }
> +
> + p->numa_faults[node] /= 2;
> + }
No comments explaining the logic behind the decaying average. It can be
inferred if someone reads Documentation/scheduler/numa-problem.txt, point
3c, carefully enough. At the very least, point them at it.
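For the record, my reading of the loop above, as a sketch of the intent
rather than a quote from the document:

	/*
	 * Each scan pass halves the accumulated per-node count before new
	 * faults are added on top, so a fault observed k passes ago only
	 * contributes with weight 1/2^k:
	 *
	 *	faults[node] ~= f_0 + f_1/2 + f_2/4 + ... + f_k/2^k
	 *
	 * where f_k is the number of faulting pages recorded k passes ago.
	 * That is an exponentially decaying average biased towards recent
	 * accesses.
	 */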
> +
> + if (max_node == -1)
> + return;
> +
> + if (p->node != max_node) {
> + p->numa_scan_period = sysctl_sched_numa_scan_period_min;
> + if (sched_feat(NUMA_SETTLE) &&
> + (seq - p->numa_migrate_seq) <= (int)sysctl_sched_numa_settle_count)
> + return;
> + p->numa_migrate_seq = seq;
> + sched_setnode(p, max_node);
Ok, so at a guess, even if we do ping-pong between nodes, a migration
will only take effect every 10 seconds or so, which could be far worse.
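To spell out where the 10 seconds comes from, this is my arithmetic from
the defaults in this patch, so treat it as a guess at the intended
behaviour:

	next_scan = now + 2*msecs_to_jiffies(sysctl_sched_numa_scan_period_min);

	/*
	 * With sysctl_sched_numa_scan_period_min = 5000ms, a new scan
	 * sequence starts at most once every 2 * 5000ms = 10 seconds, and
	 * NUMA_SETTLE then holds a freshly migrated task on its node for
	 * sysctl_sched_numa_settle_count (2) sequences on top of that.
	 */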
> + } else {
> + p->numa_scan_period = min(sysctl_sched_numa_scan_period_max,
> + p->numa_scan_period * 2);
> + }
> +}
> +
> +/*
> + * Got a PROT_NONE fault for a page on @node.
> + */
> +void task_numa_fault(int node, int pages)
> +{
> + struct task_struct *p = current;
> +
> + if (unlikely(!p->numa_faults)) {
> + int size = sizeof(unsigned long) * nr_node_ids;
> +
> + p->numa_faults = kzalloc(size, GFP_KERNEL);
> + if (!p->numa_faults)
> + return;
> + }
> +
On a maximally configured machine this will be an order-4 allocation,
and you need at least 512 nodes before it's even an order-1 allocation.
As unlikely as a failure is, should this be __GFP_NOWARN?
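i.e. just a sketch of what I'm suggesting, untested:

	p->numa_faults = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
	if (!p->numa_faults)
		return;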
> + task_numa_placement(p);
> +
> + p->numa_faults[node] += pages;
> +}
> +
> +/*
> + * The expensive part of numa migration is done from task_work context.
> + * Triggered from task_tick_numa().
> + */
> +void task_numa_work(struct callback_head *work)
> +{
> + unsigned long migrate, next_scan, now = jiffies;
> + struct task_struct *p = current;
> + struct mm_struct *mm = p->mm;
> +
> + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
> +
> + work->next = work; /* protect against double add */
> + /*
> + * Who cares about NUMA placement when they're dying.
> + *
> + * NOTE: make sure not to dereference p->mm before this check,
> + * exit_task_work() happens _after_ exit_mm() so we could be called
> + * without p->mm even though we still had it when we enqueued this
> + * work.
> + */
> + if (p->flags & PF_EXITING)
> + return;
> +
> + /*
> + * Enforce maximal scan/migration frequency..
> + */
> + migrate = mm->numa_next_scan;
> + if (time_before(now, migrate))
> + return;
> +
> + next_scan = now + 2*msecs_to_jiffies(sysctl_sched_numa_scan_period_min);
> + if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
> + return;
> +
> + ACCESS_ONCE(mm->numa_scan_seq)++;
> + {
> + struct vm_area_struct *vma;
> +
> + down_write(&mm->mmap_sem);
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> + if (!vma_migratable(vma))
> + continue;
> + change_protection(vma, vma->vm_start, vma->vm_end, vma_prot_none(vma), 0);
> + }
> + up_write(&mm->mmap_sem);
> + }
> +}
Ok, I like the idea of the scanning cost being incurred by the process.
I was going to complain that, for very large processes, the length of
time it takes to complete this scan could be considerable. However, a
quick glance forward indicates that you cope with this problem later by
limiting how much is scanned each time.
> +
> +/*
> + * Drive the periodic memory faults..
> + */
> +void task_tick_numa(struct rq *rq, struct task_struct *curr)
> +{
> + struct callback_head *work = &curr->numa_work;
> + u64 period, now;
> +
> + /*
> + * We don't care about NUMA placement if we don't have memory.
> + */
> + if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
> + return;
> +
> + /*
> + * Using runtime rather than walltime has the dual advantage that
> + * we (mostly) drive the selection from busy threads and that the
> + * task needs to have done some actual work before we bother with
> + * NUMA placement.
> + */
Makes sense.
> + now = curr->se.sum_exec_runtime;
> + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
> +
> + if (now - curr->node_stamp > period) {
> + curr->node_stamp = now;
> +
> + if (!time_before(jiffies, curr->mm->numa_next_scan)) {
> + init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
> + task_work_add(curr, work, true);
> + }
> + }
> +}
> #else
> #ifdef CONFIG_SMP
> static struct list_head *account_numa_enqueue(struct rq *rq, struct task_struct *p)
> @@ -816,6 +984,10 @@ static struct list_head *account_numa_en
> static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
> {
> }
> +
> +static void task_tick_numa(struct rq *rq, struct task_struct *curr)
> +{
> +}
> #endif /* CONFIG_SCHED_NUMA */
>
> /**************************************************
> @@ -5265,6 +5437,9 @@ static void task_tick_fair(struct rq *rq
> cfs_rq = cfs_rq_of(se);
> entity_tick(cfs_rq, se, queued);
> }
> +
> + if (sched_feat_numa(NUMA))
> + task_tick_numa(rq, curr);
> }
>
> /*
> Index: tip/kernel/sched/features.h
> ===================================================================
> --- tip.orig/kernel/sched/features.h
> +++ tip/kernel/sched/features.h
> @@ -69,5 +69,6 @@ SCHED_FEAT(NUMA_TTWU_BIAS, false)
> SCHED_FEAT(NUMA_TTWU_TO, false)
> SCHED_FEAT(NUMA_PULL, true)
> SCHED_FEAT(NUMA_PULL_BIAS, true)
> +SCHED_FEAT(NUMA_SETTLE, true)
> #endif
>
> Index: tip/kernel/sched/sched.h
> ===================================================================
> --- tip.orig/kernel/sched/sched.h
> +++ tip/kernel/sched/sched.h
> @@ -3,6 +3,7 @@
> #include <linux/mutex.h>
> #include <linux/spinlock.h>
> #include <linux/stop_machine.h>
> +#include <linux/slab.h>
>
> #include "cpupri.h"
>
> @@ -476,15 +477,6 @@ struct rq {
> #endif
> };
>
> -static inline struct list_head *offnode_tasks(struct rq *rq)
> -{
> -#ifdef CONFIG_SCHED_NUMA
> - return &rq->offnode_tasks;
> -#else
> - return NULL;
> -#endif
> -}
> -
> static inline int cpu_of(struct rq *rq)
> {
> #ifdef CONFIG_SMP
> @@ -502,6 +494,27 @@ DECLARE_PER_CPU(struct rq, runqueues);
> #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
> #define raw_rq() (&__raw_get_cpu_var(runqueues))
>
> +#ifdef CONFIG_SCHED_NUMA
> +static inline struct list_head *offnode_tasks(struct rq *rq)
> +{
> + return &rq->offnode_tasks;
> +}
> +
> +static inline void task_numa_free(struct task_struct *p)
> +{
> + kfree(p->numa_faults);
> +}
> +#else /* CONFIG_SCHED_NUMA */
> +static inline struct list_head *offnode_tasks(struct rq *rq)
> +{
> + return NULL;
> +}
> +
> +static inline void task_numa_free(struct task_struct *p)
> +{
> +}
> +#endif /* CONFIG_SCHED_NUMA */
> +
> #ifdef CONFIG_SMP
>
> #define rcu_dereference_check_sched_domain(p) \
> Index: tip/kernel/sysctl.c
> ===================================================================
> --- tip.orig/kernel/sysctl.c
> +++ tip/kernel/sysctl.c
> @@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 10
> static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
> static int min_wakeup_granularity_ns; /* 0 usecs */
> static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
> +#ifdef CONFIG_SMP
> static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
> static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
> -#endif
> +#endif /* CONFIG_SMP */
> +#endif /* CONFIG_SCHED_DEBUG */
>
> #ifdef CONFIG_COMPACTION
> static int min_extfrag_threshold;
> @@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
> .extra1 = &min_wakeup_granularity_ns,
> .extra2 = &max_wakeup_granularity_ns,
> },
> +#ifdef CONFIG_SMP
> {
> .procname = "sched_tunable_scaling",
> .data = &sysctl_sched_tunable_scaling,
> @@ -347,7 +350,31 @@ static struct ctl_table kern_table[] = {
> .extra1 = &zero,
> .extra2 = &one,
> },
> -#endif
> +#endif /* CONFIG_SMP */
> +#ifdef CONFIG_SCHED_NUMA
> + {
> + .procname = "sched_numa_scan_period_min_ms",
> + .data = &sysctl_sched_numa_scan_period_min,
> + .maxlen = sizeof(unsigned int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec,
> + },
> + {
> + .procname = "sched_numa_scan_period_max_ms",
> + .data = &sysctl_sched_numa_scan_period_max,
> + .maxlen = sizeof(unsigned int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec,
> + },
> + {
> + .procname = "sched_numa_settle_count",
> + .data = &sysctl_sched_numa_settle_count,
> + .maxlen = sizeof(unsigned int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec,
> + },
> +#endif /* CONFIG_SCHED_NUMA */
> +#endif /* CONFIG_SCHED_DEBUG */
> {
> .procname = "sched_rt_period_us",
> .data = &sysctl_sched_rt_period,
> Index: tip/mm/huge_memory.c
> ===================================================================
> --- tip.orig/mm/huge_memory.c
> +++ tip/mm/huge_memory.c
> @@ -774,9 +774,10 @@ fixup:
>
> unlock:
> spin_unlock(&mm->page_table_lock);
> - if (page)
> + if (page) {
> + task_numa_fault(page_to_nid(page), HPAGE_PMD_NR);
> put_page(page);
> -
> + }
> return;
>
> migrate:
> @@ -845,6 +846,8 @@ migrate:
>
> put_page(page); /* Drop the rmap reference */
>
> + task_numa_fault(node, HPAGE_PMD_NR);
> +
> if (lru)
> put_page(page); /* drop the LRU isolation reference */
>
> Index: tip/mm/memory.c
> ===================================================================
> --- tip.orig/mm/memory.c
> +++ tip/mm/memory.c
> @@ -3512,8 +3512,10 @@ out_pte_upgrade_unlock:
> out_unlock:
> pte_unmap_unlock(ptep, ptl);
> out:
> - if (page)
> + if (page) {
> + task_numa_fault(page_nid, 1);
> put_page(page);
> + }
>
> return 0;
>
>
>
--
Mel Gorman
SUSE Labs