From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx117.postini.com [74.125.245.117]) by kanga.kvack.org (Postfix) with SMTP id 185E66B003C for ; Tue, 30 Jul 2013 03:50:02 -0400 (EDT) Received: from /spool/local by e39.co.us.ibm.com with IBM ESMTP SMTP Gateway: Authorized Use Only! Violators will be prosecuted for from ; Tue, 30 Jul 2013 01:50:01 -0600 Received: from d01relay04.pok.ibm.com (d01relay04.pok.ibm.com [9.56.227.236]) by d01dlp01.pok.ibm.com (Postfix) with ESMTP id 4E0D538C8027 for ; Tue, 30 Jul 2013 03:49:57 -0400 (EDT) Received: from d01av03.pok.ibm.com (d01av03.pok.ibm.com [9.56.224.217]) by d01relay04.pok.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id r6U7nwU6197128 for ; Tue, 30 Jul 2013 03:49:58 -0400 Received: from d01av03.pok.ibm.com (loopback [127.0.0.1]) by d01av03.pok.ibm.com (8.14.4/8.13.1/NCO v10.0 AVout) with ESMTP id r6U7nuCO015788 for ; Tue, 30 Jul 2013 04:49:57 -0300 From: Srikar Dronamraju Subject: [RFC PATCH 09/10] sched: Choose a runqueue that has lesser local affinity tasks Date: Tue, 30 Jul 2013 13:18:24 +0530 Message-Id: <1375170505-5967-10-git-send-email-srikar@linux.vnet.ibm.com> In-Reply-To: <1375170505-5967-1-git-send-email-srikar@linux.vnet.ibm.com> References: <1375170505-5967-1-git-send-email-srikar@linux.vnet.ibm.com> Sender: owner-linux-mm@kvack.org List-ID: To: Mel Gorman , Peter Zijlstra , Ingo Molnar Cc: Andrea Arcangeli , Johannes Weiner , Linux-MM , LKML , Preeti U Murthy , Linus Torvalds , Srikar Dronamraju While migrating tasks to a different node, choosing the busiest runqueue may not always be the right choice. The busiest runqueue might have tasks that are already consolidated. Choosing such a runqueue might actually lead to more performance impact. Alternatively choose a runqueue that has less local numa affine tasks, i.e, tasks that benefit if run on a node other than their current node. The load balancer would then pitchin to move load from the busiest runqueue to the runqueue from where tasks for cross node migration were picked. So the load would end up being better consolidated. Signed-off-by: Srikar Dronamraju --- include/linux/sched.h | 2 + kernel/sched/fair.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched/sched.h | 1 + 3 files changed, 82 insertions(+), 3 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index ba188f1..c5d0a13 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1507,6 +1507,8 @@ struct task_struct { u64 node_stamp; /* migration stamp */ struct callback_head numa_work; int migrate_seq; + bool pinned_task; + bool local_task; #endif /* CONFIG_NUMA_BALANCING */ struct rcu_head rcu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a99aebc..e749650 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -805,6 +805,36 @@ static void task_numa_placement(struct task_struct *p) /* FIXME: Scheduling placement policy hints go here */ } +static void update_local_task_count(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + int curnode = cpu_to_node(cpu_of(rq)); + int cur_numa_weight = 0; + int total_numa_weight = 0; + + if (!p->pinned_task) { + if (p->mm && p->mm->numa_weights) { + cur_numa_weight = atomic_read(&p->mm->numa_weights[curnode]); + total_numa_weight = atomic_read(&p->mm->numa_weights[nr_node_ids]); + } + + /* + * Account tasks that are neither pinned nor have numa affinity as + * non local tasks. + */ + if (p->local_task != (cur_numa_weight * nr_node_ids > total_numa_weight)) { + if (!p->local_task) { + rq->non_local_task_count--; + p->local_task = true; + } else { + rq->non_local_task_count++; + p->local_task = false; + } + + } + } +} + /* * Got a PROT_NONE fault for a page on @node. */ @@ -826,6 +856,9 @@ void task_numa_fault(int node, int pages, bool migrated) p->numa_scan_period + jiffies_to_msecs(10)); task_numa_placement(p); + + /* Should this be moved to update_curr()? */ + update_local_task_count(p); } static void reset_ptenuma_scan(struct task_struct *p) @@ -996,16 +1029,31 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } +static void add_non_local_task_count(struct rq *rq, struct task_struct *p, + int value) +{ + if (p->pinned_task || p->local_task) + return; + else + rq->non_local_task_count += value; +} + static void account_numa_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) { struct mm_struct *mm = p->mm; struct rq *rq = rq_of(cfs_rq); int curnode = cpu_to_node(cpu_of(rq)); + int cur_numa_weight = 0; + int total_numa_weight = 0; if (mm && mm->numa_weights) { - atomic_read(&mm->numa_weights[curnode]); - atomic_read(&mm->numa_weights[nr_node_ids]); + cur_numa_weight = atomic_inc_return(&mm->numa_weights[curnode]); + total_numa_weight = atomic_inc_return(&mm->numa_weights[nr_node_ids]); } + + p->pinned_task = (p->nr_cpus_allowed == 1); + p->local_task = (cur_numa_weight * nr_node_ids > total_numa_weight); + add_non_local_task_count(rq, p, 1); } static void account_numa_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) @@ -1019,6 +1067,10 @@ static void account_numa_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) atomic_dec(&mm->numa_weights[curnode]); atomic_dec(&mm->numa_weights[nr_node_ids]); } + + add_non_local_task_count(rq, p, -1); + p->pinned_task = false; + p->local_task = false; } #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) @@ -5046,6 +5098,27 @@ find_busiest_group(struct lb_env *env, int *balance) return NULL; } +#ifdef CONFIG_NUMA_BALANCING +static struct rq *find_numa_queue(struct lb_env *env, + struct sched_group *group, struct rq *busy_rq) +{ + struct rq *rq; + int i; + + for_each_cpu(i, sched_group_cpus(group)) { + if (!cpumask_test_cpu(i, env->cpus)) + continue; + + rq = cpu_rq(i); + if (rq->nr_running > 1) { + if (rq->non_local_task_count > busy_rq->non_local_task_count) + busy_rq = rq; + } + } + return busy_rq; +} +#endif + /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ @@ -5187,8 +5260,11 @@ static int load_balance(int this_cpu, struct rq *this_rq, if (busiest->nr_running > 1) { #ifdef CONFIG_NUMA_BALANCING if (sd->flags & SD_NUMA) { - if (cpu_to_node(env.dst_cpu) != cpu_to_node(env.src_cpu)) + if (cpu_to_node(env.dst_cpu) != cpu_to_node(env.src_cpu)) { env.iterations = 0; + busiest = find_numa_queue(&env, group, busiest); + } + } #endif /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9f60d74..5e620b7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -486,6 +486,7 @@ struct rq { struct sched_avg avg; #ifdef CONFIG_NUMA_BALANCING struct task_struct *push_task; + unsigned int non_local_task_count; #endif }; -- 1.7.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org