* [RFC PATCH 1/3] sched: create sched_balancer container for sched_groups
2008-05-15 18:35 [RFC PATCH 0/3] sched: core-balancer v2 Gregory Haskins
@ 2008-05-15 18:35 ` Gregory Haskins
2008-05-15 18:35 ` [RFC PATCH 2/3] sched: pass specific sched_balancer to functions Gregory Haskins
2008-05-15 18:35 ` [RFC PATCH 3/3] sched: add a per-core balancer group Gregory Haskins
2 siblings, 0 replies; 4+ messages in thread
From: Gregory Haskins @ 2008-05-15 18:35 UTC (permalink / raw)
To: Ingo Molnar, Peter Zijlstra
Cc: Suresh Sidda, Srivatsa Vaddagiri, linux-kernel, Gregory Haskins
We want to add multiple sched_group balancing domains per sched_domains
(later in the series), so we split up the current logic to make it more
fine grained. This patch does not alter the general logic at all.
However, it does move the initialization of some of the fields to runtime
instead of declared statically in topology.h
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
include/asm-ia64/topology.h | 6 --
include/asm-mips/mach-ip27/topology.h | 3 -
include/asm-powerpc/topology.h | 3 -
include/asm-sh/topology.h | 3 -
include/asm-sparc64/topology.h | 2 -
include/asm-x86/topology.h | 2 -
include/linux/sched.h | 15 +++-
include/linux/topology.h | 8 --
kernel/sched.c | 125 +++++++++++++++++++++------------
9 files changed, 89 insertions(+), 78 deletions(-)
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index 32863b3..6409a70 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -75,9 +75,6 @@ void build_cpu_to_node_map(void);
| SD_BALANCE_NEWIDLE \
| SD_BALANCE_EXEC \
| SD_WAKE_AFFINE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
}
/* sched_domains SD_NODE_INIT for IA64 NUMA machines */
@@ -101,9 +98,6 @@ void build_cpu_to_node_map(void);
| SD_BALANCE_FORK \
| SD_SERIALIZE \
| SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 64, \
- .nr_balance_failed = 0, \
}
#endif /* CONFIG_NUMA */
diff --git a/include/asm-mips/mach-ip27/topology.h b/include/asm-mips/mach-ip27/topology.h
index 7785bec..583c0c6 100644
--- a/include/asm-mips/mach-ip27/topology.h
+++ b/include/asm-mips/mach-ip27/topology.h
@@ -49,9 +49,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
.flags = SD_LOAD_BALANCE \
| SD_BALANCE_EXEC \
| SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
}
#include <asm-generic/topology.h>
diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h
index 100c6fb..182ecb5 100644
--- a/include/asm-powerpc/topology.h
+++ b/include/asm-powerpc/topology.h
@@ -67,9 +67,6 @@ static inline int pcibus_to_node(struct pci_bus *bus)
| SD_WAKE_IDLE \
| SD_SERIALIZE \
| SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
}
extern void __init dump_numa_cpu_topology(void);
diff --git a/include/asm-sh/topology.h b/include/asm-sh/topology.h
index 34cdb28..53e090d 100644
--- a/include/asm-sh/topology.h
+++ b/include/asm-sh/topology.h
@@ -24,9 +24,6 @@
| SD_BALANCE_EXEC \
| SD_SERIALIZE \
| SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
}
#endif
diff --git a/include/asm-sparc64/topology.h b/include/asm-sparc64/topology.h
index 001c040..f9f6c2e 100644
--- a/include/asm-sparc64/topology.h
+++ b/include/asm-sparc64/topology.h
@@ -62,8 +62,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
| SD_BALANCE_EXEC \
| SD_SERIALIZE \
| SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
}
#else /* CONFIG_NUMA */
diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h
index 1f97758..51329c9 100644
--- a/include/asm-x86/topology.h
+++ b/include/asm-x86/topology.h
@@ -166,8 +166,6 @@ extern unsigned long node_remap_size[];
| SD_BALANCE_FORK \
| SD_SERIALIZE \
| SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
}
#ifdef CONFIG_X86_64_ACPI_NUMA
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3882650..95e46e3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -762,11 +762,18 @@ struct sched_domain_attr {
.relax_domain_level = -1, \
}
+struct sched_balancer {
+ struct sched_group *groups;
+ unsigned long last_exec; /* init to jiffies. units in jiffies */
+ unsigned long interval; /* initialise to 1. units in ms. */
+ unsigned long *interval_reset; /* value to reset interval to */
+ unsigned int nr_failed; /* initialise to 0 */
+};
+
struct sched_domain {
/* These fields must be setup */
struct sched_domain *parent; /* top domain must be null terminated */
struct sched_domain *child; /* bottom domain must be null terminated */
- struct sched_group *groups; /* the balancing groups of the domain */
cpumask_t span; /* span of all CPUs in this domain */
int first_cpu; /* cache of the first cpu in this domain */
unsigned long min_interval; /* Minimum balance interval ms */
@@ -782,10 +789,8 @@ struct sched_domain {
int flags; /* See SD_* */
enum sched_domain_level level;
- /* Runtime fields. */
- unsigned long last_balance; /* init to jiffies. units in jiffies */
- unsigned int balance_interval; /* initialise to 1. units in ms. */
- unsigned int nr_balance_failed; /* initialise to 0 */
+ /* Balancer data */
+ struct sched_balancer group_balancer;
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 4bb7074..8e60655 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -101,8 +101,6 @@ void arch_update_cpu_topology(void);
| SD_WAKE_AFFINE \
| SD_WAKE_IDLE \
| SD_SHARE_CPUPOWER, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
}
#endif
#endif /* CONFIG_SCHED_SMT */
@@ -126,8 +124,6 @@ void arch_update_cpu_topology(void);
| SD_WAKE_AFFINE \
| SD_SHARE_PKG_RESOURCES\
| BALANCE_FOR_MC_POWER, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
}
#endif
#endif /* CONFIG_SCHED_MC */
@@ -151,8 +147,6 @@ void arch_update_cpu_topology(void);
| SD_BALANCE_EXEC \
| SD_WAKE_AFFINE \
| BALANCE_FOR_PKG_POWER,\
- .last_balance = jiffies, \
- .balance_interval = 1, \
}
#endif
@@ -167,8 +161,6 @@ void arch_update_cpu_topology(void);
.idle_idx = 3, \
.flags = SD_LOAD_BALANCE \
| SD_SERIALIZE, \
- .last_balance = jiffies, \
- .balance_interval = 64, \
}
#ifdef CONFIG_NUMA
diff --git a/kernel/sched.c b/kernel/sched.c
index 6947d6b..a8e0bd3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2282,7 +2282,9 @@ static unsigned long cpu_avg_load_per_task(int cpu)
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
- struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+ struct sched_balancer *balancer = &sd->group_balancer;
+ struct sched_group *first_group = balancer->groups;
+ struct sched_group *idlest = NULL, *this = NULL, *group = first_group;
unsigned long min_load = ULONG_MAX, this_load = 0;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
@@ -2322,7 +2324,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
min_load = avg_load;
idlest = group;
}
- } while (group = group->next, group != sd->groups);
+ } while (group = group->next, group != first_group);
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
@@ -3129,7 +3131,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
*/
if (!task_hot(p, rq->clock, sd) ||
- sd->nr_balance_failed > sd->cache_nice_tries) {
+ sd->group_balancer.nr_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
if (task_hot(p, rq->clock, sd)) {
schedstat_inc(sd, lb_hot_gained[idle]);
@@ -3290,7 +3292,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum cpu_idle_type idle,
int *sd_idle, const cpumask_t *cpus, int *balance)
{
- struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
+ struct sched_group *first_group = sd->group_balancer.groups;
+ struct sched_group *busiest = NULL, *this = NULL, *group = first_group;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
unsigned long max_pull;
unsigned long busiest_load_per_task, busiest_nr_running;
@@ -3458,7 +3461,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
group_next:
#endif
group = group->next;
- } while (group != sd->groups);
+ } while (group != first_group);
if (!busiest || this_load >= max_load || busiest_nr_running == 0)
goto out_balanced;
@@ -3630,6 +3633,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *balance, cpumask_t *cpus)
{
+ struct sched_balancer *balancer = &sd->group_balancer;
int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
struct sched_group *group;
unsigned long imbalance;
@@ -3707,9 +3711,9 @@ redo:
if (!ld_moved) {
schedstat_inc(sd, lb_failed[idle]);
- sd->nr_balance_failed++;
+ balancer->nr_failed++;
- if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
+ if (unlikely(balancer->nr_failed > sd->cache_nice_tries+2)) {
spin_lock_irqsave(&busiest->lock, flags);
@@ -3735,14 +3739,14 @@ redo:
* We've kicked active balancing, reset the failure
* counter.
*/
- sd->nr_balance_failed = sd->cache_nice_tries+1;
+ balancer->nr_failed = sd->cache_nice_tries+1;
}
} else
- sd->nr_balance_failed = 0;
+ balancer->nr_failed = 0;
if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
- sd->balance_interval = sd->min_interval;
+ balancer->interval = *balancer->interval_reset;
} else {
/*
* If we've begun active balancing, start to back off. This
@@ -3750,8 +3754,8 @@ redo:
* is only 1 task on the busy runqueue (because we don't call
* move_tasks).
*/
- if (sd->balance_interval < sd->max_interval)
- sd->balance_interval *= 2;
+ if (balancer->interval < sd->max_interval)
+ balancer->interval *= 2;
}
if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
@@ -3763,13 +3767,13 @@ redo:
out_balanced:
schedstat_inc(sd, lb_balanced[idle]);
- sd->nr_balance_failed = 0;
+ balancer->nr_failed = 0;
out_one_pinned:
/* tune up the balancing interval */
- if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
- (sd->balance_interval < sd->max_interval))
- sd->balance_interval *= 2;
+ if ((all_pinned && balancer->interval < MAX_PINNED_INTERVAL) ||
+ (balancer->interval < sd->max_interval))
+ balancer->interval *= 2;
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
@@ -3793,6 +3797,7 @@ static int
load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
cpumask_t *cpus)
{
+ struct sched_balancer *balancer = &sd->group_balancer;
struct sched_group *group;
struct rq *busiest = NULL;
unsigned long imbalance;
@@ -3855,7 +3860,7 @@ redo:
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
} else
- sd->nr_balance_failed = 0;
+ balancer->nr_failed = 0;
return ld_moved;
@@ -3864,7 +3869,7 @@ out_balanced:
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
- sd->nr_balance_failed = 0;
+ balancer->nr_failed = 0;
return 0;
}
@@ -3881,6 +3886,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
cpumask_t tmpmask;
for_each_domain(this_cpu, sd) {
+ struct sched_balancer *balancer = &sd->group_balancer;
unsigned long interval;
if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3891,9 +3897,9 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
pulled_task = load_balance_newidle(this_cpu, this_rq,
sd, &tmpmask);
- interval = msecs_to_jiffies(sd->balance_interval);
- if (time_after(next_balance, sd->last_balance + interval))
- next_balance = sd->last_balance + interval;
+ interval = msecs_to_jiffies(balancer->interval);
+ if (time_after(next_balance, balancer->last_exec + interval))
+ next_balance = balancer->last_exec + interval;
if (pulled_task)
break;
}
@@ -4052,10 +4058,12 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
cpumask_t tmp;
for_each_domain(cpu, sd) {
+ struct sched_balancer *balancer = &sd->group_balancer;
+
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
- interval = sd->balance_interval;
+ interval = balancer->interval;
if (idle != CPU_IDLE)
interval *= sd->busy_factor;
@@ -4073,7 +4081,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
goto out;
}
- if (time_after_eq(jiffies, sd->last_balance + interval)) {
+ if (time_after_eq(jiffies, balancer->last_exec + interval)) {
if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
/*
* We've pulled tasks over so either we're no
@@ -4082,13 +4090,13 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
*/
idle = CPU_NOT_IDLE;
}
- sd->last_balance = jiffies;
+ balancer->last_exec = jiffies;
}
if (need_serialize)
spin_unlock(&balancing);
out:
- if (time_after(next_balance, sd->last_balance + interval)) {
- next_balance = sd->last_balance + interval;
+ if (time_after(next_balance, balancer->last_exec + interval)) {
+ next_balance = balancer->last_exec + interval;
update_next_balance = 1;
}
@@ -6611,7 +6619,8 @@ void __init migration_init(void)
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_t *groupmask)
{
- struct sched_group *group = sd->groups;
+ struct sched_group *first_group = sd->group_balancer.groups;
+ struct sched_group *group = first_group;
char str[256];
cpulist_scnprintf(str, sizeof(str), sd->span);
@@ -6671,7 +6680,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
printk(KERN_CONT " %s", str);
group = group->next;
- } while (group != sd->groups);
+ } while (group != first_group);
printk(KERN_CONT "\n");
if (!cpus_equal(sd->span, *groupmask))
@@ -6717,6 +6726,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
static int sd_degenerate(struct sched_domain *sd)
{
+ struct sched_group *first_group = sd->group_balancer.groups;
+
if (cpus_weight(sd->span) == 1)
return 1;
@@ -6727,7 +6738,7 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
SD_SHARE_PKG_RESOURCES)) {
- if (sd->groups != sd->groups->next)
+ if (first_group != first_group->next)
return 0;
}
@@ -6744,6 +6755,7 @@ static int
sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
{
unsigned long cflags = sd->flags, pflags = parent->flags;
+ struct sched_group *parent_group = parent->group_balancer.groups;
if (sd_degenerate(parent))
return 1;
@@ -6756,7 +6768,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
if (cflags & SD_WAKE_AFFINE)
pflags &= ~SD_WAKE_BALANCE;
/* Flags needing groups don't count if only 1 group in parent */
- if (parent->groups == parent->groups->next) {
+ if (parent_group == parent_group->next) {
pflags &= ~(SD_LOAD_BALANCE |
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
@@ -7122,10 +7134,10 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
return;
do {
for_each_cpu_mask_nr(j, sg->cpumask) {
- struct sched_domain *sd;
+ struct sched_domain *sd = &per_cpu(phys_domains, j);
+ struct sched_group *group = sd->group_balancer.groups;
- sd = &per_cpu(phys_domains, j);
- if (j != first_cpu(sd->groups->cpumask)) {
+ if (j != first_cpu(group->cpumask)) {
/*
* Only add "power" once for each
* physical package.
@@ -7133,7 +7145,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
continue;
}
- sg_inc_cpu_power(sg, sd->groups->__cpu_power);
+ sg_inc_cpu_power(sg, group->__cpu_power);
}
sg = sg->next;
} while (sg != group_head);
@@ -7199,15 +7211,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
struct sched_domain *child;
struct sched_group *group;
+ struct sched_group *first_group = sd->group_balancer.groups;
- WARN_ON(!sd || !sd->groups);
+ WARN_ON(!sd || !first_group);
- if (cpu != first_cpu(sd->groups->cpumask))
+ if (cpu != first_cpu(first_group->cpumask))
return;
child = sd->child;
- sd->groups->__cpu_power = 0;
+ first_group->__cpu_power = 0;
/*
* For perf policy, if the groups in child domain share resources
@@ -7219,18 +7232,18 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
(child->flags &
(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
- sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
+ sg_inc_cpu_power(first_group, SCHED_LOAD_SCALE);
return;
}
/*
* add cpu_power of each child group to this groups cpu_power
*/
- group = child->groups;
+ group = child->group_balancer.groups;
do {
- sg_inc_cpu_power(sd->groups, group->__cpu_power);
+ sg_inc_cpu_power(first_group, group->__cpu_power);
group = group->next;
- } while (group != child->groups);
+ } while (group != child->group_balancer.groups);
}
/*
@@ -7323,6 +7336,15 @@ static void set_domain_attribute(struct sched_domain *sd,
}
}
+static void init_sched_balancer(struct sched_balancer *balancer,
+ unsigned long *interval_reset)
+{
+ balancer->last_exec = jiffies;
+ balancer->interval = *interval_reset;
+ balancer->interval_reset = interval_reset;
+ balancer->nr_failed = 0;
+}
+
/*
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
@@ -7395,7 +7417,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
set_domain_attribute(sd, attr);
sd->span = *cpu_map;
sd->first_cpu = first_cpu(sd->span);
- cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
+ cpu_to_allnodes_group(i, cpu_map,
+ &sd->group_balancer.groups,
+ tmpmask);
p = sd;
sd_allnodes = 1;
} else
@@ -7421,7 +7445,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
sd->parent = p;
if (p)
p->child = sd;
- cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
+ cpu_to_phys_group(i, cpu_map, &sd->group_balancer.groups,
+ tmpmask);
#ifdef CONFIG_SCHED_MC
p = sd;
@@ -7433,7 +7458,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
- cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
+ cpu_to_core_group(i, cpu_map, &sd->group_balancer.groups,
+ tmpmask);
#endif
#ifdef CONFIG_SCHED_SMT
@@ -7446,7 +7472,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
- cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
+ cpu_to_cpu_group(i, cpu_map, &sd->group_balancer.groups,
+ tmpmask);
#endif
}
@@ -7540,7 +7567,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
struct sched_domain *sd;
sd = &per_cpu(node_domains, j);
- sd->groups = sg;
+ sd->group_balancer.groups = sg;
}
sg->__cpu_power = 0;
sg->cpumask = *nodemask;
@@ -7626,6 +7653,12 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
sd = &per_cpu(phys_domains, i);
#endif
cpu_attach_domain(sd, rd, i);
+
+ /* Initialize the balancers */
+ for_each_domain(i, sd) {
+ init_sched_balancer(&sd->group_balancer,
+ &sd->min_interval);
+ }
}
SCHED_CPUMASK_FREE((void *)allmasks);
^ permalink raw reply related [flat|nested] 4+ messages in thread* [RFC PATCH 2/3] sched: pass specific sched_balancer to functions
2008-05-15 18:35 [RFC PATCH 0/3] sched: core-balancer v2 Gregory Haskins
2008-05-15 18:35 ` [RFC PATCH 1/3] sched: create sched_balancer container for sched_groups Gregory Haskins
@ 2008-05-15 18:35 ` Gregory Haskins
2008-05-15 18:35 ` [RFC PATCH 3/3] sched: add a per-core balancer group Gregory Haskins
2 siblings, 0 replies; 4+ messages in thread
From: Gregory Haskins @ 2008-05-15 18:35 UTC (permalink / raw)
To: Ingo Molnar, Peter Zijlstra
Cc: Suresh Sidda, Srivatsa Vaddagiri, linux-kernel, Gregory Haskins
Instead of assuming sd->group_balancer.groups is the intended group. We
will use this later in the series to introduce alternate balancers.
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
kernel/sched.c | 21 ++++++++++++---------
1 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index a8e0bd3..0bdbfe6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3288,11 +3288,12 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
* should be moved to restore balance via the imbalance parameter.
*/
static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
+find_busiest_group(struct sched_domain *sd, struct sched_balancer *balancer,
+ int this_cpu,
unsigned long *imbalance, enum cpu_idle_type idle,
int *sd_idle, const cpumask_t *cpus, int *balance)
{
- struct sched_group *first_group = sd->group_balancer.groups;
+ struct sched_group *first_group = balancer->groups;
struct sched_group *busiest = NULL, *this = NULL, *group = first_group;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
unsigned long max_pull;
@@ -3630,10 +3631,11 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
* tasks if there is an imbalance.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
- struct sched_domain *sd, enum cpu_idle_type idle,
+ struct sched_domain *sd,
+ struct sched_balancer *balancer,
+ enum cpu_idle_type idle,
int *balance, cpumask_t *cpus)
{
- struct sched_balancer *balancer = &sd->group_balancer;
int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
struct sched_group *group;
unsigned long imbalance;
@@ -3658,8 +3660,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
schedstat_inc(sd, lb_count[idle]);
redo:
- group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
- cpus, balance);
+ group = find_busiest_group(sd, balancer, this_cpu, &imbalance, idle,
+ &sd_idle, cpus, balance);
if (*balance == 0)
goto out_balanced;
@@ -3819,8 +3821,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
redo:
- group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
- &sd_idle, cpus, NULL);
+ group = find_busiest_group(sd, balancer, this_cpu, &imbalance,
+ CPU_NEWLY_IDLE, &sd_idle, cpus, NULL);
if (!group) {
schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
goto out_balanced;
@@ -4082,7 +4084,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
}
if (time_after_eq(jiffies, balancer->last_exec + interval)) {
- if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
+ if (load_balance(cpu, rq, sd, balancer,
+ idle, &balance, &tmp)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
^ permalink raw reply related [flat|nested] 4+ messages in thread* [RFC PATCH 3/3] sched: add a per-core balancer group
2008-05-15 18:35 [RFC PATCH 0/3] sched: core-balancer v2 Gregory Haskins
2008-05-15 18:35 ` [RFC PATCH 1/3] sched: create sched_balancer container for sched_groups Gregory Haskins
2008-05-15 18:35 ` [RFC PATCH 2/3] sched: pass specific sched_balancer to functions Gregory Haskins
@ 2008-05-15 18:35 ` Gregory Haskins
2 siblings, 0 replies; 4+ messages in thread
From: Gregory Haskins @ 2008-05-15 18:35 UTC (permalink / raw)
To: Ingo Molnar, Peter Zijlstra
Cc: Suresh Sidda, Srivatsa Vaddagiri, linux-kernel, Gregory Haskins
The current scheduler balances SCHED_OTHER tasks based on a hierarchy of
sched_domains and sched_groups as dictated by the physical cache/node
topology of the hardware. This policy leads to the overall optimal
balancing solution, but leaves a hole under certain circumstances (see
Documentation/scheduler/core_balancer.txt for more details).
This patch adds the concept of a new per-core grouping at each domain-level
to address the shortcomings in the group_balancer.
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
Documentation/scheduler/core_balancer.txt | 69 +++++++++++++
include/linux/sched.h | 2
kernel/sched.c | 149 +++++++++++++++++++++--------
3 files changed, 180 insertions(+), 40 deletions(-)
diff --git a/Documentation/scheduler/core_balancer.txt b/Documentation/scheduler/core_balancer.txt
new file mode 100644
index 0000000..3b8d346
--- /dev/null
+++ b/Documentation/scheduler/core_balancer.txt
@@ -0,0 +1,69 @@
+Core Balancing
+----------------------
+
+The standard group_balancer manages SCHED_OTHER tasks based on a hierarchy
+of sched_domains and sched_groups as dictated by the physical cache/node
+topology of the hardware. Each group may contain one or more cores which
+have a specific relationship to other members of the group. Balancing
+is always performed on an inter-group basis.
+
+For example, consider a quad-core, dual socket Intel Xeon system. It has
+a total of 8 cores across one logical NUMA node, with a cache shared
+between cores [0,2], [1,3], [4,6], [5,7]. From a sched_domain/group
+perspective on core 0, this looks like the following:
+
+domain-0: (MC)
+ span: 0x5
+ groups = 2 -> [0], [2]
+ domain-1: (SMP)
+ span: 0xff
+ groups = 4 -> [0,2], [1,3], [4,6], [5,7]
+ domain-2: (NUMA)
+ span: 0xff
+ groups = 1 -> [0-7]
+
+Recall that balancing is always inter-group, and will get more aggressive
+in the lower domains than the higher ones. The balancing logic will
+attempt to balance between [0],[2] first, [0,2], [1,3], [4,6], [5,7]
+second, and [0-7] last. Note that since domain-2 only consists of 1
+group, it will never result in a balance decision since there must be
+at least two groups to consider.
+
+This layout is quite logical. The idea is that [0], and [2] can
+balance between each other aggresively in a very efficient manner
+since they share a cache. Once the load is equalized between two
+cache-peers, domain-1 can spread the load out between the other
+peer-groups. This represents a pretty good way to structure the
+balancing operations.
+
+However, there is one slight problem with the group_balancer: Since we
+
+However, there is one slight problem with the group_balancer: Since we
+always balance inter-group, intra-group imbalances may result in
+suboptimal behavior if we hit the condition where lower-level domains
+(domain-0 in this example) are ineffective. This condition can arise
+whenever a domain-level imbalance cannot be resolved such that the group
+has a high aggregate load rating, yet some cores are relatively idle.
+
+For example, if a core has a large but affined load, or otherwise
+untouchable tasks (e.g. RT tasks), SCHED_OTHER will not be able to
+equalize the load. The net result is that one or more members of the
+group may remain relatively unloaded, while the load rating for the
+entire group is high. The higher layer domains will only consider the
+group as a whole, and the lower level domains are left powerless to
+equalize the vacuum.
+
+To address this concern, core_balancer adds the concept of a new grouping
+of cores at each domain-level: a per-core grouping (each core in its own
+unique group). This "core_balancer" group is configured to run much less
+aggressively than its topologically relevant brother: "group_balancer".
+Core_balancer will sweep through the cores every so often, correcting
+intra-group vacuums left over from lower level domains. In most cases,
+the group_balancer should have already established equilibrium, therefore
+benefiting from the hardware's natural affinity hierarchy. In the cases
+where it cannot achieve equilibrium, the core_balancer tries to take it
+one step closer.
+
+By default, group_balancer runs at sd->min_interval, whereas
+core_balancer starts at sd->max_interval (both of which will respond
+to dynamic programming). Both will employ a multiplicative backoff
+algorithm when faced with repeated migration failure.
+
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 95e46e3..98c9e90 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -715,6 +715,7 @@ enum cpu_idle_type {
#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 1024 /* Only a single load balancing instance */
#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */
+#define SD_CORE_BALANCE 4096 /* Balance on a per-core basis */
#define BALANCE_FOR_MC_POWER \
(sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
@@ -791,6 +792,7 @@ struct sched_domain {
/* Balancer data */
struct sched_balancer group_balancer;
+ struct sched_balancer core_balancer;
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
diff --git a/kernel/sched.c b/kernel/sched.c
index 0bdbfe6..4ad3143 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4041,6 +4041,59 @@ int select_nohz_load_balancer(int stop_tick)
static DEFINE_SPINLOCK(balancing);
+static unsigned int rebalance_domain(struct rq *rq,
+ struct sched_domain *sd,
+ struct sched_balancer *balancer,
+ unsigned long *next_balance,
+ enum cpu_idle_type *idle,
+ int *balance)
+{
+ unsigned long interval;
+ int need_serialize;
+ cpumask_t tmp;
+
+ interval = balancer->interval;
+ if (*idle != CPU_IDLE)
+ interval *= sd->busy_factor;
+
+ /* scale ms to jiffies */
+ interval = msecs_to_jiffies(interval);
+ if (unlikely(!interval))
+ interval = 1;
+ if (interval > HZ*NR_CPUS/10)
+ interval = HZ*NR_CPUS/10;
+
+ need_serialize = sd->flags & SD_SERIALIZE;
+
+ if (need_serialize) {
+ if (!spin_trylock(&balancing))
+ goto out;
+ }
+
+ if (time_after_eq(jiffies, balancer->last_exec + interval)) {
+ if (load_balance(rq->cpu, rq, sd, balancer,
+ *idle, balance, &tmp)) {
+ /*
+ * We've pulled tasks over so either we're no
+ * longer idle, or one of our SMT siblings is
+ * not idle.
+ */
+ *idle = CPU_NOT_IDLE;
+ }
+ balancer->last_exec = jiffies;
+ }
+
+ if (need_serialize)
+ spin_unlock(&balancing);
+out:
+ if (time_after(*next_balance, balancer->last_exec + interval)) {
+ *next_balance = balancer->last_exec + interval;
+ return 1;
+ }
+
+ return 0;
+}
+
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
@@ -4051,57 +4104,23 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
int balance = 1;
struct rq *rq = cpu_rq(cpu);
- unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
- int need_serialize;
- cpumask_t tmp;
for_each_domain(cpu, sd) {
- struct sched_balancer *balancer = &sd->group_balancer;
-
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
- interval = balancer->interval;
- if (idle != CPU_IDLE)
- interval *= sd->busy_factor;
-
- /* scale ms to jiffies */
- interval = msecs_to_jiffies(interval);
- if (unlikely(!interval))
- interval = 1;
- if (interval > HZ*NR_CPUS/10)
- interval = HZ*NR_CPUS/10;
-
- need_serialize = sd->flags & SD_SERIALIZE;
-
- if (need_serialize) {
- if (!spin_trylock(&balancing))
- goto out;
- }
+ if (rebalance_domain(rq, sd, &sd->group_balancer,
+ &next_balance, &idle, &balance))
+ update_next_balance = 1;
- if (time_after_eq(jiffies, balancer->last_exec + interval)) {
- if (load_balance(cpu, rq, sd, balancer,
- idle, &balance, &tmp)) {
- /*
- * We've pulled tasks over so either we're no
- * longer idle, or one of our SMT siblings is
- * not idle.
- */
- idle = CPU_NOT_IDLE;
- }
- balancer->last_exec = jiffies;
- }
- if (need_serialize)
- spin_unlock(&balancing);
-out:
- if (time_after(next_balance, balancer->last_exec + interval)) {
- next_balance = balancer->last_exec + interval;
+ if ((sd->flags & SD_CORE_BALANCE)
+ && rebalance_domain(rq, sd, &sd->core_balancer,
+ &next_balance, &idle, &balance))
update_next_balance = 1;
- }
/*
* Stop the load balance at this level. There is another
@@ -7348,6 +7367,42 @@ static void init_sched_balancer(struct sched_balancer *balancer,
balancer->nr_failed = 0;
}
+static __read_mostly struct sched_group smp_core_group[NR_CPUS];
+#ifdef CONFIG_NUMA
+static __read_mostly struct sched_group allnode_core_group[NR_CPUS];
+static __read_mostly struct sched_group node_core_group[NR_CPUS];
+#endif
+
+static void init_sched_core_balancer(int cpu, struct sched_domain *sd,
+ struct sched_group *pool)
+{
+ struct sched_balancer *balancer = &sd->core_balancer;
+ struct sched_group *last = NULL;
+ int i = 0;
+
+ balancer->groups = &pool[sd->first_cpu];
+
+ if (cpu != sd->first_cpu)
+ return;
+
+ for_each_cpu_mask(i, sd->span) {
+ struct sched_group *sg = &pool[i];
+
+ cpus_clear(sg->cpumask);
+ cpu_set(i, sg->cpumask);
+ sg->__cpu_power = 0;
+
+ /* FIXME: We probably need more accuracy here */
+ sg_inc_cpu_power(sg, SCHED_LOAD_SCALE);
+
+ if (last)
+ last->next = sg;
+
+ last = sg;
+ }
+ last->next = balancer->groups;
+}
+
/*
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
@@ -7425,6 +7480,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
tmpmask);
p = sd;
sd_allnodes = 1;
+
+ /* Enable core balancing for NUMA-allnode domain */
+ sd->flags |= SD_CORE_BALANCE;
+ init_sched_core_balancer(i, sd, allnode_core_group);
} else
p = NULL;
@@ -7437,6 +7496,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
if (p)
p->child = sd;
cpus_and(sd->span, sd->span, *cpu_map);
+
+ /* Enable core balancing for NUMA domain */
+ sd->flags |= SD_CORE_BALANCE;
+ init_sched_core_balancer(i, sd, node_core_group);
#endif
p = sd;
@@ -7451,6 +7514,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
cpu_to_phys_group(i, cpu_map, &sd->group_balancer.groups,
tmpmask);
+ /* Enable core balancing for smp domain */
+ sd->flags |= SD_CORE_BALANCE;
+ init_sched_core_balancer(i, sd, smp_core_group);
+
#ifdef CONFIG_SCHED_MC
p = sd;
sd = &per_cpu(core_domains, i);
@@ -7661,6 +7728,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
for_each_domain(i, sd) {
init_sched_balancer(&sd->group_balancer,
&sd->min_interval);
+ init_sched_balancer(&sd->core_balancer,
+ &sd->max_interval);
}
}
^ permalink raw reply related [flat|nested] 4+ messages in thread