* [PATCH 1/3] Dynamic sched domains: sched changes
From: Dinakar Guniguntala @ 2005-05-19 15:04 UTC (permalink / raw)
To: Andrew Morton
Cc: Paul Jackson, Simon Derr, Matthew Dobson, Dipankar Sarma, lkml,
lse-tech, Ingo Molnar, Jesse Barnes
[-- Attachment #1: Type: text/plain, Size: 1917 bytes --]
Hello Andrew,
The following patches add dynamic sched domains functionality that was
extensively discussed on lkml and lse-tech.
I would like to see this added to -mm
o The main advantage of this feature is that it ensures that the
scheduler load balancing code only balances across the cpus that are
in the sched domain defined by an exclusive cpuset, and not all of the
cpus in the system. This removes the overhead of the load balancing
code trying to pull tasks outside of the cpu exclusive cpuset only to
be prevented by the tasks' cpus_allowed mask.
o cpu exclusive cpusets are useful for servers running orthogonal
workloads, such as RT applications requiring low latency alongside
HPC applications that are throughput sensitive.
o It provides a new API, partition_sched_domains, in sched.c that
makes dynamic sched domains possible (see the calling sketch below
this list).
o cpu_exclusive cpusets are now each associated with a sched domain,
which means that users can dynamically modify the sched domains
through the cpuset file system interface.
o ia64 sched domain code has been updated to support this feature as well
o Currently, this does not support hotplug. (However some of my tests
indicate hotplug+preempt is currently broken)
o I have tested it extensively on x86.
o This should have very minimal impact on performance as none of
the fast paths are affected
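For reference, a minimal sketch of the intended calling convention
(illustrative only, not part of the patch -- the cpu numbers and the
wrapper function are made up; the real caller is update_cpu_domains()
added in patch 2/3):

#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>

static void repartition_example(void)
{
	cpumask_t general = CPU_MASK_NONE, rt = CPU_MASK_NONE;
	int i;

	for (i = 0; i < 4; i++)
		cpu_set(i, general);	/* cpus 0-3: normal workload */
	for (i = 4; i < 8; i++)
		cpu_set(i, rt);		/* cpus 4-7: low latency partition */

	/*
	 * Must be called with the hotplug lock held.  The function
	 * restricts both masks to cpu_online_map, detaches the affected
	 * cpus from their old domains, waits for an RCU quiescent
	 * period and then builds the two new domain trees.
	 */
	lock_cpu_hotplug();
	partition_sched_domains(&general, &rt);
	unlock_cpu_hotplug();
}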
-Dinakar
Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
linux-2.6.12-rc4-mm1-1/include/linux/sched.h | 2
linux-2.6.12-rc4-mm1-1/kernel/sched.c | 130 +++++++++++++++--------
linux-2.6.12-rc4-mm1-2/Documentation/cpusets.txt | 16 ++
linux-2.6.12-rc4-mm1-2/kernel/cpuset.c | 89 +++++++++++++--
linux-2.6.12-rc4-mm1-3/arch/ia64/kernel/domain.c | 76 +++++++------
5 files changed, 225 insertions(+), 88 deletions(-)
[-- Attachment #2: dyn-sd-rc4mm1-v0.6-1.patch --]
[-- Type: text/plain, Size: 10226 bytes --]
diff -Naurp linux-2.6.12-rc4-mm1-0/include/linux/sched.h linux-2.6.12-rc4-mm1-1/include/linux/sched.h
--- linux-2.6.12-rc4-mm1-0/include/linux/sched.h 2005-05-16 14:55:43.000000000 +0530
+++ linux-2.6.12-rc4-mm1-1/include/linux/sched.h 2005-05-16 15:13:22.000000000 +0530
@@ -561,6 +561,8 @@ struct sched_domain {
#endif
};
+extern void partition_sched_domains(cpumask_t *partition1,
+ cpumask_t *partition2);
#ifdef ARCH_HAS_SCHED_DOMAIN
/* Useful helpers that arch setup code may use. Defined in kernel/sched.c */
extern cpumask_t cpu_isolated_map;
diff -Naurp linux-2.6.12-rc4-mm1-0/kernel/sched.c linux-2.6.12-rc4-mm1-1/kernel/sched.c
--- linux-2.6.12-rc4-mm1-0/kernel/sched.c 2005-05-16 14:58:01.000000000 +0530
+++ linux-2.6.12-rc4-mm1-1/kernel/sched.c 2005-05-18 12:38:45.000000000 +0530
@@ -264,7 +264,7 @@ static DEFINE_PER_CPU(struct runqueue, r
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
- * See update_sched_domains: synchronize_kernel for details.
+ * See detach_destroy_domains: synchronize_sched for details.
*
* The domain tree of any CPU may only be accessed from within
* preempt-disabled sections.
@@ -4615,7 +4615,7 @@ int __init migration_init(void)
#endif
#ifdef CONFIG_SMP
-#define SCHED_DOMAIN_DEBUG
+#undef SCHED_DOMAIN_DEBUG
#ifdef SCHED_DOMAIN_DEBUG
static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
@@ -4831,7 +4831,7 @@ static void init_sched_domain_sysctl(voi
}
#endif
-static int __devinit sd_degenerate(struct sched_domain *sd)
+static int sd_degenerate(struct sched_domain *sd)
{
if (cpus_weight(sd->span) == 1)
return 1;
@@ -4854,7 +4854,7 @@ static int __devinit sd_degenerate(struc
return 1;
}
-static int __devinit sd_parent_degenerate(struct sched_domain *sd,
+static int sd_parent_degenerate(struct sched_domain *sd,
struct sched_domain *parent)
{
unsigned long cflags = sd->flags, pflags = parent->flags;
@@ -4886,7 +4886,7 @@ static int __devinit sd_parent_degenerat
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
*/
-void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
+void cpu_attach_domain(struct sched_domain *sd, int cpu)
{
runqueue_t *rq = cpu_rq(cpu);
struct sched_domain *tmp;
@@ -4937,7 +4937,7 @@ __setup ("isolcpus=", isolated_cpu_setup
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
-void __devinit init_sched_build_groups(struct sched_group groups[],
+void init_sched_build_groups(struct sched_group groups[],
cpumask_t span, int (*group_fn)(int cpu))
{
struct sched_group *first = NULL, *last = NULL;
@@ -4973,13 +4973,14 @@ void __devinit init_sched_build_groups(s
#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void __devinit arch_init_sched_domains(void);
-extern void __devinit arch_destroy_sched_domains(void);
+extern void build_sched_domains(const cpumask_t *cpu_map);
+extern void arch_init_sched_domains(const cpumask_t *cpu_map);
+extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
#else
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
-static int __devinit cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu)
{
return cpu;
}
@@ -4987,7 +4988,7 @@ static int __devinit cpu_to_cpu_group(in
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group sched_group_phys[NR_CPUS];
-static int __devinit cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu)
{
#ifdef CONFIG_SCHED_SMT
return first_cpu(cpu_sibling_map[cpu]);
@@ -5000,7 +5001,7 @@ static int __devinit cpu_to_phys_group(i
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int __devinit cpu_to_node_group(int cpu)
+static int cpu_to_node_group(int cpu)
{
return cpu_to_node(cpu);
}
@@ -5031,39 +5032,28 @@ static void check_sibling_maps(void)
#endif
/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
*/
-static void __devinit arch_init_sched_domains(void)
+static void build_sched_domains(const cpumask_t *cpu_map)
{
int i;
- cpumask_t cpu_default_map;
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
- check_sibling_maps();
-#endif
- /*
- * Setup mask for cpus without special case scheduling requirements.
- * For now this just excludes isolated cpus, but could be used to
- * exclude other special cases in the future.
- */
- cpus_complement(cpu_default_map, cpu_isolated_map);
- cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
/*
- * Set up domains. Isolated domains just stay on the NULL domain.
+ * Set up domains for cpus specified by the cpu_map.
*/
- for_each_cpu_mask(i, cpu_default_map) {
+ for_each_cpu_mask(i, *cpu_map) {
int group;
struct sched_domain *sd = NULL, *p;
cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
- cpus_and(nodemask, nodemask, cpu_default_map);
+ cpus_and(nodemask, nodemask, *cpu_map);
#ifdef CONFIG_NUMA
sd = &per_cpu(node_domains, i);
group = cpu_to_node_group(i);
*sd = SD_NODE_INIT;
- sd->span = cpu_default_map;
+ sd->span = *cpu_map;
sd->groups = &sched_group_nodes[group];
#endif
@@ -5081,7 +5071,7 @@ static void __devinit arch_init_sched_do
group = cpu_to_cpu_group(i);
*sd = SD_SIBLING_INIT;
sd->span = cpu_sibling_map[i];
- cpus_and(sd->span, sd->span, cpu_default_map);
+ cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
sd->groups = &sched_group_cpus[group];
#endif
@@ -5091,7 +5081,7 @@ static void __devinit arch_init_sched_do
/* Set up CPU (sibling) groups */
for_each_online_cpu(i) {
cpumask_t this_sibling_map = cpu_sibling_map[i];
- cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
+ cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
if (i != first_cpu(this_sibling_map))
continue;
@@ -5104,7 +5094,7 @@ static void __devinit arch_init_sched_do
for (i = 0; i < MAX_NUMNODES; i++) {
cpumask_t nodemask = node_to_cpumask(i);
- cpus_and(nodemask, nodemask, cpu_default_map);
+ cpus_and(nodemask, nodemask, *cpu_map);
if (cpus_empty(nodemask))
continue;
@@ -5114,12 +5104,12 @@ static void __devinit arch_init_sched_do
#ifdef CONFIG_NUMA
/* Set up node groups */
- init_sched_build_groups(sched_group_nodes, cpu_default_map,
+ init_sched_build_groups(sched_group_nodes, *cpu_map,
&cpu_to_node_group);
#endif
/* Calculate CPU power for physical packages and nodes */
- for_each_cpu_mask(i, cpu_default_map) {
+ for_each_cpu_mask(i, *cpu_map) {
int power;
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
@@ -5143,7 +5133,7 @@ static void __devinit arch_init_sched_do
}
/* Attach the domains */
- for_each_online_cpu(i) {
+ for_each_cpu_mask(i, *cpu_map) {
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i);
@@ -5153,16 +5143,71 @@ static void __devinit arch_init_sched_do
cpu_attach_domain(sd, i);
}
}
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ */
+static void arch_init_sched_domains(cpumask_t *cpu_map)
+{
+ cpumask_t cpu_default_map;
-#ifdef CONFIG_HOTPLUG_CPU
-static void __devinit arch_destroy_sched_domains(void)
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+ check_sibling_maps();
+#endif
+ /*
+ * Setup mask for cpus without special case scheduling requirements.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
+ */
+ cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
+
+ build_sched_domains(&cpu_default_map);
+}
+
+static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
/* Do nothing: everything is statically allocated. */
}
-#endif
#endif /* ARCH_HAS_SCHED_DOMAIN */
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
+ */
+static inline void detach_destroy_domains(const cpumask_t *cpu_map)
+{
+ int i;
+
+ for_each_cpu_mask(i, *cpu_map)
+ cpu_attach_domain(NULL, i);
+ synchronize_sched();
+ arch_destroy_sched_domains(cpu_map);
+}
+
+/*
+ * Partition sched domains as specified by the cpumasks below.
+ * This attaches all cpus from the cpumasks to the NULL domain,
+ * waits for a RCU quiescent period, recalculates sched
+ * domain information and then attaches them back to the
+ * correct sched domains
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+{
+ cpumask_t change_map;
+
+ cpus_and(*partition1, *partition1, cpu_online_map);
+ cpus_and(*partition2, *partition2, cpu_online_map);
+ cpus_or(change_map, *partition1, *partition2);
+
+ /* Detach sched domains from all of the affected cpus */
+ detach_destroy_domains(&change_map);
+ if (!cpus_empty(*partition1))
+ build_sched_domains(partition1);
+ if (!cpus_empty(*partition2))
+ build_sched_domains(partition2);
+}
+
#ifdef CONFIG_HOTPLUG_CPU
/*
* Force a reinitialization of the sched domains hierarchy. The domains
@@ -5178,10 +5223,7 @@ static int update_sched_domains(struct n
switch (action) {
case CPU_UP_PREPARE:
case CPU_DOWN_PREPARE:
- for_each_online_cpu(i)
- cpu_attach_domain(NULL, i);
- synchronize_kernel();
- arch_destroy_sched_domains();
+ detach_destroy_domains(&cpu_online_map);
return NOTIFY_OK;
case CPU_UP_CANCELED:
@@ -5197,7 +5239,7 @@ static int update_sched_domains(struct n
}
/* The hotplug lock is already held by cpu_up/cpu_down */
- arch_init_sched_domains();
+ arch_init_sched_domains(&cpu_online_map);
return NOTIFY_OK;
}
@@ -5206,7 +5248,7 @@ static int update_sched_domains(struct n
void __init sched_init_smp(void)
{
lock_cpu_hotplug();
- arch_init_sched_domains();
+ arch_init_sched_domains(&cpu_online_map);
unlock_cpu_hotplug();
/* XXX: Theoretical race here - CPU may be hotplugged now */
hotcpu_notifier(update_sched_domains, 0);
* [PATCH 2/3] Dynamic sched domains: cpuset changes
From: Dinakar Guniguntala @ 2005-05-19 15:06 UTC (permalink / raw)
To: Andrew Morton
Cc: Paul Jackson, Simon Derr, Matthew Dobson, Dipankar Sarma, lkml,
lse-tech, Ingo Molnar, Jesse Barnes
[-- Attachment #1: Type: text/plain, Size: 215 bytes --]
Adds the core update_cpu_domains code and updates the cpusets documentation.
Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
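For context, an illustrative user-level sketch (not part of this patch)
of the kind of operation that ends up in update_cpu_domains(). It
assumes the cpuset filesystem is already mounted at /dev/cpuset and
that cpus 4-7 are not claimed by any other exclusive cpuset:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	FILE *f;

	/* Create a child cpuset under the root cpuset */
	if (mkdir("/dev/cpuset/rt_set", 0755) != 0)
		return 1;

	/* Assign cpus 4-7 to it */
	f = fopen("/dev/cpuset/rt_set/cpus", "w");
	if (!f)
		return 1;
	fprintf(f, "4-7");
	fclose(f);

	/*
	 * Marking the cpuset cpu_exclusive is what triggers
	 * update_cpu_domains(), which computes the parent and child
	 * spans and hands them to partition_sched_domains().
	 */
	f = fopen("/dev/cpuset/rt_set/cpu_exclusive", "w");
	if (!f)
		return 1;
	fprintf(f, "1");
	fclose(f);

	return 0;
}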
[-- Attachment #2: dyn-sd-rc4mm1-v0.6-2.patch --]
[-- Type: text/plain, Size: 6580 bytes --]
diff -Naurp linux-2.6.12-rc4-mm1-1/Documentation/cpusets.txt linux-2.6.12-rc4-mm1-2/Documentation/cpusets.txt
--- linux-2.6.12-rc4-mm1-1/Documentation/cpusets.txt 2005-05-16 15:14:05.000000000 +0530
+++ linux-2.6.12-rc4-mm1-2/Documentation/cpusets.txt 2005-05-18 12:40:59.000000000 +0530
@@ -51,6 +51,14 @@ mems_allowed vector.
If a cpuset is cpu or mem exclusive, no other cpuset, other than a direct
ancestor or descendent, may share any of the same CPUs or Memory Nodes.
+A cpuset that is cpu exclusive has a sched domain associated with it.
+The sched domain consists of all cpus in the current cpuset that are not
+part of any exclusive child cpusets.
+This ensures that the scheduler load balancing code only balances
+against the cpus that are in the sched domain as defined above and not
+all of the cpus in the system. This removes any overhead due to
+load balancing code trying to pull tasks outside of the cpu exclusive
+cpuset only to be prevented by the tasks' cpus_allowed mask.
User level code may create and destroy cpusets by name in the cpuset
virtual file system, manage the attributes and permissions of these
@@ -84,6 +92,9 @@ This can be especially valuable on:
and a database), or
* NUMA systems running large HPC applications with demanding
performance characteristics.
+ * Also cpu_exclusive cpusets are useful for servers running orthogonal
+ workloads such as RT applications requiring low latency and HPC
+ applications that are throughput sensitive
These subsets, or "soft partitions" must be able to be dynamically
adjusted, as the job mix changes, without impacting other concurrently
@@ -125,6 +136,8 @@ Cpusets extends these two mechanisms as
- A cpuset may be marked exclusive, which ensures that no other
cpuset (except direct ancestors and descendents) may contain
any overlapping CPUs or Memory Nodes.
+ Also a cpu_exclusive cpuset would be associated with a sched
+ domain.
- You can list all the tasks (by pid) attached to any cpuset.
The implementation of cpusets requires a few, simple hooks
@@ -136,6 +149,9 @@ into the rest of the kernel, none in per
allowed in that tasks cpuset.
- in sched.c migrate_all_tasks(), to keep migrating tasks within
the CPUs allowed by their cpuset, if possible.
+ - in sched.c, a new API partition_sched_domains for handling
+ sched domain changes associated with cpu_exclusive cpusets
+ and related changes in both sched.c and arch/ia64/kernel/domain.c
- in the mbind and set_mempolicy system calls, to mask the requested
Memory Nodes by what's allowed in that tasks cpuset.
- in page_alloc, to restrict memory to allowed nodes.
diff -Naurp linux-2.6.12-rc4-mm1-1/kernel/cpuset.c linux-2.6.12-rc4-mm1-2/kernel/cpuset.c
--- linux-2.6.12-rc4-mm1-1/kernel/cpuset.c 2005-05-16 15:08:08.000000000 +0530
+++ linux-2.6.12-rc4-mm1-2/kernel/cpuset.c 2005-05-19 12:53:17.000000000 +0530
@@ -596,10 +596,62 @@ static int validate_change(const struct
return 0;
}
+/*
+ * For a given cpuset cur, partition the system as follows
+ * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
+ * exclusive child cpusets
+ * b. All cpus in the current cpuset's cpus_allowed that are not part of any
+ * exclusive child cpusets
+ * Build these two partitions by calling partition_sched_domains
+ *
+ * Call with cpuset_sem held. May nest a call to the
+ * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ */
+static void update_cpu_domains(struct cpuset *cur)
+{
+ struct cpuset *c, *par = cur->parent;
+ cpumask_t pspan, cspan;
+
+ if (par == NULL || cpus_empty(cur->cpus_allowed))
+ return;
+
+ /*
+ * Get all cpus from parent's cpus_allowed not part of exclusive
+ * children
+ */
+ pspan = par->cpus_allowed;
+ list_for_each_entry(c, &par->children, sibling) {
+ if (is_cpu_exclusive(c))
+ cpus_andnot(pspan, pspan, c->cpus_allowed);
+ }
+ if (is_removed(cur) || !is_cpu_exclusive(cur)) {
+ cpus_or(pspan, pspan, cur->cpus_allowed);
+ if (cpus_equal(pspan, cur->cpus_allowed))
+ return;
+ cspan = CPU_MASK_NONE;
+ } else {
+ if (cpus_empty(pspan))
+ return;
+ cspan = cur->cpus_allowed;
+ /*
+ * Get all cpus from current cpuset's cpus_allowed not part
+ * of exclusive children
+ */
+ list_for_each_entry(c, &cur->children, sibling) {
+ if (is_cpu_exclusive(c))
+ cpus_andnot(cspan, cspan, c->cpus_allowed);
+ }
+ }
+
+ lock_cpu_hotplug();
+ partition_sched_domains(&pspan, &cspan);
+ unlock_cpu_hotplug();
+}
+
static int update_cpumask(struct cpuset *cs, char *buf)
{
struct cpuset trialcs;
- int retval;
+ int retval, cpus_unchanged;
trialcs = *cs;
retval = cpulist_parse(buf, trialcs.cpus_allowed);
@@ -609,9 +661,13 @@ static int update_cpumask(struct cpuset
if (cpus_empty(trialcs.cpus_allowed))
return -ENOSPC;
retval = validate_change(cs, &trialcs);
- if (retval == 0)
- cs->cpus_allowed = trialcs.cpus_allowed;
- return retval;
+ if (retval < 0)
+ return retval;
+ cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+ cs->cpus_allowed = trialcs.cpus_allowed;
+ if (is_cpu_exclusive(cs) && !cpus_unchanged)
+ update_cpu_domains(cs);
+ return 0;
}
static int update_nodemask(struct cpuset *cs, char *buf)
@@ -647,7 +703,7 @@ static int update_flag(cpuset_flagbits_t
{
int turning_on;
struct cpuset trialcs;
- int err;
+ int err, cpu_exclusive_changed;
turning_on = (simple_strtoul(buf, NULL, 10) != 0);
@@ -658,13 +714,18 @@ static int update_flag(cpuset_flagbits_t
clear_bit(bit, &trialcs.flags);
err = validate_change(cs, &trialcs);
- if (err == 0) {
- if (turning_on)
- set_bit(bit, &cs->flags);
- else
- clear_bit(bit, &cs->flags);
- }
- return err;
+ if (err < 0)
+ return err;
+ cpu_exclusive_changed =
+ (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
+ if (turning_on)
+ set_bit(bit, &cs->flags);
+ else
+ clear_bit(bit, &cs->flags);
+
+ if (cpu_exclusive_changed)
+ update_cpu_domains(cs);
+ return 0;
}
static int attach_task(struct cpuset *cs, char *buf)
@@ -1310,12 +1371,14 @@ static int cpuset_rmdir(struct inode *un
up(&cpuset_sem);
return -EBUSY;
}
- spin_lock(&cs->dentry->d_lock);
parent = cs->parent;
set_bit(CS_REMOVED, &cs->flags);
+ if (is_cpu_exclusive(cs))
+ update_cpu_domains(cs);
list_del(&cs->sibling); /* delete my sibling from parent->children */
if (list_empty(&parent->children))
check_for_release(parent);
+ spin_lock(&cs->dentry->d_lock);
d = dget(cs->dentry);
cs->dentry = NULL;
spin_unlock(&d->d_lock);
* [PATCH 3/3] Dynamic sched domains: ia64 changes
From: Dinakar Guniguntala @ 2005-05-19 15:07 UTC (permalink / raw)
To: Andrew Morton
Cc: Paul Jackson, Simon Derr, Matthew Dobson, Dipankar Sarma, lkml,
lse-tech, Ingo Molnar, Jesse Barnes
[-- Attachment #1: Type: text/plain, Size: 232 bytes --]
ia64 changes similar to those in kernel/sched.c. This patch compiles OK,
but has not been tested.
Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
[-- Attachment #2: dyn-sd-rc4mm1-v0.6-3.patch --]
[-- Type: text/plain, Size: 6821 bytes --]
diff -Naurp linux-2.6.12-rc4-mm1-2/arch/ia64/kernel/domain.c linux-2.6.12-rc4-mm1-3/arch/ia64/kernel/domain.c
--- linux-2.6.12-rc4-mm1-2/arch/ia64/kernel/domain.c 2005-05-16 15:06:51.000000000 +0530
+++ linux-2.6.12-rc4-mm1-3/arch/ia64/kernel/domain.c 2005-05-19 16:48:59.000000000 +0530
@@ -27,7 +27,7 @@
*
* Should use nodemask_t.
*/
-static int __devinit find_next_best_node(int node, unsigned long *used_nodes)
+static int find_next_best_node(int node, unsigned long *used_nodes)
{
int i, n, val, min_val, best_node = 0;
@@ -66,7 +66,7 @@ static int __devinit find_next_best_node
* should be one that prevents unnecessary balancing, but also spreads tasks
* out optimally.
*/
-static cpumask_t __devinit sched_domain_node_span(int node)
+static cpumask_t sched_domain_node_span(int node)
{
int i;
cpumask_t span, nodemask;
@@ -96,7 +96,7 @@ static cpumask_t __devinit sched_domain_
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
-static int __devinit cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu)
{
return cpu;
}
@@ -104,7 +104,7 @@ static int __devinit cpu_to_cpu_group(in
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group sched_group_phys[NR_CPUS];
-static int __devinit cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu)
{
#ifdef CONFIG_SCHED_SMT
return first_cpu(cpu_sibling_map[cpu]);
@@ -125,44 +125,36 @@ static struct sched_group *sched_group_n
static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
static struct sched_group sched_group_allnodes[MAX_NUMNODES];
-static int __devinit cpu_to_allnodes_group(int cpu)
+static int cpu_to_allnodes_group(int cpu)
{
return cpu_to_node(cpu);
}
#endif
/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
*/
-void __devinit arch_init_sched_domains(void)
+void build_sched_domains(const cpumask_t *cpu_map)
{
int i;
- cpumask_t cpu_default_map;
/*
- * Setup mask for cpus without special case scheduling requirements.
- * For now this just excludes isolated cpus, but could be used to
- * exclude other special cases in the future.
+ * Set up domains for cpus specified by the cpu_map.
*/
- cpus_complement(cpu_default_map, cpu_isolated_map);
- cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
-
- /*
- * Set up domains. Isolated domains just stay on the dummy domain.
- */
- for_each_cpu_mask(i, cpu_default_map) {
+ for_each_cpu_mask(i, *cpu_map) {
int group;
struct sched_domain *sd = NULL, *p;
cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
- cpus_and(nodemask, nodemask, cpu_default_map);
+ cpus_and(nodemask, nodemask, *cpu_map);
#ifdef CONFIG_NUMA
if (num_online_cpus()
> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
sd = &per_cpu(allnodes_domains, i);
*sd = SD_ALLNODES_INIT;
- sd->span = cpu_default_map;
+ sd->span = *cpu_map;
group = cpu_to_allnodes_group(i);
sd->groups = &sched_group_allnodes[group];
p = sd;
@@ -173,7 +165,7 @@ void __devinit arch_init_sched_domains(v
*sd = SD_NODE_INIT;
sd->span = sched_domain_node_span(cpu_to_node(i));
sd->parent = p;
- cpus_and(sd->span, sd->span, cpu_default_map);
+ cpus_and(sd->span, sd->span, *cpu_map);
#endif
p = sd;
@@ -190,7 +182,7 @@ void __devinit arch_init_sched_domains(v
group = cpu_to_cpu_group(i);
*sd = SD_SIBLING_INIT;
sd->span = cpu_sibling_map[i];
- cpus_and(sd->span, sd->span, cpu_default_map);
+ cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
sd->groups = &sched_group_cpus[group];
#endif
@@ -198,9 +190,9 @@ void __devinit arch_init_sched_domains(v
#ifdef CONFIG_SCHED_SMT
/* Set up CPU (sibling) groups */
- for_each_cpu_mask(i, cpu_default_map) {
+ for_each_cpu_mask(i, *cpu_map) {
cpumask_t this_sibling_map = cpu_sibling_map[i];
- cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
+ cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
if (i != first_cpu(this_sibling_map))
continue;
@@ -213,7 +205,7 @@ void __devinit arch_init_sched_domains(v
for (i = 0; i < MAX_NUMNODES; i++) {
cpumask_t nodemask = node_to_cpumask(i);
- cpus_and(nodemask, nodemask, cpu_default_map);
+ cpus_and(nodemask, nodemask, *cpu_map);
if (cpus_empty(nodemask))
continue;
@@ -222,7 +214,7 @@ void __devinit arch_init_sched_domains(v
}
#ifdef CONFIG_NUMA
- init_sched_build_groups(sched_group_allnodes, cpu_default_map,
+ init_sched_build_groups(sched_group_allnodes, *cpu_map,
&cpu_to_allnodes_group);
for (i = 0; i < MAX_NUMNODES; i++) {
@@ -233,12 +225,12 @@ void __devinit arch_init_sched_domains(v
cpumask_t covered = CPU_MASK_NONE;
int j;
- cpus_and(nodemask, nodemask, cpu_default_map);
+ cpus_and(nodemask, nodemask, *cpu_map);
if (cpus_empty(nodemask))
continue;
domainspan = sched_domain_node_span(i);
- cpus_and(domainspan, domainspan, cpu_default_map);
+ cpus_and(domainspan, domainspan, *cpu_map);
sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
sched_group_nodes[i] = sg;
@@ -266,7 +258,7 @@ void __devinit arch_init_sched_domains(v
int n = (i + j) % MAX_NUMNODES;
cpus_complement(notcovered, covered);
- cpus_and(tmp, notcovered, cpu_default_map);
+ cpus_and(tmp, notcovered, *cpu_map);
cpus_and(tmp, tmp, domainspan);
if (cpus_empty(tmp))
break;
@@ -293,7 +285,7 @@ void __devinit arch_init_sched_domains(v
#endif
/* Calculate CPU power for physical packages and nodes */
- for_each_cpu_mask(i, cpu_default_map) {
+ for_each_cpu_mask(i, *cpu_map) {
int power;
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
@@ -359,13 +351,35 @@ next_sg:
cpu_attach_domain(sd, i);
}
}
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ */
+void arch_init_sched_domains(const cpumask_t *cpu_map)
+{
+ cpumask_t cpu_default_map;
+
+ /*
+ * Setup mask for cpus without special case scheduling requirements.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
+ */
+ cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
+
+ build_sched_domains(&cpu_default_map);
+}
-void __devinit arch_destroy_sched_domains(void)
+void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
#ifdef CONFIG_NUMA
int i;
for (i = 0; i < MAX_NUMNODES; i++) {
+ cpumask_t nodemask = node_to_cpumask(i);
struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ if (cpus_empty(nodemask))
+ continue;
+
if (sg == NULL)
continue;
sg = sg->next;
* Re: [PATCH 1/3] Dynamic sched domains: sched changes
From: Matthew Dobson @ 2005-05-19 20:09 UTC (permalink / raw)
To: dino
Cc: Andrew Morton, Paul Jackson, Simon Derr, Dipankar Sarma, lkml,
lse-tech, Ingo Molnar, Jesse Barnes
Dinakar Guniguntala wrote:
> Hello Andrew,
>
> The following patches add dynamic sched domains functionality that was
> extensively discussed on lkml and lse-tech.
> I would like to see this added to -mm
>
> o The main advantage with this feature is that it ensures that the scheduler
> load balancing code only balances against the cpus that are in the sched
> domain as defined by an exclusive cpuset and not all of the cpus in the
> system. This removes any overhead due to load balancing code trying to
> pull tasks outside of the cpu exclusive cpuset only to be prevented by
> the tasks' cpus_allowed mask.
> o cpu exclusive cpusets are useful for servers running orthogonal
> workloads such as RT applications requiring low latency and HPC
> applications that are throughput sensitive
>
> o It provides a new API partition_sched_domains in sched.c
> that makes dynamic sched domains possible.
> o cpu_exclusive cpusets are now associated with a sched domain.
> Which means that the users can dynamically modify the sched domains
> through the cpuset file system interface
> o ia64 sched domain code has been updated to support this feature as well
> o Currently, this does not support hotplug. (However some of my tests
> indicate hotplug+preempt is currently broken)
> o I have tested it extensively on x86.
> o This should have very minimal impact on performance as none of
> the fast paths are affected
>
> -Dinakar
>
> Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
> Acked-by: Paul Jackson <pj@sgi.com>
> Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
>
> linux-2.6.12-rc4-mm1-1/include/linux/sched.h | 2
> linux-2.6.12-rc4-mm1-1/kernel/sched.c | 130 +++++++++++++++--------
> linux-2.6.12-rc4-mm1-2/Documentation/cpusets.txt | 16 ++
> linux-2.6.12-rc4-mm1-2/kernel/cpuset.c | 89 +++++++++++++--
> linux-2.6.12-rc4-mm1-3/arch/ia64/kernel/domain.c | 76 +++++++------
> 5 files changed, 225 insertions(+), 88 deletions(-)
Looks good to me, Dinakar!
Acked-by: Matthew Dobson <colpatch@us.ibm.com>
-Matt