* [PATCH 2.6.16-mm2 1/4] sched_domain - handle kmalloc failure
@ 2006-04-01 18:52 Srivatsa Vaddagiri
2006-04-02 1:35 ` Nick Piggin
0 siblings, 1 reply; 4+ messages in thread
From: Srivatsa Vaddagiri @ 2006-04-01 18:52 UTC (permalink / raw)
To: Andrew Morton, Ingo Molnar, Nick Piggin
Cc: suresh.b.siddha, Dinakar Guniguntala, pj, hawkes, linux-kernel
Andrew/Nick/Ingo,
Here's a different version of the patch that tries to handle mem
allocation failures in build_sched_domains by bailing out and cleaning up
thus-far allocated memory. The patch has a direct consequence that we disable
load balancing completely (even at sibling level) upon *any* memory allocation
failure. Is that acceptable?
Patch (against 2.6.16-mm2) to handle memory allocation failure in
build_sched_domains.
Signed-off-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>
diff -puN kernel/sched.c~sd_handle_error kernel/sched.c
--- linux-2.6.16-mm2/kernel/sched.c~sd_handle_error 2006-04-01 23:36:43.000000000 +0530
+++ linux-2.6.16-mm2-root/kernel/sched.c 2006-04-01 23:37:35.000000000 +0530
@@ -6061,11 +6061,56 @@ next_sg:
}
#endif
+/* Free memory allocated for various sched_group structures */
+static void free_sched_groups(const cpumask_t *cpu_map)
+{
+#ifdef CONFIG_NUMA
+ int i;
+ int cpu;
+
+ for_each_cpu_mask(cpu, *cpu_map) {
+ struct sched_group *sched_group_allnodes
+ = sched_group_allnodes_bycpu[cpu];
+ struct sched_group **sched_group_nodes
+ = sched_group_nodes_bycpu[cpu];
+
+ if (sched_group_allnodes) {
+ kfree(sched_group_allnodes);
+ sched_group_allnodes_bycpu[cpu] = NULL;
+ }
+
+ if (!sched_group_nodes)
+ continue;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ cpumask_t nodemask = node_to_cpumask(i);
+ struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ if (cpus_empty(nodemask))
+ continue;
+
+ if (sg == NULL)
+ continue;
+ sg = sg->next;
+next_sg:
+ oldsg = sg;
+ sg = sg->next;
+ kfree(oldsg);
+ if (oldsg != sched_group_nodes[i])
+ goto next_sg;
+ }
+ kfree(sched_group_nodes);
+ sched_group_nodes_bycpu[cpu] = NULL;
+ }
+#endif
+}
+
/*
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
*/
-void build_sched_domains(const cpumask_t *cpu_map)
+static int build_sched_domains(const cpumask_t *cpu_map)
{
int i;
#ifdef CONFIG_NUMA
@@ -6075,11 +6120,11 @@ void build_sched_domains(const cpumask_t
/*
* Allocate the per-node list of sched groups
*/
- sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+ sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
GFP_ATOMIC);
if (!sched_group_nodes) {
printk(KERN_WARNING "Can not alloc sched group node list\n");
- return;
+ return -ENOMEM;
}
sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
#endif
@@ -6105,7 +6150,7 @@ void build_sched_domains(const cpumask_t
if (!sched_group_allnodes) {
printk(KERN_WARNING
"Can not alloc allnodes sched group\n");
- break;
+ goto error;
}
sched_group_allnodes_bycpu[i]
= sched_group_allnodes;
@@ -6219,23 +6264,20 @@ void build_sched_domains(const cpumask_t
cpus_and(domainspan, domainspan, *cpu_map);
sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ if (!sg) {
+ printk(KERN_WARNING
+ "Can not alloc domain group for node %d\n", i);
+ goto error;
+ }
sched_group_nodes[i] = sg;
for_each_cpu_mask(j, nodemask) {
struct sched_domain *sd;
sd = &per_cpu(node_domains, j);
sd->groups = sg;
- if (sd->groups == NULL) {
- /* Turn off balancing if we have no groups */
- sd->flags = 0;
- }
- }
- if (!sg) {
- printk(KERN_WARNING
- "Can not alloc domain group for node %d\n", i);
- continue;
}
sg->cpu_power = 0;
sg->cpumask = nodemask;
+ sg->next = sg;
cpus_or(covered, covered, nodemask);
prev = sg;
@@ -6258,15 +6300,15 @@ void build_sched_domains(const cpumask_t
if (!sg) {
printk(KERN_WARNING
"Can not alloc domain group for node %d\n", j);
- break;
+ goto error;
}
sg->cpu_power = 0;
sg->cpumask = tmp;
+ sg->next = prev;
cpus_or(covered, covered, tmp);
prev->next = sg;
prev = sg;
}
- prev->next = sched_group_nodes[i];
}
#endif
@@ -6329,13 +6371,22 @@ void build_sched_domains(const cpumask_t
* Tune cache-hot values:
*/
calibrate_migration_costs(cpu_map);
+
+ return 0;
+
+#ifdef CONFIG_NUMA
+error:
+ free_sched_groups(cpu_map);
+ return -ENOMEM;
+#endif
}
/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
*/
-static void arch_init_sched_domains(const cpumask_t *cpu_map)
+static int arch_init_sched_domains(const cpumask_t *cpu_map)
{
cpumask_t cpu_default_map;
+ int err;
/*
* Setup mask for cpus without special case scheduling requirements.
@@ -6344,51 +6395,14 @@ static void arch_init_sched_domains(cons
*/
cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
- build_sched_domains(&cpu_default_map);
+ err = build_sched_domains(&cpu_default_map);
+
+ return err;
}
static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
-#ifdef CONFIG_NUMA
- int i;
- int cpu;
-
- for_each_cpu_mask(cpu, *cpu_map) {
- struct sched_group *sched_group_allnodes
- = sched_group_allnodes_bycpu[cpu];
- struct sched_group **sched_group_nodes
- = sched_group_nodes_bycpu[cpu];
-
- if (sched_group_allnodes) {
- kfree(sched_group_allnodes);
- sched_group_allnodes_bycpu[cpu] = NULL;
- }
-
- if (!sched_group_nodes)
- continue;
-
- for (i = 0; i < MAX_NUMNODES; i++) {
- cpumask_t nodemask = node_to_cpumask(i);
- struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
- cpus_and(nodemask, nodemask, *cpu_map);
- if (cpus_empty(nodemask))
- continue;
-
- if (sg == NULL)
- continue;
- sg = sg->next;
-next_sg:
- oldsg = sg;
- sg = sg->next;
- kfree(oldsg);
- if (oldsg != sched_group_nodes[i])
- goto next_sg;
- }
- kfree(sched_group_nodes);
- sched_group_nodes_bycpu[cpu] = NULL;
- }
-#endif
+ free_sched_groups(cpu_map);
}
/*
@@ -6413,9 +6427,10 @@ static void detach_destroy_domains(const
* correct sched domains
* Call with hotplug lock held
*/
-void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
{
cpumask_t change_map;
+ int err = 0;
cpus_and(*partition1, *partition1, cpu_online_map);
cpus_and(*partition2, *partition2, cpu_online_map);
@@ -6424,9 +6439,11 @@ void partition_sched_domains(cpumask_t *
/* Detach sched domains from all of the affected cpus */
detach_destroy_domains(&change_map);
if (!cpus_empty(*partition1))
- build_sched_domains(partition1);
- if (!cpus_empty(*partition2))
- build_sched_domains(partition2);
+ err = build_sched_domains(partition1);
+ if (!err && !cpus_empty(*partition2))
+ err = build_sched_domains(partition2);
+
+ return err;
}
#ifdef CONFIG_HOTPLUG_CPU
diff -puN include/linux/sched.h~sd_handle_error include/linux/sched.h
--- linux-2.6.16-mm2/include/linux/sched.h~sd_handle_error 2006-04-01 23:36:43.000000000 +0530
+++ linux-2.6.16-mm2-root/include/linux/sched.h 2006-04-01 23:36:43.000000000 +0530
@@ -632,7 +632,7 @@ struct sched_domain {
#endif
};
-extern void partition_sched_domains(cpumask_t *partition1,
+extern int partition_sched_domains(cpumask_t *partition1,
cpumask_t *partition2);
/*
_
--
Regards,
vatsa
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH 2.6.16-mm2 1/4] sched_domain - handle kmalloc failure
2006-04-01 18:52 [PATCH 2.6.16-mm2 1/4] sched_domain - handle kmalloc failure Srivatsa Vaddagiri
@ 2006-04-02 1:35 ` Nick Piggin
2006-04-02 5:25 ` Andrew Morton
0 siblings, 1 reply; 4+ messages in thread
From: Nick Piggin @ 2006-04-02 1:35 UTC (permalink / raw)
To: vatsa
Cc: Andrew Morton, Ingo Molnar, suresh.b.siddha, Dinakar Guniguntala,
pj, hawkes, linux-kernel
Srivatsa Vaddagiri wrote:
> Andrew/Nick/Ingo,
> Here's a different version of the patch that tries to handle mem
> allocation failures in build_sched_domains by bailing out and cleaning up
> thus-far allocated memory. The patch has a direct consequence that we disable
> load balancing completely (even at sibling level) upon *any* memory allocation
> failure. Is that acceptable?
>
I guess so. Ideal solution would be to make all required allocations first,
then fail the build_sched_domains and fall back to the old structure. But
I guess that gets pretty complicated.
In reality (and after your patch 2/4), I don't think the page allocator will
ever fail any of these allocations. In that case, would it be simpler just
to add a __GFP_NOFAIL here and forget about it?
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH 2.6.16-mm2 1/4] sched_domain - handle kmalloc failure
2006-04-02 1:35 ` Nick Piggin
@ 2006-04-02 5:25 ` Andrew Morton
2006-04-02 5:37 ` Nick Piggin
0 siblings, 1 reply; 4+ messages in thread
From: Andrew Morton @ 2006-04-02 5:25 UTC (permalink / raw)
To: Nick Piggin; +Cc: vatsa, mingo, suresh.b.siddha, dino, pj, hawkes, linux-kernel
Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> Srivatsa Vaddagiri wrote:
> > Andrew/Nick/Ingo,
> > Here's a different version of the patch that tries to handle mem
> > allocation failures in build_sched_domains by bailing out and cleaning up
> > thus-far allocated memory. The patch has a direct consequence that we disable
> > load balancing completely (even at sibling level) upon *any* memory allocation
> > failure. Is that acceptable?
> >
>
> I guess so. Ideal solution would be to make all required allocations first,
> then fail the build_sched_domains and fall back to the old structure.
I'd have thought so.
> But
> I guess that gets pretty complicated.
Maybe. Sometimes it's just a matter of running the code twice - first pass
is allocate-stuff mode, second pass is use-stuff mode.
> In reality (and after your patch 2/4), I don't think the page allocator will
> ever fail any of these allocations.
That is presently true, as long as the amounts being allocated are small
enough.
> In that case, would it be simpler just
> to add a __GFP_NOFAIL here and forget about it?
No new __GFP_NOFAILs, please.
The fact that the CPU addition will succeed, but it'll run forever more
with load balancing disabled still seems Just Wrong to me. We should
either completely succeed or completely fail.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH 2.6.16-mm2 1/4] sched_domain - handle kmalloc failure
2006-04-02 5:25 ` Andrew Morton
@ 2006-04-02 5:37 ` Nick Piggin
0 siblings, 0 replies; 4+ messages in thread
From: Nick Piggin @ 2006-04-02 5:37 UTC (permalink / raw)
To: Andrew Morton
Cc: vatsa, mingo, suresh.b.siddha, dino, pj, hawkes, linux-kernel
Andrew Morton wrote:
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>>In that case, would it be simpler just
>>to add a __GFP_NOFAIL here and forget about it?
>
>
> No new __GFP_NOFAILs, please.
It isn't a new one as such. It would simply make explicit the fact
that this code really can't handle allocation failures, and it is
presently depending on the allocator implementation to work.
> The fact that the CPU addition will succeed, but it'll run forever more
> with load balancing disabled still seems Just Wrong to me. We should
> either completely succeed or completely fail.
>
Yes. But we shouldn't partially fail and leave the machine crippled.
Hence, __GFP_NOFAIL as a good marker for someone who gets keen and
comes along to fix it up properly. If it were trivial to fix it, I
wouldn't suggest adding the __GFP_NOFAIL.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2006-04-02 9:15 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-04-01 18:52 [PATCH 2.6.16-mm2 1/4] sched_domain - handle kmalloc failure Srivatsa Vaddagiri
2006-04-02 1:35 ` Nick Piggin
2006-04-02 5:25 ` Andrew Morton
2006-04-02 5:37 ` Nick Piggin
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox