[PATCH -rt 0/5] hotplug fixes

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH -rt 0/5] hotplug fixes
@ 2008-06-10 11:12 Peter Zijlstra
  2008-06-10 11:13 ` [PATCH -rt 1/5] cpu-hotplug: vs slab Peter Zijlstra
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-06-10 11:12 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Thomas Gleixner, Steven Rostedt, Clark Williams,
	Gregory Haskins, Paul E. McKenney, Gautham R Shenoy, Pekka Enberg,
	Arnaldo Carvalho de Melo

Hi, 

the following patches enable me to 'almost' build a kernel while
hotplugging like crazy.

while :; do 
  echo 0 > /sys/devices/system/cpu/cpu1/online; 
  echo 1 > /sys/devices/system/cpu/cpu1/online; 
done

vs

make -j5

There still seems to be a bit of weirdness left, but I thought it time to post
these patches so that others can have a peek.

(patches against .24-rt)


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH -rt 1/5] cpu-hotplug: vs slab
  2008-06-10 11:12 [PATCH -rt 0/5] hotplug fixes Peter Zijlstra
@ 2008-06-10 11:13 ` Peter Zijlstra
  2008-06-10 11:13 ` [PATCH -rt 2/5] cpu-hotplug: vs page_alloc Peter Zijlstra
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-06-10 11:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Thomas Gleixner, Steven Rostedt, Clark Williams,
	Gregory Haskins, Paul E. McKenney, Gautham R Shenoy, Pekka Enberg,
	Arnaldo Carvalho de Melo, Peter Zijlstra

[-- Attachment #1: hotplug-slab.patch --]
[-- Type: text/plain, Size: 10327 bytes --]

Fix up the slab allocator to be cpu-hotplug safe (again, pure -rt regression).

On -rt we protect per-cpu state by locks instead of disabling preemption/irqs.
This keeps all the code preemptible at the cost of possible remote memory
access.

The race was that cpu-hotplug - which assumes it is cpu-local and non-
preemptible - didn't take the per-cpu lock.

This also means that the normal lock acquire needs to be aware of cpus getting
off-lined while it's waiting.

Clean up some of the macro mess while we're there.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 mm/slab.c |  170 ++++++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 122 insertions(+), 48 deletions(-)

Index: linux-2.6.24.7.noarch/mm/slab.c
===================================================================
--- linux-2.6.24.7.noarch.orig/mm/slab.c
+++ linux-2.6.24.7.noarch/mm/slab.c
@@ -125,43 +125,116 @@
  * the CPU number of the lock there.
  */
 #ifndef CONFIG_PREEMPT_RT
+
 # define slab_irq_disable(cpu) \
 	do { local_irq_disable(); (cpu) = smp_processor_id(); } while (0)
 # define slab_irq_enable(cpu)		local_irq_enable()
+
+static inline void slab_irq_disable_this_rt(int cpu)
+{
+}
+
+static inline void slab_irq_enable_rt(int cpu)
+{
+}
+
 # define slab_irq_save(flags, cpu) \
 	do { local_irq_save(flags); (cpu) = smp_processor_id(); } while (0)
 # define slab_irq_restore(flags, cpu)	local_irq_restore(flags)
+
 /*
  * In the __GFP_WAIT case we enable/disable interrupts on !PREEMPT_RT,
  * which has no per-CPU locking effect since we are holding the cache
  * lock in that case already.
- *
- * (On PREEMPT_RT, these are NOPs, but we have to drop/get the irq locks.)
  */
-# define slab_irq_disable_nort(cpu)	slab_irq_disable(cpu)
-# define slab_irq_enable_nort(cpu)	slab_irq_enable(cpu)
-# define slab_irq_disable_rt(flags)	do { (void)(flags); } while (0)
-# define slab_irq_enable_rt(flags)	do { (void)(flags); } while (0)
+static void slab_irq_enable_GFP_WAIT(gfp_t flags, int *cpu)
+{
+	if (flags & __GFP_WAIT)
+		local_irq_enable();
+}
+
+static void slab_irq_disable_GFP_WAIT(gfp_t flags, int *cpu)
+{
+	if (flags & __GFP_WAIT)
+		local_irq_disable();
+}
+
 # define slab_spin_lock_irq(lock, cpu) \
 	do { spin_lock_irq(lock); (cpu) = smp_processor_id(); } while (0)
-# define slab_spin_unlock_irq(lock, cpu) \
-					spin_unlock_irq(lock)
+# define slab_spin_unlock_irq(lock, cpu) spin_unlock_irq(lock)
+
 # define slab_spin_lock_irqsave(lock, flags, cpu) \
 	do { spin_lock_irqsave(lock, flags); (cpu) = smp_processor_id(); } while (0)
 # define slab_spin_unlock_irqrestore(lock, flags, cpu) \
 	do { spin_unlock_irqrestore(lock, flags); } while (0)
-#else
+
+#else /* CONFIG_PREEMPT_RT */
+
+/*
+ * Instead of serializing the per-cpu state by disabling interrupts we do so
+ * by a lock. This keeps the code preemptable - albeit at the cost of remote
+ * memory access when the task does get migrated away.
+ */
 DEFINE_PER_CPU_LOCKED(int, slab_irq_locks) = { 0, };
-# define slab_irq_disable(cpu)		(void)get_cpu_var_locked(slab_irq_locks, &(cpu))
-# define slab_irq_enable(cpu)		put_cpu_var_locked(slab_irq_locks, cpu)
+
+static void _slab_irq_disable(int *cpu)
+{
+	int this_cpu;
+	spinlock_t *lock;
+
+again:
+	this_cpu = raw_smp_processor_id();
+	lock = &__get_cpu_lock(slab_irq_locks, this_cpu);
+
+	spin_lock(lock);
+	if (unlikely(!cpu_online(this_cpu))) {
+		/*
+		 * Bail - the cpu got hot-unplugged while we were waiting
+		 * for the lock.
+		 */
+		spin_unlock(lock);
+		goto again;
+	}
+
+	*cpu = this_cpu;
+}
+
+#define slab_irq_disable(cpu) _slab_irq_disable(&(cpu))
+
+static inline void slab_irq_enable(int cpu)
+{
+	spin_unlock(&__get_cpu_lock(slab_irq_locks, cpu));
+}
+
+static inline void slab_irq_disable_this_rt(int cpu)
+{
+	spin_lock(&__get_cpu_lock(slab_irq_locks, cpu));
+}
+
+static inline void slab_irq_enable_rt(int cpu)
+{
+	spin_unlock(&__get_cpu_lock(slab_irq_locks, cpu));
+}
+
 # define slab_irq_save(flags, cpu) \
 	do { slab_irq_disable(cpu); (void) (flags); } while (0)
 # define slab_irq_restore(flags, cpu) \
 	do { slab_irq_enable(cpu); (void) (flags); } while (0)
-# define slab_irq_disable_rt(cpu)	slab_irq_disable(cpu)
-# define slab_irq_enable_rt(cpu)	slab_irq_enable(cpu)
-# define slab_irq_disable_nort(cpu)	do { } while (0)
-# define slab_irq_enable_nort(cpu)	do { } while (0)
+
+/*
+ * On PREEMPT_RT we have to drop the locks unconditionally to avoid lock
+ * recursion on the cache_grow()->alloc_slabmgmt() path.
+ */
+static void slab_irq_enable_GFP_WAIT(gfp_t flags, int *cpu)
+{
+	slab_irq_enable(*cpu);
+}
+
+static void slab_irq_disable_GFP_WAIT(gfp_t flags, int *cpu)
+{
+	slab_irq_disable(*cpu);
+}
+
 # define slab_spin_lock_irq(lock, cpu) \
 		do { slab_irq_disable(cpu); spin_lock(lock); } while (0)
 # define slab_spin_unlock_irq(lock, cpu) \
@@ -170,7 +243,8 @@ DEFINE_PER_CPU_LOCKED(int, slab_irq_lock
 	do { slab_irq_disable(cpu); spin_lock_irqsave(lock, flags); } while (0)
 # define slab_spin_unlock_irqrestore(lock, flags, cpu) \
 	do { spin_unlock_irqrestore(lock, flags); slab_irq_enable(cpu); } while (0)
-#endif
+
+#endif /* CONFIG_PREEMPT_RT */
 
 /*
  * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
@@ -1221,7 +1295,7 @@ cache_free_alien(struct kmem_cache *cach
 }
 #endif
 
-static void __cpuinit cpuup_canceled(long cpu)
+static void __cpuinit cpuup_canceled(int cpu)
 {
 	struct kmem_cache *cachep;
 	struct kmem_list3 *l3 = NULL;
@@ -1231,7 +1305,7 @@ static void __cpuinit cpuup_canceled(lon
 		struct array_cache *nc;
 		struct array_cache *shared;
 		struct array_cache **alien;
-		int this_cpu;
+		int orig_cpu = cpu;
 		cpumask_t mask;
 
 		mask = node_to_cpumask(node);
@@ -1243,31 +1317,30 @@ static void __cpuinit cpuup_canceled(lon
 		if (!l3)
 			goto free_array_cache;
 
-		slab_spin_lock_irq(&l3->list_lock, this_cpu);
+		spin_lock_irq(&l3->list_lock);
 
 		/* Free limit for this kmem_list3 */
 		l3->free_limit -= cachep->batchcount;
 		if (nc)
 			free_block(cachep, nc->entry, nc->avail, node,
-				   &this_cpu);
+				   &cpu);
 
 		if (!cpus_empty(mask)) {
-			slab_spin_unlock_irq(&l3->list_lock,
-					     this_cpu);
+			spin_unlock_irq(&l3->list_lock);
 			goto free_array_cache;
 		}
 
 		shared = l3->shared;
 		if (shared) {
 			free_block(cachep, shared->entry,
-				   shared->avail, node, &this_cpu);
+				   shared->avail, node, &cpu);
 			l3->shared = NULL;
 		}
 
 		alien = l3->alien;
 		l3->alien = NULL;
 
-		slab_spin_unlock_irq(&l3->list_lock, this_cpu);
+		spin_unlock_irq(&l3->list_lock);
 
 		kfree(shared);
 		if (alien) {
@@ -1276,6 +1349,7 @@ static void __cpuinit cpuup_canceled(lon
 		}
 free_array_cache:
 		kfree(nc);
+		BUG_ON(cpu != orig_cpu);
 	}
 	/*
 	 * In the previous loop, all the objects were freed to
@@ -1290,13 +1364,12 @@ free_array_cache:
 	}
 }
 
-static int __cpuinit cpuup_prepare(long cpu)
+static int __cpuinit cpuup_prepare(int cpu)
 {
 	struct kmem_cache *cachep;
 	struct kmem_list3 *l3 = NULL;
 	int node = cpu_to_node(cpu);
 	const int memsize = sizeof(struct kmem_list3);
-	int this_cpu;
 
 	/*
 	 * We need to do this right in the beginning since
@@ -1327,11 +1400,11 @@ static int __cpuinit cpuup_prepare(long 
 			cachep->nodelists[node] = l3;
 		}
 
-		slab_spin_lock_irq(&cachep->nodelists[node]->list_lock, this_cpu);
+		spin_lock_irq(&cachep->nodelists[node]->list_lock);
 		cachep->nodelists[node]->free_limit =
 			(1 + nr_cpus_node(node)) *
 			cachep->batchcount + cachep->num;
-		slab_spin_unlock_irq(&cachep->nodelists[node]->list_lock, this_cpu);
+		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
 	}
 
 	/*
@@ -1368,7 +1441,7 @@ static int __cpuinit cpuup_prepare(long 
 		l3 = cachep->nodelists[node];
 		BUG_ON(!l3);
 
-		slab_spin_lock_irq(&l3->list_lock, this_cpu);
+		spin_lock_irq(&l3->list_lock);
 		if (!l3->shared) {
 			/*
 			 * We are serialised from CPU_DEAD or
@@ -1383,7 +1456,7 @@ static int __cpuinit cpuup_prepare(long 
 			alien = NULL;
 		}
 #endif
-		slab_spin_unlock_irq(&l3->list_lock, this_cpu);
+		spin_unlock_irq(&l3->list_lock);
 		kfree(shared);
 		free_alien_cache(alien);
 	}
@@ -1402,7 +1475,18 @@ static int __cpuinit cpuup_callback(stru
 	switch (action) {
 	case CPU_LOCK_ACQUIRE:
 		mutex_lock(&cache_chain_mutex);
+		return NOTIFY_OK;
+	case CPU_LOCK_RELEASE:
+		mutex_unlock(&cache_chain_mutex);
+		return NOTIFY_OK;
+
+	default:
 		break;
+	}
+
+	slab_irq_disable_this_rt(cpu);
+
+	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
 		err = cpuup_prepare(cpu);
@@ -1444,10 +1528,10 @@ static int __cpuinit cpuup_callback(stru
 	case CPU_UP_CANCELED_FROZEN:
 		cpuup_canceled(cpu);
 		break;
-	case CPU_LOCK_RELEASE:
-		mutex_unlock(&cache_chain_mutex);
-		break;
 	}
+
+	slab_irq_enable_rt(cpu);
+
 	return err ? NOTIFY_BAD : NOTIFY_OK;
 }
 
@@ -2898,9 +2982,7 @@ static int cache_grow(struct kmem_cache 
 
 	offset *= cachep->colour_off;
 
-	if (local_flags & __GFP_WAIT)
-		slab_irq_enable_nort(*this_cpu);
-	slab_irq_enable_rt(*this_cpu);
+	slab_irq_enable_GFP_WAIT(local_flags, this_cpu);
 
 	/*
 	 * The test for missing atomic flag is performed here, rather than
@@ -2930,9 +3012,7 @@ static int cache_grow(struct kmem_cache 
 
 	cache_init_objs(cachep, slabp);
 
-	slab_irq_disable_rt(*this_cpu);
-	if (local_flags & __GFP_WAIT)
-		slab_irq_disable_nort(*this_cpu);
+	slab_irq_disable_GFP_WAIT(local_flags, this_cpu);
 
 	check_irq_off();
 	spin_lock(&l3->list_lock);
@@ -2946,9 +3026,7 @@ static int cache_grow(struct kmem_cache 
 opps1:
 	kmem_freepages(cachep, objp);
 failed:
-	slab_irq_disable_rt(*this_cpu);
-	if (local_flags & __GFP_WAIT)
-		slab_irq_disable_nort(*this_cpu);
+	slab_irq_disable_GFP_WAIT(local_flags, this_cpu);
 	return 0;
 }
 
@@ -3395,16 +3473,12 @@ retry:
 		 * We may trigger various forms of reclaim on the allowed
 		 * set and go into memory reserves if necessary.
 		 */
-		if (local_flags & __GFP_WAIT)
-			slab_irq_enable_nort(*this_cpu);
-		slab_irq_enable_rt(*this_cpu);
+		slab_irq_enable_GFP_WAIT(local_flags, this_cpu);
 
 		kmem_flagcheck(cache, flags);
 		obj = kmem_getpages(cache, flags, -1);
 
-		slab_irq_disable_rt(*this_cpu);
-		if (local_flags & __GFP_WAIT)
-			slab_irq_disable_nort(*this_cpu);
+		slab_irq_disable_GFP_WAIT(local_flags, this_cpu);
 
 		if (obj) {
 			/*

-- 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH -rt 2/5] cpu-hotplug: vs page_alloc
  2008-06-10 11:12 [PATCH -rt 0/5] hotplug fixes Peter Zijlstra
  2008-06-10 11:13 ` [PATCH -rt 1/5] cpu-hotplug: vs slab Peter Zijlstra
@ 2008-06-10 11:13 ` Peter Zijlstra
  2008-06-10 11:13 ` [PATCH -rt 3/5] cpu-hotplug: cpu_up vs preempt-rt Peter Zijlstra
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-06-10 11:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Thomas Gleixner, Steven Rostedt, Clark Williams,
	Gregory Haskins, Paul E. McKenney, Gautham R Shenoy, Pekka Enberg,
	Arnaldo Carvalho de Melo, Peter Zijlstra

[-- Attachment #1: hotplug-page_alloc.patch --]
[-- Type: text/plain, Size: 1849 bytes --]

On -rt we protect per-cpu state by locks instead of disabling preemption/irqs.
This keeps all the code preemptible at the cost of possible remote memory
access.

The race was that cpu-hotplug - which assumes it is cpu-local and non-
preemptible - didn't take the per-cpu lock.

This also means that the normal lock acquire needs to be aware of cpus getting
off-lined while it's waiting.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 mm/page_alloc.c |   24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

Index: linux-2.6.24.7.noarch/mm/page_alloc.c
===================================================================
--- linux-2.6.24.7.noarch.orig/mm/page_alloc.c
+++ linux-2.6.24.7.noarch/mm/page_alloc.c
@@ -176,7 +176,19 @@ static inline void __lock_cpu_pcp(unsign
 static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu)
 {
 #ifdef CONFIG_PREEMPT_RT
-	(void)get_cpu_var_locked(pcp_locks, this_cpu);
+	spinlock_t *lock;
+	int cpu;
+
+again:
+	cpu = raw_smp_processor_id();
+	lock = &__get_cpu_lock(pcp_locks, cpu);
+
+	spin_lock(lock);
+	if (unlikely(!cpu_online(cpu))) {
+		spin_unlock(lock);
+		goto again;
+	}
+	*this_cpu = cpu;
 	flags = 0;
 #else
 	local_irq_save(*flags);
@@ -2781,12 +2793,17 @@ static inline void free_zone_pagesets(in
 	struct zone *zone;
 
 	for_each_zone(zone) {
-		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+		struct per_cpu_pageset *pset;
+		unsigned long flags;
+
+		__lock_cpu_pcp(&flags, cpu);
+		pset = zone_pcp(zone, cpu);
+		zone_pcp(zone, cpu) = NULL;
+		unlock_cpu_pcp(flags, cpu);
 
 		/* Free per_cpu_pageset if it is slab allocated */
 		if (pset != &boot_pageset[cpu])
 			kfree(pset);
-		zone_pcp(zone, cpu) = NULL;
 	}
 }
 
@@ -2812,6 +2829,7 @@ static int __cpuinit pageset_cpuup_callb
 	default:
 		break;
 	}
+
 	return ret;
 }
 

-- 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH -rt 3/5] cpu-hotplug: cpu_up vs preempt-rt
  2008-06-10 11:12 [PATCH -rt 0/5] hotplug fixes Peter Zijlstra
  2008-06-10 11:13 ` [PATCH -rt 1/5] cpu-hotplug: vs slab Peter Zijlstra
  2008-06-10 11:13 ` [PATCH -rt 2/5] cpu-hotplug: vs page_alloc Peter Zijlstra
@ 2008-06-10 11:13 ` Peter Zijlstra
  2008-06-10 11:13 ` [PATCH -rt 4/5] rcu: backport RCU cpu hotplug support Peter Zijlstra
  2008-06-10 11:13 ` [PATCH -rt 5/5] cpu-hotplug: cpu_down vs preempt-rt Peter Zijlstra
  4 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-06-10 11:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Thomas Gleixner, Steven Rostedt, Clark Williams,
	Gregory Haskins, Paul E. McKenney, Gautham R Shenoy, Pekka Enberg,
	Arnaldo Carvalho de Melo, Peter Zijlstra

[-- Attachment #1: hotplug-smp-bootstrap.patch --]
[-- Type: text/plain, Size: 5218 bytes --]

On PREEMPT_RT the allocators use preemptible locks. CPU bootstrap must run with
IRQs disabled because there are no IRQ/exception stacks yet; these stacks are
allocated atomically, which is not possible on -rt.

Solve this by allocating these stacks on the boot cpu (which already has its
stacks).

This also allows cpu-up to fail instead of panic on OOM scenarios.

I suspect it also fixes a memory leak, as I cannot find the place where 
cpu_down frees these cpu stacks, but each cpu_up used to allocate new ones.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/x86/kernel/setup64.c      |   31 ++--------------------
 arch/x86/kernel/smpboot_64.c   |   57 +++++++++++++++++++++++++++++++++++++++++
 include/asm-x86/processor_64.h |    4 ++
 3 files changed, 65 insertions(+), 27 deletions(-)

Index: linux-2.6.24.7.noarch/arch/x86/kernel/setup64.c
===================================================================
--- linux-2.6.24.7.noarch.orig/arch/x86/kernel/setup64.c
+++ linux-2.6.24.7.noarch/arch/x86/kernel/setup64.c
@@ -137,19 +137,12 @@ void pda_init(int cpu)
 		pda->pcurrent = &init_task;
 		pda->irqstackptr = boot_cpu_stack; 
 	} else {
-		pda->irqstackptr = (char *)
-			__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
-		if (!pda->irqstackptr)
-			panic("cannot allocate irqstack for cpu %d", cpu); 
+		pda->irqstackptr = (char *)per_cpu(init_tss, cpu).irqstack;
 	}
 
-
 	pda->irqstackptr += IRQSTACKSIZE-64;
 } 
 
-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
-__attribute__((section(".bss.page_aligned")));
-
 extern asmlinkage void ignore_sysret(void);
 
 /* May not be marked __init: used by software suspend */
@@ -203,15 +196,13 @@ void __cpuinit cpu_init (void)
 	struct tss_struct *t = &per_cpu(init_tss, cpu);
 	struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
 	unsigned long v; 
-	char *estacks = NULL; 
 	struct task_struct *me;
 	int i;
 
 	/* CPU 0 is initialised in head64.c */
 	if (cpu != 0) {
 		pda_init(cpu);
-	} else 
-		estacks = boot_exception_stacks; 
+	}
 
 	me = current;
 
@@ -245,22 +236,8 @@ void __cpuinit cpu_init (void)
 	/*
 	 * set up and load the per-CPU TSS
 	 */
-	for (v = 0; v < N_EXCEPTION_STACKS; v++) {
-		static const unsigned int order[N_EXCEPTION_STACKS] = {
-			[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
-#if DEBUG_STACK > 0
-			[DEBUG_STACK - 1] = DEBUG_STACK_ORDER
-#endif
-		};
-		if (cpu) {
-			estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
-			if (!estacks)
-				panic("Cannot allocate exception stack %ld %d\n",
-				      v, cpu); 
-		}
-		estacks += PAGE_SIZE << order[v];
-		orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
-	}
+	for (v = 0; v < N_EXCEPTION_STACKS; v++)
+		orig_ist->ist[v] = t->ist[v] = (unsigned long)t->estacks[v];
 
 	t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
 	/*
Index: linux-2.6.24.7.noarch/arch/x86/kernel/smpboot_64.c
===================================================================
--- linux-2.6.24.7.noarch.orig/arch/x86/kernel/smpboot_64.c
+++ linux-2.6.24.7.noarch/arch/x86/kernel/smpboot_64.c
@@ -535,6 +535,60 @@ static void __cpuinit do_fork_idle(struc
 	complete(&c_idle->done);
 }
 
+static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
+__attribute__((section(".bss.page_aligned")));
+
+static int __cpuinit allocate_stacks(int cpu)
+{
+	static const unsigned int order[N_EXCEPTION_STACKS] = {
+		[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+#if DEBUG_STACK > 0
+		[DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+#endif
+	};
+	struct tss_struct *t = &per_cpu(init_tss, cpu);
+	int node = cpu_to_node(cpu);
+	struct page *page;
+	char *estack;
+	int v;
+
+	if (cpu && !t->irqstack) {
+		page = alloc_pages_node(node, GFP_KERNEL,
+				IRQSTACK_ORDER);
+		if (!page)
+			goto fail_oom;
+		t->irqstack = page_address(page);
+	}
+
+	if (!cpu)
+		estack = boot_exception_stacks;
+
+	for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+		if (t->estacks[v])
+			continue;
+
+		if (cpu) {
+			page = alloc_pages_node(node, GFP_KERNEL, order[v]);
+			if (!page)
+				goto fail_oom;
+			estack = page_address(page);
+		}
+		estack += PAGE_SIZE << order[v];
+		/*
+		 * XXX: can we set t->ist[v] here directly, or will that be
+		 * modified later? - the existence of orig_ist seems to suggest
+		 * it _can_ be modified, which would imply we'd need to reset
+		 * it.
+		 */
+		t->estacks[v] = estack;
+	}
+
+	return 0;
+
+fail_oom:
+	return -ENOMEM;
+}
+
 /*
  * Boot one CPU.
  */
@@ -605,6 +659,9 @@ static int __cpuinit do_boot_cpu(int cpu
 		return PTR_ERR(c_idle.idle);
 	}
 
+	if (allocate_stacks(cpu))
+		return -ENOMEM;
+
 	set_idle_for_cpu(cpu, c_idle.idle);
 
 do_rest:
Index: linux-2.6.24.7.noarch/include/asm-x86/processor_64.h
===================================================================
--- linux-2.6.24.7.noarch.orig/include/asm-x86/processor_64.h
+++ linux-2.6.24.7.noarch/include/asm-x86/processor_64.h
@@ -197,6 +197,10 @@ struct tss_struct {
 	 * 8 bytes, for an extra "long" of ~0UL
 	 */
 	unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+
+	void *irqstack;
+	void *estacks[N_EXCEPTION_STACKS];
+
 } __attribute__((packed)) ____cacheline_aligned;
 
 

-- 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH -rt 4/5] rcu: backport RCU cpu hotplug support
  2008-06-10 11:12 [PATCH -rt 0/5] hotplug fixes Peter Zijlstra
                   ` (2 preceding siblings ...)
  2008-06-10 11:13 ` [PATCH -rt 3/5] cpu-hotplug: cpu_up vs preempt-rt Peter Zijlstra
@ 2008-06-10 11:13 ` Peter Zijlstra
  2008-06-10 15:15   ` Paul E. McKenney
  2008-06-10 11:13 ` [PATCH -rt 5/5] cpu-hotplug: cpu_down vs preempt-rt Peter Zijlstra
  4 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2008-06-10 11:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Thomas Gleixner, Steven Rostedt, Clark Williams,
	Gregory Haskins, Paul E. McKenney, Gautham R Shenoy, Pekka Enberg,
	Arnaldo Carvalho de Melo, Peter Zijlstra

[-- Attachment #1: hotplug-rcu.patch --]
[-- Type: text/plain, Size: 3233 bytes --]

backport the RCU cpu-hotplug support from .26-rc to .24-rt

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/rcupreempt.c |   58 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 55 insertions(+), 3 deletions(-)

Index: linux-2.6.24.7.noarch/kernel/rcupreempt.c
===================================================================
--- linux-2.6.24.7.noarch.orig/kernel/rcupreempt.c
+++ linux-2.6.24.7.noarch/kernel/rcupreempt.c
@@ -820,6 +820,13 @@ void rcu_offline_cpu_rt(int cpu)
 		smp_mb();  /* Subsequent RCU read-side critical sections */
 			   /*  seen -after- acknowledgement. */
 	}
+
+	__get_cpu_var(rcu_flipctr)[0] += per_cpu(rcu_flipctr, cpu)[0];
+	__get_cpu_var(rcu_flipctr)[1] += per_cpu(rcu_flipctr, cpu)[1];
+
+	per_cpu(rcu_flipctr, cpu)[0] = 0;
+	per_cpu(rcu_flipctr, cpu)[1] = 0;
+
 	cpu_clear(cpu, rcu_cpu_online_map);
 	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq);
 
@@ -833,8 +840,9 @@ void rcu_offline_cpu_rt(int cpu)
 	 * fix.
 	 */
 
+	local_irq_save(oldirq);
 	rdp = RCU_DATA_ME();
-	spin_lock_irqsave(&rdp->lock, oldirq);
+	spin_lock(&rdp->lock);
 	*rdp->nexttail = list;
 	if (list)
 		rdp->nexttail = tail;
@@ -866,9 +874,11 @@ void rcu_process_callbacks_rt(struct sof
 {
 	unsigned long flags;
 	struct rcu_head *next, *list;
-	struct rcu_data *rdp = RCU_DATA_ME();
+	struct rcu_data *rdp;
 
-	spin_lock_irqsave(&rdp->lock, flags);
+	local_irq_save(flags);
+	rdp = RCU_DATA_ME();
+	spin_lock(&rdp->lock);
 	list = rdp->donelist;
 	if (list == NULL) {
 		spin_unlock_irqrestore(&rdp->lock, flags);
@@ -951,6 +961,32 @@ int rcu_pending_rt(int cpu)
 	return 0;
 }
 
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+                                unsigned long action, void *hcpu)
+{
+        long cpu = (long)hcpu;
+
+        switch (action) {
+        case CPU_UP_PREPARE:
+        case CPU_UP_PREPARE_FROZEN:
+                rcu_online_cpu_rt(cpu);
+                break;
+        case CPU_UP_CANCELED:
+        case CPU_UP_CANCELED_FROZEN:
+        case CPU_DEAD:
+        case CPU_DEAD_FROZEN:
+                rcu_offline_cpu_rt(cpu);
+                break;
+        default:
+                break;
+        }
+        return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata rcu_nb = {
+        .notifier_call = rcu_cpu_notify,
+};
+
 void __init rcu_init_rt(void)
 {
 	int cpu;
@@ -972,6 +1008,22 @@ void __init rcu_init_rt(void)
 		rdp->donetail = &rdp->donelist;
 	}
 	rcu_preempt_boost_init();
+	register_cpu_notifier(&rcu_nb);
+
+	/*
+         * We don't need protection against CPU-Hotplug here
+         * since
+         * a) If a CPU comes online while we are iterating over the
+         *    cpu_online_map below, we would only end up making a
+         *    duplicate call to rcu_online_cpu() which sets the corresponding
+         *    CPU's mask in the rcu_cpu_online_map.
+         *
+         * b) A CPU cannot go offline at this point in time since the user
+         *    does not have access to the sysfs interface, nor do we
+         *    suspend the system.
+         */
+        for_each_online_cpu(cpu)
+                rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
 }
 
 /*

-- 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH -rt 4/5] rcu: backport RCU cpu hotplug support
  2008-06-10 11:13 ` [PATCH -rt 4/5] rcu: backport RCU cpu hotplug support Peter Zijlstra
@ 2008-06-10 15:15   ` Paul E. McKenney
  0 siblings, 0 replies; 11+ messages in thread
From: Paul E. McKenney @ 2008-06-10 15:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Ingo Molnar, Thomas Gleixner, Steven Rostedt,
	Clark Williams, Gregory Haskins, Gautham R Shenoy, Pekka Enberg,
	Arnaldo Carvalho de Melo

On Tue, Jun 10, 2008 at 01:13:03PM +0200, Peter Zijlstra wrote:
> backport the RCU cpu-hotplug support from .26-rc to .24-rt

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  kernel/rcupreempt.c |   58 +++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 55 insertions(+), 3 deletions(-)
> 
> Index: linux-2.6.24.7.noarch/kernel/rcupreempt.c
> ===================================================================
> --- linux-2.6.24.7.noarch.orig/kernel/rcupreempt.c
> +++ linux-2.6.24.7.noarch/kernel/rcupreempt.c
> @@ -820,6 +820,13 @@ void rcu_offline_cpu_rt(int cpu)
>  		smp_mb();  /* Subsequent RCU read-side critical sections */
>  			   /*  seen -after- acknowledgement. */
>  	}
> +
> +	__get_cpu_var(rcu_flipctr)[0] += per_cpu(rcu_flipctr, cpu)[0];
> +	__get_cpu_var(rcu_flipctr)[1] += per_cpu(rcu_flipctr, cpu)[1];
> +
> +	per_cpu(rcu_flipctr, cpu)[0] = 0;
> +	per_cpu(rcu_flipctr, cpu)[1] = 0;
> +
>  	cpu_clear(cpu, rcu_cpu_online_map);
>  	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq);
> 
> @@ -833,8 +840,9 @@ void rcu_offline_cpu_rt(int cpu)
>  	 * fix.
>  	 */
> 
> +	local_irq_save(oldirq);
>  	rdp = RCU_DATA_ME();
> -	spin_lock_irqsave(&rdp->lock, oldirq);
> +	spin_lock(&rdp->lock);
>  	*rdp->nexttail = list;
>  	if (list)
>  		rdp->nexttail = tail;
> @@ -866,9 +874,11 @@ void rcu_process_callbacks_rt(struct sof
>  {
>  	unsigned long flags;
>  	struct rcu_head *next, *list;
> -	struct rcu_data *rdp = RCU_DATA_ME();
> +	struct rcu_data *rdp;
> 
> -	spin_lock_irqsave(&rdp->lock, flags);
> +	local_irq_save(flags);
> +	rdp = RCU_DATA_ME();
> +	spin_lock(&rdp->lock);
>  	list = rdp->donelist;
>  	if (list == NULL) {
>  		spin_unlock_irqrestore(&rdp->lock, flags);
> @@ -951,6 +961,32 @@ int rcu_pending_rt(int cpu)
>  	return 0;
>  }
> 
> +static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
> +                                unsigned long action, void *hcpu)
> +{
> +        long cpu = (long)hcpu;
> +
> +        switch (action) {
> +        case CPU_UP_PREPARE:
> +        case CPU_UP_PREPARE_FROZEN:
> +                rcu_online_cpu_rt(cpu);
> +                break;
> +        case CPU_UP_CANCELED:
> +        case CPU_UP_CANCELED_FROZEN:
> +        case CPU_DEAD:
> +        case CPU_DEAD_FROZEN:
> +                rcu_offline_cpu_rt(cpu);
> +                break;
> +        default:
> +                break;
> +        }
> +        return NOTIFY_OK;
> +}
> +
> +static struct notifier_block __cpuinitdata rcu_nb = {
> +        .notifier_call = rcu_cpu_notify,
> +};
> +
>  void __init rcu_init_rt(void)
>  {
>  	int cpu;
> @@ -972,6 +1008,22 @@ void __init rcu_init_rt(void)
>  		rdp->donetail = &rdp->donelist;
>  	}
>  	rcu_preempt_boost_init();
> +	register_cpu_notifier(&rcu_nb);
> +
> +	/*
> +         * We don't need protection against CPU-Hotplug here
> +         * since
> +         * a) If a CPU comes online while we are iterating over the
> +         *    cpu_online_map below, we would only end up making a
> +         *    duplicate call to rcu_online_cpu() which sets the corresponding
> +         *    CPU's mask in the rcu_cpu_online_map.
> +         *
> +         * b) A CPU cannot go offline at this point in time since the user
> +         *    does not have access to the sysfs interface, nor do we
> +         *    suspend the system.
> +         */
> +        for_each_online_cpu(cpu)
> +                rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
>  }
> 
>  /*
> 
> -- 
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH -rt 5/5] cpu-hotplug: cpu_down vs preempt-rt
  2008-06-10 11:12 [PATCH -rt 0/5] hotplug fixes Peter Zijlstra
                   ` (3 preceding siblings ...)
  2008-06-10 11:13 ` [PATCH -rt 4/5] rcu: backport RCU cpu hotplug support Peter Zijlstra
@ 2008-06-10 11:13 ` Peter Zijlstra
  2008-06-10 15:33   ` Paul E. McKenney
  2008-06-11  6:53   ` [PATCH -rt 6/5] " Peter Zijlstra
  4 siblings, 2 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-06-10 11:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Thomas Gleixner, Steven Rostedt, Clark Williams,
	Gregory Haskins, Paul E. McKenney, Gautham R Shenoy, Pekka Enberg,
	Arnaldo Carvalho de Melo, Peter Zijlstra

[-- Attachment #1: hotplug-idle_task_exit.patch --]
[-- Type: text/plain, Size: 3975 bytes --]

idle_task_exit() calls mmdrop() from the idle thread, but in PREEMPT_RT all the
allocator locks are sleeping locks - for obvious reasons scheduling away the
idle thread gives some curious problems.

Solve this by pushing the mmdrop() into an RCU callback. However, we can't use
the regular call_rcu() because the CPU is already down and all the local RCU
state has been destroyed.

Therefore create a new call_rcu() variant that enqueues the callback on an
online cpu.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/mm_types.h   |    5 +++++
 include/linux/rcupreempt.h |    2 ++
 kernel/rcupreempt.c        |   29 +++++++++++++++++++++++++++++
 kernel/sched.c             |   13 +++++++++++++
 4 files changed, 49 insertions(+)

Index: linux-2.6.24.7.noarch/include/linux/mm_types.h
===================================================================
--- linux-2.6.24.7.noarch.orig/include/linux/mm_types.h
+++ linux-2.6.24.7.noarch/include/linux/mm_types.h
@@ -10,6 +10,7 @@
 #include <linux/rbtree.h>
 #include <linux/rwsem.h>
 #include <linux/completion.h>
+#include <linux/rcupdate.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -222,6 +223,10 @@ struct mm_struct {
 	/* aio bits */
 	rwlock_t		ioctx_list_lock;
 	struct kioctx		*ioctx_list;
+
+#ifdef CONFIG_PREEMPT_RT
+	struct rcu_head rcu_head;
+#endif
 };
 
 #endif /* _LINUX_MM_TYPES_H */
Index: linux-2.6.24.7.noarch/include/linux/rcupreempt.h
===================================================================
--- linux-2.6.24.7.noarch.orig/include/linux/rcupreempt.h
+++ linux-2.6.24.7.noarch/include/linux/rcupreempt.h
@@ -83,6 +83,8 @@ extern void FASTCALL(call_rcu_classic(st
 		     void (*func)(struct rcu_head *head)));
 extern void FASTCALL(call_rcu_preempt(struct rcu_head *head,
 		     void (*func)(struct rcu_head *head)));
+extern void FASTCALL(call_rcu_preempt_online(struct rcu_head *head,
+		     void (*func)(struct rcu_head *head)));
 extern void __rcu_read_lock(void);
 extern void __rcu_read_unlock(void);
 extern void __synchronize_sched(void);
Index: linux-2.6.24.7.noarch/kernel/rcupreempt.c
===================================================================
--- linux-2.6.24.7.noarch.orig/kernel/rcupreempt.c
+++ linux-2.6.24.7.noarch/kernel/rcupreempt.c
@@ -916,6 +916,35 @@ void fastcall call_rcu_preempt(struct rc
 }
 EXPORT_SYMBOL_GPL(call_rcu_preempt);
 
+void fastcall call_rcu_preempt_online(struct rcu_head *head,
+		void (*func)(struct rcu_head *rcu))
+{
+	struct rcu_data *rdp;
+	unsigned long flags;
+	int cpu;
+
+	head->func = func;
+	head->next = NULL;
+again:
+	cpu = first_cpu(cpu_online_map);
+	rdp = RCU_DATA_CPU(cpu);
+
+	spin_lock_irqsave(&rdp->lock, flags);
+	if (unlikely(!cpu_online(cpu))) {
+		/*
+		 * cpu is removed from the online map before rcu_offline_cpu
+		 * is called.
+		 */
+		spin_unlock_irqrestore(&rdp->lock, flags);
+		goto again;
+	}
+
+	*rdp->nexttail = head;
+	rdp->nexttail = &head->next;
+	spin_unlock_irqrestore(&rdp->lock, flags);
+
+}
+
 /*
  * Check to see if any future RCU-related work will need to be done
  * by the current CPU, even if none need be done immediately, returning
Index: linux-2.6.24.7.noarch/kernel/sched.c
===================================================================
--- linux-2.6.24.7.noarch.orig/kernel/sched.c
+++ linux-2.6.24.7.noarch/kernel/sched.c
@@ -5888,6 +5888,15 @@ void sched_idle_next(void)
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+void mmdrop_rcu(struct rcu_head *head)
+{
+	struct mm_struct *mm = container_of(head, struct mm_struct, rcu_head);
+
+	mmdrop(mm);
+}
+#endif
+
 /*
  * Ensures that the idle task is using init_mm right before its cpu goes
  * offline.
@@ -5900,7 +5909,11 @@ void idle_task_exit(void)
 
 	if (mm != &init_mm)
 		switch_mm(mm, &init_mm, current);
+#ifdef CONFIG_PREEMPT_RT
+	call_rcu_preempt_online(&mm->rcu_head, mmdrop_rcu);
+#else
 	mmdrop(mm);
+#endif
 }
 
 /* called under rq->lock with disabled interrupts */

-- 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH -rt 5/5] cpu-hotplug: cpu_down vs preempt-rt
  2008-06-10 11:13 ` [PATCH -rt 5/5] cpu-hotplug: cpu_down vs preempt-rt Peter Zijlstra
@ 2008-06-10 15:33   ` Paul E. McKenney
  2008-06-10 15:51     ` Peter Zijlstra
  2008-06-11  6:53   ` [PATCH -rt 6/5] " Peter Zijlstra
  1 sibling, 1 reply; 11+ messages in thread
From: Paul E. McKenney @ 2008-06-10 15:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Ingo Molnar, Thomas Gleixner, Steven Rostedt,
	Clark Williams, Gregory Haskins, Gautham R Shenoy, Pekka Enberg,
	Arnaldo Carvalho de Melo

On Tue, Jun 10, 2008 at 01:13:04PM +0200, Peter Zijlstra wrote:
> idle_task_exit() calls mmdrop() from the idle thread, but in PREEMPT_RT all the
> allocator locks are sleeping locks - for obvious reasons scheduling away the
> idle thread gives some curious problems.
> 
> Solve this by pushing the mmdrop() into an RCU callback, however we can't use
> RCU because the CPU is already down and all the local RCU state has been
> destroyed.
> 
> Therefore create a new call_rcu() variant that enqueues the callback on an
> online cpu.

I am a bit nervous about the non-determinism, but on the other hand
CPU online/offline events can only happen so often due to the locking.

So...

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/mm_types.h   |    5 +++++
>  include/linux/rcupreempt.h |    2 ++
>  kernel/rcupreempt.c        |   29 +++++++++++++++++++++++++++++
>  kernel/sched.c             |   13 +++++++++++++
>  4 files changed, 49 insertions(+)
> 
> Index: linux-2.6.24.7.noarch/include/linux/mm_types.h
> ===================================================================
> --- linux-2.6.24.7.noarch.orig/include/linux/mm_types.h
> +++ linux-2.6.24.7.noarch/include/linux/mm_types.h
> @@ -10,6 +10,7 @@
>  #include <linux/rbtree.h>
>  #include <linux/rwsem.h>
>  #include <linux/completion.h>
> +#include <linux/rcupdate.h>
>  #include <asm/page.h>
>  #include <asm/mmu.h>
> 
> @@ -222,6 +223,10 @@ struct mm_struct {
>  	/* aio bits */
>  	rwlock_t		ioctx_list_lock;
>  	struct kioctx		*ioctx_list;
> +
> +#ifdef CONFIG_PREEMPT_RT
> +	struct rcu_head rcu_head;
> +#endif
>  };
> 
>  #endif /* _LINUX_MM_TYPES_H */
> Index: linux-2.6.24.7.noarch/include/linux/rcupreempt.h
> ===================================================================
> --- linux-2.6.24.7.noarch.orig/include/linux/rcupreempt.h
> +++ linux-2.6.24.7.noarch/include/linux/rcupreempt.h
> @@ -83,6 +83,8 @@ extern void FASTCALL(call_rcu_classic(st
>  		     void (*func)(struct rcu_head *head)));
>  extern void FASTCALL(call_rcu_preempt(struct rcu_head *head,
>  		     void (*func)(struct rcu_head *head)));
> +extern void FASTCALL(call_rcu_preempt_online(struct rcu_head *head,
> +		     void (*func)(struct rcu_head *head)));
>  extern void __rcu_read_lock(void);
>  extern void __rcu_read_unlock(void);
>  extern void __synchronize_sched(void);
> Index: linux-2.6.24.7.noarch/kernel/rcupreempt.c
> ===================================================================
> --- linux-2.6.24.7.noarch.orig/kernel/rcupreempt.c
> +++ linux-2.6.24.7.noarch/kernel/rcupreempt.c
> @@ -916,6 +916,35 @@ void fastcall call_rcu_preempt(struct rc
>  }
>  EXPORT_SYMBOL_GPL(call_rcu_preempt);
> 
> +void fastcall call_rcu_preempt_online(struct rcu_head *head,
> +		void (*func)(struct rcu_head *rcu))
> +{
> +	struct rcu_data *rdp;
> +	unsigned long flags;
> +	int cpu;
> +
> +	head->func = func;
> +	head->next = NULL;
> +again:
> +	cpu = first_cpu(cpu_online_map);
> +	rdp = RCU_DATA_CPU(cpu);
> +
> +	spin_lock_irqsave(&rdp->lock, flags);
> +	if (unlikely(!cpu_online(cpu))) {
> +		/*
> +		 * cpu is removed from the online map before rcu_offline_cpu
> +		 * is called.
> +		 */
> +		spin_unlock_irqrestore(&rdp->lock, flags);
> +		goto again;
> +	}
> +
> +	*rdp->nexttail = head;
> +	rdp->nexttail = &head->next;
> +	spin_unlock_irqrestore(&rdp->lock, flags);
> +
> +}
> +
>  /*
>   * Check to see if any future RCU-related work will need to be done
>   * by the current CPU, even if none need be done immediately, returning
> Index: linux-2.6.24.7.noarch/kernel/sched.c
> ===================================================================
> --- linux-2.6.24.7.noarch.orig/kernel/sched.c
> +++ linux-2.6.24.7.noarch/kernel/sched.c
> @@ -5888,6 +5888,15 @@ void sched_idle_next(void)
>  	spin_unlock_irqrestore(&rq->lock, flags);
>  }
> 
> +#ifdef CONFIG_PREEMPT_RT
> +void mmdrop_rcu(struct rcu_head *head)
> +{
> +	struct mm_struct *mm = container_of(head, struct mm_struct, rcu_head);
> +
> +	mmdrop(mm);
> +}
> +#endif
> +
>  /*
>   * Ensures that the idle task is using init_mm right before its cpu goes
>   * offline.
> @@ -5900,7 +5909,11 @@ void idle_task_exit(void)
> 
>  	if (mm != &init_mm)
>  		switch_mm(mm, &init_mm, current);
> +#ifdef CONFIG_PREEMPT_RT
> +	call_rcu_preempt_online(&mm->rcu_head, mmdrop_rcu);
> +#else
>  	mmdrop(mm);
> +#endif
>  }
> 
>  /* called under rq->lock with disabled interrupts */
> 
> -- 
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH -rt 5/5] cpu-hotplug: cpu_down vs preempt-rt
  2008-06-10 15:33   ` Paul E. McKenney
@ 2008-06-10 15:51     ` Peter Zijlstra
  2008-06-10 16:17       ` Paul E. McKenney
  0 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2008-06-10 15:51 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, Ingo Molnar, Thomas Gleixner, Steven Rostedt,
	Clark Williams, Gregory Haskins, Gautham R Shenoy, Pekka Enberg,
	Arnaldo Carvalho de Melo

On Tue, 2008-06-10 at 08:33 -0700, Paul E. McKenney wrote:
> On Tue, Jun 10, 2008 at 01:13:04PM +0200, Peter Zijlstra wrote:
> > idle_task_exit() calls mmdrop() from the idle thread, but in PREEMPT_RT all the
> > allocator locks are sleeping locks - for obvious reasons scheduling away the
> > idle thread gives some curious problems.
> > 
> > Solve this by pushing the mmdrop() into an RCU callback, however we can't use
> > RCU because the CPU is already down and all the local RCU state has been
> > destroyed.
> > 
> > Therefore create a new call_rcu() variant that enqueues the callback on an
> > online cpu.
> 
> I am a bit nervous about the non-determinism, but on the other hand
> CPU online/offline events can only happen so often due to the locking.
> 
> So...
> 
> Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

Thanks!

Yesterday you suggested using rcu_cpu_online_map and fliplock to avoid
the loop here:

> > +void fastcall call_rcu_preempt_online(struct rcu_head *head,
> > +		void (*func)(struct rcu_head *rcu))
> > +{
> > +	struct rcu_data *rdp;
> > +	unsigned long flags;
> > +	int cpu;
> > +
> > +	head->func = func;
> > +	head->next = NULL;
> > +again:
> > +	cpu = first_cpu(cpu_online_map);
> > +	rdp = RCU_DATA_CPU(cpu);
> > +
> > +	spin_lock_irqsave(&rdp->lock, flags);
> > +	if (unlikely(!cpu_online(cpu))) {
> > +		/*
> > +		 * cpu is removed from the online map before rcu_offline_cpu
> > +		 * is called.
> > +		 */
> > +		spin_unlock_irqrestore(&rdp->lock, flags);
> > +		goto again;
> > +	}
> > +
> > +	*rdp->nexttail = head;
> > +	rdp->nexttail = &head->next;
> > +	spin_unlock_irqrestore(&rdp->lock, flags);
> > +
> > +}

But then the code would look like:

  spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
  cpu = first_cpu(rcu_cpu_online_map);
  rdp = RCU_DATA_CPU(cpu);
  spin_lock(&rdp->lock);

creating a nesting between these two locks, where I could not find one.

Do you still prefer I look into changing it into such a form, or are you
sufficiently non-caring that the current code can stand? :-)


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH -rt 5/5] cpu-hotplug: cpu_down vs preempt-rt
  2008-06-10 15:51     ` Peter Zijlstra
@ 2008-06-10 16:17       ` Paul E. McKenney
  0 siblings, 0 replies; 11+ messages in thread
From: Paul E. McKenney @ 2008-06-10 16:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Ingo Molnar, Thomas Gleixner, Steven Rostedt,
	Clark Williams, Gregory Haskins, Gautham R Shenoy, Pekka Enberg,
	Arnaldo Carvalho de Melo

On Tue, Jun 10, 2008 at 05:51:18PM +0200, Peter Zijlstra wrote:
> On Tue, 2008-06-10 at 08:33 -0700, Paul E. McKenney wrote:
> > On Tue, Jun 10, 2008 at 01:13:04PM +0200, Peter Zijlstra wrote:
> > > idle_task_exit() calls mmdrop() from the idle thread, but in PREEMPT_RT all the
> > > allocator locks are sleeping locks - for obvious reasons scheduling away the
> > > idle thread gives some curious problems.
> > > 
> > > Solve this by pushing the mmdrop() into an RCU callback, however we can't use
> > > RCU because the CPU is already down and all the local RCU state has been
> > > destroyed.
> > > 
> > > Therefore create a new call_rcu() variant that enqueues the callback on an
> > > online cpu.
> > 
> > I am a bit nervous about the non-determinism, but on the other hand
> > CPU online/offline events can only happen so often due to the locking.
> > 
> > So...
> > 
> > Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> 
> Thanks!
> 
> Yesterday you suggested using rcu_cpu_online_map and fliplock to avoid
> the loop here:
> 
> > > +void fastcall call_rcu_preempt_online(struct rcu_head *head,
> > > +		void (*func)(struct rcu_head *rcu))
> > > +{
> > > +	struct rcu_data *rdp;
> > > +	unsigned long flags;
> > > +	int cpu;
> > > +
> > > +	head->func = func;
> > > +	head->next = NULL;
> > > +again:
> > > +	cpu = first_cpu(cpu_online_map);
> > > +	rdp = RCU_DATA_CPU(cpu);
> > > +
> > > +	spin_lock_irqsave(&rdp->lock, flags);
> > > +	if (unlikely(!cpu_online(cpu))) {
> > > +		/*
> > > +		 * cpu is removed from the online map before rcu_offline_cpu
> > > +		 * is called.
> > > +		 */
> > > +		spin_unlock_irqrestore(&rdp->lock, flags);
> > > +		goto again;
> > > +	}
> > > +
> > > +	*rdp->nexttail = head;
> > > +	rdp->nexttail = &head->next;
> > > +	spin_unlock_irqrestore(&rdp->lock, flags);
> > > +
> > > +}
> 
> But then the code would look like:
> 
>   spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
>   cpu = first_cpu(rcu_cpu_online_map);
>   rdp = RCU_DATA_CPU(cpu);
>   spin_lock(&rdp->lock);
> 
> creating a nesting between these two locks, where I could not find one.
> 
> Do you still prefer I look into changing it into such a form, or are you
> sufficiently non-caring that the current code can stand? :-)

I am equally bothered by the non-determinism and by the nesting, hence
the current code can stand, at least until it causes a real problem.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH -rt 6/5] cpu-hotplug: cpu_down vs preempt-rt
  2008-06-10 11:13 ` [PATCH -rt 5/5] cpu-hotplug: cpu_down vs preempt-rt Peter Zijlstra
  2008-06-10 15:33   ` Paul E. McKenney
@ 2008-06-11  6:53   ` Peter Zijlstra
  1 sibling, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-06-11  6:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Thomas Gleixner, Steven Rostedt, Clark Williams,
	Gregory Haskins, Paul E. McKenney, Gautham R Shenoy, Pekka Enberg,
	Arnaldo Carvalho de Melo

Because 5/5 has a horrible bug...

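We should only do the final __mmdrop() from RCU, not the whole mmdrop(): the
mm_count decrement must still happen immediately, and only the actual teardown
is deferred to the RCU callback.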

---
Index: linux-2.6.24.7.noarch/include/linux/sched.h
===================================================================
--- linux-2.6.24.7.noarch.orig/include/linux/sched.h
+++ linux-2.6.24.7.noarch/include/linux/sched.h
@@ -1832,6 +1832,7 @@ extern struct mm_struct * mm_alloc(void)
 /* mmdrop drops the mm and the page tables */
 extern void FASTCALL(__mmdrop(struct mm_struct *));
 extern void FASTCALL(__mmdrop_delayed(struct mm_struct *));
+extern void FASTCALL(__mmdrop_rcu(struct mm_struct *));
 
 static inline void mmdrop(struct mm_struct * mm)
 {
@@ -1845,6 +1846,12 @@ static inline void mmdrop_delayed(struct
 		__mmdrop_delayed(mm);
 }
 
+static inline void mmdrop_rcu(struct mm_struct * mm)
+{
+	if (atomic_dec_and_test(&mm->mm_count))
+		__mmdrop_rcu(mm);
+}
+
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
 /* Grab a reference to a task's mm, if it is not already going away */
Index: linux-2.6.24.7.noarch/kernel/fork.c
===================================================================
--- linux-2.6.24.7.noarch.orig/kernel/fork.c
+++ linux-2.6.24.7.noarch/kernel/fork.c
@@ -431,6 +431,18 @@ void fastcall __mmdrop(struct mm_struct 
 	free_mm(mm);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+static void ___mmdrop_rcu(struct rcu_head *head)
+{
+	__mmdrop(container_of(head, struct mm_struct, rcu_head));
+}
+
+void fastcall __mmdrop_rcu(struct mm_struct *mm)
+{
+	call_rcu_preempt_online(&mm->rcu_head, ___mmdrop_rcu);
+}
+#endif
+
 /*
  * Decrement the use count and release all resources for an mm.
  */
Index: linux-2.6.24.7.noarch/kernel/sched.c
===================================================================
--- linux-2.6.24.7.noarch.orig/kernel/sched.c
+++ linux-2.6.24.7.noarch/kernel/sched.c
@@ -5888,15 +5888,6 @@ void sched_idle_next(void)
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
-#ifdef CONFIG_PREEMPT_RT
-void mmdrop_rcu(struct rcu_head *head)
-{
-	struct mm_struct *mm = container_of(head, struct mm_struct, rcu_head);
-
-	mmdrop(mm);
-}
-#endif
-
 /*
  * Ensures that the idle task is using init_mm right before its cpu goes
  * offline.
@@ -5910,7 +5901,7 @@ void idle_task_exit(void)
 	if (mm != &init_mm)
 		switch_mm(mm, &init_mm, current);
 #ifdef CONFIG_PREEMPT_RT
-	call_rcu_preempt_online(&mm->rcu_head, mmdrop_rcu);
+	mmdrop_rcu(mm);
 #else
 	mmdrop(mm);
 #endif



^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2008-06-11  6:54 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-06-10 11:12 [PATCH -rt 0/5] hotplug fixes Peter Zijlstra
2008-06-10 11:13 ` [PATCH -rt 1/5] cpu-hotplug: vs slab Peter Zijlstra
2008-06-10 11:13 ` [PATCH -rt 2/5] cpu-hotplug: vs page_alloc Peter Zijlstra
2008-06-10 11:13 ` [PATCH -rt 3/5] cpu-hotplug: cpu_up vs preempt-rt Peter Zijlstra
2008-06-10 11:13 ` [PATCH -rt 4/5] rcu: backport RCU cpu hotplug support Peter Zijlstra
2008-06-10 15:15   ` Paul E. McKenney
2008-06-10 11:13 ` [PATCH -rt 5/5] cpu-hotplug: cpu_down vs preempt-rt Peter Zijlstra
2008-06-10 15:33   ` Paul E. McKenney
2008-06-10 15:51     ` Peter Zijlstra
2008-06-10 16:17       ` Paul E. McKenney
2008-06-11  6:53   ` [PATCH -rt 6/5] " Peter Zijlstra

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.