public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER
@ 2007-11-09 18:10 Paul E. McKenney
  2007-11-09 20:52 ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 15+ messages in thread
From: Paul E. McKenney @ 2007-11-09 18:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: tony, paulus, benh, dino, tytso, dvhltc, niv, antonb, rostedt

Hello again!

A few random patches that permit POWER to pass kernbench on -rt.
Many of these have more focus on expediency than care for correctness,
so might best be thought of as workarounds than as complete solutions.
There are still issues not addressed by this patch, including:

o	kmem_cache_alloc() from non-preemptible context during
	bootup (xics_startup() building the irq_radix_revmap()).

o	unmap_vmas() freeing pages with preemption disabled.
	Might be able to address this by linking the pages together,
	then freeing them en masse after preemption has been re-enabled,
	but there is likely a better approach.

This patch handles the batched TLB-flush mechanism more cleanly (many
thanks to Ben Herrenschmidt for his guidance here).  It is also restricted
to powerpc arch-specific files along with one open-firmware file.

Thoughts?

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---

 arch/powerpc/kernel/process.c        |   18 ++++++++++++++++++
 arch/powerpc/kernel/prom.c           |    2 +-
 arch/powerpc/mm/fault.c              |    3 +++
 arch/powerpc/mm/tlb_64.c             |    5 ++++-
 arch/powerpc/platforms/pseries/eeh.c |    2 +-
 drivers/of/base.c                    |    2 +-
 include/asm-powerpc/tlb.h            |    5 ++++-
 include/asm-powerpc/tlbflush.h       |   15 ++++++++++-----
 8 files changed, 42 insertions(+), 10 deletions(-)

diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/kernel/process.c linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/process.c
--- linux-2.6.23.1-rt4/arch/powerpc/kernel/process.c	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/process.c	2007-11-08 20:33:59.000000000 -0800
@@ -245,6 +245,8 @@ struct task_struct *__switch_to(struct t
 	struct thread_struct *new_thread, *old_thread;
 	unsigned long flags;
 	struct task_struct *last;
+	struct ppc64_tlb_batch *batch;
+	int hadbatch;
 
 #ifdef CONFIG_SMP
 	/* avoid complexity of lazy save/restore of fpu
@@ -325,6 +327,16 @@ struct task_struct *__switch_to(struct t
 	}
 #endif
 
+	batch = &get_cpu_var(ppc64_tlb_batch);
+	if (batch->active) {
+		hadbatch = 1;
+		if (batch->index) {
+			__flush_tlb_pending(batch);
+		}
+		batch->active = 0;
+	}
+	put_cpu_var(ppc64_tlb_batch);
+
 	local_irq_save(flags);
 
 	account_system_vtime(current);
@@ -335,6 +347,12 @@ struct task_struct *__switch_to(struct t
 
 	local_irq_restore(flags);
 
+	if (hadbatch) {
+		batch = &get_cpu_var(ppc64_tlb_batch);
+		batch->active = 1;
+		put_cpu_var(ppc64_tlb_batch);
+	}
+
 	return last;
 }
 
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/kernel/prom.c linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/prom.c
--- linux-2.6.23.1-rt4/arch/powerpc/kernel/prom.c	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/prom.c	2007-10-28 13:37:23.000000000 -0700
@@ -80,7 +80,7 @@ struct boot_param_header *initial_boot_p
 
 extern struct device_node *allnodes;	/* temporary while merging */
 
-extern rwlock_t devtree_lock;	/* temporary while merging */
+extern raw_rwlock_t devtree_lock;	/* temporary while merging */
 
 /* export that to outside world */
 struct device_node *of_chosen;
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/mm/fault.c linux-2.6.23.1-rt4-fix/arch/powerpc/mm/fault.c
--- linux-2.6.23.1-rt4/arch/powerpc/mm/fault.c	2007-10-27 22:20:57.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/arch/powerpc/mm/fault.c	2007-10-28 13:49:07.000000000 -0700
@@ -301,6 +301,7 @@ good_area:
 		if (get_pteptr(mm, address, &ptep, &pmdp)) {
 			spinlock_t *ptl = pte_lockptr(mm, pmdp);
 			spin_lock(ptl);
+			preempt_disable();
 			if (pte_present(*ptep)) {
 				struct page *page = pte_page(*ptep);
 
@@ -310,10 +311,12 @@ good_area:
 				}
 				pte_update(ptep, 0, _PAGE_HWEXEC);
 				_tlbie(address);
+				preempt_enable();
 				pte_unmap_unlock(ptep, ptl);
 				up_read(&mm->mmap_sem);
 				return 0;
 			}
+			preempt_enable();
 			pte_unmap_unlock(ptep, ptl);
 		}
 #endif
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/mm/tlb_64.c linux-2.6.23.1-rt4-fix/arch/powerpc/mm/tlb_64.c
--- linux-2.6.23.1-rt4/arch/powerpc/mm/tlb_64.c	2007-10-27 22:20:57.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/arch/powerpc/mm/tlb_64.c	2007-11-08 16:49:04.000000000 -0800
@@ -133,7 +133,7 @@ void pgtable_free_tlb(struct mmu_gather 
 void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, unsigned long pte, int huge)
 {
-	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
 	unsigned long vsid, vaddr;
 	unsigned int psize;
 	real_pte_t rpte;
@@ -180,6 +180,7 @@ void hpte_need_flush(struct mm_struct *m
 	 */
 	if (!batch->active) {
 		flush_hash_page(vaddr, rpte, psize, 0);
+		put_cpu_var(ppc64_tlb_batch);
 		return;
 	}
 
@@ -212,12 +213,14 @@ void hpte_need_flush(struct mm_struct *m
 	 */
 	if (machine_is(celleb)) {
 		__flush_tlb_pending(batch);
+		put_cpu_var(ppc64_tlb_batch);
 		return;
 	}
 #endif /* CONFIG_PREEMPT_RT */
 
 	if (i >= PPC64_TLB_BATCH_NR)
 		__flush_tlb_pending(batch);
+	put_cpu_var(ppc64_tlb_batch);
 }
 
 /*
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/platforms/pseries/eeh.c linux-2.6.23.1-rt4-fix/arch/powerpc/platforms/pseries/eeh.c
--- linux-2.6.23.1-rt4/arch/powerpc/platforms/pseries/eeh.c	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/arch/powerpc/platforms/pseries/eeh.c	2007-10-28 15:43:54.000000000 -0700
@@ -97,7 +97,7 @@ int eeh_subsystem_enabled;
 EXPORT_SYMBOL(eeh_subsystem_enabled);
 
 /* Lock to avoid races due to multiple reports of an error */
-static DEFINE_SPINLOCK(confirm_error_lock);
+static DEFINE_RAW_SPINLOCK(confirm_error_lock);
 
 /* Buffer for reporting slot-error-detail rtas calls. Its here
  * in BSS, and not dynamically alloced, so that it ends up in
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/drivers/of/base.c linux-2.6.23.1-rt4-fix/drivers/of/base.c
--- linux-2.6.23.1-rt4/drivers/of/base.c	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/drivers/of/base.c	2007-10-28 13:38:36.000000000 -0700
@@ -25,7 +25,7 @@ struct device_node *allnodes;
 /* use when traversing tree through the allnext, child, sibling,
  * or parent members of struct device_node.
  */
-DEFINE_RWLOCK(devtree_lock);
+DEFINE_RAW_RWLOCK(devtree_lock);
 
 int of_n_addr_cells(struct device_node *np)
 {
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/include/asm-powerpc/tlbflush.h linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlbflush.h
--- linux-2.6.23.1-rt4/include/asm-powerpc/tlbflush.h	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlbflush.h	2007-11-08 17:11:18.000000000 -0800
@@ -109,18 +109,23 @@ extern void hpte_need_flush(struct mm_st
 
 static inline void arch_enter_lazy_mmu_mode(void)
 {
-	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
 
 	batch->active = 1;
+	put_cpu_var(ppc64_tlb_batch);
 }
 
 static inline void arch_leave_lazy_mmu_mode(void)
 {
-	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
 
-	if (batch->index)
-		__flush_tlb_pending(batch);
-	batch->active = 0;
+	if (batch->active) {
+		if (batch->index) {
+			__flush_tlb_pending(batch);
+		}
+		batch->active = 0;
+	}
+	put_cpu_var(ppc64_tlb_batch);
 }
 
 #define arch_flush_lazy_mmu_mode()      do {} while (0)
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/include/asm-powerpc/tlb.h linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlb.h
--- linux-2.6.23.1-rt4/include/asm-powerpc/tlb.h	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlb.h	2007-10-28 11:36:05.000000000 -0700
@@ -44,8 +44,11 @@ static inline void tlb_flush(struct mmu_
 	 * pages are going to be freed and we really don't want to have a CPU
 	 * access a freed page because it has a stale TLB
 	 */
-	if (tlbbatch->index)
+	if (tlbbatch->index) {
+		preempt_disable();
 		__flush_tlb_pending(tlbbatch);
+		preempt_enable();
+	}
 
 	pte_free_finish();
 }

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER
  2007-11-09 18:10 [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER Paul E. McKenney
@ 2007-11-09 20:52 ` Benjamin Herrenschmidt
  2007-11-09 22:07   ` Paul E. McKenney
  2007-11-10  1:18   ` Nick Piggin
  0 siblings, 2 replies; 15+ messages in thread
From: Benjamin Herrenschmidt @ 2007-11-09 20:52 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, tony, paulus, dino, tytso, dvhltc, niv, antonb,
	rostedt

> diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/kernel/process.c linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/process.c
> --- linux-2.6.23.1-rt4/arch/powerpc/kernel/process.c	2007-10-12 09:43:44.000000000 -0700
> +++ linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/process.c	2007-11-08 20:33:59.000000000 -0800
> @@ -245,6 +245,8 @@ struct task_struct *__switch_to(struct t
>  	struct thread_struct *new_thread, *old_thread;
>  	unsigned long flags;
>  	struct task_struct *last;
> +	struct ppc64_tlb_batch *batch;
> +	int hadbatch;
>  
>  #ifdef CONFIG_SMP
>  	/* avoid complexity of lazy save/restore of fpu
> @@ -325,6 +327,16 @@ struct task_struct *__switch_to(struct t
>  	}
>  #endif
>  
> +	batch = &get_cpu_var(ppc64_tlb_batch);
> +	if (batch->active) {
> +		hadbatch = 1;
> +		if (batch->index) {
> +			__flush_tlb_pending(batch);
> +		}
> +		batch->active = 0;
> +	}
> +	put_cpu_var(ppc64_tlb_batch);
> +
>  	local_irq_save(flags);
>  
>  	account_system_vtime(current);
> @@ -335,6 +347,12 @@ struct task_struct *__switch_to(struct t
>  
>  	local_irq_restore(flags);
>  
> +	if (hadbatch) {
> +		batch = &get_cpu_var(ppc64_tlb_batch);
> +		batch->active = 1;
> +		put_cpu_var(ppc64_tlb_batch);
> +	}
> +
>  	return last;
>  }

I doubt we can schedule within __switch_to() (can somebody confirm
this ?), in which case, you can just use __get_cpu_var() and avoid
the put, thus saving a handful of cycles in the code above.

>  /* export that to outside world */
>  struct device_node *of_chosen;
> diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/mm/fault.c linux-2.6.23.1-rt4-fix/arch/powerpc/mm/fault.c
> --- linux-2.6.23.1-rt4/arch/powerpc/mm/fault.c	2007-10-27 22:20:57.000000000 -0700
> +++ linux-2.6.23.1-rt4-fix/arch/powerpc/mm/fault.c	2007-10-28 13:49:07.000000000 -0700
> @@ -301,6 +301,7 @@ good_area:
>  		if (get_pteptr(mm, address, &ptep, &pmdp)) {
>  			spinlock_t *ptl = pte_lockptr(mm, pmdp);
>  			spin_lock(ptl);
> +			preempt_disable();
>  			if (pte_present(*ptep)) {
>  				struct page *page = pte_page(*ptep);
>  
> @@ -310,10 +311,12 @@ good_area:
>  				}
>  				pte_update(ptep, 0, _PAGE_HWEXEC);
>  				_tlbie(address);
> +				preempt_enable();
>  				pte_unmap_unlock(ptep, ptl);
>  				up_read(&mm->mmap_sem);
>  				return 0;
>  			}
> +			preempt_enable();
>  			pte_unmap_unlock(ptep, ptl);
>  		}
>  #endif

 
The patch above will have to be rebased following a fix I did that makes
_tlbie() takes a context ID argument. I don't actually think
preempt_disable/enable is necessary here. I don't think it matters if we
preempt here, but I suppose better safe than sorry.... (this is 44x
code).

Overall, looks fine !

Cheers,
Ben.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER
  2007-11-09 20:52 ` Benjamin Herrenschmidt
@ 2007-11-09 22:07   ` Paul E. McKenney
  2007-11-11  3:59     ` Benjamin Herrenschmidt
  2007-11-10  1:18   ` Nick Piggin
  1 sibling, 1 reply; 15+ messages in thread
From: Paul E. McKenney @ 2007-11-09 22:07 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linux-kernel, tony, paulus, dino, tytso, dvhltc, niv, antonb,
	rostedt

On Sat, Nov 10, 2007 at 07:52:04AM +1100, Benjamin Herrenschmidt wrote:
> > diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/kernel/process.c linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/process.c
> > --- linux-2.6.23.1-rt4/arch/powerpc/kernel/process.c	2007-10-12 09:43:44.000000000 -0700
> > +++ linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/process.c	2007-11-08 20:33:59.000000000 -0800
> > @@ -245,6 +245,8 @@ struct task_struct *__switch_to(struct t
> >  	struct thread_struct *new_thread, *old_thread;
> >  	unsigned long flags;
> >  	struct task_struct *last;
> > +	struct ppc64_tlb_batch *batch;
> > +	int hadbatch;
> >  
> >  #ifdef CONFIG_SMP
> >  	/* avoid complexity of lazy save/restore of fpu
> > @@ -325,6 +327,16 @@ struct task_struct *__switch_to(struct t
> >  	}
> >  #endif
> >  
> > +	batch = &get_cpu_var(ppc64_tlb_batch);
> > +	if (batch->active) {
> > +		hadbatch = 1;
> > +		if (batch->index) {
> > +			__flush_tlb_pending(batch);
> > +		}
> > +		batch->active = 0;
> > +	}
> > +	put_cpu_var(ppc64_tlb_batch);
> > +
> >  	local_irq_save(flags);
> >  
> >  	account_system_vtime(current);
> > @@ -335,6 +347,12 @@ struct task_struct *__switch_to(struct t
> >  
> >  	local_irq_restore(flags);
> >  
> > +	if (hadbatch) {
> > +		batch = &get_cpu_var(ppc64_tlb_batch);
> > +		batch->active = 1;
> > +		put_cpu_var(ppc64_tlb_batch);
> > +	}
> > +
> >  	return last;
> >  }
> 
> I doubt we can schedule within __switch_to() (can somebody confirm
> this ?), in which case, you can just use __get_cpu_var() and avoid
> the put, thus saving a handful of cycles in the code above.

Thank you for looking this over!

I tried this, and it seemed to work.

> >  /* export that to outside world */
> >  struct device_node *of_chosen;
> > diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/mm/fault.c linux-2.6.23.1-rt4-fix/arch/powerpc/mm/fault.c
> > --- linux-2.6.23.1-rt4/arch/powerpc/mm/fault.c	2007-10-27 22:20:57.000000000 -0700
> > +++ linux-2.6.23.1-rt4-fix/arch/powerpc/mm/fault.c	2007-10-28 13:49:07.000000000 -0700
> > @@ -301,6 +301,7 @@ good_area:
> >  		if (get_pteptr(mm, address, &ptep, &pmdp)) {
> >  			spinlock_t *ptl = pte_lockptr(mm, pmdp);
> >  			spin_lock(ptl);
> > +			preempt_disable();
> >  			if (pte_present(*ptep)) {
> >  				struct page *page = pte_page(*ptep);
> >  
> > @@ -310,10 +311,12 @@ good_area:
> >  				}
> >  				pte_update(ptep, 0, _PAGE_HWEXEC);
> >  				_tlbie(address);
> > +				preempt_enable();
> >  				pte_unmap_unlock(ptep, ptl);
> >  				up_read(&mm->mmap_sem);
> >  				return 0;
> >  			}
> > +			preempt_enable();
> >  			pte_unmap_unlock(ptep, ptl);
> >  		}
> >  #endif
> 
> 
> The patch above will have to be rebased following a fix I did that makes
> _tlbie() takes a context ID argument. I don't actually think
> preempt_disable/enable is necessary here. I don't think it matters if we
> preempt here, but I suppose better safe than sorry.... (this is 44x
> code).

Removed this as well, also seemed to work.  Please note, however, that
this is just running kernbench.  But this did seem to get rid of some
of the warnings as well.  ;-)  Now only have the xics_startup() warning.

> Overall, looks fine !

I bet that there are more gotchas lurking in there somewhere, but in
the meantime here is the updated patch.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---

 arch/powerpc/kernel/process.c        |   16 ++++++++++++++++
 arch/powerpc/kernel/prom.c           |    2 +-
 arch/powerpc/mm/tlb_64.c             |    5 ++++-
 arch/powerpc/platforms/pseries/eeh.c |    2 +-
 drivers/of/base.c                    |    2 +-
 include/asm-powerpc/tlb.h            |    5 ++++-
 include/asm-powerpc/tlbflush.h       |   15 ++++++++++-----
 7 files changed, 37 insertions(+), 10 deletions(-)

diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/kernel/process.c linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/process.c
--- linux-2.6.23.1-rt4/arch/powerpc/kernel/process.c	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/process.c	2007-11-09 13:24:46.000000000 -0800
@@ -245,6 +245,8 @@ struct task_struct *__switch_to(struct t
 	struct thread_struct *new_thread, *old_thread;
 	unsigned long flags;
 	struct task_struct *last;
+	struct ppc64_tlb_batch *batch;
+	int hadbatch;
 
 #ifdef CONFIG_SMP
 	/* avoid complexity of lazy save/restore of fpu
@@ -325,6 +327,15 @@ struct task_struct *__switch_to(struct t
 	}
 #endif
 
+	batch = &__get_cpu_var(ppc64_tlb_batch);
+	if (batch->active) {
+		hadbatch = 1;
+		if (batch->index) {
+			__flush_tlb_pending(batch);
+		}
+		batch->active = 0;
+	}
+
 	local_irq_save(flags);
 
 	account_system_vtime(current);
@@ -335,6 +346,11 @@ struct task_struct *__switch_to(struct t
 
 	local_irq_restore(flags);
 
+	if (hadbatch) {
+		batch = &__get_cpu_var(ppc64_tlb_batch);
+		batch->active = 1;
+	}
+
 	return last;
 }
 
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/kernel/prom.c linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/prom.c
--- linux-2.6.23.1-rt4/arch/powerpc/kernel/prom.c	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/prom.c	2007-10-28 13:37:23.000000000 -0700
@@ -80,7 +80,7 @@ struct boot_param_header *initial_boot_p
 
 extern struct device_node *allnodes;	/* temporary while merging */
 
-extern rwlock_t devtree_lock;	/* temporary while merging */
+extern raw_rwlock_t devtree_lock;	/* temporary while merging */
 
 /* export that to outside world */
 struct device_node *of_chosen;
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/mm/tlb_64.c linux-2.6.23.1-rt4-fix/arch/powerpc/mm/tlb_64.c
--- linux-2.6.23.1-rt4/arch/powerpc/mm/tlb_64.c	2007-10-27 22:20:57.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/arch/powerpc/mm/tlb_64.c	2007-11-08 16:49:04.000000000 -0800
@@ -133,7 +133,7 @@ void pgtable_free_tlb(struct mmu_gather 
 void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, unsigned long pte, int huge)
 {
-	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
 	unsigned long vsid, vaddr;
 	unsigned int psize;
 	real_pte_t rpte;
@@ -180,6 +180,7 @@ void hpte_need_flush(struct mm_struct *m
 	 */
 	if (!batch->active) {
 		flush_hash_page(vaddr, rpte, psize, 0);
+		put_cpu_var(ppc64_tlb_batch);
 		return;
 	}
 
@@ -212,12 +213,14 @@ void hpte_need_flush(struct mm_struct *m
 	 */
 	if (machine_is(celleb)) {
 		__flush_tlb_pending(batch);
+		put_cpu_var(ppc64_tlb_batch);
 		return;
 	}
 #endif /* CONFIG_PREEMPT_RT */
 
 	if (i >= PPC64_TLB_BATCH_NR)
 		__flush_tlb_pending(batch);
+	put_cpu_var(ppc64_tlb_batch);
 }
 
 /*
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/platforms/pseries/eeh.c linux-2.6.23.1-rt4-fix/arch/powerpc/platforms/pseries/eeh.c
--- linux-2.6.23.1-rt4/arch/powerpc/platforms/pseries/eeh.c	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/arch/powerpc/platforms/pseries/eeh.c	2007-10-28 15:43:54.000000000 -0700
@@ -97,7 +97,7 @@ int eeh_subsystem_enabled;
 EXPORT_SYMBOL(eeh_subsystem_enabled);
 
 /* Lock to avoid races due to multiple reports of an error */
-static DEFINE_SPINLOCK(confirm_error_lock);
+static DEFINE_RAW_SPINLOCK(confirm_error_lock);
 
 /* Buffer for reporting slot-error-detail rtas calls. Its here
  * in BSS, and not dynamically alloced, so that it ends up in
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/drivers/of/base.c linux-2.6.23.1-rt4-fix/drivers/of/base.c
--- linux-2.6.23.1-rt4/drivers/of/base.c	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/drivers/of/base.c	2007-10-28 13:38:36.000000000 -0700
@@ -25,7 +25,7 @@ struct device_node *allnodes;
 /* use when traversing tree through the allnext, child, sibling,
  * or parent members of struct device_node.
  */
-DEFINE_RWLOCK(devtree_lock);
+DEFINE_RAW_RWLOCK(devtree_lock);
 
 int of_n_addr_cells(struct device_node *np)
 {
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/include/asm-powerpc/tlbflush.h linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlbflush.h
--- linux-2.6.23.1-rt4/include/asm-powerpc/tlbflush.h	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlbflush.h	2007-11-08 17:11:18.000000000 -0800
@@ -109,18 +109,23 @@ extern void hpte_need_flush(struct mm_st
 
 static inline void arch_enter_lazy_mmu_mode(void)
 {
-	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
 
 	batch->active = 1;
+	put_cpu_var(ppc64_tlb_batch);
 }
 
 static inline void arch_leave_lazy_mmu_mode(void)
 {
-	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
 
-	if (batch->index)
-		__flush_tlb_pending(batch);
-	batch->active = 0;
+	if (batch->active) {
+		if (batch->index) {
+			__flush_tlb_pending(batch);
+		}
+		batch->active = 0;
+	}
+	put_cpu_var(ppc64_tlb_batch);
 }
 
 #define arch_flush_lazy_mmu_mode()      do {} while (0)
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/include/asm-powerpc/tlb.h linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlb.h
--- linux-2.6.23.1-rt4/include/asm-powerpc/tlb.h	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlb.h	2007-10-28 11:36:05.000000000 -0700
@@ -44,8 +44,11 @@ static inline void tlb_flush(struct mmu_
 	 * pages are going to be freed and we really don't want to have a CPU
 	 * access a freed page because it has a stale TLB
 	 */
-	if (tlbbatch->index)
+	if (tlbbatch->index) {
+		preempt_disable();
 		__flush_tlb_pending(tlbbatch);
+		preempt_enable();
+	}
 
 	pte_free_finish();
 }

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER
  2007-11-09 20:52 ` Benjamin Herrenschmidt
  2007-11-09 22:07   ` Paul E. McKenney
@ 2007-11-10  1:18   ` Nick Piggin
  1 sibling, 0 replies; 15+ messages in thread
From: Nick Piggin @ 2007-11-10  1:18 UTC (permalink / raw)
  To: benh
  Cc: paulmck, linux-kernel, tony, paulus, dino, tytso, dvhltc, niv,
	antonb, rostedt

On Saturday 10 November 2007 07:52, Benjamin Herrenschmidt wrote:
> > diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/kernel/process.c
> > linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/process.c ---
> > linux-2.6.23.1-rt4/arch/powerpc/kernel/process.c	2007-10-12
> > 09:43:44.000000000 -0700 +++
> > linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/process.c	2007-11-08
> > 20:33:59.000000000 -0800 @@ -245,6 +245,8 @@ struct task_struct
> > *__switch_to(struct t
> >  	struct thread_struct *new_thread, *old_thread;
> >  	unsigned long flags;
> >  	struct task_struct *last;
> > +	struct ppc64_tlb_batch *batch;
> > +	int hadbatch;
> >
> >  #ifdef CONFIG_SMP
> >  	/* avoid complexity of lazy save/restore of fpu
> > @@ -325,6 +327,16 @@ struct task_struct *__switch_to(struct t
> >  	}
> >  #endif
> >
> > +	batch = &get_cpu_var(ppc64_tlb_batch);
> > +	if (batch->active) {
> > +		hadbatch = 1;
> > +		if (batch->index) {
> > +			__flush_tlb_pending(batch);
> > +		}
> > +		batch->active = 0;
> > +	}
> > +	put_cpu_var(ppc64_tlb_batch);
> > +
> >  	local_irq_save(flags);
> >
> >  	account_system_vtime(current);
> > @@ -335,6 +347,12 @@ struct task_struct *__switch_to(struct t
> >
> >  	local_irq_restore(flags);
> >
> > +	if (hadbatch) {
> > +		batch = &get_cpu_var(ppc64_tlb_batch);
> > +		batch->active = 1;
> > +		put_cpu_var(ppc64_tlb_batch);
> > +	}
> > +
> >  	return last;
> >  }
>
> I doubt we can schedule within __switch_to() (can somebody confirm
> this ?), in which case, you can just use __get_cpu_var() and avoid
> the put, thus saving a handful of cycles in the code above.

Preempt is always turned off over switch_to() call.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER
  2007-11-09 22:07   ` Paul E. McKenney
@ 2007-11-11  3:59     ` Benjamin Herrenschmidt
  2007-11-11 14:45       ` Steven Rostedt
  0 siblings, 1 reply; 15+ messages in thread
From: Benjamin Herrenschmidt @ 2007-11-11  3:59 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, tony, paulus, dino, tytso, dvhltc, niv, antonb,
	rostedt


> Removed this as well, also seemed to work.  Please note, however, that
> this is just running kernbench.  But this did seem to get rid of some
> of the warnings as well.  ;-)  Now only have the xics_startup() warning.
> 
> > Overall, looks fine !
> 
> I bet that there are more gotchas lurking in there somewhere, but in
> the meantime here is the updated patch.
> 
> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> ---

Well, I suppose the patch could go in, maybe with some ifdef's around
the bits in _switch_to, there's little point in doing that on non-rt
kernels.

Ben.

>  arch/powerpc/kernel/process.c        |   16 ++++++++++++++++
>  arch/powerpc/kernel/prom.c           |    2 +-
>  arch/powerpc/mm/tlb_64.c             |    5 ++++-
>  arch/powerpc/platforms/pseries/eeh.c |    2 +-
>  drivers/of/base.c                    |    2 +-
>  include/asm-powerpc/tlb.h            |    5 ++++-
>  include/asm-powerpc/tlbflush.h       |   15 ++++++++++-----
>  7 files changed, 37 insertions(+), 10 deletions(-)
> 
> diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/kernel/process.c linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/process.c
> --- linux-2.6.23.1-rt4/arch/powerpc/kernel/process.c	2007-10-12 09:43:44.000000000 -0700
> +++ linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/process.c	2007-11-09 13:24:46.000000000 -0800
> @@ -245,6 +245,8 @@ struct task_struct *__switch_to(struct t
>  	struct thread_struct *new_thread, *old_thread;
>  	unsigned long flags;
>  	struct task_struct *last;
> +	struct ppc64_tlb_batch *batch;
> +	int hadbatch;
>  
>  #ifdef CONFIG_SMP
>  	/* avoid complexity of lazy save/restore of fpu
> @@ -325,6 +327,15 @@ struct task_struct *__switch_to(struct t
>  	}
>  #endif
>  
> +	batch = &__get_cpu_var(ppc64_tlb_batch);
> +	if (batch->active) {
> +		hadbatch = 1;
> +		if (batch->index) {
> +			__flush_tlb_pending(batch);
> +		}
> +		batch->active = 0;
> +	}
> +
>  	local_irq_save(flags);
>  
>  	account_system_vtime(current);
> @@ -335,6 +346,11 @@ struct task_struct *__switch_to(struct t
>  
>  	local_irq_restore(flags);
>  
> +	if (hadbatch) {
> +		batch = &__get_cpu_var(ppc64_tlb_batch);
> +		batch->active = 1;
> +	}
> +
>  	return last;
>  }
>  
> diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/kernel/prom.c linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/prom.c
> --- linux-2.6.23.1-rt4/arch/powerpc/kernel/prom.c	2007-10-12 09:43:44.000000000 -0700
> +++ linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/prom.c	2007-10-28 13:37:23.000000000 -0700
> @@ -80,7 +80,7 @@ struct boot_param_header *initial_boot_p
>  
>  extern struct device_node *allnodes;	/* temporary while merging */
>  
> -extern rwlock_t devtree_lock;	/* temporary while merging */
> +extern raw_rwlock_t devtree_lock;	/* temporary while merging */
>  
>  /* export that to outside world */
>  struct device_node *of_chosen;
> diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/mm/tlb_64.c linux-2.6.23.1-rt4-fix/arch/powerpc/mm/tlb_64.c
> --- linux-2.6.23.1-rt4/arch/powerpc/mm/tlb_64.c	2007-10-27 22:20:57.000000000 -0700
> +++ linux-2.6.23.1-rt4-fix/arch/powerpc/mm/tlb_64.c	2007-11-08 16:49:04.000000000 -0800
> @@ -133,7 +133,7 @@ void pgtable_free_tlb(struct mmu_gather 
>  void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
>  		     pte_t *ptep, unsigned long pte, int huge)
>  {
> -	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
> +	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
>  	unsigned long vsid, vaddr;
>  	unsigned int psize;
>  	real_pte_t rpte;
> @@ -180,6 +180,7 @@ void hpte_need_flush(struct mm_struct *m
>  	 */
>  	if (!batch->active) {
>  		flush_hash_page(vaddr, rpte, psize, 0);
> +		put_cpu_var(ppc64_tlb_batch);
>  		return;
>  	}
>  
> @@ -212,12 +213,14 @@ void hpte_need_flush(struct mm_struct *m
>  	 */
>  	if (machine_is(celleb)) {
>  		__flush_tlb_pending(batch);
> +		put_cpu_var(ppc64_tlb_batch);
>  		return;
>  	}
>  #endif /* CONFIG_PREEMPT_RT */
>  
>  	if (i >= PPC64_TLB_BATCH_NR)
>  		__flush_tlb_pending(batch);
> +	put_cpu_var(ppc64_tlb_batch);
>  }
>  
>  /*
> diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/platforms/pseries/eeh.c linux-2.6.23.1-rt4-fix/arch/powerpc/platforms/pseries/eeh.c
> --- linux-2.6.23.1-rt4/arch/powerpc/platforms/pseries/eeh.c	2007-10-12 09:43:44.000000000 -0700
> +++ linux-2.6.23.1-rt4-fix/arch/powerpc/platforms/pseries/eeh.c	2007-10-28 15:43:54.000000000 -0700
> @@ -97,7 +97,7 @@ int eeh_subsystem_enabled;
>  EXPORT_SYMBOL(eeh_subsystem_enabled);
>  
>  /* Lock to avoid races due to multiple reports of an error */
> -static DEFINE_SPINLOCK(confirm_error_lock);
> +static DEFINE_RAW_SPINLOCK(confirm_error_lock);
>  
>  /* Buffer for reporting slot-error-detail rtas calls. Its here
>   * in BSS, and not dynamically alloced, so that it ends up in
> diff -urpNa -X dontdiff linux-2.6.23.1-rt4/drivers/of/base.c linux-2.6.23.1-rt4-fix/drivers/of/base.c
> --- linux-2.6.23.1-rt4/drivers/of/base.c	2007-10-12 09:43:44.000000000 -0700
> +++ linux-2.6.23.1-rt4-fix/drivers/of/base.c	2007-10-28 13:38:36.000000000 -0700
> @@ -25,7 +25,7 @@ struct device_node *allnodes;
>  /* use when traversing tree through the allnext, child, sibling,
>   * or parent members of struct device_node.
>   */
> -DEFINE_RWLOCK(devtree_lock);
> +DEFINE_RAW_RWLOCK(devtree_lock);
>  
>  int of_n_addr_cells(struct device_node *np)
>  {
> diff -urpNa -X dontdiff linux-2.6.23.1-rt4/include/asm-powerpc/tlbflush.h linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlbflush.h
> --- linux-2.6.23.1-rt4/include/asm-powerpc/tlbflush.h	2007-10-12 09:43:44.000000000 -0700
> +++ linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlbflush.h	2007-11-08 17:11:18.000000000 -0800
> @@ -109,18 +109,23 @@ extern void hpte_need_flush(struct mm_st
>  
>  static inline void arch_enter_lazy_mmu_mode(void)
>  {
> -	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
> +	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
>  
>  	batch->active = 1;
> +	put_cpu_var(ppc64_tlb_batch);
>  }
>  
>  static inline void arch_leave_lazy_mmu_mode(void)
>  {
> -	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
> +	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
>  
> -	if (batch->index)
> -		__flush_tlb_pending(batch);
> -	batch->active = 0;
> +	if (batch->active) {
> +		if (batch->index) {
> +			__flush_tlb_pending(batch);
> +		}
> +		batch->active = 0;
> +	}
> +	put_cpu_var(ppc64_tlb_batch);
>  }
>  
>  #define arch_flush_lazy_mmu_mode()      do {} while (0)
> diff -urpNa -X dontdiff linux-2.6.23.1-rt4/include/asm-powerpc/tlb.h linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlb.h
> --- linux-2.6.23.1-rt4/include/asm-powerpc/tlb.h	2007-10-12 09:43:44.000000000 -0700
> +++ linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlb.h	2007-10-28 11:36:05.000000000 -0700
> @@ -44,8 +44,11 @@ static inline void tlb_flush(struct mmu_
>  	 * pages are going to be freed and we really don't want to have a CPU
>  	 * access a freed page because it has a stale TLB
>  	 */
> -	if (tlbbatch->index)
> +	if (tlbbatch->index) {
> +		preempt_disable();
>  		__flush_tlb_pending(tlbbatch);
> +		preempt_enable();
> +	}
>  
>  	pte_free_finish();
>  }


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER
  2007-11-11  3:59     ` Benjamin Herrenschmidt
@ 2007-11-11 14:45       ` Steven Rostedt
  2007-11-11 20:48         ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 15+ messages in thread
From: Steven Rostedt @ 2007-11-11 14:45 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: paulmck, linux-kernel, tony, paulus, dino, tytso, dvhltc, niv,
	antonb


On Sun, 11 Nov 2007, Benjamin Herrenschmidt wrote:

>
> > Removed this as well, also seemed to work.  Please note, however, that
> > this is just running kernbench.  But this did seem to get rid of some
> > of the warnings as well.  ;-)  Now only have the xics_startup() warning.
> >
> > > Overall, looks fine !
> >
> > I bet that there are more gotchas lurking in there somewhere, but in
> > the meantime here is the updated patch.
> >
> > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > ---
>
> Well, I suppose the patch could go in, maybe with some ifdef's around
> the bits in _switch_to, there's little point in doing that on non-rt
> kernels.

As Nick Piggin already stated, and I'll even state it for the RT kernel,
we do not allow preemption in switch_to. So it is safe to remove those
"preempt_disable" bits from the patch.

-- Steve

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER
  2007-11-11 14:45       ` Steven Rostedt
@ 2007-11-11 20:48         ` Benjamin Herrenschmidt
  2007-11-12 17:05           ` Paul E. McKenney
  2007-11-12 19:15           ` Paul E. McKenney
  0 siblings, 2 replies; 15+ messages in thread
From: Benjamin Herrenschmidt @ 2007-11-11 20:48 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: paulmck, linux-kernel, tony, paulus, dino, tytso, dvhltc, niv,
	antonb


On Sun, 2007-11-11 at 09:45 -0500, Steven Rostedt wrote:
> > Well, I suppose the patch could go in, maybe with some ifdef's
> around
> > the bits in _switch_to, there's little point in doing that on non-rt
> > kernels.
> 
> As Nick Piggin already stated, and I'll even state it for the RT
> kernel,
> we do not allow preemption in switch_to. So it is safe to remove those
> "preempt_disable" bits from the patch

Sure, I know that, I'm not talking about that, I'm talking about the
added code that flush pending batches & save the batch state, since on
non-rt kernel, this is not useful (the batch is only ever active within
a spinlocked section, which cannot be preempted on non-rt).

Ben.
 


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER
  2007-11-11 20:48         ` Benjamin Herrenschmidt
@ 2007-11-12 17:05           ` Paul E. McKenney
  2007-11-12 19:15           ` Paul E. McKenney
  1 sibling, 0 replies; 15+ messages in thread
From: Paul E. McKenney @ 2007-11-12 17:05 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Steven Rostedt, linux-kernel, tony, paulus, dino, tytso, dvhltc,
	niv, antonb

On Mon, Nov 12, 2007 at 07:48:45AM +1100, Benjamin Herrenschmidt wrote:
> 
> On Sun, 2007-11-11 at 09:45 -0500, Steven Rostedt wrote:
> > > Well, I suppose the patch could go in, maybe with some ifdef's
> > around
> > > the bits in _switch_to, there's little point in doing that on non-rt
> > > kernels.
> > 
> > As Nick Piggin already stated, and I'll even state it for the RT
> > kernel,
> > we do not allow preemption in switch_to. So it is safe to remove those
> > "preempt_disable" bits from the patch
> 
> Sure, I know that, I'm not talking about that, I'm talking about the
> added code that flush pending batches & save the batch state, since on
> non-rt kernel, this is not useful (the batch is only ever active within
> a spinlocked section, which cannot be preempted on non-rt).

Ah, I need a bit of conditional compilation.  Will fix.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER
  2007-11-11 20:48         ` Benjamin Herrenschmidt
  2007-11-12 17:05           ` Paul E. McKenney
@ 2007-11-12 19:15           ` Paul E. McKenney
  2007-11-12 21:12             ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 15+ messages in thread
From: Paul E. McKenney @ 2007-11-12 19:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Steven Rostedt, linux-kernel, tony, paulus, dino, tytso, dvhltc,
	niv, antonb

On Mon, Nov 12, 2007 at 07:48:45AM +1100, Benjamin Herrenschmidt wrote:
> 
> On Sun, 2007-11-11 at 09:45 -0500, Steven Rostedt wrote:
> > > Well, I suppose the patch could go in, maybe with some ifdef's
> > around
> > > the bits in _switch_to, there's little point in doing that on non-rt
> > > kernels.
> > 
> > As Nick Piggin already stated, and I'll even state it for the RT
> > kernel,
> > we do not allow preemption in switch_to. So it is safe to remove those
> > "preempt_disable" bits from the patch
> 
> Sure, I know that, I'm not talking about that, I'm talking about the
> added code that flush pending batches & save the batch state, since on
> non-rt kernel, this is not useful (the batch is only ever active within
> a spinlocked section, which cannot be preempted on non-rt).

And here is an updated patch.  There has to be a better way than the
#ifdef, but I need the two local variables, and breaking the intervening
code out into a separate function didn't quite seem right either.

Thoughts?

This one does only one oops during boot-up, which I will start looking
at:

BUG: sleeping function called from invalid context ifconfig(994) at kernel/rtmutex.c:637
in_atomic():1 [00000002], irqs_disabled():1
Call Trace:
[c0000000ff49f030] [c000000000010028] .show_stack+0x6c/0x1a0 (unreliable)
[c0000000ff49f0d0] [c00000000004f8b4] .__might_sleep+0x11c/0x138
[c0000000ff49f150] [c00000000039c920] .__rt_spin_lock+0x38/0xa0
[c0000000ff49f1d0] [c0000000000cf8e8] .kmem_cache_alloc+0x68/0x184
[c0000000ff49f270] [c0000000001f7534] .radix_tree_node_alloc+0x3c/0x104
[c0000000ff49f300] [c0000000001f8418] .radix_tree_insert+0x19c/0x324
[c0000000ff49f3c0] [c00000000000b758] .irq_radix_revmap+0x140/0x178
[c0000000ff49f470] [c000000000044aec] .xics_startup+0x30/0x54
[c0000000ff49f500] [c0000000000997f4] .setup_irq+0x254/0x320
[c0000000ff49f5b0] [c000000000099984] .request_irq+0xc4/0x114
[c0000000ff49f660] [d00000000079b194] .e1000_open+0xdc/0x1b8 [e1000]
[c0000000ff49f6f0] [c00000000030a840] .dev_open+0x94/0x110
[c0000000ff49f790] [c00000000030a69c] .dev_change_flags+0x110/0x220
[c0000000ff49f830] [c00000000036a0a8] .devinet_ioctl+0x2cc/0x764
[c0000000ff49f930] [c00000000036a6a8] .inet_ioctl+0xe8/0x138
[c0000000ff49f9b0] [c0000000002f9acc] .sock_ioctl+0x2c8/0x314
[c0000000ff49fa50] [c0000000000e6dec] .do_ioctl+0x5c/0xf0
[c0000000ff49faf0] [c0000000000e731c] .vfs_ioctl+0x49c/0x4d4
[c0000000ff49fba0] [c0000000000e73ec] .sys_ioctl+0x98/0xe0
[c0000000ff49fc50] [c000000000117944] .dev_ifsioc+0x1e0/0x46c
[c0000000ff49fd40] [c00000000011e1d4] .compat_sys_ioctl+0x40c/0x4a0
[c0000000ff49fe30] [c00000000000852c] syscall_exit+0x0/0x40

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---

 arch/powerpc/kernel/process.c        |   22 ++++++++++++++++++++++
 arch/powerpc/kernel/prom.c           |    2 +-
 arch/powerpc/mm/tlb_64.c             |    5 ++++-
 arch/powerpc/platforms/pseries/eeh.c |    2 +-
 drivers/of/base.c                    |    2 +-
 include/asm-powerpc/tlb.h            |    5 ++++-
 include/asm-powerpc/tlbflush.h       |   15 ++++++++++-----
 7 files changed, 43 insertions(+), 10 deletions(-)

diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/kernel/process.c linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/process.c
--- linux-2.6.23.1-rt4/arch/powerpc/kernel/process.c	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/process.c	2007-11-12 09:18:55.000000000 -0800
@@ -245,6 +245,10 @@ struct task_struct *__switch_to(struct t
 	struct thread_struct *new_thread, *old_thread;
 	unsigned long flags;
 	struct task_struct *last;
+#ifdef CONFIG_PREEMPT_RT
+	struct ppc64_tlb_batch *batch;
+	int hadbatch;
+#endif /* #ifdef CONFIG_PREEMPT_RT */
 
 #ifdef CONFIG_SMP
 	/* avoid complexity of lazy save/restore of fpu
@@ -325,6 +329,17 @@ struct task_struct *__switch_to(struct t
 	}
 #endif
 
+#ifdef CONFIG_PREEMPT_RT
+	batch = &__get_cpu_var(ppc64_tlb_batch);
+	if (batch->active) {
+		hadbatch = 1;
+		if (batch->index) {
+			__flush_tlb_pending(batch);
+		}
+		batch->active = 0;
+	}
+#endif /* #ifdef CONFIG_PREEMPT_RT */
+
 	local_irq_save(flags);
 
 	account_system_vtime(current);
@@ -335,6 +350,13 @@ struct task_struct *__switch_to(struct t
 
 	local_irq_restore(flags);
 
+#ifdef CONFIG_PREEMPT_RT
+	if (hadbatch) {
+		batch = &__get_cpu_var(ppc64_tlb_batch);
+		batch->active = 1;
+	}
+#endif /* #ifdef CONFIG_PREEMPT_RT */
+
 	return last;
 }
 
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/kernel/prom.c linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/prom.c
--- linux-2.6.23.1-rt4/arch/powerpc/kernel/prom.c	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/arch/powerpc/kernel/prom.c	2007-10-28 13:37:23.000000000 -0700
@@ -80,7 +80,7 @@ struct boot_param_header *initial_boot_p
 
 extern struct device_node *allnodes;	/* temporary while merging */
 
-extern rwlock_t devtree_lock;	/* temporary while merging */
+extern raw_rwlock_t devtree_lock;	/* temporary while merging */
 
 /* export that to outside world */
 struct device_node *of_chosen;
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/mm/tlb_64.c linux-2.6.23.1-rt4-fix/arch/powerpc/mm/tlb_64.c
--- linux-2.6.23.1-rt4/arch/powerpc/mm/tlb_64.c	2007-10-27 22:20:57.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/arch/powerpc/mm/tlb_64.c	2007-11-08 16:49:04.000000000 -0800
@@ -133,7 +133,7 @@ void pgtable_free_tlb(struct mmu_gather 
 void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, unsigned long pte, int huge)
 {
-	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
 	unsigned long vsid, vaddr;
 	unsigned int psize;
 	real_pte_t rpte;
@@ -180,6 +180,7 @@ void hpte_need_flush(struct mm_struct *m
 	 */
 	if (!batch->active) {
 		flush_hash_page(vaddr, rpte, psize, 0);
+		put_cpu_var(ppc64_tlb_batch);
 		return;
 	}
 
@@ -212,12 +213,14 @@ void hpte_need_flush(struct mm_struct *m
 	 */
 	if (machine_is(celleb)) {
 		__flush_tlb_pending(batch);
+		put_cpu_var(ppc64_tlb_batch);
 		return;
 	}
 #endif /* CONFIG_PREEMPT_RT */
 
 	if (i >= PPC64_TLB_BATCH_NR)
 		__flush_tlb_pending(batch);
+	put_cpu_var(ppc64_tlb_batch);
 }
 
 /*
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/arch/powerpc/platforms/pseries/eeh.c linux-2.6.23.1-rt4-fix/arch/powerpc/platforms/pseries/eeh.c
--- linux-2.6.23.1-rt4/arch/powerpc/platforms/pseries/eeh.c	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/arch/powerpc/platforms/pseries/eeh.c	2007-10-28 15:43:54.000000000 -0700
@@ -97,7 +97,7 @@ int eeh_subsystem_enabled;
 EXPORT_SYMBOL(eeh_subsystem_enabled);
 
 /* Lock to avoid races due to multiple reports of an error */
-static DEFINE_SPINLOCK(confirm_error_lock);
+static DEFINE_RAW_SPINLOCK(confirm_error_lock);
 
 /* Buffer for reporting slot-error-detail rtas calls. Its here
  * in BSS, and not dynamically alloced, so that it ends up in
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/drivers/of/base.c linux-2.6.23.1-rt4-fix/drivers/of/base.c
--- linux-2.6.23.1-rt4/drivers/of/base.c	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/drivers/of/base.c	2007-10-28 13:38:36.000000000 -0700
@@ -25,7 +25,7 @@ struct device_node *allnodes;
 /* use when traversing tree through the allnext, child, sibling,
  * or parent members of struct device_node.
  */
-DEFINE_RWLOCK(devtree_lock);
+DEFINE_RAW_RWLOCK(devtree_lock);
 
 int of_n_addr_cells(struct device_node *np)
 {
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/include/asm-powerpc/tlbflush.h linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlbflush.h
--- linux-2.6.23.1-rt4/include/asm-powerpc/tlbflush.h	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlbflush.h	2007-11-08 17:11:18.000000000 -0800
@@ -109,18 +109,23 @@ extern void hpte_need_flush(struct mm_st
 
 static inline void arch_enter_lazy_mmu_mode(void)
 {
-	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
 
 	batch->active = 1;
+	put_cpu_var(ppc64_tlb_batch);
 }
 
 static inline void arch_leave_lazy_mmu_mode(void)
 {
-	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
 
-	if (batch->index)
-		__flush_tlb_pending(batch);
-	batch->active = 0;
+	if (batch->active) {
+		if (batch->index) {
+			__flush_tlb_pending(batch);
+		}
+		batch->active = 0;
+	}
+	put_cpu_var(ppc64_tlb_batch);
 }
 
 #define arch_flush_lazy_mmu_mode()      do {} while (0)
diff -urpNa -X dontdiff linux-2.6.23.1-rt4/include/asm-powerpc/tlb.h linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlb.h
--- linux-2.6.23.1-rt4/include/asm-powerpc/tlb.h	2007-10-12 09:43:44.000000000 -0700
+++ linux-2.6.23.1-rt4-fix/include/asm-powerpc/tlb.h	2007-10-28 11:36:05.000000000 -0700
@@ -44,8 +44,11 @@ static inline void tlb_flush(struct mmu_
 	 * pages are going to be freed and we really don't want to have a CPU
 	 * access a freed page because it has a stale TLB
 	 */
-	if (tlbbatch->index)
+	if (tlbbatch->index) {
+		preempt_disable();
 		__flush_tlb_pending(tlbbatch);
+		preempt_enable();
+	}
 
 	pte_free_finish();
 }

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER
  2007-11-12 19:15           ` Paul E. McKenney
@ 2007-11-12 21:12             ` Benjamin Herrenschmidt
  2007-11-13  9:10               ` Peter Zijlstra
  0 siblings, 1 reply; 15+ messages in thread
From: Benjamin Herrenschmidt @ 2007-11-12 21:12 UTC (permalink / raw)
  To: paulmck
  Cc: Steven Rostedt, linux-kernel, tony, paulus, dino, tytso, dvhltc,
	niv, antonb


> And here is an updated patch.  There has to be a better way than the
> #ifdef, but I need the two local variables, and breaking the intervening
> code out into a separate function didn't quite seem right either.
> 
> Thoughts?

Nothing comes to mind right now...

> This one does only one oops during boot-up, which I will start looking
> at:
> 
> BUG: sleeping function called from invalid context ifconfig(994) at kernel/rtmutex.c:637
> in_atomic():1 [00000002], irqs_disabled():1
> Call Trace:
> [c0000000ff49f030] [c000000000010028] .show_stack+0x6c/0x1a0 (unreliable)
> [c0000000ff49f0d0] [c00000000004f8b4] .__might_sleep+0x11c/0x138
> [c0000000ff49f150] [c00000000039c920] .__rt_spin_lock+0x38/0xa0
> [c0000000ff49f1d0] [c0000000000cf8e8] .kmem_cache_alloc+0x68/0x184
> [c0000000ff49f270] [c0000000001f7534] .radix_tree_node_alloc+0x3c/0x104
> [c0000000ff49f300] [c0000000001f8418] .radix_tree_insert+0x19c/0x324
> [c0000000ff49f3c0] [c00000000000b758] .irq_radix_revmap+0x140/0x178
> [c0000000ff49f470] [c000000000044aec] .xics_startup+0x30/0x54
> [c0000000ff49f500] [c0000000000997f4] .setup_irq+0x254/0x320
> [c0000000ff49f5b0] [c000000000099984] .request_irq+0xc4/0x114
> [c0000000ff49f660] [d00000000079b194] .e1000_open+0xdc/0x1b8 [e1000]
> [c0000000ff49f6f0] [c00000000030a840] .dev_open+0x94/0x110
> [c0000000ff49f790] [c00000000030a69c] .dev_change_flags+0x110/0x220
> [c0000000ff49f830] [c00000000036a0a8] .devinet_ioctl+0x2cc/0x764
> [c0000000ff49f930] [c00000000036a6a8] .inet_ioctl+0xe8/0x138
> [c0000000ff49f9b0] [c0000000002f9acc] .sock_ioctl+0x2c8/0x314
> [c0000000ff49fa50] [c0000000000e6dec] .do_ioctl+0x5c/0xf0
> [c0000000ff49faf0] [c0000000000e731c] .vfs_ioctl+0x49c/0x4d4
> [c0000000ff49fba0] [c0000000000e73ec] .sys_ioctl+0x98/0xe0
> [c0000000ff49fc50] [c000000000117944] .dev_ifsioc+0x1e0/0x46c
> [c0000000ff49fd40] [c00000000011e1d4] .compat_sys_ioctl+0x40c/0x4a0
> [c0000000ff49fe30] [c00000000000852c] syscall_exit+0x0/0x40

The radix tree is used by the powerpc IRQ subsystem with some hand-made
locking that involves per-cpu variables among others, you may want to
have a look at arch/powerpc/kernel/irq.c ... It should all be GFP_ATOMIC
though, but if -rt can't cope with even GFP_ATOMIC when preempt is off,
then we have a deeper problem (the allocation of the page for RCU in
freeing page tables is another one that will GFP_ATOMIC in
non-preemptible context afaik).

Ben.



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER
  2007-11-12 21:12             ` Benjamin Herrenschmidt
@ 2007-11-13  9:10               ` Peter Zijlstra
  2007-11-13  9:43                 ` Benjamin Herrenschmidt
  2007-11-13 10:05                 ` Peter Zijlstra
  0 siblings, 2 replies; 15+ messages in thread
From: Peter Zijlstra @ 2007-11-13  9:10 UTC (permalink / raw)
  To: benh
  Cc: paulmck, Steven Rostedt, linux-kernel, tony, paulus, dino, tytso,
	dvhltc, niv, antonb, Ingo Molnar


On Tue, 2007-11-13 at 08:12 +1100, Benjamin Herrenschmidt wrote:
> > And here is an updated patch.  There has to be a better way than the
> > #ifdef, but I need the two local variables, and breaking the intervening
> > code out into a separate function didn't quite seem right either.
> > 
> > Thoughts?
> 
> Nothing comes to mind right now...
> 
> > This one does only one oops during boot-up, which I will start looking
> > at:
> > 
> > BUG: sleeping function called from invalid context ifconfig(994) at kernel/rtmutex.c:637
> > in_atomic():1 [00000002], irqs_disabled():1
> > Call Trace:
> > [c0000000ff49f030] [c000000000010028] .show_stack+0x6c/0x1a0 (unreliable)
> > [c0000000ff49f0d0] [c00000000004f8b4] .__might_sleep+0x11c/0x138
> > [c0000000ff49f150] [c00000000039c920] .__rt_spin_lock+0x38/0xa0
> > [c0000000ff49f1d0] [c0000000000cf8e8] .kmem_cache_alloc+0x68/0x184
> > [c0000000ff49f270] [c0000000001f7534] .radix_tree_node_alloc+0x3c/0x104
> > [c0000000ff49f300] [c0000000001f8418] .radix_tree_insert+0x19c/0x324
> > [c0000000ff49f3c0] [c00000000000b758] .irq_radix_revmap+0x140/0x178
> > [c0000000ff49f470] [c000000000044aec] .xics_startup+0x30/0x54
> > [c0000000ff49f500] [c0000000000997f4] .setup_irq+0x254/0x320
> > [c0000000ff49f5b0] [c000000000099984] .request_irq+0xc4/0x114
> > [c0000000ff49f660] [d00000000079b194] .e1000_open+0xdc/0x1b8 [e1000]
> > [c0000000ff49f6f0] [c00000000030a840] .dev_open+0x94/0x110
> > [c0000000ff49f790] [c00000000030a69c] .dev_change_flags+0x110/0x220
> > [c0000000ff49f830] [c00000000036a0a8] .devinet_ioctl+0x2cc/0x764
> > [c0000000ff49f930] [c00000000036a6a8] .inet_ioctl+0xe8/0x138
> > [c0000000ff49f9b0] [c0000000002f9acc] .sock_ioctl+0x2c8/0x314
> > [c0000000ff49fa50] [c0000000000e6dec] .do_ioctl+0x5c/0xf0
> > [c0000000ff49faf0] [c0000000000e731c] .vfs_ioctl+0x49c/0x4d4
> > [c0000000ff49fba0] [c0000000000e73ec] .sys_ioctl+0x98/0xe0
> > [c0000000ff49fc50] [c000000000117944] .dev_ifsioc+0x1e0/0x46c
> > [c0000000ff49fd40] [c00000000011e1d4] .compat_sys_ioctl+0x40c/0x4a0
> > [c0000000ff49fe30] [c00000000000852c] syscall_exit+0x0/0x40
> 
> The radix tree is used by the powerpc IRQ subsystem with some hand-made
> locking that involves per-cpu variables among others, you may want to
> have a look at arch/powerpc/kernel/irq.c ... It should all be GFP_ATOMIC
> though, but if -rt can't cope with even GFP_ATOMIC when preempt is off,
> then we have a deeper problem (the allocation of the page for RCU in
> freeing page tables is another one that will GFP_ATOMIC in
> non-preemptible context afaik).

Correct, -rt can't allocate -anything- when preemption if off. That is
the cost for having the allocators itself preemptable.

Even radix_tree_preload() will not work as its functionality was based
on preempt disable to limit access to a global (per cpu) object reserve.
But maybe something similar could be done with a local reserve by using
struct radix_tree_context to pass it along.

I'll see if I can come up with anything like that, that is, if that
would suffice?


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER
  2007-11-13  9:10               ` Peter Zijlstra
@ 2007-11-13  9:43                 ` Benjamin Herrenschmidt
  2007-11-13 10:05                 ` Peter Zijlstra
  1 sibling, 0 replies; 15+ messages in thread
From: Benjamin Herrenschmidt @ 2007-11-13  9:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, Steven Rostedt, linux-kernel, tony, paulus, dino, tytso,
	dvhltc, niv, antonb, Ingo Molnar


On Tue, 2007-11-13 at 10:10 +0100, Peter Zijlstra wrote:
> 
> Correct, -rt can't allocate -anything- when preemption if off. That is
> the cost for having the allocators itself preemptable.
> 
> Even radix_tree_preload() will not work as its functionality was based
> on preempt disable to limit access to a global (per cpu) object
> reserve.
> But maybe something similar could be done with a local reserve by
> using
> struct radix_tree_context to pass it along.
> 
> I'll see if I can come up with anything like that, that is, if that
> would suffice?

For that specific problem, as long as the radix tree can be made to work
while non-preemptible, I'm fine :-)

I'm still worried by other cases where things expect GFP_ATOMIC to work
at any time.

Ben.



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER
  2007-11-13  9:10               ` Peter Zijlstra
  2007-11-13  9:43                 ` Benjamin Herrenschmidt
@ 2007-11-13 10:05                 ` Peter Zijlstra
  2007-11-13 10:59                   ` Peter Zijlstra
  2007-11-13 11:09                   ` Benjamin Herrenschmidt
  1 sibling, 2 replies; 15+ messages in thread
From: Peter Zijlstra @ 2007-11-13 10:05 UTC (permalink / raw)
  To: benh
  Cc: paulmck, Steven Rostedt, linux-kernel, tony, paulus, dino, tytso,
	dvhltc, niv, antonb, Ingo Molnar


On Tue, 2007-11-13 at 10:10 +0100, Peter Zijlstra wrote:
> On Tue, 2007-11-13 at 08:12 +1100, Benjamin Herrenschmidt wrote:
> > > And here is an updated patch.  There has to be a better way than the
> > > #ifdef, but I need the two local variables, and breaking the intervening
> > > code out into a separate function didn't quite seem right either.
> > > 
> > > Thoughts?
> > 
> > Nothing comes to mind right now...
> > 
> > > This one does only one oops during boot-up, which I will start looking
> > > at:
> > > 
> > > BUG: sleeping function called from invalid context ifconfig(994) at kernel/rtmutex.c:637
> > > in_atomic():1 [00000002], irqs_disabled():1
> > > Call Trace:
> > > [c0000000ff49f030] [c000000000010028] .show_stack+0x6c/0x1a0 (unreliable)
> > > [c0000000ff49f0d0] [c00000000004f8b4] .__might_sleep+0x11c/0x138
> > > [c0000000ff49f150] [c00000000039c920] .__rt_spin_lock+0x38/0xa0
> > > [c0000000ff49f1d0] [c0000000000cf8e8] .kmem_cache_alloc+0x68/0x184
> > > [c0000000ff49f270] [c0000000001f7534] .radix_tree_node_alloc+0x3c/0x104
> > > [c0000000ff49f300] [c0000000001f8418] .radix_tree_insert+0x19c/0x324
> > > [c0000000ff49f3c0] [c00000000000b758] .irq_radix_revmap+0x140/0x178
> > > [c0000000ff49f470] [c000000000044aec] .xics_startup+0x30/0x54
> > > [c0000000ff49f500] [c0000000000997f4] .setup_irq+0x254/0x320
> > > [c0000000ff49f5b0] [c000000000099984] .request_irq+0xc4/0x114
> > > [c0000000ff49f660] [d00000000079b194] .e1000_open+0xdc/0x1b8 [e1000]
> > > [c0000000ff49f6f0] [c00000000030a840] .dev_open+0x94/0x110
> > > [c0000000ff49f790] [c00000000030a69c] .dev_change_flags+0x110/0x220
> > > [c0000000ff49f830] [c00000000036a0a8] .devinet_ioctl+0x2cc/0x764
> > > [c0000000ff49f930] [c00000000036a6a8] .inet_ioctl+0xe8/0x138
> > > [c0000000ff49f9b0] [c0000000002f9acc] .sock_ioctl+0x2c8/0x314
> > > [c0000000ff49fa50] [c0000000000e6dec] .do_ioctl+0x5c/0xf0
> > > [c0000000ff49faf0] [c0000000000e731c] .vfs_ioctl+0x49c/0x4d4
> > > [c0000000ff49fba0] [c0000000000e73ec] .sys_ioctl+0x98/0xe0
> > > [c0000000ff49fc50] [c000000000117944] .dev_ifsioc+0x1e0/0x46c
> > > [c0000000ff49fd40] [c00000000011e1d4] .compat_sys_ioctl+0x40c/0x4a0
> > > [c0000000ff49fe30] [c00000000000852c] syscall_exit+0x0/0x40
> > 
> > The radix tree is used by the powerpc IRQ subsystem with some hand-made
> > locking that involves per-cpu variables among others, you may want to
> > have a look at arch/powerpc/kernel/irq.c ... It should all be GFP_ATOMIC
> > though, but if -rt can't cope with even GFP_ATOMIC when preempt is off,
> > then we have a deeper problem (the allocation of the page for RCU in
> > freeing page tables is another one that will GFP_ATOMIC in
> > non-preemptible context afaik).
> 
> Correct, -rt can't allocate -anything- when preemption if off. That is
> the cost for having the allocators itself preemptable.
> 
> Even radix_tree_preload() will not work as its functionality was based
> on preempt disable to limit access to a global (per cpu) object reserve.
> But maybe something similar could be done with a local reserve by using
> struct radix_tree_context to pass it along.
> 
> I'll see if I can come up with anything like that, that is, if that
> would suffice?

Looking at the code:

/* radix tree not lockless safe ! we use a brlock-type mecanism
 * for now, until we can use a lockless radix tree
 */
static void irq_radix_wrlock(unsigned long *flags)

The RCU radix tree stuffs have gone upstream long ago.

Anyway, it seems its the generic irq stuff that uses raw_spinlock_t and
disables IRQs, so there isn't much we can do from the ARCH level I'm
afraid :-(

Ingo, any sane ideas?


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER
  2007-11-13 10:05                 ` Peter Zijlstra
@ 2007-11-13 10:59                   ` Peter Zijlstra
  2007-11-13 11:09                   ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2007-11-13 10:59 UTC (permalink / raw)
  To: benh
  Cc: paulmck, Steven Rostedt, linux-kernel, tony, paulus, dino, tytso,
	dvhltc, niv, antonb, Ingo Molnar


On Tue, 2007-11-13 at 11:05 +0100, Peter Zijlstra wrote:

> Anyway, it seems its the generic irq stuff that uses raw_spinlock_t and
> disables IRQs, so there isn't much we can do from the ARCH level I'm
> afraid :-(
> 
> Ingo, any sane ideas?

Ok benh came up with a workable idea, he just needs a night's sleep to
come up with the details :-)

The idea is to fill the radix tree from host->ops->map() (or
irq_create_mapping()) as that should still be preemptable, and then
convert all other uses to RCU lookups.




^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER
  2007-11-13 10:05                 ` Peter Zijlstra
  2007-11-13 10:59                   ` Peter Zijlstra
@ 2007-11-13 11:09                   ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 15+ messages in thread
From: Benjamin Herrenschmidt @ 2007-11-13 11:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, Steven Rostedt, linux-kernel, tony, paulus, dino, tytso,
	dvhltc, niv, antonb, Ingo Molnar


On Tue, 2007-11-13 at 11:05 +0100, Peter Zijlstra wrote:
> Looking at the code:
> 
> /* radix tree not lockless safe ! we use a brlock-type mecanism
>  * for now, until we can use a lockless radix tree
>  */
> static void irq_radix_wrlock(unsigned long *flags)
> 
> The RCU radix tree stuffs have gone upstream long ago.
> 
> Anyway, it seems its the generic irq stuff that uses raw_spinlock_t
> and
> disables IRQs, so there isn't much we can do from the ARCH level I'm
> afraid :-(
> 
> Ingo, any sane ideas?

As discussed on IRC< there are a couple of other related issues, though
part of them go away if I can get rid of the brlock I have in there
thanks to the new RCU based radix tree.

I'll give that some thoughts and come back to you tomorrow or thursday
hopefully with a patch.

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2007-11-13 11:41 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-11-09 18:10 [PATCH, RFC] improved hacks to allow -rt to run kernbench on POWER Paul E. McKenney
2007-11-09 20:52 ` Benjamin Herrenschmidt
2007-11-09 22:07   ` Paul E. McKenney
2007-11-11  3:59     ` Benjamin Herrenschmidt
2007-11-11 14:45       ` Steven Rostedt
2007-11-11 20:48         ` Benjamin Herrenschmidt
2007-11-12 17:05           ` Paul E. McKenney
2007-11-12 19:15           ` Paul E. McKenney
2007-11-12 21:12             ` Benjamin Herrenschmidt
2007-11-13  9:10               ` Peter Zijlstra
2007-11-13  9:43                 ` Benjamin Herrenschmidt
2007-11-13 10:05                 ` Peter Zijlstra
2007-11-13 10:59                   ` Peter Zijlstra
2007-11-13 11:09                   ` Benjamin Herrenschmidt
2007-11-10  1:18   ` Nick Piggin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox