LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Syscall auditing and seccomp for PPC32.
From: David Woodhouse @ 2005-05-08 15:07 UTC (permalink / raw)
  To: paulus, benh; +Cc: linuxppc-dev

For information (and feedback) only. Do not apply; it's already in 
    rsync://rsync.kernel.org/pub/scm/linux/kernel/git/dwmw2/audit-2.6.git

Why _is_ do_syscall_trace() exported, anyway?

Index: arch/ppc/Kconfig
===================================================================
--- 6cdd9771bd9a56de1b0246a330ccd916ecdb1d41/arch/ppc/Kconfig  (mode:100644)
+++ 27383b18b9f62d3c4f1b5dd9f3daeffb10416c15/arch/ppc/Kconfig  (mode:100644)
@@ -1083,6 +1083,23 @@
 
 source kernel/power/Kconfig
 
+config SECCOMP
+	bool "Enable seccomp to safely compute untrusted bytecode"
+	depends on PROC_FS
+	default y
+	help
+	  This kernel feature is useful for number crunching applications
+	  that may need to compute untrusted bytecode during their
+	  execution. By using pipes or other transports made available to
+	  the process as file descriptors supporting the read/write
+	  syscalls, it's possible to isolate those applications in
+	  their own address space using seccomp. Once seccomp is
+	  enabled via /proc/<pid>/seccomp, it cannot be disabled
+	  and the task is only allowed to execute a few safe syscalls
+	  defined by each seccomp mode.
+
+	  If unsure, say Y. Only embedded should say N here.
+
 endmenu
 
 config ISA_DMA_API
Index: arch/ppc/kernel/entry.S
===================================================================
--- 6cdd9771bd9a56de1b0246a330ccd916ecdb1d41/arch/ppc/kernel/entry.S  (mode:100644)
+++ 27383b18b9f62d3c4f1b5dd9f3daeffb10416c15/arch/ppc/kernel/entry.S  (mode:100644)
@@ -202,7 +202,7 @@
 	rlwinm	r11,r11,0,~_TIFL_FORCE_NOERROR
 	stw	r11,TI_LOCAL_FLAGS(r10)
 	lwz	r11,TI_FLAGS(r10)
-	andi.	r11,r11,_TIF_SYSCALL_TRACE
+	andi.	r11,r11,_TIF_SYSCALL_T_OR_A
 	bne-	syscall_dotrace
 syscall_dotrace_cont:
 	cmplwi	0,r0,NR_syscalls
@@ -237,7 +237,7 @@
 	SYNC
 	MTMSRD(r10)
 	lwz	r9,TI_FLAGS(r12)
-	andi.	r0,r9,(_TIF_SYSCALL_TRACE|_TIF_SIGPENDING|_TIF_NEED_RESCHED)
+	andi.	r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SIGPENDING|_TIF_NEED_RESCHED)
 	bne-	syscall_exit_work
 syscall_exit_cont:
 #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
@@ -277,7 +277,8 @@
 	SAVE_NVGPRS(r1)
 	li	r0,0xc00
 	stw	r0,TRAP(r1)
-	bl	do_syscall_trace
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	do_syscall_trace_enter
 	lwz	r0,GPR0(r1)	/* Restore original registers */
 	lwz	r3,GPR3(r1)
 	lwz	r4,GPR4(r1)
@@ -291,7 +292,7 @@
 syscall_exit_work:
 	stw	r6,RESULT(r1)	/* Save result */
 	stw	r3,GPR3(r1)	/* Update return value */
-	andi.	r0,r9,_TIF_SYSCALL_TRACE
+	andi.	r0,r9,_TIF_SYSCALL_T_OR_A
 	beq	5f
 	ori	r10,r10,MSR_EE
 	SYNC
@@ -303,7 +304,8 @@
 	li	r4,0xc00
 	stw	r4,TRAP(r1)
 4:
-	bl	do_syscall_trace
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	do_syscall_trace_leave
 	REST_NVGPRS(r1)
 2:
 	lwz	r3,GPR3(r1)
@@ -627,8 +629,8 @@
 	subi	r1,r3,STACK_FRAME_OVERHEAD
 	rlwinm	r12,r1,0,0,18	/* current_thread_info() */
 	lwz	r9,TI_FLAGS(r12)
-	andi.	r0,r9,_TIF_SYSCALL_TRACE
-	bnel-	do_syscall_trace
+	andi.	r0,r9,_TIF_SYSCALL_T_OR_A
+	bnel-	do_syscall_trace_leave
 	/* fall through */
 
 	.globl	ret_from_except_full
Index: arch/ppc/kernel/ppc_ksyms.c
===================================================================
--- 6cdd9771bd9a56de1b0246a330ccd916ecdb1d41/arch/ppc/kernel/ppc_ksyms.c  (mode:100644)
+++ 27383b18b9f62d3c4f1b5dd9f3daeffb10416c15/arch/ppc/kernel/ppc_ksyms.c  (mode:100644)
@@ -55,7 +55,6 @@
 #define EXPORT_SYMTAB_STROPS
 
 extern void transfer_to_handler(void);
-extern void do_syscall_trace(void);
 extern void do_IRQ(struct pt_regs *regs);
 extern void MachineCheckException(struct pt_regs *regs);
 extern void AlignmentException(struct pt_regs *regs);
@@ -74,7 +73,6 @@
 EXPORT_SYMBOL(clear_pages);
 EXPORT_SYMBOL(clear_user_page);
 EXPORT_SYMBOL(do_signal);
-EXPORT_SYMBOL(do_syscall_trace);
 EXPORT_SYMBOL(transfer_to_handler);
 EXPORT_SYMBOL(do_IRQ);
 EXPORT_SYMBOL(MachineCheckException);
Index: arch/ppc/kernel/ptrace.c
===================================================================
--- 6cdd9771bd9a56de1b0246a330ccd916ecdb1d41/arch/ppc/kernel/ptrace.c  (mode:100644)
+++ 27383b18b9f62d3c4f1b5dd9f3daeffb10416c15/arch/ppc/kernel/ptrace.c  (mode:100644)
@@ -27,6 +27,9 @@
 #include <linux/user.h>
 #include <linux/security.h>
 #include <linux/signal.h>
+#include <linux/seccomp.h>
+#include <linux/audit.h>
+#include <linux/module.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -455,11 +458,10 @@
 	return ret;
 }
 
-void do_syscall_trace(void)
+static void do_syscall_trace(void)
 {
-        if (!test_thread_flag(TIF_SYSCALL_TRACE)
-	    || !(current->ptrace & PT_PTRACED))
-		return;
+	/* the 0x80 provides a way for the tracing parent to distinguish
+	   between a syscall stop and SIGTRAP delivery */
 	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
 				 ? 0x80 : 0));
 
@@ -473,3 +475,33 @@
 		current->exit_code = 0;
 	}
 }
+
+void do_syscall_trace_enter(struct pt_regs *regs)
+{
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		do_syscall_trace();
+
+	if (unlikely(current->audit_context))
+		audit_syscall_entry(current, AUDIT_ARCH_PPC,
+				    regs->gpr[0],
+				    regs->gpr[3], regs->gpr[4],
+				    regs->gpr[5], regs->gpr[6]);
+}
+
+void do_syscall_trace_leave(struct pt_regs *regs)
+{
+	secure_computing(regs->gpr[0]);
+
+	if (unlikely(current->audit_context))
+		audit_syscall_exit(current,
+				   (regs->ccr&0x1000)?AUDITSC_FAILURE:AUDITSC_SUCCESS,
+				   regs->result);
+
+	if ((test_thread_flag(TIF_SYSCALL_TRACE))
+	    && (current->ptrace & PT_PTRACED))
+		do_syscall_trace();
+}
+
+EXPORT_SYMBOL(do_syscall_trace_enter);
+EXPORT_SYMBOL(do_syscall_trace_leave);
Index: include/asm-ppc/thread_info.h
===================================================================
--- 6cdd9771bd9a56de1b0246a330ccd916ecdb1d41/include/asm-ppc/thread_info.h  (mode:100644)
+++ 27383b18b9f62d3c4f1b5dd9f3daeffb10416c15/include/asm-ppc/thread_info.h  (mode:100644)
@@ -77,12 +77,19 @@
 #define TIF_POLLING_NRFLAG	4	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
 #define TIF_MEMDIE		5
+#define TIF_SYSCALL_AUDIT       6       /* syscall auditing active */
+#define TIF_SECCOMP             7      /* secure computing */
+
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
+#define _TIF_SYSCALL_AUDIT      (1<<TIF_SYSCALL_AUDIT)
+#define _TIF_SECCOMP            (1<<TIF_SECCOMP)
+
+#define _TIF_SYSCALL_T_OR_A     (_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP)
 
 /*
  * Non racy (local) flags bit numbers
Index: init/Kconfig
===================================================================
--- 6cdd9771bd9a56de1b0246a330ccd916ecdb1d41/init/Kconfig  (mode:100644)
+++ 27383b18b9f62d3c4f1b5dd9f3daeffb10416c15/init/Kconfig  (mode:100644)
@@ -173,7 +173,7 @@
 
 config AUDITSYSCALL
 	bool "Enable system-call auditing support"
-	depends on AUDIT && (X86 || PPC64 || ARCH_S390 || IA64 || UML)
+	depends on AUDIT && (X86 || PPC || PPC64 || ARCH_S390 || IA64 || UML)
 	default y if SECURITY_SELINUX
 	help
 	  Enable low-overhead system-call auditing infrastructure that

-- 
dwmw2

^ permalink raw reply

* Re: Laptop sleep & current "git" tree
From: Colin Leroy @ 2005-05-08  9:39 UTC (permalink / raw)
  To: John Steele Scott; +Cc: linuxppc-dev, debian-powerpc
In-Reply-To: <87k6mamj6o.fsf@toojays.net>

On 08 May 2005 at 14h05, John Steele Scott wrote:

Hi, 

> I finally tried this today, using 2.6.12-rc4, which has this patch.
> Without USB devices attached, it's okay, but with my USB hub
> attached, I get a (reproducible) panic in ehci_hcd on the way to
> sleep.
> 
> I took a photo of the panic and enhanced it for readability, it's at
> <http://www.toojays.net/portal/Members/toojays/ibook-g4-sleep-crash-2.6.12-rc4.jpg>.

linux-usb-devel may be interested. (I don't have a USB hub, so I can't
test it.)

-- 
Colin

^ permalink raw reply

* Re: Laptop sleep & current "git" tree
From: John Steele Scott @ 2005-05-08  4:32 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: debian-powerpc
In-Reply-To: <20050507140644.5919b4c3@jack.colino.net>

[-- Attachment #1: Type: text/plain, Size: 664 bytes --]

Colin Leroy <colin@colino.net> writes:
> On 04 May 2005 at 09h05, Benjamin Herrenschmidt wrote:
>
> Hi, 
>
>> I have no problem but I didn't see the hub power switching patch on my
>> git commit logs and it wasn't in linus tree yesterday at least :)
>
> Btw, it's in now.

I finally tried this today, using 2.6.12-rc4, which has this patch. Without
USB devices attached, it's okay, but with my USB hub attached, I get a
(reproducible) panic in ehci_hcd on the way to sleep.

I took a photo of the panic and enhanced it for readability, it's at
<http://www.toojays.net/portal/Members/toojays/ibook-g4-sleep-crash-2.6.12-rc4.jpg>.

cheers,

John

[-- Attachment #2: Type: application/pgp-signature, Size: 282 bytes --]

^ permalink raw reply

* Re: How to fix 8xx dcbst bug?
From: Dan Malek @ 2005-05-08  1:10 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Joakim Tjernlund, linux-ppc-embedded
In-Reply-To: <20050506154539.GA6452@logos.cnet>


On May 6, 2005, at 11:45 AM, Marcelo Tosatti wrote:

>
> Hi Dan,
>
> So, restarting this conversation...

One of the things I don't want to lose sight of during
all of this is the real performance problem in 2.6.
Your test results show there is something that needs
attention, regardless of using pinned entries.  We
need to continue some of this investigation, it
affects all processors.

Thanks.

	-- Dan

^ permalink raw reply

* Re: How to fix 8xx dcbst bug?
From: Dan Malek @ 2005-05-08  1:00 UTC (permalink / raw)
  To: Joakim.Tjernlund; +Cc: linux-ppc-embedded
In-Reply-To: <BCEFJBPJCGFCNMMMIDBHKEJOCLAA.Joakim.Tjernlund@lumentis.se>


On May 7, 2005, at 6:47 PM, Joakim Tjernlund wrote:

> I was once told by a senior kernel hacker(Dan?) that using swap could 
> make an already
> populated pte invalid. This was for 2.4 kernels.

The original problem was I didn't pay attention to the swap PTE when
constructing the originals for the 8xx.  The swap PTE has to be 
indicated
as such in one of the lowest 2 bits of the PTE, as the rest of the PTE 
is
used for the swap block number.  What would happen is there would
be the occasion where some control bit was set in the PTE within the
swap block number, and upon restoring from disk the wrong block
would be brought into memory (or the fact it was swapped was lost).

Thanks.


	-- Dan

^ permalink raw reply

* RE: How to fix 8xx dcbst bug?
From: Joakim Tjernlund @ 2005-05-07 22:47 UTC (permalink / raw)
  To: wd; +Cc: linux-ppc-embedded
In-Reply-To: <20050507183913.A7238C1512@atlas.denx.de>

> 
> In message <JPEALJAFNGDDLOPNDIEECEAKDDAA.joakim.tjernlund@lumentis.se> you wrote:
> >
> ...
> > safe, just look at ld.so. This should not be a requirement but for 8xx it is currently and I think 8xx gets
> > away with it because nobody is using swap on 8xx.
> 
> I understand that the "not using swap" refers to 2.6 kernel only, right?
> 
> Because on 2.4, we use swap on several systems quite heavily. I  have
> to admit that we had to patch pgtable.h to get it work.

I was once told by a senior kernel hacker (Dan?) that using swap could make an already
populated pte invalid. This was for 2.4 kernels.

ldso uses dcbst and icbi heavily during relocation but always after doing a store to the
address in question. That will make sure that if dcbst/icbi causes a TLB Miss, it will be
a simple/regular TLB miss not requiring any special handling in fault.c.

Perhaps using dcbst/icbi in this way will not make the pte go away between the store and the
dcbst/icbi instructions when also using swap. Someone more experienced will have to answer that.

 Jocke

^ permalink raw reply

* Re: [26-devel] v2.6 performance slowdown on MPC8xx: Measuring TLB cache misses
From: Dan Malek @ 2005-05-07 20:24 UTC (permalink / raw)
  To: Joakim.Tjernlund; +Cc: linuxppc-embedded
In-Reply-To: <BCEFJBPJCGFCNMMMIDBHGEJMCLAA.Joakim.Tjernlund@lumentis.se>


On May 7, 2005, at 2:10 PM, Joakim Tjernlund wrote:

> Not completely sure that this is correct. There are a few:
> 	addi	r10, r10, 0x0100
>  	mtspr	SPRN_MD_CTR, r10
> later on which will "overflow" 0x1f00 into 0x2000 etc.

Oh right, I forgot I did that.  I explicitly set the tlb index before
each write.  Sorry, I thought it was due to more bits of index
in the 885.

So, I guess what was there should have worked.

OK, so the reason TLB pinning doesn't work is a tlbie() can
evict the pinned entry.  That stupid code in the cpm reset
will throw them out, plus anything else that would do a
tlbie() of a kernel address within the pinned space (like
the update_mmu_cache() hack).  We have to fix those,
and look for any others where that may happen.

Thanks.


	-- Dan

^ permalink raw reply

* Re: [PATCH] 8xx: fix usage of pinned 8Mbyte TLB entries
From: Dan Malek @ 2005-05-07 20:02 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: linux-ppc-embedded
In-Reply-To: <20050507131609.GA16996@logos.cnet>


On May 7, 2005, at 9:16 AM, Marcelo Tosatti wrote:

> Not creating 4kb mappings for the first 8Mbytes of kernel
> virtual addresses fixed the problem for me.

Fixed what problem?

In the TLB dump, you replaced the initial 8M entry with
a bunch of 4K page entries, just as I would have expected
to happen.  Since it was able to run and load these, the
complete PTE tables must have been created.

How did you "not create" the 4K mappings?

Thanks.

	-- Dan

^ permalink raw reply

* Re: [26-devel] v2.6 performance slowdown on MPC8xx: Measuring TLB cache misses
From: Marcelo Tosatti @ 2005-05-07 14:42 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: linuxppc-embedded
In-Reply-To: <BCEFJBPJCGFCNMMMIDBHGEJMCLAA.Joakim.Tjernlund@lumentis.se>

On Sat, May 07, 2005 at 08:10:38PM +0200, Joakim Tjernlund wrote:
> > Hi Dan, Joakim,
> > 
> > On Sat, Apr 23, 2005 at 08:00:39PM -0400, Dan Malek wrote:
> > > 
> > > On Apr 23, 2005, at 7:51 PM, Joakim Tjernlund wrote:
> > > 
> > > >hmm, I have more than 24MB of memory and I can run CONFIG_PIN_TLB just
> > > >fine with modules off in kernel 2.4. Havn't tried 2.6 yet.
> > > 
> > > Doh.  Oh, I see.  We only do the optimization for the instruction 
> > > misses.
> > > I'll have to take a closer look at Marcelo's 2.6 tests.
> > 
> > The PIN TLB entry option does not make much difference in my tests, 
> > never did.
> > 
> > Who wrote the code? Are there results which indicate a performance gain
> > from TLB pinning on 8xx? If so, where are such results? 
> > 
> > One problem that I've noted is that initial_mmu sets {I,D}TLB index
> > to be 27 (11100). 
> > 
> > MI_RSV4I protects TLB's 27...31.
> > 
> > Given that both {I,D}TLB INDEX's are _decreased_ on each update, it seems
> > to me that initial_mmu should set {I,D}TLB INDEX to 31, which will then 
> > decrease down to 27 after 4 TLB's are created.  
> > 
> > Another question that comes to mind is why initial_mmu does create 
> > additional 8Meg TLB entries for D-cache but not for I-cache: 
> > 
> > #ifdef CONFIG_PIN_TLB
> >         /* Map two more 8M kernel data pages.
> >         */
> > 	...
> > #endif
> 
> Not completely sure that this is correct. There are a few:
> 	addi	r10, r10, 0x0100
>  	mtspr	SPRN_MD_CTR, r10
> later on which will "overflow" 0x1f00 into 0x2000 etc.

Yep. This is not correct at all: the TLB index is increased 
at each miss, not decreased as the manual says. 

I have confirmed it with the BDI... 

>  Jocke
> > 
> > I'll do some more CONFIG_PIN_TLB tests this week...
> > 
> > --- head_8xx.S.orig2	2005-04-24 17:55:59.000000000 -0300
> > +++ head_8xx.S	2005-04-24 17:57:44.000000000 -0300
> > @@ -697,7 +697,7 @@
> >  	tlbia			/* Invalidate all TLB entries */
> >  #ifdef CONFIG_PIN_TLB
> >  	lis	r8, MI_RSV4I@h
> > -	ori	r8, r8, 0x1c00
> > +	ori	r8, r8, 0x1f00
> >  #else
> >  	li	r8, 0
> >  #endif
> > @@ -705,7 +705,7 @@
> >  
> >  #ifdef CONFIG_PIN_TLB
> >  	lis	r10, (MD_RSV4I | MD_RESETVAL)@h
> > -	ori	r10, r10, 0x1c00
> > +	ori	r10, r10, 0x1f00
> >  	mr	r8, r10
> >  #else
> >  	lis	r10, MD_RESETVAL@h

^ permalink raw reply

* Re: How to fix 8xx dcbst bug?
From: Marcelo Tosatti @ 2005-05-07 13:57 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: linux-ppc-embedded
In-Reply-To: <JPEALJAFNGDDLOPNDIEECEAKDDAA.joakim.tjernlund@lumentis.se>

On Sat, May 07, 2005 at 08:24:01PM +0200, Joakim Tjernlund wrote:
> > 
> > 
> > Hi Dan,
> > 
> > So, restarting this conversation...
> > 
> > On Tue, Apr 05, 2005 at 11:58:17AM -0400, Dan Malek wrote:
> > > 
> > > On Apr 4, 2005, at 3:17 PM, Marcelo Tosatti wrote:
> > > 
> > > >Problem is that the "dcbst" instruction will, _sometimes_ (the 
> > > >failure/success rate is about 1/4
> > > >with my test application) fault as a _write_ operation on the data.
> > > 
> > > Oh, geeze .... It's all coming back to me now ....
> > > 
> > > The 8xx cache operations don't always operate as defined in the PEM.
> > > There are likely to be some archive discussions within the Freescale
> > > knowledge data base that describe the different behaviors I've seen
> > > with the chip variants and revisions.  I can't find any of those e-mail
> > > discussions, so I'll try to recall from memory.
> > > 
> > > The PEM cache instructions are all implemented in a microcode that
> > > uses the 8xx unique cache control SPRs.  Depending upon the state
> > > of the cache and MMU, it seems in some cases the EA translation is
> > > subject to a "normal" protection match instead of a load operation 
> > > match.
> > > 
> > > The behavior of these operations isn't consistent across all of the 8xx
> > > processor revisions, especially with early silicon if people are still
> > > using those.	During conversations with Freescale engineers, it seems
> > > the only guaranteed operation was to use the 8xx unique SPRs, but
> > > I think I only did that in 8xx specific functions.
> > >
> > > We have way too much code in the TLB exception handlers already,
> > > so let's just try a tlbia of the EA in the update_mmu_cache, with an 
> > > #ifdef
> > > for the 8xx.	
> > 
> > >  We may want to make the dcbxxx instructions 
> > > some
> > > kind of macro, so on 8xx we can include such operations in otherwise
> > > "standard" software.
> > 
> > Now that I think of it, userspace dcbst users should not be an issue
> > because the intermediate invalid TLB entry is not visible to applications.
> > 
> > Userspace sees only: not present pte, or valid present pte.
> > 
> > Well, at least the entry which has been causing problems,
> > created by DataStoreTLBMiss.
> > 
> > Is it safe to assume that dcbst usage on userspace is restricted
> > to valid TLBs? Since MMU SPRs are restricted to supervisor 
> > protection, I think so.
> 
> Not sure what you mean here. Currently all dcbX instr. in user space 
> have to be on valid\populated pte's since otherwise it will SEGV. 

OK. The BUG in update_mmu_cache() is triggered because dcbX happens 
on a populated but invalid page mapping.

> If you write your app so that any dcbX will only cause a plain DTLB you are
> safe, just look at ld.so. This should not be a requirement but for 8xx it is currently and I think 8xx gets
> away with it because nobody is using swap on 8xx.

^ permalink raw reply

* Re: [PATCH] 8xx: fix usage of pinned 8Mbyte TLB entries
From: Marcelo Tosatti @ 2005-05-07 14:05 UTC (permalink / raw)
  To: Dan Malek; +Cc: linux-ppc-embedded
In-Reply-To: <6f231f4afce0886929ca99426a86b47a@embeddededge.com>


> NO.  Just leave that code alone.  I don't understand why you think
> doing this will have any effect on the system operation.  If you are
> able to run a system without creating these tables, then the pinned
> TLBs must be working.  If pinned TLBs weren't working, the kernel
> would crash.

I just booted a kernel with 4kb PTE mappings at KERNELBASE and
the pinned TLB was not trashed.

So, I was talking nonsense. :)

The only problem is DMA users who don't use the dma_alloc_coherent API. 

^ permalink raw reply

* Re: [PATCH] 8xx: fix usage of pinned 8Mbyte TLB entries
From: Marcelo Tosatti @ 2005-05-07 13:16 UTC (permalink / raw)
  To: Dan Malek; +Cc: linux-ppc-embedded
In-Reply-To: <6f231f4afce0886929ca99426a86b47a@embeddededge.com>


> >So you do agree that pte's should not be created for the first
> >8MBytes if CONFIG_PIN_TLB is set? :)
> 
> NO.  Just leave that code alone.  I don't understand why you think
> doing this will have any effect on the system operation.   
>
> If you are able to run a system without creating these tables, then 
> the pinned TLBs must be working.  If pinned TLBs weren't working, 
> the kernel would crash. 

Not creating 4kb mappings for the first 8Mbytes of kernel 
virtual addresses fixed the problem for me. 

Break at first start_kernel instruction (0xc02284a4).

Dump TLB contents to "itlb-before".

[marcelo@dmt ~]$ grep SPR itlb-before | grep 816
SPR  816 : 0x10002080    268443776
SPR  816 : 0x10001080    268439680
SPR  816 : 0x0ff79080    267882624
SPR  816 : 0x0ff261c0    267542976
SPR  816 : 0x0ff521c0    267723200
SPR  816 : 0x100121c0    268509632
SPR  816 : 0x100011c0    268440000
SPR  816 : 0x0ffdd1c0    268292544
SPR  816 : 0x0ffdb1c0    268284352
SPR  816 : 0x0fef51c0    267342272
SPR  816 : 0x0fef91c0    267358656
SPR  816 : 0x0fe0b1c0    266383808
SPR  816 : 0x0fef71c0    267350464
SPR  816 : 0x0fef61c0    267346368
SPR  816 : 0x0ffee1c0    268362176
SPR  816 : 0x0ffdc1c0    268288448
SPR  816 : 0x0fef21c0    267329984
SPR  816 : 0x0fef11c0    267325888
SPR  816 : 0x0fe071c0    266367424
SPR  816 : 0x0ffc61c0    268198336
SPR  816 : 0x0fe0c1c0    266387904
SPR  816 : 0x0ffc51c0    268194240
SPR  816 : 0x0fe091c0    266375616
SPR  816 : 0x0ffea080    268345472
SPR  816 : 0x0ff20080    267518080
SPR  816 : 0x0ff81080    267915392
SPR  816 : 0x1001c080    268550272
SPR  816 : 0x10008080    268468352
SPR  816 : 0x100021e0    268444128
SPR  816 : 0x100241e0    268583392
SPR  816 : 0x100301e0    268632544
SPR  816 : 0xc0000e1f  -1073738209   <----- VALID 8MB TLB ENTRY

"go" 
BDI breaks at 

BDI>i
    Target state      : debug mode
    Debug entry cause : instruction breakpoint
    Current PC        : 0xc0228544
BDI>

0xc0228538 <start_kernel+148>:  bl      0xc023107c <pidhash_init>
0xc022853c <start_kernel+152>:  bl      0xc0230f1c <init_timers>
0xc0228540 <start_kernel+156>:  bl      0xc0230cf4 <softirq_init>
0xc0228544 <start_kernel+160>:  bl      0xc022ead0 <time_init>
0xc0228548 <start_kernel+164>:  bl      0xc02354b0 <console_init>
0xc022854c <start_kernel+168>:  lis     r9,-16348

[marcelo@dmt ~]$ grep SPR itlb-2 | grep 816
SPR  816 : 0x10001100    268439808
SPR  816 : 0x0ffdd100    268292352
SPR  816 : 0x0ffdb100    268284160
SPR  816 : 0x0fef5100    267342080
SPR  816 : 0x0fef9100    267358464
SPR  816 : 0x0fe0b100    266383616
SPR  816 : 0x0fef7100    267350272
SPR  816 : 0x0fef6100    267346176
SPR  816 : 0x0ffee100    268361984
SPR  816 : 0x0ffdc100    268288256
SPR  816 : 0xc0038110  -1073512176     <---------
SPR  816 : 0xc0063110  -1073336048     
SPR  816 : 0xc0024110  -1073594096
SPR  816 : 0xc0017110  -1073647344
SPR  816 : 0xc000e110  -1073684208
SPR  816 : 0xc0003110  -1073729264
SPR  816 : 0xc0002110  -1073733360
SPR  816 : 0xc000d110  -1073688304
SPR  816 : 0xc0004110  -1073725168	
SPR  816 : 0xc0012110  -1073667824
SPR  816 : 0xc01a1110  -1072033520	
SPR  816 : 0xc01a2110  -1072029424
SPR  816 : 0xc000a110  -1073700592	
SPR  816 : 0xc001c110  -1073626864 
SPR  816 : 0xc001b110  -1073630960      <---------
SPR  816 : 0x0ff26100    267542784
SPR  816 : 0x0ff52100    267723008
SPR  816 : 0x10012100    268509440
SPR  816 : 0x100021e0    268444128
SPR  816 : 0x100241e0    268583392
SPR  816 : 0x100301e0    268632544
SPR  816 : 0xc0000e1f  -1073738209 

(gdb) disassemble start_kernel
Dump of assembler code for function start_kernel:
0xc02284a4 <start_kernel+0>:    lis     r3,-16358
0xc02284a8 <start_kernel+4>:    stwu    r1,-32(r1)
0xc02284ac <start_kernel+8>:    mflr    r0
0xc02284b0 <start_kernel+12>:   addi    r3,r3,21832
0xc02284b4 <start_kernel+16>:   stw     r0,36(r1)
0xc02284b8 <start_kernel+20>:   stmw    r29,20(r1)
0xc02284bc <start_kernel+24>:   bl      0xc0012130 <printk>
0xc02284c0 <start_kernel+28>:   addi    r3,r1,8
0xc02284c4 <start_kernel+32>:   bl      0xc022ee28 <setup_arch>
0xc02284c8 <start_kernel+36>:   bl      0xc0230548 <sched_init>
0xc02284cc <start_kernel+40>:   bl      0xc02321e8 <build_all_zonelists>
0xc02284d0 <start_kernel+44>:   bl      0xc02326f4 <page_alloc_init>
0xc02284d4 <start_kernel+48>:   lis     r4,-16348
0xc02284d8 <start_kernel+52>:   lis     r3,-16355
0xc02284dc <start_kernel+56>:   addi    r4,r4,-5804
0xc02284e0 <start_kernel+60>:   addi    r3,r3,-6768
0xc02284e4 <start_kernel+64>:   bl      0xc0012130 <printk>
0xc02284e8 <start_kernel+68>:   bl      0xc022842c <parse_early_param>
0xc02284ec <start_kernel+72>:   lis     r5,-16353
0xc02284f0 <start_kernel+76>:   lis     r6,-16353
0xc02284f4 <start_kernel+80>:   addi    r5,r5,4580
0xc02284f8 <start_kernel+84>:   addi    r6,r6,5060
---Type <return> to continue, or q <return> to quit---
0xc02284fc <start_kernel+88>:   subf    r6,r5,r6
0xc0228500 <start_kernel+92>:   lis     r0,-13108
0xc0228504 <start_kernel+96>:   ori     r0,r0,52429
0xc0228508 <start_kernel+100>:  srawi   r6,r6,2
0xc022850c <start_kernel+104>:  mullw   r6,r6,r0
0xc0228510 <start_kernel+108>:  lwz     r4,8(r1)
0xc0228514 <start_kernel+112>:  lis     r7,-16349
0xc0228518 <start_kernel+116>:  lis     r3,-16355
0xc022851c <start_kernel+120>:  addi    r7,r7,-32404
0xc0228520 <start_kernel+124>:  addi    r3,r3,-6740
0xc0228524 <start_kernel+128>:  bl      0xc0024dac <parse_args>
0xc0228528 <start_kernel+132>:  bl      0xc0231220 <sort_main_extable>
0xc022852c <start_kernel+136>:  bl      0xc022eaa0 <trap_init>
0xc0228530 <start_kernel+140>:  bl      0xc02311f0 <rcu_init>
0xc0228534 <start_kernel+144>:  bl      0xc022eaa4 <init_IRQ>
0xc0228538 <start_kernel+148>:  bl      0xc023107c <pidhash_init>
0xc022853c <start_kernel+152>:  bl      0xc0230f1c <init_timers>
0xc0228540 <start_kernel+156>:  bl      0xc0230cf4 <softirq_init>
0xc0228544 <start_kernel+160>:  bl      0xc022ead0 <time_init>
0xc0228548 <start_kernel+164>:  bl      0xc02354b0 <console_init>
0xc022854c <start_kernel+168>:  lis     r9,-16348
0xc0228550 <start_kernel+172>:  lwz     r3,-8180(r9)
0xc0228554 <start_kernel+176>:  cmpwi   r3,0

^ permalink raw reply

* Re: How to fix 8xx dcbst bug?
From: Wolfgang Denk @ 2005-05-07 18:39 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: linux-ppc-embedded
In-Reply-To: <JPEALJAFNGDDLOPNDIEECEAKDDAA.joakim.tjernlund@lumentis.se>

In message <JPEALJAFNGDDLOPNDIEECEAKDDAA.joakim.tjernlund@lumentis.se> you wrote:
>
...
> safe, just look at ld.so. This should not be a requirement but for 8xx it is currently and I think 8xx gets
> away with it because nobody is using swap on 8xx.

I understand that the "not using swap" refers to 2.6 kernel only, right?

Because on 2.4, we use swap on several systems quite heavily. I  have
to admit that we had to patch pgtable.h to get it to work.

[In case anybody needs it: it's this patchset in our CVS tree:

	PatchSet 80
	Date: 2003/08/24 14:58:59
	Author: wd
	Branch: HEAD
	Tag: (none)
	Log:
	Fix swap problem as observed on IVMS systems with IDE harddisk.
	
	Members:
	        include/asm-ppc/pgtable.h:1.6->1.7
]

Best regards,

Wolfgang Denk

-- 
Software Engineering:  Embedded and Realtime Systems,  Embedded Linux
Phone: (+49)-8142-66989-10 Fax: (+49)-8142-66989-80 Email: wd@denx.de
Single tasking: Just Say No.

^ permalink raw reply

* RE: How to fix 8xx dcbst bug?
From: Joakim Tjernlund @ 2005-05-07 18:24 UTC (permalink / raw)
  To: Marcelo Tosatti, Dan Malek, linux-ppc-embedded
In-Reply-To: <20050506154539.GA6452@logos.cnet>

> 
> 
> Hi Dan,
> 
> So, restarting this conversation...
> 
> On Tue, Apr 05, 2005 at 11:58:17AM -0400, Dan Malek wrote:
> > 
> > On Apr 4, 2005, at 3:17 PM, Marcelo Tosatti wrote:
> > 
> > >Problem is that the "dcbst" instruction will, _sometimes_ (the 
> > >failure/success rate is about 1/4
> > >with my test application) fault as a _write_ operation on the data.
> > 
> > Oh, geeze .... It's all coming back to me now ....
> > 
> > The 8xx cache operations don't always operate as defined in the PEM.
> > There are likely to be some archive discussions within the Freescale
> > knowledge data base that describe the different behaviors I've seen
> > with the chip variants and revisions.  I can't find any of those e-mail
> > discussions, so I'll try to recall from memory.
> > 
> > The PEM cache instructions are all implemented in a microcode that
> > uses the 8xx unique cache control SPRs.  Depending upon the state
> > of the cache and MMU, it seems in some cases the EA translation is
> > subject to a "normal" protection match instead of a load operation 
> > match.
> > 
> > The behavior of these operations isn't consistent across all of the 8xx
> > processor revisions, especially with early silicon if people are still
> > using those.	During conversations with Freescale engineers, it seems
> > the only guaranteed operation was to use the 8xx unique SPRs, but
> > I think I only did that in 8xx specific functions.
> >
> > We have way too much code in the TLB exception handlers already,
> > so let's just try a tlbia of the EA in the update_mmu_cache, with an 
> > #ifdef
> > for the 8xx.	
> 
> >  We may want to make the dcbxxx instructions 
> > some
> > kind of macro, so on 8xx we can include such operations in otherwise
> > "standard" software.
> 
> Now that I think of it, userspace dcbst users should not be an issue
> because the intermediate invalid TLB entry is not visible to applications.
> 
> Userspace sees only: not present pte, or valid present pte.
> 
> Well, at least the entry which has been causing problems,
> created by DataStoreTLBMiss.
> 
> Is it safe to assume that dcbst usage on userspace is restricted
> to valid TLBs? Since MMU SPRs are restricted to supervisor 
> protection, I think so.

Not sure what you mean here. Currently all dcbX instr. in user space have to be on valid\populated pte's
since otherwise it will SEGV. If you write your app so that any dcbX will only cause a plain DTLB you are
safe, just look at ld.so. This should not be a requirement but for 8xx it is currently and I think 8xx gets
away with it because nobody is using swap on 8xx.

   Jocke

> 
> So, if that is true, can you please merge the update_mmu_cache() fix 
> for the dcbst misbehaviour previously discussed ? 
> 
> Thanks!!

^ permalink raw reply

* RE: [26-devel] v2.6 performance slowdown on MPC8xx: Measuring TLB cache misses
From: Joakim Tjernlund @ 2005-05-07 18:10 UTC (permalink / raw)
  To: Marcelo Tosatti, Dan Malek; +Cc: linuxppc-embedded
In-Reply-To: <20050424165520.GA22786@logos.cnet>

> Hi Dan, Joakim,
> 
> On Sat, Apr 23, 2005 at 08:00:39PM -0400, Dan Malek wrote:
> > 
> > On Apr 23, 2005, at 7:51 PM, Joakim Tjernlund wrote:
> > 
> > >hmm, I have more than 24MB of memory and I can run CONFIG_PIN_TLB just
> > >fine with modules off in kernel 2.4. Havn't tried 2.6 yet.
> > 
> > Doh.  Oh, I see.  We only do the optimization for the instruction 
> > misses.
> > I'll have to take a closer look at Marcelo's 2.6 tests.
> 
> The PIN TLB entry option does not make much difference in my tests, 
> never did.
> 
> Who wrote the code? Are there results which indicate a performance gain
> from TLB pinning on 8xx? If so, where are such results? 
> 
> One problem that I've noted is that initial_mmu sets {I,D}TLB index
> to be 27 (11100). 
> 
> MI_RSV4I protects TLB's 27...31.
> 
> Given that both {I,D}TLB INDEX's are _decreased_ on each update, it seems
> to me that initial_mmu should set {I,D}TLB INDEX to 31, which will then 
> decrease down to 27 after 4 TLB's are created.  
> 
> Another question that comes to mind is why initial_mmu does create 
> additional 8Meg TLB entries for D-cache but not for I-cache: 
> 
> #ifdef CONFIG_PIN_TLB
>         /* Map two more 8M kernel data pages.
>         */
> 	...
> #endif

Not completely sure that this is correct. There are a few:
	addi	r10, r10, 0x0100
 	mtspr	SPRN_MD_CTR, r10
later on which will "overflow" 0x1f00 into 0x2000 etc.

 Jocke
> 
> I'll do some more CONFIG_PIN_TLB tests this week...
> 
> --- head_8xx.S.orig2	2005-04-24 17:55:59.000000000 -0300
> +++ head_8xx.S	2005-04-24 17:57:44.000000000 -0300
> @@ -697,7 +697,7 @@
>  	tlbia			/* Invalidate all TLB entries */
>  #ifdef CONFIG_PIN_TLB
>  	lis	r8, MI_RSV4I@h
> -	ori	r8, r8, 0x1c00
> +	ori	r8, r8, 0x1f00
>  #else
>  	li	r8, 0
>  #endif
> @@ -705,7 +705,7 @@
>  
>  #ifdef CONFIG_PIN_TLB
>  	lis	r10, (MD_RSV4I | MD_RESETVAL)@h
> -	ori	r10, r10, 0x1c00
> +	ori	r10, r10, 0x1f00
>  	mr	r8, r10
>  #else
>  	lis	r10, MD_RESETVAL@h

^ permalink raw reply

* Re: [PATCH] 8xx: fix usage of pinned 8Mbyte TLB entries
From: Wolfgang Denk @ 2005-05-07 14:59 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: linux-ppc-embedded
In-Reply-To: <20050506230523.GA15908@logos.cnet>

In message <20050506230523.GA15908@logos.cnet> you wrote:
>
> > The BDI can, but other software functions will walk the page
> > tables looking for PTE information.  
> 
> Do you have any practical example which you are certain is going
> to break? 

I think the BDM4GDB BDM debugger depends  on  this,  and  maybe  some
other tools, too.

> I dont remember any, and I dont think any software should be walking
> kernel pte's directly...

Maybe not regular software, but debug tools  that  live  outside  the
kernel.

Best regards,

Wolfgang Denk

-- 
Software Engineering:  Embedded and Realtime Systems,  Embedded Linux
Phone: (+49)-8142-66989-10 Fax: (+49)-8142-66989-80 Email: wd@denx.de
Testing can show the presence of bugs, but not their absence.
                                                   -- Edsger Dijkstra

^ permalink raw reply

* help
From: sen lin @ 2005-05-07 13:46 UTC (permalink / raw)
  To: linuxppc-embedded

only test how to use maillist

^ permalink raw reply

* Re: Laptop sleep & current "git" tree
From: Colin Leroy @ 2005-05-07 12:06 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev list, DebianPPC
In-Reply-To: <1115164256.7627.57.camel@gaston>

On 04 May 2005 at 09h05, Benjamin Herrenschmidt wrote:

Hi, 

> I have no problem but I didn't see the hub power switching patch on my
> git commit logs and it wasn't in linus tree yesterday at least :)

Btw, it's in now.
http://www.kernel.org/git/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=56c1e26d75008d39f1067f453719857a81109d9f

-- 
Colin

^ permalink raw reply

* Re: Laptop sleep & current "git" tree
From: Jerome Glisse @ 2005-05-07 10:09 UTC (permalink / raw)
  To: Eddy Petrisor; +Cc: DebianPPC, linuxppc-dev list
In-Reply-To: <427BE8B0.5030802@gmail.com>

> I am just interfering as you talked about radeon models; is there any
> good news regarding the >9600 chipsets?
> I know there was a project (R300 iirc) that wanted to make a driver, but
> progress was slow or non-existent last time I heard about it.

If you only check the web site you will miss most of the things. We don't
take much time updating it. We haven't got much time for that. A better
place to look is the commit mailing list, where you can see things going
on...

I think we will have some "final" driver by the end of the year, maybe
before as some developers are at school and may use their holidays to
give a boost to development.

Jerome Glisse

^ permalink raw reply

* Re: [PATCH] 8xx: fix usage of pinned 8Mbyte TLB entries
From: Dan Malek @ 2005-05-07  5:55 UTC (permalink / raw)
  To: Dan Malek; +Cc: linux-ppc-embedded
In-Reply-To: <50058b517f42f09720e6c8086bf294e0@embeddededge.com>


> On May 6, 2005, at 4:03 PM, Marcelo Tosatti wrote:
>
>>         /* get the PTE for the bootpage */
>>         if (!get_pteptr(&init_mm, bootpage, &pte))
>>                panic("get_pteptr failed\n");
>>
>>         /* and make it uncachable */
>>         pte_val(*pte) |= _PAGE_NO_CACHE;
>>         _tlbie(bootpage);

Can someone explain to me why this was necessary,
along with the weird hacks in the serial driver to
hostmem_alloc() if we are using the console and
dma_alloc_consistent() if we aren't?


This bootmem page stuff should not be necessary,
the cpm_reset() doesn't need to allocate the host
buffer, and it should be done the first time hostmem_alloc()
is called.

I don't have an 8xx handy.  Can someone remove all of this:

         /* get the PTE for the bootpage */
         if (!get_pteptr(&init_mm, bootpage, &pte))
                panic("get_pteptr failed\n");


         /* and make it uncachable */
         pte_val(*pte) |= _PAGE_NO_CACHE;
         _tlbie(bootpage);

         host_buffer = bootpage;
         host_end = host_buffer + PAGE_SIZE;


from arch/ppc/8xx_io/commproc.c and let me know
if the system still works?

Thanks.


	-- Dan

^ permalink raw reply

* Re: [PATCH] 8xx: fix usage of pinned 8Mbyte TLB entries
From: Dan Malek @ 2005-05-07  5:27 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: linux-ppc-embedded
In-Reply-To: <20050506200338.GC14486@logos.cnet>



The last patch I just sent isn't quite sufficient.  We still have
to fix this:

On May 6, 2005, at 4:03 PM, Marcelo Tosatti wrote:

>         /* get the PTE for the bootpage */
>         if (!get_pteptr(&init_mm, bootpage, &pte))
>                panic("get_pteptr failed\n");
>
>         /* and make it uncachable */
>         pte_val(*pte) |= _PAGE_NO_CACHE;
>         _tlbie(bootpage);


One of the things that was corrected in linuxppc-2.4, that never made
it forward.  I did a late consistent_alloc() on the first call to
hostmem_alloc().  I'm looking for a similar solution in 2.6.

Thanks.


	-- Dan

^ permalink raw reply

* Re: [PATCH] 8xx: fix usage of pinned 8Mbyte TLB entries
From: Dan Malek @ 2005-05-07  5:16 UTC (permalink / raw)
  To: Dan Malek; +Cc: linux-ppc-embedded
In-Reply-To: <6f231f4afce0886929ca99426a86b47a@embeddededge.com>



The following patch is needed to properly wire the TLB
entries on the newer 8xx processors.  I think it will work
on all of them with sufficient entries to allow the pinning.
Don't do this on an 823 or 850.

	-- Dan

--- linux-2.6.11.5/arch/ppc/kernel/head_8xx.S   2005-03-19 
01:34:56.000000000 -0500
+++ linux-2.6-tlbpin/arch/ppc/kernel/head_8xx.S 2005-05-07 
00:57:32.000000000 -0400
@@ -663,7 +663,7 @@
	tlbia                   /* Invalidate all TLB entries */
  #ifdef CONFIG_PIN_TLB
	lis     r8, MI_RSV4I@h
-	ori     r8, r8, 0x1c00
+	ori     r8, r8, 0x1f00
  #else
	li      r8, 0
  #endif
@@ -671,7 +671,7 @@

  #ifdef CONFIG_PIN_TLB
	lis     r10, (MD_RSV4I | MD_RESETVAL)@h
-	ori     r10, r10, 0x1c00
+	ori     r10, r10, 0x1f00
	mr      r8, r10
  #else
	lis     r10, MD_RESETVAL@h

^ permalink raw reply

* Re: [PATCH] 8xx: fix usage of pinned 8Mbyte TLB entries
From: Dan Malek @ 2005-05-07  4:39 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: linux-ppc-embedded
In-Reply-To: <20050506230523.GA15908@logos.cnet>


On May 6, 2005, at 7:05 PM, Marcelo Tosatti wrote:

> Do you have any practical example which you are certain is going
> to break?

Not at the moment, but that doesn't mean we shouldn't maintain
consistency for anyone that wants to do so.

> I dont remember any, and I dont think any software should be walking
> kernel pte's directly...

Anyone can call get_pteptr and should get the proper information.

> It is not possible to have the 8Mbyte pinned TLB and 4kb pagetables
> mapping the same kernel virtual addresses.

I know, but we don't do that.  Like I said, if the 8M pinned entry is
in the TLB, we don't get exceptions for this space and we don't look
up PTEs and replace them.

> You can't have both a 4kb page and a 8Mbyte page mapping the virtual
> address KERNELBASE + 0.
>
> Do you agree?

Yes, but that isn't what we are doing.  We can have the 8M page
mapping virtual address 0xc0000000 to 0x00000000, and also another
4k page, at say 0xd0000000 map the same 0x00000000 physical page.
There are many circumstances when we have a kernel VM address
and a user VM address map the same physical page.  This is also
what we do to get uncached VM addresses for DMA.

> Right - I'm talking about kernel virtual addresses: in this specific 
> case,
> we can't have more than one mapping for the first page at KERNELBASE.

You can't do that in any case for anything, and I'm confused why you
keep mentioning this :-)

> So you do agree that pte's should not be created for the first
> 8MBytes if CONFIG_PIN_TLB is set? :)

NO.  Just leave that code alone.  I don't understand why you think
doing this will have any effect on the system operation.  If you are
able to run a system without creating these tables, then the pinned
TLBs must be working.  If pinned TLBs weren't working, the kernel
would crash.

Thanks.


	-- Dan

^ permalink raw reply

* Re: [PATCH] 8xx: fix usage of pinned 8Mbyte TLB entries
From: Marcelo Tosatti @ 2005-05-06 23:05 UTC (permalink / raw)
  To: Dan Malek; +Cc: linux-ppc-embedded
In-Reply-To: <84773f72b8bba863c9471c0d121223a8@embeddededge.com>

On Fri, May 06, 2005 at 11:09:15PM -0400, Dan Malek wrote:
> 
> On May 6, 2005, at 4:03 PM, Marcelo Tosatti wrote:
> 
> >The data I have tells me otherwise. I have seen the I-TLB entries
> >getting created for kernel space.
> 
> Of course.  That's because the pinned entries aren't working :-)
> 
> >Can't the BDI work on the 8Mbyte page?  Same for other software
> >or debuggers...
> 
> The BDI can, but other software functions will walk the page
> tables looking for PTE information.  

Do you have any practical example which you are certain is going
to break? 

I dont remember any, and I dont think any software should be walking
kernel pte's directly...

It is not possible to have the 8Mbyte pinned TLB and 4kb pagetables 
mapping the same kernel virtual addresses.

> >        /* get the PTE for the bootpage */
> >        if (!get_pteptr(&init_mm, bootpage, &pte))
> >               panic("get_pteptr failed\n");
> >
> >        /* and make it uncachable */
> >        pte_val(*pte) |= _PAGE_NO_CACHE;
> >        _tlbie(bootpage);
> 
> This is a bad hack (that I wrote) that needs to get fixed.
> 
> >Because DMA pages need to have their PTE's marked as uncached, which 
> >in turn
> >means their TLB's need to be marked as uncached.
> 
> Right, but these are allocated from the vmalloc() space, far away
> from the pinned entries. 
>
> >I dont think you can have multiple overlapping TLB entries.
> 
> Sure you can, we do it all of the time.  The kernel maps all of
> memory, and then user applications do it again.  The only time
> it causes a problem is when you have different cache attributes
> for the same physical page.  In this case, you need to ensure
> you only use one mapping.  You can't have the same virtual
> address twice in the TLB (iirc, the 8xx automatically invalidates
> an existing one if you do this), but you can have the same
> physical page mapped multiple times.  

You can't have both a 4kb page and a 8Mbyte page mapping the virtual 
address KERNELBASE + 0. 

Do you agree? 

> >How is the MMU supposed to decide between multiple mappings
> >for the same address ?
> 
> You are thinking backward.  The MMU maps the virtual address
> accessed, there is only one valid at a time.  You can have multiple
> VM addresses accessing the same physical page.

Right - I'm talking about kernel virtual addresses: in this specific case,
we can't have more than one mapping for the first page at KERNELBASE.

> >That is how it is now. See previous posts with detailed TLB debugging.
> 
> Something isn't correct if it isn't working.  
>
> >Maybe you thought you got it right because the initial 8Mbyte
> >mapping works?
> 
> No, this is required to work for some execute in place from rom
> systems I have done.  It was adapted from that.  The initial 8M
> mapping must be evicted when the  mapin_ram() is done.  It's
> supposed to happen that way.
> 
> >Unfortunately that mapping is trashed after overlapping
> >pte's are created.
> 
> Right, that is supposed to happen unless TLB pinning
> is configured.

OK, we seem to be on the same page now. 

So you do agree that pte's should not be created for the first
8MBytes if CONFIG_PIN_TLB is set? :) 

Should I send an updated patch or you plan to do that? 

^ permalink raw reply

* Re: [PATCH] 8xx: fix usage of pinned 8Mbyte TLB entries
From: Dan Malek @ 2005-05-07  3:09 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: linux-ppc-embedded
In-Reply-To: <20050506200338.GC14486@logos.cnet>


On May 6, 2005, at 4:03 PM, Marcelo Tosatti wrote:

> The data I have tells me otherwise. I have seen the I-TLB entries
> getting created for kernel space.

Of course.  That's because the pinned entries aren't working :-)

> Can't the BDI work on the 8Mbyte page?  Same for other software
> or debuggers...

The BDI can, but other software functions will walk the page
tables looking for PTE information.

>         /* get the PTE for the bootpage */
>         if (!get_pteptr(&init_mm, bootpage, &pte))
>                panic("get_pteptr failed\n");
>
>         /* and make it uncachable */
>         pte_val(*pte) |= _PAGE_NO_CACHE;
>         _tlbie(bootpage);

This is a bad hack (that I wrote) that needs to get fixed.

> Because DMA pages need to have their PTE's marked as uncached, which 
> in turn
> means their TLB's need to be marked as uncached.

Right, but these are allocated from the vmalloc() space, far away
from the pinned entries.

> I dont think you can have multiple overlapping TLB entries.

Sure you can, we do it all of the time.  The kernel maps all of
memory, and then user applications do it again.  The only time
it causes a problem is when you have different cache attributes
for the same physical page.  In this case, you need to ensure
you only use one mapping.  You can't have the same virtual
address twice in the TLB (iirc, the 8xx automatically invalidates
an existing one if you do this), but you can have the same
physical page mapped multiple times.


> How is the MMU supposed to decide between multiple mappings
> for the same address ?

You are thinking backward.  The MMU maps the virtual address
accessed, there is only one valid at a time.  You can have multiple
VM addresses accessing the same physical page.

> That is how it is now. See previous posts with detailed TLB debugging.

Something isn't correct if it isn't working.

> Maybe you thought you got it right because the initial 8Mbyte
> mapping works?

No, this is required to work for some execute in place from rom
systems I have done.  It was adapted from that.  The initial 8M
mapping must be evicted when the  mapin_ram() is done.  It's
supposed to happen that way.

> Unfortunately that mapping is trashed after overlapping
> pte's are created.

Right, that is supposed to happen unless TLB pinning
is configured.

Thanks.


	-- Dan

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox