First cut at large page support on 40x

linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed

* First cut at large page support on 40x
@ 2002-05-31  4:21 David Gibson
  2002-05-31  4:31 ` David Gibson
  2002-06-04  0:43 ` Dan Malek
  0 siblings, 2 replies; 30+ messages in thread
From: David Gibson @ 2002-05-31  4:21 UTC (permalink / raw)
  To: linuxppc-embedded; +Cc: Paul Mackerras


The patch below (against 2_4_devel) implements using large page TLB
entries to map kernel pages on the 40x.  paulus did the basic design,
and I tweaked and degubbed it. It's a bit ugly in places (particularly
the handling of iopa()) and will need cleaning up, but it does seem to
work.

It works as follows: 40x now uses an explicit _PMD_PRESENT bit, rather
than just checking if the high bits are non-zero.  If this bit is set
in a PMD entry it means that it contains a valid pointer to a page of
PTEs.

If _PMD_PRESENT is not set, but any of bits 24-26 are non-zero, then
it is a large-page PTE.  Bits 24-26 give the size (and are shifted
into place by the TLB miss handler).  The remaining bits have the same
meaning as in a normal PTE.

Theoretically the entry can represent any of the 40x's allowed page
sizes, except size 0 (1k), but in practice only 4MB and 16MB pages are
likely to be useful - since each PMD entry corresponds to a 4MB
region, using a smaller page size would lead to that page mapping
being repeated across that 4MB region.  To use 16MB pages 4 adjacent
PMD entries must all be filled with the same PTE value.

The only large-page PTEs used are created in mapin_ram() for the
kernel mapping of system RAM.

diff -urN /home/dgibson/kernel/linuxppc_2_4_devel/arch/ppc/kernel/head_4xx.S linux-grinch-largepage/arch/ppc/kernel/head_4xx.S
--- /home/dgibson/kernel/linuxppc_2_4_devel/arch/ppc/kernel/head_4xx.S	Thu May 30 18:15:28 2002
+++ linux-grinch-largepage/arch/ppc/kernel/head_4xx.S	Fri May 31 10:54:30 2002
@@ -261,10 +261,10 @@
 	tophys(r21, r21)
 	rlwimi	r21, r20, 12, 20, 29	/* Create L1 (pgdir/pmd) address */
 	lwz	r21, 0(r21)		/* Get L1 entry */
-	rlwinm.	r22, r21, 0, 0, 19	/* Extract L2 (pte) base address */
+	andi.	r22, r21, _PMD_PRESENT	/* Check if it points to a PTE page */
 	beq	2f			/* Bail if no table */

-	tophys(r22, r22)
+	tophys(r22, r21)
 	rlwimi	r22, r20, 22, 20, 29	/* Compute PTE address */
 	lwz	r21, 0(r22)		/* Get Linux PTE */

@@ -495,33 +495,40 @@
 	tophys(r21, r21)
 	rlwimi	r21, r20, 12, 20, 29	/* Create L1 (pgdir/pmd) address */
 	lwz	r21, 0(r21)		/* Get L1 entry */
-	rlwinm.	r22, r21, 0, 0, 19	/* Extract L2 (pte) base address */
+	andi.	r22, r21, _PMD_PRESENT	/* check if it points to pte page */
 	beq	2f			/* Bail if no table */

-	tophys(r22, r22)
+	tophys(r22, r21)
 	rlwimi	r22, r20, 22, 20, 29	/* Compute PTE address */
 	lwz	r21, 0(r22)		/* Get Linux PTE */
 	andi.	r23, r21, _PAGE_PRESENT
-	beq	2f
+	beq	5f

 	ori	r21, r21, _PAGE_ACCESSED
 	stw	r21, 0(r22)

-	/* Most of the Linux PTE is ready to load into the TLB LO.
-	 * We set ZSEL, where only the LS-bit determines user access.
-	 * We set execute, because we don't have the granularity to
-	 * properly set this at the page level (Linux problem).
-	 * If shared is set, we cause a zero PID->TID load.
-	 * Many of these bits are software only.  Bits we don't set
-	 * here we (properly should) assume have the appropriate value.
+	/* Create TLB tag.  This is the faulting address plus a static
+	 * set of bits.  These are size, valid, E, U0.
 	 */
-	li	r22, 0x0ce2
-	andc	r21, r21, r22		/* Make sure 20, 21 are zero */
+	li	r22, 0x00c0
+	rlwimi	r20, r22, 0, 20, 31

 	b	finish_tlb_load

-
+	/* Check for possible large-page pmd entry */
 2:
+	rlwinm.	r22,r21,2,22,24		/* size != 0 means large-page */
+	beq	5f
+
+	/* Create EPN.  This is the faulting address plus a static
+	 * set of bits (valid, E, U0) plus the size from the PMD.
+	 */
+	ori	r22,r22,0x40
+	rlwimi	r20, r22, 0, 20, 31
+
+	b	finish_tlb_load
+
+5:
 	/* The bailout.  Restore registers to pre-exception conditions
 	 * and call the heavyweights to help us out.
 	 */
@@ -588,32 +595,40 @@
 	tophys(r21, r21)
 	rlwimi	r21, r20, 12, 20, 29	/* Create L1 (pgdir/pmd) address */
 	lwz	r21, 0(r21)		/* Get L1 entry */
-	rlwinm.	r22, r21, 0, 0, 19	/* Extract L2 (pte) base address */
+	andi.	r22, r21, _PMD_PRESENT	/* check if it points to pte page */
 	beq	2f			/* Bail if no table */

-	tophys(r22, r22)
+	tophys(r22, r21)
 	rlwimi	r22, r20, 22, 20, 29	/* Compute PTE address */
 	lwz	r21, 0(r22)		/* Get Linux PTE */
 	andi.	r23, r21, _PAGE_PRESENT
-	beq	2f
+	beq	5f

 	ori	r21, r21, _PAGE_ACCESSED
 	stw	r21, 0(r22)

-	/* Most of the Linux PTE is ready to load into the TLB LO.
-	 * We set ZSEL, where only the LS-bit determines user access.
-	 * We set execute, because we don't have the granularity to
-	 * properly set this at the page level (Linux problem).
-	 * If shared is set, we cause a zero PID->TID load.
-	 * Many of these bits are software only.  Bits we don't set
-	 * here we (properly should) assume have the appropriate value.
+	/* Create EPN.  This is the faulting address plus a static
+	 * set of bits.  These are size, valid, E, U0.
 	 */
-	li	r22, 0x0ce2
-	andc	r21, r21, r22		/* Make sure 20, 21 are zero */
+	li	r22, 0x00c0
+	rlwimi	r20, r22, 0, 20, 31

 	b	finish_tlb_load

+	/* Check for possible large-page pmd entry */
 2:
+	rlwinm.	r22,r21,2,22,24		/* size != 0 means large-page */
+	beq	5f
+
+	/* Create EPN.  This is the faulting address plus a static
+	 * set of bits (valid=1, E=0, U0=0) plus the size from the PMD.
+	 */
+	ori	r22,r22,0x40
+	rlwimi	r20, r22, 0, 20, 31
+
+	b	finish_tlb_load
+
+5:
 	/* The bailout.  Restore registers to pre-exception conditions
 	 * and call the heavyweights to help us out.
 	 */
@@ -749,7 +764,14 @@
 	 * EPN is already in the TLB.
 	 */
 	tlbsx.	r23, 0, r20
-	beq	6f
+	bne	8f
+	lwz	r22,9f@l(0)
+	addi	r22,r22,1
+	stw	r22,9f@l(0)
+	b	6f
+tlb_miss_hit:
+9:	.long	0
+8:

 	/* load the next available TLB index.
 	*/
@@ -766,14 +788,16 @@
 	stw	r23, tlb_4xx_index@l(0)

 6:
+	/*
+	 * Clear out the software-only bits in the PTE to generate the
+	 * TLB_DATA value.  These are the bottom 2 bits of RPN, the
+	 * top 3 bits of the zone field, and M.
+	 */
+	li	r22, 0x0ce2
+	andc	r21, r21, r22		/* Make sure 20, 21 are zero */
+
 	tlbwe	r21, r23, TLB_DATA		/* Load TLB LO */

-	/* Create EPN.  This is the faulting address plus a static
-	 * set of bits.  These are size, valid, E, U0, and ensure
-	 * bits 20 and 21 are zero.
-	 */
-	li	r22, 0x00c0
-	rlwimi	r20, r22, 0, 20, 31
 	tlbwe	r20, r23, TLB_TAG		/* Load TLB HI */

 	/* Done...restore registers and get out of here.
diff -urN /home/dgibson/kernel/linuxppc_2_4_devel/arch/ppc/mm/pgtable.c linux-grinch-largepage/arch/ppc/mm/pgtable.c
--- /home/dgibson/kernel/linuxppc_2_4_devel/arch/ppc/mm/pgtable.c	Mon Apr  8 10:29:07 2002
+++ linux-grinch-largepage/arch/ppc/mm/pgtable.c	Fri May 31 13:51:48 2002
@@ -348,7 +348,38 @@

 	v = KERNELBASE;
 	p = PPC_MEMSTART;
-	for (s = 0; s < total_lowmem; s += PAGE_SIZE) {
+	s = 0;
+#if defined(CONFIG_40x)
+	for (; s <= (total_lowmem - 16*1024*1024); s += 16*1024*1024) {
+		pmd_t *pmdp;
+		unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE;
+
+		spin_lock(&init_mm.page_table_lock);
+		pmdp = pmd_offset(pgd_offset_k(v), v);
+		pmd_val(*pmdp++) = val;
+		pmd_val(*pmdp++) = val;
+		pmd_val(*pmdp++) = val;
+		pmd_val(*pmdp++) = val;
+		spin_unlock(&init_mm.page_table_lock);
+
+		v += 16*1024*1024;
+		p += 16*1024*1024;
+	}
+
+	for(; s <= (total_lowmem - 4*1024*1024); s += 4*1024*1024) {
+		pmd_t *pmdp;
+		unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE;
+
+		spin_lock(&init_mm.page_table_lock);
+		pmdp = pmd_offset(pgd_offset_k(v), v);
+		pmd_val(*pmdp) = val;
+		spin_unlock(&init_mm.page_table_lock);
+
+		v += 4*1024*1024;
+		p += 4*1024*1024;
+	}
+#endif
+	for (; s < total_lowmem; s += PAGE_SIZE) {
 		/* On the MPC8xx, we want the page shared so we
 		 * don't get ASID compares on kernel space.
 		 */
@@ -468,8 +499,33 @@
 		mm = &init_mm;

 	pa = 0;
+#ifdef CONFIG_40x
+	{
+		pgd_t	*pgd;
+		pmd_t	*pmd;
+		const unsigned long large_page_mask[] = {
+			0xfffff800, 0xffffe000, 0xffff8000, 0xfffe0000,
+			0xfff80000, 0xffe00000, 0xff800000, 0xfe000000
+		};
+
+		pgd = pgd_offset(mm, addr & PAGE_MASK);
+		if (pgd) {
+			pmd = pmd_offset(pgd, addr & PAGE_MASK);
+			if (pmd_present(*pmd)) {
+				pte = pte_offset(pmd, addr & PAGE_MASK);
+				pa = (pte_val(*pte) & PAGE_MASK) | (addr & ~PAGE_MASK);
+			} else if (pmd_val(*pmd) & _PMD_SIZE) {
+				unsigned long mask =
+					large_page_mask[(pmd_val(*pmd) & _PMD_SIZE) >> 5];
+				pa = (pmd_val(*pmd) & mask) | (addr & ~mask);
+			}
+		}
+	}
+
+#else
 	if (get_pteptr(mm, addr, &pte))
 		pa = (pte_val(*pte) & PAGE_MASK) | (addr & ~PAGE_MASK);
+#endif

 	return(pa);
 }
diff -urN /home/dgibson/kernel/linuxppc_2_4_devel/include/asm-ppc/pgtable.h linux-grinch-largepage/include/asm-ppc/pgtable.h
--- /home/dgibson/kernel/linuxppc_2_4_devel/include/asm-ppc/pgtable.h	Wed Apr 17 10:26:01 2002
+++ linux-grinch-largepage/include/asm-ppc/pgtable.h	Fri May 31 13:50:13 2002
@@ -285,8 +285,8 @@
      is cleared in the TLB miss handler before the TLB entry is loaded.
    - All other bits of the PTE are loaded into TLBLO without
      modification, leaving us only the bits 20, 21, 24, 25, 26, 30 for
-     software PTE bits.  We actually use use bits 21, 24, 25, 26, and
-     30 respectively for the software bits: ACCESSED, DIRTY, RW, EXEC,
+     software PTE bits.  We actually use use bits 21, 24, 25, and
+     30 respectively for the software bits: ACCESSED, DIRTY, RW, and
      PRESENT.
 */

@@ -301,8 +301,12 @@
 #define _PAGE_HWWRITE	0x100	/* hardware: Dirty & RW, set in exception */
 #define _PAGE_HWEXEC	0x200	/* hardware: EX permission */
 #define _PAGE_ACCESSED	0x400	/* software: R: page referenced */
-#define _PMD_PRESENT	PAGE_MASK

+#define _PMD_PRESENT	0x400	/* PMD points to page of PTEs */
+#define _PMD_SIZE	0x0e0	/* size field, != 0 for large-page PMD entry */
+#define _PMD_SIZE_4M	0x0c0
+#define _PMD_SIZE_16M	0x0e0
+#define _PMD_BAD	0x802
 #elif defined(CONFIG_440)

 /*
@@ -357,9 +361,10 @@
 #define _PAGE_HWWRITE	0x0100	/* h/w write enable: never set in Linux PTE */
 #define _PAGE_USER	0x0800	/* One of the PP bits, the other is USER&~RW */

-#define _PMD_PRESENT	PAGE_MASK
+#define _PMD_PRESENT	0x0001
 #define _PMD_PAGE_MASK	0x000c
 #define _PMD_PAGE_8M	0x000c
+#define _PMD_BAD	0x0ff0

 #else /* CONFIG_6xx */
 /* Definitions for 60x, 740/750, etc. */
@@ -374,7 +379,9 @@
 #define _PAGE_ACCESSED	0x100	/* R: page referenced */
 #define _PAGE_EXEC	0x200	/* software: i-cache coherency required */
 #define _PAGE_RW	0x400	/* software: user write access allowed */
-#define _PMD_PRESENT	PAGE_MASK
+
+#define _PMD_PRESENT	0x800
+#define _PMD_BAD	0x7ff
 #endif

 /* The non-standard PowerPC MMUs, which includes the 4xx and 8xx (and
@@ -474,7 +481,7 @@
 #define pte_clear(ptep)		do { set_pte((ptep), __pte(0)); } while (0)

 #define pmd_none(pmd)		(!pmd_val(pmd))
-#define	pmd_bad(pmd)		((pmd_val(pmd) & _PMD_PRESENT) == 0)
+#define	pmd_bad(pmd)		((pmd_val(pmd) & _PMD_BAD) != 0)
 #define	pmd_present(pmd)	((pmd_val(pmd) & _PMD_PRESENT) != 0)
 #define	pmd_clear(pmdp)		do { pmd_val(*(pmdp)) = 0; } while (0)



--
David Gibson			| For every complex problem there is a
david@gibson.dropbear.id.au	| solution which is simple, neat and
				| wrong.  -- H.L. Mencken
http://www.ozlabs.org/people/dgibson

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-05-31  4:21 First cut at large page support on 40x David Gibson
@ 2002-05-31  4:31 ` David Gibson
  2002-06-04  0:43 ` Dan Malek
  1 sibling, 0 replies; 30+ messages in thread
From: David Gibson @ 2002-05-31  4:31 UTC (permalink / raw)
  To: linuxppc-embedded


On Fri, May 31, 2002 at 02:21:53PM +1000, David Gibson wrote:
>
> The patch below (against 2_4_devel) implements using large page TLB
> entries to map kernel pages on the 40x.  paulus did the basic design,
> and I tweaked and degubbed it. It's a bit ugly in places (particularly
> the handling of iopa()) and will need cleaning up, but it does seem to
> work.

Oh, yeah, after applying the patch compile with CONFIG_PIN_TLB=n.  I'm
not sure how they'd interact, but probably not well.

--
David Gibson			| For every complex problem there is a
david@gibson.dropbear.id.au	| solution which is simple, neat and
				| wrong.  -- H.L. Mencken
http://www.ozlabs.org/people/dgibson

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-05-31  4:21 First cut at large page support on 40x David Gibson
  2002-05-31  4:31 ` David Gibson
@ 2002-06-04  0:43 ` Dan Malek
  2002-06-04  3:59   ` David Gibson
  1 sibling, 1 reply; 30+ messages in thread
From: Dan Malek @ 2002-06-04  0:43 UTC (permalink / raw)
  To: David Gibson; +Cc: linuxppc-embedded, Paul Mackerras

David Gibson wrote:

> The only large-page PTEs used are created in mapin_ram() for the
> kernel mapping of system RAM.

I did a similar thing for 8xx, except I didn't change any of the existing
mapping code.  At the end of the kernel initialization, I added a function
to scan the page tables looking for areas we could coalesce into larger
pages.  I would then update the PMD entry to indicate the size of the pages
(4M or 8M in the case of 8xx) it could use to cover this space.  This way
I could also get any I/O mapping that was done and I didn't have to complicate
any of the existing mapin_ram() or other code with processor specific changes.
Also, iopa() and any page lookup functions should work since the only change
was to add control bits to the least significant part of pmd entry.

Thanks.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-04  0:43 ` Dan Malek
@ 2002-06-04  3:59   ` David Gibson
  2002-06-04 17:42     ` Dan Malek
  0 siblings, 1 reply; 30+ messages in thread
From: David Gibson @ 2002-06-04  3:59 UTC (permalink / raw)
  To: Dan Malek; +Cc: linuxppc-embedded, Paul Mackerras


On Mon, Jun 03, 2002 at 08:43:43PM -0400, Dan Malek wrote:
>
> David Gibson wrote:
>
> >The only large-page PTEs used are created in mapin_ram() for the
> >kernel mapping of system RAM.
>
> I did a similar thing for 8xx, except I didn't change any of the existing
> mapping code.  At the end of the kernel initialization, I added a function
> to scan the page tables looking for areas we could coalesce into larger
> pages.  I would then update the PMD entry to indicate the size of the pages
> (4M or 8M in the case of 8xx) it could use to cover this space.  This way
> I could also get any I/O mapping that was done and I didn't have to
> complicate
> any of the existing mapin_ram() or other code with processor specific
> changes.
> Also, iopa() and any page lookup functions should work since the only change
> was to add control bits to the least significant part of pmd entry.

That sounds dangerous to me:  many of the kernel mappings could change
with vfree() or iounmap().  The mapping of physical RAM established in
mapin_ram() we know will be around forever.  Furthermore this way we
save a little bit of RAM, because we don't need to store the bottom
level page tables for the kernel mapping, and the TLB miss handler is
simpler and faster because like a normal PTE it can load most of the
TLB_DATA field directly from the PMD entry.

--
David Gibson			| For every complex problem there is a
david@gibson.dropbear.id.au	| solution which is simple, neat and
				| wrong.  -- H.L. Mencken
http://www.ozlabs.org/people/dgibson

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-04  3:59   ` David Gibson
@ 2002-06-04 17:42     ` Dan Malek
  2002-06-05  0:10       ` David Gibson
  2002-06-05 22:29       ` Paul Mackerras
  0 siblings, 2 replies; 30+ messages in thread
From: Dan Malek @ 2002-06-04 17:42 UTC (permalink / raw)
  To: David Gibson; +Cc: linuxppc-embedded, Paul Mackerras

David Gibson wrote:

> That sounds dangerous to me:

It's not.  All you end up finding are the kernel ram pages and the
early 1:1 mapping of I/O space that never changes. The vmalloc()'ed
space isn't properly ordered to find monotonically increasing page
frame numbers.  It's just a cleaner implementation because you have
processor specific functions to set up the PMD that aren't cluttering
the generic functions.  If necessary, you can sift through the VM ranges
and ensure the things you feel are inappropriate aren't put into the PMD
large page mapping.

> ......  Furthermore this way we
> save a little bit of RAM, because we don't need to store the bottom
> level page tables for the kernel mapping,

If you would allow these to stay, you wouldn't have to change any other
mapping functions, like iopa().  It's only a couple of pages.......

> ..... and the TLB miss handler is
> simpler and faster because like a normal PTE it can load most of the
> TLB_DATA field directly from the PMD entry.

That's the idea :-)

For the MPC8xx I did two simple things.  First, added the function to
scan the usual page tables that were built and update the PMD to indicate
the large pages.  Second, changed the tlb miss handler to load the PMD into
the MMU register without making any modifications to the bits.  The PTE is
then loaded just as it always was (the 8xx has nice support for large pages
following the normal PTE loading path).

This is of course after Paulus modified the page table macros to be aware of
additional control bits in the PMD :-)

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-04 17:42     ` Dan Malek
@ 2002-06-05  0:10       ` David Gibson
  2002-06-05 17:25         ` Dan Malek
  2002-06-05 22:29       ` Paul Mackerras
  1 sibling, 1 reply; 30+ messages in thread
From: David Gibson @ 2002-06-05  0:10 UTC (permalink / raw)
  To: Dan Malek; +Cc: linuxppc-embedded, Paul Mackerras


On Tue, Jun 04, 2002 at 01:42:56PM -0400, Dan Malek wrote:
>
> David Gibson wrote:
>
> >That sounds dangerous to me:
>
> It's not.  All you end up finding are the kernel ram pages and the
> early 1:1 mapping of I/O space that never changes. The vmalloc()'ed
> space isn't properly ordered to find monotonically increasing page
> frame numbers.  It's just a cleaner implementation because you have
> processor specific functions to set up the PMD that aren't cluttering
> the generic functions.  If necessary, you can sift through the VM ranges
> and ensure the things you feel are inappropriate aren't put into the PMD
> large page mapping.
>
>
> >......  Furthermore this way we
> >save a little bit of RAM, because we don't need to store the bottom
> >level page tables for the kernel mapping,
>
> If you would allow these to stay, you wouldn't have to change any other
> mapping functions, like iopa().  It's only a couple of pages.......
>
> >..... and the TLB miss handler is
> >simpler and faster because like a normal PTE it can load most of the
> >TLB_DATA field directly from the PMD entry.
>
> That's the idea :-)
>
> For the MPC8xx I did two simple things.  First, added the function to
> scan the usual page tables that were built and update the PMD to indicate
> the large pages.  Second, changed the tlb miss handler to load the PMD into
> the MMU register without making any modifications to the bits.  The PTE is
> then loaded just as it always was (the 8xx has nice support for large pages
> following the normal PTE loading path).

Hang on, I'm not clear about this.  Does your PMD contain an entry to
be loaded into the TLB or not?  In my implementation the PMD entry
itself contains the data to load into TLB_DATA (except that we borrow
the top three bits of ZSEL for the page size).  Since we're doing that
there's no room in the entry for a pointer to a page of PTEs.

--
David Gibson			| For every complex problem there is a
david@gibson.dropbear.id.au	| solution which is simple, neat and
				| wrong.  -- H.L. Mencken
http://www.ozlabs.org/people/dgibson

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-05  0:10       ` David Gibson
@ 2002-06-05 17:25         ` Dan Malek
  2002-06-06  1:35           ` David Gibson
  0 siblings, 1 reply; 30+ messages in thread
From: Dan Malek @ 2002-06-05 17:25 UTC (permalink / raw)
  To: David Gibson; +Cc: linuxppc-embedded, Paul Mackerras

David Gibson wrote:

> Hang on, I'm not clear about this.  Does your PMD contain an entry to
> be loaded into the TLB or not?

Yes (well, it's supposed to :-).  The PMD is loaded into the xx_TWC
register and the PTE into the xx_RPN register.  The bits in the PMD/TWC
contain the page size information, those in the PTE/RPN contain the
remainder of the control/status and real page number.  I don't have it
working quite right yet, but the bits are all aligned very nicely in
the hardware to make this happen regardless of the page size.

The basic logic on the 8xx is trivial.  Load the PMD into the TWC, the PTE
into the RPN, and you are done.  The PMD still has the pointer to the PTE
page, and the 8xx provides a hardware assist using that to give you the
pointer to the PTE.  The only thing we have to do in software is update
the accessed bit in the PTE.  The least significant bits of the PMD contain
the page size and cache control for the page, the PTE is identical to what
we have always used.

 > .....  In my implementation the PMD entry
> itself contains the data to load into TLB_DATA (except that we borrow
> the top three bits of ZSEL for the page size).  Since we're doing that
> there's no room in the entry for a pointer to a page of PTEs.

You may want to consider just emulating something like I have done.  Just
leave the page table structure alone, and use the LS bits of the PMD to
store the page size information (or anything else useful).  This way, the
Linux page tables remain intact, and the only change I needed was to
actually test for a PMD_VALID flag and ignore the least significant bits when
constructing a pointer to the PTE table.  The background debuggers that know
how to peruse the page tables will continue to work properly, the memory
management functions will work properly, and chances are the 4xx tlb miss handler
won't require any special case branches when loading the TLB either.

All of the framework for this is already done for the 8xx, I just have
not committed the function that scans the page tables and modifies the PMD
for the larger page.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-04 17:42     ` Dan Malek
  2002-06-05  0:10       ` David Gibson
@ 2002-06-05 22:29       ` Paul Mackerras
  2002-06-06  4:48         ` Dan Malek
  1 sibling, 1 reply; 30+ messages in thread
From: Paul Mackerras @ 2002-06-05 22:29 UTC (permalink / raw)
  To: Dan Malek; +Cc: David Gibson, linuxppc-embedded

Dan Malek writes:

> If you would allow these to stay, you wouldn't have to change any other
> mapping functions, like iopa().  It's only a couple of pages.......

Dan, you continue to surprise me.  You complain about a kB or two for
a device tree but happily throw away anything from 16kB to 128kB or
more just to simplify iopa, which should not be needed anyway.

Paul.

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-05 17:25         ` Dan Malek
@ 2002-06-06  1:35           ` David Gibson
  2002-06-06  4:57             ` Dan Malek
  0 siblings, 1 reply; 30+ messages in thread
From: David Gibson @ 2002-06-06  1:35 UTC (permalink / raw)
  To: Dan Malek; +Cc: linuxppc-embedded, Paul Mackerras


On Wed, Jun 05, 2002 at 01:25:53PM -0400, Dan Malek wrote:
>
> David Gibson wrote:
>
> >Hang on, I'm not clear about this.  Does your PMD contain an entry to
> >be loaded into the TLB or not?
>
> Yes (well, it's supposed to :-).  The PMD is loaded into the xx_TWC
> register and the PTE into the xx_RPN register.  The bits in the PMD/TWC
> contain the page size information, those in the PTE/RPN contain the
> remainder of the control/status and real page number.  I don't have it
> working quite right yet, but the bits are all aligned very nicely in
> the hardware to make this happen regardless of the page size.
>
> The basic logic on the 8xx is trivial.  Load the PMD into the TWC, the PTE
> into the RPN, and you are done.  The PMD still has the pointer to the PTE
> page, and the 8xx provides a hardware assist using that to give you the
> pointer to the PTE.  The only thing we have to do in software is update
> the accessed bit in the PTE.  The least significant bits of the PMD contain
> the page size and cache control for the page, the PTE is identical to what
> we have always used.

Ok, which PTE do you use to load the RPN for the large page entry?
The first one in the page directory, or the one corresponding to the
actual address you're looking up?

I don't know about 8xx, but on 4xx the low bits in the RPN when
putting a large page entry into the TLB *must* be zeroed, or general
weirdness may occur.  So using the PTE corresponding to the actual
address would mean an extra branch in the large-page case and clearing
some bits.

> > .....  In my implementation the PMD entry
> >itself contains the data to load into TLB_DATA (except that we borrow
> >the top three bits of ZSEL for the page size).  Since we're doing that
> >there's no room in the entry for a pointer to a page of PTEs.
>
> You may want to consider just emulating something like I have done.  Just
> leave the page table structure alone, and use the LS bits of the PMD to
> store the page size information (or anything else useful).  This
> way, the

The fact that our implementation works now, and yours is still
unstable, does not encourage me to take this approach.

> Linux page tables remain intact, and the only change I needed was to
> actually test for a PMD_VALID flag and ignore the least significant bits
> when
> constructing a pointer to the PTE table.  The background debuggers that know
> how to peruse the page tables will continue to work properly, the memory
> management functions will work properly, and chances are the 4xx tlb miss
> handler
> won't require any special case branches when loading the TLB either.

We already must test to see if the PMD is present in the normal case.
In our implementation large page PMDs show up as not present.  So it's
only in the uncommon case (not present) that we test for a large page
PMD entry.

> All of the framework for this is already done for the 8xx, I just have
> not committed the function that scans the page tables and modifies the PMD
> for the larger page.

--
David Gibson			| For every complex problem there is a
david@gibson.dropbear.id.au	| solution which is simple, neat and
				| wrong.  -- H.L. Mencken
http://www.ozlabs.org/people/dgibson

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-05 22:29       ` Paul Mackerras
@ 2002-06-06  4:48         ` Dan Malek
  2002-06-06  5:44           ` Paul Mackerras
  0 siblings, 1 reply; 30+ messages in thread
From: Dan Malek @ 2002-06-06  4:48 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: David Gibson, linuxppc-embedded

Paul Mackerras wrote:

> Dan, you continue to surprise me.

Eh?

> .....  You complain about a kB or two for
> a device tree

When did I ever complain about a device tree?  I think it's the right
idea, I just didn't like the way we were getting there on systems
that don't have OF.

> .... but happily throw away anything from 16kB to 128kB or
> more just to simplify iopa, which should not be needed anyway.

What isn't needed?  The page tables or the iopa()?  I'm tired of having
different methods to look up VM information just because the memory
was allocated in a different way.  With iopa() (which seems fine for
other architectures to use) I don't care how the memory was allocated,
I just feed it a virtual address and get the answer.  What's wrong with
that (other than it's not a hack :-)?  The page tables have always
been there, and it's not a big deal.  Why haven't we done the same
hack for processors with BATs?  They don't need the page tables either.

I also stated the importance of the page tables is to allow background
hardware debuggers to look up translations so they can work with Linux.
Kind of a nice thing to have once in a while.

I find a simple solution for an enhancement and you don't like it because
it isn't a big hacked up mess (or maybe because I had an original thought).
If I would have made the same hacked up mess you have done it wouldn't have
been checked in.....not long ago all of the embedded stuff was viewed as a
problem child, and today it's OK to hack up generic code with an #ifdef
for a specific IBM embedded processor????  Does that surprise you? :-)

I haven't changed my point of view on any of the embedded Linux because
that is pretty much all I have ever done.  It's the rest of you that can't
determine on which side of the fence you want to play.  Stop picking on me
or I'll take all of my toys to a different playground :-).

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-06  1:35           ` David Gibson
@ 2002-06-06  4:57             ` Dan Malek
  0 siblings, 0 replies; 30+ messages in thread
From: Dan Malek @ 2002-06-06  4:57 UTC (permalink / raw)
  To: David Gibson; +Cc: linuxppc-embedded, Paul Mackerras

David Gibson wrote:

> Ok, which PTE do you use to load the RPN for the large page entry?

The PMD plus the PTE, both loaded into the 8xx MMU registers, provide
the information for page size and the real address.

> The first one in the page directory, or the one corresponding to the
> actual address you're looking up?

The one corresponding to the offset lookup.  The LS bits are masked.

> I don't know about 8xx, but on 4xx the low bits in the RPN when
> putting a large page entry into the TLB *must* be zeroed, or general
> weirdness may occur.

Doesn't seem to be that way on 8xx, so I guess I'm lucky :-)

> ...  So using the PTE corresponding to the actual
> address would mean an extra branch in the large-page case and clearing
> some bits.

Or, just generic code to clear bits based upon the page size.

> The fact that our implementation works now, and yours is still
> unstable, does not encourage me to take this approach.

I just haven't had time to finish it and verify it works.  I still
intend to continue to use the page tables as they exist and let
the PMD contain additional page size information, even if I have to
add some code to the tlbmiss exception.  There are more processors
coming in the future that will benefit from this rather than continuing
to make processor specific hacks in the generic MM code.

> We already must test to see if the PMD is present in the normal case.
> In our implementation large page PMDs show up as not present.  So it's
> only in the uncommon case (not present) that we test for a large page
> PMD entry.

OK.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-06  4:48         ` Dan Malek
@ 2002-06-06  5:44           ` Paul Mackerras
  2002-06-06  7:58             ` Dan Malek
  0 siblings, 1 reply; 30+ messages in thread
From: Paul Mackerras @ 2002-06-06  5:44 UTC (permalink / raw)
  To: Dan Malek; +Cc: David Gibson, linuxppc-embedded

Dan Malek writes:

> When did I ever complain about a device tree?  I think it's the right
> idea, I just didn't like the way we were getting there on systems
> that don't have OF.

OK, that's cool then.  I didn't think we were getting "there"
(i.e. towards having a device tree) at all yet on systems without OF
though.  In 2.5 the devicefs stuff may yet give us an acceptable
unification.

> What isn't needed?  The page tables or the iopa()?  I'm tired of having

The page table pages aren't needed and are a pain for large-page
entries on 4xx since the pmd is already largely occupied with the
physical base address and other bits.

As for iopa(), what I mainly don't like is its use in virt_to_phys and
virt_to_bus.  The reason for that is that every other architecture
restricts the use of virt_to_phys/bus to addresses that are part of
the kernel mapping of lowmem, which means that they become very
simple.  Using iopa() in virt_to_* is just going to tempt us to use
them on other sorts of addresses, which will make our drivers less
portable.

Instead I think that we should only use virt_to_* on addresses that
are part of the kernel mapping of lowmem.  If a driver uses
consistent_alloc or pci_alloc_consistent, the driver should save and
use the physical address returned by those functions.  Ideally we
would have analogous routines to pci_[un]map_single for the on-chip
devices.  With that, I think there would be very few legitimate
reasons for a driver to need to use virt_to_* directly at all.

> different methods to look up VM information just because the memory
> was allocated in a different way.  With iopa() (which seems fine for
> other architectures to use) I don't care how the memory was allocated,
> I just feed it a virtual address and get the answer.  What's wrong with
> that (other than it's not a hack :-)?  The page tables have always

Well, if what you fed it was obtained from vmalloc, and you don't deal
explicitly with the fact that vmalloc'd memory is not physically
contiguous, you are in danger of DMA'ing into some random page
somewhere and corrupting it.  If what you feed it is a user address
and you start some DMA into it, you have in addition to the physical
discontiguity the fact that the page might get taken away from the
process and used for something else before the DMA finishes.

So in general if you want an address for doing DMA, virt_to_bus is
really only safe on kernel addresses (i.e. addresses that are within
the kernel mapping of lowmem).

Do you have other situations in mind (other than debugging-type
things) where you need to use virt_to_phys/bus on something that isn't
a lowmem address?

> been there, and it's not a big deal.  Why haven't we done the same
> hack for processors with BATs?  They don't need the page tables either.

True. :)

> I also stated the importance of the page tables is to allow background
> hardware debuggers to look up translations so they can work with Linux.
> Kind of a nice thing to have once in a while.

That's reasonable.

> I find a simple solution for an enhancement and you don't like it because
> it isn't a big hacked up mess (or maybe because I had an original thought).

I don't see a lot of value in doing things differently from all the
other architectures in this instance, and I think that restricting
virt_to_bus/phys to lowmem addresses is reasonable.  I don't mind if
iopa() stays around for a few specialized uses.

> If I would have made the same hacked up mess you have done it wouldn't have
> been checked in.....not long ago all of the embedded stuff was viewed as a
> problem child, and today it's OK to hack up generic code with an #ifdef

Viewed as a problem child?  By whom?

> for a specific IBM embedded processor????  Does that surprise you? :-)

You know I hate ifdefs. :)

Paul.

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-06  5:44           ` Paul Mackerras
@ 2002-06-06  7:58             ` Dan Malek
  2002-06-06  8:17               ` David Gibson
  2002-06-12  3:52               ` David Gibson
  0 siblings, 2 replies; 30+ messages in thread
From: Dan Malek @ 2002-06-06  7:58 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: David Gibson, linuxppc-embedded

Paul Mackerras wrote:

> .... I didn't think we were getting "there"
> (i.e. towards having a device tree) at all yet on systems without OF
> though.

It was one of the first things Ben and I discussed long ago before
this bi_rec stuff appeared.  We were just going to have the bootloaders
build an OF-like device tree........and I'll just stop that discussion
right here :-)

> As for iopa(), what I mainly don't like is its use in virt_to_phys and
> virt_to_bus.

That's just because there wasn't any other well defined interface to
get the information, and it's the way m68k/APUS (IIRC) did it.  There
have been discussions going on forever about the demise of virt_to_*
functions which I thought would have occurred by now.

> ....  The reason for that is that every other architecture
> restricts the use of virt_to_phys/bus to addresses that are part of
> the kernel mapping of lowmem,

I know, and I've been exposed to operating systems that have emerged
from this primitive state long ago :-)  It's hard to go back to early-80's
designs :-)

> .... If a driver uses
> consistent_alloc or pci_alloc_consistent, the driver should save and
> use the physical address returned by those functions.

But, that's a relatively new addition and the support of these functions
in a non cache coherent system requires the use of something like iopa().
We allocate a contiguous virtual space to remap the pages we wish to
change cache attributes.  We can't find the physical addresses associated
with these pages unless we search the page tables.  Prior to having
consistent_* we had to do this behind the curtains and needed iopa() to
get the proper mapping result.

> .....  Ideally we
> would have analogous routines to pci_[un]map_single for the on-chip
> devices.

I agree, but it doesn't remove the lower level requirement of searching
page tables.

> Do you have other situations in mind (other than debugging-type
> things) where you need to use virt_to_phys/bus on something that isn't
> a lowmem address?

I just think Linux should at least move into the last decade and have
standard methods for managing memory regardless of how it is allocated or
where it is located :-)  The consistent_* functions have removed the need
for drivers to search out mappings on their own, so I don't see the need
for this function outside of supporting the consistent_* functions.

> I don't see a lot of value in doing things differently from all the
> other architectures in this instance, and I think that restricting
> virt_to_bus/phys to lowmem addresses is reasonable.  I don't mind if
> iopa() stays around for a few specialized uses.

I wasn't talking about this in my message.  I was talking about the
horrible hack you guys made to mapin_ram() and didn't like my suggestion
to just keep the page tables and just scan them later to set up the PMDs.
I think we could also clean up the PMD/PTE structure and look at the tlb
miss exception to streamline that as well.

> Viewed as a problem child?  By whom?

Wasn't there once this desire to split the trees into desktop and embedded
just so you wouldn't have to deal with any of my changes :-)  I seem to
remember such a conversation.....but then you changed employers :-)

> You know I hate ifdefs. :)

I can make them go away :-)

Thanks.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-06  7:58             ` Dan Malek
@ 2002-06-06  8:17               ` David Gibson
  2002-06-12  3:52               ` David Gibson
  1 sibling, 0 replies; 30+ messages in thread
From: David Gibson @ 2002-06-06  8:17 UTC (permalink / raw)
  To: Dan Malek; +Cc: Paul Mackerras, linuxppc-embedded


On Thu, Jun 06, 2002 at 03:58:28AM -0400, Dan Malek wrote:
> Paul Mackerras wrote:
> >.... If a driver uses
> >consistent_alloc or pci_alloc_consistent, the driver should save and
> >use the physical address returned by those functions.
>
> But, that's a relatively new addition and the support of these functions
> in a non cache coherent system requires the use of something like iopa().
> We allocate a contiguous virtual space to remap the pages we wish to
> change cache attributes.  We can't find the physical addresses associated
> with these pages unless we search the page tables.  Prior to having
> consistent_* we had to do this behind the curtains and needed iopa() to
> get the proper mapping result.

Not true.  consistent_alloc() first obtains the memory with
__get_free_pages() - which will return an address within the kernel
mapping of lowmem.  So the physical address can be obtained simply by
subtracting KERNELBASE.  *Then* it allocates virtual memory to create
the new uncached mapping.

In 2.5 virt_to_bus() uses iopa() only on APUS.  consistent_alloc()
works fine.  I've now managed to boot 2.5 with nfsroot on a 405GP,
after tracking down what looks like a hardware bug.

--
David Gibson			| For every complex problem there is a
david@gibson.dropbear.id.au	| solution which is simple, neat and
				| wrong.  -- H.L. Mencken
http://www.ozlabs.org/people/dgibson

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-06  7:58             ` Dan Malek
  2002-06-06  8:17               ` David Gibson
@ 2002-06-12  3:52               ` David Gibson
  2002-06-12  6:15                 ` Dan Malek
  1 sibling, 1 reply; 30+ messages in thread
From: David Gibson @ 2002-06-12  3:52 UTC (permalink / raw)
  To: Dan Malek; +Cc: Paul Mackerras, linuxppc-embedded


On Thu, Jun 06, 2002 at 03:58:28AM -0400, Dan Malek wrote:
>
> Paul Mackerras wrote:
>
> >.... I didn't think we were getting "there"
> >(i.e. towards having a device tree) at all yet on systems without OF
> >though.
>
> It was one of the first things Ben and I discussed long ago before
> this bi_rec stuff appeared.  We were just going to have the bootloaders
> build OF-like device tree........and I'll just stop that discussion
> right here :-)

So what happened to the idea?

> >....  The reason for that is that every other architecture
> >restricts the use of virt_to_phys/bus to addresses that are part of
> >the kernel mapping of lowmem,
>
> I know, and I've been exposed to operating systems that have emerged
> from this primitive state long ago :-)  It's hard to go back to early-80's
> designs :-)

That isn't of itself an argument.  I haven't yet seen a case where
extending virt_to_phys() to non-lowmem addresses is useful.  In any
case I've so far thought of there are other considerations which make
the benefits of having a common interface illusory.

> >.... If a driver uses
> >consistent_alloc or pci_alloc_consistent, the driver should save and
> >use the physical address returned by those functions.
>
> But, that's a relatively new addition and the support of these functions
> in a non cache coherent system requires the use of something like iopa().
> We allocate a contiguous virtual space to remap the pages we wish to
> change cache attributes.  We can't find the physical addresses associated
> with these pages unless we search the page tables.  Prior to having
> consistent_* we had to do this behind the curtains and needed iopa() to
> get the proper mapping result.

Not only is that wrong, it's bloody obviously wrong.  How could we
create a new virtual mapping without knowing the physical addresses
first?

> >.....  Ideally we
> >would have analogous routines to pci_[un]map_single for the on-chip
> >devices.
>
> I agree, but it doesn't remove the lower level requirement of searching
> page tables.

No.  Where would the memory this is to be used on come from, other than
kmalloc()?  In that case just subtracting KERNELBASE is sufficient.  We
can't use vmalloc()ed memory anyway, because it wouldn't be physically
contiguous.

> >Do you have other situations in mind (other than debugging-type
> >things) where you need to use virt_to_phys/bus on something that isn't
> >a lowmem address?
>
> I just think Linux should at least move into the last decade and have
> standard methods for managing memory regardless of how it is allocated or
> where it is located :-)  The consistent_* functions have removed the need
> for drivers to search out mappings on their own, so I don't see the need
> for this function outside of supporting the consistent_* functions.

And it isn't needed in consistent_*(), so where is it needed at all?

--
David Gibson			| For every complex problem there is a
david@gibson.dropbear.id.au	| solution which is simple, neat and
				| wrong.  -- H.L. Mencken
http://www.ozlabs.org/people/dgibson

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-12  3:52               ` David Gibson
@ 2002-06-12  6:15                 ` Dan Malek
  2002-06-12  6:43                   ` David Gibson
  2002-06-12 23:49                   ` Paul Mackerras
  0 siblings, 2 replies; 30+ messages in thread
From: Dan Malek @ 2002-06-12  6:15 UTC (permalink / raw)
  To: David Gibson; +Cc: Paul Mackerras, linuxppc-embedded

David Gibson wrote:

> So what happened to the idea?

Never got implemented, I guess.  Lots of things are discussed that
people just don't have time to do.

> That isn't of itself an argument.  I haven't yet seen a case where
> extending virt_to_phys() to non-lowmem addresses is useful.  In any
> case I've so far thought of there are other considerations which make
> the benefits of having a common interface illusory.

Let's just say it's nice to have designs that have common functional
interfaces.  The Linux VM is one old hack on top of another, and inserting
something like iopa() under virt_to_* is just the same.  As systems and
memory sizes grew, the functions to support them didn't for a variety of
reasons.  These problems have all been nicely solved long ago, and some
of us were around back then to appreciate the nice solutions :-)

> Not only is that wrong, it's bloody obviously wrong.  How could we
> create a new virtual mapping without knowing the physical addresses
> first.

Oh, cool down and just think about this for a moment.  All of this was
done to support noncoherent caches where we allocate some VM space to
remap pages with different cache attributes.  Everything that does
DMA expects to use virt_to_* functions to find the physical address.
Of course we know the physical address, but functions using the standard
pci_consistent_* don't know them.  To support PCI on noncoherent cache
processors you have to be able to find the physical address from the
virtual one, and just an arithmetic operation on the virtual address
won't work on these processors.  It's just the way these functions are
designed to work in Linux.  It's obviously the only way to make it work
with the way the higher level functions are designed.

> ..... We
> can't use vmalloc()ed memory anyway, because it wouldn't be physically
> contiguous.

Ummmm....we do use vmalloc()'ed (sort of) memory anyway.  We grab a set
of physical pages and then map them to a new vm_area.  Just what vmalloc()
does, except we do it so it is contiguous.

> And it isn't needed in consistent_*(), so where is it needed at all?

It's needed at the higher level of PCI and DMA functions that are simply
given virtual addresses and need to find the physical page(s) associated.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-12  6:15                 ` Dan Malek
@ 2002-06-12  6:43                   ` David Gibson
  2002-06-12 15:19                     ` Tom Rini
                                       ` (2 more replies)
  2002-06-12 23:49                   ` Paul Mackerras
  1 sibling, 3 replies; 30+ messages in thread
From: David Gibson @ 2002-06-12  6:43 UTC (permalink / raw)
  To: Dan Malek; +Cc: Paul Mackerras, linuxppc-embedded


On Wed, Jun 12, 2002 at 02:15:30AM -0400, Dan Malek wrote:
> David Gibson wrote:
>
> >So what happened to the idea?
>
> Never got implemented, I guess.  Lots of things are discussed that
> people just don't have time to do.

Heh, so instead we got a half-assed re-implementation (with a whole
bunch of extraneous pointless crap) in the form of the OCP layer.

> >That isn't of itself an argument.  I haven't yet seen a case where
> >extending virt_to_phys() to non-lowmem addresses is useful.  In any
> >case I've so far thought of there are other considerations which make
> >the benefits of having a common interface illusory.
>
> Let's just say it's nice to have designs that have common functional
> interfaces.  The Linux VM is one old hack on top of another, and inserting
> something like iopa() under virt_to_* is just the same.  As systems and
> memory sizes grew, the functions to support them didn't for a variety of
> reasons.  These problems have all been nicely solved long ago, and some
> of us were around back then to appreciate the nice solutions :-)

So you keep saying, but I haven't seen you give a real example yet.

> >Not only is that wrong, it's bloody obviously wrong.  How could we
> >create a new virtual mapping without knowing the physical addresses
> >first.
>
> Oh, cool down and just think about this for a moment.  All of this was
> done to support noncoherent caches where we allocate some VM space to
> remap pages with different cache attributes.  Everything that does
> DMA expects to use virt_to_* functions to find the physical address.
> Of course we know the physical address, but functions using the standard
> pci_consistent_* don't know them.  To support PCI on noncoherent cache
> processors you have to be able to find the physical address from the
> virtual one, and just an arithmetic operation on the virtual address
> won't work on these processors.  It's just the way these functions are
> designed to work in Linux.  It's obviously the only way to make it work
> with the way the higher level functions are designed.

No.

I think confusion is coming from the fact that there are two
approaches to handling DMA on non-cache-coherent processors (each
appropriate for different circumstances).
	1) Allocate some coherent memory specially (with
consistent_alloc() or pci_alloc_consistent()).  Once that's done it
can just be used, no further worries about consistency.
	2) Get "ordinary" (but cacheline aligned) memory (say from
some other part of the kernel, like an skb) and frob it as appropriate
- i.e. flush and/or invalidate the cache and possibly mess with an
IOMMU - before/after we initiate the transfer (with consistent_sync()
or pci_map_single()).

For any given chunk of memory only one of these approaches will be in
use - consistent_sync() should never be called on memory allocated
with consistent_alloc(), there would be no point.  Likewise if we use
pci_alloc_consistent() there will be no need to call pci_map_single().

Now, in case (1) we have to find out the physical address in order to
allocate the buffer and create the uncached mapping in any case, so we
can (and do) stash away the physical address at the time of
allocation.  No iopa() needed.

In case (2) the memory will probably be coming from kmalloc() or
(rarely) a static buffer.  I can't see any useful case where it won't
be in the kernel's normal mapping of system RAM (since memory from
vmalloc() wouldn't be any good anyway).  So, again, no iopa() is
necessary.

> >..... We
> >can't use vmalloc()ed memory anyway, because it wouldn't be physically
> >contiguous.
>
> Ummmm....we do use vmalloc()'ed (sort of) memory anyway.  We grab a set
> of physical pages and then map them to a new vm_area.  Just what vmalloc()
> does, except we do it so it is contiguous.

Emphasis on the "sort of" here.  consistent_alloc() does allocate the
memory specially, and returns the physical address to the caller.  I'm
talking about ordinary, everyday vmalloc().

--
David Gibson			| For every complex problem there is a
david@gibson.dropbear.id.au	| solution which is simple, neat and
				| wrong.  -- H.L. Mencken
http://www.ozlabs.org/people/dgibson

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-12  6:43                   ` David Gibson
@ 2002-06-12 15:19                     ` Tom Rini
  2002-06-12 23:23                     ` Dan Malek
  2002-06-13 18:13                     ` Armin
  2 siblings, 0 replies; 30+ messages in thread
From: Tom Rini @ 2002-06-12 15:19 UTC (permalink / raw)
  To: Dan Malek, Paul Mackerras, linuxppc-embedded, David Gibson


On Wed, Jun 12, 2002 at 04:43:44PM +1000, David Gibson wrote:
>
> On Wed, Jun 12, 2002 at 02:15:30AM -0400, Dan Malek wrote:
> > David Gibson wrote:
> >
> > >So what happened to the idea?
> >
> > Never got implemented, I guess.  Lots of things are discussed that
> > people just don't have time to do.
>
> Heh, so instead we got a half-assed re-implementation (with a whole
> bunch of extraneous pointless crap) in the form of the OCP layer.

... while we wait for the generic driver stuffs in 2.5 to exist in part.

--
Tom Rini (TR1265)
http://gate.crashing.org/~trini/

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-12  6:43                   ` David Gibson
  2002-06-12 15:19                     ` Tom Rini
@ 2002-06-12 23:23                     ` Dan Malek
  2002-06-12 23:42                       ` Paul Mackerras
  2002-06-13  1:38                       ` Paul Mackerras
  2002-06-13 18:13                     ` Armin
  2 siblings, 2 replies; 30+ messages in thread
From: Dan Malek @ 2002-06-12 23:23 UTC (permalink / raw)
  To: David Gibson; +Cc: Paul Mackerras, linuxppc-embedded

David Gibson wrote:

> Heh, so instead we got a half-assed re-implementation (with a whole
> bunch of extraneous pointless crap) in the form of the OCP layer.

Not exactly.  The OCP implementation was an attempt to provide a
software model that matched the way Blue Logic peripherals were
integrated into packages.  Sort of a "better" approach to the way
I implemented the 8xx drivers.  The resource trees are just a
representation of reasonable configuration information that is useful
to any driver implementation.  We couldn't decide what "reasonable"
information would be :-)

> So you keep saying, but I haven't seen you give a real example yet.

You are fortunate to work for one of the companies that have successfully
deployed commercial operating systems and have a variety of research
operating systems that implement and demonstrate some very nice
resource management methods.  I would suggest you spend some time learning
about these, and then maybe you would have some clue about what I'm
trying to describe.

Long ago, modern operating systems abstracted much of the VM implementation
away from the drivers.  The most useful example would be for us to use
an I/O vector triplet {physaddr, offset, length} in a driver for any DMA
or other physical address operation.  Drivers shouldn't care how memory
mapping is implemented, they should be given information about a VM address
range that is suitable for them to perform DMA.  This way, we don't have
one implementation for low memory drivers, yet another thing for high
memory bounce buffers, and who knows, maybe someday we will have a direct I/O
capability or even DMA from user space (like other operating systems have
for years) without having to hack up a driver yet again.  These concepts
were working on production systems long before Linux was started, and your
company has great examples of this.

> I think confusion is coming from the fact that there are two
> approaches to handling DMA on non-cache-coherent processors (each
> appropriate for different circumstances).

You are the only person confused and I don't know why you want to
keep arguing with me.

> 	1) Allocate some coherent memory specially (with
> consistent_alloc() or pci_alloc_consistent()).  Once that's done it
> can just be used, no further worries about consistency.

This has nothing to do with consistency, it has to do with implementing
the proper semantics for these functions.  Regardless of how the memory
is allocated, there are pci mapping functions (pci_map_single(), pci_map_sg(),
and so on) that are going to be handed an arbitrary virtual address and
try to convert those into physical addresses for DMA.  In the case of
non coherent processors, you can't simply subtract KERNELBASE from these
addresses and get the proper physical address.

> Emphasis on the "sort of" here.  consistent_sync() does allocate the
> memory specially, and returns the physical address to the caller.  I'm
> talking about ordinary, everyday vmalloc()

...and if you call pci_map_sg() (or pci_map_single()) on this address,
you won't get the right answer without using iopa().  Should drivers be
doing this or are they doing it correctly?  I don't know, but I do know
if they call these functions we better return the right answer.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-12 23:23                     ` Dan Malek
@ 2002-06-12 23:42                       ` Paul Mackerras
  2002-06-13  0:28                         ` Dan Malek
  2002-06-13  1:38                       ` Paul Mackerras
  1 sibling, 1 reply; 30+ messages in thread
From: Paul Mackerras @ 2002-06-12 23:42 UTC (permalink / raw)
  To: Dan Malek; +Cc: David Gibson, linuxppc-embedded

Dan Malek writes:

> Long ago, modern operating systems abstracted much of the VM implementation
> away from the drivers.  The most useful example would be for us to use
> an I/O vector triplet {physaddr, offset, length} in a driver for any DMA
> or other physical address operation.  Drivers shouldn't care how memory
> mapping is implemented, they should be given information about a VM address
> range that is suitable for them to perform DMA.  This way, we don't have
> one implementation for low memory drivers, yet another thing for high
> memory bounce buffers, and who knows, maybe someday we will have a direct I/O
> capability or even DMA from user space (like other operating systems have
> for years) without having to hack up a driver yet again.  These concepts
> were working on production systems long before Linux was started, and your
> company has great examples of this.

This is an issue which is much wider than the question of what the PPC
port should do.  The linux-kernel@vger.kernel.org list is the
appropriate place to discuss this sort of thing.

One point to bear in mind is that Linux runs on a much wider range of
hardware than any of those research operating systems that you refer
to.  That does introduce a range of practical issues that those other
systems can just ignore.

> > 	1) Allocate some coherent memory specially (with
> > consistent_alloc() or pci_alloc_consistent()).  Once that's done it
> > can just be used, no further worries about consistency.
>
> This has nothing to do with consistency, it has to do with implementing
> the proper semantics for these functions.  Regardless of how the memory
> is allocated, there are pci mapping functions (pci_map_single(), pci_map_sg(),
> and so on) that are going to be handed an arbitrary virtual address and
> try to convert those into physical addresses for DMA.  In the case of
> non coherent processors, you can't simply subtract KERNELBASE from these
> addresses and get the proper physical address.

Then the driver is breaking the rules as laid down in DMA-mapping.txt.
You *don't* use pci_map_* on stuff you get from pci_alloc_consistent.
If you do you get to keep both pieces.  If you don't like the rules
then I suggest you take it up with Dave Miller.  The rules seem
perfectly sensible to me, they make an appropriate tradeoff between
functionality and complexity that works across a wide range of systems.

The PCI DMA API is designed so that drivers never need to translate
between virtual and physical addresses themselves.  Our API for local
peripherals on embedded chips should do the same.

> > Emphasis on the "sort of" here.  consistent_sync() does allocate the
> > memory specially, and returns the physical address to the caller.  I'm
> > talking about ordinary, everyday vmalloc()
>
> ...and if you call pci_map_sg() (or pci_map_single()) on this address,
> you won't get the right answer without using iopa().  Should drivers be

"Well don't do that then" :)  If you are calling pci_map_single on a
vmalloc'd address you are living dangerously even if virt_to_* does
use iopa, since vmalloc'd memory is not physically contiguous.

> doing this or are they doing it correctly?  I don't know, but I do know
> if they call these functions we better return the right answer.

Hitting a BUG() would be more appropriate in these cases.

Paul.

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-12  6:15                 ` Dan Malek
  2002-06-12  6:43                   ` David Gibson
@ 2002-06-12 23:49                   ` Paul Mackerras
  1 sibling, 0 replies; 30+ messages in thread
From: Paul Mackerras @ 2002-06-12 23:49 UTC (permalink / raw)
  To: Dan Malek; +Cc: David Gibson, linuxppc-embedded

Dan Malek writes:

> Oh, cool down and just think about this for a moment.  All of this was
> done to support noncoherent caches where we allocate some VM space to
> remap pages with different cache attributes.  Everything that does
> DMA expects to use virt_to_* functions to find the physical address.

Well no.  Drivers shouldn't need to use virt_to_* at all.

> Of course we know the physical address, but functions using the standard
> pci_consistent_* don't know them.  To support PCI on noncoherent cache
> processors you have to be able to find the physical address from the
> virtual one, and just an arithmetic operation on the virtual address
> won't work on these processors.  It's just the way these functions are
> designed to work in Linux.  It's obviously the only way to make it work
> with the way the higher level functions are designed.

Using the pci_* functions as a model, you have two ways to do things.
The first is to use pci_alloc_consistent.  It internally gets some
pages of lowmem and, if necessary, maps them cache-inhibited somewhere
and returns both the virtual and bus addresses to the driver.  The
driver gives the bus address to the device and uses the virtual
address itself and everyone is happy.

The second is to use pci_map_single/pci_unmap_single and friends.  The
*_single variants work on lowmem, the *_page variants work on any page
of memory.  You do a pci_map_* and get back a bus address that you
give to the device.  It does its DMA, you call pci_unmap_* and then
you can access the buffer again.  With this variant the buffer doesn't
need to be remapped cache-inhibited.  Since the buffer is either
lowmem or specified by its struct page *, working out its physical
address is trivial.  In no case do we need to go chasing through page
tables to find it.

Paul.

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-12 23:42                       ` Paul Mackerras
@ 2002-06-13  0:28                         ` Dan Malek
  2002-06-13  1:01                           ` Paul Mackerras
  0 siblings, 1 reply; 30+ messages in thread
From: Dan Malek @ 2002-06-13  0:28 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: David Gibson, linuxppc-embedded

Paul Mackerras wrote:

> This is an issue which is much wider than the question of what the PPC
> port should do.  The linux-kernel@vger.kernel.org list is the
> appropriate place to discuss this sort of thing.

Yeah, right :-)  I can't bear the signal to noise ratio out there :-)

> The PCI DMA API is designed so that drivers never need to translate
> between virtual and physical addresses themselves.  Our API for local
> peripherals on embedded chips should do the same.

As far as I know, they do.  Just call consistent_alloc() and keep track
of the addresses.  At the time I implemented consistent_alloc(), I
updated all drivers I could test (and some that I couldn't :-) to do this.
At the time, there were PCI drivers on 4xx and 8xx that didn't do the
right thing with the pci_* functions, so I had to keep some kind of mapping
to make them work.

> "Well don't do that then" :)  If you are calling pci_map_single on a
> vmalloc'd address you are living dangerously even if virt_to_* does
> use iopa, since vmalloc'd memory is not physically contiguous.
>
>
>>doing this or are they doing it correctly?  I don't know, but I do know
>>if they call these functions we better return the right answer.
>
>
> Hitting a BUG() would be more appropriate in these cases.

I know, but the higher level functions are sufficiently disjoint that you
can't keep context across them to know if someone is doing something bad.
I guess we could just check for an address in the VMALLOC space and not
translate that, but then I'll get criticized for adding code into that
fast virt_to_* path :-)  You also don't know, in the case of noncoherent
processors, that the virtual mapping you received is from a 'vmalloc'
space, even though it was done properly for DMA.  It's one thing to call
vmalloc() and try to do DMA, and it's another to use an alternate mapping
to properly implement a feature under a standard interface.

So, just toss iopa(), use the macros in their standard way, and see how
long we run before the system crashes (SCSI drivers, eepro100,... :-)

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-13  0:28                         ` Dan Malek
@ 2002-06-13  1:01                           ` Paul Mackerras
  2002-06-13  4:16                             ` Dan Malek
  0 siblings, 1 reply; 30+ messages in thread
From: Paul Mackerras @ 2002-06-13  1:01 UTC (permalink / raw)
  To: Dan Malek; +Cc: David Gibson, linuxppc-embedded

Dan Malek writes:

> > Hitting a BUG() would be more appropriate in these cases.
>
> I know, but the higher level functions are sufficiently disjoint that you
> can't keep context across them to know if someone is doing something bad.
> I guess we could just check for an address in the VMALLOC space and not
> translate that, but then I'll get criticized for adding code into that
> fast virt_to_* path :-)

Not by me, doing a range check in virt_to_* would be perfectly
appropriate.

>  You also don't know, in the case of noncoherent
> processors, that the virtual mapping you received is from a 'vmalloc'
> space, even though it was done properly for DMA.  It's one thing to call
> vmalloc() and try to do DMA, and it's another to use an alternate mapping
> to properly implement a feature under a standard interface.

Drivers shouldn't be doing virt_to_* on the address they get from a
consistent-alloc function.  Given that doing it the right way is easy
(just remember the physical address that the consistent alloc function
gives you) I don't have any qualms about breaking drivers that do it
the wrong way.

(I should note that I'm not intending to break them in 2.4, not even
in 2_4_devel; virt_to_* can continue to use iopa there.  But in 2.5 we
can be more brutal.)

> So, just toss iopa(), use the macros in their standard way, and see how
> long we run before the system crashes (SCSI drivers, eepro100,... :-)

There is the issue of making sure that we don't have DMA buffers and
other variables in the same cache line.  This is being thrashed out on
linux-kernel at the moment. :)

Paul.

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-12 23:23                     ` Dan Malek
  2002-06-12 23:42                       ` Paul Mackerras
@ 2002-06-13  1:38                       ` Paul Mackerras
  2002-06-13  4:47                         ` Dan Malek
  1 sibling, 1 reply; 30+ messages in thread
From: Paul Mackerras @ 2002-06-13  1:38 UTC (permalink / raw)
  To: Dan Malek; +Cc: David Gibson, linuxppc-embedded

Dan Malek writes:

> Long ago, modern operating systems abstracted much of the VM implementation
> away from the drivers.  The most useful example would be for us to use
> an I/O vector triplet {physaddr, offset, length} in a driver for any DMA
> or other physical address operation.  Drivers shouldn't care how memory
> mapping is implemented, they should be given information about a VM address
> range that is suitable for them to perform DMA.  This way, we don't have

I think this deserves some discussion, on a couple of levels, for the
sake of the other people who are on this list and following this
discussion.

At the high level, Linus and the other core kernel developers have a
mindset which seems to me a bit different from that of many other
software developers, including those responsible for various
proprietary operating systems.  It is a mindset which highly values
simplicity and clarity of code, and in particular being able to
understand what a piece of code is doing, in considerable detail, when
you read it.  Abstraction is valued but only to the extent that it
contributes to the clarity of the code.  Abstraction beyond that
point, that is to say abstraction that hides details that are relevant
to the code using the abstraction, is rejected.  Combined with that is
an emphasis on performance.  Generality is not seen as desirable in
itself, but only to the extent that it contributes to simplicity and
clarity of code.  And academic ideas of the "right" ways to do things,
while they will be considered, are not by any means taken as gospel.

I think this mindset is the reason why we have an efficient and
maintainable kernel rather than a microkernel-based system written in
C++.

In the context of drivers doing DMA, there are basically two ways for
the code calling the driver to specify where the buffer is that it
wants to use for I/O: (a) as a virtual address, (b) as a pointer to
the page struct for the page plus an offset.  Method (b) is
increasingly being used in 2.5 since it lets you specify a buffer
anywhere in memory even on 32-bit systems with multiple GB of RAM.

But let us consider a buffer specified by a virtual address.  There
are basically three kinds of virtual address the driver could be
given:

(1) kernel lowmem
(2) other kernel virtual addresses
(3) user addresses

Thinking about this from the driver's perspective, (1) is easy.  We
know the buffer will be contiguous and that it isn't going to go away
from under us.  (2) is a little harder because the buffer may not be
physically contiguous.  If our device handles scatter/gather then we
could possibly handle it, but we start to need to allocate space for
scatter/gather lists of varying length even if we are only handling a
single buffer at a time.  And if our device doesn't handle
scatter/gather then we have a bigger problem - we may even need to use
a bounce buffer.  And (3) is harder still because the buffer may move
or disappear.  In this case we need to pin the buffer as well as
handling the fact that it may be physically discontiguous.

Given this, I would claim that an abstraction that tries to hide the
differences between different kinds of virtual address is one that
hides details that are relevant to drivers, and that that is one
reason why Linux doesn't have such an abstraction.  Instead, when
drivers are required to handle physically discontiguous buffers, we
make that apparent up front and provide the drivers with the details
they need to handle that case efficiently as well as the contiguous
case.

Paul.

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-13  1:01                           ` Paul Mackerras
@ 2002-06-13  4:16                             ` Dan Malek
  2002-06-13  5:12                               ` David Gibson
  0 siblings, 1 reply; 30+ messages in thread
From: Dan Malek @ 2002-06-13  4:16 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: David Gibson, linuxppc-embedded

Paul Mackerras wrote:

> Not by me, doing a range check in virt_to_* would be perfectly
> appropriate.

OK.  So, why don't we just look up the right address? :-)

> Drivers shouldn't be doing virt_to_* on the address they get from a
> consistent-alloc function.  Given that doing it the right way is easy
> (just remember the physical address that the consistent alloc function
> gives you) I don't have any qualms about breaking drivers that do it
> the wrong way.

It isn't the drivers doing it themselves.  I believe the biggest problem
was with drivers that wanted to do a scatter/gather list (like SCSI seems
to like to do).  They just stuff the virtual address, regardless of where
it came from, and expect virt_to_bus() to do the right thing.  I know,
you are going to tell me that shouldn't have been a consistent_alloc()
space, and perhaps that has been fixed by now.

> (I should note that I'm not intending to break them in 2.4, not even
> in 2_4_devel; virt_to_* can continue to use iopa there.  But in 2.5 we
> can be more brutal.)

Let's be brutal :-)

> There is the issue of making sure that we don't have DMA buffers and
> other variables in the same cache line.  This is being thrashed out on
> linux-kernel at the moment. :)

That's a totally different subject, and it comes up on many mailing lists.
There was a discussion about the eepro100 again today on a MIPS list,
where David (again :-) pointed out the problems with it on noncoherent systems.
There are enough people that know about this and the solution, I'm just
sitting back until the generic software is updated to deal with it :-)

Thanks.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-13  1:38                       ` Paul Mackerras
@ 2002-06-13  4:47                         ` Dan Malek
  0 siblings, 0 replies; 30+ messages in thread
From: Dan Malek @ 2002-06-13  4:47 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: David Gibson, linuxppc-embedded

Paul Mackerras wrote:

> I think this deserves some discussion, on a couple of levels, for the
> sake of the other people who are on this list and following this
> discussion.

I really don't want to discuss it anymore here, but I'll add just a
few comments.

> .... It is a mindset which highly values
> simplicity and clarity of code, and in particular being able to
> understand what a piece of code is doing, in considerable detail, when
> you read it.

I dislike it at the times it costs us features compared to other
operating systems.  We are caught between being better than a minimal RTOS,
but really lacking when compared to commercial/mature systems.

> I think this mindset is the reason why we have an efficient and
> maintainable kernel rather than a microkernel-based system written in
> C++.

I'm not proposing we do such a thing, although some of the nicest features
have been implemented in such systems :-)  What we need to do is learn
about these systems and bring some of the concepts home.

> In the context of drivers doing DMA, ......

> (1) kernel lowmem
> (2) other kernel virtual addresses
> (3) user addresses
>
> Thinking about this from the driver's perspective, (1) is easy.
> .... (2) is a little harder
> ...... And (3) is harder still because the buffer may move
> or disappear.  In this case we need to pin the buffer as well as
> handling the fact that it may be physically discontiguous.

So?  All of this has been solved before in other systems.  Just solve
for (3) and you have everything covered.  This isn't an unusual implementation
detail.  In this case, the driver simply asks for the S/G list associated
with the VM range (the triplet I mentioned earlier).  In case (1), there
is just one entry, in case (2) there could be multiple entries, and in
case (3) you usually see the non-zero offsets and non-page size lengths.
The user page locking can be done by the underlying DMA support functions,
so the driver doesn't care about that.  If you don't have S/G hardware support
the driver has to break up the transfers, which is really the only complexity.
This is what other systems have been doing for decades, and it really
simplifies what the driver has to know (or doesn't need to know :-) about
the underlying VM implementation.  Of course, it would mean modification of
existing drivers, so this will never happen :-)  Maybe we could just do it
in the 4xx OCP stuff as proof of concept :-)

> Given this, I would claim that an abstraction that tries to hide the
> differences between different kinds of virtual address is one that
> hides details that are relevant to drivers, and that that is one
> reason why Linux doesn't have such an abstraction.

What do drivers care about how or where physical memory is allocated and
mapped to virtual addresses?  All they want is an address that allows DMA :-)
A driver will only care about this if you force it to do so.

> .....  Instead, when
> drivers are required to handle physically discontiguous buffers, we
> make that apparent up front and provide the drivers with the details
> they need to handle that case efficiently as well as the contiguous
> case.

Doesn't my description do that?  Seems pretty simple to me, but then
I'm a computer scientist and not a hacker :-)

Thanks.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-13  4:16                             ` Dan Malek
@ 2002-06-13  5:12                               ` David Gibson
  2002-06-13  7:26                                 ` Dan Malek
  0 siblings, 1 reply; 30+ messages in thread
From: David Gibson @ 2002-06-13  5:12 UTC (permalink / raw)
  To: Dan Malek; +Cc: Paul Mackerras, linuxppc-embedded

On Thu, Jun 13, 2002 at 12:16:13AM -0400, Dan Malek wrote:
> Paul Mackerras wrote:
>
>
> >Not by me, doing a range check in virt_to_* would be perfectly
> >appropriate.
>
> OK.  So, why don't we just look up the right address? :-)

Because any driver that relies on it working (for non-lowmem
addresses) is very likely broken in other ways, and at best completely
non-portable.  Not doing it lets us get rid of iopa() (less code +
same features == good) which has basically no other users (a few 8xx
drivers, which I'm sure can be fixed not to and that's it).

Besides which virt_to_bus() is clearly ill-defined in general, since
which bus we're talking about is not specified.  iopa() has the same
problem.  So drivers really need to use the pci consistent functions
or (in future) the unified device model consistent allocation /
mapping functions.

> >Drivers shouldn't be doing virt_to_* on the address they get from a
> >consistent-alloc function.  Given that doing it the right way is easy
> >(just remember the physical address that the consistent alloc function
> >gives you) I don't have any qualms about breaking drivers that do it
> >the wrong way.
>
> It isn't the drivers doing it themselves.  I believe the biggest problem
> was with drivers that wanted to do a scatter/gather list (like SCSI seems
> to like to do).  They just stuff the virtual address, regardless of where
> it came from, and expect virt_to_bus() to do the right thing.  I know,
> you are going to tell me that shouldn't have been a consistent_alloc()
> space, and perhaps that has been fixed by now.

Huh?  First you say it isn't the drivers, then you say it is the
drivers.  From what I can see virt_to_bus still appears a hell of a
lot in drivers/scsi, so they haven't been fixed yet.  But do you have
any real case where one of the virt_to_bus()es is being called on a
non-lowmem address?

> >(I should note that I'm not intending to break them in 2.4, not even
> >in 2_4_devel; virt_to_* can continue to use iopa there.  But in 2.5 we
> >can be more brutal.)
>
> Let's be brutal :-)
>
> >There is the issue of making sure that we don't have DMA buffers and
> >other variables in the same cache line.  This is being thrashed out on
> >linux-kernel at the moment. :)
>
> That's a totally different subject, and it comes up on many mailing lists.
> There was a discussion about the eepro100 again today on a MIPS list,
> where David (again :-) pointed out the problems with it on noncoherent
> systems.

Err.. not me.  I ain't on any MIPS lists.  Nor do I recall writing
anything about the eepro100 driver recently.

--
David Gibson			| For every complex problem there is a
david@gibson.dropbear.id.au	| solution which is simple, neat and
				| wrong.  -- H.L. Mencken
http://www.ozlabs.org/people/dgibson

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-13  5:12                               ` David Gibson
@ 2002-06-13  7:26                                 ` Dan Malek
  0 siblings, 0 replies; 30+ messages in thread
From: Dan Malek @ 2002-06-13  7:26 UTC (permalink / raw)
  To: David Gibson; +Cc: Paul Mackerras, linuxppc-embedded

David Gibson wrote:

> Because any driver that relies on it working....

Just a joke, lighten up :-)  That's what these things :-) :-) are all about.

> Huh?  First you say it isn't the drivers, then you say it is the
> drivers.

Arrgh.......actually it was a kernel problem............
IIRC, the first place I saw the problem was someone trying to build
an S/G list and we didn't look up the addresses correctly.

The bottom line is iopa() did the right thing when used on proper
lowmem addresses, and in case someone stumbled across something
we had remapped it also did the right thing.  I'm not out to be
pointing blame at drivers and I quite dislike the "don't do it
that way" response when the solution is so trivial.  We implemented
a solution (many different ways over time) to provide uncached
memory, and sometimes it didn't work right for some drivers.  This
solved the problem without me having to spend time debugging or
modifying drivers I didn't care to know anything about.  If we don't
need it anymore, then throw it away, but just make sure we don't
need it anymore.

> Err.. not me.  I ain't on any MIPS lists.  Nor do I recall writing
> anything about the eepro100 driver recently.

I didn't believe I indicated you did.  It was a comment to Paul's
statement that there is still discussion about cache line alignment
and sharing.  You can learn things from reading what some of the
other people are doing, though :-)

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-12  6:43                   ` David Gibson
  2002-06-12 15:19                     ` Tom Rini
  2002-06-12 23:23                     ` Dan Malek
@ 2002-06-13 18:13                     ` Armin
  2002-06-14  0:33                       ` David Gibson
  2 siblings, 1 reply; 30+ messages in thread
From: Armin @ 2002-06-13 18:13 UTC (permalink / raw)
  To: David Gibson; +Cc: Dan Malek, Paul Mackerras, linuxppc-embedded

David Gibson
>
> Heh, so instead we got a half-assed re-implementation (with a whole
> bunch of extraneous pointless crap) in the form of the OCP layer.
>

Crap...this pointless crap has made it easier to add driver support to
new IBM 4xx processors. And the Idea goes beyond PPC :)  Cosmetic
changes don't help make things not half-assed and mature.  Fleshing out the
idea, APIs and usage does.

  armin

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: First cut at large page support on 40x
  2002-06-13 18:13                     ` Armin
@ 2002-06-14  0:33                       ` David Gibson
  0 siblings, 0 replies; 30+ messages in thread
From: David Gibson @ 2002-06-14  0:33 UTC (permalink / raw)
  To: Armin; +Cc: Dan Malek, Paul Mackerras, linuxppc-embedded


On Thu, Jun 13, 2002 at 11:13:33AM -0700, Armin wrote:
>
> David Gibson
> >
> >Heh, so instead we got a half-assed re-implementation (with a whole
> >bunch of extraneous pointless crap) in the form of the OCP layer.
> >
>
> Crap...this pointless crap has made it easier to add driver support to
> new IBM 4xx processors. And the Idea goes beyond PPC :)  Cosmetic
> changes don't help make things not half-assed and mature.  Fleshing out the
> idea, APIs and usage does.

No actually.  It's become easier to add driver support to new IBM 4xx
processors because of the several sensible and useful ideas that have
gone in along with all the extraneous pointless crap.  All my ocp
patches have had the aim of leaving the former intact while
(gradually) removing the latter.

--
David Gibson			| For every complex problem there is a
david@gibson.dropbear.id.au	| solution which is simple, neat and
				| wrong.  -- H.L. Mencken
http://www.ozlabs.org/people/dgibson

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2002-06-14  0:33 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2002-05-31  4:21 First cut at large page support on 40x David Gibson
2002-05-31  4:31 ` David Gibson
2002-06-04  0:43 ` Dan Malek
2002-06-04  3:59   ` David Gibson
2002-06-04 17:42     ` Dan Malek
2002-06-05  0:10       ` David Gibson
2002-06-05 17:25         ` Dan Malek
2002-06-06  1:35           ` David Gibson
2002-06-06  4:57             ` Dan Malek
2002-06-05 22:29       ` Paul Mackerras
2002-06-06  4:48         ` Dan Malek
2002-06-06  5:44           ` Paul Mackerras
2002-06-06  7:58             ` Dan Malek
2002-06-06  8:17               ` David Gibson
2002-06-12  3:52               ` David Gibson
2002-06-12  6:15                 ` Dan Malek
2002-06-12  6:43                   ` David Gibson
2002-06-12 15:19                     ` Tom Rini
2002-06-12 23:23                     ` Dan Malek
2002-06-12 23:42                       ` Paul Mackerras
2002-06-13  0:28                         ` Dan Malek
2002-06-13  1:01                           ` Paul Mackerras
2002-06-13  4:16                             ` Dan Malek
2002-06-13  5:12                               ` David Gibson
2002-06-13  7:26                                 ` Dan Malek
2002-06-13  1:38                       ` Paul Mackerras
2002-06-13  4:47                         ` Dan Malek
2002-06-13 18:13                     ` Armin
2002-06-14  0:33                       ` David Gibson
2002-06-12 23:49                   ` Paul Mackerras

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).