* Read *pgd again in vhpt_miss handler
@ 2006-04-26 13:46 Zoltan Menyhart
2006-04-26 15:00 ` Chen, Kenneth W
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: Zoltan Menyhart @ 2006-04-26 13:46 UTC (permalink / raw)
To: linux-ia64
I think the *pgd has to be re-read and compared as *pmd, ... are.
("free_pud_range()" includes "pgd_clear(pgd)".)
Thanks,
Zoltan
--- linux-2.6.16.9-save/arch/ia64/kernel/ivt.S 2006-04-21 09:58:55.000000000 +0200
+++ linux-2.6.16.9/arch/ia64/kernel/ivt.S 2006-04-26 15:15:17.000000000 +0200
@@ -138,8 +138,8 @@ ENTRY(vhpt_miss)
(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
;;
-(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5
-(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4]
+(p6) dep r27=r18,r19,3,(PAGE_SHIFT-3) // r27=pgd_offset for region 5
+(p7) dep r27=r18,r17,3,(PAGE_SHIFT-6) // r27=pgd_offset for region[0-4]
cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
#ifdef CONFIG_PGTABLE_4
shr.u r28=r22,PUD_SHIFT // shift pud index into position
@@ -147,11 +147,11 @@ ENTRY(vhpt_miss)
shr.u r18=r22,PMD_SHIFT // shift pmd index into position
#endif
;;
- ld8 r17=[r17] // get *pgd (may be 0)
+ ld8 r30=[r27] // get *pgd (may be 0)
;;
-(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) = NULL?
+(p7) cmp.eq p6,p7=r30,r0 // was pgd_present(*pgd) = NULL?
#ifdef CONFIG_PGTABLE_4
- dep r28=r28,r17,3,(PAGE_SHIFT-3) // r28=pud_offset(pgd,addr)
+ dep r28=r28,r30,3,(PAGE_SHIFT-3) // r28=pud_offset(pgd,addr)
;;
shr.u r18=r22,PMD_SHIFT // shift pmd index into position
(p7) ld8 r29=[r28] // get *pud (may be 0)
@@ -159,7 +159,7 @@ ENTRY(vhpt_miss)
(p7) cmp.eq.or.andcm p6,p7=r29,r0 // was pud_present(*pud) = NULL?
dep r17=r18,r29,3,(PAGE_SHIFT-3) // r17=pmd_offset(pud,addr)
#else
- dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=pmd_offset(pgd,addr)
+ dep r17=r18,r30,3,(PAGE_SHIFT-3) // r17=pmd_offset(pgd,addr)
#endif
;;
(p7) ld8 r20=[r17] // get *pmd (may be 0)
@@ -207,10 +207,12 @@ ENTRY(vhpt_miss)
* between reading the pagetable and the "itc". If so, flush the entry we
* inserted and retry. At this point, we have:
*
+ * r27 = equivalent of pgd_offset(mm, ifa) or pgd_offset_k(ifa)
* r28 = equivalent of pud_offset(pgd, ifa)
* r17 = equivalent of pmd_offset(pud, ifa)
* r21 = equivalent of pte_offset(pmd, ifa)
*
+ * r30 = *pgd
* r29 = *pud
* r20 = *pmd
* r18 = *pte
@@ -220,7 +222,9 @@ ENTRY(vhpt_miss)
#ifdef CONFIG_PGTABLE_4
ld8 r19=[r28] // read *pud again
#endif
- cmp.ne p6,p7=r0,r0
+ ld8 r24=[r27] // read *pgd again
+ ;;
+ cmp.ne p6,p7=r24,r30 // did *pgd change
;;
cmp.ne.or.andcm p6,p7=r26,r20 // did *pmd change
#ifdef CONFIG_PGTABLE_4
^ permalink raw reply [flat|nested] 5+ messages in thread
* RE: Read *pgd again in vhpt_miss handler
2006-04-26 13:46 Read *pgd again in vhpt_miss handler Zoltan Menyhart
@ 2006-04-26 15:00 ` Chen, Kenneth W
2006-04-27 11:04 ` Zoltan Menyhart
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Chen, Kenneth W @ 2006-04-26 15:00 UTC (permalink / raw)
To: linux-ia64
Zoltan Menyhart wrote on Wednesday, April 26, 2006 6:47 AM
> I think the *pgd has to be re-read and compared as *pmd, ... are.
> ("free_pud_range()" includes "pgd_clear(pgd)".)
A more favorable change is to remove comparing pud/pmd entry, but
condition the vhpt TLB purging upon detecting pte modification.
I had this patch in my pocket for a while for other optimizations
I'm working on and it looks like a good timing to post it now.
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
--- ./arch/ia64/kernel/ivt.S.orig 2006-04-26 08:24:33.000000000 -0700
+++ ./arch/ia64/kernel/ivt.S 2006-04-26 08:31:43.000000000 -0700
@@ -216,21 +216,11 @@ ENTRY(vhpt_miss)
* r18 = *pte
*/
ld8 r25=[r21] // read *pte again
- ld8 r26=[r17] // read *pmd again
-#ifdef CONFIG_PGTABLE_4
- ld8 r19=[r28] // read *pud again
-#endif
- cmp.ne p6,p7=r0,r0
;;
- cmp.ne.or.andcm p6,p7=r26,r20 // did *pmd change
-#ifdef CONFIG_PGTABLE_4
- cmp.ne.or.andcm p6,p7=r19,r29 // did *pud change
-#endif
+ cmp.ne p6,p7=r25,r18 // did *pte change
mov r27=PAGE_SHIFT<<2
;;
(p6) ptc.l r22,r27 // purge PTE page translation
-(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did *pte change
- ;;
(p6) ptc.l r16,r27 // purge translation
#endif
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: Read *pgd again in vhpt_miss handler
2006-04-26 13:46 Read *pgd again in vhpt_miss handler Zoltan Menyhart
2006-04-26 15:00 ` Chen, Kenneth W
@ 2006-04-27 11:04 ` Zoltan Menyhart
2006-04-28 1:23 ` Christoph Lameter
2006-04-28 7:53 ` Zoltan Menyhart
3 siblings, 0 replies; 5+ messages in thread
From: Zoltan Menyhart @ 2006-04-27 11:04 UTC (permalink / raw)
To: linux-ia64
Chen, Kenneth W wrote:
> A more favorable change is to remove comparing pud/pmd entry, but
> condition the vhpt TLB purging upon detecting pte modification.
> I had this patch in my pocket for a while for other optimizations
> I'm working on and it looks like a good timing to post it now.
I cannot agree with you.
1. If *pte becomes invalid in the mean time, then the translation
for the PTE page can be still valid.
E.g. the swapper removes PTE-s and purges the translations for
user pages only, never for a PTE page.
2. Let's have a look at your posting on the 30th of March, in the thread
"accessed/dirty bit handler tuning":
> cpu0 cpu1 cpu2
> Vhpt miss:
> walk page table
> free_pgtables
> ptc.g fault address
> ptc.g hash address
> pud_alloc/pmd_alloc
> new page instantiation
> itc.d faulting address
> itc.d hash address
> read pte
> kill tlb for fault addr
> rfi
>
> Touch fault addr
> Walker install the tlb
> with staled vhpt tlb
> -> using someone else's page
> -> data corruption
I did agree with you.
In addition, I wanted to add a protection for protecting the
pgd ... pte chain walking.
I wanted to use the mm semaphore => no need to walk again the
pgd ... pte chain.
I think your new patch widens the security hole.
Regards,
Zoltan
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: Read *pgd again in vhpt_miss handler
2006-04-26 13:46 Read *pgd again in vhpt_miss handler Zoltan Menyhart
2006-04-26 15:00 ` Chen, Kenneth W
2006-04-27 11:04 ` Zoltan Menyhart
@ 2006-04-28 1:23 ` Christoph Lameter
2006-04-28 7:53 ` Zoltan Menyhart
3 siblings, 0 replies; 5+ messages in thread
From: Christoph Lameter @ 2006-04-28 1:23 UTC (permalink / raw)
To: linux-ia64
On Thu, 27 Apr 2006, Zoltan Menyhart wrote:
> I wanted to use the mm semaphore => no need to walk again the
> pgd ... pte chain.
The pgd ... pte chain does not change even without mmap until
the usage of the memory area ceases.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: Read *pgd again in vhpt_miss handler
2006-04-26 13:46 Read *pgd again in vhpt_miss handler Zoltan Menyhart
` (2 preceding siblings ...)
2006-04-28 1:23 ` Christoph Lameter
@ 2006-04-28 7:53 ` Zoltan Menyhart
3 siblings, 0 replies; 5+ messages in thread
From: Zoltan Menyhart @ 2006-04-28 7:53 UTC (permalink / raw)
To: linux-ia64
Christoph Lameter wrote:
> On Thu, 27 Apr 2006, Zoltan Menyhart wrote:
>
>
>>I wanted to use the mm semaphore => no need to walk again the
>>pgd ... pte chain.
>
>
> The pgd ... pte chain does not change even without mmap until
> the usage of the memory area ceases.
It is about un-mapping a zone while another thread faults
on an address belonging to the same zone.
We have got a
rx = ... -> pgd[i] -> pud[j] -> pmd[k] -> pte[l]
chain to walk in the VHPT miss handler.
Having reached somewhere in this chain walking, we have got
the ph. address of the next page in the chain in a register.
Before we can fetch the next item in the chain, "unpredictable
long" time can pass.
In the mean time:
- "free_pgtables()" kills the page we are about to touch.
- Someone re-uses the same page for something else.
As we are still keeping the same ph. address, we fetch an item
from a page that is no more ours.
Even if this security window is small, it does exist.
The probability to hit this bug grows higher on a NUMA machine
with lots of CPUs.
I can accept that the VHPT miss handler cannot be protected by
some locks; it is the other end that should use some "careful
un-mapping" in order to avoid race conditions.
Here is what I'm working on:
PTE, PMD and PUD page usage perfectly fits into the RCU approach:
1. The VHPT miss handler is protected by "rcu_read_lock_bh()".
There is not a single instruction added, the required semantics
is provided by the fact that the interrupts are off.
2. "free_pgtables()" keeps working as today for the non multi-
threaded applications.
3. "free_pgtables()" and its subroutines do not actually free
the PTE, PMD and PUD pages for multi-threaded applications.
These pages will be set free via a "call_rcu_bh()"-activated
service.
(Perhaps, the weaker protection "rcu_read_lock()" - "call_rcu()"
will be enough...)
Please note that:
- The life span of the PTE, PMD and PUD pages is rather long:
they are freed when the usage of the memory area ceases,
provided no other map (using the same PTE, PMD and PUD pages)
is valid.
- The number of the PTE, PMD and PUD pages is much smaller
than that of the leaf pages.
Therefore freeing them is not really performance critical.
As the "call_rcu_bh()"-activated freeing service will do batch
processing, there is a chance that freeing the PTE, PMD and PUD
pages in this way will be more efficient than the "pte_free()"... etc.
services of today are.
Regards,
Zoltan
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2006-04-28 7:53 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-04-26 13:46 Read *pgd again in vhpt_miss handler Zoltan Menyhart
2006-04-26 15:00 ` Chen, Kenneth W
2006-04-27 11:04 ` Zoltan Menyhart
2006-04-28 1:23 ` Christoph Lameter
2006-04-28 7:53 ` Zoltan Menyhart
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox