* Read *pgd again in vhpt_miss handler
@ 2006-04-26 13:46 Zoltan Menyhart
2006-04-26 15:00 ` Chen, Kenneth W
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: Zoltan Menyhart @ 2006-04-26 13:46 UTC (permalink / raw)
To: linux-ia64
I think the *pgd has to be re-read and compared as *pmd, ... are.
("free_pud_range()" includes "pgd_clear(pgd)".)
Thanks,
Zoltan
--- linux-2.6.16.9-save/arch/ia64/kernel/ivt.S 2006-04-21 09:58:55.000000000 +0200
+++ linux-2.6.16.9/arch/ia64/kernel/ivt.S 2006-04-26 15:15:17.000000000 +0200
@@ -138,8 +138,8 @@ ENTRY(vhpt_miss)
(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
;;
-(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5
-(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4]
+(p6) dep r27=r18,r19,3,(PAGE_SHIFT-3) // r27=pgd_offset for region 5
+(p7) dep r27=r18,r17,3,(PAGE_SHIFT-6) // r27=pgd_offset for region[0-4]
cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
#ifdef CONFIG_PGTABLE_4
shr.u r28=r22,PUD_SHIFT // shift pud index into position
@@ -147,11 +147,11 @@ ENTRY(vhpt_miss)
shr.u r18=r22,PMD_SHIFT // shift pmd index into position
#endif
;;
- ld8 r17=[r17] // get *pgd (may be 0)
+ ld8 r30=[r27] // get *pgd (may be 0)
;;
-(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) = NULL?
+(p7) cmp.eq p6,p7=r30,r0 // was pgd_present(*pgd) = NULL?
#ifdef CONFIG_PGTABLE_4
- dep r28=r28,r17,3,(PAGE_SHIFT-3) // r28=pud_offset(pgd,addr)
+ dep r28=r28,r30,3,(PAGE_SHIFT-3) // r28=pud_offset(pgd,addr)
;;
shr.u r18=r22,PMD_SHIFT // shift pmd index into position
(p7) ld8 r29=[r28] // get *pud (may be 0)
@@ -159,7 +159,7 @@ ENTRY(vhpt_miss)
(p7) cmp.eq.or.andcm p6,p7=r29,r0 // was pud_present(*pud) = NULL?
dep r17=r18,r29,3,(PAGE_SHIFT-3) // r17=pmd_offset(pud,addr)
#else
- dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=pmd_offset(pgd,addr)
+ dep r17=r18,r30,3,(PAGE_SHIFT-3) // r17=pmd_offset(pgd,addr)
#endif
;;
(p7) ld8 r20=[r17] // get *pmd (may be 0)
@@ -207,10 +207,12 @@ ENTRY(vhpt_miss)
* between reading the pagetable and the "itc". If so, flush the entry we
* inserted and retry. At this point, we have:
*
+ * r27 = equivalent of pgd_offset(mm, ifa) or pgd_offset_k(ifa)
* r28 = equivalent of pud_offset(pgd, ifa)
* r17 = equivalent of pmd_offset(pud, ifa)
* r21 = equivalent of pte_offset(pmd, ifa)
*
+ * r30 = *pgd
* r29 = *pud
* r20 = *pmd
* r18 = *pte
@@ -220,7 +222,9 @@ ENTRY(vhpt_miss)
#ifdef CONFIG_PGTABLE_4
ld8 r19=[r28] // read *pud again
#endif
- cmp.ne p6,p7=r0,r0
+ ld8 r24=[r27] // read *pgd again
+ ;;
+ cmp.ne p6,p7=r24,r30 // did *pgd change
;;
cmp.ne.or.andcm p6,p7=r26,r20 // did *pmd change
#ifdef CONFIG_PGTABLE_4
^ permalink raw reply [flat|nested] 5+ messages in thread
* RE: Read *pgd again in vhpt_miss handler
2006-04-26 13:46 Read *pgd again in vhpt_miss handler Zoltan Menyhart
@ 2006-04-26 15:00 ` Chen, Kenneth W
2006-04-27 11:04 ` Zoltan Menyhart
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Chen, Kenneth W @ 2006-04-26 15:00 UTC (permalink / raw)
To: linux-ia64
Zoltan Menyhart wrote on Wednesday, April 26, 2006 6:47 AM
> I think the *pgd has to be re-read and compared as *pmd, ... are.
> ("free_pud_range()" includes "pgd_clear(pgd)".)
A more favorable change is to remove comparing pud/pmd entry, but
condition the vhpt TLB purging upon detecting pte modification.
I had this patch in my pocket for a while for other optimizations
I'm working on and it looks like a good timing to post it now.
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
--- ./arch/ia64/kernel/ivt.S.orig 2006-04-26 08:24:33.000000000 -0700
+++ ./arch/ia64/kernel/ivt.S 2006-04-26 08:31:43.000000000 -0700
@@ -216,21 +216,11 @@ ENTRY(vhpt_miss)
* r18 = *pte
*/
ld8 r25=[r21] // read *pte again
- ld8 r26=[r17] // read *pmd again
-#ifdef CONFIG_PGTABLE_4
- ld8 r19=[r28] // read *pud again
-#endif
- cmp.ne p6,p7=r0,r0
;;
- cmp.ne.or.andcm p6,p7=r26,r20 // did *pmd change
-#ifdef CONFIG_PGTABLE_4
- cmp.ne.or.andcm p6,p7=r19,r29 // did *pud change
-#endif
+ cmp.ne p6,p7=r25,r18 // did *pte change
mov r27=PAGE_SHIFT<<2
;;
(p6) ptc.l r22,r27 // purge PTE page translation
-(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did *pte change
- ;;
(p6) ptc.l r16,r27 // purge translation
#endif
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: Read *pgd again in vhpt_miss handler
2006-04-26 13:46 Read *pgd again in vhpt_miss handler Zoltan Menyhart
2006-04-26 15:00 ` Chen, Kenneth W
@ 2006-04-27 11:04 ` Zoltan Menyhart
2006-04-28 1:23 ` Christoph Lameter
2006-04-28 7:53 ` Zoltan Menyhart
3 siblings, 0 replies; 5+ messages in thread
From: Zoltan Menyhart @ 2006-04-27 11:04 UTC (permalink / raw)
To: linux-ia64
Chen, Kenneth W wrote:
> A more favorable change is to remove comparing pud/pmd entry, but
> condition the vhpt TLB purging upon detecting pte modification.
> I had this patch in my pocket for a while for other optimizations
> I'm working on and it looks like a good timing to post it now.
I cannot agree with you.
1. If *pte becomes invalid in the mean time, then the translation
for the PTE page can be still valid.
E.g. the swapper removes PTE-s and purges the translations for
user pages only, never for a PTE page.
2. Let's have a look at your posting on the 30th of March, in the thread
"accessed/dirty bit handler tuning":
> cpu0 cpu1 cpu2
> Vhpt miss:
> walk page table
> free_pgtables
> ptc.g fault address
> ptc.g hash address
> pud_alloc/pmd_alloc
> new page instantiation
> itc.d faulting address
> itc.d hash address
> read pte
> kill tlb for fault addr
> rfi
>
> Touch fault addr
> Walker install the tlb
> with staled vhpt tlb
> -> using someone else's page
> -> data corruption
I did agree with you.
In addition, I wanted to add a protection for protecting the
pgd ... pte chain walking.
I wanted to use the mm semaphore => no need to walk again the
pgd ... pte chain.
I think your new patch widens the security hole.
Regards,
Zoltan
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: Read *pgd again in vhpt_miss handler
2006-04-26 13:46 Read *pgd again in vhpt_miss handler Zoltan Menyhart
2006-04-26 15:00 ` Chen, Kenneth W
2006-04-27 11:04 ` Zoltan Menyhart
@ 2006-04-28 1:23 ` Christoph Lameter
2006-04-28 7:53 ` Zoltan Menyhart
3 siblings, 0 replies; 5+ messages in thread
From: Christoph Lameter @ 2006-04-28 1:23 UTC (permalink / raw)
To: linux-ia64
On Thu, 27 Apr 2006, Zoltan Menyhart wrote:
> I wanted to use the mm semaphore => no need to walk again the
> pgd ... pte chain.
The pgd ... pte chain does not change even without mmap until
the usage of the memory area ceases.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: Read *pgd again in vhpt_miss handler
2006-04-26 13:46 Read *pgd again in vhpt_miss handler Zoltan Menyhart
` (2 preceding siblings ...)
2006-04-28 1:23 ` Christoph Lameter
@ 2006-04-28 7:53 ` Zoltan Menyhart
3 siblings, 0 replies; 5+ messages in thread
From: Zoltan Menyhart @ 2006-04-28 7:53 UTC (permalink / raw)
To: linux-ia64
Christoph Lameter wrote:
> On Thu, 27 Apr 2006, Zoltan Menyhart wrote:
>
>
>>I wanted to use the mm semaphore => no need to walk again the
>>pgd ... pte chain.
>
>
> The pgd ... pte chain does not change even without mmap until
> the usage of the memory area ceases.
It is about un-mapping a zone while another thread faults
on an address belonging to the same zone.
We have got a
rx = ... -> pgd[i] -> pud[j] -> pmd[k] -> pte[l]
chain to walk in the VHPT miss handler.
Having reached somewhere in this chain walking, we have got
the ph. address of the next page in the chain in a register.
Before we can fetch the next item in the chain, "unpredictable
long" time can pass.
In the mean time:
- "free_pgtables()" kills the page we are about to touch.
- Someone re-uses the same page for something else.
As we are still keeping the same ph. address, we fetch an item
from a page that is no more ours.
Even if this security window is small, it does exist.
The probability to hit this bug grows higher on a NUMA machine
with lots of CPUs.
I can accept that the VHPT miss handler cannot be protected by
some locks; it is the other end that should use some "careful
un-mapping" in order to avoid race conditions.
Here is what I'm working on:
PTE, PMD and PUD page usage perfectly fits into the RCU approach:
1. The VHPT miss handler is protected by "rcu_read_lock_bh()".
There is not a single instruction added, the required semantics
is provided by the fact that the interrupts are off.
2. "free_pgtables()" keeps working as today for the non multi-
threaded applications.
3. "free_pgtables()" and its subroutines do not actually free
the PTE, PMD and PUD pages for multi-threaded applications.
These pages will be set free via a "call_rcu_bh()"-activated
service.
(Perhaps, the weaker protection "rcu_read_lock()" - "call_rcu()"
will be enough...)
Please note that:
- The life span of the PTE, PMD and PUD pages is rather long:
they are freed when the usage of the memory area ceases,
provided no other map (using the same PTE, PMD and PUD pages)
is valid.
- The number of the PTE, PMD and PUD pages is much smaller
than that of the leaf pages.
Therefore freeing them is not really performance critical.
As the "call_rcu_bh()"-activated freeing service will do batch
processing, there is a chance that freeing the PTE, PMD and PUD
pages in this way will be more efficient than the "pte_free()"... etc.
services of today are.
Regards,
Zoltan
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2006-04-28 7:53 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-04-26 13:46 Read *pgd again in vhpt_miss handler Zoltan Menyhart
2006-04-26 15:00 ` Chen, Kenneth W
2006-04-27 11:04 ` Zoltan Menyhart
2006-04-28 1:23 ` Christoph Lameter
2006-04-28 7:53 ` Zoltan Menyhart
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox