* [RFC] invalidate_mmap_range() misses remap_file_pages()-affected targets
@ 2003-10-12 8:48 William Lee Irwin III
2003-10-12 10:34 ` William Lee Irwin III
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: William Lee Irwin III @ 2003-10-12 8:48 UTC (permalink / raw)
To: linux-kernel; +Cc: akpm
invalidate_mmap_range(), and hence vmtruncate(), can miss its targets
due to remap_file_pages() disturbing the former invariant of file
offsets only being mapped within vmas tagged as mapping file offset
ranges containing them.
This patch uses the VM_NONLINEAR flag to detect when this could happen,
and does the full pagetable walk over the full range of virtualspace
the vma tracks to ensure that the search proceeds over vmas and virtual
addresses possibly caching file offsets outside vmas' "natural range"
due to remap_file_pages().
A further twist is that remap_file_pages() now needs to set VM_NONLINEAR
in a manner synchronized with invalidate_mmap_range(); this is done by
protecting the setting of VM_NONLINEAR with ->i_shared_sem. This will
suffice to exclude ->populate() even though it doesn't surround it, as
vmtruncate() alters the inode size prior to the invalidation. More
general uses of invalidate_mmap_range() may need to hold it during the
->populate() calls as they may not have protection from the inode size.
Untested, though it appears to compile. The only disturbing signs are
tlb_remove_tlb_entry() on an uncleared pte, which is at variance with
zap_pte_range() (include/asm-ppc64/tlb.h suggests zap_pte_range() is
erroneous) and a semantic question with respect to PTE_FILE ptes, i.e.
whether to clear or ignore (zap_page_range() clears as it stands now).
vs. 2.6.0-test7-bk3
diff -prauN linux-2.6.0-test7-bk3/mm/fremap.c rfp-2.6.0-test7-bk3-1/mm/fremap.c
--- linux-2.6.0-test7-bk3/mm/fremap.c 2003-10-08 12:24:00.000000000 -0700
+++ rfp-2.6.0-test7-bk3-1/mm/fremap.c 2003-10-12 00:48:45.000000000 -0700
@@ -201,9 +201,19 @@ long sys_remap_file_pages(unsigned long
end > start && start >= vma->vm_start &&
end <= vma->vm_end) {
- /* Must set VM_NONLINEAR before any pages are populated. */
- if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff)
+ /*
+ * Must set VM_NONLINEAR before any pages are populated.
+ * Take ->i_shared_sem to lock out invalidate_mmap_range().
+ */
+ if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff) {
+ struct file *file = vma->vm_file;
+ struct address_space *mapping;
+
+ mapping = file->f_dentry->d_inode->i_mapping;
+ down(&mapping->i_shared_sem);
vma->vm_flags |= VM_NONLINEAR;
+ up(&mapping->i_shared_sem);
+ }
/* ->populate can take a long time, so downgrade the lock. */
downgrade_write(&mm->mmap_sem);
diff -prauN linux-2.6.0-test7-bk3/mm/memory.c rfp-2.6.0-test7-bk3-1/mm/memory.c
--- linux-2.6.0-test7-bk3/mm/memory.c 2003-10-08 12:24:04.000000000 -0700
+++ rfp-2.6.0-test7-bk3-1/mm/memory.c 2003-10-12 01:10:30.000000000 -0700
@@ -1077,6 +1077,102 @@ out:
return ret;
}
+static void
+invalidate_mmap_nonlinear_range(struct vm_area_struct *vma,
+ const unsigned long pgoff,
+ const unsigned long len)
+{
+ unsigned long addr;
+ pgd_t *pgd;
+ struct mmu_gather *tlb;
+
+ spin_lock(&vma->vm_mm->page_table_lock);
+ addr = vma->vm_start;
+ pgd = pgd_offset(vma->vm_mm, addr);
+ tlb = tlb_gather_mmu(vma->vm_mm, vma->vm_start);
+
+ tlb_start_vma(tlb, vma);
+ while (1) {
+ pmd_t *pmd;
+
+ if (pgd_none(*pgd)) {
+ addr = (addr + PGDIR_SIZE) & PGDIR_MASK;
+ goto skip_pgd;
+ } else if (pgd_bad(*pgd)) {
+ pgd_ERROR(*pgd);
+ pgd_clear(pgd);
+skip_pgd: addr = (addr + PGDIR_SIZE) & PGDIR_MASK;
+ if (!addr || addr >= vma->vm_end)
+ break;
+ goto next_pgd;
+ }
+
+ pmd = pmd_offset(pgd, addr);
+ do {
+ pte_t *pte;
+
+ if (pmd_none(*pmd)) {
+ goto skip_pmd;
+ } else if (pmd_bad(*pmd)) {
+ pmd_ERROR(*pmd);
+ pmd_clear(pmd);
+skip_pmd: addr = (addr + PMD_SIZE) & PMD_MASK;
+ if (!addr || addr >= vma->vm_end)
+ goto out;
+ goto next_pmd;
+ }
+ pte = pte_offset_map(pmd, addr);
+ do {
+ unsigned long pfn;
+ struct page *page;
+
+ if (pte_none(*pte))
+ goto next_pte;
+ if (!pte_present(*pte)) {
+ unsigned long index;
+ if (!pte_file(*pte))
+ goto next_pte;
+ index = pte_to_pgoff(*pte);
+ if (index >= pgoff &&
+ index - pgoff < len)
+ pte_clear(pte);
+ goto next_pte;
+ }
+ pfn = pte_pfn(*pte);
+ if (!pfn_valid(pfn))
+ goto next_pte;
+ page = pfn_to_page(pfn);
+ if (page->index < pgoff ||
+ page->index - pgoff >= len)
+ goto next_pte;
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ if (pte_dirty(*pte))
+ set_page_dirty(page);
+ if (page->mapping &&
+ pte_young(*pte) &&
+ !PageSwapCache(page))
+ mark_page_accessed(page);
+ tlb->freed++;
+ page_remove_rmap(page, pte);
+ tlb_remove_page(tlb, page);
+ pte_clear(pte);
+next_pte: addr += PAGE_SIZE;
+ if (addr >= vma->vm_end) {
+ pte_unmap(pte);
+ goto out;
+ }
+ ++pte;
+ } while ((unsigned long)pte & PTE_TABLE_MASK);
+ pte_unmap(pte - 1);
+next_pmd: ++pmd;
+ } while ((unsigned long)pmd & PMD_TABLE_MASK);
+next_pgd: ++pgd;
+ }
+out: tlb_end_vma(tlb, vma);
+ tlb_finish_mmu(tlb, vma->vm_start, vma->vm_end);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+}
+
/*
* Helper function for invalidate_mmap_range().
* Both hba and hlen are page numbers in PAGE_SIZE units.
@@ -1100,6 +1196,10 @@ invalidate_mmap_range_list(struct list_h
hea = ULONG_MAX;
list_for_each(curr, head) {
vp = list_entry(curr, struct vm_area_struct, shared);
+ if (unlikely(vp->vm_flags & VM_NONLINEAR)) {
+ invalidate_mmap_nonlinear_range(vp, hba, hlen);
+ continue;
+ }
vba = vp->vm_pgoff;
vea = vba + ((vp->vm_end - vp->vm_start) >> PAGE_SHIFT) - 1;
if (hea < vba || vea < hba)
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC] invalidate_mmap_range() misses remap_file_pages()-affected targets
2003-10-12 8:48 [RFC] invalidate_mmap_range() misses remap_file_pages()-affected targets William Lee Irwin III
@ 2003-10-12 10:34 ` William Lee Irwin III
2003-10-12 11:56 ` Andrew Morton
2003-10-12 11:53 ` Andrew Morton
2003-10-12 20:28 ` Rik van Riel
2 siblings, 1 reply; 9+ messages in thread
From: William Lee Irwin III @ 2003-10-12 10:34 UTC (permalink / raw)
To: linux-kernel, akpm
On Sun, Oct 12, 2003 at 01:48:42AM -0700, William Lee Irwin III wrote:
> invalidate_mmap_range(), and hence vmtruncate(), can miss its targets
> due to remap_file_pages() disturbing the former invariant of file
> offsets only being mapped within vmas tagged as mapping file offset
> ranges containing them.
It would seem that mincore() shares a similar issue on account of its
algorithm (surely concocted as defensive programming for deadlock
avoidance). The following patch checks pagetable entries in a path
where copy_to_user() is not involved. The cases where the results would
differ are when nonlinearly mapped pages are paged out and have
pte_file(*pte) true, or when vma protections differ from the ptes that
remap_file_pages() has instantiated. This is dealt with by holding
->i_shared_sem during mincore_page() and carrying out a pagetable
lookup for every virtual page when testing presence in nonlinear vmas.
Untested. The disturbing questions are whether mincore_page() should
actually return -ENOMEM when asked to report presence on vmas with
vma->vm_file == NULL, and, of course, the usual confusion over what on
earth the VM_MAY* flags really are.
vs. 2.6.0-test7-bk3
diff -prauN rfp-2.6.0-test7-bk3-1/mm/mincore.c rfp-2.6.0-test7-bk3-2/mm/mincore.c
--- rfp-2.6.0-test7-bk3-1/mm/mincore.c 2003-10-08 12:24:51.000000000 -0700
+++ rfp-2.6.0-test7-bk3-2/mm/mincore.c 2003-10-12 02:17:19.000000000 -0700
@@ -22,7 +22,7 @@
* and is up to date; i.e. that no page-in operation would be required
* at this time if an application were to map and access this page.
*/
-static unsigned char mincore_page(struct vm_area_struct * vma,
+static unsigned char mincore_linear_page(struct vm_area_struct *vma,
unsigned long pgoff)
{
unsigned char present = 0;
@@ -38,6 +38,58 @@ static unsigned char mincore_page(struct
return present;
}
+static unsigned char mincore_nonlinear_page(struct vm_area_struct *vma,
+ unsigned long pgoff)
+{
+ unsigned char present = 0;
+ unsigned long vaddr;
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ spin_lock(&vma->vm_mm->page_table_lock);
+ vaddr = PAGE_SIZE*(pgoff - vma->vm_pgoff) + vma->vm_start;
+ pgd = pgd_offset(vma->vm_mm, vaddr);
+ if (pgd_none(*pgd))
+ goto out;
+ else if (pgd_bad(*pgd)) {
+ pgd_ERROR(*pgd);
+ pgd_clear(pgd);
+ goto out;
+ }
+ pmd = pmd_offset(pgd, vaddr);
+ if (pmd_none(*pmd))
+ goto out;
+ else if (pmd_ERROR(*pmd)) {
+ pmd_ERROR(*pmd);
+ pmd_clear(pmd);
+ goto out;
+ }
+ pte = pte_offset_map(pmd, vaddr);
+ if (pte_file(*pte))
+ present = mincore_linear_page(vma, pte_to_pgoff(*pte));
+ else if (!(vma->vm_flags | (VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE)))
+ present = pte_present(*pte);
+ else
+ present = mincore_linear_page(vma, pgoff);
+ pte_unmap(pte);
+out:
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return present;
+}
+
+static inline unsigned char mincore_page(struct vm_area_struct *vma,
+ unsigned long pgoff)
+{
+ struct address_space *as = vma->vm_file->f_dentry->d_inode->i_mapping;
+ down(&as->i_shared_sem);
+ if (vma->vm_flags & VM_NONLINEAR)
+ return mincore_nonlinear_page(vma, pgoff);
+ else
+ return mincore_linear_page(vma, pgoff);
+ up(&as->i_shared_sem);
+}
+
static long mincore_vma(struct vm_area_struct * vma,
unsigned long start, unsigned long end, unsigned char __user * vec)
{
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC] invalidate_mmap_range() misses remap_file_pages()-affected targets
2003-10-12 8:48 [RFC] invalidate_mmap_range() misses remap_file_pages()-affected targets William Lee Irwin III
2003-10-12 10:34 ` William Lee Irwin III
@ 2003-10-12 11:53 ` Andrew Morton
2003-10-12 19:38 ` William Lee Irwin III
2003-10-12 20:28 ` Rik van Riel
2 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2003-10-12 11:53 UTC (permalink / raw)
To: William Lee Irwin III; +Cc: linux-kernel
William Lee Irwin III <wli@holomorphy.com> wrote:
>
> invalidate_mmap_range(), and hence vmtruncate(), can miss its targets
> due to remap_file_pages() disturbing the former invariant of file
> offsets only being mapped within vmas tagged as mapping file offset
> ranges containing them.
I was going to just not bother about this wart. After all, we get to write
the standard on remap_file_pages(), and we can say "the
truncate-causes-SIGBUS thing doesn't work". After all, it is not very
useful.
But I wonder if this effect could be used maliciously. Say, user A has
read-only access to user B's file, and uses that access to set up a
nonlinear mapping thereby causing user B's truncate to not behave
correctly. But this example is OK, isn't it? User A will just receive an
anonymous page for his troubles.
Can you think of any stability or security scenario which says that we
_should_ implement the conventional truncate behaviour?
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC] invalidate_mmap_range() misses remap_file_pages()-affected targets
2003-10-12 10:34 ` William Lee Irwin III
@ 2003-10-12 11:56 ` Andrew Morton
2003-10-12 19:51 ` William Lee Irwin III
0 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2003-10-12 11:56 UTC (permalink / raw)
To: William Lee Irwin III; +Cc: linux-kernel
William Lee Irwin III <wli@holomorphy.com> wrote:
>
> It would seem that mincore() shares a similar issue on account of its
> algorithm
Yes, I think it makes sense to fix mincore(). I don't fully understand the
reasoning behind the three cases though:
+ if (pte_file(*pte))
+ present = mincore_linear_page(vma, pte_to_pgoff(*pte));
+ else if (!(vma->vm_flags | (VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE)))
+ present = pte_present(*pte);
+ else
+ present = mincore_linear_page(vma, pgoff);
Could you explain the logic behind each of these? Perhaps with permanent
comments?
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC] invalidate_mmap_range() misses remap_file_pages()-affected targets
2003-10-12 11:53 ` Andrew Morton
@ 2003-10-12 19:38 ` William Lee Irwin III
0 siblings, 0 replies; 9+ messages in thread
From: William Lee Irwin III @ 2003-10-12 19:38 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
On Sun, Oct 12, 2003 at 04:53:32AM -0700, Andrew Morton wrote:
> I was going to just not bother about this wart. After all, we get to write
> the standard on remap_file_pages(), and we can say "the
> truncate-causes-SIGBUS thing doesn't work". After all, it is not very
> useful.
> But I wonder if this effect could be used maliciously. Say, user A has
> read-only access to user B's file, and uses that access to set up a
> nonlinear mapping thereby causing user B's truncate to not behave
> correctly. But this example is OK, isn't it? User A will just receive an
> anonymous page for his troubles.
> Can you think of any stability or security scenario which says that we
> _should_ implement the conventional truncate behaviour?
At some point we burned a bit of effort to ensure we wiped all the ptes
and all the faulting looped until we finished when we vmtruncate();
not handling this case is a loophole in that, and if anything assumes it won't
find vmtruncate()'s orphans in-kernel, it will break. Apart from that
it's not so large an issue. I'm not so much attached to coddling
userspace (remap_file_pages() users should not be naive) as to shoring
up invalidate_mmap_range() with its intent (unmapping file offset ranges
instead of virtualspace ranges). Someone else might scream later.
Also, tlb_remove_tlb_entry()'s ordering with ptep_get_and_clear() seems
to be handled on ppc* by not having ptep_get_and_clear() fully clear the
pte, which is disturbing, but a sign ppc* maintainers already handle it.
-- wli
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC] invalidate_mmap_range() misses remap_file_pages()-affected targets
2003-10-12 11:56 ` Andrew Morton
@ 2003-10-12 19:51 ` William Lee Irwin III
2003-10-13 0:59 ` William Lee Irwin III
0 siblings, 1 reply; 9+ messages in thread
From: William Lee Irwin III @ 2003-10-12 19:51 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
William Lee Irwin III <wli@holomorphy.com> wrote:
>> It would seem that mincore() shares a similar issue on account of its
>> algorithm
On Sun, Oct 12, 2003 at 04:56:44AM -0700, Andrew Morton wrote:
> Yes, I think it makes sense to fix mincore(). I don't fully understand the
> reasoning behind the three cases though:
Probably because the second case is broken.
William Lee Irwin III <wli@holomorphy.com> wrote:
> + if (pte_file(*pte))
> + present = mincore_linear_page(vma, pte_to_pgoff(*pte));
> + else if (!(vma->vm_flags | (VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE)))
> + present = pte_present(*pte);
> + else
> + present = mincore_linear_page(vma, pgoff);
On Sun, Oct 12, 2003 at 04:56:44AM -0700, Andrew Morton wrote:
> Could you explain the logic behind each of these? Perhaps with permanent
> comments?
I think the PROT_NONE handling is bogus; since we have the pte, we can
just override the pgoff-based lookup if the pte is present and ignore
vm_flags, and the behavior isn't unique to PROT_NONE anyway. The prior
post flubs the case of pte_present(*pte) with unaligned pgoff and rw
or ro vma protection.
Mike Galbraith also spotted a typical locking booboo in mincore_page(),
fixed here.
-- wli
diff -prauN rfp-2.6.0-test7-bk3-1/mm/mincore.c rfp-2.6.0-test7-bk3-2/mm/mincore.c
--- rfp-2.6.0-test7-bk3-1/mm/mincore.c 2003-10-08 12:24:51.000000000 -0700
+++ rfp-2.6.0-test7-bk3-2/mm/mincore.c 2003-10-12 12:36:52.000000000 -0700
@@ -22,7 +22,7 @@
* and is up to date; i.e. that no page-in operation would be required
* at this time if an application were to map and access this page.
*/
-static unsigned char mincore_page(struct vm_area_struct * vma,
+static unsigned char mincore_linear_page(struct vm_area_struct *vma,
unsigned long pgoff)
{
unsigned char present = 0;
@@ -38,6 +38,67 @@ static unsigned char mincore_page(struct
return present;
}
+static unsigned char mincore_nonlinear_page(struct vm_area_struct *vma,
+ unsigned long pgoff)
+{
+ unsigned char present = 0;
+ unsigned long vaddr;
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ spin_lock(&vma->vm_mm->page_table_lock);
+ vaddr = PAGE_SIZE*(pgoff - vma->vm_pgoff) + vma->vm_start;
+ pgd = pgd_offset(vma->vm_mm, vaddr);
+ if (pgd_none(*pgd))
+ goto out;
+ else if (pgd_bad(*pgd)) {
+ pgd_ERROR(*pgd);
+ pgd_clear(pgd);
+ goto out;
+ }
+ pmd = pmd_offset(pgd, vaddr);
+ if (pmd_none(*pmd))
+ goto out;
+ else if (pmd_ERROR(*pmd)) {
+ pmd_ERROR(*pmd);
+ pmd_clear(pmd);
+ goto out;
+ }
+
+ pte = pte_offset_map(pmd, vaddr);
+
+ /* PTE_FILE ptes have the same file, but pgoff can differ */
+ if (pte_file(*pte))
+ present = mincore_linear_page(vma, pte_to_pgoff(*pte));
+
+ /* pte presence overrides the calculated offset */
+ else if (pte_present(*pte))
+ present = 1;
+
+ /* matching offsets are the faulted in if the pte isn't set */
+ else
+ present = mincore_linear_page(vma, pgoff);
+ pte_unmap(pte);
+out:
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return present;
+}
+
+static inline unsigned char mincore_page(struct vm_area_struct *vma,
+ unsigned long pgoff)
+{
+ unsigned char ret;
+ struct address_space *as = vma->vm_file->f_dentry->d_inode->i_mapping;
+ down(&as->i_shared_sem);
+ if (vma->vm_flags & VM_NONLINEAR)
+ ret = mincore_nonlinear_page(vma, pgoff);
+ else
+ ret = mincore_linear_page(vma, pgoff);
+ up(&as->i_shared_sem);
+ return ret;
+}
+
static long mincore_vma(struct vm_area_struct * vma,
unsigned long start, unsigned long end, unsigned char __user * vec)
{
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC] invalidate_mmap_range() misses remap_file_pages()-affected targets
2003-10-12 8:48 [RFC] invalidate_mmap_range() misses remap_file_pages()-affected targets William Lee Irwin III
2003-10-12 10:34 ` William Lee Irwin III
2003-10-12 11:53 ` Andrew Morton
@ 2003-10-12 20:28 ` Rik van Riel
2003-10-12 21:19 ` William Lee Irwin III
2 siblings, 1 reply; 9+ messages in thread
From: Rik van Riel @ 2003-10-12 20:28 UTC (permalink / raw)
To: William Lee Irwin III; +Cc: linux-kernel, akpm
On Sun, 12 Oct 2003, William Lee Irwin III wrote:
> invalidate_mmap_range(), and hence vmtruncate(), can miss its targets
> due to remap_file_pages()
Please don't. Remap_file_pages() not 100% working the way
a normal mmap() works should be a case of "doctor, it hurts".
Making the VM more complex just to support the (allegedly
low overhead) hack of remap_file_pages() doesn't seem like
a worthwhile tradeoff to me.
In fact, I wouldn't mind if remap_file_pages() was simplified ;)
--
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC] invalidate_mmap_range() misses remap_file_pages()-affected targets
2003-10-12 20:28 ` Rik van Riel
@ 2003-10-12 21:19 ` William Lee Irwin III
0 siblings, 0 replies; 9+ messages in thread
From: William Lee Irwin III @ 2003-10-12 21:19 UTC (permalink / raw)
To: Rik van Riel; +Cc: linux-kernel, akpm
On Sun, 12 Oct 2003, William Lee Irwin III wrote:
>> invalidate_mmap_range(), and hence vmtruncate(), can miss its targets
>> due to remap_file_pages()
On Sun, Oct 12, 2003 at 04:28:09PM -0400, Rik van Riel wrote:
> Please don't. Remap_file_pages() not 100% working the way
> a normal mmap() works should be a case of "doctor, it hurts".
> Making the VM more complex just to support the (allegedly
> low overhead) hack of remap_file_pages() doesn't seem like
> a worthwhile tradeoff to me.
> In fact, I wouldn't mind if remap_file_pages() was simplified ;)
I'm far less concerned about userspace shooting itself in the foot
than I am the kernel.
At some point a decision was made to at least try to prevent orphaned
pages arising from vmtruncate() vs. ->nopage(), with some userspace
semantic motive I'm not concerned about, and to mitigate or possibly
eliminate the need to handle the orphaned pages in-kernel, which is my
concern. This tries to finish getting rid of Morton pages.
The only complexity to be concerned about here is algorithmic; a hotly
contended lock is taken in the VM_NONLINEAR setting, and the pagetable
scan to find pages at vm_pgoff-unaligned ptes is an exhaustive search.
The algorithm itself is a trivial derivative of zap_page_range() that
just checks page->index before unmapping pages and is no cause for
concern with respect to complexity of implementation.
I appreciate the desire for simplicity in general, but walking
pagetables when needed isn't complex, especially with such a large
cut and paste component. The proper interpretation of this is as an
attempt to complete the simplification of eliminating Morton pages.
-- wli
(Prior to the attempt that was merged, there was a tradeoff between
best effort search for the ptes and just deliberately letting Morton
pages happen. Since it was merged, it's become a core kernel semantic
question: i.e. is the vmtruncate() atomicity solely for the benefit of
"naive userspace", or is it a new kernel invariant? I tend to favor
consistency, but it's ultimately arbitrary, hence [RFC].)
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC] invalidate_mmap_range() misses remap_file_pages()-affected targets
2003-10-12 19:51 ` William Lee Irwin III
@ 2003-10-13 0:59 ` William Lee Irwin III
0 siblings, 0 replies; 9+ messages in thread
From: William Lee Irwin III @ 2003-10-13 0:59 UTC (permalink / raw)
To: Andrew Morton, linux-kernel
On Sun, Oct 12, 2003 at 12:51:46PM -0700, William Lee Irwin III wrote:
> + /* PTE_FILE ptes have the same file, but pgoff can differ */
> + if (pte_file(*pte))
> + present = mincore_linear_page(vma, pte_to_pgoff(*pte));
> + /* pte presence overrides the calculated offset */
> + else if (pte_present(*pte))
> + present = 1;
> + /* matching offsets are the faulted in if the pte isn't set */
> + else
> + present = mincore_linear_page(vma, pgoff);
This is broken; _PAGE_FILE can alias other flag bits used when the page
is present, i.e. presence must always be checked first. Amended patch
below.
I noticed another disturbing oddity after the anonymous vma issue:
The mincore function requests a vector describing which pages of a file
are in core and can be read without disk access. The kernel will supply
Strictly interpreted, this suggests PROT_NONE should be reported as not
present by mincore() regardless of presence in the pagecache.
-- wli
diff -prauN rfp-2.6.0-test7-bk3-1/mm/mincore.c rfp-2.6.0-test7-bk3-2/mm/mincore.c
--- rfp-2.6.0-test7-bk3-1/mm/mincore.c 2003-10-08 12:24:51.000000000 -0700
+++ rfp-2.6.0-test7-bk3-2/mm/mincore.c 2003-10-12 17:43:50.000000000 -0700
@@ -22,7 +22,7 @@
* and is up to date; i.e. that no page-in operation would be required
* at this time if an application were to map and access this page.
*/
-static unsigned char mincore_page(struct vm_area_struct * vma,
+static unsigned char mincore_linear_page(struct vm_area_struct *vma,
unsigned long pgoff)
{
unsigned char present = 0;
@@ -38,6 +38,68 @@ static unsigned char mincore_page(struct
return present;
}
+static unsigned char mincore_nonlinear_page(struct vm_area_struct *vma,
+ unsigned long pgoff)
+{
+ unsigned char present = 0;
+ unsigned long vaddr;
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ spin_lock(&vma->vm_mm->page_table_lock);
+ vaddr = PAGE_SIZE*(pgoff - vma->vm_pgoff) + vma->vm_start;
+ pgd = pgd_offset(vma->vm_mm, vaddr);
+ if (pgd_none(*pgd))
+ goto out;
+ else if (pgd_bad(*pgd)) {
+ pgd_ERROR(*pgd);
+ pgd_clear(pgd);
+ goto out;
+ }
+ pmd = pmd_offset(pgd, vaddr);
+ if (pmd_none(*pmd))
+ goto out;
+ else if (pmd_ERROR(*pmd)) {
+ pmd_ERROR(*pmd);
+ pmd_clear(pmd);
+ goto out;
+ }
+
+ pte = pte_offset_map(pmd, vaddr);
+
+ /* pte presence overrides the calculated offset */
+ if (pte_present(*pte))
+ present = 1;
+
+ /* PTE_FILE ptes have the same file, but pgoff can differ */
+ else if (pte_file(*pte))
+ present = mincore_linear_page(vma, pte_to_pgoff(*pte));
+
+ /* matching offsets are the faulted in if the pte isn't set */
+ else
+ present = mincore_linear_page(vma, pgoff);
+
+ pte_unmap(pte);
+out:
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return present;
+}
+
+static inline unsigned char mincore_page(struct vm_area_struct *vma,
+ unsigned long pgoff)
+{
+ unsigned char ret;
+ struct address_space *as = vma->vm_file->f_dentry->d_inode->i_mapping;
+ down(&as->i_shared_sem);
+ if (vma->vm_flags & VM_NONLINEAR)
+ ret = mincore_nonlinear_page(vma, pgoff);
+ else
+ ret = mincore_linear_page(vma, pgoff);
+ up(&as->i_shared_sem);
+ return ret;
+}
+
static long mincore_vma(struct vm_area_struct * vma,
unsigned long start, unsigned long end, unsigned char __user * vec)
{
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2003-10-13 0:56 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2003-10-12 8:48 [RFC] invalidate_mmap_range() misses remap_file_pages()-affected targets William Lee Irwin III
2003-10-12 10:34 ` William Lee Irwin III
2003-10-12 11:56 ` Andrew Morton
2003-10-12 19:51 ` William Lee Irwin III
2003-10-13 0:59 ` William Lee Irwin III
2003-10-12 11:53 ` Andrew Morton
2003-10-12 19:38 ` William Lee Irwin III
2003-10-12 20:28 ` Rik van Riel
2003-10-12 21:19 ` William Lee Irwin III
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox