* [RFC][PATCH 1/9] /proc/pid/pagemap update
@ 2007-08-21 20:42 Dave Hansen
2007-08-21 20:42 ` [RFC][PATCH 2/9] pagemap: remove file header Dave Hansen
` (9 more replies)
0 siblings, 10 replies; 22+ messages in thread
From: Dave Hansen @ 2007-08-21 20:42 UTC (permalink / raw)
To: mpm; +Cc: linux-mm, Dave Hansen
This is a series of patches to update /proc/pid/pagemap,
to simplify the code, and fix bugs which caused userspace
memory corruption.
Since it is just in -mm, we should probably roll all of
these together and just update it all at once, or send
a simple drop-in replacement patch. These patches are
all here mostly to help with review.
Matt, if you're OK with these, do you mind if I send
the update into -mm, or would you like to do it?
--
From: Matt Mackall <mpm@selenic.com>
On Mon, Aug 06, 2007 at 01:44:19AM -0500, Dave Boutcher wrote:
>
> Matt, this patch set replaces the two patches I sent earlier and
> contains additional fixes. I've done some reasonably rigorous testing
> on x86_64, but not on a 32 bit arch. I'm pretty sure this isn't worse
> than what's in mm right now, which has some user-space corruption and
> a nasty infinite kernel loop. YMMV.
Dave, here's my current work-in-progress patch to deal with a couple
locking issues, primarily a possible deadlock on the mm semaphore that
can occur if someone unmaps the target buffer while we're walking the
tree. It currently hangs on my box and I haven't had any free cycles
to finish debugging it, but you might want to take a peek at it.
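For orientation, the shape this patch moves to is roughly the sketch
below: pin the reader's destination buffer with get_user_pages() before
taking the target mm's mmap_sem, write entries with __put_user() during
the walk, then dirty and release the pinned pages. This is only a
simplified sketch with the walk and error handling elided; the names
mirror the patch.

/* Simplified sketch of the buffer-pinning structure used below;
 * the actual page-table walk and most error handling are elided. */
static long pinned_buffer_walk(struct mm_struct *mm, unsigned long uaddr,
                               int pagecount, struct page **pages)
{
        long ret;
        int i;

        /* Pin the reader's destination buffer up front. */
        down_read(&current->mm->mmap_sem);
        ret = get_user_pages(current, current->mm, uaddr, pagecount,
                             1 /* write */, 0 /* force */, pages, NULL);
        up_read(&current->mm->mmap_sem);
        if (ret < 0)
                return ret;

        /* Walk the target's page tables, __put_user()ing entries. */
        down_read(&mm->mmap_sem);
        /* ... pagemap_fill() / walk over the vmas happens here ... */
        up_read(&mm->mmap_sem);

        /* Dirty and release the pinned pages. */
        for (i = 0; i < pagecount; i++) {
                if (!PageReserved(pages[i]))
                        SetPageDirty(pages[i]);
                page_cache_release(pages[i]);
        }
        return 0;
}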
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/proc/task_mmu.c | 121 ++++++++++++++++++++------------------------
1 file changed, 55 insertions(+), 66 deletions(-)
diff -puN fs/proc/task_mmu.c~Re-_PATCH_0_3_proc_pid_pagemap_fixes fs/proc/task_mmu.c
--- lxc/fs/proc/task_mmu.c~Re-_PATCH_0_3_proc_pid_pagemap_fixes 2007-08-21 13:30:50.000000000 -0700
+++ lxc-dave/fs/proc/task_mmu.c 2007-08-21 13:30:50.000000000 -0700
@@ -501,37 +501,21 @@ const struct file_operations proc_clear_
};
struct pagemapread {
- struct mm_struct *mm;
unsigned long next;
- unsigned long *buf;
- pte_t *ptebuf;
unsigned long pos;
size_t count;
int index;
- char __user *out;
+ unsigned long __user *out;
};
-static int flush_pagemap(struct pagemapread *pm)
-{
- int n = min(pm->count, pm->index * sizeof(unsigned long));
- if (copy_to_user(pm->out, pm->buf, n))
- return -EFAULT;
- pm->out += n;
- pm->pos += n;
- pm->count -= n;
- pm->index = 0;
- cond_resched();
- return 0;
-}
-
static int add_to_pagemap(unsigned long addr, unsigned long pfn,
struct pagemapread *pm)
{
- pm->buf[pm->index++] = pfn;
+ __put_user(pfn, pm->out);
+ pm->out++;
+ pm->pos += sizeof(unsigned long);
+ pm->count -= sizeof(unsigned long);
pm->next = addr + PAGE_SIZE;
- if (pm->index * sizeof(unsigned long) >= PAGE_SIZE ||
- pm->index * sizeof(unsigned long) >= pm->count)
- return flush_pagemap(pm);
return 0;
}
@@ -543,14 +527,6 @@ static int pagemap_pte_range(pmd_t *pmd,
int err;
pte = pte_offset_map(pmd, addr);
-
-#ifdef CONFIG_HIGHPTE
- /* copy PTE directory to temporary buffer and unmap it */
- memcpy(pm->ptebuf, pte, PAGE_ALIGN((unsigned long)pte) - (unsigned long)pte);
- pte_unmap(pte);
- pte = pm->ptebuf;
-#endif
-
for (; addr != end; pte++, addr += PAGE_SIZE) {
if (addr < pm->next)
continue;
@@ -560,11 +536,12 @@ static int pagemap_pte_range(pmd_t *pmd,
err = add_to_pagemap(addr, pte_pfn(*pte), pm);
if (err)
return err;
+ if (pm->count == 0)
+ break;
}
-
-#ifndef CONFIG_HIGHPTE
pte_unmap(pte - 1);
-#endif
+
+ cond_resched();
return 0;
}
@@ -573,7 +550,7 @@ static int pagemap_fill(struct pagemapre
{
int ret;
- while (pm->next != end) {
+ while (pm->next != end && pm->count > 0) {
ret = add_to_pagemap(pm->next, -1UL, pm);
if (ret)
return ret;
@@ -608,15 +585,16 @@ static ssize_t pagemap_read(struct file
{
struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
unsigned long src = *ppos;
- unsigned long *page;
- unsigned long addr, end, vend, svpfn, evpfn;
+ struct page **pages, *page;
+ unsigned long addr, end, vend, svpfn, evpfn, uaddr, uend;
struct mm_struct *mm;
struct vm_area_struct *vma;
struct pagemapread pm;
+ int pagecount;
int ret = -ESRCH;
if (!task)
- goto out_no_task;
+ goto out;
ret = -EACCES;
if (!ptrace_may_attach(task))
@@ -628,39 +606,43 @@ static ssize_t pagemap_read(struct file
if ((svpfn + 1) * sizeof(unsigned long) != src)
goto out;
evpfn = min((src + count) / sizeof(unsigned long) - 1,
- ((~0UL) >> PAGE_SHIFT) + 1);
+ ((~0UL) >> PAGE_SHIFT) + 1) - 1;
count = (evpfn - svpfn) * sizeof(unsigned long);
end = PAGE_SIZE * evpfn;
-
- ret = -ENOMEM;
- page = kzalloc(PAGE_SIZE, GFP_USER);
- if (!page)
- goto out;
-
-#ifdef CONFIG_HIGHPTE
- pm.ptebuf = kzalloc(PAGE_SIZE, GFP_USER);
- if (!pm.ptebuf)
- goto out_free;
-#endif
+ //printk("src %ld svpfn %d evpfn %d count %d\n", src, svpfn, evpfn, count);
ret = 0;
mm = get_task_mm(task);
if (!mm)
- goto out_freepte;
+ goto out;
+
+ ret = -ENOMEM;
+ uaddr = (unsigned long)buf & ~(PAGE_SIZE-1);
+ uend = (unsigned long)(buf + count);
+ pagecount = (uend - uaddr + PAGE_SIZE-1) / PAGE_SIZE;
+ pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ goto out_task;
+
+ down_read(&current->mm->mmap_sem);
+ ret = get_user_pages(current, current->mm, uaddr, pagecount,
+ 1, 0, pages, NULL);
+ up_read(&current->mm->mmap_sem);
+
+ //printk("%x(%x):%x %d@%ld (%d pages) -> %d\n", uaddr, buf, uend, count, src, pagecount, ret);
+ if (ret < 0)
+ goto out_free;
- pm.mm = mm;
pm.next = addr;
- pm.buf = page;
pm.pos = src;
pm.count = count;
- pm.index = 0;
- pm.out = buf;
+ pm.out = (unsigned long __user *)buf;
if (svpfn == -1) {
- ((char *)page)[0] = (ntohl(1) != 1);
- ((char *)page)[1] = PAGE_SHIFT;
- ((char *)page)[2] = sizeof(unsigned long);
- ((char *)page)[3] = sizeof(unsigned long);
+ put_user((char)(ntohl(1) != 1), buf);
+ put_user((char)PAGE_SHIFT, buf + 1);
+ put_user((char)sizeof(unsigned long), buf + 2);
+ put_user((char)sizeof(unsigned long), buf + 3);
add_to_pagemap(pm.next, page[0], &pm);
}
@@ -669,7 +651,8 @@ static ssize_t pagemap_read(struct file
while (pm.count > 0 && vma) {
if (!ptrace_may_attach(task)) {
ret = -EIO;
- goto out_mm;
+ up_read(&mm->mmap_sem);
+ goto out_release;
}
vend = min(vma->vm_end - 1, end - 1) + 1;
ret = pagemap_fill(&pm, vend);
@@ -682,23 +665,29 @@ static ssize_t pagemap_read(struct file
}
up_read(&mm->mmap_sem);
+ //printk("before fill at %ld\n", pm.pos);
ret = pagemap_fill(&pm, end);
+ printk("after fill at %ld\n", pm.pos);
*ppos = pm.pos;
if (!ret)
ret = pm.pos - src;
-out_mm:
+out_release:
+ printk("releasing pages\n");
+ for (; pagecount; pagecount--) {
+ page = pages[pagecount-1];
+ if (!PageReserved(page))
+ SetPageDirty(page);
+ page_cache_release(page);
+ }
mmput(mm);
-out_freepte:
-#ifdef CONFIG_HIGHPTE
- kfree(pm.ptebuf);
out_free:
-#endif
- kfree(page);
-out:
+ kfree(pages);
+out_task:
put_task_struct(task);
-out_no_task:
+out:
+ printk("returning\n");
return ret;
}
_
* [RFC][PATCH 2/9] pagemap: remove file header
2007-08-21 20:42 [RFC][PATCH 1/9] /proc/pid/pagemap update Dave Hansen
@ 2007-08-21 20:42 ` Dave Hansen
2007-08-21 21:24 ` Matt Mackall
2007-08-21 20:42 ` [RFC][PATCH 3/9] pagemap: use PAGE_MASK/PAGE_ALIGN() Dave Hansen
` (8 subsequent siblings)
9 siblings, 1 reply; 22+ messages in thread
From: Dave Hansen @ 2007-08-21 20:42 UTC (permalink / raw)
To: mpm; +Cc: linux-mm, Dave Hansen
The /proc/<pid>/pagemap file has a header containing:
* first byte: 0 for big endian, 1 for little
* second byte: page shift (eg 12 for 4096 byte pages)
* third byte: entry size in bytes (currently either 4 or 8)
* fourth byte: header size
The endianness is only useful when examining a raw dump of
pagemap from a different machine when you don't know the
source of the file. This is pretty rare, and the programs
or scripts doing the copying off-machine can certainly be
made to hold this information.
The page size is available in userspace at least with libc's
getpagesize(). This will also never vary across processes,
so putting it in a per-process file doesn't make any difference.
If we need a "kernel's page size" exported to userspace,
perhaps we can put it in /proc/meminfo.
The entry size is the really tricky one. This can't just
be sizeof(unsigned long) from userspace because we can have
32-bit processes on 64-bit kernels. But, userspace can
certainly derive this value if it lseek()s to the end of
the file, and divides the file position by the size of its
virtual address space.
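For illustration, such a derivation could look roughly like the sketch
below. It is only a sketch: it assumes the file reports its length to
lseek(), and that the caller already knows the target's virtual address
space size by some other means.

#include <fcntl.h>
#include <unistd.h>

/* Sketch: entry size = file length / number of virtual pages. */
static long pagemap_entry_size(const char *pagemap_path,
                               unsigned long task_size)
{
        int fd = open(pagemap_path, O_RDONLY);
        off_t end;

        if (fd < 0)
                return -1;
        end = lseek(fd, 0, SEEK_END);
        close(fd);
        if (end <= 0)
                return -1;
        return end / (task_size / getpagesize());
}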
In any case, I believe this information is redundant, and
can be removed.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/proc/task_mmu.c | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)
diff -puN fs/proc/task_mmu.c~pagemap-no-header fs/proc/task_mmu.c
--- lxc/fs/proc/task_mmu.c~pagemap-no-header 2007-08-21 13:30:50.000000000 -0700
+++ lxc-dave/fs/proc/task_mmu.c 2007-08-21 13:30:50.000000000 -0700
@@ -601,12 +601,12 @@ static ssize_t pagemap_read(struct file
goto out;
ret = -EIO;
- svpfn = src / sizeof(unsigned long) - 1;
+ svpfn = src / sizeof(unsigned long);
addr = PAGE_SIZE * svpfn;
- if ((svpfn + 1) * sizeof(unsigned long) != src)
+ if (svpfn * sizeof(unsigned long) != src)
goto out;
evpfn = min((src + count) / sizeof(unsigned long) - 1,
- ((~0UL) >> PAGE_SHIFT) + 1) - 1;
+ ((~0UL) >> PAGE_SHIFT) + 1);
count = (evpfn - svpfn) * sizeof(unsigned long);
end = PAGE_SIZE * evpfn;
//printk("src %ld svpfn %d evpfn %d count %d\n", src, svpfn, evpfn, count);
@@ -638,14 +638,6 @@ static ssize_t pagemap_read(struct file
pm.count = count;
pm.out = (unsigned long __user *)buf;
- if (svpfn == -1) {
- put_user((char)(ntohl(1) != 1), buf);
- put_user((char)PAGE_SHIFT, buf + 1);
- put_user((char)sizeof(unsigned long), buf + 2);
- put_user((char)sizeof(unsigned long), buf + 3);
- add_to_pagemap(pm.next, page[0], &pm);
- }
-
down_read(&mm->mmap_sem);
vma = find_vma(mm, pm.next);
while (pm.count > 0 && vma) {
_
* [RFC][PATCH 3/9] pagemap: use PAGE_MASK/PAGE_ALIGN()
2007-08-21 20:42 [RFC][PATCH 1/9] /proc/pid/pagemap update Dave Hansen
2007-08-21 20:42 ` [RFC][PATCH 2/9] pagemap: remove file header Dave Hansen
@ 2007-08-21 20:42 ` Dave Hansen
2007-08-21 21:25 ` Matt Mackall
2007-08-21 20:42 ` [RFC][PATCH 4/9] pagemap: remove open-coded sizeof(unsigned long) Dave Hansen
` (7 subsequent siblings)
9 siblings, 1 reply; 22+ messages in thread
From: Dave Hansen @ 2007-08-21 20:42 UTC (permalink / raw)
To: mpm; +Cc: linux-mm, Dave Hansen
Use existing macros (PAGE_MASK/PAGE_ALIGN()) instead of
open-coding them.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/proc/task_mmu.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff -puN fs/proc/task_mmu.c~pagemap-use-PAGE_MASK fs/proc/task_mmu.c
--- lxc/fs/proc/task_mmu.c~pagemap-use-PAGE_MASK 2007-08-21 13:30:51.000000000 -0700
+++ lxc-dave/fs/proc/task_mmu.c 2007-08-21 13:30:51.000000000 -0700
@@ -617,9 +617,9 @@ static ssize_t pagemap_read(struct file
goto out;
ret = -ENOMEM;
- uaddr = (unsigned long)buf & ~(PAGE_SIZE-1);
+ uaddr = (unsigned long)buf & PAGE_MASK;
uend = (unsigned long)(buf + count);
- pagecount = (uend - uaddr + PAGE_SIZE-1) / PAGE_SIZE;
+ pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL);
if (!pages)
goto out_task;
_
* [RFC][PATCH 4/9] pagemap: remove open-coded sizeof(unsigned long)
2007-08-21 20:42 [RFC][PATCH 1/9] /proc/pid/pagemap update Dave Hansen
2007-08-21 20:42 ` [RFC][PATCH 2/9] pagemap: remove file header Dave Hansen
2007-08-21 20:42 ` [RFC][PATCH 3/9] pagemap: use PAGE_MASK/PAGE_ALIGN() Dave Hansen
@ 2007-08-21 20:42 ` Dave Hansen
2007-08-21 21:26 ` Matt Mackall
2007-08-21 20:42 ` [RFC][PATCH 5/9] introduce TASK_SIZE_OF() for all arches Dave Hansen
` (6 subsequent siblings)
9 siblings, 1 reply; 22+ messages in thread
From: Dave Hansen @ 2007-08-21 20:42 UTC (permalink / raw)
To: mpm; +Cc: linux-mm, Dave Hansen
I think the code gets easier to read when we give symbolic names
to some of the operations we're performing. I was sure we needed
this when I saw the header being built like this:
...
buf[2] = sizeof(unsigned long)
buf[3] = sizeof(unsigned long)
I really couldn't remember what either field did ;(
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/proc/task_mmu.c | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)
diff -puN fs/proc/task_mmu.c~pagemap-use-ENTRY_SIZE fs/proc/task_mmu.c
--- lxc/fs/proc/task_mmu.c~pagemap-use-ENTRY_SIZE 2007-08-21 13:30:51.000000000 -0700
+++ lxc-dave/fs/proc/task_mmu.c 2007-08-21 13:30:51.000000000 -0700
@@ -508,14 +508,16 @@ struct pagemapread {
unsigned long __user *out;
};
+#define PM_ENTRY_BYTES sizeof(unsigned long)
+
static int add_to_pagemap(unsigned long addr, unsigned long pfn,
struct pagemapread *pm)
{
__put_user(pfn, pm->out);
pm->out++;
- pm->pos += sizeof(unsigned long);
- pm->count -= sizeof(unsigned long);
pm->next = addr + PAGE_SIZE;
+ pm->pos += PM_ENTRY_BYTES;
+ pm->count -= PM_ENTRY_BYTES;
return 0;
}
@@ -601,13 +603,13 @@ static ssize_t pagemap_read(struct file
goto out;
ret = -EIO;
- svpfn = src / sizeof(unsigned long);
+ svpfn = src / PM_ENTRY_BYTES;
addr = PAGE_SIZE * svpfn;
- if (svpfn * sizeof(unsigned long) != src)
+ if (svpfn * PM_ENTRY_BYTES != src)
goto out;
evpfn = min((src + count) / sizeof(unsigned long) - 1,
((~0UL) >> PAGE_SHIFT) + 1);
- count = (evpfn - svpfn) * sizeof(unsigned long);
+ count = (evpfn - svpfn) * PM_ENTRY_BYTES;
end = PAGE_SIZE * evpfn;
//printk("src %ld svpfn %d evpfn %d count %d\n", src, svpfn, evpfn, count);
_
* [RFC][PATCH 5/9] introduce TASK_SIZE_OF() for all arches
2007-08-21 20:42 [RFC][PATCH 1/9] /proc/pid/pagemap update Dave Hansen
` (2 preceding siblings ...)
2007-08-21 20:42 ` [RFC][PATCH 4/9] pagemap: remove open-coded sizeof(unsigned long) Dave Hansen
@ 2007-08-21 20:42 ` Dave Hansen
2007-08-21 20:42 ` [RFC][PATCH 6/9] pagemap: give -1's a name Dave Hansen
` (5 subsequent siblings)
9 siblings, 0 replies; 22+ messages in thread
From: Dave Hansen @ 2007-08-21 20:42 UTC (permalink / raw)
To: mpm; +Cc: linux-mm, Dave Hansen
(already sent to linux-arch, just repeating here in case someone
wants to test these in their entirety)
For the /proc/<pid>/pagemap code[1], we need to be able to query how
much virtual address space a particular task has. The trick is
that we do it through /proc and can't use TASK_SIZE since it
references "current" on some arches. The process opening the
/proc file might be a 32-bit process opening a 64-bit process's
pagemap file.
x86_64 already has a TASK_SIZE_OF() macro:
#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
I'd like to have that for other architectures. So, add it
for all the architectures that actually use "current" in
their TASK_SIZE. For the others, just add a quick #define
in sched.h to use plain old TASK_SIZE.
1. http://www.linuxworld.com/news/2007/042407-kernel.html
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/include/asm-ia64/processor.h | 3 ++-
lxc-dave/include/asm-parisc/processor.h | 3 ++-
lxc-dave/include/asm-powerpc/processor.h | 4 +++-
lxc-dave/include/asm-s390/processor.h | 2 ++
lxc-dave/include/linux/sched.h | 4 ++++
5 files changed, 13 insertions(+), 3 deletions(-)
diff -puN include/asm-ia64/processor.h~task_size_of include/asm-ia64/processor.h
--- lxc/include/asm-ia64/processor.h~task_size_of 2007-08-21 13:30:52.000000000 -0700
+++ lxc-dave/include/asm-ia64/processor.h 2007-08-21 13:30:52.000000000 -0700
@@ -31,7 +31,8 @@
* each (assuming 8KB page size), for a total of 8TB of user virtual
* address space.
*/
-#define TASK_SIZE (current->thread.task_size)
+#define TASK_SIZE_OF(tsk) ((tsk)->thread.task_size)
+#define TASK_SIZE TASK_SIZE_OF(current)
/*
* This decides where the kernel will search for a free chunk of vm
diff -puN include/asm-parisc/processor.h~task_size_of include/asm-parisc/processor.h
--- lxc/include/asm-parisc/processor.h~task_size_of 2007-08-21 13:30:52.000000000 -0700
+++ lxc-dave/include/asm-parisc/processor.h 2007-08-21 13:30:52.000000000 -0700
@@ -32,7 +32,8 @@
#endif
#define current_text_addr() ({ void *pc; current_ia(pc); pc; })
-#define TASK_SIZE (current->thread.task_size)
+#define TASK_SIZE_OF(tsk) ((tsk)->thread.task_size)
+#define TASK_SIZE (current->thread.task_size)
#define TASK_UNMAPPED_BASE (current->thread.map_base)
#define DEFAULT_TASK_SIZE32 (0xFFF00000UL)
diff -puN include/asm-powerpc/processor.h~task_size_of include/asm-powerpc/processor.h
--- lxc/include/asm-powerpc/processor.h~task_size_of 2007-08-21 13:30:52.000000000 -0700
+++ lxc-dave/include/asm-powerpc/processor.h 2007-08-21 13:30:52.000000000 -0700
@@ -99,7 +99,9 @@ extern struct task_struct *last_task_use
*/
#define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE))
-#define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \
+#define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \
+ TASK_SIZE_USER32 : TASK_SIZE_USER64)
+#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \
TASK_SIZE_USER32 : TASK_SIZE_USER64)
/* This decides where the kernel will search for a free chunk of vm
diff -puN include/asm-s390/processor.h~task_size_of include/asm-s390/processor.h
--- lxc/include/asm-s390/processor.h~task_size_of 2007-08-21 13:30:52.000000000 -0700
+++ lxc-dave/include/asm-s390/processor.h 2007-08-21 13:30:52.000000000 -0700
@@ -75,6 +75,8 @@ extern struct task_struct *last_task_use
# define TASK_SIZE (test_thread_flag(TIF_31BIT) ? \
(0x80000000UL) : (0x40000000000UL))
+# define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_31BIT) ? \
+ (0x80000000UL) : (0x40000000000UL))
# define TASK_UNMAPPED_BASE (TASK_SIZE / 2)
# define DEFAULT_TASK_SIZE (0x40000000000UL)
diff -puN include/linux/sched.h~task_size_of include/linux/sched.h
--- lxc/include/linux/sched.h~task_size_of 2007-08-21 13:30:52.000000000 -0700
+++ lxc-dave/include/linux/sched.h 2007-08-21 13:30:52.000000000 -0700
@@ -1810,6 +1810,10 @@ static inline void inc_syscw(struct task
}
#endif
+#ifndef TASK_SIZE_OF
+#define TASK_SIZE_OF(tsk) TASK_SIZE
+#endif
+
#endif /* __KERNEL__ */
#endif
_
* [RFC][PATCH 6/9] pagemap: give -1's a name
2007-08-21 20:42 [RFC][PATCH 1/9] /proc/pid/pagemap update Dave Hansen
` (3 preceding siblings ...)
2007-08-21 20:42 ` [RFC][PATCH 5/9] introduce TASK_SIZE_OF() for all arches Dave Hansen
@ 2007-08-21 20:42 ` Dave Hansen
2007-08-21 21:27 ` Matt Mackall
2007-08-21 20:42 ` [RFC][PATCH 7/9] pagewalk: add handler for empty ranges Dave Hansen
` (4 subsequent siblings)
9 siblings, 1 reply; 22+ messages in thread
From: Dave Hansen @ 2007-08-21 20:42 UTC (permalink / raw)
To: mpm; +Cc: linux-mm, Dave Hansen
-1 is a magic number in /proc/$pid/pagemap. It means that
there was no pte present for a particular page. We're
going to be refining that a bit shortly, so give this a
real name for now.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/proc/task_mmu.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff -puN fs/proc/task_mmu.c~give_-1s_a_name fs/proc/task_mmu.c
--- lxc/fs/proc/task_mmu.c~give_-1s_a_name 2007-08-21 13:30:53.000000000 -0700
+++ lxc-dave/fs/proc/task_mmu.c 2007-08-21 13:30:53.000000000 -0700
@@ -509,6 +509,7 @@ struct pagemapread {
};
#define PM_ENTRY_BYTES sizeof(unsigned long)
+#define PM_NOT_PRESENT ((unsigned long)-1)
static int add_to_pagemap(unsigned long addr, unsigned long pfn,
struct pagemapread *pm)
@@ -533,7 +534,7 @@ static int pagemap_pte_range(pmd_t *pmd,
if (addr < pm->next)
continue;
if (!pte_present(*pte))
- err = add_to_pagemap(addr, -1, pm);
+ err = add_to_pagemap(addr, PM_NOT_PRESENT, pm);
else
err = add_to_pagemap(addr, pte_pfn(*pte), pm);
if (err)
_
* [RFC][PATCH 7/9] pagewalk: add handler for empty ranges
2007-08-21 20:42 [RFC][PATCH 1/9] /proc/pid/pagemap update Dave Hansen
` (4 preceding siblings ...)
2007-08-21 20:42 ` [RFC][PATCH 6/9] pagemap: give -1's a name Dave Hansen
@ 2007-08-21 20:42 ` Dave Hansen
2007-08-30 7:28 ` Nick Piggin
2007-08-21 20:42 ` [RFC][PATCH 8/9] pagemap: use page walker pte_hole() helper Dave Hansen
` (3 subsequent siblings)
9 siblings, 1 reply; 22+ messages in thread
From: Dave Hansen @ 2007-08-21 20:42 UTC (permalink / raw)
To: mpm; +Cc: linux-mm, Dave Hansen
There's a good deal of complexity in dealing with a sparse
address space in the /proc/<pid>/pagemap code.
We have the pm->next pointer to help indicate how far we've
walked in the pagetables. We also manually attempt to fill
empty areas that have no VMAs.
This code adds an extension to the mm_walk code: a new handler
for "empty" pte ranges. Those are areas where there is no
pte page present. This allows us to get rid of the code that
inspects VMAs or that tries to keep track of how much of the
pagemap we have filled.
Note that this truly does walk pte *holes*. That isn't just
places where we have a pte_none(). It includes places where
there are any missing higher-level pagetable entries like
puds.
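As a usage sketch (not part of this patch), a walker that only cares
about holes could look like the following, assuming the mm_walk
interface as modified here. The no-op pmd_entry is there only so the
walk descends below the pgd level.

/* Sketch: count bytes covered by missing page-table pages. */
static int note_hole(unsigned long start, unsigned long end, void *private)
{
        *(unsigned long *)private += end - start;
        return 0;
}

static int note_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
                    void *private)
{
        return 0;
}

static struct mm_walk hole_walk = {
        .pmd_entry = note_pmd,
        .pte_hole  = note_hole,
};

static unsigned long unbacked_bytes(struct mm_struct *mm,
                                    unsigned long start, unsigned long end)
{
        unsigned long bytes = 0;

        walk_page_range(mm, start, end, &hole_walk, &bytes);
        return bytes;
}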
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/include/linux/mm.h | 1
lxc-dave/lib/pagewalk.c | 67 +++++++++++++++++++-------------------------
2 files changed, 30 insertions(+), 38 deletions(-)
diff -puN include/linux/mm.h~pagewalk-empty-ranges include/linux/mm.h
--- lxc/include/linux/mm.h~pagewalk-empty-ranges 2007-08-21 13:30:53.000000000 -0700
+++ lxc-dave/include/linux/mm.h 2007-08-21 13:30:53.000000000 -0700
@@ -699,6 +699,7 @@ struct mm_walk {
int (*pud_entry)(pud_t *, unsigned long, unsigned long, void *);
int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, void *);
int (*pte_entry)(pte_t *, unsigned long, unsigned long, void *);
+ int (*pte_hole) (unsigned long, unsigned long, void *);
};
int walk_page_range(struct mm_struct *, unsigned long addr, unsigned long end,
diff -puN lib/pagewalk.c~pagewalk-empty-ranges lib/pagewalk.c
--- lxc/lib/pagewalk.c~pagewalk-empty-ranges 2007-08-21 13:30:53.000000000 -0700
+++ lxc-dave/lib/pagewalk.c 2007-08-21 13:30:53.000000000 -0700
@@ -6,17 +6,13 @@ static int walk_pte_range(pmd_t *pmd, un
struct mm_walk *walk, void *private)
{
pte_t *pte;
- int err;
+ int err = 0;
for (pte = pte_offset_map(pmd, addr); addr != end;
addr += PAGE_SIZE, pte++) {
- if (pte_none(*pte))
- continue;
err = walk->pte_entry(pte, addr, addr, private);
- if (err) {
- pte_unmap(pte);
- return err;
- }
+ if (err)
+ break;
}
pte_unmap(pte);
return 0;
@@ -27,25 +23,23 @@ static int walk_pmd_range(pud_t *pud, un
{
pmd_t *pmd;
unsigned long next;
- int err;
+ int err = 0;
for (pmd = pmd_offset(pud, addr); addr != end;
pmd++, addr = next) {
next = pmd_addr_end(addr, end);
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none(*pmd)) {
+ err = walk->pte_hole(addr, next, private);
+ } else if (pmd_none_or_clear_bad(pmd))
continue;
- if (walk->pmd_entry) {
+ if (!err && walk->pmd_entry)
err = walk->pmd_entry(pmd, addr, next, private);
- if (err)
- return err;
- }
- if (walk->pte_entry) {
+ if (!err && walk->pte_entry)
err = walk_pte_range(pmd, addr, next, walk, private);
- if (err)
- return err;
- }
+ if (err)
+ break;
}
- return 0;
+ return err;
}
static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
@@ -53,23 +47,21 @@ static int walk_pud_range(pgd_t *pgd, un
{
pud_t *pud;
unsigned long next;
- int err;
+ int err = 0;
for (pud = pud_offset(pgd, addr); addr != end;
pud++, addr = next) {
next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud))
+ if (pud_none(*pud)) {
+ err = walk->pte_hole(addr, next, private);
+ } else if (pud_none_or_clear_bad(pud))
continue;
- if (walk->pud_entry) {
+ if (!err && walk->pud_entry)
err = walk->pud_entry(pud, addr, next, private);
- if (err)
- return err;
- }
- if (walk->pmd_entry || walk->pte_entry) {
+ if (!err && (walk->pmd_entry || walk->pte_entry))
err = walk_pmd_range(pud, addr, next, walk, private);
- if (err)
- return err;
- }
+ if (err)
+ return err;
}
return 0;
}
@@ -91,23 +83,22 @@ int walk_page_range(struct mm_struct *mm
{
pgd_t *pgd;
unsigned long next;
- int err;
+ int err = 0;
for (pgd = pgd_offset(mm, addr); addr != end;
pgd++, addr = next) {
next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
+ if (pgd_none(*pgd)) {
+ err = walk->pte_hole(addr, next, private);
+ } else if (pgd_none_or_clear_bad(pgd))
continue;
- if (walk->pgd_entry) {
+ if (!err && walk->pgd_entry)
err = walk->pgd_entry(pgd, addr, next, private);
- if (err)
- return err;
- }
- if (walk->pud_entry || walk->pmd_entry || walk->pte_entry) {
+ if (!err &&
+ (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
err = walk_pud_range(pgd, addr, next, walk, private);
- if (err)
- return err;
- }
+ if (err)
+ return err;
}
return 0;
}
_
* [RFC][PATCH 8/9] pagemap: use page walker pte_hole() helper
2007-08-21 20:42 [RFC][PATCH 1/9] /proc/pid/pagemap update Dave Hansen
` (5 preceding siblings ...)
2007-08-21 20:42 ` [RFC][PATCH 7/9] pagewalk: add handler for empty ranges Dave Hansen
@ 2007-08-21 20:42 ` Dave Hansen
2007-08-21 22:01 ` Matt Mackall
2007-08-21 20:42 ` [RFC][PATCH 9/9] pagemap: export swap ptes Dave Hansen
` (2 subsequent siblings)
9 siblings, 1 reply; 22+ messages in thread
From: Dave Hansen @ 2007-08-21 20:42 UTC (permalink / raw)
To: mpm; +Cc: linux-mm, Dave Hansen
I tried to do this a bit more incrementally, but it ended
up just looking like an even worse mess. So, this does
a couple of different things.
1. use page walker pte_hole() helper, which
2. gets rid of the "next" value in "struct pagemapread"
3. allow 1-3 byte reads from pagemap. This at least
ensures that we don't write over user memory if they
ask us for 1 byte and we try to write 4.
4. Instead of trying to calculate what ranges of pages
we are going to walk, simply start walking them,
then return PAGEMAP_END_OF_BUFFER at the end of the
buffer, error out, and stop walking.
5. enforce that reads must be aligned to PM_ENTRY_BYTES
Note that, despite these functional additions, and some
nice new comments, this patch still removes more code
than it adds.
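For reference, reading a single entry from userspace under this scheme
would look roughly like the sketch below. It assumes the post-series
format (no header, one entry per virtual page) and that the reader's
unsigned long matches the kernel's, which does not hold for a 32-bit
process reading a 64-bit kernel's pagemap.

#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Sketch: fetch the pagemap entry for one virtual address in "pid". */
static int read_pagemap_entry(pid_t pid, unsigned long vaddr,
                              unsigned long *entry)
{
        char path[64];
        unsigned long vpfn = vaddr / getpagesize();
        ssize_t n;
        int fd;

        snprintf(path, sizeof(path), "/proc/%d/pagemap", (int)pid);
        fd = open(path, O_RDONLY);
        if (fd < 0)
                return -1;
        /* File offset is the virtual page number times the entry size. */
        n = pread(fd, entry, sizeof(*entry), vpfn * sizeof(*entry));
        close(fd);
        return n == sizeof(*entry) ? 0 : -1;
}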
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/proc/task_mmu.c | 123 ++++++++++++++++++++------------------------
1 file changed, 56 insertions(+), 67 deletions(-)
diff -puN fs/proc/task_mmu.c~bail-instead-of-tracking fs/proc/task_mmu.c
--- lxc/fs/proc/task_mmu.c~bail-instead-of-tracking 2007-08-21 13:30:54.000000000 -0700
+++ lxc-dave/fs/proc/task_mmu.c 2007-08-21 13:30:54.000000000 -0700
@@ -501,7 +501,6 @@ const struct file_operations proc_clear_
};
struct pagemapread {
- unsigned long next;
unsigned long pos;
size_t count;
int index;
@@ -510,58 +509,64 @@ struct pagemapread {
#define PM_ENTRY_BYTES sizeof(unsigned long)
#define PM_NOT_PRESENT ((unsigned long)-1)
+#define PAGEMAP_END_OF_BUFFER 1
static int add_to_pagemap(unsigned long addr, unsigned long pfn,
struct pagemapread *pm)
{
- __put_user(pfn, pm->out);
- pm->out++;
- pm->next = addr + PAGE_SIZE;
+ int out_len = PM_ENTRY_BYTES;
+ if (pm->count < PM_ENTRY_BYTES)
+ out_len = pm->count;
+ copy_to_user(pm->out, &pfn, out_len);
pm->pos += PM_ENTRY_BYTES;
pm->count -= PM_ENTRY_BYTES;
+ if (pm->count <= 0)
+ return PAGEMAP_END_OF_BUFFER;
return 0;
}
+static int pagemap_pte_hole(unsigned long start, unsigned long end,
+ void *private)
+{
+ struct pagemapread *pm = private;
+ unsigned long addr;
+ int err = 0;
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ err = add_to_pagemap(addr, PM_NOT_PRESENT, pm);
+ if (err)
+ break;
+ }
+ return err;
+}
+
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
void *private)
{
struct pagemapread *pm = private;
pte_t *pte;
- int err;
+ int err = 0;
pte = pte_offset_map(pmd, addr);
for (; addr != end; pte++, addr += PAGE_SIZE) {
- if (addr < pm->next)
- continue;
- if (!pte_present(*pte))
- err = add_to_pagemap(addr, PM_NOT_PRESENT, pm);
- else
- err = add_to_pagemap(addr, pte_pfn(*pte), pm);
+ unsigned long pfn = PM_NOT_PRESENT;
+ if (pte_present(*pte))
+ pfn = pte_pfn(*pte);
+ err = add_to_pagemap(addr, pfn, pm);
if (err)
return err;
- if (pm->count == 0)
- break;
}
pte_unmap(pte - 1);
cond_resched();
- return 0;
+ return err;
}
-static int pagemap_fill(struct pagemapread *pm, unsigned long end)
+static struct mm_walk pagemap_walk =
{
- int ret;
-
- while (pm->next != end && pm->count > 0) {
- ret = add_to_pagemap(pm->next, -1UL, pm);
- if (ret)
- return ret;
- }
- return 0;
-}
-
-static struct mm_walk pagemap_walk = { .pmd_entry = pagemap_pte_range };
+ .pmd_entry = pagemap_pte_range,
+ .pte_hole = pagemap_pte_hole
+};
/*
* /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -589,9 +594,8 @@ static ssize_t pagemap_read(struct file
struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
unsigned long src = *ppos;
struct page **pages, *page;
- unsigned long addr, end, vend, svpfn, evpfn, uaddr, uend;
+ unsigned long uaddr, uend;
struct mm_struct *mm;
- struct vm_area_struct *vma;
struct pagemapread pm;
int pagecount;
int ret = -ESRCH;
@@ -603,16 +607,10 @@ static ssize_t pagemap_read(struct file
if (!ptrace_may_attach(task))
goto out;
- ret = -EIO;
- svpfn = src / PM_ENTRY_BYTES;
- addr = PAGE_SIZE * svpfn;
- if (svpfn * PM_ENTRY_BYTES != src)
+ ret = -EINVAL;
+ /* file position must be aligned */
+ if (*ppos % PM_ENTRY_BYTES)
goto out;
- evpfn = min((src + count) / sizeof(unsigned long) - 1,
- ((~0UL) >> PAGE_SHIFT) + 1);
- count = (evpfn - svpfn) * PM_ENTRY_BYTES;
- end = PAGE_SIZE * evpfn;
- //printk("src %ld svpfn %d evpfn %d count %d\n", src, svpfn, evpfn, count);
ret = 0;
mm = get_task_mm(task);
@@ -632,44 +630,36 @@ static ssize_t pagemap_read(struct file
1, 0, pages, NULL);
up_read(&current->mm->mmap_sem);
- //printk("%x(%x):%x %d@%ld (%d pages) -> %d\n", uaddr, buf, uend, count, src, pagecount, ret);
if (ret < 0)
goto out_free;
- pm.next = addr;
pm.pos = src;
pm.count = count;
pm.out = (unsigned long __user *)buf;
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, pm.next);
- while (pm.count > 0 && vma) {
- if (!ptrace_may_attach(task)) {
- ret = -EIO;
- up_read(&mm->mmap_sem);
- goto out_release;
- }
- vend = min(vma->vm_end - 1, end - 1) + 1;
- ret = pagemap_fill(&pm, vend);
- if (ret || !pm.count)
- break;
- vend = min(vma->vm_end - 1, end - 1) + 1;
- ret = walk_page_range(mm, vma->vm_start, vend,
- &pagemap_walk, &pm);
- vma = vma->vm_next;
+ if (!ptrace_may_attach(task)) {
+ ret = -EIO;
+ } else {
+ unsigned long src = *ppos;
+ unsigned long svpfn = src / PM_ENTRY_BYTES;
+ unsigned long start_vaddr = svpfn << PAGE_SHIFT;
+ unsigned long end_vaddr = TASK_SIZE_OF(task);
+ /*
+ * The odds are that this will stop walking way
+ * before end_vaddr, because the length of the
+ * user buffer is tracked in "pm", and the walk
+ * will stop when we hit the end of the buffer.
+ */
+ ret = walk_page_range(mm, start_vaddr, end_vaddr,
+ &pagemap_walk, &pm);
+ if (ret == PAGEMAP_END_OF_BUFFER)
+ ret = 0;
+ /* don't need mmap_sem for these, but this looks cleaner */
+ *ppos = pm.pos;
+ if (!ret)
+ ret = pm.pos - src;
}
- up_read(&mm->mmap_sem);
-
- //printk("before fill at %ld\n", pm.pos);
- ret = pagemap_fill(&pm, end);
-
- printk("after fill at %ld\n", pm.pos);
- *ppos = pm.pos;
- if (!ret)
- ret = pm.pos - src;
-out_release:
- printk("releasing pages\n");
for (; pagecount; pagecount--) {
page = pages[pagecount-1];
if (!PageReserved(page))
@@ -682,7 +672,6 @@ out_free:
out_task:
put_task_struct(task);
out:
- printk("returning\n");
return ret;
}
_
* [RFC][PATCH 9/9] pagemap: export swap ptes
2007-08-21 20:42 [RFC][PATCH 1/9] /proc/pid/pagemap update Dave Hansen
` (6 preceding siblings ...)
2007-08-21 20:42 ` [RFC][PATCH 8/9] pagemap: use page walker pte_hole() helper Dave Hansen
@ 2007-08-21 20:42 ` Dave Hansen
2007-08-21 21:49 ` Matt Mackall
2007-08-21 21:23 ` [RFC][PATCH 1/9] /proc/pid/pagemap update Matt Mackall
2007-08-21 22:07 ` Matt Mackall
9 siblings, 1 reply; 22+ messages in thread
From: Dave Hansen @ 2007-08-21 20:42 UTC (permalink / raw)
To: mpm; +Cc: linux-mm, Dave Hansen
In addition to understanding which physical pages are
used by a process, it would also be very nice to
enumerate how much swap space a process is using.
This patch enables /proc/<pid>/pagemap to display
swap ptes. In the process, it also changes the
constant that we used to indicate non-present ptes
before.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/proc/task_mmu.c | 29 ++++++++++++++++++++++++++---
1 file changed, 26 insertions(+), 3 deletions(-)
diff -puN fs/proc/task_mmu.c~pagemap-export-swap-ptes fs/proc/task_mmu.c
--- lxc/fs/proc/task_mmu.c~pagemap-export-swap-ptes 2007-08-21 13:30:55.000000000 -0700
+++ lxc-dave/fs/proc/task_mmu.c 2007-08-21 13:30:55.000000000 -0700
@@ -7,6 +7,8 @@
#include <linux/pagemap.h>
#include <linux/ptrace.h>
#include <linux/mempolicy.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include <asm/elf.h>
#include <asm/uaccess.h>
@@ -506,9 +508,13 @@ struct pagemapread {
int index;
unsigned long __user *out;
};
-
#define PM_ENTRY_BYTES sizeof(unsigned long)
-#define PM_NOT_PRESENT ((unsigned long)-1)
+#define PM_RESERVED_BITS 3
+#define PM_RESERVED_OFFSET (BITS_PER_LONG-PM_RESERVED_BITS)
+#define PM_RESERVED_MASK (((1<<PM_RESERVED_BITS)-1) << PM_RESERVED_OFFSET)
+#define PM_SPECIAL(nr) (((nr) << PM_RESERVED_OFFSET) | PM_RESERVED_MASK)
+#define PM_NOT_PRESENT PM_SPECIAL(1)
+#define PM_SWAP PM_SPECIAL(2)
#define PAGEMAP_END_OF_BUFFER 1
static int add_to_pagemap(unsigned long addr, unsigned long pfn,
@@ -539,6 +545,21 @@ static int pagemap_pte_hole(unsigned lon
return err;
}
+unsigned long swap_pte_to_pagemap_entry(pte_t pte)
+{
+ unsigned long ret = 0;
+ swp_entry_t entry = pte_to_swp_entry(pte);
+ unsigned long offset;
+ unsigned long swap_file_nr;
+
+ offset = swp_offset(entry);
+ swap_file_nr = swp_type(entry);
+ ret = PM_SWAP | swap_file_nr | (offset << MAX_SWAPFILES_SHIFT);
+ return ret;
+}
+
+
+
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
void *private)
{
@@ -549,7 +570,9 @@ static int pagemap_pte_range(pmd_t *pmd,
pte = pte_offset_map(pmd, addr);
for (; addr != end; pte++, addr += PAGE_SIZE) {
unsigned long pfn = PM_NOT_PRESENT;
- if (pte_present(*pte))
+ if (is_swap_pte(*pte))
+ pfn = swap_pte_to_pagemap_entry(*pte);
+ else if (pte_present(*pte))
pfn = pte_pfn(*pte);
err = add_to_pagemap(addr, pfn, pm);
if (err)
_
* Re: [RFC][PATCH 1/9] /proc/pid/pagemap update
2007-08-21 20:42 [RFC][PATCH 1/9] /proc/pid/pagemap update Dave Hansen
` (7 preceding siblings ...)
2007-08-21 20:42 ` [RFC][PATCH 9/9] pagemap: export swap ptes Dave Hansen
@ 2007-08-21 21:23 ` Matt Mackall
2007-08-21 21:26 ` Dave Hansen
2007-08-21 22:07 ` Matt Mackall
9 siblings, 1 reply; 22+ messages in thread
From: Matt Mackall @ 2007-08-21 21:23 UTC (permalink / raw)
To: Dave Hansen; +Cc: linux-mm
On Tue, Aug 21, 2007 at 01:42:48PM -0700, Dave Hansen wrote:
>
> This is a series of patches to update /proc/pid/pagemap,
> to simplify the code, and fix bugs which caused userspace
> memory corruption.
>
> Since it is just in -mm, we should probably roll all of
> these together and just update it all at once, or send
> a simple drop-in replacement patch. These patches are
> all here mostly to help with review.
>
> Matt, if you're OK with these, do you mind if I send
> the update into -mm, or would you like to do it?
Hmmm, is the below working for you? I was having trouble with it.
> --
> From: Matt Mackall <mpm@selenic.com>
>
> On Mon, Aug 06, 2007 at 01:44:19AM -0500, Dave Boutcher wrote:
> >
> > Matt, this patch set replaces the two patches I sent earlier and
> > contains additional fixes. I've done some reasonably rigorous testing
> > on x86_64, but not on a 32 bit arch. I'm pretty sure this isn't worse
> > than what's in mm right now, which has some user-space corruption and
> > a nasty infinite kernel loop. YMMV.
>
> Dave, here's my current work-in-progress patch to deal with a couple
> locking issues, primarily a possible deadlock on the mm semaphore that
> can occur if someone unmaps the target buffer while we're walking the
> tree. It currently hangs on my box and I haven't had any free cycles
> to finish debugging it, but you might want to take a peek at it.
>
> Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
> ---
>
> lxc-dave/fs/proc/task_mmu.c | 121 ++++++++++++++++++++------------------------
> 1 file changed, 55 insertions(+), 66 deletions(-)
>
> diff -puN fs/proc/task_mmu.c~Re-_PATCH_0_3_proc_pid_pagemap_fixes fs/proc/task_mmu.c
> --- lxc/fs/proc/task_mmu.c~Re-_PATCH_0_3_proc_pid_pagemap_fixes 2007-08-21 13:30:50.000000000 -0700
> +++ lxc-dave/fs/proc/task_mmu.c 2007-08-21 13:30:50.000000000 -0700
> @@ -501,37 +501,21 @@ const struct file_operations proc_clear_
> };
>
> struct pagemapread {
> - struct mm_struct *mm;
> unsigned long next;
> - unsigned long *buf;
> - pte_t *ptebuf;
> unsigned long pos;
> size_t count;
> int index;
> - char __user *out;
> + unsigned long __user *out;
> };
>
> -static int flush_pagemap(struct pagemapread *pm)
> -{
> - int n = min(pm->count, pm->index * sizeof(unsigned long));
> - if (copy_to_user(pm->out, pm->buf, n))
> - return -EFAULT;
> - pm->out += n;
> - pm->pos += n;
> - pm->count -= n;
> - pm->index = 0;
> - cond_resched();
> - return 0;
> -}
> -
> static int add_to_pagemap(unsigned long addr, unsigned long pfn,
> struct pagemapread *pm)
> {
> - pm->buf[pm->index++] = pfn;
> + __put_user(pfn, pm->out);
> + pm->out++;
> + pm->pos += sizeof(unsigned long);
> + pm->count -= sizeof(unsigned long);
> pm->next = addr + PAGE_SIZE;
> - if (pm->index * sizeof(unsigned long) >= PAGE_SIZE ||
> - pm->index * sizeof(unsigned long) >= pm->count)
> - return flush_pagemap(pm);
> return 0;
> }
>
> @@ -543,14 +527,6 @@ static int pagemap_pte_range(pmd_t *pmd,
> int err;
>
> pte = pte_offset_map(pmd, addr);
> -
> -#ifdef CONFIG_HIGHPTE
> - /* copy PTE directory to temporary buffer and unmap it */
> - memcpy(pm->ptebuf, pte, PAGE_ALIGN((unsigned long)pte) - (unsigned long)pte);
> - pte_unmap(pte);
> - pte = pm->ptebuf;
> -#endif
> -
> for (; addr != end; pte++, addr += PAGE_SIZE) {
> if (addr < pm->next)
> continue;
> @@ -560,11 +536,12 @@ static int pagemap_pte_range(pmd_t *pmd,
> err = add_to_pagemap(addr, pte_pfn(*pte), pm);
> if (err)
> return err;
> + if (pm->count == 0)
> + break;
> }
> -
> -#ifndef CONFIG_HIGHPTE
> pte_unmap(pte - 1);
> -#endif
> +
> + cond_resched();
>
> return 0;
> }
> @@ -573,7 +550,7 @@ static int pagemap_fill(struct pagemapre
> {
> int ret;
>
> - while (pm->next != end) {
> + while (pm->next != end && pm->count > 0) {
> ret = add_to_pagemap(pm->next, -1UL, pm);
> if (ret)
> return ret;
> @@ -608,15 +585,16 @@ static ssize_t pagemap_read(struct file
> {
> struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
> unsigned long src = *ppos;
> - unsigned long *page;
> - unsigned long addr, end, vend, svpfn, evpfn;
> + struct page **pages, *page;
> + unsigned long addr, end, vend, svpfn, evpfn, uaddr, uend;
> struct mm_struct *mm;
> struct vm_area_struct *vma;
> struct pagemapread pm;
> + int pagecount;
> int ret = -ESRCH;
>
> if (!task)
> - goto out_no_task;
> + goto out;
>
> ret = -EACCES;
> if (!ptrace_may_attach(task))
> @@ -628,39 +606,43 @@ static ssize_t pagemap_read(struct file
> if ((svpfn + 1) * sizeof(unsigned long) != src)
> goto out;
> evpfn = min((src + count) / sizeof(unsigned long) - 1,
> - ((~0UL) >> PAGE_SHIFT) + 1);
> + ((~0UL) >> PAGE_SHIFT) + 1) - 1;
> count = (evpfn - svpfn) * sizeof(unsigned long);
> end = PAGE_SIZE * evpfn;
> -
> - ret = -ENOMEM;
> - page = kzalloc(PAGE_SIZE, GFP_USER);
> - if (!page)
> - goto out;
> -
> -#ifdef CONFIG_HIGHPTE
> - pm.ptebuf = kzalloc(PAGE_SIZE, GFP_USER);
> - if (!pm.ptebuf)
> - goto out_free;
> -#endif
> + //printk("src %ld svpfn %d evpfn %d count %d\n", src, svpfn, evpfn, count);
>
> ret = 0;
> mm = get_task_mm(task);
> if (!mm)
> - goto out_freepte;
> + goto out;
> +
> + ret = -ENOMEM;
> + uaddr = (unsigned long)buf & ~(PAGE_SIZE-1);
> + uend = (unsigned long)(buf + count);
> + pagecount = (uend - uaddr + PAGE_SIZE-1) / PAGE_SIZE;
> + pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL);
> + if (!pages)
> + goto out_task;
> +
> + down_read(&current->mm->mmap_sem);
> + ret = get_user_pages(current, current->mm, uaddr, pagecount,
> + 1, 0, pages, NULL);
> + up_read(&current->mm->mmap_sem);
> +
> + //printk("%x(%x):%x %d@%ld (%d pages) -> %d\n", uaddr, buf, uend, count, src, pagecount, ret);
> + if (ret < 0)
> + goto out_free;
>
> - pm.mm = mm;
> pm.next = addr;
> - pm.buf = page;
> pm.pos = src;
> pm.count = count;
> - pm.index = 0;
> - pm.out = buf;
> + pm.out = (unsigned long __user *)buf;
>
> if (svpfn == -1) {
> - ((char *)page)[0] = (ntohl(1) != 1);
> - ((char *)page)[1] = PAGE_SHIFT;
> - ((char *)page)[2] = sizeof(unsigned long);
> - ((char *)page)[3] = sizeof(unsigned long);
> + put_user((char)(ntohl(1) != 1), buf);
> + put_user((char)PAGE_SHIFT, buf + 1);
> + put_user((char)sizeof(unsigned long), buf + 2);
> + put_user((char)sizeof(unsigned long), buf + 3);
> add_to_pagemap(pm.next, page[0], &pm);
> }
>
> @@ -669,7 +651,8 @@ static ssize_t pagemap_read(struct file
> while (pm.count > 0 && vma) {
> if (!ptrace_may_attach(task)) {
> ret = -EIO;
> - goto out_mm;
> + up_read(&mm->mmap_sem);
> + goto out_release;
> }
> vend = min(vma->vm_end - 1, end - 1) + 1;
> ret = pagemap_fill(&pm, vend);
> @@ -682,23 +665,29 @@ static ssize_t pagemap_read(struct file
> }
> up_read(&mm->mmap_sem);
>
> + //printk("before fill at %ld\n", pm.pos);
> ret = pagemap_fill(&pm, end);
>
> + printk("after fill at %ld\n", pm.pos);
> *ppos = pm.pos;
> if (!ret)
> ret = pm.pos - src;
>
> -out_mm:
> +out_release:
> + printk("releasing pages\n");
> + for (; pagecount; pagecount--) {
> + page = pages[pagecount-1];
> + if (!PageReserved(page))
> + SetPageDirty(page);
> + page_cache_release(page);
> + }
> mmput(mm);
> -out_freepte:
> -#ifdef CONFIG_HIGHPTE
> - kfree(pm.ptebuf);
> out_free:
> -#endif
> - kfree(page);
> -out:
> + kfree(pages);
> +out_task:
> put_task_struct(task);
> -out_no_task:
> +out:
> + printk("returning\n");
> return ret;
> }
>
> _
--
Mathematics is the supreme nostalgia of our time.
* Re: [RFC][PATCH 2/9] pagemap: remove file header
2007-08-21 20:42 ` [RFC][PATCH 2/9] pagemap: remove file header Dave Hansen
@ 2007-08-21 21:24 ` Matt Mackall
0 siblings, 0 replies; 22+ messages in thread
From: Matt Mackall @ 2007-08-21 21:24 UTC (permalink / raw)
To: Dave Hansen; +Cc: linux-mm
On Tue, Aug 21, 2007 at 01:42:48PM -0700, Dave Hansen wrote:
>
> The /proc/<pid>/pagemap file has a header containing:
> * first byte: 0 for big endian, 1 for little
> * second byte: page shift (eg 12 for 4096 byte pages)
> * third byte: entry size in bytes (currently either 4 or 8)
> * fourth byte: header size
>
> The endianness is only useful when examining a raw dump of
> pagemap from a different machine when you don't know the
> source of the file. This is pretty rare, and the programs
> or scripts doing the copying off-machine can certainly be
> made to hold this information.
>
> The page size is available in userspace at least with libc's
> getpagesize(). This will also never vary across processes,
> so putting it in a per-process file doesn't make any difference.
> If we need a "kernel's page size" exported to userspace,
> perhaps we can put it in /proc/meminfo.
>
> The entry size is the really tricky one. This can't just
> be sizeof(unsigned long) from userspace because we can have
> 32-bit processes on 64-bit kernels. But, userspace can
> certainly derive this value if it lseek()s to the end of
> the file, and divides the file position by the size of its
> virtual address space.
>
> In any case, I believe this information is redundant, and
> can be removed.
>
> Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Yeah, looks like various folks thought this ought to go. So:
Acked-by: Matt Mackall <mpm@selenic.com>
> ---
>
> lxc-dave/fs/proc/task_mmu.c | 14 +++-----------
> 1 file changed, 3 insertions(+), 11 deletions(-)
>
> diff -puN fs/proc/task_mmu.c~pagemap-no-header fs/proc/task_mmu.c
> --- lxc/fs/proc/task_mmu.c~pagemap-no-header 2007-08-21 13:30:50.000000000 -0700
> +++ lxc-dave/fs/proc/task_mmu.c 2007-08-21 13:30:50.000000000 -0700
> @@ -601,12 +601,12 @@ static ssize_t pagemap_read(struct file
> goto out;
>
> ret = -EIO;
> - svpfn = src / sizeof(unsigned long) - 1;
> + svpfn = src / sizeof(unsigned long);
> addr = PAGE_SIZE * svpfn;
> - if ((svpfn + 1) * sizeof(unsigned long) != src)
> + if (svpfn * sizeof(unsigned long) != src)
> goto out;
> evpfn = min((src + count) / sizeof(unsigned long) - 1,
> - ((~0UL) >> PAGE_SHIFT) + 1) - 1;
> + ((~0UL) >> PAGE_SHIFT) + 1);
> count = (evpfn - svpfn) * sizeof(unsigned long);
> end = PAGE_SIZE * evpfn;
> //printk("src %ld svpfn %d evpfn %d count %d\n", src, svpfn, evpfn, count);
> @@ -638,14 +638,6 @@ static ssize_t pagemap_read(struct file
> pm.count = count;
> pm.out = (unsigned long __user *)buf;
>
> - if (svpfn == -1) {
> - put_user((char)(ntohl(1) != 1), buf);
> - put_user((char)PAGE_SHIFT, buf + 1);
> - put_user((char)sizeof(unsigned long), buf + 2);
> - put_user((char)sizeof(unsigned long), buf + 3);
> - add_to_pagemap(pm.next, page[0], &pm);
> - }
> -
> down_read(&mm->mmap_sem);
> vma = find_vma(mm, pm.next);
> while (pm.count > 0 && vma) {
> _
--
Mathematics is the supreme nostalgia of our time.
* Re: [RFC][PATCH 3/9] pagemap: use PAGE_MASK/PAGE_ALIGN()
2007-08-21 20:42 ` [RFC][PATCH 3/9] pagemap: use PAGE_MASK/PAGE_ALIGN() Dave Hansen
@ 2007-08-21 21:25 ` Matt Mackall
0 siblings, 0 replies; 22+ messages in thread
From: Matt Mackall @ 2007-08-21 21:25 UTC (permalink / raw)
To: Dave Hansen; +Cc: linux-mm
On Tue, Aug 21, 2007 at 01:42:50PM -0700, Dave Hansen wrote:
>
> Use existing macros (PAGE_MASK/PAGE_ALIGN()) instead of
> open-coding them.
>
> Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Matt Mackall <mpm@selenic.com>
> ---
>
> lxc-dave/fs/proc/task_mmu.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff -puN fs/proc/task_mmu.c~pagemap-use-PAGE_MASK fs/proc/task_mmu.c
> --- lxc/fs/proc/task_mmu.c~pagemap-use-PAGE_MASK 2007-08-21 13:30:51.000000000 -0700
> +++ lxc-dave/fs/proc/task_mmu.c 2007-08-21 13:30:51.000000000 -0700
> @@ -617,9 +617,9 @@ static ssize_t pagemap_read(struct file
> goto out;
>
> ret = -ENOMEM;
> - uaddr = (unsigned long)buf & ~(PAGE_SIZE-1);
> + uaddr = (unsigned long)buf & PAGE_MASK;
> uend = (unsigned long)(buf + count);
> - pagecount = (uend - uaddr + PAGE_SIZE-1) / PAGE_SIZE;
> + pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
> pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL);
> if (!pages)
> goto out_task;
> _
--
Mathematics is the supreme nostalgia of our time.
* Re: [RFC][PATCH 4/9] pagemap: remove open-coded sizeof(unsigned long)
2007-08-21 20:42 ` [RFC][PATCH 4/9] pagemap: remove open-coded sizeof(unsigned long) Dave Hansen
@ 2007-08-21 21:26 ` Matt Mackall
0 siblings, 0 replies; 22+ messages in thread
From: Matt Mackall @ 2007-08-21 21:26 UTC (permalink / raw)
To: Dave Hansen; +Cc: linux-mm
On Tue, Aug 21, 2007 at 01:42:51PM -0700, Dave Hansen wrote:
>
> I think the code gets easier to read when we give symbolic names
> to some of the operations we're performing. I was sure we needed
> this when I saw the header being built like this:
>
> ...
> buf[2] = sizeof(unsigned long)
> buf[3] = sizeof(unsigned long)
>
> I really couldn't remember what either field did ;(
>
> Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Comment still rendered obsolete by previous patch. Otherwise:
Acked-by: Matt Mackall <mpm@selenic.com>
> ---
>
> lxc-dave/fs/proc/task_mmu.c | 12 +++++++-----
> 1 file changed, 7 insertions(+), 5 deletions(-)
>
> diff -puN fs/proc/task_mmu.c~pagemap-use-ENTRY_SIZE fs/proc/task_mmu.c
> --- lxc/fs/proc/task_mmu.c~pagemap-use-ENTRY_SIZE 2007-08-21 13:30:51.000000000 -0700
> +++ lxc-dave/fs/proc/task_mmu.c 2007-08-21 13:30:51.000000000 -0700
> @@ -508,14 +508,16 @@ struct pagemapread {
> unsigned long __user *out;
> };
>
> +#define PM_ENTRY_BYTES sizeof(unsigned long)
> +
> static int add_to_pagemap(unsigned long addr, unsigned long pfn,
> struct pagemapread *pm)
> {
> __put_user(pfn, pm->out);
> pm->out++;
> - pm->pos += sizeof(unsigned long);
> - pm->count -= sizeof(unsigned long);
> pm->next = addr + PAGE_SIZE;
> + pm->pos += PM_ENTRY_BYTES;
> + pm->count -= PM_ENTRY_BYTES;
> return 0;
> }
>
> @@ -601,13 +603,13 @@ static ssize_t pagemap_read(struct file
> goto out;
>
> ret = -EIO;
> - svpfn = src / sizeof(unsigned long);
> + svpfn = src / PM_ENTRY_BYTES;
> addr = PAGE_SIZE * svpfn;
> - if (svpfn * sizeof(unsigned long) != src)
> + if (svpfn * PM_ENTRY_BYTES != src)
> goto out;
> evpfn = min((src + count) / sizeof(unsigned long) - 1,
> ((~0UL) >> PAGE_SHIFT) + 1);
> - count = (evpfn - svpfn) * sizeof(unsigned long);
> + count = (evpfn - svpfn) * PM_ENTRY_BYTES;
> end = PAGE_SIZE * evpfn;
> //printk("src %ld svpfn %d evpfn %d count %d\n", src, svpfn, evpfn, count);
>
> _
--
Mathematics is the supreme nostalgia of our time.
* Re: [RFC][PATCH 1/9] /proc/pid/pagemap update
2007-08-21 21:23 ` [RFC][PATCH 1/9] /proc/pid/pagemap update Matt Mackall
@ 2007-08-21 21:26 ` Dave Hansen
0 siblings, 0 replies; 22+ messages in thread
From: Dave Hansen @ 2007-08-21 21:26 UTC (permalink / raw)
To: Matt Mackall; +Cc: linux-mm
On Tue, 2007-08-21 at 16:23 -0500, Matt Mackall wrote:
>
> > Matt, if you're OK with these, do you mind if I send
> > the update into -mm, or would you like to do it?
>
> Hmmm, is the below working for you? I was having trouble with it.
I think that was just a patch you sent as your work-in-progress a couple
of weeks ago. Either I messed it up when merging, or it never compiled.
The subsequent patches make it work again.
-- Dave
* Re: [RFC][PATCH 6/9] pagemap: give -1's a name
2007-08-21 20:42 ` [RFC][PATCH 6/9] pagemap: give -1's a name Dave Hansen
@ 2007-08-21 21:27 ` Matt Mackall
0 siblings, 0 replies; 22+ messages in thread
From: Matt Mackall @ 2007-08-21 21:27 UTC (permalink / raw)
To: Dave Hansen; +Cc: linux-mm
On Tue, Aug 21, 2007 at 01:42:54PM -0700, Dave Hansen wrote:
>
> -1 is a magic number in /proc/$pid/pagemap. It means that
> there was no pte present for a particular page. We're
> going to be refining that a bit shortly, so give this a
> real name for now.
>
> Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Matt Mackall <mpm@selenic.com>
> ---
>
> lxc-dave/fs/proc/task_mmu.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff -puN fs/proc/task_mmu.c~give_-1s_a_name fs/proc/task_mmu.c
> --- lxc/fs/proc/task_mmu.c~give_-1s_a_name 2007-08-21 13:30:53.000000000 -0700
> +++ lxc-dave/fs/proc/task_mmu.c 2007-08-21 13:30:53.000000000 -0700
> @@ -509,6 +509,7 @@ struct pagemapread {
> };
>
> #define PM_ENTRY_BYTES sizeof(unsigned long)
> +#define PM_NOT_PRESENT ((unsigned long)-1)
>
> static int add_to_pagemap(unsigned long addr, unsigned long pfn,
> struct pagemapread *pm)
> @@ -533,7 +534,7 @@ static int pagemap_pte_range(pmd_t *pmd,
> if (addr < pm->next)
> continue;
> if (!pte_present(*pte))
> - err = add_to_pagemap(addr, -1, pm);
> + err = add_to_pagemap(addr, PM_NOT_PRESENT, pm);
> else
> err = add_to_pagemap(addr, pte_pfn(*pte), pm);
> if (err)
> _
--
Mathematics is the supreme nostalgia of our time.
* Re: [RFC][PATCH 9/9] pagemap: export swap ptes
2007-08-21 20:42 ` [RFC][PATCH 9/9] pagemap: export swap ptes Dave Hansen
@ 2007-08-21 21:49 ` Matt Mackall
2007-08-21 22:26 ` Dave Hansen
0 siblings, 1 reply; 22+ messages in thread
From: Matt Mackall @ 2007-08-21 21:49 UTC (permalink / raw)
To: Dave Hansen; +Cc: linux-mm
On Tue, Aug 21, 2007 at 01:42:59PM -0700, Dave Hansen wrote:
>
> In addition to understanding which physical pages are
> used by a process, it would also be very nice to
> enumerate how much swap space a process is using.
>
> This patch enables /proc/<pid>/pagemap to display
> swap ptes. In the process, it also changes the
> constant that we used to indicate non-present ptes
> before.
Nice. Can you update the doc comment on pagemap_read to match?
> Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
> ---
>
> lxc-dave/fs/proc/task_mmu.c | 29 ++++++++++++++++++++++++++---
> 1 file changed, 26 insertions(+), 3 deletions(-)
>
> diff -puN fs/proc/task_mmu.c~pagemap-export-swap-ptes fs/proc/task_mmu.c
> --- lxc/fs/proc/task_mmu.c~pagemap-export-swap-ptes 2007-08-21 13:30:55.000000000 -0700
> +++ lxc-dave/fs/proc/task_mmu.c 2007-08-21 13:30:55.000000000 -0700
> @@ -7,6 +7,8 @@
> #include <linux/pagemap.h>
> #include <linux/ptrace.h>
> #include <linux/mempolicy.h>
> +#include <linux/swap.h>
> +#include <linux/swapops.h>
>
> #include <asm/elf.h>
> #include <asm/uaccess.h>
> @@ -506,9 +508,13 @@ struct pagemapread {
> int index;
> unsigned long __user *out;
> };
> -
> #define PM_ENTRY_BYTES sizeof(unsigned long)
> -#define PM_NOT_PRESENT ((unsigned long)-1)
> +#define PM_RESERVED_BITS 3
> +#define PM_RESERVED_OFFSET (BITS_PER_LONG-PM_RESERVED_BITS)
> +#define PM_RESERVED_MASK (((1<<PM_RESERVED_BITS)-1) << PM_RESERVED_OFFSET)
> +#define PM_SPECIAL(nr) (((nr) << PM_RESERVED_OFFSET) | PM_RESERVED_MASK)
> +#define PM_NOT_PRESENT PM_SPECIAL(1)
> +#define PM_SWAP PM_SPECIAL(2)
> #define PAGEMAP_END_OF_BUFFER 1
>
> static int add_to_pagemap(unsigned long addr, unsigned long pfn,
> @@ -539,6 +545,21 @@ static int pagemap_pte_hole(unsigned lon
> return err;
> }
>
> +unsigned long swap_pte_to_pagemap_entry(pte_t pte)
> +{
> + unsigned long ret = 0;
Unused assignment?
> + swp_entry_t entry = pte_to_swp_entry(pte);
> + unsigned long offset;
> + unsigned long swap_file_nr;
> +
> + offset = swp_offset(entry);
> + swap_file_nr = swp_type(entry);
> + ret = PM_SWAP | swap_file_nr | (offset << MAX_SWAPFILES_SHIFT);
> + return ret;
How about just return <expression>?
> +}
This is a little problematic as we've added another not very visible
magic number to the mix. We're also not masking off swp_offset to
avoid colliding with our reserved bits. And we're also unpacking an
arch-independent value (swp_entry_t) just to repack it in more or less
the same shape? Or are we reversing the fields?
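Something along these lines (an untested sketch on my part, reusing the
PM_* macros from this patch) is roughly what I mean by masking, so the
shifted offset can't spill into the reserved bits:

static unsigned long swap_pte_to_pagemap_entry(pte_t pte)
{
	swp_entry_t entry = pte_to_swp_entry(pte);

	/* keep the shifted offset out of the reserved high bits */
	return PM_SWAP | swp_type(entry) |
	       ((swp_offset(entry) << MAX_SWAPFILES_SHIFT) & ~PM_RESERVED_MASK);
}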
> static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
> void *private)
> {
> @@ -549,7 +570,9 @@ static int pagemap_pte_range(pmd_t *pmd,
> pte = pte_offset_map(pmd, addr);
> for (; addr != end; pte++, addr += PAGE_SIZE) {
> unsigned long pfn = PM_NOT_PRESENT;
> - if (pte_present(*pte))
> + if (is_swap_pte(*pte))
Hmm, unlikely?
> + pfn = swap_pte_to_pagemap_entry(*pte);
> + else if (pte_present(*pte))
> pfn = pte_pfn(*pte);
> err = add_to_pagemap(addr, pfn, pm);
> if (err)
> _
--
Mathematics is the supreme nostalgia of our time.
* Re: [RFC][PATCH 8/9] pagemap: use page walker pte_hole() helper
2007-08-21 20:42 ` [RFC][PATCH 8/9] pagemap: use page walker pte_hole() helper Dave Hansen
@ 2007-08-21 22:01 ` Matt Mackall
2007-08-21 22:38 ` Dave Hansen
0 siblings, 1 reply; 22+ messages in thread
From: Matt Mackall @ 2007-08-21 22:01 UTC (permalink / raw)
To: Dave Hansen; +Cc: linux-mm
On Tue, Aug 21, 2007 at 01:42:57PM -0700, Dave Hansen wrote:
>
> I tried to do this a bit more incrementally, but it ended
> up just looking like an even worse mess. So, this does
> a couple of different things.
>
> 1. use page walker pte_hole() helper, which
> 2. gets rid of the "next" value in "struct pagemapread"
> 3. allow 1-3 byte reads from pagemap. This at least
> ensures that we don't write over user memory if they
> ask us for 1 byte and we tried to write 4.
> 4. Instead of trying to calculate what ranges of pages
> we are going to walk, simply start walking them,
> then return PAGEMAP_END_OF_BUFFER at the end of the
> buffer, error out, and stop walking.
> 5. enforce that reads must be aligned to PM_ENTRY_BYTES
>
> Note that, despite these functional additions, and some
> nice new comments, this patch still removes more code
> than it adds.
Ok, but deleting my debugging printks doesn't count!
> Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
> ---
>
> lxc-dave/fs/proc/task_mmu.c | 123 ++++++++++++++++++++------------------------
> 1 file changed, 56 insertions(+), 67 deletions(-)
>
> diff -puN fs/proc/task_mmu.c~bail-instead-of-tracking fs/proc/task_mmu.c
> --- lxc/fs/proc/task_mmu.c~bail-instead-of-tracking 2007-08-21 13:30:54.000000000 -0700
> +++ lxc-dave/fs/proc/task_mmu.c 2007-08-21 13:30:54.000000000 -0700
> @@ -501,7 +501,6 @@ const struct file_operations proc_clear_
> };
>
> struct pagemapread {
> - unsigned long next;
> unsigned long pos;
> size_t count;
> int index;
> @@ -510,58 +509,64 @@ struct pagemapread {
>
> #define PM_ENTRY_BYTES sizeof(unsigned long)
> #define PM_NOT_PRESENT ((unsigned long)-1)
> +#define PAGEMAP_END_OF_BUFFER 1
>
> static int add_to_pagemap(unsigned long addr, unsigned long pfn,
> struct pagemapread *pm)
> {
> - __put_user(pfn, pm->out);
> - pm->out++;
> - pm->next = addr + PAGE_SIZE;
> + int out_len = PM_ENTRY_BYTES;
> + if (pm->count < PM_ENTRY_BYTES)
> + out_len = pm->count;
This wants an unlikely.
> + copy_to_user(pm->out, &pfn, out_len);
And I think we want to keep the put_user in the fast path.
> pm->pos += PM_ENTRY_BYTES;
> pm->count -= PM_ENTRY_BYTES;
> + if (pm->count <= 0)
> + return PAGEMAP_END_OF_BUFFER;
> return 0;
> }
>
> +static int pagemap_pte_hole(unsigned long start, unsigned long end,
> + void *private)
> +{
> + struct pagemapread *pm = private;
> + unsigned long addr;
> + int err = 0;
> + for (addr = start; addr < end; addr += PAGE_SIZE) {
> + err = add_to_pagemap(addr, PM_NOT_PRESENT, pm);
> + if (err)
> + break;
> + }
> + return err;
> +}
> +
> static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
> void *private)
> {
> struct pagemapread *pm = private;
> pte_t *pte;
> - int err;
> + int err = 0;
>
> pte = pte_offset_map(pmd, addr);
> for (; addr != end; pte++, addr += PAGE_SIZE) {
> - if (addr < pm->next)
> - continue;
> - if (!pte_present(*pte))
> - err = add_to_pagemap(addr, PM_NOT_PRESENT, pm);
> - else
> - err = add_to_pagemap(addr, pte_pfn(*pte), pm);
> + unsigned long pfn = PM_NOT_PRESENT;
> + if (pte_present(*pte))
> + pfn = pte_pfn(*pte);
> + err = add_to_pagemap(addr, pfn, pm);
> if (err)
> return err;
> - if (pm->count == 0)
> - break;
> }
> pte_unmap(pte - 1);
>
> cond_resched();
>
> - return 0;
> + return err;
> }
>
> -static int pagemap_fill(struct pagemapread *pm, unsigned long end)
> +static struct mm_walk pagemap_walk =
> {
> - int ret;
> -
> - while (pm->next != end && pm->count > 0) {
> - ret = add_to_pagemap(pm->next, -1UL, pm);
> - if (ret)
> - return ret;
> - }
> - return 0;
> -}
> -
> -static struct mm_walk pagemap_walk = { .pmd_entry = pagemap_pte_range };
> + .pmd_entry = pagemap_pte_range,
> + .pte_hole = pagemap_pte_hole
> +};
>
> /*
> * /proc/pid/pagemap - an array mapping virtual pages to pfns
> @@ -589,9 +594,8 @@ static ssize_t pagemap_read(struct file
> struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
> unsigned long src = *ppos;
> struct page **pages, *page;
> - unsigned long addr, end, vend, svpfn, evpfn, uaddr, uend;
> + unsigned long uaddr, uend;
> struct mm_struct *mm;
> - struct vm_area_struct *vma;
> struct pagemapread pm;
> int pagecount;
> int ret = -ESRCH;
> @@ -603,16 +607,10 @@ static ssize_t pagemap_read(struct file
> if (!ptrace_may_attach(task))
> goto out;
>
> - ret = -EIO;
> - svpfn = src / PM_ENTRY_BYTES;
> - addr = PAGE_SIZE * svpfn;
> - if (svpfn * PM_ENTRY_BYTES != src)
> + ret = -EINVAL;
> + /* file position must be aligned */
> + if (*ppos % PM_ENTRY_BYTES)
> goto out;
> - evpfn = min((src + count) / sizeof(unsigned long) - 1,
> - ((~0UL) >> PAGE_SHIFT) + 1);
> - count = (evpfn - svpfn) * PM_ENTRY_BYTES;
> - end = PAGE_SIZE * evpfn;
> - //printk("src %ld svpfn %d evpfn %d count %d\n", src, svpfn, evpfn, count);
>
> ret = 0;
> mm = get_task_mm(task);
> @@ -632,44 +630,36 @@ static ssize_t pagemap_read(struct file
> 1, 0, pages, NULL);
> up_read(¤t->mm->mmap_sem);
>
> - //printk("%x(%x):%x %d@%ld (%d pages) -> %d\n", uaddr, buf, uend, count, src, pagecount, ret);
> if (ret < 0)
> goto out_free;
>
> - pm.next = addr;
> pm.pos = src;
> pm.count = count;
> pm.out = (unsigned long __user *)buf;
>
> - down_read(&mm->mmap_sem);
> - vma = find_vma(mm, pm.next);
> - while (pm.count > 0 && vma) {
> - if (!ptrace_may_attach(task)) {
> - ret = -EIO;
> - up_read(&mm->mmap_sem);
> - goto out_release;
> - }
> - vend = min(vma->vm_end - 1, end - 1) + 1;
> - ret = pagemap_fill(&pm, vend);
> - if (ret || !pm.count)
> - break;
> - vend = min(vma->vm_end - 1, end - 1) + 1;
> - ret = walk_page_range(mm, vma->vm_start, vend,
> - &pagemap_walk, &pm);
> - vma = vma->vm_next;
> + if (!ptrace_may_attach(task)) {
> + ret = -EIO;
> + } else {
> + unsigned long src = *ppos;
> + unsigned long svpfn = src / PM_ENTRY_BYTES;
> + unsigned long start_vaddr = svpfn << PAGE_SHIFT;
> + unsigned long end_vaddr = TASK_SIZE_OF(task);
> + /*
> + * The odds are that this will stop walking way
> + * before end_vaddr, because the length of the
> + * user buffer is tracked in "pm", and the walk
> + * will stop when we hit the end of the buffer.
> + */
> + ret = walk_page_range(mm, start_vaddr, end_vaddr,
> + &pagemap_walk, &pm);
> + if (ret == PAGEMAP_END_OF_BUFFER)
> + ret = 0;
> + /* don't need mmap_sem for these, but this looks cleaner */
> + *ppos = pm.pos;
> + if (!ret)
> + ret = pm.pos - src;
> }
> - up_read(&mm->mmap_sem);
> -
> - //printk("before fill at %ld\n", pm.pos);
> - ret = pagemap_fill(&pm, end);
> -
> - printk("after fill at %ld\n", pm.pos);
> - *ppos = pm.pos;
> - if (!ret)
> - ret = pm.pos - src;
>
> -out_release:
> - printk("releasing pages\n");
> for (; pagecount; pagecount--) {
> page = pages[pagecount-1];
> if (!PageReserved(page))
> @@ -682,7 +672,6 @@ out_free:
> out_task:
> put_task_struct(task);
> out:
> - printk("returning\n");
> return ret;
> }
>
> _
--
Mathematics is the supreme nostalgia of our time.
* Re: [RFC][PATCH 1/9] /proc/pid/pagemap update
2007-08-21 20:42 [RFC][PATCH 1/9] /proc/pid/pagemap update Dave Hansen
` (8 preceding siblings ...)
2007-08-21 21:23 ` [RFC][PATCH 1/9] /proc/pid/pagemap update Matt Mackall
@ 2007-08-21 22:07 ` Matt Mackall
9 siblings, 0 replies; 22+ messages in thread
From: Matt Mackall @ 2007-08-21 22:07 UTC (permalink / raw)
To: Dave Hansen; +Cc: linux-mm
On Tue, Aug 21, 2007 at 01:42:48PM -0700, Dave Hansen wrote:
>
> This is a series of patches to update /proc/pid/pagemap,
> to simplify the code, and fix bugs which caused userspace
> memory corruption.
>
> Since it is just in -mm, we should probably roll all of
> these together and just update it all at once, or send
> a simple drop-on replacement patch. These patches are
> all here mostly to help with review.
>
> Matt, if you're OK with these, do you mind if I send
> the update into -mm, or would you like to do it?
I suppose I'll give this a spin tomorrow along with some other bits I
have pending and push it to Andrew.
> --
> From: Matt Mackall <mpm@selenic.com>
>
> On Mon, Aug 06, 2007 at 01:44:19AM -0500, Dave Boutcher wrote:
> >
> > Matt, this patch set replaces the two patches I sent earlier and
> > contains additional fixes. I've done some reasonably rigorous testing
> > on x86_64, but not on a 32 bit arch. I'm pretty sure this isn't worse
> > than what's in mm right now, which has some user-space corruption and
> > a nasty infinite kernel loop. YMMV.
>
> Dave, here's my current work-in-progress patch to deal with a couple
> locking issues, primarily a possible deadlock on the mm semaphore that
> can occur if someone unmaps the target buffer while we're walking the
> tree. It currently hangs on my box and I haven't had any free cycles
> to finish debugging it, but you might want to take a peek at it.
>
> Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
> ---
>
> lxc-dave/fs/proc/task_mmu.c | 121 ++++++++++++++++++++------------------------
> 1 file changed, 55 insertions(+), 66 deletions(-)
>
> diff -puN fs/proc/task_mmu.c~Re-_PATCH_0_3_proc_pid_pagemap_fixes fs/proc/task_mmu.c
> --- lxc/fs/proc/task_mmu.c~Re-_PATCH_0_3_proc_pid_pagemap_fixes 2007-08-21 13:30:50.000000000 -0700
> +++ lxc-dave/fs/proc/task_mmu.c 2007-08-21 13:30:50.000000000 -0700
> @@ -501,37 +501,21 @@ const struct file_operations proc_clear_
> };
>
> struct pagemapread {
> - struct mm_struct *mm;
> unsigned long next;
> - unsigned long *buf;
> - pte_t *ptebuf;
> unsigned long pos;
> size_t count;
> int index;
> - char __user *out;
> + unsigned long __user *out;
> };
>
> -static int flush_pagemap(struct pagemapread *pm)
> -{
> - int n = min(pm->count, pm->index * sizeof(unsigned long));
> - if (copy_to_user(pm->out, pm->buf, n))
> - return -EFAULT;
> - pm->out += n;
> - pm->pos += n;
> - pm->count -= n;
> - pm->index = 0;
> - cond_resched();
> - return 0;
> -}
> -
> static int add_to_pagemap(unsigned long addr, unsigned long pfn,
> struct pagemapread *pm)
> {
> - pm->buf[pm->index++] = pfn;
> + __put_user(pfn, pm->out);
> + pm->out++;
> + pm->pos += sizeof(unsigned long);
> + pm->count -= sizeof(unsigned long);
> pm->next = addr + PAGE_SIZE;
> - if (pm->index * sizeof(unsigned long) >= PAGE_SIZE ||
> - pm->index * sizeof(unsigned long) >= pm->count)
> - return flush_pagemap(pm);
> return 0;
> }
>
> @@ -543,14 +527,6 @@ static int pagemap_pte_range(pmd_t *pmd,
> int err;
>
> pte = pte_offset_map(pmd, addr);
> -
> -#ifdef CONFIG_HIGHPTE
> - /* copy PTE directory to temporary buffer and unmap it */
> - memcpy(pm->ptebuf, pte, PAGE_ALIGN((unsigned long)pte) - (unsigned long)pte);
> - pte_unmap(pte);
> - pte = pm->ptebuf;
> -#endif
> -
> for (; addr != end; pte++, addr += PAGE_SIZE) {
> if (addr < pm->next)
> continue;
> @@ -560,11 +536,12 @@ static int pagemap_pte_range(pmd_t *pmd,
> err = add_to_pagemap(addr, pte_pfn(*pte), pm);
> if (err)
> return err;
> + if (pm->count == 0)
> + break;
> }
> -
> -#ifndef CONFIG_HIGHPTE
> pte_unmap(pte - 1);
> -#endif
> +
> + cond_resched();
>
> return 0;
> }
> @@ -573,7 +550,7 @@ static int pagemap_fill(struct pagemapre
> {
> int ret;
>
> - while (pm->next != end) {
> + while (pm->next != end && pm->count > 0) {
> ret = add_to_pagemap(pm->next, -1UL, pm);
> if (ret)
> return ret;
> @@ -608,15 +585,16 @@ static ssize_t pagemap_read(struct file
> {
> struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
> unsigned long src = *ppos;
> - unsigned long *page;
> - unsigned long addr, end, vend, svpfn, evpfn;
> + struct page **pages, *page;
> + unsigned long addr, end, vend, svpfn, evpfn, uaddr, uend;
> struct mm_struct *mm;
> struct vm_area_struct *vma;
> struct pagemapread pm;
> + int pagecount;
> int ret = -ESRCH;
>
> if (!task)
> - goto out_no_task;
> + goto out;
>
> ret = -EACCES;
> if (!ptrace_may_attach(task))
> @@ -628,39 +606,43 @@ static ssize_t pagemap_read(struct file
> if ((svpfn + 1) * sizeof(unsigned long) != src)
> goto out;
> evpfn = min((src + count) / sizeof(unsigned long) - 1,
> - ((~0UL) >> PAGE_SHIFT) + 1);
> + ((~0UL) >> PAGE_SHIFT) + 1) - 1;
> count = (evpfn - svpfn) * sizeof(unsigned long);
> end = PAGE_SIZE * evpfn;
> -
> - ret = -ENOMEM;
> - page = kzalloc(PAGE_SIZE, GFP_USER);
> - if (!page)
> - goto out;
> -
> -#ifdef CONFIG_HIGHPTE
> - pm.ptebuf = kzalloc(PAGE_SIZE, GFP_USER);
> - if (!pm.ptebuf)
> - goto out_free;
> -#endif
> + //printk("src %ld svpfn %d evpfn %d count %d\n", src, svpfn, evpfn, count);
>
> ret = 0;
> mm = get_task_mm(task);
> if (!mm)
> - goto out_freepte;
> + goto out;
> +
> + ret = -ENOMEM;
> + uaddr = (unsigned long)buf & ~(PAGE_SIZE-1);
> + uend = (unsigned long)(buf + count);
> + pagecount = (uend - uaddr + PAGE_SIZE-1) / PAGE_SIZE;
> + pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL);
> + if (!pages)
> + goto out_task;
> +
> + down_read(¤t->mm->mmap_sem);
> + ret = get_user_pages(current, current->mm, uaddr, pagecount,
> + 1, 0, pages, NULL);
> + up_read(¤t->mm->mmap_sem);
> +
> + //printk("%x(%x):%x %d@%ld (%d pages) -> %d\n", uaddr, buf, uend, count, src, pagecount, ret);
> + if (ret < 0)
> + goto out_free;
>
> - pm.mm = mm;
> pm.next = addr;
> - pm.buf = page;
> pm.pos = src;
> pm.count = count;
> - pm.index = 0;
> - pm.out = buf;
> + pm.out = (unsigned long __user *)buf;
>
> if (svpfn == -1) {
> - ((char *)page)[0] = (ntohl(1) != 1);
> - ((char *)page)[1] = PAGE_SHIFT;
> - ((char *)page)[2] = sizeof(unsigned long);
> - ((char *)page)[3] = sizeof(unsigned long);
> + put_user((char)(ntohl(1) != 1), buf);
> + put_user((char)PAGE_SHIFT, buf + 1);
> + put_user((char)sizeof(unsigned long), buf + 2);
> + put_user((char)sizeof(unsigned long), buf + 3);
> add_to_pagemap(pm.next, page[0], &pm);
> }
>
> @@ -669,7 +651,8 @@ static ssize_t pagemap_read(struct file
> while (pm.count > 0 && vma) {
> if (!ptrace_may_attach(task)) {
> ret = -EIO;
> - goto out_mm;
> + up_read(&mm->mmap_sem);
> + goto out_release;
> }
> vend = min(vma->vm_end - 1, end - 1) + 1;
> ret = pagemap_fill(&pm, vend);
> @@ -682,23 +665,29 @@ static ssize_t pagemap_read(struct file
> }
> up_read(&mm->mmap_sem);
>
> + //printk("before fill at %ld\n", pm.pos);
> ret = pagemap_fill(&pm, end);
>
> + printk("after fill at %ld\n", pm.pos);
> *ppos = pm.pos;
> if (!ret)
> ret = pm.pos - src;
>
> -out_mm:
> +out_release:
> + printk("releasing pages\n");
> + for (; pagecount; pagecount--) {
> + page = pages[pagecount-1];
> + if (!PageReserved(page))
> + SetPageDirty(page);
> + page_cache_release(page);
> + }
> mmput(mm);
> -out_freepte:
> -#ifdef CONFIG_HIGHPTE
> - kfree(pm.ptebuf);
> out_free:
> -#endif
> - kfree(page);
> -out:
> + kfree(pages);
> +out_task:
> put_task_struct(task);
> -out_no_task:
> +out:
> + printk("returning\n");
> return ret;
> }
>
> _
--
Mathematics is the supreme nostalgia of our time.
* Re: [RFC][PATCH 9/9] pagemap: export swap ptes
2007-08-21 21:49 ` Matt Mackall
@ 2007-08-21 22:26 ` Dave Hansen
0 siblings, 0 replies; 22+ messages in thread
From: Dave Hansen @ 2007-08-21 22:26 UTC (permalink / raw)
To: Matt Mackall; +Cc: linux-mm
On Tue, 2007-08-21 at 16:49 -0500, Matt Mackall wrote:
> On Tue, Aug 21, 2007 at 01:42:59PM -0700, Dave Hansen wrote:
> >
> > In addition to understanding which physical pages are
> > used by a process, it would also be very nice to
> > enumerate how much swap space a process is using.
> >
> > This patch enables /proc/<pid>/pagemap to display
> > swap ptes. In the process, it also changes the
> > constant that we used to indicate non-present ptes
> > before.
>
> Nice. Can you update the doc comment on pagemap_read to match?
Sure.
> > +unsigned long swap_pte_to_pagemap_entry(pte_t pte)
> > +{
> > + unsigned long ret = 0;
>
> Unused assignment?
Yep. I'll kill that.
> > + swp_entry_t entry = pte_to_swp_entry(pte);
> > + unsigned long offset;
> > + unsigned long swap_file_nr;
> > +
> > + offset = swp_offset(entry);
> > + swap_file_nr = swp_type(entry);
> > + ret = PM_SWAP | swap_file_nr | (offset << MAX_SWAPFILES_SHIFT);
> > + return ret;
>
> How about just return <expression>?
I had intended to put some debugging in there, but I'll take it out for
now.
> This is a little problematic as we've added another not very visible
> magic number to the mix. We're also not masking off swp_offset to
> avoid colliding with our reserved bits. And we're also unpacking an
> arch-independent value (swp_entry_t) just to repack it in more or less
> the same shape? Or are we reversing the fields?
I did it that way because swp_entry_t is implemented as an opaque type,
and we don't have any real guarantees that it will stay in its current
format or that it will truly _stay_ arch-independent. All we know is
that running swp_offset()/swp_type() on it will get us the offset and
swap file.
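Just to make the layout concrete, a hypothetical (untested) userspace
decoder for this packing would look something like the sketch below; the
fact that it has to know MAX_SWAPFILES_SHIFT and the reserved-bit mask is
basically the visibility problem you're pointing at:

#include <stdio.h>

/* hypothetical: 'reserved_mask' and 'swapfiles_shift' stand in for
 * PM_RESERVED_MASK and MAX_SWAPFILES_SHIFT, which userspace would
 * somehow have to know */
static void decode_pagemap_swap_entry(unsigned long entry,
				      unsigned long reserved_mask,
				      int swapfiles_shift)
{
	unsigned long swap_file_nr = entry & ((1UL << swapfiles_shift) - 1);
	unsigned long offset = (entry & ~reserved_mask) >> swapfiles_shift;

	printf("swap file %lu, offset %lu\n", swap_file_nr, offset);
}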
> > static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
> > void *private)
> > {
> > @@ -549,7 +570,9 @@ static int pagemap_pte_range(pmd_t *pmd,
> > pte = pte_offset_map(pmd, addr);
> > for (; addr != end; pte++, addr += PAGE_SIZE) {
> > unsigned long pfn = PM_NOT_PRESENT;
> > - if (pte_present(*pte))
> > + if (is_swap_pte(*pte))
>
> Hmm, unlikely?
I tend to reserve unlikely()s for performance critical regions of code
or in other cases where I know the compiler is being really stupid. I
don't think this one is horribly performance critical. This whole
little section of code looks to me to be ~22 bytes on i386. It'll fit
in a cacheline. :)
-- Dave
* Re: [RFC][PATCH 8/9] pagemap: use page walker pte_hole() helper
2007-08-21 22:01 ` Matt Mackall
@ 2007-08-21 22:38 ` Dave Hansen
0 siblings, 0 replies; 22+ messages in thread
From: Dave Hansen @ 2007-08-21 22:38 UTC (permalink / raw)
To: Matt Mackall; +Cc: linux-mm
On Tue, 2007-08-21 at 17:01 -0500, Matt Mackall wrote:
> > + copy_to_user(pm->out, &pfn, out_len);
>
> And I think we want to keep the put_user in the fast path.
OK, updated:
static int add_to_pagemap(unsigned long addr, unsigned long pfn,
			  struct pagemapread *pm)
{
	/*
	 * Make sure there's room in the buffer for an
	 * entire entry.  Otherwise, only copy part of
	 * the pfn.
	 */
	if (pm->count >= PM_ENTRY_BYTES)
		__put_user(pfn, pm->out);
	else
		copy_to_user(pm->out, &pfn, pm->count);
	pm->pos += PM_ENTRY_BYTES;
	pm->count -= PM_ENTRY_BYTES;
	if (pm->count <= 0)
		return PAGEMAP_END_OF_BUFFER;
	return 0;
}
I'll patch-bomb you with the (small) updates I just made.
-- Dave
* Re: [RFC][PATCH 7/9] pagewalk: add handler for empty ranges
2007-08-21 20:42 ` [RFC][PATCH 7/9] pagewalk: add handler for empty ranges Dave Hansen
@ 2007-08-30 7:28 ` Nick Piggin
2007-09-03 12:32 ` Matt Mackall
0 siblings, 1 reply; 22+ messages in thread
From: Nick Piggin @ 2007-08-30 7:28 UTC (permalink / raw)
To: Dave Hansen; +Cc: mpm, linux-mm
Dave Hansen wrote:
> @@ -27,25 +23,23 @@ static int walk_pmd_range(pud_t *pud, un
> {
> pmd_t *pmd;
> unsigned long next;
> - int err;
> + int err = 0;
>
> for (pmd = pmd_offset(pud, addr); addr != end;
> pmd++, addr = next) {
> next = pmd_addr_end(addr, end);
While you're there, do you mind fixing the actual page table walking so
that it follows the normal form?
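For reference, the shape I have in mind is roughly the one below (just a
sketch of the usual convention, not tested against this series; with the
new pte_hole() handler, the pmd_none case would presumably call that
instead of simply continuing):

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		/* walk the pte entries covering [addr, next) here */
	} while (pmd++, addr = next, addr != end);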
--
SUSE Labs, Novell Inc.
* Re: [RFC][PATCH 7/9] pagewalk: add handler for empty ranges
2007-08-30 7:28 ` Nick Piggin
@ 2007-09-03 12:32 ` Matt Mackall
0 siblings, 0 replies; 22+ messages in thread
From: Matt Mackall @ 2007-09-03 12:32 UTC (permalink / raw)
To: Nick Piggin; +Cc: Dave Hansen, linux-mm
On Thu, Aug 30, 2007 at 05:28:02PM +1000, Nick Piggin wrote:
> Dave Hansen wrote:
>
> >@@ -27,25 +23,23 @@ static int walk_pmd_range(pud_t *pud, un
> > {
> > pmd_t *pmd;
> > unsigned long next;
> >- int err;
> >+ int err = 0;
> >
> > for (pmd = pmd_offset(pud, addr); addr != end;
> > pmd++, addr = next) {
> > next = pmd_addr_end(addr, end);
>
> While you're there, do you mind fixing the actual page table walking so
> that it follows the normal form?
Already done in my local series.
--
Mathematics is the supreme nostalgia of our time.
end of thread, other threads:[~2007-09-03 12:32 UTC | newest]
Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-08-21 20:42 [RFC][PATCH 1/9] /proc/pid/pagemap update Dave Hansen
2007-08-21 20:42 ` [RFC][PATCH 2/9] pagemap: remove file header Dave Hansen
2007-08-21 21:24 ` Matt Mackall
2007-08-21 20:42 ` [RFC][PATCH 3/9] pagemap: use PAGE_MASK/PAGE_ALIGN() Dave Hansen
2007-08-21 21:25 ` Matt Mackall
2007-08-21 20:42 ` [RFC][PATCH 4/9] pagemap: remove open-coded sizeof(unsigned long) Dave Hansen
2007-08-21 21:26 ` Matt Mackall
2007-08-21 20:42 ` [RFC][PATCH 5/9] introduce TASK_SIZE_OF() for all arches Dave Hansen
2007-08-21 20:42 ` [RFC][PATCH 6/9] pagemap: give -1's a name Dave Hansen
2007-08-21 21:27 ` Matt Mackall
2007-08-21 20:42 ` [RFC][PATCH 7/9] pagewalk: add handler for empty ranges Dave Hansen
2007-08-30 7:28 ` Nick Piggin
2007-09-03 12:32 ` Matt Mackall
2007-08-21 20:42 ` [RFC][PATCH 8/9] pagemap: use page walker pte_hole() helper Dave Hansen
2007-08-21 22:01 ` Matt Mackall
2007-08-21 22:38 ` Dave Hansen
2007-08-21 20:42 ` [RFC][PATCH 9/9] pagemap: export swap ptes Dave Hansen
2007-08-21 21:49 ` Matt Mackall
2007-08-21 22:26 ` Dave Hansen
2007-08-21 21:23 ` [RFC][PATCH 1/9] /proc/pid/pagemap update Matt Mackall
2007-08-21 21:26 ` Dave Hansen
2007-08-21 22:07 ` Matt Mackall