* [PATCH 1/13] maps#2: Uninline some functions in the page walker
From: Matt Mackall @ 2007-04-06 22:03 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
Uninline some functions in the page walker
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c 2007-03-24 21:33:42.000000000 -0500
+++ mm/fs/proc/task_mmu.c 2007-03-24 21:33:47.000000000 -0500
@@ -284,7 +284,7 @@ static void clear_refs_pte_range(struct
cond_resched();
}
-static inline void walk_pmd_range(struct pmd_walker *walker, pud_t *pud,
+static void walk_pmd_range(struct pmd_walker *walker, pud_t *pud,
unsigned long addr, unsigned long end)
{
pmd_t *pmd;
@@ -299,7 +299,7 @@ static inline void walk_pmd_range(struct
}
}
-static inline void walk_pud_range(struct pmd_walker *walker, pgd_t *pgd,
+static void walk_pud_range(struct pmd_walker *walker, pgd_t *pgd,
unsigned long addr, unsigned long end)
{
pud_t *pud;
@@ -323,11 +323,11 @@ static inline void walk_pud_range(struct
* Recursively walk the page table for the memory area in a VMA, calling
* a callback for every bottom-level (PTE) page table.
*/
-static inline void walk_page_range(struct vm_area_struct *vma,
- void (*action)(struct vm_area_struct *,
- pmd_t *, unsigned long,
- unsigned long, void *),
- void *private)
+static void walk_page_range(struct vm_area_struct *vma,
+ void (*action)(struct vm_area_struct *,
+ pmd_t *, unsigned long,
+ unsigned long, void *),
+ void *private)
{
unsigned long addr = vma->vm_start;
unsigned long end = vma->vm_end;
* [PATCH 2/13] maps#2: Eliminate the pmd_walker struct in the page walker
From: Matt Mackall @ 2007-04-06 22:03 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
Eliminate the pmd_walker struct in the page walker
This slightly simplifies things for the next few cleanups.
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c 2007-03-24 21:33:47.000000000 -0500
+++ mm/fs/proc/task_mmu.c 2007-03-24 21:33:50.000000000 -0500
@@ -116,6 +116,7 @@ static void pad_len_spaces(struct seq_fi
struct mem_size_stats
{
+ struct vm_area_struct *vma;
unsigned long resident;
unsigned long shared_clean;
unsigned long shared_dirty;
@@ -124,13 +125,6 @@ struct mem_size_stats
unsigned long referenced;
};
-struct pmd_walker {
- struct vm_area_struct *vma;
- void *private;
- void (*action)(struct vm_area_struct *, pmd_t *, unsigned long,
- unsigned long, void *);
-};
-
static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
{
struct proc_maps_private *priv = m->private;
@@ -218,11 +212,11 @@ static int show_map(struct seq_file *m,
return show_map_internal(m, v, NULL);
}
-static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
+static void smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
void *private)
{
struct mem_size_stats *mss = private;
+ struct vm_area_struct *vma = mss->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
struct page *page;
@@ -258,10 +252,10 @@ static void smaps_pte_range(struct vm_ar
cond_resched();
}
-static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- void *private)
+static void clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, void *private)
{
+ struct vm_area_struct *vma = private;
pte_t *pte, ptent;
spinlock_t *ptl;
struct page *page;
@@ -284,8 +278,10 @@ static void clear_refs_pte_range(struct
cond_resched();
}
-static void walk_pmd_range(struct pmd_walker *walker, pud_t *pud,
- unsigned long addr, unsigned long end)
+static void walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ void (*action)(pmd_t *, unsigned long,
+ unsigned long, void *),
+ void *private)
{
pmd_t *pmd;
unsigned long next;
@@ -295,12 +291,14 @@ static void walk_pmd_range(struct pmd_wa
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- walker->action(walker->vma, pmd, addr, next, walker->private);
+ action(pmd, addr, next, private);
}
}
-static void walk_pud_range(struct pmd_walker *walker, pgd_t *pgd,
- unsigned long addr, unsigned long end)
+static void walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ void (*action)(pmd_t *, unsigned long,
+ unsigned long, void *),
+ void *private)
{
pud_t *pud;
unsigned long next;
@@ -310,7 +308,7 @@ static void walk_pud_range(struct pmd_wa
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- walk_pmd_range(walker, pud, addr, next);
+ walk_pmd_range(pud, addr, next, action, private);
}
}
@@ -324,18 +322,12 @@ static void walk_pud_range(struct pmd_wa
* a callback for every bottom-level (PTE) page table.
*/
static void walk_page_range(struct vm_area_struct *vma,
- void (*action)(struct vm_area_struct *,
- pmd_t *, unsigned long,
+ void (*action)(pmd_t *, unsigned long,
unsigned long, void *),
void *private)
{
unsigned long addr = vma->vm_start;
unsigned long end = vma->vm_end;
- struct pmd_walker walker = {
- .vma = vma,
- .private = private,
- .action = action,
- };
pgd_t *pgd;
unsigned long next;
@@ -344,7 +336,7 @@ static void walk_page_range(struct vm_ar
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- walk_pud_range(&walker, pgd, addr, next);
+ walk_pud_range(pgd, addr, next, action, private);
}
}
@@ -354,6 +346,7 @@ static int show_smap(struct seq_file *m,
struct mem_size_stats mss;
memset(&mss, 0, sizeof mss);
+ mss.vma = vma;
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
walk_page_range(vma, smaps_pte_range, &mss);
return show_map_internal(m, v, &mss);
@@ -366,7 +359,7 @@ void clear_refs_smap(struct mm_struct *m
down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next)
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
- walk_page_range(vma, clear_refs_pte_range, NULL);
+ walk_page_range(vma, clear_refs_pte_range, vma);
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
}
* [PATCH 3/13] maps#2: Remove vma from args in the page walker
From: Matt Mackall @ 2007-04-06 22:03 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
Remove vma from args in the page walker
This makes the walker more generic.
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c 2007-03-24 21:33:50.000000000 -0500
+++ mm/fs/proc/task_mmu.c 2007-03-24 21:33:52.000000000 -0500
@@ -313,25 +313,26 @@ static void walk_pud_range(pgd_t *pgd, u
}
/*
- * walk_page_range - walk the page tables of a VMA with a callback
- * @vma - VMA to walk
+ * walk_page_range - walk a memory map's page tables with a callback
+ * @mm - memory map to walk
+ * @addr - starting address
+ * @end - ending address
* @action - callback invoked for every bottom-level (PTE) page table
* @private - private data passed to the callback function
*
* Recursively walk the page table for the memory area in a VMA, calling
* a callback for every bottom-level (PTE) page table.
*/
-static void walk_page_range(struct vm_area_struct *vma,
+static void walk_page_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end,
void (*action)(pmd_t *, unsigned long,
unsigned long, void *),
void *private)
{
- unsigned long addr = vma->vm_start;
- unsigned long end = vma->vm_end;
pgd_t *pgd;
unsigned long next;
- for (pgd = pgd_offset(vma->vm_mm, addr); addr != end;
+ for (pgd = pgd_offset(mm, addr); addr != end;
pgd++, addr = next) {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
@@ -348,7 +349,8 @@ static int show_smap(struct seq_file *m,
memset(&mss, 0, sizeof mss);
mss.vma = vma;
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
- walk_page_range(vma, smaps_pte_range, &mss);
+ walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
+ smaps_pte_range, &mss);
return show_map_internal(m, v, &mss);
}
@@ -359,7 +361,8 @@ void clear_refs_smap(struct mm_struct *m
down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next)
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
- walk_page_range(vma, clear_refs_pte_range, vma);
+ walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
+ clear_refs_pte_range, vma);
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
}
* [PATCH 4/13] maps#2: Propagate errors from callback in page walker
From: Matt Mackall @ 2007-04-06 22:03 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
Propagate errors from callback in page walker
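With this a callback can abort the walk early by returning nonzero. A
minimal sketch against the interface as of this patch (the callback
name and payload are illustrative, not part of the series):

	/* hypothetical user: record the first populated pmd and stop */
	static int find_first_pmd(pmd_t *pmd, unsigned long addr,
				  unsigned long end, void *private)
	{
		*(unsigned long *)private = addr;
		return 1;	/* nonzero unwinds out of walk_page_range() */
	}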
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c 2007-03-24 21:33:52.000000000 -0500
+++ mm/fs/proc/task_mmu.c 2007-03-24 21:33:58.000000000 -0500
@@ -212,8 +212,8 @@ static int show_map(struct seq_file *m,
return show_map_internal(m, v, NULL);
}
-static void smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
- void *private)
+static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ void *private)
{
struct mem_size_stats *mss = private;
struct vm_area_struct *vma = mss->vma;
@@ -250,10 +250,11 @@ static void smaps_pte_range(pmd_t *pmd,
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
+ return 0;
}
-static void clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, void *private)
+static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, void *private)
{
struct vm_area_struct *vma = private;
pte_t *pte, ptent;
@@ -276,40 +277,51 @@ static void clear_refs_pte_range(pmd_t *
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
+ return 0;
}
-static void walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
- void (*action)(pmd_t *, unsigned long,
- unsigned long, void *),
- void *private)
+static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ int (*action)(pmd_t *, unsigned long,
+ unsigned long, void *),
+ void *private)
{
pmd_t *pmd;
unsigned long next;
+ int err;
for (pmd = pmd_offset(pud, addr); addr != end;
pmd++, addr = next) {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- action(pmd, addr, next, private);
+ err = action(pmd, addr, next, private);
+ if (err)
+ return err;
}
+
+ return 0;
}
-static void walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
- void (*action)(pmd_t *, unsigned long,
- unsigned long, void *),
- void *private)
+static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ int (*action)(pmd_t *, unsigned long,
+ unsigned long, void *),
+ void *private)
{
pud_t *pud;
unsigned long next;
+ int err;
for (pud = pud_offset(pgd, addr); addr != end;
pud++, addr = next) {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- walk_pmd_range(pud, addr, next, action, private);
+ err = walk_pmd_range(pud, addr, next, action, private);
+ if (err)
+ return err;
}
+
+ return 0;
}
/*
@@ -323,22 +335,27 @@ static void walk_pud_range(pgd_t *pgd, u
* Recursively walk the page table for the memory area in a VMA, calling
* a callback for every bottom-level (PTE) page table.
*/
-static void walk_page_range(struct mm_struct *mm,
- unsigned long addr, unsigned long end,
- void (*action)(pmd_t *, unsigned long,
- unsigned long, void *),
- void *private)
+static int walk_page_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end,
+ int (*action)(pmd_t *, unsigned long,
+ unsigned long, void *),
+ void *private)
{
pgd_t *pgd;
unsigned long next;
+ int err;
for (pgd = pgd_offset(mm, addr); addr != end;
pgd++, addr = next) {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- walk_pud_range(pgd, addr, next, action, private);
+ err = walk_pud_range(pgd, addr, next, action, private);
+ if (err)
+ return err;
}
+
+ return 0;
}
static int show_smap(struct seq_file *m, void *v)
* [PATCH 5/13] maps#2: Add callbacks for each level to page walker
From: Matt Mackall @ 2007-04-06 22:03 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
Add callbacks for each level to page walker
This allows iterating over all levels of the page tables. Recursion
continues to the depth of the lowest supplied callback.
This makes the page walker nearly completely generic and should allow
it to replace some other hand-rolled page table walkers.
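For illustration, a walker that only cares about individual PTEs could
now be written roughly like this (the callback and counter are made up
for the example, not part of the patch):

	/* hypothetical user: count populated ptes in [start, end) */
	static int count_pte(pte_t *pte, unsigned long addr,
			     unsigned long end, void *private)
	{
		(*(unsigned long *)private)++;
		return 0;
	}

	static struct mm_walk count_walk = { .pte_entry = count_pte };

	/* in some caller: */
	unsigned long count = 0;
	walk_page_range(mm, start, end, &count_walk, &count);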
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c 2007-03-24 21:33:58.000000000 -0500
+++ mm/fs/proc/task_mmu.c 2007-03-24 21:34:07.000000000 -0500
@@ -280,10 +280,35 @@ static int clear_refs_pte_range(pmd_t *p
return 0;
}
+struct mm_walk {
+ int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, void *);
+ int (*pud_entry)(pud_t *, unsigned long, unsigned long, void *);
+ int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, void *);
+ int (*pte_entry)(pte_t *, unsigned long, unsigned long, void *);
+};
+
+static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk, void *private)
+{
+ pte_t *pte;
+ int err;
+
+ for (pte = pte_offset_map(pmd, addr); addr != end;
+ addr += PAGE_SIZE, pte++) {
+ if (pte_none(*pte))
+ continue;
+ err = walk->pte_entry(pte, addr, addr, private);
+ if (err) {
+ pte_unmap(pte);
+ return err;
+ }
+ }
+ pte_unmap(pte);
+ return 0;
+}
+
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
- int (*action)(pmd_t *, unsigned long,
- unsigned long, void *),
- void *private)
+ struct mm_walk *walk, void *private)
{
pmd_t *pmd;
unsigned long next;
@@ -294,18 +319,22 @@ static int walk_pmd_range(pud_t *pud, un
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- err = action(pmd, addr, next, private);
- if (err)
- return err;
+ if (walk->pmd_entry) {
+ err = walk->pmd_entry(pmd, addr, next, private);
+ if (err)
+ return err;
+ }
+ if (walk->pte_entry) {
+ err = walk_pte_range(pmd, addr, next, walk, private);
+ if (err)
+ return err;
+ }
}
-
return 0;
}
static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
- int (*action)(pmd_t *, unsigned long,
- unsigned long, void *),
- void *private)
+ struct mm_walk *walk, void *private)
{
pud_t *pud;
unsigned long next;
@@ -316,11 +345,17 @@ static int walk_pud_range(pgd_t *pgd, un
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- err = walk_pmd_range(pud, addr, next, action, private);
- if (err)
- return err;
+ if (walk->pud_entry) {
+ err = walk->pud_entry(pud, addr, next, private);
+ if (err)
+ return err;
+ }
+ if (walk->pmd_entry || walk->pte_entry) {
+ err = walk_pmd_range(pud, addr, next, walk, private);
+ if (err)
+ return err;
+ }
}
-
return 0;
}
@@ -337,9 +372,7 @@ static int walk_pud_range(pgd_t *pgd, un
*/
static int walk_page_range(struct mm_struct *mm,
unsigned long addr, unsigned long end,
- int (*action)(pmd_t *, unsigned long,
- unsigned long, void *),
- void *private)
+ struct mm_walk *walk, void *private)
{
pgd_t *pgd;
unsigned long next;
@@ -350,14 +383,22 @@ static int walk_page_range(struct mm_str
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- err = walk_pud_range(pgd, addr, next, action, private);
- if (err)
- return err;
+ if (walk->pgd_entry) {
+ err = walk->pgd_entry(pgd, addr, next, private);
+ if (err)
+ return err;
+ }
+ if (walk->pud_entry || walk->pmd_entry || walk->pte_entry) {
+ err = walk_pud_range(pgd, addr, next, walk, private);
+ if (err)
+ return err;
+ }
}
-
return 0;
}
+static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range };
+
static int show_smap(struct seq_file *m, void *v)
{
struct vm_area_struct *vma = v;
@@ -367,10 +408,12 @@ static int show_smap(struct seq_file *m,
mss.vma = vma;
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
- smaps_pte_range, &mss);
+ &smaps_walk, &mss);
return show_map_internal(m, v, &mss);
}
+static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range };
+
void clear_refs_smap(struct mm_struct *mm)
{
struct vm_area_struct *vma;
@@ -379,7 +422,7 @@ void clear_refs_smap(struct mm_struct *m
for (vma = mm->mmap; vma; vma = vma->vm_next)
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
- clear_refs_pte_range, vma);
+ &clear_refs_walk, vma);
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
}
* [PATCH 6/13] maps#2: Move the page walker code to lib/
From: Matt Mackall @ 2007-04-06 22:03 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
Move the page walker code to lib/
This lets it get shared outside of proc/ and linked in only when
needed.
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c 2007-03-27 22:13:43.000000000 -0500
+++ mm/fs/proc/task_mmu.c 2007-03-27 22:13:51.000000000 -0500
@@ -280,123 +280,6 @@ static int clear_refs_pte_range(pmd_t *p
return 0;
}
-struct mm_walk {
- int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, void *);
- int (*pud_entry)(pud_t *, unsigned long, unsigned long, void *);
- int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, void *);
- int (*pte_entry)(pte_t *, unsigned long, unsigned long, void *);
-};
-
-static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
- struct mm_walk *walk, void *private)
-{
- pte_t *pte;
- int err;
-
- for (pte = pte_offset_map(pmd, addr); addr != end;
- addr += PAGE_SIZE, pte++) {
- if (pte_none(*pte))
- continue;
- err = walk->pte_entry(pte, addr, addr, private);
- if (err) {
- pte_unmap(pte);
- return err;
- }
- }
- pte_unmap(pte);
- return 0;
-}
-
-static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
- struct mm_walk *walk, void *private)
-{
- pmd_t *pmd;
- unsigned long next;
- int err;
-
- for (pmd = pmd_offset(pud, addr); addr != end;
- pmd++, addr = next) {
- next = pmd_addr_end(addr, end);
- if (pmd_none_or_clear_bad(pmd))
- continue;
- if (walk->pmd_entry) {
- err = walk->pmd_entry(pmd, addr, next, private);
- if (err)
- return err;
- }
- if (walk->pte_entry) {
- err = walk_pte_range(pmd, addr, next, walk, private);
- if (err)
- return err;
- }
- }
- return 0;
-}
-
-static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
- struct mm_walk *walk, void *private)
-{
- pud_t *pud;
- unsigned long next;
- int err;
-
- for (pud = pud_offset(pgd, addr); addr != end;
- pud++, addr = next) {
- next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud))
- continue;
- if (walk->pud_entry) {
- err = walk->pud_entry(pud, addr, next, private);
- if (err)
- return err;
- }
- if (walk->pmd_entry || walk->pte_entry) {
- err = walk_pmd_range(pud, addr, next, walk, private);
- if (err)
- return err;
- }
- }
- return 0;
-}
-
-/*
- * walk_page_range - walk a memory map's page tables with a callback
- * @mm - memory map to walk
- * @addr - starting address
- * @end - ending address
- * @action - callback invoked for every bottom-level (PTE) page table
- * @private - private data passed to the callback function
- *
- * Recursively walk the page table for the memory area in a VMA, calling
- * a callback for every bottom-level (PTE) page table.
- */
-static int walk_page_range(struct mm_struct *mm,
- unsigned long addr, unsigned long end,
- struct mm_walk *walk, void *private)
-{
- pgd_t *pgd;
- unsigned long next;
- int err;
-
- for (pgd = pgd_offset(mm, addr); addr != end;
- pgd++, addr = next) {
- next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
- continue;
- if (walk->pgd_entry) {
- err = walk->pgd_entry(pgd, addr, next, private);
- if (err)
- return err;
- }
- if (walk->pud_entry || walk->pmd_entry || walk->pte_entry) {
- err = walk_pud_range(pgd, addr, next, walk, private);
- if (err)
- return err;
- }
- }
- return 0;
-}
-
static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range };
static int show_smap(struct seq_file *m, void *v)
Index: mm/include/linux/mm.h
===================================================================
--- mm.orig/include/linux/mm.h 2007-03-27 22:13:42.000000000 -0500
+++ mm/include/linux/mm.h 2007-03-27 22:13:51.000000000 -0500
@@ -747,6 +747,16 @@ unsigned long unmap_vmas(struct mmu_gath
struct vm_area_struct *start_vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *);
+
+struct mm_walk {
+ int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, void *);
+ int (*pud_entry)(pud_t *, unsigned long, unsigned long, void *);
+ int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, void *);
+ int (*pte_entry)(pte_t *, unsigned long, unsigned long, void *);
+};
+
+int walk_page_range(struct mm_struct *, unsigned long addr, unsigned long end,
+ struct mm_walk *walk, void *private);
void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
Index: mm/lib/Makefile
===================================================================
--- mm.orig/lib/Makefile 2007-03-27 22:14:09.000000000 -0500
+++ mm/lib/Makefile 2007-03-27 22:16:49.000000000 -0500
@@ -7,7 +7,7 @@ lib-y := ctype.o string.o vsprintf.o cmd
idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \
sha1.o irq_regs.o reciprocal_div.o
-lib-$(CONFIG_MMU) += ioremap.o
+lib-$(CONFIG_MMU) += ioremap.o pagewalk.o
lib-$(CONFIG_SMP) += cpumask.o
lib-y += kobject.o kref.o kobject_uevent.o klist.o
Index: mm/lib/pagewalk.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ mm/lib/pagewalk.c 2007-03-27 22:18:37.000000000 -0500
@@ -0,0 +1,111 @@
+#include <linux/mm.h>
+
+static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk, void *private)
+{
+ pte_t *pte;
+ int err;
+
+ for (pte = pte_offset_map(pmd, addr); addr != end;
+ addr += PAGE_SIZE, pte++) {
+ if (pte_none(*pte))
+ continue;
+ err = walk->pte_entry(pte, addr, addr, private);
+ if (err) {
+ pte_unmap(pte);
+ return err;
+ }
+ }
+ pte_unmap(pte);
+ return 0;
+}
+
+static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ struct mm_walk *walk, void *private)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ int err;
+
+ for (pmd = pmd_offset(pud, addr); addr != end;
+ pmd++, addr = next) {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(pmd))
+ continue;
+ if (walk->pmd_entry) {
+ err = walk->pmd_entry(pmd, addr, next, private);
+ if (err)
+ return err;
+ }
+ if (walk->pte_entry) {
+ err = walk_pte_range(pmd, addr, next, walk, private);
+ if (err)
+ return err;
+ }
+ }
+ return 0;
+}
+
+static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk, void *private)
+{
+ pud_t *pud;
+ unsigned long next;
+ int err;
+
+ for (pud = pud_offset(pgd, addr); addr != end;
+ pud++, addr = next) {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ if (walk->pud_entry) {
+ err = walk->pud_entry(pud, addr, next, private);
+ if (err)
+ return err;
+ }
+ if (walk->pmd_entry || walk->pte_entry) {
+ err = walk_pmd_range(pud, addr, next, walk, private);
+ if (err)
+ return err;
+ }
+ }
+ return 0;
+}
+
+/*
+ * walk_page_range - walk a memory map's page tables with a callback
+ * @mm - memory map to walk
+ * @addr - starting address
+ * @end - ending address
+ * @walk - set of callbacks to invoke for each level of the tree
+ * @private - private data passed to the callback function
+ *
+ * Recursively walk the page table for the memory area in a VMA, calling
+ * a callback for every bottom-level (PTE) page table.
+ */
+int walk_page_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk, void *private)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ int err;
+
+ for (pgd = pgd_offset(mm, addr); addr != end;
+ pgd++, addr = next) {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ if (walk->pgd_entry) {
+ err = walk->pgd_entry(pgd, addr, next, private);
+ if (err)
+ return err;
+ }
+ if (walk->pud_entry || walk->pmd_entry || walk->pte_entry) {
+ err = walk_pud_range(pgd, addr, next, walk, private);
+ if (err)
+ return err;
+ }
+ }
+ return 0;
+}
* Re: [PATCH 6/13] maps#2: Move the page walker code to lib/
From: Nick Piggin @ 2007-04-11 6:35 UTC (permalink / raw)
To: Matt Mackall; +Cc: Andrew Morton, linux-kernel
Matt Mackall wrote:
> Move the page walker code to lib/
>
> This lets it get shared outside of proc/ and linked in only when
> needed.
Still should go into mm/
If it had, you might have also noticed your pagetable walking code
is completely different from how everyone else does it, and fixed
that too.
BTW. Is it the case that unused and unexported symbols don't get
pruned by the linker except inside lib/?
> [...]
--
SUSE Labs, Novell Inc.
* Re: [PATCH 6/13] maps#2: Move the page walker code to lib/
From: Andrew Morton @ 2007-04-11 7:17 UTC (permalink / raw)
To: Nick Piggin; +Cc: Matt Mackall, linux-kernel
On Wed, 11 Apr 2007 16:35:44 +1000 Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> Matt Mackall wrote:
> > Move the page walker code to lib/
> >
> > This lets it get shared outside of proc/ and linked in only when
> > needed.
>
> Still should go into mm/
spose so.
> If it had, you might have also noticed your pagetable walking code
> is completely different from how everyone else does it, and fixed
> that too.
Different in what way?
> BTW. Is it the case that unused and unexported symbols don't get
> pruned by the linker except inside lib/?
If they're static and unreferenced then the linker will remove them. Of
course, usually humans remove these because they generate warnings, unless
special-things happen. We deliberately do special-things with
register_cpu_notifier() so the notifier-block and the handler go away if
!CONFIG_HOTPLUG_CPU.
If the functions are non-static then yes, I expect we end up with them in
vmlinux. There are tricks we can play with -ffunction-sections to fix
that, but we don't.
* Re: [PATCH 6/13] maps#2: Move the page walker code to lib/
From: Nick Piggin @ 2007-04-11 7:33 UTC (permalink / raw)
To: Andrew Morton; +Cc: Matt Mackall, linux-kernel
Andrew Morton wrote:
> On Wed, 11 Apr 2007 16:35:44 +1000 Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
>
>>Matt Mackall wrote:
>>
>>>Move the page walker code to lib/
>>>
>>>This lets it get shared outside of proc/ and linked in only when
>>>needed.
>>
>>Still should go into mm/
>
>
> spose so.
>
>
>>If it had, you might have also noticed your pagetable walking code
>>is completely different from how everyone else does it, and fixed
>>that too.
>
>
> Different in what way?
The form of the loops. It may not seem like a big deal, but before
Hugh did a big cleanup to make them all the same, we had about a
dozen versions which were all very slightly different ;) It was
painful.
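For reference, the convention elsewhere (mm/memory.c and friends) is a
do/while over the range rather than a for loop; a sketch of the usual
shape, using this patch's names:

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		/* ... visit [addr, next) ... */
	} while (pmd++, addr = next, addr != end);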
>>BTW. Is it the case that unused and unexported symbols don't get
>>pruned by the linker except inside lib/?
>
>
> If they're static and unreferenced then the linker will remove them. Of
> course, usually humans remove these because they generate warnings, unless
> special-things happen. We deliberately do special-things with
> register_cpu_notifier() so the notifier-block and the handler go away if
> !CONFIG_HOTPLUG_CPU.
>
> If the functions are non-static then yes, I expect we end up with them in
> vmlinux. There are tricks we can play with -ffunction-sections to fix
> that, but we don't.
They aren't static. But anyway, AFAIKS, all the CONFIG_ stuff is
there anyway, so we can not only avoid the linking but also the
compiling by using them.
--
SUSE Labs, Novell Inc.
* Re: [PATCH 6/13] maps#2: Move the page walker code to lib/
From: Matt Mackall @ 2007-04-11 14:40 UTC (permalink / raw)
To: Nick Piggin; +Cc: Andrew Morton, linux-kernel
On Wed, Apr 11, 2007 at 04:35:44PM +1000, Nick Piggin wrote:
> Matt Mackall wrote:
> >Move the page walker code to lib/
> >
> >This lets it get shared outside of proc/ and linked in only when
> >needed.
>
> Still should go into mm/
>
> If it had, you might have also noticed your pagetable walking code
> is completely different from how everyone else does it, and fixed
> that too.
I actually did notice that, when I compared it to jsgf's page walking
code for Xen.
> BTW. Is it the case that unused and unexported symbols don't get
> pruned by the linker except inside lib/?
Yes, that's been my point all along. It also currently only happens at
the granularity of an object file, not a symbol, FYI.
--
Mathematics is the supreme nostalgia of our time.
* Re: [PATCH 6/13] maps#2: Move the page walker code to lib/
From: Nick Piggin @ 2007-04-12 6:38 UTC (permalink / raw)
To: Matt Mackall; +Cc: Andrew Morton, linux-kernel
Matt Mackall wrote:
> On Wed, Apr 11, 2007 at 04:35:44PM +1000, Nick Piggin wrote:
>
>>Matt Mackall wrote:
>>
>>>Move the page walker code to lib/
>>>
>>>This lets it get shared outside of proc/ and linked in only when
>>>needed.
>>
>>Still should go into mm/
>>
>>If it had, you might have also noticed your pagetable walking code
>>is completely different from how everyone else does it, and fixed
>>that too.
>
>
> I actually did notice that, when I compared it to jsgf's page walking
> code for Xen.
Can you fix it then, since you are doing the big reorganisation?
>
>
>>BTW. Is it the case that unused and unexported symbols don't get
>>pruned by the linker except inside lib/?
>
>
> Yes, that's been my point all along. It also currently only happens at
> the granularity of an object file, not a symbol, FYI.
You have the config symbols, don't you? Please use them in the
makefile to prevent compiling and linking.
My point all along is that it belongs in mm/. If you don't want to
do it then I can't make you, but I'll submit a patch to use the
correct page table walking conventions and move it to mm/.
--
SUSE Labs, Novell Inc.
* Re: [PATCH 6/13] maps#2: Move the page walker code to lib/
From: Matt Mackall @ 2007-04-17 20:45 UTC (permalink / raw)
To: Nick Piggin; +Cc: Andrew Morton, linux-kernel
On Wed, Apr 11, 2007 at 04:35:44PM +1000, Nick Piggin wrote:
> Matt Mackall wrote:
> >Move the page walker code to lib/
> >
> >This lets it get shared outside of proc/ and linked in only when
> >needed.
>
> Still should go into mm/
>
> If it had, you might have also noticed your pagetable walking code
> is completely different from how everyone else does it, and fixed
> that too.
I'll try to fix this up next week.
--
Mathematics is the supreme nostalgia of our time.
* Re: Permanent Kgdb integration into the kernel - let's get with it.
From: Jason Wessel @ 2007-04-17 21:26 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel
Andi Kleen wrote:
> > Is there any movement on this?
>
> I'm open to reasonable patches for the hooks at least. If that is done
> then the actual kgdb code can be reviewed and considered eventually too.
>
> But just having the hooks in would make it easy enough to use anyways
> (no patching, just dropping in of new files, or even loading of it as a
> module into any kernel)
>
> When I did the original x86-64 kgdb port this worked nicely --
> kgdb could work with just the standard die notifiers and a simple
> change in the serial console code.
>
> The recent kgdb seems to need much more changes again though.
>
> However, every time I suggested this (fixing the hooks first
> and submitting the really needed changes piece by piece)
> there didn't seem to be any interest from the various kgdb maintainers.
>
> So my impression currently is that they're not interested in merging.
>
> Another problem is that kgdb is moving more and more away from
> mainline by adding various weird hacks and workarounds in random
> code that just make merging harder.
>
> Before anything could be considered for merging that all would
> need to be cleaned up.
>
> -Andi
>
Andi,
I too am open to having an API for KGDB, but it does need more than just
the trap vectors and the serial driver as hook points. There are a
number of patches to fix problems ranging from the NET_POLL API, NMI
handling, and saving a bit more information when loading kernel modules.
If you have an API that you would like to contribute or suggest, I for
one am interested. I have long thought it would be nice to be able to
choose between kernel debug tools KDB, KGDB, KEXEC etc... much like you
can dynamically load I/O modules in KGDB and choose either RS232 or
Ethernet after the kernel has booted.
At the current time, I am most certainly trying to consolidate the
source forge KGDB patches, Tom Rini's branch, as well as my own
development branch and hopefully Sergei's development branch as well.
Perhaps after the code stream is stable you we can take a further look
at what it takes to abstract a generic kernel debug interface. In the
mean time if you have code or skeleton API proposal, I am definitely
listening. It would be nice to have an in kernel soft single step API as
an example, which could be leveraged by KGDB and utrace/ptrace, but that
is a bit forward looking at this point.
Jason.
* Re: Permanent Kgdb integration into the kernel - let's get with it.
From: Andi Kleen @ 2007-04-17 22:09 UTC (permalink / raw)
To: Jason Wessel; +Cc: Andi Kleen, linux-kernel
> I too am open to having an API for KGDB, but it does need more than just
> the trap vectors and the serial driver as hook points. There are a
> number of patches to fix problems ranging from the NET_POLL API, NMI
> handling,
You should submit these changes to the relevant maintainers and
mailing lists as you fix them. There is only limited review capacity,
and sending too many at once may tax it, so it's best to space them
out -- submit them as you do them. Then you also still remember why
you actually changed something. The few kgdb patches that were ever
submitted usually had the problem that they came from someone
else who didn't do the work and couldn't quite remember why they
were actually needed. Unsurprisingly, they often didn't get
merged.
> If you have an API that you would like to contribute or suggest, I for
The main interface is the (somewhat misnamed) die chain. More and more
architectures are moving to it.
There probably needs to be another one for serial intercept, but that
should be very simple. For network, netpoll already exists.
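A minimal sketch of hooking the die chain (handler and names here are
only an example, not taken from any of the kgdb patches):

	static int kgdb_die_event(struct notifier_block *self,
				  unsigned long val, void *data)
	{
		if (val == DIE_OOPS) {
			struct die_args *args = data;
			/* ... enter the debugger with args->regs ... */
			return NOTIFY_STOP;	/* event handled */
		}
		return NOTIFY_DONE;
	}

	static struct notifier_block kgdb_die_notifier = {
		.notifier_call = kgdb_die_event,
	};

	/* at init time: */
	register_die_notifier(&kgdb_die_notifier);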
> one am interested. I have long thought it would be nice to be able to
> choose between kernel debug tools KDB, KGDB, KEXEC etc... much like you
> can dynamically load I/O modules in KGDB and choose either RS232 or
> Ethernet after the kernel has booted.
At some point we had a fully modular kdb (modprobe kdb worked)
and kgdb was nearly there. Unfortunately the changes for kdb were
somewhat ugly, and that work never hit the standard
patchkits for kgdb/kdb.
> At the current time, I am most certainly trying to consolidate the
> source forge KGDB patches, Tom Rini's branch, as well as my own
> development branch and hopefully Sergei's development branch as well.
> Perhaps after the code stream is stable we can take a further look
> at what it takes to abstract a generic kernel debug interface. In the
There already is one. I don't see the need for another one.
> meantime, if you have code or a skeleton API proposal, I am definitely
> listening. It would be nice to have an in-kernel soft single-step API as
> an example, which could be leveraged by KGDB and utrace/ptrace, but that
> is a bit forward looking at this point.
kprobes already do that fine with the existing interfaces. Also
single stepping is not particularly complicated, so I don't see why
you want to abstract it.
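A minimal sketch of how that already looks with kprobes (the probed
symbol and handlers are just an example):

	static int pre(struct kprobe *p, struct pt_regs *regs)
	{
		return 0;
	}

	/* post_handler runs after the probed instruction is single-stepped */
	static void post(struct kprobe *p, struct pt_regs *regs,
			 unsigned long flags)
	{
	}

	static struct kprobe kp = {
		.symbol_name	= "do_fork",
		.pre_handler	= pre,
		.post_handler	= post,
	};

	/* e.g. from module init: */
	register_kprobe(&kp);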
The other issue that needed some changes was stopping all CPUs for
debugging. Since some other subsystems (like kexec) have this problem
already, I would suggest generalizing and exporting what they have.
-Andi
* [PATCH 7/13] maps#2: Simplify interdependence of /proc/pid/maps and smaps
From: Matt Mackall @ 2007-04-06 22:03 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
Simplify interdependence of /proc/pid/maps and smaps
This pulls the shared map display code out of show_map and puts it in
show_smap where it belongs.
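For reference, the per-VMA smaps output is unchanged by this; it looks
roughly like the following (values illustrative):

	08048000-080bc000 r-xp 00000000 03:02 13130      /bin/bash
	Size:      464 kB
	Rss:      424 kB
	Shared_Clean:      424 kB
	Shared_Dirty:        0 kB
	Private_Clean:        0 kB
	Private_Dirty:        0 kB
	Referenced:      424 kB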
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c 2007-04-03 14:50:33.000000000 -0500
+++ mm/fs/proc/task_mmu.c 2007-04-03 18:35:45.000000000 -0500
@@ -125,7 +125,7 @@ struct mem_size_stats
unsigned long referenced;
};
-static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
+static int show_map(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
@@ -185,33 +185,11 @@ static int show_map_internal(struct seq_
}
seq_putc(m, '\n');
- if (mss)
- seq_printf(m,
- "Size: %8lu kB\n"
- "Rss: %8lu kB\n"
- "Shared_Clean: %8lu kB\n"
- "Shared_Dirty: %8lu kB\n"
- "Private_Clean: %8lu kB\n"
- "Private_Dirty: %8lu kB\n"
- "Referenced: %8lu kB\n",
- (vma->vm_end - vma->vm_start) >> 10,
- mss->resident >> 10,
- mss->shared_clean >> 10,
- mss->shared_dirty >> 10,
- mss->private_clean >> 10,
- mss->private_dirty >> 10,
- mss->referenced >> 10);
-
if (m->count < m->size) /* vma is copied successfully */
m->version = (vma != get_gate_vma(task))? vma->vm_start: 0;
return 0;
}
-static int show_map(struct seq_file *m, void *v)
-{
- return show_map_internal(m, v, NULL);
-}
-
static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
void *private)
{
@@ -286,13 +264,35 @@ static int show_smap(struct seq_file *m,
{
struct vm_area_struct *vma = v;
struct mem_size_stats mss;
+ int ret;
memset(&mss, 0, sizeof mss);
mss.vma = vma;
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
&smaps_walk, &mss);
- return show_map_internal(m, v, &mss);
+
+ ret = show_map(m, v);
+ if (ret)
+ return ret;
+
+ seq_printf(m,
+ "Size: %8lu kB\n"
+ "Rss: %8lu kB\n"
+ "Shared_Clean: %8lu kB\n"
+ "Shared_Dirty: %8lu kB\n"
+ "Private_Clean: %8lu kB\n"
+ "Private_Dirty: %8lu kB\n"
+ "Referenced: %8lu kB\n",
+ (vma->vm_end - vma->vm_start) >> 10,
+ mss.resident >> 10,
+ mss.shared_clean >> 10,
+ mss.shared_dirty >> 10,
+ mss.private_clean >> 10,
+ mss.private_dirty >> 10,
+ mss.referenced >> 10);
+
+ return ret;
}
static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range };
* [PATCH 8/13] maps#2: Move clear_refs code to task_mmu.c
From: Matt Mackall @ 2007-04-06 22:03 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
Move clear_refs code to task_mmu.c
This puts all the clear_refs code where it belongs and probably lets
things compile on MMU-less systems as well.
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: mm/fs/proc/base.c
===================================================================
--- mm.orig/fs/proc/base.c 2007-04-04 18:01:05.000000000 -0500
+++ mm/fs/proc/base.c 2007-04-04 18:02:57.000000000 -0500
@@ -749,40 +749,6 @@ static const struct file_operations proc
.write = oom_adjust_write,
};
-static ssize_t clear_refs_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct task_struct *task;
- char buffer[PROC_NUMBUF], *end;
- struct mm_struct *mm;
-
- memset(buffer, 0, sizeof(buffer));
- if (count > sizeof(buffer) - 1)
- count = sizeof(buffer) - 1;
- if (copy_from_user(buffer, buf, count))
- return -EFAULT;
- if (!simple_strtol(buffer, &end, 0))
- return -EINVAL;
- if (*end == '\n')
- end++;
- task = get_proc_task(file->f_path.dentry->d_inode);
- if (!task)
- return -ESRCH;
- mm = get_task_mm(task);
- if (mm) {
- clear_refs_smap(mm);
- mmput(mm);
- }
- put_task_struct(task);
- if (end - buffer == 0)
- return -EIO;
- return end - buffer;
-}
-
-static struct file_operations proc_clear_refs_operations = {
- .write = clear_refs_write,
-};
-
#ifdef CONFIG_AUDITSYSCALL
#define TMPBUFLEN 21
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
Index: mm/fs/proc/internal.h
===================================================================
--- mm.orig/fs/proc/internal.h 2007-04-04 18:01:05.000000000 -0500
+++ mm/fs/proc/internal.h 2007-04-04 18:01:16.000000000 -0500
@@ -49,11 +49,7 @@ extern int proc_pid_statm(struct task_st
extern const struct file_operations proc_maps_operations;
extern const struct file_operations proc_numa_maps_operations;
extern const struct file_operations proc_smaps_operations;
-
-extern const struct file_operations proc_maps_operations;
-extern const struct file_operations proc_numa_maps_operations;
-extern const struct file_operations proc_smaps_operations;
-
+extern const struct file_operations proc_clear_refs_operations;
void free_proc_entry(struct proc_dir_entry *de);
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c 2007-04-04 18:01:12.000000000 -0500
+++ mm/fs/proc/task_mmu.c 2007-04-04 18:01:16.000000000 -0500
@@ -297,19 +297,47 @@ static int show_smap(struct seq_file *m,
static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range };
-void clear_refs_smap(struct mm_struct *mm)
+static ssize_t clear_refs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
{
+ struct task_struct *task;
+ char buffer[13], *end;
+ struct mm_struct *mm;
struct vm_area_struct *vma;
- down_read(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
- if (vma->vm_mm && !is_vm_hugetlb_page(vma))
- walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
- &clear_refs_walk, vma);
- flush_tlb_mm(mm);
- up_read(&mm->mmap_sem);
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count))
+ return -EFAULT;
+ if (!simple_strtol(buffer, &end, 0))
+ return -EINVAL;
+ if (*end == '\n')
+ end++;
+ task = get_proc_task(file->f_path.dentry->d_inode);
+ if (!task)
+ return -ESRCH;
+ mm = get_task_mm(task);
+ if (mm) {
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ if (!is_vm_hugetlb_page(vma))
+ walk_page_range(mm, vma->vm_start, vma->vm_end,
+ &clear_refs_walk, vma);
+ flush_tlb_mm(mm);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ }
+ put_task_struct(task);
+ if (end - buffer == 0)
+ return -EIO;
+ return end - buffer;
}
+const struct file_operations proc_clear_refs_operations = {
+ .write = clear_refs_write,
+};
+
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct proc_maps_private *priv = m->private;
Index: mm/include/linux/proc_fs.h
===================================================================
--- mm.orig/include/linux/proc_fs.h 2007-04-04 18:01:05.000000000 -0500
+++ mm/include/linux/proc_fs.h 2007-04-04 18:01:16.000000000 -0500
@@ -117,7 +117,6 @@ int proc_pid_readdir(struct file * filp,
unsigned long task_vsize(struct mm_struct *);
int task_statm(struct mm_struct *, int *, int *, int *, int *);
char *task_mem(struct mm_struct *, char *);
-void clear_refs_smap(struct mm_struct *mm);
struct proc_dir_entry *de_get(struct proc_dir_entry *de);
void de_put(struct proc_dir_entry *de);
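
The interface being moved here is used as a sampling cycle: clear the
referenced bits, let the workload run, then read the Referenced: fields
back out of /proc/pid/smaps. A minimal userspace sketch of that cycle,
for illustration only (the one-second interval is arbitrary and error
reporting is elided):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64];
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s pid\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/clear_refs", argv[1]);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, "1", 1) != 1)	/* any nonzero integer clears */
		return 1;
	close(fd);
	sleep(1);			/* let the workload touch pages */
	/* now parse the Referenced: lines in /proc/<pid>/smaps */
	return 0;
}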
^ permalink raw reply [flat|nested] 30+ messages in thread
* [PATCH 9/13] maps#2: Regroup task_mmu by interface
2007-04-06 22:03 [PATCH 0/13] maps#2: pagemap, kpagemap, and related cleanups take 2 Matt Mackall
` (7 preceding siblings ...)
2007-04-06 22:03 ` [PATCH 8/13] maps#2: Move clear_refs code to task_mmu.c Matt Mackall
@ 2007-04-06 22:03 ` Matt Mackall
2007-04-06 22:03 ` [PATCH 10/13] maps#2: Make /proc/pid/smaps optional under CONFIG_EMBEDDED Matt Mackall
` (3 subsequent siblings)
12 siblings, 0 replies; 30+ messages in thread
From: Matt Mackall @ 2007-04-06 22:03 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
Regroup task_mmu by interface
Reorder the source so that all the code and data for each interface are
together.
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c 2007-03-28 00:08:05.000000000 -0500
+++ mm/fs/proc/task_mmu.c 2007-03-28 00:21:48.000000000 -0500
@@ -114,16 +114,121 @@ static void pad_len_spaces(struct seq_fi
seq_printf(m, "%*c", len, ' ');
}
-struct mem_size_stats
+static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
{
- struct vm_area_struct *vma;
- unsigned long resident;
- unsigned long shared_clean;
- unsigned long shared_dirty;
- unsigned long private_clean;
- unsigned long private_dirty;
- unsigned long referenced;
-};
+ if (vma && vma != priv->tail_vma) {
+ struct mm_struct *mm = vma->vm_mm;
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ }
+}
+
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+ struct proc_maps_private *priv = m->private;
+ unsigned long last_addr = m->version;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma, *tail_vma = NULL;
+ loff_t l = *pos;
+
+ /* Clear the per syscall fields in priv */
+ priv->task = NULL;
+ priv->tail_vma = NULL;
+
+ /*
+ * We remember last_addr rather than next_addr to hit with
+ * mmap_cache most of the time. We have zero last_addr at
+ * the beginning and also after lseek. We will have -1 last_addr
+ * after the end of the vmas.
+ */
+
+ if (last_addr == -1UL)
+ return NULL;
+
+ priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
+ if (!priv->task)
+ return NULL;
+
+ mm = get_task_mm(priv->task);
+ if (!mm)
+ return NULL;
+
+ priv->tail_vma = tail_vma = get_gate_vma(priv->task);
+ down_read(&mm->mmap_sem);
+
+ /* Start with last addr hint */
+ if (last_addr && (vma = find_vma(mm, last_addr))) {
+ vma = vma->vm_next;
+ goto out;
+ }
+
+ /*
+ * Check the vma index is within the range and do
+ * sequential scan until m_index.
+ */
+ vma = NULL;
+ if ((unsigned long)l < mm->map_count) {
+ vma = mm->mmap;
+ while (l-- && vma)
+ vma = vma->vm_next;
+ goto out;
+ }
+
+ if (l != mm->map_count)
+ tail_vma = NULL; /* After gate vma */
+
+out:
+ if (vma)
+ return vma;
+
+ /* End of vmas has been reached */
+ m->version = (tail_vma != NULL)? 0: -1UL;
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ return tail_vma;
+}
+
+static void *m_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct proc_maps_private *priv = m->private;
+ struct vm_area_struct *vma = v;
+ struct vm_area_struct *tail_vma = priv->tail_vma;
+
+ (*pos)++;
+ if (vma && (vma != tail_vma) && vma->vm_next)
+ return vma->vm_next;
+ vma_stop(priv, vma);
+ return (vma != tail_vma)? tail_vma: NULL;
+}
+
+static void m_stop(struct seq_file *m, void *v)
+{
+ struct proc_maps_private *priv = m->private;
+ struct vm_area_struct *vma = v;
+
+ vma_stop(priv, vma);
+ if (priv->task)
+ put_task_struct(priv->task);
+}
+
+static int do_maps_open(struct inode *inode, struct file *file,
+ struct seq_operations *ops)
+{
+ struct proc_maps_private *priv;
+ int ret = -ENOMEM;
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (priv) {
+ priv->pid = proc_pid(inode);
+ ret = seq_open(file, ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = priv;
+ } else {
+ kfree(priv);
+ }
+ }
+ return ret;
+}
static int show_map(struct seq_file *m, void *v)
{
@@ -190,6 +295,36 @@ static int show_map(struct seq_file *m,
return 0;
}
+static struct seq_operations proc_pid_maps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_map
+};
+
+static int maps_open(struct inode *inode, struct file *file)
+{
+ return do_maps_open(inode, file, &proc_pid_maps_op);
+}
+
+const struct file_operations proc_maps_operations = {
+ .open = maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+struct mem_size_stats
+{
+ struct vm_area_struct *vma;
+ unsigned long resident;
+ unsigned long shared_clean;
+ unsigned long shared_dirty;
+ unsigned long private_clean;
+ unsigned long private_dirty;
+ unsigned long referenced;
+};
+
static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
void *private)
{
@@ -231,33 +366,6 @@ static int smaps_pte_range(pmd_t *pmd, u
return 0;
}
-static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, void *private)
-{
- struct vm_area_struct *vma = private;
- pte_t *pte, ptent;
- spinlock_t *ptl;
- struct page *page;
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- for (; addr != end; pte++, addr += PAGE_SIZE) {
- ptent = *pte;
- if (!pte_present(ptent))
- continue;
-
- page = vm_normal_page(vma, addr, ptent);
- if (!page)
- continue;
-
- /* Clear accessed and referenced bits. */
- ptep_test_and_clear_young(vma, addr, pte);
- ClearPageReferenced(page);
- }
- pte_unmap_unlock(pte - 1, ptl);
- cond_resched();
- return 0;
-}
-
static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range };
static int show_smap(struct seq_file *m, void *v)
@@ -293,6 +401,52 @@ static int show_smap(struct seq_file *m,
return ret;
}
+static struct seq_operations proc_pid_smaps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_smap
+};
+
+static int smaps_open(struct inode *inode, struct file *file)
+{
+ return do_maps_open(inode, file, &proc_pid_smaps_op);
+}
+
+const struct file_operations proc_smaps_operations = {
+ .open = smaps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, void *private)
+{
+ struct vm_area_struct *vma = private;
+ pte_t *pte, ptent;
+ spinlock_t *ptl;
+ struct page *page;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+ if (!pte_present(ptent))
+ continue;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ /* Clear accessed and referenced bits. */
+ ptep_test_and_clear_young(vma, addr, pte);
+ ClearPageReferenced(page);
+ }
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+ return 0;
+}
+
static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range };
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
@@ -336,148 +490,6 @@ const struct file_operations proc_clear_
.write = clear_refs_write,
};
-static void *m_start(struct seq_file *m, loff_t *pos)
-{
- struct proc_maps_private *priv = m->private;
- unsigned long last_addr = m->version;
- struct mm_struct *mm;
- struct vm_area_struct *vma, *tail_vma = NULL;
- loff_t l = *pos;
-
- /* Clear the per syscall fields in priv */
- priv->task = NULL;
- priv->tail_vma = NULL;
-
- /*
- * We remember last_addr rather than next_addr to hit with
- * mmap_cache most of the time. We have zero last_addr at
- * the beginning and also after lseek. We will have -1 last_addr
- * after the end of the vmas.
- */
-
- if (last_addr == -1UL)
- return NULL;
-
- priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
- if (!priv->task)
- return NULL;
-
- mm = get_task_mm(priv->task);
- if (!mm)
- return NULL;
-
- priv->tail_vma = tail_vma = get_gate_vma(priv->task);
- down_read(&mm->mmap_sem);
-
- /* Start with last addr hint */
- if (last_addr && (vma = find_vma(mm, last_addr))) {
- vma = vma->vm_next;
- goto out;
- }
-
- /*
- * Check the vma index is within the range and do
- * sequential scan until m_index.
- */
- vma = NULL;
- if ((unsigned long)l < mm->map_count) {
- vma = mm->mmap;
- while (l-- && vma)
- vma = vma->vm_next;
- goto out;
- }
-
- if (l != mm->map_count)
- tail_vma = NULL; /* After gate vma */
-
-out:
- if (vma)
- return vma;
-
- /* End of vmas has been reached */
- m->version = (tail_vma != NULL)? 0: -1UL;
- up_read(&mm->mmap_sem);
- mmput(mm);
- return tail_vma;
-}
-
-static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
-{
- if (vma && vma != priv->tail_vma) {
- struct mm_struct *mm = vma->vm_mm;
- up_read(&mm->mmap_sem);
- mmput(mm);
- }
-}
-
-static void *m_next(struct seq_file *m, void *v, loff_t *pos)
-{
- struct proc_maps_private *priv = m->private;
- struct vm_area_struct *vma = v;
- struct vm_area_struct *tail_vma = priv->tail_vma;
-
- (*pos)++;
- if (vma && (vma != tail_vma) && vma->vm_next)
- return vma->vm_next;
- vma_stop(priv, vma);
- return (vma != tail_vma)? tail_vma: NULL;
-}
-
-static void m_stop(struct seq_file *m, void *v)
-{
- struct proc_maps_private *priv = m->private;
- struct vm_area_struct *vma = v;
-
- vma_stop(priv, vma);
- if (priv->task)
- put_task_struct(priv->task);
-}
-
-static struct seq_operations proc_pid_maps_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_map
-};
-
-static struct seq_operations proc_pid_smaps_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_smap
-};
-
-static int do_maps_open(struct inode *inode, struct file *file,
- struct seq_operations *ops)
-{
- struct proc_maps_private *priv;
- int ret = -ENOMEM;
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
- if (priv) {
- priv->pid = proc_pid(inode);
- ret = seq_open(file, ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = priv;
- } else {
- kfree(priv);
- }
- }
- return ret;
-}
-
-static int maps_open(struct inode *inode, struct file *file)
-{
- return do_maps_open(inode, file, &proc_pid_maps_op);
-}
-
-const struct file_operations proc_maps_operations = {
- .open = maps_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
#ifdef CONFIG_NUMA
extern int show_numa_map(struct seq_file *m, void *v);
@@ -512,14 +524,3 @@ const struct file_operations proc_numa_m
};
#endif
-static int smaps_open(struct inode *inode, struct file *file)
-{
- return do_maps_open(inode, file, &proc_pid_smaps_op);
-}
-
-const struct file_operations proc_smaps_operations = {
- .open = smaps_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
^ permalink raw reply [flat|nested] 30+ messages in thread
* [PATCH 10/13] maps#2: Make /proc/pid/smaps optional under CONFIG_EMBEDDED
2007-04-06 22:03 [PATCH 0/13] maps#2: pagemap, kpagemap, and related cleanups take 2 Matt Mackall
` (8 preceding siblings ...)
2007-04-06 22:03 ` [PATCH 9/13] maps#2: Regroup task_mmu by interface Matt Mackall
@ 2007-04-06 22:03 ` Matt Mackall
2007-04-06 22:03 ` [PATCH 11/13] maps#2: Make /proc/pid/clear_refs optional " Matt Mackall
` (2 subsequent siblings)
12 siblings, 0 replies; 30+ messages in thread
From: Matt Mackall @ 2007-04-06 22:03 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
Make /proc/pid/smaps optional under CONFIG_EMBEDDED
This interface is primarily useful for doing memory profiling and not of
much use on deployed embedded boxes. Make it optional. Together with
/proc/pid/clear_refs, this saves a few K.
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: mm/fs/proc/base.c
===================================================================
--- mm.orig/fs/proc/base.c 2007-03-29 13:01:49.000000000 -0500
+++ mm/fs/proc/base.c 2007-03-29 13:05:21.000000000 -0500
@@ -1907,8 +1907,10 @@ static const struct pid_entry tgid_base_
REG("mountstats", S_IRUSR, mountstats),
#ifdef CONFIG_MMU
REG("clear_refs", S_IWUSR, clear_refs),
+#ifdef CONFIG_PROC_SMAPS
REG("smaps", S_IRUGO, smaps),
#endif
+#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
#endif
@@ -2189,8 +2191,10 @@ static const struct pid_entry tid_base_s
REG("mounts", S_IRUGO, mounts),
#ifdef CONFIG_MMU
REG("clear_refs", S_IWUSR, clear_refs),
+#ifdef CONFIG_PROC_SMAPS
REG("smaps", S_IRUGO, smaps),
#endif
+#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
#endif
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c 2007-03-29 13:01:52.000000000 -0500
+++ mm/fs/proc/task_mmu.c 2007-03-29 13:05:21.000000000 -0500
@@ -314,6 +314,7 @@ const struct file_operations proc_maps_o
.release = seq_release_private,
};
+#ifdef CONFIG_PROC_SMAPS
struct mem_size_stats
{
struct vm_area_struct *vma;
@@ -421,6 +422,7 @@ const struct file_operations proc_smaps_
.llseek = seq_lseek,
.release = seq_release_private,
};
+#endif
static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, void *private)
Index: mm/init/Kconfig
===================================================================
--- mm.orig/init/Kconfig 2007-03-29 13:00:36.000000000 -0500
+++ mm/init/Kconfig 2007-03-29 13:05:21.000000000 -0500
@@ -514,6 +514,14 @@ config VM_EVENT_COUNTERS
on EMBEDDED systems. /proc/vmstat will only show page counts
if VM event counters are disabled.
+config PROC_SMAPS
+ default y
+ bool "Enable /proc/pid/smaps support" if EMBEDDED && PROC_FS && MMU
+ help
+ The /proc/pid/smaps interface reports a process's private and
+ shared memory per mapping. Disabling this interface will reduce
+ the size of the kernel for small machines.
+
endmenu # General setup
config RT_MUTEXES
^ permalink raw reply [flat|nested] 30+ messages in thread
* [PATCH 11/13] maps#2: Make /proc/pid/clear_refs optional under CONFIG_EMBEDDED
2007-04-06 22:03 [PATCH 0/13] maps#2: pagemap, kpagemap, and related cleanups take 2 Matt Mackall
` (9 preceding siblings ...)
2007-04-06 22:03 ` [PATCH 10/13] maps#2: Make /proc/pid/smaps optional under CONFIG_EMBEDDED Matt Mackall
@ 2007-04-06 22:03 ` Matt Mackall
2007-04-07 5:41 ` David Rientjes
2007-04-06 22:03 ` [PATCH 12/13] maps#2: Add /proc/pid/pagemap interface Matt Mackall
2007-04-06 22:03 ` [PATCH 13/13] maps#2: Add /proc/kpagemap interface Matt Mackall
12 siblings, 1 reply; 30+ messages in thread
From: Matt Mackall @ 2007-04-06 22:03 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
Make /proc/pid/clear_refs optional under CONFIG_EMBEDDED
This interface is primarily useful for doing memory profiling and not of
much use on deployed embedded boxes. Make it optional. Together with
/proc/pid/smaps, this saves a few K.
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: mm/fs/proc/base.c
===================================================================
--- mm.orig/fs/proc/base.c 2007-04-03 14:50:33.000000000 -0500
+++ mm/fs/proc/base.c 2007-04-03 18:04:59.000000000 -0500
@@ -2000,7 +2000,9 @@ static const struct pid_entry tgid_base_
REG("mounts", S_IRUGO, mounts),
REG("mountstats", S_IRUSR, mountstats),
#ifdef CONFIG_MMU
+#ifdef CONFIG_PROC_CLEAR_REFS
REG("clear_refs", S_IWUSR, clear_refs),
+#endif
#ifdef CONFIG_PROC_SMAPS
REG("smaps", S_IRUGO, smaps),
#endif
@@ -2285,7 +2287,9 @@ static const struct pid_entry tid_base_s
LNK("exe", exe),
REG("mounts", S_IRUGO, mounts),
#ifdef CONFIG_MMU
+#ifdef CONFIG_PROC_CLEAR_REFS
REG("clear_refs", S_IWUSR, clear_refs),
+#endif
#ifdef CONFIG_PROC_SMAPS
REG("smaps", S_IRUGO, smaps),
#endif
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c 2007-04-03 14:50:33.000000000 -0500
+++ mm/fs/proc/task_mmu.c 2007-04-03 18:04:59.000000000 -0500
@@ -424,6 +424,7 @@ const struct file_operations proc_smaps_
};
#endif
+#ifdef CONFIG_PROC_CLEAR_REFS
static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, void *private)
{
@@ -493,6 +494,7 @@ static ssize_t clear_refs_write(struct f
const struct file_operations proc_clear_refs_operations = {
.write = clear_refs_write,
};
+#endif
#ifdef CONFIG_NUMA
extern int show_numa_map(struct seq_file *m, void *v);
Index: mm/init/Kconfig
===================================================================
--- mm.orig/init/Kconfig 2007-04-03 14:50:33.000000000 -0500
+++ mm/init/Kconfig 2007-04-03 18:04:59.000000000 -0500
@@ -593,6 +593,15 @@ config PROC_SMAPS
shared memory per mapping. Disabling this interface will reduce
the size of the kernel for small machines.
+config PROC_CLEAR_REFS
+ default y
+ bool "Enable /proc/pid/clear_refs support" if EMBEDDED && PROC_FS && MMU
+ help
+ The /proc/pid/clear_refs interface allows clearing the
+ referenced bits on a process's memory maps so that working
+ set size can be monitored. Disabling this interface will
+ reduce the size of the kernel for small machines.
+
endmenu # General setup
config RT_MUTEXES
^ permalink raw reply [flat|nested] 30+ messages in thread
* [PATCH 12/13] maps#2: Add /proc/pid/pagemap interface
2007-04-06 22:03 [PATCH 0/13] maps#2: pagemap, kpagemap, and related cleanups take 2 Matt Mackall
` (10 preceding siblings ...)
2007-04-06 22:03 ` [PATCH 11/13] maps#2: Make /proc/pid/clear_refs optional " Matt Mackall
@ 2007-04-06 22:03 ` Matt Mackall
2007-04-07 6:55 ` Andrew Morton
2007-04-19 19:12 ` Dave Hansen
2007-04-06 22:03 ` [PATCH 13/13] maps#2: Add /proc/kpagemap interface Matt Mackall
12 siblings, 2 replies; 30+ messages in thread
From: Matt Mackall @ 2007-04-06 22:03 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
Add /proc/pid/pagemap interface
This interface provides a mapping for each page in an address space to
its physical page frame number, allowing precise determination of what
pages are mapped and what pages are shared between processes.
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: mm/fs/proc/base.c
===================================================================
--- mm.orig/fs/proc/base.c 2007-04-04 18:03:03.000000000 -0500
+++ mm/fs/proc/base.c 2007-04-04 18:03:03.000000000 -0500
@@ -664,7 +664,7 @@ out_no_task:
}
#endif
-static loff_t mem_lseek(struct file * file, loff_t offset, int orig)
+loff_t mem_lseek(struct file * file, loff_t offset, int orig)
{
switch (orig) {
case 0:
@@ -2006,6 +2006,9 @@ static const struct pid_entry tgid_base_
#ifdef CONFIG_PROC_SMAPS
REG("smaps", S_IRUGO, smaps),
#endif
+#ifdef CONFIG_PROC_PAGEMAP
+ REG("pagemap", S_IRUSR, pagemap),
+#endif
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
@@ -2293,6 +2296,9 @@ static const struct pid_entry tid_base_s
#ifdef CONFIG_PROC_SMAPS
REG("smaps", S_IRUGO, smaps),
#endif
+#ifdef CONFIG_PROC_PAGEMAP
+ REG("pagemap", S_IRUSR, pagemap),
+#endif
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
Index: mm/fs/proc/internal.h
===================================================================
--- mm.orig/fs/proc/internal.h 2007-04-04 18:01:16.000000000 -0500
+++ mm/fs/proc/internal.h 2007-04-04 18:03:03.000000000 -0500
@@ -45,11 +45,13 @@ extern int proc_tid_stat(struct task_str
extern int proc_tgid_stat(struct task_struct *, char *);
extern int proc_pid_status(struct task_struct *, char *);
extern int proc_pid_statm(struct task_struct *, char *);
+extern loff_t mem_lseek(struct file * file, loff_t offset, int orig);
extern const struct file_operations proc_maps_operations;
extern const struct file_operations proc_numa_maps_operations;
extern const struct file_operations proc_smaps_operations;
extern const struct file_operations proc_clear_refs_operations;
+extern const struct file_operations proc_pagemap_operations;
void free_proc_entry(struct proc_dir_entry *de);
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c 2007-04-04 18:03:03.000000000 -0500
+++ mm/fs/proc/task_mmu.c 2007-04-05 14:25:39.000000000 -0500
@@ -530,3 +530,188 @@ const struct file_operations proc_numa_m
};
#endif
+#ifdef CONFIG_PROC_PAGEMAP
+struct pagemapread {
+ struct mm_struct *mm;
+ unsigned long next;
+ unsigned long *buf;
+ unsigned long pos;
+ size_t count;
+ int index;
+ char __user *out;
+};
+
+static int flush_pagemap(struct pagemapread *pm)
+{
+ int n = min(pm->count, pm->index * sizeof(unsigned long));
+ if (copy_to_user(pm->out, pm->buf, n))
+ return -EFAULT;
+ pm->out += n;
+ pm->pos += n;
+ pm->count -= n;
+ pm->index = 0;
+ cond_resched();
+ return 0;
+}
+
+static int add_to_pagemap(unsigned long addr, unsigned long pfn,
+ struct pagemapread *pm)
+{
+ pm->buf[pm->index++] = pfn;
+ pm->next = addr + PAGE_SIZE;
+ if (pm->index * sizeof(unsigned long) >= PAGE_SIZE ||
+ pm->index * sizeof(unsigned long) >= pm->count)
+ return flush_pagemap(pm);
+ return 0;
+}
+
+static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ void *private)
+{
+ struct pagemapread *pm = private;
+ pte_t *pte;
+ int err;
+
+ pte = pte_offset_map(pmd, addr);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ if (addr < pm->next)
+ continue;
+ if (!pte_present(*pte))
+ err = add_to_pagemap(addr, -1, pm);
+ else
+ err = add_to_pagemap(addr, pte_pfn(*pte), pm);
+ if (err)
+ return err;
+ }
+ pte_unmap(pte - 1);
+ return 0;
+}
+
+static int pagemap_fill(struct pagemapread *pm, unsigned long end)
+{
+ int ret;
+
+ while (pm->next != end) {
+ ret = add_to_pagemap(pm->next, -1UL, pm);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static struct mm_walk pagemap_walk = { .pmd_entry = pagemap_pte_range };
+
+/*
+ * /proc/pid/pagemap - an array mapping virtual pages to pfns
+ *
+ * For each page in the address space, this file contains one long
+ * representing the corresponding physical page frame number (PFN) or
+ * -1 if the page isn't present. This allows determining precisely
+ * which pages are mapped and comparing mapped pages between
+ * processes.
+ *
+ * Efficient users of this interface will use /proc/pid/maps to
+ * determine which areas of memory are actually mapped and llseek to
+ * skip over unmapped regions.
+ *
+ * The first 4 bytes of this file form a simple header:
+ *
+ * first byte: 0 for big endian, 1 for little
+ * second byte: page shift (eg 12 for 4096 byte pages)
+ * third byte: entry size in bytes (currently either 4 or 8)
+ * fourth byte: header size
+ */
+static ssize_t pagemap_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+ unsigned long src = *ppos;
+ unsigned long *page;
+ unsigned long addr, end, vend, svpfn, evpfn;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ struct pagemapread pm;
+ int ret = -ESRCH;
+
+ if (!task)
+ goto out_no_task;
+
+ ret = -EACCES;
+ if (!ptrace_may_attach(task))
+ goto out;
+
+ ret = -EIO;
+ svpfn = src / sizeof(unsigned long) - 1;
+ addr = PAGE_SIZE * svpfn;
+ if ((svpfn + 1) * sizeof(unsigned long) != src)
+ goto out;
+ evpfn = min((src + count) / sizeof(unsigned long),
+ ((~0UL) >> PAGE_SHIFT) + 1);
+ count = (evpfn - svpfn) * sizeof(unsigned long);
+ end = PAGE_SIZE * evpfn;
+
+ ret = -ENOMEM;
+ page = kzalloc(PAGE_SIZE, GFP_USER);
+ if (!page)
+ goto out;
+
+ ret = 0;
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_free;
+
+ pm.mm = mm;
+ pm.next = addr;
+ pm.buf = page;
+ pm.pos = src;
+ pm.count = count;
+ pm.index = 0;
+ pm.out = buf;
+
+ if (svpfn == -1) {
+ add_to_pagemap(pm.next, 0, &pm);
+ ((char *)page)[0] = (ntohl(1) != 1);
+ ((char *)page)[1] = PAGE_SHIFT;
+ ((char *)page)[2] = sizeof(unsigned long);
+ ((char *)page)[3] = sizeof(unsigned long);
+ }
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, pm.next);
+ while (pm.count > 0 && vma) {
+ if (!ptrace_may_attach(task)) {
+ ret = -EIO;
+ break; /* don't exit with mmap_sem held */
+ }
+ vend = min(vma->vm_start - 1, end - 1) + 1;
+ ret = pagemap_fill(&pm, vend);
+ if (ret || !pm.count)
+ break;
+ vend = min(vma->vm_end - 1, end - 1) + 1;
+ ret = walk_page_range(mm, vma->vm_start, vend,
+ &pagemap_walk, &pm);
+ vma = vma->vm_next;
+ }
+ up_read(&mm->mmap_sem);
+
+ if (!ret)
+ ret = pagemap_fill(&pm, end);
+
+ *ppos = pm.pos;
+ if (!ret)
+ ret = pm.pos - src;
+
+ mmput(mm);
+out_free:
+ kfree(page);
+out:
+ put_task_struct(task);
+out_no_task:
+ return ret;
+}
+
+const struct file_operations proc_pagemap_operations = {
+ .llseek = mem_lseek, /* borrow this */
+ .read = pagemap_read,
+};
+#endif
Index: mm/init/Kconfig
===================================================================
--- mm.orig/init/Kconfig 2007-04-04 18:03:03.000000000 -0500
+++ mm/init/Kconfig 2007-04-05 14:18:49.000000000 -0500
@@ -602,6 +602,16 @@ config PROC_CLEAR_REFS
working set size. Disabling this interface will reduce
the size of the kernel for small machines.
+config PROC_PAGEMAP
+ default y
+ bool "Enable /proc/pid/pagemap support" if EMBEDDED && PROC_FS && MMU
+ help
+ The /proc/pid/pagemap interface allows reading the
+ kernel's virtual-memory-to-page-frame mapping to determine which
+ individual pages a process has mapped and which pages it shares
+ with other processes. Disabling this interface will reduce the
+ size of the kernel for small machines.
+
endmenu # General setup
config RT_MUTEXES
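
To make the header format above concrete, here is a minimal reader
sketch along the lines of the read-proc-pid-pagemap.c example requested
later in the thread. It assumes the reader runs on the same architecture
as the kernel (so sizeof(unsigned long) matches the entry size in the
header) and elides most error handling. Entry 0 is the header slot, so
the entry for virtual page N sits at offset (N + 1) * entry size:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	unsigned char hdr[4];
	unsigned long vaddr, entry;
	char path[64];
	off_t off;
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s pid vaddr\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/pagemap", argv[1]);
	vaddr = strtoul(argv[2], NULL, 0);

	fd = open(path, O_RDONLY);
	if (fd < 0 || read(fd, hdr, 4) != 4)
		return 1;
	/* hdr[0]: endianness, hdr[1]: page shift,
	   hdr[2]: entry size, hdr[3]: header size */
	if (hdr[2] != sizeof(unsigned long))
		return 1;	/* cross-size reading not handled here */

	/* entry 0 is the header slot, so page N is entry N + 1 */
	off = ((off_t)(vaddr >> hdr[1]) + 1) * hdr[2];
	if (lseek(fd, off, SEEK_SET) != off ||
	    read(fd, &entry, sizeof(entry)) != sizeof(entry))
		return 1;
	if (entry == -1UL)
		printf("%#lx: not present\n", vaddr);
	else
		printf("%#lx: pfn %#lx\n", vaddr, entry);
	close(fd);
	return 0;
}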
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 12/13] maps#2: Add /proc/pid/pagemap interface
2007-04-06 22:03 ` [PATCH 12/13] maps#2: Add /proc/pid/pagemap interface Matt Mackall
@ 2007-04-07 6:55 ` Andrew Morton
2007-04-07 16:36 ` Matt Mackall
2007-04-19 19:12 ` Dave Hansen
1 sibling, 1 reply; 30+ messages in thread
From: Andrew Morton @ 2007-04-07 6:55 UTC (permalink / raw)
To: Matt Mackall; +Cc: linux-kernel
On Fri, 06 Apr 2007 17:03:13 -0500 Matt Mackall <mpm@selenic.com> wrote:
> Add /proc/pid/pagemap interface
>
> This interface provides a mapping for each page in an address space to
> its physical page frame number, allowing precise determination of what
> pages are mapped and what pages are shared between processes.
Could we please have a simple read-proc-pid-pagemap.c placed under
Documentation/ somewhere? Also some sample output for the changelog
so we can see what all this does.
Also for kpagemap, please.
Should /proc/pid/pagemap and kpagemap be versioned?
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 12/13] maps#2: Add /proc/pid/pagemap interface
2007-04-07 6:55 ` Andrew Morton
@ 2007-04-07 16:36 ` Matt Mackall
0 siblings, 0 replies; 30+ messages in thread
From: Matt Mackall @ 2007-04-07 16:36 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
On Fri, Apr 06, 2007 at 11:55:10PM -0700, Andrew Morton wrote:
> On Fri, 06 Apr 2007 17:03:13 -0500 Matt Mackall <mpm@selenic.com> wrote:
>
> > Add /proc/pid/pagemap interface
> >
> > This interface provides a mapping for each page in an address space to
> > its physical page frame number, allowing precise determination of what
> > pages are mapped and what pages are shared between processes.
>
> Could we please have a simple read-proc-pid-pagemap.c placed under
> Documentation/ somewhere? Also some sample output for the changelog
> so we can see what all this does.
Working on that. The userspace portion of my tools is very rough at
the moment. And in Python.
> Also for kpagemap, please.
>
> Should /proc/pid/pagemap and kpagemap be versioned?
They've both got a variable-sized header, so we can add things there.
--
Mathematics is the supreme nostalgia of our time.
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 12/13] maps#2: Add /proc/pid/pagemap interface
2007-04-06 22:03 ` [PATCH 12/13] maps#2: Add /proc/pid/pagemap interface Matt Mackall
2007-04-07 6:55 ` Andrew Morton
@ 2007-04-19 19:12 ` Dave Hansen
2007-04-19 19:58 ` Matt Mackall
1 sibling, 1 reply; 30+ messages in thread
From: Dave Hansen @ 2007-04-19 19:12 UTC (permalink / raw)
To: Matt Mackall; +Cc: Andrew Morton, linux-kernel
On Fri, 2007-04-06 at 17:03 -0500, Matt Mackall wrote:
>
> +static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
> + void *private)
> +{
> + struct pagemapread *pm = private;
> + pte_t *pte;
> + int err;
> +
> + pte = pte_offset_map(pmd, addr);
> + for (; addr != end; pte++, addr += PAGE_SIZE) {
> + if (addr < pm->next)
> + continue;
> + if (!pte_present(*pte))
> + err = add_to_pagemap(addr, -1, pm);
> + else
> + err = add_to_pagemap(addr, pte_pfn(*pte), pm);
> + if (err)
> + return err;
> + }
> + pte_unmap(pte - 1);
> + return 0;
> +}
Sorry for the horribly late reply. ;)
Would you have any problems with this being extended for the
!pte_present() case to show pages that happen to be in swap?
I'm playing with some process checkpoint/restart code, and using the
existing swap mechanisms to get the current memory contents out of the
process. I've also created a hackish syscall to make a pretty raw dump
of pte contents.
Perhaps we could steal the high bits of the pfn and have its presence in
swap, plus some handle to which swapfile it is in. Or, would you rather
I just create a new /proc file that utilizes most of the code you
already put in place, and _just_ deals with swap?
-- Dave
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 12/13] maps#2: Add /proc/pid/pagemap interface
2007-04-19 19:12 ` Dave Hansen
@ 2007-04-19 19:58 ` Matt Mackall
0 siblings, 0 replies; 30+ messages in thread
From: Matt Mackall @ 2007-04-19 19:58 UTC (permalink / raw)
To: Dave Hansen; +Cc: Andrew Morton, linux-kernel
On Thu, Apr 19, 2007 at 12:12:29PM -0700, Dave Hansen wrote:
> On Fri, 2007-04-06 at 17:03 -0500, Matt Mackall wrote:
> >
> > +static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
> > + void *private)
> > +{
> > + struct pagemapread *pm = private;
> > + pte_t *pte;
> > + int err;
> > +
> > + pte = pte_offset_map(pmd, addr);
> > + for (; addr != end; pte++, addr += PAGE_SIZE) {
> > + if (addr < pm->next)
> > + continue;
> > + if (!pte_present(*pte))
> > + err = add_to_pagemap(addr, -1, pm);
> > + else
> > + err = add_to_pagemap(addr, pte_pfn(*pte), pm);
> > + if (err)
> > + return err;
> > + }
> > + pte_unmap(pte - 1);
> > + return 0;
> > +}
>
> Sorry for the horribly late reply. ;)
>
> Would you have any problems with this being extended for the !
> pte_present() case to show pages that happen to be in swap?
>
> I'm playing with some process checkpoint/restart code, and using the
> existing swap mechanisms to get the current memory contents out of the
> process. I've also created a hackish syscall to make a pretty raw dump
> of pte contents.
>
> Perhaps we could steal the high bits of the pfn and have its presence in
> swap, plus some handle to which swapfile it is in. Or, would you rather
> I just create a new /proc file that utilizes most of the code you
> already put in place, and _just_ deals with swap?
It seems reasonable to deal with swap here in some fashion. It's just
a matter of how. Current swap entries aren't terribly portable.
I'm planning on using a couple high bits for exposing active/dirty.
Adding present should be fine as well.
--
Mathematics is the supreme nostalgia of our time.
^ permalink raw reply [flat|nested] 30+ messages in thread
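
A sketch of the bit-stealing idea just discussed, to make it concrete;
the layout and the PM_* names are hypothetical, not something the posted
patches implement. On 64-bit, physical page frame numbers leave the top
bits of each entry free for status flags (a 32-bit layout would have to
be stingier):

/* hypothetical entry layout: pfn or swap info in the low bits,
 * status in the top bits */
#define PM_PRESENT	(1UL << 63)
#define PM_SWAP		(1UL << 62)
#define PM_PFN_MASK	((1UL << 56) - 1)

static unsigned long make_entry(unsigned long pfn, int present, int swap)
{
	return (pfn & PM_PFN_MASK) |
		(present ? PM_PRESENT : 0) |
		(swap ? PM_SWAP : 0);
}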
* [PATCH 13/13] maps#2: Add /proc/kpagemap interface
2007-04-06 22:03 [PATCH 0/13] maps#2: pagemap, kpagemap, and related cleanups take 2 Matt Mackall
` (11 preceding siblings ...)
2007-04-06 22:03 ` [PATCH 12/13] maps#2: Add /proc/pid/pagemap interface Matt Mackall
@ 2007-04-06 22:03 ` Matt Mackall
2007-04-19 19:06 ` Dave Hansen
12 siblings, 1 reply; 30+ messages in thread
From: Matt Mackall @ 2007-04-06 22:03 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
Add /proc/kpagemap interface
This makes physical page flags and counts available to userspace.
Together with /proc/pid/pagemap and /proc/pid/clear_refs, this can be
used to measure memory usage on a per-page basis.
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: mm/fs/proc/proc_misc.c
===================================================================
--- mm.orig/fs/proc/proc_misc.c 2007-04-05 14:18:49.000000000 -0500
+++ mm/fs/proc/proc_misc.c 2007-04-05 14:26:23.000000000 -0500
@@ -46,6 +46,8 @@
#include <linux/vmalloc.h>
#include <linux/crash_dump.h>
#include <linux/pid_namespace.h>
+#include <linux/ptrace.h>
+#include <linux/bootmem.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/io.h>
@@ -733,6 +735,91 @@ static struct file_operations proc_page_
};
#endif
+#ifdef CONFIG_PROC_KPAGEMAP
+#define KPMSIZE (sizeof(unsigned long) * 2)
+#define KPMMASK (KPMSIZE - 1)
+/* /proc/kpagemap - an array exposing page flags and counts
+ *
+ * Each entry is a pair of unsigned longs representing the
+ * corresponding physical page, the first containing the page flags
+ * and the second containing the page use count.
+ *
+ * The first 4 bytes of this file form a simple header:
+ *
+ * first byte: 0 for big endian, 1 for little
+ * second byte: page shift (eg 12 for 4096 byte pages)
+ * third byte: entry size in bytes (currently either 4 or 8)
+ * fourth byte: header size
+ */
+static ssize_t kpagemap_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned long *page;
+ struct page *ppage;
+ unsigned long src = *ppos;
+ unsigned long pfn;
+ ssize_t ret = 0;
+ int chunk, i;
+
+ pfn = src / KPMSIZE - 1;
+ count = min_t(size_t, count, ((max_pfn + 1) * KPMSIZE) - src);
+ if (src & KPMMASK || count & KPMMASK)
+ return -EIO;
+
+ page = (unsigned long *)__get_free_page(GFP_USER);
+ if (!page)
+ return -ENOMEM;
+
+ while (count > 0) {
+ chunk = min_t(size_t, count, PAGE_SIZE);
+ i = 0;
+
+ if (pfn == -1) {
+ page[0] = 0;
+ page[1] = 0;
+ ((char *)page)[0] = (ntohl(1) != 1);
+ ((char *)page)[1] = PAGE_SHIFT;
+ ((char *)page)[2] = sizeof(unsigned long);
+ ((char *)page)[3] = KPMSIZE;
+ i = 2;
+ pfn++;
+ }
+
+ for (; i < 2 * chunk / KPMSIZE; i += 2, pfn++) {
+ ppage = pfn_to_page(pfn);
+ if (!ppage) {
+ page[i] = 0;
+ page[i + 1] = 0;
+ } else {
+ page[i] = ppage->flags;
+ page[i + 1] = atomic_read(&ppage->_count);
+ }
+ }
+ chunk = (i / 2) * KPMSIZE;
+
+ if (copy_to_user(buf, page, chunk)) {
+ ret = -EFAULT;
+ break;
+ }
+ ret += chunk;
+ src += chunk;
+ buf += chunk;
+ count -= chunk;
+ cond_resched();
+ }
+ *ppos = src;
+
+ free_page((unsigned long)page);
+ return ret;
+}
+
+struct proc_dir_entry *proc_kpagemap;
+static struct file_operations proc_kpagemap_operations = {
+ .llseek = mem_lseek,
+ .read = kpagemap_read,
+};
+#endif
+
struct proc_dir_entry *proc_root_kcore;
void create_seq_entry(char *name, mode_t mode, const struct file_operations *f)
@@ -812,6 +899,11 @@ void __init proc_misc_init(void)
(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
}
#endif
+#ifdef CONFIG_PROC_KPAGEMAP
+ proc_kpagemap = create_proc_entry("kpagemap", S_IRUSR, NULL);
+ if (proc_kpagemap)
+ proc_kpagemap->proc_fops = &proc_kpagemap_operations;
+#endif
#ifdef CONFIG_PROC_VMCORE
proc_vmcore = create_proc_entry("vmcore", S_IRUSR, NULL);
if (proc_vmcore)
Index: mm/init/Kconfig
===================================================================
--- mm.orig/init/Kconfig 2007-04-05 14:18:49.000000000 -0500
+++ mm/init/Kconfig 2007-04-05 14:26:23.000000000 -0500
@@ -612,6 +612,15 @@ config PROC_PAGEMAP
with other processes. Disabling this interface will reduce the
size of the kernel for small machines.
+config PROC_KPAGEMAP
+ default y
+ bool "Enable /proc/kpagemap support" if EMBEDDED && PROC_FS
+ help
+ The /proc/kpagemap interface allows reading the
+ kernel's per-page flags and usage counts to gather precise
+ information on page-level memory usage. Disabling this interface
+ will reduce the size of the kernel for small machines.
+
endmenu # General setup
config RT_MUTEXES
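
As with pagemap, a minimal same-architecture reader sketch for the
pair-per-page format described above may help; the first pair is the
header slot, so the pair for pfn N sits at offset (N + 1) * KPMSIZE.
Error handling is elided:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned long pair[2];	/* page flags, page use count */
	unsigned long pfn;
	int fd = open("/proc/kpagemap", O_RDONLY);

	if (fd < 0)
		return 1;
	/* pair 0 is the header slot, so pfn N is pair N + 1 */
	for (pfn = 0; pfn < 16; pfn++) {
		if (pread(fd, pair, sizeof(pair),
			  (off_t)(pfn + 1) * sizeof(pair)) != sizeof(pair))
			break;
		printf("pfn %lu: flags %#lx, count %lu\n",
		       pfn, pair[0], pair[1]);
	}
	close(fd);
	return 0;
}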
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 13/13] maps#2: Add /proc/kpagemap interface
2007-04-06 22:03 ` [PATCH 13/13] maps#2: Add /proc/kpagemap interface Matt Mackall
@ 2007-04-19 19:06 ` Dave Hansen
2007-04-19 20:02 ` Matt Mackall
0 siblings, 1 reply; 30+ messages in thread
From: Dave Hansen @ 2007-04-19 19:06 UTC (permalink / raw)
To: Matt Mackall; +Cc: Andrew Morton, linux-kernel
On Fri, 2007-04-06 at 17:03 -0500, Matt Mackall wrote:
>
> +static ssize_t kpagemap_read(struct file *file, char __user *buf,
> + size_t count, loff_t *ppos)
> +{
...
> + for (; i < 2 * chunk / KPMSIZE; i += 2, pfn++) {
> + ppage = pfn_to_page(pfn);
> + if (!ppage) {
> + page[i] = 0;
> + page[i + 1] = 0;
> + } else {
> + page[i] = ppage->flags;
> + page[i + 1] = atomic_read(&ppage->_count);
> + }
> + }
I think this needs a pfn_valid() check for sparse/discontig systems. I
think we'll oops if we have holes because we don't check pfn_valid()
inside of pfn_to_page().
-- Dave
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 13/13] maps#2: Add /proc/kpagemap interface
2007-04-19 19:06 ` Dave Hansen
@ 2007-04-19 20:02 ` Matt Mackall
2007-04-19 20:25 ` Dave Hansen
0 siblings, 1 reply; 30+ messages in thread
From: Matt Mackall @ 2007-04-19 20:02 UTC (permalink / raw)
To: Dave Hansen; +Cc: Andrew Morton, linux-kernel
On Thu, Apr 19, 2007 at 12:06:38PM -0700, Dave Hansen wrote:
> On Fri, 2007-04-06 at 17:03 -0500, Matt Mackall wrote:
> >
> > +static ssize_t kpagemap_read(struct file *file, char __user *buf,
> > + size_t count, loff_t *ppos)
> > +{
> ...
> > + for (; i < 2 * chunk / KPMSIZE; i += 2, pfn++) {
> > + ppage = pfn_to_page(pfn);
> > + if (!ppage) {
> > + page[i] = 0;
> > + page[i + 1] = 0;
> > + } else {
> > + page[i] = ppage->flags;
> > + page[i + 1] = atomic_read(&ppage->_count);
> > + }
> > + }
>
> I think this needs a pfn_valid() check for sparse/discontig systems. I
> think we'll oops if we have holes because we don't check pfn_valid()
> inside of pfn_to_page().
Are there cases where pfn_valid is true but pfn_to_page doesn't work?
In other words, do I still only need two cases?
--
Mathematics is the supreme nostalgia of our time.
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 13/13] maps#2: Add /proc/kpagemap interface
2007-04-19 20:02 ` Matt Mackall
@ 2007-04-19 20:25 ` Dave Hansen
0 siblings, 0 replies; 30+ messages in thread
From: Dave Hansen @ 2007-04-19 20:25 UTC (permalink / raw)
To: Matt Mackall; +Cc: Andrew Morton, linux-kernel
On Thu, 2007-04-19 at 15:02 -0500, Matt Mackall wrote:
> On Thu, Apr 19, 2007 at 12:06:38PM -0700, Dave Hansen wrote:
> > On Fri, 2007-04-06 at 17:03 -0500, Matt Mackall wrote:
> > >
> > > +static ssize_t kpagemap_read(struct file *file, char __user *buf,
> > > + size_t count, loff_t *ppos)
> > > +{
> > ...
> > > + for (; i < 2 * chunk / KPMSIZE; i += 2, pfn++) {
> > > + ppage = pfn_to_page(pfn);
> > > + if (!ppage) {
> > > + page[i] = 0;
> > > + page[i + 1] = 0;
> > > + } else {
> > > + page[i] = ppage->flags;
> > > + page[i + 1] = atomic_read(&ppage->_count);
> > > + }
> > > + }
> >
> > I think this needs a pfn_valid() check for sparse/discontig systems. I
> > think we'll oops if we have holes because we don't check pfn_valid()
> > inside of pfn_to_page().
>
> Are there cases where pfn_valid is true but pfn_to_page doesn't work?
> In other words, do I still only need two cases?
pfn_valid() will at least guarantee that pfn_to_page() will give you
_some_ 'struct page' back.
However, it isn't an absolute guarantee that there is actual physical
memory backing that struct page. For your use here, it should be OK,
but it might be a bit confusing if anyone ever ends up with more entries
in that /proc file than they have pages of total memory.
But, this is a pretty low-level debugging mechanism, and I'm not sure
this is very easy to solve in an arch-independent way.
I'd probably just leave it looking something like this:
+ for (; i < 2 * chunk / KPMSIZE; i += 2, pfn++) {
+ if (!pfn_valid(pfn)) {
+ page[i] = 0;
+ page[i + 1] = 0;
+ } else {
+ ppage = pfn_to_page(pfn);
+ page[i] = ppage->flags;
+ page[i + 1] = atomic_read(&ppage->_count);
+ }
+ }
BTW, page->flags can be quite config-dependent in how stuff is stored
there, especially with the zone, node, and section information encoded
in the high bits. Do you, perhaps, want to just spit out the bits that are
actual PageFoo() flags?
-- Dave
^ permalink raw reply [flat|nested] 30+ messages in thread
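
To make Dave's closing suggestion concrete: rather than exporting the
raw, config-dependent page->flags word, the kernel side could translate
a few PageFoo() tests into a bit layout of its own choosing. A hedged
sketch, with bit positions made up for illustration (it assumes the
page-flags macros from <linux/page-flags.h>, already available where
kpagemap_read lives):

static unsigned long kpagemap_flags(struct page *page)
{
	unsigned long f = 0;

	if (PageLocked(page))
		f |= 1UL << 0;
	if (PageReferenced(page))
		f |= 1UL << 1;
	if (PageUptodate(page))
		f |= 1UL << 2;
	if (PageDirty(page))
		f |= 1UL << 3;
	if (PageLRU(page))
		f |= 1UL << 4;
	if (PageActive(page))
		f |= 1UL << 5;
	if (PageSlab(page))
		f |= 1UL << 6;
	return f;
}

This keeps the exported bits stable across configs and architectures, at
the cost of deciding up front which flags are worth exporting.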