public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
@ 2007-04-04  2:43 Matt Mackall
  2007-04-04  2:43 ` [PATCH 1/13] maps: Uninline some functions in the page walker Matt Mackall
                   ` (13 more replies)
  0 siblings, 14 replies; 77+ messages in thread
From: Matt Mackall @ 2007-04-04  2:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

This patch series introduces /proc/pid/pagemap and /proc/kpagemap,
which allow detailed run-time examination of process memory usage at a
page granularity.

The first several patches whip the page-walking code introduced for
/proc/pid/smaps and clear_refs into a more generic form, the next
couple make those interfaces optional, and the last two introduce the
new interfaces, also optional.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [PATCH 1/13] maps: Uninline some functions in the page walker
  2007-04-04  2:43 [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups Matt Mackall
@ 2007-04-04  2:43 ` Matt Mackall
  2007-04-04  2:43 ` [PATCH 2/13] maps: Eliminate the pmd_walker struct " Matt Mackall
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 77+ messages in thread
From: Matt Mackall @ 2007-04-04  2:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Uninline some functions in the page walker

Signed-off-by: Matt Mackall <mpm@selenic.com>

Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c	2007-03-24 21:33:42.000000000 -0500
+++ mm/fs/proc/task_mmu.c	2007-03-24 21:33:47.000000000 -0500
@@ -284,7 +284,7 @@ static void clear_refs_pte_range(struct 
 	cond_resched();
 }
 
-static inline void walk_pmd_range(struct pmd_walker *walker, pud_t *pud,
+static void walk_pmd_range(struct pmd_walker *walker, pud_t *pud,
 				  unsigned long addr, unsigned long end)
 {
 	pmd_t *pmd;
@@ -299,7 +299,7 @@ static inline void walk_pmd_range(struct
 	}
 }
 
-static inline void walk_pud_range(struct pmd_walker *walker, pgd_t *pgd,
+static void walk_pud_range(struct pmd_walker *walker, pgd_t *pgd,
 				  unsigned long addr, unsigned long end)
 {
 	pud_t *pud;
@@ -323,11 +323,11 @@ static inline void walk_pud_range(struct
  * Recursively walk the page table for the memory area in a VMA, calling
  * a callback for every bottom-level (PTE) page table.
  */
-static inline void walk_page_range(struct vm_area_struct *vma,
-				   void (*action)(struct vm_area_struct *,
-						  pmd_t *, unsigned long,
-						  unsigned long, void *),
-				   void *private)
+static void walk_page_range(struct vm_area_struct *vma,
+			    void (*action)(struct vm_area_struct *,
+					   pmd_t *, unsigned long,
+					   unsigned long, void *),
+			    void *private)
 {
 	unsigned long addr = vma->vm_start;
 	unsigned long end = vma->vm_end;

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [PATCH 2/13] maps: Eliminate the pmd_walker struct in the page walker
  2007-04-04  2:43 [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups Matt Mackall
  2007-04-04  2:43 ` [PATCH 1/13] maps: Uninline some functions in the page walker Matt Mackall
@ 2007-04-04  2:43 ` Matt Mackall
  2007-04-04  2:43 ` [PATCH 3/13] maps: Remove vma from args " Matt Mackall
                   ` (11 subsequent siblings)
  13 siblings, 0 replies; 77+ messages in thread
From: Matt Mackall @ 2007-04-04  2:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Eliminate the pmd_walker struct in the page walker

This slightly simplifies things for the next few cleanups.

Signed-off-by: Matt Mackall <mpm@selenic.com>

Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c	2007-03-24 21:33:47.000000000 -0500
+++ mm/fs/proc/task_mmu.c	2007-03-24 21:33:50.000000000 -0500
@@ -116,6 +116,7 @@ static void pad_len_spaces(struct seq_fi
 
 struct mem_size_stats
 {
+	struct vm_area_struct *vma;
 	unsigned long resident;
 	unsigned long shared_clean;
 	unsigned long shared_dirty;
@@ -124,13 +125,6 @@ struct mem_size_stats
 	unsigned long referenced;
 };
 
-struct pmd_walker {
-	struct vm_area_struct *vma;
-	void *private;
-	void (*action)(struct vm_area_struct *, pmd_t *, unsigned long,
-		       unsigned long, void *);
-};
-
 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
 {
 	struct proc_maps_private *priv = m->private;
@@ -218,11 +212,11 @@ static int show_map(struct seq_file *m, 
 	return show_map_internal(m, v, NULL);
 }
 
-static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-			    unsigned long addr, unsigned long end,
+static void smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			    void *private)
 {
 	struct mem_size_stats *mss = private;
+	struct vm_area_struct *vma = mss->vma;
 	pte_t *pte, ptent;
 	spinlock_t *ptl;
 	struct page *page;
@@ -258,10 +252,10 @@ static void smaps_pte_range(struct vm_ar
 	cond_resched();
 }
 
-static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-				 unsigned long addr, unsigned long end,
-				 void *private)
+static void clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
+				 unsigned long end, void *private)
 {
+	struct vm_area_struct *vma = private;
 	pte_t *pte, ptent;
 	spinlock_t *ptl;
 	struct page *page;
@@ -284,8 +278,10 @@ static void clear_refs_pte_range(struct 
 	cond_resched();
 }
 
-static void walk_pmd_range(struct pmd_walker *walker, pud_t *pud,
-				  unsigned long addr, unsigned long end)
+static void walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+			   void (*action)(pmd_t *, unsigned long,
+					  unsigned long, void *),
+			   void *private)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -295,12 +291,14 @@ static void walk_pmd_range(struct pmd_wa
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		walker->action(walker->vma, pmd, addr, next, walker->private);
+		action(pmd, addr, next, private);
 	}
 }
 
-static void walk_pud_range(struct pmd_walker *walker, pgd_t *pgd,
-				  unsigned long addr, unsigned long end)
+static void walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+			   void (*action)(pmd_t *, unsigned long,
+					  unsigned long, void *),
+			   void *private)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -310,7 +308,7 @@ static void walk_pud_range(struct pmd_wa
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		walk_pmd_range(walker, pud, addr, next);
+		walk_pmd_range(pud, addr, next, action, private);
 	}
 }
 
@@ -324,18 +322,12 @@ static void walk_pud_range(struct pmd_wa
  * a callback for every bottom-level (PTE) page table.
  */
 static void walk_page_range(struct vm_area_struct *vma,
-			    void (*action)(struct vm_area_struct *,
-					   pmd_t *, unsigned long,
+			    void (*action)(pmd_t *, unsigned long,
 					   unsigned long, void *),
 			    void *private)
 {
 	unsigned long addr = vma->vm_start;
 	unsigned long end = vma->vm_end;
-	struct pmd_walker walker = {
-		.vma		= vma,
-		.private	= private,
-		.action		= action,
-	};
 	pgd_t *pgd;
 	unsigned long next;
 
@@ -344,7 +336,7 @@ static void walk_page_range(struct vm_ar
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		walk_pud_range(&walker, pgd, addr, next);
+		walk_pud_range(pgd, addr, next, action, private);
 	}
 }
 
@@ -354,6 +346,7 @@ static int show_smap(struct seq_file *m,
 	struct mem_size_stats mss;
 
 	memset(&mss, 0, sizeof mss);
+	mss.vma = vma;
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 		walk_page_range(vma, smaps_pte_range, &mss);
 	return show_map_internal(m, v, &mss);
@@ -366,7 +359,7 @@ void clear_refs_smap(struct mm_struct *m
 	down_read(&mm->mmap_sem);
 	for (vma = mm->mmap; vma; vma = vma->vm_next)
 		if (vma->vm_mm && !is_vm_hugetlb_page(vma))
-			walk_page_range(vma, clear_refs_pte_range, NULL);
+			walk_page_range(vma, clear_refs_pte_range, vma);
 	flush_tlb_mm(mm);
 	up_read(&mm->mmap_sem);
 }

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [PATCH 3/13] maps: Remove vma from args in the page walker
  2007-04-04  2:43 [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups Matt Mackall
  2007-04-04  2:43 ` [PATCH 1/13] maps: Uninline some functions in the page walker Matt Mackall
  2007-04-04  2:43 ` [PATCH 2/13] maps: Eliminate the pmd_walker struct " Matt Mackall
@ 2007-04-04  2:43 ` Matt Mackall
  2007-04-04  2:43 ` [PATCH 4/13] maps: Propagate errors from callback in " Matt Mackall
                   ` (10 subsequent siblings)
  13 siblings, 0 replies; 77+ messages in thread
From: Matt Mackall @ 2007-04-04  2:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Remove vma from args in the page walker

This makes the walker more generic.

Signed-off-by: Matt Mackall <mpm@selenic.com>

Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c	2007-03-24 21:33:50.000000000 -0500
+++ mm/fs/proc/task_mmu.c	2007-03-24 21:33:52.000000000 -0500
@@ -313,25 +313,26 @@ static void walk_pud_range(pgd_t *pgd, u
 }
 
 /*
- * walk_page_range - walk the page tables of a VMA with a callback
- * @vma - VMA to walk
+ * walk_page_range - walk a memory map's page tables with a callback
+ * @mm - memory map to walk
+ * @addr - starting address
+ * @end - ending address
  * @action - callback invoked for every bottom-level (PTE) page table
  * @private - private data passed to the callback function
  *
  * Recursively walk the page table for the memory area in a VMA, calling
  * a callback for every bottom-level (PTE) page table.
  */
-static void walk_page_range(struct vm_area_struct *vma,
+static void walk_page_range(struct mm_struct *mm,
+			    unsigned long addr, unsigned long end,
 			    void (*action)(pmd_t *, unsigned long,
 					   unsigned long, void *),
 			    void *private)
 {
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
 	pgd_t *pgd;
 	unsigned long next;
 
-	for (pgd = pgd_offset(vma->vm_mm, addr); addr != end;
+	for (pgd = pgd_offset(mm, addr); addr != end;
 	     pgd++, addr = next) {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
@@ -348,7 +349,8 @@ static int show_smap(struct seq_file *m,
 	memset(&mss, 0, sizeof mss);
 	mss.vma = vma;
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
-		walk_page_range(vma, smaps_pte_range, &mss);
+		walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
+				smaps_pte_range, &mss);
 	return show_map_internal(m, v, &mss);
 }
 
@@ -359,7 +361,8 @@ void clear_refs_smap(struct mm_struct *m
 	down_read(&mm->mmap_sem);
 	for (vma = mm->mmap; vma; vma = vma->vm_next)
 		if (vma->vm_mm && !is_vm_hugetlb_page(vma))
-			walk_page_range(vma, clear_refs_pte_range, vma);
+			walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
+					clear_refs_pte_range, vma);
 	flush_tlb_mm(mm);
 	up_read(&mm->mmap_sem);
 }

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [PATCH 4/13] maps: Propagate errors from callback in page walker
  2007-04-04  2:43 [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups Matt Mackall
                   ` (2 preceding siblings ...)
  2007-04-04  2:43 ` [PATCH 3/13] maps: Remove vma from args " Matt Mackall
@ 2007-04-04  2:43 ` Matt Mackall
  2007-04-04  2:43 ` [PATCH 5/13] maps: Add callbacks for each level to " Matt Mackall
                   ` (9 subsequent siblings)
  13 siblings, 0 replies; 77+ messages in thread
From: Matt Mackall @ 2007-04-04  2:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Propagate errors from callback in page walker

Signed-off-by: Matt Mackall <mpm@selenic.com>

Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c	2007-03-24 21:33:52.000000000 -0500
+++ mm/fs/proc/task_mmu.c	2007-03-24 21:33:58.000000000 -0500
@@ -212,8 +212,8 @@ static int show_map(struct seq_file *m, 
 	return show_map_internal(m, v, NULL);
 }
 
-static void smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
-			    void *private)
+static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			   void *private)
 {
 	struct mem_size_stats *mss = private;
 	struct vm_area_struct *vma = mss->vma;
@@ -250,10 +250,11 @@ static void smaps_pte_range(pmd_t *pmd, 
 	}
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
+	return 0;
 }
 
-static void clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
-				 unsigned long end, void *private)
+static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, void *private)
 {
 	struct vm_area_struct *vma = private;
 	pte_t *pte, ptent;
@@ -276,40 +277,51 @@ static void clear_refs_pte_range(pmd_t *
 	}
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
+	return 0;
 }
 
-static void walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
-			   void (*action)(pmd_t *, unsigned long,
-					  unsigned long, void *),
-			   void *private)
+static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+			  int (*action)(pmd_t *, unsigned long,
+					unsigned long, void *),
+			  void *private)
 {
 	pmd_t *pmd;
 	unsigned long next;
+	int err;
 
 	for (pmd = pmd_offset(pud, addr); addr != end;
 	     pmd++, addr = next) {
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		action(pmd, addr, next, private);
+		err = action(pmd, addr, next, private);
+		if (err)
+			return err;
 	}
+
+	return 0;
 }
 
-static void walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
-			   void (*action)(pmd_t *, unsigned long,
-					  unsigned long, void *),
-			   void *private)
+static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+			  int (*action)(pmd_t *, unsigned long,
+					unsigned long, void *),
+			  void *private)
 {
 	pud_t *pud;
 	unsigned long next;
+	int err;
 
 	for (pud = pud_offset(pgd, addr); addr != end;
 	     pud++, addr = next) {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		walk_pmd_range(pud, addr, next, action, private);
+		err = walk_pmd_range(pud, addr, next, action, private);
+		if (err)
+			return err;
 	}
+
+	return 0;
 }
 
 /*
@@ -323,22 +335,27 @@ static void walk_pud_range(pgd_t *pgd, u
  * Recursively walk the page table for the memory area in a VMA, calling
  * a callback for every bottom-level (PTE) page table.
  */
-static void walk_page_range(struct mm_struct *mm,
-			    unsigned long addr, unsigned long end,
-			    void (*action)(pmd_t *, unsigned long,
-					   unsigned long, void *),
-			    void *private)
+static int walk_page_range(struct mm_struct *mm,
+			   unsigned long addr, unsigned long end,
+			   int (*action)(pmd_t *, unsigned long,
+					 unsigned long, void *),
+			   void *private)
 {
 	pgd_t *pgd;
 	unsigned long next;
+	int err;
 
 	for (pgd = pgd_offset(mm, addr); addr != end;
 	     pgd++, addr = next) {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		walk_pud_range(pgd, addr, next, action, private);
+		err = walk_pud_range(pgd, addr, next, action, private);
+		if (err)
+			return err;
 	}
+
+	return 0;
 }
 
 static int show_smap(struct seq_file *m, void *v)

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [PATCH 5/13] maps: Add callbacks for each level to page walker
  2007-04-04  2:43 [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups Matt Mackall
                   ` (3 preceding siblings ...)
  2007-04-04  2:43 ` [PATCH 4/13] maps: Propagate errors from callback in " Matt Mackall
@ 2007-04-04  2:43 ` Matt Mackall
  2007-04-04  2:43 ` [PATCH 6/13] maps: Move the page walker code to lib/ Matt Mackall
                   ` (8 subsequent siblings)
  13 siblings, 0 replies; 77+ messages in thread
From: Matt Mackall @ 2007-04-04  2:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Add callbacks for each level to page walker

This allows iterating over all levels of the page tables. Recursion
continues to the depth of the lowest supplied callback.

This makes the page walker nearly completely generic and should allow
it to replace some other hand-rolled page table walkers.

Signed-off-by: Matt Mackall <mpm@selenic.com>

Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c	2007-03-24 21:33:58.000000000 -0500
+++ mm/fs/proc/task_mmu.c	2007-03-24 21:34:07.000000000 -0500
@@ -280,10 +280,35 @@ static int clear_refs_pte_range(pmd_t *p
 	return 0;
 }
 
+struct mm_walk {
+	int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, void *);
+	int (*pud_entry)(pud_t *, unsigned long, unsigned long, void *);
+	int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, void *);
+	int (*pte_entry)(pte_t *, unsigned long, unsigned long, void *);
+};
+
+static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			  struct mm_walk *walk, void *private)
+{
+	pte_t *pte;
+	int err;
+
+	for (pte = pte_offset_map(pmd, addr); addr != end;
+	     addr += PAGE_SIZE, pte++) {
+		if (pte_none(*pte))
+			continue;
+		err = walk->pte_entry(pte, addr, addr, private);
+		if (err) {
+			pte_unmap(pte);
+			return err;
+		}
+	}
+	pte_unmap(pte);
+	return 0;
+}
+
 static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
-			  int (*action)(pmd_t *, unsigned long,
-					unsigned long, void *),
-			  void *private)
+			  struct mm_walk *walk, void *private)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -294,18 +319,22 @@ static int walk_pmd_range(pud_t *pud, un
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		err = action(pmd, addr, next, private);
-		if (err)
-			return err;
+		if (walk->pmd_entry) {
+			err = walk->pmd_entry(pmd, addr, next, private);
+			if (err)
+				return err;
+		}
+		if (walk->pte_entry) {
+			err = walk_pte_range(pmd, addr, next, walk, private);
+			if (err)
+				return err;
+		}
 	}
-
 	return 0;
 }
 
 static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
-			  int (*action)(pmd_t *, unsigned long,
-					unsigned long, void *),
-			  void *private)
+			  struct mm_walk *walk, void *private)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -316,11 +345,17 @@ static int walk_pud_range(pgd_t *pgd, un
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		err = walk_pmd_range(pud, addr, next, action, private);
-		if (err)
-			return err;
+		if (walk->pud_entry) {
+			err = walk->pud_entry(pud, addr, next, private);
+			if (err)
+				return err;
+		}
+		if (walk->pmd_entry || walk->pte_entry) {
+			err = walk_pmd_range(pud, addr, next, walk, private);
+			if (err)
+				return err;
+		}
 	}
-
 	return 0;
 }
 
@@ -337,9 +372,7 @@ static int walk_pud_range(pgd_t *pgd, un
  */
 static int walk_page_range(struct mm_struct *mm,
 			   unsigned long addr, unsigned long end,
-			   int (*action)(pmd_t *, unsigned long,
-					 unsigned long, void *),
-			   void *private)
+			   struct mm_walk *walk, void *private)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -350,14 +383,22 @@ static int walk_page_range(struct mm_str
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		err = walk_pud_range(pgd, addr, next, action, private);
-		if (err)
-			return err;
+		if (walk->pgd_entry) {
+			err = walk->pgd_entry(pgd, addr, next, private);
+			if (err)
+				return err;
+		}
+		if (walk->pud_entry || walk->pmd_entry || walk->pte_entry) {
+			err = walk_pud_range(pgd, addr, next, walk, private);
+			if (err)
+				return err;
+		}
 	}
-
 	return 0;
 }
 
+static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range };
+
 static int show_smap(struct seq_file *m, void *v)
 {
 	struct vm_area_struct *vma = v;
@@ -367,10 +408,12 @@ static int show_smap(struct seq_file *m,
 	mss.vma = vma;
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 		walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
-				smaps_pte_range, &mss);
+				&smaps_walk, &mss);
 	return show_map_internal(m, v, &mss);
 }
 
+static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range };
+
 void clear_refs_smap(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
@@ -379,7 +422,7 @@ void clear_refs_smap(struct mm_struct *m
 	for (vma = mm->mmap; vma; vma = vma->vm_next)
 		if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 			walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
-					clear_refs_pte_range, vma);
+					&clear_refs_walk, vma);
 	flush_tlb_mm(mm);
 	up_read(&mm->mmap_sem);
 }

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [PATCH 6/13] maps: Move the page walker code to lib/
  2007-04-04  2:43 [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups Matt Mackall
                   ` (4 preceding siblings ...)
  2007-04-04  2:43 ` [PATCH 5/13] maps: Add callbacks for each level to " Matt Mackall
@ 2007-04-04  2:43 ` Matt Mackall
  2007-04-04  3:51   ` Nick Piggin
  2007-04-04  2:43 ` [PATCH 7/13] maps: Simplify interdependence of /proc/pid/maps and smaps Matt Mackall
                   ` (7 subsequent siblings)
  13 siblings, 1 reply; 77+ messages in thread
From: Matt Mackall @ 2007-04-04  2:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Move the page walker code to lib/

This lets it get shared outside of proc/ and linked in only when
needed.

Signed-off-by: Matt Mackall <mpm@selenic.com>

Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c	2007-03-27 22:13:43.000000000 -0500
+++ mm/fs/proc/task_mmu.c	2007-03-27 22:13:51.000000000 -0500
@@ -280,123 +280,6 @@ static int clear_refs_pte_range(pmd_t *p
 	return 0;
 }
 
-struct mm_walk {
-	int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, void *);
-	int (*pud_entry)(pud_t *, unsigned long, unsigned long, void *);
-	int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, void *);
-	int (*pte_entry)(pte_t *, unsigned long, unsigned long, void *);
-};
-
-static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
-			  struct mm_walk *walk, void *private)
-{
-	pte_t *pte;
-	int err;
-
-	for (pte = pte_offset_map(pmd, addr); addr != end;
-	     addr += PAGE_SIZE, pte++) {
-		if (pte_none(*pte))
-			continue;
-		err = walk->pte_entry(pte, addr, addr, private);
-		if (err) {
-			pte_unmap(pte);
-			return err;
-		}
-	}
-	pte_unmap(pte);
-	return 0;
-}
-
-static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
-			  struct mm_walk *walk, void *private)
-{
-	pmd_t *pmd;
-	unsigned long next;
-	int err;
-
-	for (pmd = pmd_offset(pud, addr); addr != end;
-	     pmd++, addr = next) {
-		next = pmd_addr_end(addr, end);
-		if (pmd_none_or_clear_bad(pmd))
-			continue;
-		if (walk->pmd_entry) {
-			err = walk->pmd_entry(pmd, addr, next, private);
-			if (err)
-				return err;
-		}
-		if (walk->pte_entry) {
-			err = walk_pte_range(pmd, addr, next, walk, private);
-			if (err)
-				return err;
-		}
-	}
-	return 0;
-}
-
-static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
-			  struct mm_walk *walk, void *private)
-{
-	pud_t *pud;
-	unsigned long next;
-	int err;
-
-	for (pud = pud_offset(pgd, addr); addr != end;
-	     pud++, addr = next) {
-		next = pud_addr_end(addr, end);
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		if (walk->pud_entry) {
-			err = walk->pud_entry(pud, addr, next, private);
-			if (err)
-				return err;
-		}
-		if (walk->pmd_entry || walk->pte_entry) {
-			err = walk_pmd_range(pud, addr, next, walk, private);
-			if (err)
-				return err;
-		}
-	}
-	return 0;
-}
-
-/*
- * walk_page_range - walk a memory map's page tables with a callback
- * @mm - memory map to walk
- * @addr - starting address
- * @end - ending address
- * @action - callback invoked for every bottom-level (PTE) page table
- * @private - private data passed to the callback function
- *
- * Recursively walk the page table for the memory area in a VMA, calling
- * a callback for every bottom-level (PTE) page table.
- */
-static int walk_page_range(struct mm_struct *mm,
-			   unsigned long addr, unsigned long end,
-			   struct mm_walk *walk, void *private)
-{
-	pgd_t *pgd;
-	unsigned long next;
-	int err;
-
-	for (pgd = pgd_offset(mm, addr); addr != end;
-	     pgd++, addr = next) {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		if (walk->pgd_entry) {
-			err = walk->pgd_entry(pgd, addr, next, private);
-			if (err)
-				return err;
-		}
-		if (walk->pud_entry || walk->pmd_entry || walk->pte_entry) {
-			err = walk_pud_range(pgd, addr, next, walk, private);
-			if (err)
-				return err;
-		}
-	}
-	return 0;
-}
-
 static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range };
 
 static int show_smap(struct seq_file *m, void *v)
Index: mm/include/linux/mm.h
===================================================================
--- mm.orig/include/linux/mm.h	2007-03-27 22:13:42.000000000 -0500
+++ mm/include/linux/mm.h	2007-03-27 22:13:51.000000000 -0500
@@ -747,6 +747,16 @@ unsigned long unmap_vmas(struct mmu_gath
 		struct vm_area_struct *start_vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *);
+
+struct mm_walk {
+	int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, void *);
+	int (*pud_entry)(pud_t *, unsigned long, unsigned long, void *);
+	int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, void *);
+	int (*pte_entry)(pte_t *, unsigned long, unsigned long, void *);
+};
+
+int walk_page_range(struct mm_struct *, unsigned long addr, unsigned long end,
+		    struct mm_walk *walk, void *private);
 void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
 		unsigned long end, unsigned long floor, unsigned long ceiling);
 void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
Index: mm/lib/Makefile
===================================================================
--- mm.orig/lib/Makefile	2007-03-27 22:14:09.000000000 -0500
+++ mm/lib/Makefile	2007-03-27 22:16:49.000000000 -0500
@@ -7,7 +7,7 @@ lib-y := ctype.o string.o vsprintf.o cmd
 	 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \
 	 sha1.o irq_regs.o reciprocal_div.o
 
-lib-$(CONFIG_MMU) += ioremap.o
+lib-$(CONFIG_MMU) += ioremap.o pagewalk.o
 lib-$(CONFIG_SMP) += cpumask.o
 
 lib-y	+= kobject.o kref.o kobject_uevent.o klist.o
Index: mm/lib/pagewalk.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ mm/lib/pagewalk.c	2007-03-27 22:18:37.000000000 -0500
@@ -0,0 +1,111 @@
+#include <linux/mm.h>
+
+static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			  struct mm_walk *walk, void *private)
+{
+	pte_t *pte;
+	int err;
+
+	for (pte = pte_offset_map(pmd, addr); addr != end;
+	     addr += PAGE_SIZE, pte++) {
+		if (pte_none(*pte))
+			continue;
+		err = walk->pte_entry(pte, addr, addr, private);
+		if (err) {
+			pte_unmap(pte);
+			return err;
+		}
+	}
+	pte_unmap(pte);
+	return 0;
+}
+
+static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+			  struct mm_walk *walk, void *private)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	int err;
+
+	for (pmd = pmd_offset(pud, addr); addr != end;
+	     pmd++, addr = next) {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		if (walk->pmd_entry) {
+			err = walk->pmd_entry(pmd, addr, next, private);
+			if (err)
+				return err;
+		}
+		if (walk->pte_entry) {
+			err = walk_pte_range(pmd, addr, next, walk, private);
+			if (err)
+				return err;
+		}
+	}
+	return 0;
+}
+
+static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+			  struct mm_walk *walk, void *private)
+{
+	pud_t *pud;
+	unsigned long next;
+	int err;
+
+	for (pud = pud_offset(pgd, addr); addr != end;
+	     pud++, addr = next) {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		if (walk->pud_entry) {
+			err = walk->pud_entry(pud, addr, next, private);
+			if (err)
+				return err;
+		}
+		if (walk->pmd_entry || walk->pte_entry) {
+			err = walk_pmd_range(pud, addr, next, walk, private);
+			if (err)
+				return err;
+		}
+	}
+	return 0;
+}
+
+/*
+ * walk_page_range - walk a memory map's page tables with a callback
+ * @mm - memory map to walk
+ * @addr - starting address
+ * @end - ending address
+ * @walk - set of callbacks to invoke for each level of the tree
+ * @private - private data passed to the callback function
+ *
+ * Recursively walk the page table for the memory area in a VMA, calling
+ * a callback for every bottom-level (PTE) page table.
+ */
+int walk_page_range(struct mm_struct *mm,
+		    unsigned long addr, unsigned long end,
+		    struct mm_walk *walk, void *private)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	int err;
+
+	for (pgd = pgd_offset(mm, addr); addr != end;
+	     pgd++, addr = next) {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		if (walk->pgd_entry) {
+			err = walk->pgd_entry(pgd, addr, next, private);
+			if (err)
+				return err;
+		}
+		if (walk->pud_entry || walk->pmd_entry || walk->pte_entry) {
+			err = walk_pud_range(pgd, addr, next, walk, private);
+			if (err)
+				return err;
+		}
+	}
+	return 0;
+}

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [PATCH 7/13] maps: Simplify interdependence of /proc/pid/maps and smaps
  2007-04-04  2:43 [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups Matt Mackall
                   ` (5 preceding siblings ...)
  2007-04-04  2:43 ` [PATCH 6/13] maps: Move the page walker code to lib/ Matt Mackall
@ 2007-04-04  2:43 ` Matt Mackall
  2007-04-04  2:43 ` [PATCH 8/13] maps: Move clear_refs code to task_mmu.c Matt Mackall
                   ` (6 subsequent siblings)
  13 siblings, 0 replies; 77+ messages in thread
From: Matt Mackall @ 2007-04-04  2:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Simplify interdependence of /proc/pid/maps and smaps

This pulls the shared map display code out of show_map and puts it in
show_smap where it belongs.

Signed-off-by: Matt Mackall <mpm@selenic.com>

Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c	2007-03-27 23:06:18.000000000 -0500
+++ mm/fs/proc/task_mmu.c	2007-03-29 13:01:44.000000000 -0500
@@ -125,7 +125,7 @@ struct mem_size_stats
 	unsigned long referenced;
 };
 
-static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
+static int show_map(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
 	struct task_struct *task = priv->task;
@@ -185,33 +185,11 @@ static int show_map_internal(struct seq_
 	}
 	seq_putc(m, '\n');
 
-	if (mss)
-		seq_printf(m,
-			   "Size:           %8lu kB\n"
-			   "Rss:            %8lu kB\n"
-			   "Shared_Clean:   %8lu kB\n"
-			   "Shared_Dirty:   %8lu kB\n"
-			   "Private_Clean:  %8lu kB\n"
-			   "Private_Dirty:  %8lu kB\n"
-			   "Referenced:     %8lu kB\n",
-			   (vma->vm_end - vma->vm_start) >> 10,
-			   mss->resident >> 10,
-			   mss->shared_clean  >> 10,
-			   mss->shared_dirty  >> 10,
-			   mss->private_clean >> 10,
-			   mss->private_dirty >> 10,
-			   mss->referenced >> 10);
-
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task))? vma->vm_start: 0;
 	return 0;
 }
 
-static int show_map(struct seq_file *m, void *v)
-{
-	return show_map_internal(m, v, NULL);
-}
-
 static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			   void *private)
 {
@@ -286,13 +264,35 @@ static int show_smap(struct seq_file *m,
 {
 	struct vm_area_struct *vma = v;
 	struct mem_size_stats mss;
+	int ret;
 
 	memset(&mss, 0, sizeof mss);
 	mss.vma = vma;
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 		walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
 				&smaps_walk, &mss);
-	return show_map_internal(m, v, &mss);
+
+	ret = show_map(m, v);
+	if (ret)
+		return ret;
+
+	seq_printf(m,
+		   "Size:           %8lu kB\n"
+		   "Rss:            %8lu kB\n"
+		   "Shared_Clean:   %8lu kB\n"
+		   "Shared_Dirty:   %8lu kB\n"
+		   "Private_Clean:  %8lu kB\n"
+		   "Private_Dirty:  %8lu kB\n"
+		   "Referenced:     %8lu kB\n",
+		   (vma->vm_end - vma->vm_start) >> 10,
+		   mss.resident >> 10,
+		   mss.shared_clean  >> 10,
+		   mss.shared_dirty  >> 10,
+		   mss.private_clean >> 10,
+		   mss.private_dirty >> 10,
+		   mss.referenced >> 10);
+
+	return ret;
 }
 
 static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range };

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [PATCH 8/13] maps: Move clear_refs code to task_mmu.c
  2007-04-04  2:43 [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups Matt Mackall
                   ` (6 preceding siblings ...)
  2007-04-04  2:43 ` [PATCH 7/13] maps: Simplify interdependence of /proc/pid/maps and smaps Matt Mackall
@ 2007-04-04  2:43 ` Matt Mackall
  2007-04-04  2:43 ` [PATCH 9/13] maps: Regroup task_mmu by interface Matt Mackall
                   ` (5 subsequent siblings)
  13 siblings, 0 replies; 77+ messages in thread
From: Matt Mackall @ 2007-04-04  2:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Move clear_refs code to task_mmu.c

This puts all the clear_refs code where it belongs and probably lets
things compile on MMU-less systems as well.

Signed-off-by: Matt Mackall <mpm@selenic.com>

Index: mm/fs/proc/base.c
===================================================================
--- mm.orig/fs/proc/base.c	2007-03-29 13:00:41.000000000 -0500
+++ mm/fs/proc/base.c	2007-03-29 13:01:49.000000000 -0500
@@ -750,40 +750,6 @@ static const struct file_operations proc
 	.write		= oom_adjust_write,
 };
 
-static ssize_t clear_refs_write(struct file *file, const char __user *buf,
-				size_t count, loff_t *ppos)
-{
-	struct task_struct *task;
-	char buffer[PROC_NUMBUF], *end;
-	struct mm_struct *mm;

-
-	memset(buffer, 0, sizeof(buffer));
-	if (count > sizeof(buffer) - 1)
-		count = sizeof(buffer) - 1;
-	if (copy_from_user(buffer, buf, count))
-		return -EFAULT;
-	if (!simple_strtol(buffer, &end, 0))
-		return -EINVAL;
-	if (*end == '\n')
-		end++;
-	task = get_proc_task(file->f_path.dentry->d_inode);
-	if (!task)
-		return -ESRCH;
-	mm = get_task_mm(task);
-	if (mm) {
-		clear_refs_smap(mm);
-		mmput(mm);
-	}
-	put_task_struct(task);
-	if (end - buffer == 0)
-		return -EIO;
-	return end - buffer;
-}
-
-static struct file_operations proc_clear_refs_operations = {
-	.write		= clear_refs_write,
-};
-
 #ifdef CONFIG_AUDITSYSCALL
 #define TMPBUFLEN 21
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
Index: mm/fs/proc/internal.h
===================================================================
--- mm.orig/fs/proc/internal.h	2007-03-29 13:00:41.000000000 -0500
+++ mm/fs/proc/internal.h	2007-03-29 13:01:49.000000000 -0500
@@ -45,11 +45,7 @@ extern int proc_pid_statm(struct task_st
 extern const struct file_operations proc_maps_operations;
 extern const struct file_operations proc_numa_maps_operations;
 extern const struct file_operations proc_smaps_operations;
-
-extern const struct file_operations proc_maps_operations;
-extern const struct file_operations proc_numa_maps_operations;
-extern const struct file_operations proc_smaps_operations;
-
+extern const struct file_operations proc_clear_refs_operations;
 
 void free_proc_entry(struct proc_dir_entry *de);
 
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c	2007-03-29 13:01:44.000000000 -0500
+++ mm/fs/proc/task_mmu.c	2007-03-29 13:01:49.000000000 -0500
@@ -297,19 +297,47 @@ static int show_smap(struct seq_file *m,
 
 static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range };
 
-void clear_refs_smap(struct mm_struct *mm)
+static ssize_t clear_refs_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
 {
+	struct task_struct *task;
+	char buffer[13], *end;
+	struct mm_struct *mm;
 	struct vm_area_struct *vma;
 
-	down_read(&mm->mmap_sem);
-	for (vma = mm->mmap; vma; vma = vma->vm_next)
-		if (vma->vm_mm && !is_vm_hugetlb_page(vma))
-			walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
-					&clear_refs_walk, vma);
-	flush_tlb_mm(mm);
-	up_read(&mm->mmap_sem);
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+	if (!simple_strtol(buffer, &end, 0))
+		return -EINVAL;
+	if (*end == '\n')
+		end++;
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task)
+		return -ESRCH;
+	mm = get_task_mm(task);
+	if (mm) {
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next)
+			if (!is_vm_hugetlb_page(vma))
+				walk_page_range(mm, vma->vm_start, vma->vm_end,
+						&clear_refs_walk, vma);
+		flush_tlb_mm(mm);
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+	}
+	put_task_struct(task);
+	if (end - buffer == 0)
+		return -EIO;
+	return end - buffer;
 }
 
+const struct file_operations proc_clear_refs_operations = {
+	.write		= clear_refs_write,
+};
+
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct proc_maps_private *priv = m->private;
Index: mm/include/linux/proc_fs.h
===================================================================
--- mm.orig/include/linux/proc_fs.h	2007-03-29 13:00:41.000000000 -0500
+++ mm/include/linux/proc_fs.h	2007-03-29 13:01:49.000000000 -0500
@@ -117,7 +117,6 @@ int proc_pid_readdir(struct file * filp,
 unsigned long task_vsize(struct mm_struct *);
 int task_statm(struct mm_struct *, int *, int *, int *, int *);
 char *task_mem(struct mm_struct *, char *);
-void clear_refs_smap(struct mm_struct *mm);
 
 struct proc_dir_entry *de_get(struct proc_dir_entry *de);
 void de_put(struct proc_dir_entry *de);

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [PATCH 9/13] maps: Regroup task_mmu by interface
  2007-04-04  2:43 [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups Matt Mackall
                   ` (7 preceding siblings ...)
  2007-04-04  2:43 ` [PATCH 8/13] maps: Move clear_refs code to task_mmu.c Matt Mackall
@ 2007-04-04  2:43 ` Matt Mackall
  2007-04-04  2:43 ` [PATCH 10/13] maps: Make /proc/pid/smaps optional under CONFIG_EMBEDDED Matt Mackall
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 77+ messages in thread
From: Matt Mackall @ 2007-04-04  2:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Regroup task_mmu by interface

Reorder source so that all the code and data for each interface is
together.

Signed-off-by: Matt Mackall <mpm@selenic.com>

Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c	2007-03-28 00:08:05.000000000 -0500
+++ mm/fs/proc/task_mmu.c	2007-03-28 00:21:48.000000000 -0500
@@ -114,16 +114,121 @@ static void pad_len_spaces(struct seq_fi
 	seq_printf(m, "%*c", len, ' ');
 }
 
-struct mem_size_stats
+static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
 {
-	struct vm_area_struct *vma;
-	unsigned long resident;
-	unsigned long shared_clean;
-	unsigned long shared_dirty;
-	unsigned long private_clean;
-	unsigned long private_dirty;
-	unsigned long referenced;
-};
+	if (vma && vma != priv->tail_vma) {
+		struct mm_struct *mm = vma->vm_mm;
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+	}
+}
+
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+	struct proc_maps_private *priv = m->private;
+	unsigned long last_addr = m->version;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma, *tail_vma = NULL;
+	loff_t l = *pos;
+
+	/* Clear the per syscall fields in priv */
+	priv->task = NULL;
+	priv->tail_vma = NULL;
+
+	/*
+	 * We remember last_addr rather than next_addr to hit with
+	 * mmap_cache most of the time. We have zero last_addr at
+	 * the beginning and also after lseek. We will have -1 last_addr
+	 * after the end of the vmas.
+	 */
+
+	if (last_addr == -1UL)
+		return NULL;
+
+	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
+	if (!priv->task)
+		return NULL;
+
+	mm = get_task_mm(priv->task);
+	if (!mm)
+		return NULL;
+
+	priv->tail_vma = tail_vma = get_gate_vma(priv->task);
+	down_read(&mm->mmap_sem);
+
+	/* Start with last addr hint */
+	if (last_addr && (vma = find_vma(mm, last_addr))) {
+		vma = vma->vm_next;
+		goto out;
+	}
+
+	/*
+	 * Check the vma index is within the range and do
+	 * sequential scan until m_index.
+	 */
+	vma = NULL;
+	if ((unsigned long)l < mm->map_count) {
+		vma = mm->mmap;
+		while (l-- && vma)
+			vma = vma->vm_next;
+		goto out;
+	}
+
+	if (l != mm->map_count)
+		tail_vma = NULL; /* After gate vma */
+
+out:
+	if (vma)
+		return vma;
+
+	/* End of vmas has been reached */
+	m->version = (tail_vma != NULL)? 0: -1UL;
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+	return tail_vma;
+}
+
+static void *m_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct proc_maps_private *priv = m->private;
+	struct vm_area_struct *vma = v;
+	struct vm_area_struct *tail_vma = priv->tail_vma;
+
+	(*pos)++;
+	if (vma && (vma != tail_vma) && vma->vm_next)
+		return vma->vm_next;
+	vma_stop(priv, vma);
+	return (vma != tail_vma)? tail_vma: NULL;
+}
+
+static void m_stop(struct seq_file *m, void *v)
+{
+	struct proc_maps_private *priv = m->private;
+	struct vm_area_struct *vma = v;
+
+	vma_stop(priv, vma);
+	if (priv->task)
+		put_task_struct(priv->task);
+}
+
+static int do_maps_open(struct inode *inode, struct file *file,
+			struct seq_operations *ops)
+{
+	struct proc_maps_private *priv;
+	int ret = -ENOMEM;
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv) {
+		priv->pid = proc_pid(inode);
+		ret = seq_open(file, ops);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+			m->private = priv;
+		} else {
+			kfree(priv);
+		}
+	}
+	return ret;
+}
 
 static int show_map(struct seq_file *m, void *v)
 {
@@ -190,6 +295,36 @@ static int show_map(struct seq_file *m, 
 	return 0;
 }
 
+static struct seq_operations proc_pid_maps_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= show_map
+};
+
+static int maps_open(struct inode *inode, struct file *file)
+{
+	return do_maps_open(inode, file, &proc_pid_maps_op);
+}
+
+const struct file_operations proc_maps_operations = {
+	.open		= maps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+};
+
+struct mem_size_stats
+{
+	struct vm_area_struct *vma;
+	unsigned long resident;
+	unsigned long shared_clean;
+	unsigned long shared_dirty;
+	unsigned long private_clean;
+	unsigned long private_dirty;
+	unsigned long referenced;
+};
+
 static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			   void *private)
 {
@@ -231,33 +366,6 @@ static int smaps_pte_range(pmd_t *pmd, u
 	return 0;
 }
 
-static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
-				unsigned long end, void *private)
-{
-	struct vm_area_struct *vma = private;
-	pte_t *pte, ptent;
-	spinlock_t *ptl;
-	struct page *page;
-
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-	for (; addr != end; pte++, addr += PAGE_SIZE) {
-		ptent = *pte;
-		if (!pte_present(ptent))
-			continue;
-
-		page = vm_normal_page(vma, addr, ptent);
-		if (!page)
-			continue;
-
-		/* Clear accessed and referenced bits. */
-		ptep_test_and_clear_young(vma, addr, pte);
-		ClearPageReferenced(page);
-	}
-	pte_unmap_unlock(pte - 1, ptl);
-	cond_resched();
-	return 0;
-}
-
 static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range };
 
 static int show_smap(struct seq_file *m, void *v)
@@ -293,6 +401,52 @@ static int show_smap(struct seq_file *m,
 	return ret;
 }
 
+static struct seq_operations proc_pid_smaps_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= show_smap
+};
+
+static int smaps_open(struct inode *inode, struct file *file)
+{
+	return do_maps_open(inode, file, &proc_pid_smaps_op);
+}
+
+const struct file_operations proc_smaps_operations = {
+	.open		= smaps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+};
+
+static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, void *private)
+{
+	struct vm_area_struct *vma = private;
+	pte_t *pte, ptent;
+	spinlock_t *ptl;
+	struct page *page;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		/* Clear accessed and referenced bits. */
+		ptep_test_and_clear_young(vma, addr, pte);
+		ClearPageReferenced(page);
+	}
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+	return 0;
+}
+
 static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range };
 
 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
@@ -336,148 +490,6 @@ const struct file_operations proc_clear_
 	.write		= clear_refs_write,
 };
 
-static void *m_start(struct seq_file *m, loff_t *pos)
-{
-	struct proc_maps_private *priv = m->private;
-	unsigned long last_addr = m->version;
-	struct mm_struct *mm;
-	struct vm_area_struct *vma, *tail_vma = NULL;
-	loff_t l = *pos;
-
-	/* Clear the per syscall fields in priv */
-	priv->task = NULL;
-	priv->tail_vma = NULL;
-
-	/*
-	 * We remember last_addr rather than next_addr to hit with
-	 * mmap_cache most of the time. We have zero last_addr at
-	 * the beginning and also after lseek. We will have -1 last_addr
-	 * after the end of the vmas.
-	 */
-
-	if (last_addr == -1UL)
-		return NULL;
-
-	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
-	if (!priv->task)
-		return NULL;
-
-	mm = get_task_mm(priv->task);
-	if (!mm)
-		return NULL;
-
-	priv->tail_vma = tail_vma = get_gate_vma(priv->task);
-	down_read(&mm->mmap_sem);
-
-	/* Start with last addr hint */
-	if (last_addr && (vma = find_vma(mm, last_addr))) {
-		vma = vma->vm_next;
-		goto out;
-	}
-
-	/*
-	 * Check the vma index is within the range and do
-	 * sequential scan until m_index.
-	 */
-	vma = NULL;
-	if ((unsigned long)l < mm->map_count) {
-		vma = mm->mmap;
-		while (l-- && vma)
-			vma = vma->vm_next;
-		goto out;
-	}
-
-	if (l != mm->map_count)
-		tail_vma = NULL; /* After gate vma */
-
-out:
-	if (vma)
-		return vma;
-
-	/* End of vmas has been reached */
-	m->version = (tail_vma != NULL)? 0: -1UL;
-	up_read(&mm->mmap_sem);
-	mmput(mm);
-	return tail_vma;
-}
-
-static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
-{
-	if (vma && vma != priv->tail_vma) {
-		struct mm_struct *mm = vma->vm_mm;
-		up_read(&mm->mmap_sem);
-		mmput(mm);
-	}
-}
-
-static void *m_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	struct proc_maps_private *priv = m->private;
-	struct vm_area_struct *vma = v;
-	struct vm_area_struct *tail_vma = priv->tail_vma;
-
-	(*pos)++;
-	if (vma && (vma != tail_vma) && vma->vm_next)
-		return vma->vm_next;
-	vma_stop(priv, vma);
-	return (vma != tail_vma)? tail_vma: NULL;
-}
-
-static void m_stop(struct seq_file *m, void *v)
-{
-	struct proc_maps_private *priv = m->private;
-	struct vm_area_struct *vma = v;
-
-	vma_stop(priv, vma);
-	if (priv->task)
-		put_task_struct(priv->task);
-}
-
-static struct seq_operations proc_pid_maps_op = {
-	.start	= m_start,
-	.next	= m_next,
-	.stop	= m_stop,
-	.show	= show_map
-};
-
-static struct seq_operations proc_pid_smaps_op = {
-	.start	= m_start,
-	.next	= m_next,
-	.stop	= m_stop,
-	.show	= show_smap
-};
-
-static int do_maps_open(struct inode *inode, struct file *file,
-			struct seq_operations *ops)
-{
-	struct proc_maps_private *priv;
-	int ret = -ENOMEM;
-	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
-	if (priv) {
-		priv->pid = proc_pid(inode);
-		ret = seq_open(file, ops);
-		if (!ret) {
-			struct seq_file *m = file->private_data;
-			m->private = priv;
-		} else {
-			kfree(priv);
-		}
-	}
-	return ret;
-}
-
-static int maps_open(struct inode *inode, struct file *file)
-{
-	return do_maps_open(inode, file, &proc_pid_maps_op);
-}
-
-const struct file_operations proc_maps_operations = {
-	.open		= maps_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};
-
 #ifdef CONFIG_NUMA
 extern int show_numa_map(struct seq_file *m, void *v);
 
@@ -512,14 +524,3 @@ const struct file_operations proc_numa_m
 };
 #endif
 
-static int smaps_open(struct inode *inode, struct file *file)
-{
-	return do_maps_open(inode, file, &proc_pid_smaps_op);
-}
-
-const struct file_operations proc_smaps_operations = {
-	.open		= smaps_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [PATCH 10/13] maps: Make /proc/pid/smaps optional under CONFIG_EMBEDDED
  2007-04-04  2:43 [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups Matt Mackall
                   ` (8 preceding siblings ...)
  2007-04-04  2:43 ` [PATCH 9/13] maps: Regroup task_mmu by interface Matt Mackall
@ 2007-04-04  2:43 ` Matt Mackall
  2007-04-04  2:43 ` [PATCH 11/13] maps: Make /proc/pid/clear_refs option " Matt Mackall
                   ` (3 subsequent siblings)
  13 siblings, 0 replies; 77+ messages in thread
From: Matt Mackall @ 2007-04-04  2:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Make /proc/pid/smaps optional under CONFIG_EMBEDDED

This interface is primarily useful for doing memory profiling and not
much use on deployed embedded boxes. Make it optional. Together with
/proc/pid/clear_refs, this save a few K.

Signed-off-by: Matt Mackall <mpm@selenic.com>

Index: mm/fs/proc/base.c
===================================================================
--- mm.orig/fs/proc/base.c	2007-03-29 13:01:49.000000000 -0500
+++ mm/fs/proc/base.c	2007-03-29 13:05:21.000000000 -0500
@@ -1907,8 +1907,10 @@ static const struct pid_entry tgid_base_
 	REG("mountstats", S_IRUSR, mountstats),
 #ifdef CONFIG_MMU
 	REG("clear_refs", S_IWUSR, clear_refs),
+#ifdef CONFIG_PROC_SMAPS
 	REG("smaps",      S_IRUGO, smaps),
 #endif
+#endif
 #ifdef CONFIG_SECURITY
 	DIR("attr",       S_IRUGO|S_IXUGO, attr_dir),
 #endif
@@ -2189,8 +2191,10 @@ static const struct pid_entry tid_base_s
 	REG("mounts",    S_IRUGO, mounts),
 #ifdef CONFIG_MMU
 	REG("clear_refs", S_IWUSR, clear_refs),
+#ifdef CONFIG_PROC_SMAPS
 	REG("smaps",     S_IRUGO, smaps),
 #endif
+#endif
 #ifdef CONFIG_SECURITY
 	DIR("attr",      S_IRUGO|S_IXUGO, attr_dir),
 #endif
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c	2007-03-29 13:01:52.000000000 -0500
+++ mm/fs/proc/task_mmu.c	2007-03-29 13:05:21.000000000 -0500
@@ -314,6 +314,7 @@ const struct file_operations proc_maps_o
 	.release	= seq_release_private,
 };
 
+#ifdef CONFIG_PROC_SMAPS
 struct mem_size_stats
 {
 	struct vm_area_struct *vma;
@@ -421,6 +422,7 @@ const struct file_operations proc_smaps_
 	.llseek		= seq_lseek,
 	.release	= seq_release_private,
 };
+#endif
 
 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, void *private)
Index: mm/init/Kconfig
===================================================================
--- mm.orig/init/Kconfig	2007-03-29 13:00:36.000000000 -0500
+++ mm/init/Kconfig	2007-03-29 13:05:21.000000000 -0500
@@ -514,6 +514,14 @@ config VM_EVENT_COUNTERS
 	  on EMBEDDED systems.  /proc/vmstat will only show page counts
 	  if VM event counters are disabled.
 
+config PROC_SMAPS
+	default y
+	bool "Enable /proc/pid/smaps support" if EMBEDDED && PROC_FS && MMU
+	help
+	  The /proc/pid/smaps interface reports a process's private and
+          shared memory per mapping. Disabling this interface will reduce
+          the size of the kernel for small machines.
+
 endmenu		# General setup
 
 config RT_MUTEXES

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [PATCH 11/13] maps: Make /proc/pid/clear_refs option under CONFIG_EMBEDDED
  2007-04-04  2:43 [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups Matt Mackall
                   ` (9 preceding siblings ...)
  2007-04-04  2:43 ` [PATCH 10/13] maps: Make /proc/pid/smaps optional under CONFIG_EMBEDDED Matt Mackall
@ 2007-04-04  2:43 ` Matt Mackall
  2007-04-04  6:22   ` David Rientjes
  2007-04-04  2:43 ` [PATCH 12/13] maps: Add /proc/pid/pagemap interface Matt Mackall
                   ` (2 subsequent siblings)
  13 siblings, 1 reply; 77+ messages in thread
From: Matt Mackall @ 2007-04-04  2:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Make /proc/pid/clear_refs option under CONFIG_EMBEDDED

This interface is primarily useful for doing memory profiling and not
much use on deployed embedded boxes. Make it optional. Together with
/proc/pid/smaps, this save a few K.

Signed-off-by: Matt Mackall <mpm@selenic.com>

Index: mm/fs/proc/base.c
===================================================================
--- mm.orig/fs/proc/base.c	2007-04-03 14:50:33.000000000 -0500
+++ mm/fs/proc/base.c	2007-04-03 18:04:59.000000000 -0500
@@ -2000,7 +2000,9 @@ static const struct pid_entry tgid_base_
 	REG("mounts",     S_IRUGO, mounts),
 	REG("mountstats", S_IRUSR, mountstats),
 #ifdef CONFIG_MMU
+#ifdef CONFIG_PROC_CLEAR_REFS
 	REG("clear_refs", S_IWUSR, clear_refs),
+#endif
 #ifdef CONFIG_PROC_SMAPS
 	REG("smaps",      S_IRUGO, smaps),
 #endif
@@ -2285,7 +2287,9 @@ static const struct pid_entry tid_base_s
 	LNK("exe",       exe),
 	REG("mounts",    S_IRUGO, mounts),
 #ifdef CONFIG_MMU
+#ifdef CONFIG_PROC_CLEAR_REFS
 	REG("clear_refs", S_IWUSR, clear_refs),
+#endif
 #ifdef CONFIG_PROC_SMAPS
 	REG("smaps",     S_IRUGO, smaps),
 #endif
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c	2007-04-03 14:50:33.000000000 -0500
+++ mm/fs/proc/task_mmu.c	2007-04-03 18:04:59.000000000 -0500
@@ -424,6 +424,7 @@ const struct file_operations proc_smaps_
 };
 #endif
 
+#ifdef CONFIG_PROC_CLEAR_REFS
 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, void *private)
 {
@@ -493,6 +494,7 @@ static ssize_t clear_refs_write(struct f
 const struct file_operations proc_clear_refs_operations = {
 	.write		= clear_refs_write,
 };
+#endif
 
 #ifdef CONFIG_NUMA
 extern int show_numa_map(struct seq_file *m, void *v);
Index: mm/init/Kconfig
===================================================================
--- mm.orig/init/Kconfig	2007-04-03 14:50:33.000000000 -0500
+++ mm/init/Kconfig	2007-04-03 18:04:59.000000000 -0500
@@ -593,6 +593,15 @@ config PROC_SMAPS
           shared memory per mapping. Disabling this interface will reduce
           the size of the kernel for small machines.
 
+config PROC_CLEAR_REFS
+	default y
+	bool "Enable /proc/pid/clear_refs support" if EMBEDDED && PROC_FS && MMU
+	help
+	  The /proc/pid/clear_refs interface allows clearing the
+          referenced bits on a process's memory maps to allow monitoring
+          working set size. Disabling this interface will reduce
+          the size of the kernel for small machines.
+
 endmenu		# General setup
 
 config RT_MUTEXES

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [PATCH 12/13] maps: Add /proc/pid/pagemap interface
  2007-04-04  2:43 [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups Matt Mackall
                   ` (10 preceding siblings ...)
  2007-04-04  2:43 ` [PATCH 11/13] maps: Make /proc/pid/clear_refs option " Matt Mackall
@ 2007-04-04  2:43 ` Matt Mackall
  2007-04-04 11:18   ` Nikita Danilov
  2007-04-04  2:43 ` [PATCH 13/13] maps: Add /proc/kpagemap interface Matt Mackall
  2007-04-12 23:10 ` [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups William Lee Irwin III
  13 siblings, 1 reply; 77+ messages in thread
From: Matt Mackall @ 2007-04-04  2:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Add /proc/pid/pagemap interface

This interface provides a mapping for each page in an address space to
its physical page frame number, allowing precise determination of what
pages are mapped and what pages are shared between processes.

Signed-off-by: Matt Mackall <mpm@selenic.com>

Index: mm/fs/proc/base.c
===================================================================
--- mm.orig/fs/proc/base.c	2007-04-03 14:50:33.000000000 -0500
+++ mm/fs/proc/base.c	2007-04-03 14:50:33.000000000 -0500
@@ -664,7 +664,7 @@ out_no_task:
 }
 #endif
 
-static loff_t mem_lseek(struct file * file, loff_t offset, int orig)
+loff_t mem_lseek(struct file * file, loff_t offset, int orig)
 {
 	switch (orig) {
 	case 0:
@@ -2006,6 +2006,9 @@ static const struct pid_entry tgid_base_
 #ifdef CONFIG_PROC_SMAPS
 	REG("smaps",      S_IRUGO, smaps),
 #endif
+#ifdef CONFIG_PROC_PAGEMAP
+	REG("pagemap",    S_IRUSR, pagemap),
+#endif
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr",       S_IRUGO|S_IXUGO, attr_dir),
@@ -2293,6 +2296,9 @@ static const struct pid_entry tid_base_s
 #ifdef CONFIG_PROC_SMAPS
 	REG("smaps",     S_IRUGO, smaps),
 #endif
+#ifdef CONFIG_PROC_PAGEMAP
+	REG("pagemap",    S_IRUSR, pagemap),
+#endif
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr",      S_IRUGO|S_IXUGO, attr_dir),
Index: mm/fs/proc/internal.h
===================================================================
--- mm.orig/fs/proc/internal.h	2007-04-03 14:50:33.000000000 -0500
+++ mm/fs/proc/internal.h	2007-04-03 14:50:33.000000000 -0500
@@ -45,11 +45,13 @@ extern int proc_tid_stat(struct task_str
 extern int proc_tgid_stat(struct task_struct *, char *);
 extern int proc_pid_status(struct task_struct *, char *);
 extern int proc_pid_statm(struct task_struct *, char *);
+extern loff_t mem_lseek(struct file * file, loff_t offset, int orig);
 
 extern const struct file_operations proc_maps_operations;
 extern const struct file_operations proc_numa_maps_operations;
 extern const struct file_operations proc_smaps_operations;
 extern const struct file_operations proc_clear_refs_operations;
+extern const struct file_operations proc_pagemap_operations;
 
 void free_proc_entry(struct proc_dir_entry *de);
 
Index: mm/fs/proc/task_mmu.c
===================================================================
--- mm.orig/fs/proc/task_mmu.c	2007-04-03 14:50:33.000000000 -0500
+++ mm/fs/proc/task_mmu.c	2007-04-03 18:02:47.000000000 -0500
@@ -530,3 +530,171 @@ const struct file_operations proc_numa_m
 };
 #endif
 
+#ifdef CONFIG_PROC_PAGEMAP
+struct pagemapread {
+	struct mm_struct *mm;
+	unsigned long next;
+	unsigned long *buf;
+	unsigned long pos;
+	size_t count;
+	int index;
+	char __user *out;
+};
+
+static int flush_pagemap(struct pagemapread *pm)
+{
+	int n = min(pm->count, pm->index * sizeof(unsigned long));
+	if (copy_to_user(pm->out, pm->buf, n))
+		return -EFAULT;
+	pm->out += n;
+	pm->pos += n;
+	pm->count -= n;
+	pm->index = 0;
+	cond_resched();
+	return 0;
+}
+
+static int add_to_pagemap(unsigned long addr, unsigned long pfn,
+			  struct pagemapread *pm)
+{
+	pm->buf[pm->index++] = pfn;
+	pm->next = addr + PAGE_SIZE;
+	if (pm->index * sizeof(unsigned long) >= PAGE_SIZE ||
+	    pm->index * sizeof(unsigned long) >= pm->count)
+		return flush_pagemap(pm);
+	return 0;
+}
+
+static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			     void *private)
+{
+	struct pagemapread *pm = private;
+	pte_t *pte;
+	int err;
+
+	pte = pte_offset_map(pmd, addr);
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
+		if (addr < pm->next)
+			continue;
+		if (!pte_present(*pte))
+			err = add_to_pagemap(addr, -1, pm);
+		else
+			err = add_to_pagemap(addr, pte_pfn(*pte), pm);
+		if (err)
+			return err;
+	}
+	pte_unmap(pte - 1);
+	return 0;
+}
+
+static int pagemap_fill(struct pagemapread *pm, unsigned long end)
+{
+	int ret;
+
+	while (pm->next != end) {
+		ret = add_to_pagemap(pm->next, -1UL, pm);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+static struct mm_walk pagemap_walk = { .pmd_entry = pagemap_pte_range };
+
+/* /proc/pid/pagemap - an array mapping virtual pages to pfns
+ *
+ * For each page in the address space, this file contains one long
+ * representing the corresponding physical page frame number (PFN) or
+ * -1 if the page isn't present. This allows determining precisely
+ * which pages are mapped and comparing mapped pages between
+ * processes.
+ *
+ * Efficient users of this interface will use /proc/pid/maps to
+ * determine which areas of memory are actually mapped and llseek to
+ * skip over unmapped regions.
+ */
+static ssize_t pagemap_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	unsigned long src = *ppos;
+	unsigned long *page;
+	unsigned long addr, end, vend, svpfn, evpfn;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct pagemapread pm;
+	int ret = -ESRCH;
+
+	if (!task)
+		goto out_no_task;
+
+	ret = -EACCES;
+	if (!ptrace_may_attach(task))
+		goto out;
+
+	ret = -EIO;
+	svpfn = src / sizeof(unsigned long);
+	addr = PAGE_SIZE * svpfn;
+	if (svpfn * sizeof(unsigned long) != src)
+		goto out;
+	evpfn = min((src + count) / sizeof(unsigned long),
+		    ((~0UL) >> PAGE_SHIFT) + 1);
+	count = (evpfn - svpfn) * sizeof(unsigned long);
+	end = PAGE_SIZE * evpfn;
+
+	ret = -ENOMEM;
+	page = kzalloc(PAGE_SIZE, GFP_USER);
+	if (!page)
+		goto out;
+
+	ret = 0;
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_free;
+
+	pm.mm = mm;
+	pm.next = addr;
+	pm.buf = (unsigned long *)page;
+	pm.pos = src;
+	pm.count = count;
+	pm.index = 0;
+	pm.out = buf;
+
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, pm.next);
+	while (pm.count > 0 && vma) {
+		if (!ptrace_may_attach(task)) {
+			ret = -EIO;
+			goto out;
+		}
+		vend = min(vma->vm_start - 1, end - 1) + 1;
+		ret = pagemap_fill(&pm, vend);
+		if (ret || !pm.count)
+			break;
+		vend = min(vma->vm_end - 1, end - 1) + 1;
+		ret = walk_page_range(mm, vma->vm_start, vend,
+				      &pagemap_walk, &pm);
+		vma = vma->vm_next;
+	}
+	up_read(&mm->mmap_sem);
+
+	ret = pagemap_fill(&pm, end);
+
+	*ppos = pm.pos;
+	if (!ret)
+		ret = pm.pos - src;
+
+	mmput(mm);
+out_free:
+	kfree(page);
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
+
+const struct file_operations proc_pagemap_operations = {
+	.llseek		= mem_lseek, /* borrow this */
+	.read		= pagemap_read,
+};
+#endif
Index: mm/init/Kconfig
===================================================================
--- mm.orig/init/Kconfig	2007-04-03 14:50:33.000000000 -0500
+++ mm/init/Kconfig	2007-04-03 17:57:29.000000000 -0500
@@ -602,6 +602,16 @@ config PROC_CLEAR_REFS
           working set size. Disabling this interface will reduce
           the size of the kernel for small machines.
 
+config PROC_PAGEMAP
+	default y
+	bool "Enable /proc/pid/pagemap support" if EMBEDDED && PROC_FS && MMU
+	help
+	  The /proc/pid/pagemap interface allows reading the
+          kernel's virtual memory to page frame mapping to determine which
+          individual pages a process has mapped and which pages it shares
+          with other processes. Disabling this interface will reduce the
+          size of the kernel for small machines.
+
 endmenu		# General setup
 
 config RT_MUTEXES

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [PATCH 13/13] maps: Add /proc/kpagemap interface
  2007-04-04  2:43 [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups Matt Mackall
                   ` (11 preceding siblings ...)
  2007-04-04  2:43 ` [PATCH 12/13] maps: Add /proc/pid/pagemap interface Matt Mackall
@ 2007-04-04  2:43 ` Matt Mackall
  2007-04-12 23:10 ` [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups William Lee Irwin III
  13 siblings, 0 replies; 77+ messages in thread
From: Matt Mackall @ 2007-04-04  2:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Add /proc/kpagemap interface

This makes physical page flags and counts available to userspace.
Together with /proc/pid/pagemap and /proc/pid/clear_refs, this can be
used to measure memory usage on a per-page basis.

Signed-off-by: Matt Mackall <mpm@selenic.com>

Index: mm/fs/proc/proc_misc.c
===================================================================
--- mm.orig/fs/proc/proc_misc.c	2007-04-03 14:47:45.000000000 -0500
+++ mm/fs/proc/proc_misc.c	2007-04-03 17:57:14.000000000 -0500
@@ -46,6 +46,8 @@
 #include <linux/vmalloc.h>
 #include <linux/crash_dump.h>
 #include <linux/pid_namespace.h>
+#include <linux/ptrace.h>
+#include <linux/bootmem.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/io.h>
@@ -733,6 +735,72 @@ static struct file_operations proc_page_
 };
 #endif
 
+#ifdef CONFIG_PROC_KPAGEMAP
+#define KPMSIZE (sizeof(unsigned long) * 2)
+#define KPMMASK (KPMSIZE - 1)
+/* /proc/kpagemap - an array exposing page flags and counts
+ *
+ * Each entry is a pair of unsigned longs representing the
+ * corresponding physical page, the first containing the page flags
+ * and the second containing the page use count.
+ */
+static ssize_t kpagemap_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	unsigned long *page;
+	struct page *ppage;
+	unsigned long src = *ppos;
+	unsigned long pfn;
+	ssize_t ret = 0;
+	int chunk, i;
+
+	pfn = src / KPMSIZE;
+	count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
+	if (src & KPMMASK || count & KPMMASK)
+		return -EIO;
+
+	page = (unsigned long *)__get_free_page(GFP_USER);
+	if (!page)
+		return -ENOMEM;
+
+	while (count > 0) {
+		chunk = min_t(size_t, count, PAGE_SIZE);
+		for (i = 0; i < 2 * chunk / KPMSIZE; i += 2) {
+			ppage = pfn_to_page(pfn);
+			if (!ppage) {
+				page[i] = 0;
+				page[i + 1] = 0;
+			} else {
+				page[i] = ppage->flags;
+				page[i + 1] = atomic_read(&ppage->_count);
+			}
+			pfn++;
+		}
+		chunk = (i / 2) * KPMSIZE;
+
+		if (copy_to_user(buf, page, chunk)) {
+			ret = -EFAULT;
+			break;
+		}
+		ret += chunk;
+		src += chunk;
+		buf += chunk;
+		count -= chunk;
+		cond_resched();
+	}
+	*ppos = src;
+
+	free_page((unsigned long)page);
+	return ret;
+}
+
+struct proc_dir_entry *proc_kpagemap;
+static struct file_operations proc_kpagemap_operations = {
+	.llseek = mem_lseek,
+	.read = kpagemap_read,
+};
+#endif
+
 struct proc_dir_entry *proc_root_kcore;
 
 void create_seq_entry(char *name, mode_t mode, const struct file_operations *f)
@@ -812,6 +880,11 @@ void __init proc_misc_init(void)
 				(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
 	}
 #endif
+#ifdef CONFIG_PROC_KPAGEMAP
+	proc_kpagemap = create_proc_entry("kpagemap", S_IRUSR, NULL);
+	if (proc_kpagemap)
+		proc_kpagemap->proc_fops = &proc_kpagemap_operations;
+#endif
 #ifdef CONFIG_PROC_VMCORE
 	proc_vmcore = create_proc_entry("vmcore", S_IRUSR, NULL);
 	if (proc_vmcore)
Index: mm/init/Kconfig
===================================================================
--- mm.orig/init/Kconfig	2007-04-03 14:50:33.000000000 -0500
+++ mm/init/Kconfig	2007-04-03 14:50:33.000000000 -0500
@@ -612,6 +612,15 @@ config PROC_PAGEMAP
           with other processes. Disabling this interface will reduce the
           size of the kernel for small machines.
 
+config PROC_KPAGEMAP
+	default y
+	bool "Enable /proc/kpagemap support" if EMBEDDED && PROC_FS
+	help
+	  The /proc/pid/kpagemap interface allows reading the
+          kernel's per-page flag and usage counts to gather precise
+          information on page-level memory usage. Disabling this interface
+          will reduce the size of the kernel for small machines.
+
 endmenu		# General setup
 
 config RT_MUTEXES

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 6/13] maps: Move the page walker code to lib/
  2007-04-04  2:43 ` [PATCH 6/13] maps: Move the page walker code to lib/ Matt Mackall
@ 2007-04-04  3:51   ` Nick Piggin
  2007-04-04  5:08     ` Matt Mackall
  0 siblings, 1 reply; 77+ messages in thread
From: Nick Piggin @ 2007-04-04  3:51 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Andrew Morton, linux-kernel

Matt Mackall wrote:
> Move the page walker code to lib/
> 
> This lets it get shared outside of proc/ and linked in only when
> needed.

I think it would be better in mm/.

So would clear_refs_pte_range, and clear_refs_write (in a more
generic form), IMO.

Sweet patchset, though.

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 6/13] maps: Move the page walker code to lib/
  2007-04-04  3:51   ` Nick Piggin
@ 2007-04-04  5:08     ` Matt Mackall
  2007-04-04  5:50       ` Nick Piggin
  0 siblings, 1 reply; 77+ messages in thread
From: Matt Mackall @ 2007-04-04  5:08 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, linux-kernel

On Wed, Apr 04, 2007 at 01:51:37PM +1000, Nick Piggin wrote:
> Matt Mackall wrote:
> >Move the page walker code to lib/
> >
> >This lets it get shared outside of proc/ and linked in only when
> >needed.
> 
> I think it would be better in mm/.

I originally was looking at putting it in mm/memory.c and possibly
converting some users there. But I decided it ought to be
conditionally linked until it had some core users and I wasn't quite
ready to tackle the tricky mm/ bits.

> So would clear_refs_pte_range, and clear_refs_write (in a more
> generic form), IMO.
> 
> Sweet patchset, though.

Thanks.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 6/13] maps: Move the page walker code to lib/
  2007-04-04  5:08     ` Matt Mackall
@ 2007-04-04  5:50       ` Nick Piggin
  2007-04-04 21:48         ` Matt Mackall
  0 siblings, 1 reply; 77+ messages in thread
From: Nick Piggin @ 2007-04-04  5:50 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Andrew Morton, linux-kernel

Matt Mackall wrote:
> On Wed, Apr 04, 2007 at 01:51:37PM +1000, Nick Piggin wrote:
> 
>>Matt Mackall wrote:
>>
>>>Move the page walker code to lib/
>>>
>>>This lets it get shared outside of proc/ and linked in only when
>>>needed.
>>
>>I think it would be better in mm/.
> 
> 
> I originally was looking at putting it in mm/memory.c and possibly

Just put it in its own file in mm/ rather than its own file in lib.
lib should be for almost-standalone stuff, IMO (ie. only using basic
kernel functionality).

Yes, I think ioremap.c should be in mm/ as well.

> converting some users there. But I decided it ought to be
> conditionally linked until it had some core users and I wasn't quite
> ready to tackle the tricky mm/ bits.

Apart from these users outside mm/, I don't see much point in converting
things over. The page table walking API we have now is neat and simple.
It takes a few lines of code, but is it a big problem?

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 11/13] maps: Make /proc/pid/clear_refs option under CONFIG_EMBEDDED
  2007-04-04  2:43 ` [PATCH 11/13] maps: Make /proc/pid/clear_refs option " Matt Mackall
@ 2007-04-04  6:22   ` David Rientjes
  0 siblings, 0 replies; 77+ messages in thread
From: David Rientjes @ 2007-04-04  6:22 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Andrew Morton, linux-kernel

On Tue, 3 Apr 2007, Matt Mackall wrote:

> Make /proc/pid/clear_refs option under CONFIG_EMBEDDED
> 
> This interface is primarily useful for doing memory profiling and not
> much use on deployed embedded boxes. Make it optional. Together with
> /proc/pid/smaps, this save a few K.
> 
> Signed-off-by: Matt Mackall <mpm@selenic.com>

Acked-by: David Rientjes <rientjes@google.com>

Althought /proc/pid/clear_refs isn't actually useful without smaps as 
well, so you could make it depend on CONFIG_PROC_SMAPS.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 12/13] maps: Add /proc/pid/pagemap interface
  2007-04-04  2:43 ` [PATCH 12/13] maps: Add /proc/pid/pagemap interface Matt Mackall
@ 2007-04-04 11:18   ` Nikita Danilov
  2007-04-04 16:32     ` Matt Mackall
  0 siblings, 1 reply; 77+ messages in thread
From: Nikita Danilov @ 2007-04-04 11:18 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Andrew Morton, linux-kernel

Matt Mackall writes:
 > Add /proc/pid/pagemap interface
 > 
 > This interface provides a mapping for each page in an address space to
 > its physical page frame number, allowing precise determination of what
 > pages are mapped and what pages are shared between processes.

[...]

 >  
 > +#ifdef CONFIG_PROC_PAGEMAP
 > +struct pagemapread {
 > +	struct mm_struct *mm;
 > +	unsigned long next;
 > +	unsigned long *buf;
 > +	unsigned long pos;
 > +	size_t count;
 > +	int index;
 > +	char __user *out;
 > +};
 > +
 > +static int flush_pagemap(struct pagemapread *pm)
 > +{
 > +	int n = min(pm->count, pm->index * sizeof(unsigned long));
 > +	if (copy_to_user(pm->out, pm->buf, n))
 > +		return -EFAULT;

This pushes binary data to the user space. Wasn't /proc supposed to be
ascii-based to avoid compatibility problems (e.g., size of unsigned long
changing, endianness, etc.)?

Nikita.


^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 12/13] maps: Add /proc/pid/pagemap interface
  2007-04-04 11:18   ` Nikita Danilov
@ 2007-04-04 16:32     ` Matt Mackall
  2007-04-04 18:03       ` Nikita Danilov
  0 siblings, 1 reply; 77+ messages in thread
From: Matt Mackall @ 2007-04-04 16:32 UTC (permalink / raw)
  To: Nikita Danilov; +Cc: Andrew Morton, linux-kernel

On Wed, Apr 04, 2007 at 03:18:41PM +0400, Nikita Danilov wrote:
> This pushes binary data to the user space. Wasn't /proc supposed to be
> ascii-based to avoid compatibility problems (e.g., size of unsigned long
> changing, endianness, etc.)?

Most of what's in /proc is ASCII, true. But there are notable
exceptions - kcore, auxv, mem. This is just another window into memory
like kcore and mem. It's also a fairly big window. To examine a 1G
mapping, you need to read 1M of data.

ASCII would be rather ugly here. It's important that the interface be
seekable so that you can skip over wide ranges of data with no maps.
So all the elements would have to be nice fixed sizes. That'd mean 16
hex digits plus a space/newline. So to figure out what's at b7f81000,
I'd have to divide by 4096 (I already have to know the page size),
then multiply by 17, getting c37791. And then I'd have to copy more
than 2x the data in byte-wide chunks due to alignment concerns.

With the extra translation on both ends, the expanded data size, and
the alignment issues, this probably just got 10x-20x slower. Which
means taking a snapshot of a typical system and doing any sort of live
display just got a hell of a lot more impractical.

Now I could adjust these to only export u64s in some preferred
endianness. But given I already need details like the page size to
make any sense of it, it seems unnecessary. Also, the PFNs are fairly
opaque unless you're attempting to correlate them with /proc/kpagemap.

--
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 12/13] maps: Add /proc/pid/pagemap interface
  2007-04-04 16:32     ` Matt Mackall
@ 2007-04-04 18:03       ` Nikita Danilov
  2007-04-04 21:59         ` Matt Mackall
  0 siblings, 1 reply; 77+ messages in thread
From: Nikita Danilov @ 2007-04-04 18:03 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Andrew Morton, linux-kernel

Matt Mackall writes:

[...]

 > 
 > Now I could adjust these to only export u64s in some preferred
 > endianness. But given I already need details like the page size to
 > make any sense of it, it seems unnecessary. Also, the PFNs are fairly
 > opaque unless you're attempting to correlate them with /proc/kpagemap.

Alternatively, you can export some meta-data at the beginning of that
file like /proc/profile does.

Nikita.


^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 6/13] maps: Move the page walker code to lib/
  2007-04-04  5:50       ` Nick Piggin
@ 2007-04-04 21:48         ` Matt Mackall
  2007-04-05  1:32           ` Nick Piggin
  0 siblings, 1 reply; 77+ messages in thread
From: Matt Mackall @ 2007-04-04 21:48 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, linux-kernel

On Wed, Apr 04, 2007 at 03:50:56PM +1000, Nick Piggin wrote:
> Matt Mackall wrote:
> >On Wed, Apr 04, 2007 at 01:51:37PM +1000, Nick Piggin wrote:
> >
> >>Matt Mackall wrote:
> >>
> >>>Move the page walker code to lib/
> >>>
> >>>This lets it get shared outside of proc/ and linked in only when
> >>>needed.
> >>
> >>I think it would be better in mm/.
> >
> >
> >I originally was looking at putting it in mm/memory.c and possibly
> 
> Just put it in its own file in mm/ rather than its own file in lib.
> lib should be for almost-standalone stuff, IMO (ie. only using basic
> kernel functionality).

Arguably that's what lib/ should be for, but it's currently largely
used to avoid linking in unused code without adding more hair to
Kconfig. Which is what I'm trying to do here.

> Apart from these users outside mm/, I don't see much point in converting
> things over. The page table walking API we have now is neat and simple.
> It takes a few lines of code, but is it a big problem?

I don't think it really qualifies as either neat or simple. It may be
about as neat as walking a heterogenous tree with an inconsistent
naming scheme can be, but it's still a headache. A maze of twisty
little passages, all slightly different.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 12/13] maps: Add /proc/pid/pagemap interface
  2007-04-04 18:03       ` Nikita Danilov
@ 2007-04-04 21:59         ` Matt Mackall
  0 siblings, 0 replies; 77+ messages in thread
From: Matt Mackall @ 2007-04-04 21:59 UTC (permalink / raw)
  To: Nikita Danilov; +Cc: Andrew Morton, linux-kernel

On Wed, Apr 04, 2007 at 10:03:04PM +0400, Nikita Danilov wrote:
> Matt Mackall writes:
> 
> [...]
> 
>  > 
>  > Now I could adjust these to only export u64s in some preferred
>  > endianness. But given I already need details like the page size to
>  > make any sense of it, it seems unnecessary. Also, the PFNs are fairly
>  > opaque unless you're attempting to correlate them with /proc/kpagemap.
> 
> Alternatively, you can export some meta-data at the beginning of that
> file like /proc/profile does.

That seems reasonable. We need page size, entry size (4 or 8 bytes), and
an endian marker. I'll see what I can come up with.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 6/13] maps: Move the page walker code to lib/
  2007-04-04 21:48         ` Matt Mackall
@ 2007-04-05  1:32           ` Nick Piggin
  2007-04-05  1:50             ` Nick Piggin
  0 siblings, 1 reply; 77+ messages in thread
From: Nick Piggin @ 2007-04-05  1:32 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Andrew Morton, linux-kernel

Matt Mackall wrote:
> On Wed, Apr 04, 2007 at 03:50:56PM +1000, Nick Piggin wrote:
> 
>>Matt Mackall wrote:
>>
>>>On Wed, Apr 04, 2007 at 01:51:37PM +1000, Nick Piggin wrote:
>>>
>>>
>>>>Matt Mackall wrote:
>>>>
>>>>
>>>>>Move the page walker code to lib/
>>>>>
>>>>>This lets it get shared outside of proc/ and linked in only when
>>>>>needed.
>>>>
>>>>I think it would be better in mm/.
>>>
>>>
>>>I originally was looking at putting it in mm/memory.c and possibly
>>
>>Just put it in its own file in mm/ rather than its own file in lib.
>>lib should be for almost-standalone stuff, IMO (ie. only using basic
>>kernel functionality).
> 
> 
> Arguably that's what lib/ should be for, but it's currently largely

I disagree. There is code everywhere that exists to provide some
functionality via an API to other parts of the kernel. You don't
think mm/page_alloc.c should go in lib/?

> used to avoid linking in unused code without adding more hair to
> Kconfig. Which is what I'm trying to do here.

Well if you're doing a nice big reorganisation and jumble, then
why would you worry about a few lines in Kconfig?

>>Apart from these users outside mm/, I don't see much point in converting
>>things over. The page table walking API we have now is neat and simple.
>>It takes a few lines of code, but is it a big problem?
> 
> 
> I don't think it really qualifies as either neat or simple. It may be
> about as neat as walking a heterogenous tree with an inconsistent
> naming scheme can be, but it's still a headache. A maze of twisty
> little passages, all slightly different.

All page table range walking code should be the same.
It is a simple template to use to walk a range of ptes. It is
a headache outside mm/, sure, but not because of its incredible
complexity.

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 6/13] maps: Move the page walker code to lib/
  2007-04-05  1:32           ` Nick Piggin
@ 2007-04-05  1:50             ` Nick Piggin
  0 siblings, 0 replies; 77+ messages in thread
From: Nick Piggin @ 2007-04-05  1:50 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Andrew Morton, linux-kernel

Nick Piggin wrote:
> Matt Mackall wrote:
> 
>> On Wed, Apr 04, 2007 at 03:50:56PM +1000, Nick Piggin wrote:

>>> Just put it in its own file in mm/ rather than its own file in lib.
>>> lib should be for almost-standalone stuff, IMO (ie. only using basic
>>> kernel functionality).
>>
>>
>>
>> Arguably that's what lib/ should be for, but it's currently largely
> 
> 
> I disagree. There is code everywhere that exists to provide some
> functionality via an API to other parts of the kernel. You don't
> think mm/page_alloc.c should go in lib/?

Oh, I think I misunderstood you.

Anyway, we have lots of conditionally compiled code in mm as well.
It doesn't seem to be much hardship.

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-04  2:43 [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups Matt Mackall
                   ` (12 preceding siblings ...)
  2007-04-04  2:43 ` [PATCH 13/13] maps: Add /proc/kpagemap interface Matt Mackall
@ 2007-04-12 23:10 ` William Lee Irwin III
  2007-04-12 23:32   ` Andrew Morton
  13 siblings, 1 reply; 77+ messages in thread
From: William Lee Irwin III @ 2007-04-12 23:10 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Andrew Morton, linux-kernel

On Tue, Apr 03, 2007 at 09:43:30PM -0500, Matt Mackall wrote:
> This patch series introduces /proc/pid/pagemap and /proc/kpagemap,
> which allow detailed run-time examination of process memory usage at a
> page granularity.
> The first several patches whip the page-walking code introduced for
> /proc/pid/smaps and clear_refs into a more generic form, the next
> couple make those interfaces optional, and the last two introduce the
> new interfaces, also optional.

This solves a real-life problem for Oracle system monitoring software
(specifically EM). Among the tasks it must carry out is determining
per-process memory footprint of a set of cooperating tasks (i.e. Oracle
processes). RSS is inadequate for this due to page sharing; this work
provides sufficient information to determine what EM needs.


-- wli

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-12 23:10 ` [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups William Lee Irwin III
@ 2007-04-12 23:32   ` Andrew Morton
  2007-04-12 23:42     ` William Lee Irwin III
                       ` (2 more replies)
  0 siblings, 3 replies; 77+ messages in thread
From: Andrew Morton @ 2007-04-12 23:32 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: Matt Mackall, linux-kernel

On Thu, 12 Apr 2007 16:10:50 -0700
William Lee Irwin III <wli@holomorphy.com> wrote:

> On Tue, Apr 03, 2007 at 09:43:30PM -0500, Matt Mackall wrote:
> > This patch series introduces /proc/pid/pagemap and /proc/kpagemap,
> > which allow detailed run-time examination of process memory usage at a
> > page granularity.
> > The first several patches whip the page-walking code introduced for
> > /proc/pid/smaps and clear_refs into a more generic form, the next
> > couple make those interfaces optional, and the last two introduce the
> > new interfaces, also optional.
> 
> This solves a real-life problem for Oracle system monitoring software
> (specifically EM). Among the tasks it must carry out is determining
> per-process memory footprint of a set of cooperating tasks (i.e. Oracle
> processes). RSS is inadequate for this due to page sharing; this work
> provides sufficient information to determine what EM needs.
> 
> 

I'm still dying to see what the human-readable output from this
thing looks like.

<looks>

> + * Each entry is a pair of unsigned longs representing the
> + * corresponding physical page, the first containing the page flags
> + * and the second containing the page use count.
> + *
> + * The first 4 bytes of this file form a simple header:
> + *
> + * first byte:   0 for big endian, 1 for little
> + * second byte:  page shift (eg 12 for 4096 byte pages)
> + * third byte:   entry size in bytes (currently either 4 or 8)
> + * fourth byte:  header size
>
> ...
>
> +       while (count > 0) {
> +               chunk = min_t(size_t, count, PAGE_SIZE);
> +               i = 0;
> +
> +               if (pfn == -1) {
> +                       page[0] = 0;
> +                       page[1] = 0;
> +                       ((char *)page)[0] = (ntohl(1) != 1);

OK.

> +                       ((char *)page)[1] = PAGE_SHIFT;

OK.

> +                       ((char *)page)[2] = sizeof(unsigned long);

OK.

> +                       ((char *)page)[3] = KPMSIZE;

OK.

> +                       i = 2;
> +                       pfn++;
> +               }
> +
> +               for (; i < 2 * chunk / KPMSIZE; i += 2, pfn++) {
> +                       ppage = pfn_to_page(pfn);
> +                       if (!ppage) {
> +                               page[i] = 0;
> +                               page[i + 1] = 0;
> +                       } else {
> +                               page[i] = ppage->flags;
> +                               page[i + 1] = atomic_read(&ppage->_count);
> +                       }
> +               }

Not a good idea to expose raw flags in this manner - it changes at the drop
of a hat.  We'd need to also expose the kernel's PG_foo-to-bitnumber
mapping to make this viable.

Not a good idea to use page->_count: page_count() will be more stable. 
Otherwise OK, I guess: the interpretation of the page refcount is unlikely
to change much over time.


^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-12 23:32   ` Andrew Morton
@ 2007-04-12 23:42     ` William Lee Irwin III
  2007-04-13  0:25       ` Nick Piggin
  2007-04-13  0:15     ` Nick Piggin
  2007-04-13  0:15     ` Matt Mackall
  2 siblings, 1 reply; 77+ messages in thread
From: William Lee Irwin III @ 2007-04-12 23:42 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Matt Mackall, linux-kernel

On Thu, 12 Apr 2007 16:10:50 -0700 William Lee Irwin III <wli@holomorphy.com> wrote:
>> This solves a real-life problem for Oracle system monitoring software
>> (specifically EM). Among the tasks it must carry out is determining
>> per-process memory footprint of a set of cooperating tasks (i.e. Oracle
>> processes). RSS is inadequate for this due to page sharing; this work
>> provides sufficient information to determine what EM needs.

On Thu, Apr 12, 2007 at 04:32:35PM -0700, Andrew Morton wrote:
> Not a good idea to expose raw flags in this manner - it changes at the drop
> of a hat.  We'd need to also expose the kernel's PG_foo-to-bitnumber
> mapping to make this viable.
> Not a good idea to use page->_count: page_count() will be more stable. 
> Otherwise OK, I guess: the interpretation of the page refcount is unlikely
> to change much over time.

EM wants to determine page_mapcount() for the most part for the
purposes of determining "uniquely attributable RSS" (my ca. 2004
nomenclature) or "proportional RSS" (mpm's more recent nomenclature);
as things now stand it will have to infer them by maintaining a table
of pfn's and mappings thereof, but at least that can be done with it.


-- wli

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-12 23:32   ` Andrew Morton
  2007-04-12 23:42     ` William Lee Irwin III
@ 2007-04-13  0:15     ` Nick Piggin
  2007-04-13  0:25       ` Matt Mackall
  2007-04-13  0:42       ` Andrew Morton
  2007-04-13  0:15     ` Matt Mackall
  2 siblings, 2 replies; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  0:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: William Lee Irwin III, Matt Mackall, linux-kernel

Andrew Morton wrote:
> On Thu, 12 Apr 2007 16:10:50 -0700
> William Lee Irwin III <wli@holomorphy.com> wrote:

>>+       while (count > 0) {
>>+               chunk = min_t(size_t, count, PAGE_SIZE);
>>+               i = 0;
>>+
>>+               if (pfn == -1) {
>>+                       page[0] = 0;
>>+                       page[1] = 0;
>>+                       ((char *)page)[0] = (ntohl(1) != 1);
> 
> 
> OK.
> 
> 
>>+                       ((char *)page)[1] = PAGE_SHIFT;
> 
> 
> OK.

Shouldn't we just expose page size and endianness by other means? (another file or
syscall).

>>+               for (; i < 2 * chunk / KPMSIZE; i += 2, pfn++) {
>>+                       ppage = pfn_to_page(pfn);
>>+                       if (!ppage) {
>>+                               page[i] = 0;
>>+                               page[i + 1] = 0;
>>+                       } else {
>>+                               page[i] = ppage->flags;
>>+                               page[i + 1] = atomic_read(&ppage->_count);
>>+                       }
>>+               }
> 
> 
> Not a good idea to expose raw flags in this manner - it changes at the drop
> of a hat.  We'd need to also expose the kernel's PG_foo-to-bitnumber
> mapping to make this viable.

I don't think it is viable because that makes the flags part of the
userspace ABI. I wonder what they are needed for.

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-12 23:32   ` Andrew Morton
  2007-04-12 23:42     ` William Lee Irwin III
  2007-04-13  0:15     ` Nick Piggin
@ 2007-04-13  0:15     ` Matt Mackall
  2 siblings, 0 replies; 77+ messages in thread
From: Matt Mackall @ 2007-04-13  0:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: William Lee Irwin III, linux-kernel

On Thu, Apr 12, 2007 at 04:32:35PM -0700, Andrew Morton wrote:
> On Thu, 12 Apr 2007 16:10:50 -0700
> William Lee Irwin III <wli@holomorphy.com> wrote:
> 
> > On Tue, Apr 03, 2007 at 09:43:30PM -0500, Matt Mackall wrote:
> > > This patch series introduces /proc/pid/pagemap and /proc/kpagemap,
> > > which allow detailed run-time examination of process memory usage at a
> > > page granularity.
> > > The first several patches whip the page-walking code introduced for
> > > /proc/pid/smaps and clear_refs into a more generic form, the next
> > > couple make those interfaces optional, and the last two introduce the
> > > new interfaces, also optional.
> > 
> > This solves a real-life problem for Oracle system monitoring software
> > (specifically EM). Among the tasks it must carry out is determining
> > per-process memory footprint of a set of cooperating tasks (i.e. Oracle
> > processes). RSS is inadequate for this due to page sharing; this work
> > provides sufficient information to determine what EM needs.
> 
> I'm still dying to see what the human-readable output from this
> thing looks like.

Still a work-in-progress. It's a monstrous amount of data and it
basically requires a GUI to really get a handle on. Here's a couple
apps I've been tinkering with (aka My First GTK Apps):

http://selenic.com/Screenshot-pagemap.png

That's a snapshot of a live-updating image of memory usage for a
running process (Galeon). Each pixel is a page. Each 32x32 block is
4MB. Mappings are dark red. Pages that are actually faulted in are
bright red. You can poke around in the memory map with the mouse and
highlight mappings (blue). And pages that get faulted in flash green
(hard to capture in a screenshot).

http://selenic.com/Screenshot-kpagemap.png

And that's a live-updating image of system-wide memory usage. Bright
red are pages with a count of 1, dark red are pages with higher
counts. Next is to visualize slab/page cache/buddy/active/lru data as
well as highlight changing pages.

This isn't terribly interesting yet. It can tell you things about page
cache usage and fragmentation and readahead and so on.

But correlating across the two sources, we'll be able to show
information like "what pages in a process are actually
shared/active/lru/etc." You can take it even further by correlating
the above data with symbol info from nm, /proc/pid/clear_refs, etc.

Also, something I immediately noticed on looking at the raw data
(cat /proc/`pidof`/pagemap | hexdump -C | less):

002c8fd0  ff ff ff ff ff ff ff ff  ff ff ff ff 6d f8 03 00 |............m...|
002c8fe0  6c f8 03 00 b9 f8 03 00  6b f8 03 00 6a f8 03 00 |l.......k...j...|
002c8ff0  b8 f8 03 00 69 f8 03 00  68 f8 03 00 b7 f8 03 00 |....i...h.......|
002c9000  67 f8 03 00 66 f8 03 00  b6 f8 03 00 65 f8 03 00 |g...f.......e...|
002c9010  64 f8 03 00 b5 f8 03 00  63 f8 03 00 62 f8 03 00 |d.......c...b...|
002c9020  b4 f8 03 00 61 f8 03 00  60 f8 03 00 b3 f8 03 00 |....a...`.......|
002c9030  7f f8 03 00 7e f8 03 00  b2 f8 03 00 7d f8 03 00 |....~.......}...|
002c9040  7c f8 03 00 b1 f8 03 00  5f f8 03 00 5e f8 03 00 ||......._...^...|
002c9050  b0 f8 03 00 5d f8 03 00  5c f8 03 00 af f8 03 00 |....]...\.......|

Most of the consecutive page frames are allocated in descending order
(6d 6c 6b 6a ...). That's pessimal for physical merging of block I/O.
Given that we theoretically fixed this long-standing problem in 2.6
but it's obviously still happening, it's clear that a little more
visibility into the VM would be useful.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  0:15     ` Nick Piggin
@ 2007-04-13  0:25       ` Matt Mackall
  2007-04-13  1:01         ` Nick Piggin
  2007-04-13  0:42       ` Andrew Morton
  1 sibling, 1 reply; 77+ messages in thread
From: Matt Mackall @ 2007-04-13  0:25 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, William Lee Irwin III, linux-kernel

On Fri, Apr 13, 2007 at 10:15:24AM +1000, Nick Piggin wrote:
> Andrew Morton wrote:
> >On Thu, 12 Apr 2007 16:10:50 -0700
> >William Lee Irwin III <wli@holomorphy.com> wrote:
> 
> >>+       while (count > 0) {
> >>+               chunk = min_t(size_t, count, PAGE_SIZE);
> >>+               i = 0;
> >>+
> >>+               if (pfn == -1) {
> >>+                       page[0] = 0;
> >>+                       page[1] = 0;
> >>+                       ((char *)page)[0] = (ntohl(1) != 1);
> >
> >
> >OK.
> >
> >
> >>+                       ((char *)page)[1] = PAGE_SHIFT;
> >
> >
> >OK.
> 
> Shouldn't we just expose page size and endianness by other means? (another 
> file or
> syscall).

If I send you this file dumped from a random machine, you won't know
what to make of it.

I'm planning to write a trivial server to sit on, say, my embedded
target and spew this over the wire to a client. 

> >Not a good idea to expose raw flags in this manner - it changes at the drop
> >of a hat.  We'd need to also expose the kernel's PG_foo-to-bitnumber
> >mapping to make this viable.
> 
> I don't think it is viable because that makes the flags part of the
> userspace ABI. I wonder what they are needed for.

Basically: to show what the hell's going on in the VM.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-12 23:42     ` William Lee Irwin III
@ 2007-04-13  0:25       ` Nick Piggin
  0 siblings, 0 replies; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  0:25 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: Andrew Morton, Matt Mackall, linux-kernel

William Lee Irwin III wrote:
> On Thu, 12 Apr 2007 16:10:50 -0700 William Lee Irwin III <wli@holomorphy.com> wrote:
> 
>>>This solves a real-life problem for Oracle system monitoring software
>>>(specifically EM). Among the tasks it must carry out is determining
>>>per-process memory footprint of a set of cooperating tasks (i.e. Oracle
>>>processes). RSS is inadequate for this due to page sharing; this work
>>>provides sufficient information to determine what EM needs.
> 
> 
> On Thu, Apr 12, 2007 at 04:32:35PM -0700, Andrew Morton wrote:
> 
>>Not a good idea to expose raw flags in this manner - it changes at the drop
>>of a hat.  We'd need to also expose the kernel's PG_foo-to-bitnumber
>>mapping to make this viable.
>>Not a good idea to use page->_count: page_count() will be more stable. 
>>Otherwise OK, I guess: the interpretation of the page refcount is unlikely
>>to change much over time.
> 
> 
> EM wants to determine page_mapcount() for the most part for the
> purposes of determining "uniquely attributable RSS" (my ca. 2004
> nomenclature) or "proportional RSS" (mpm's more recent nomenclature);
> as things now stand it will have to infer them by maintaining a table
> of pfn's and mappings thereof, but at least that can be done with it.

I don't know whether you can easily determine page_mapcount with
page_count and flags, though (count gives you an educated guess,
but mapcount is the real thing).

page_mapcount sounds very reasonable to export. It is directly
tied with the userspace concept of mapping pages. page_count doesn't
seem very useful (and if you must have it, please use page_count),
neither does page flags.

You could have a bit indicating whether the page is free or not (but
that doesn't tell you much that meminfo or zoneinfo or buddyinfo does
not). Dirty/writeback/referenced/uptodate maybe?... I'm stumped,
what's flags for?

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  0:15     ` Nick Piggin
  2007-04-13  0:25       ` Matt Mackall
@ 2007-04-13  0:42       ` Andrew Morton
  2007-04-13  1:14         ` Nick Piggin
  2007-04-13 16:24         ` Matt Mackall
  1 sibling, 2 replies; 77+ messages in thread
From: Andrew Morton @ 2007-04-13  0:42 UTC (permalink / raw)
  To: Nick Piggin; +Cc: William Lee Irwin III, Matt Mackall, linux-kernel

On Fri, 13 Apr 2007 10:15:24 +1000 Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> >>+                       ((char *)page)[1] = PAGE_SHIFT;
> > 
> > 
> > OK.
> 
> Shouldn't we just expose page size and endianness by other means? (another file or
> syscall).

I don't think so - this file exposes fairly deep kernel internals and
that's unavoidable, really - it's *supposed* to do that.  It is explicitly
designed for monitoring kernel behaviour.

So it needs special handling by userspace.  Keeping the number of files
which need such special handling to a minimum will keep the number of
applications which are exposed to kernel changes to a minimum.

> >>+               for (; i < 2 * chunk / KPMSIZE; i += 2, pfn++) {
> >>+                       ppage = pfn_to_page(pfn);
> >>+                       if (!ppage) {
> >>+                               page[i] = 0;
> >>+                               page[i + 1] = 0;
> >>+                       } else {
> >>+                               page[i] = ppage->flags;
> >>+                               page[i + 1] = atomic_read(&ppage->_count);
> >>+                       }
> >>+               }
> > 
> > 
> > Not a good idea to expose raw flags in this manner - it changes at the drop
> > of a hat.  We'd need to also expose the kernel's PG_foo-to-bitnumber
> > mapping to make this viable.
> 
> I don't think it is viable because that makes the flags part of the
> userspace ABI.

It *will* be viable.  If the application wants to know if a page is dirty,
it looks up "PG_dirty" in /proc/pg_foo-to-bitnumber and uses PG_dirty's
numerical offset when inspecting fields in /proc/kpagemap.  If correctly
designed, such a monitoring application will be able to report upon page
flags which we haven't even thought up yet.

> I wonder what they are needed for.

Poking deeply into the kernel to provide information about kernel state. 

There are real-world needs for this, and the people who develop tools to
process this information will have decent kernel understanding and will
know that the file's contents may alter across kernel versions.  It sure
beats poking around in /dev/kmem.

I doubt if there's a sensible way in which we can prettify this interface
without losing information.  But we should aim to make it as robust as
possible agaisnt future kenrel changes, of course.

And we should satisfy ourselves that all the required information has been
made available.  The fact that it will satisfy the Oracle requirement is
encouraging.

Matt, these changes make the new field in /proc/pid/smaps redundant, don't
they?


^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  0:25       ` Matt Mackall
@ 2007-04-13  1:01         ` Nick Piggin
  2007-04-13  1:38           ` Matt Mackall
  0 siblings, 1 reply; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  1:01 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Andrew Morton, William Lee Irwin III, linux-kernel

Matt Mackall wrote:
> On Fri, Apr 13, 2007 at 10:15:24AM +1000, Nick Piggin wrote:
> 
>>Andrew Morton wrote:
>>
>>>On Thu, 12 Apr 2007 16:10:50 -0700
>>>William Lee Irwin III <wli@holomorphy.com> wrote:
>>
>>>>+       while (count > 0) {
>>>>+               chunk = min_t(size_t, count, PAGE_SIZE);
>>>>+               i = 0;
>>>>+
>>>>+               if (pfn == -1) {
>>>>+                       page[0] = 0;
>>>>+                       page[1] = 0;
>>>>+                       ((char *)page)[0] = (ntohl(1) != 1);
>>>
>>>
>>>OK.
>>>
>>>
>>>
>>>>+                       ((char *)page)[1] = PAGE_SHIFT;
>>>
>>>
>>>OK.
>>
>>Shouldn't we just expose page size and endianness by other means? (another 
>>file or
>>syscall).
> 
> 
> If I send you this file dumped from a random machine, you won't know
> what to make of it.

That's a good reason ;)

> I'm planning to write a trivial server to sit on, say, my embedded
> target and spew this over the wire to a client. 
> 
> 
>>>Not a good idea to expose raw flags in this manner - it changes at the drop
>>>of a hat.  We'd need to also expose the kernel's PG_foo-to-bitnumber
>>>mapping to make this viable.
>>
>>I don't think it is viable because that makes the flags part of the
>>userspace ABI. I wonder what they are needed for.
> 
> 
> Basically: to show what the hell's going on in the VM.

kprobes / systemtap isn't good enough?

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  0:42       ` Andrew Morton
@ 2007-04-13  1:14         ` Nick Piggin
  2007-04-13  1:22           ` Andrew Morton
  2007-04-13 16:24         ` Matt Mackall
  1 sibling, 1 reply; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  1:14 UTC (permalink / raw)
  To: Andrew Morton; +Cc: William Lee Irwin III, Matt Mackall, linux-kernel

Andrew Morton wrote:
> On Fri, 13 Apr 2007 10:15:24 +1000 Nick Piggin <nickpiggin@yahoo.com.au> wrote:

>>>>+               for (; i < 2 * chunk / KPMSIZE; i += 2, pfn++) {
>>>>+                       ppage = pfn_to_page(pfn);
>>>>+                       if (!ppage) {
>>>>+                               page[i] = 0;
>>>>+                               page[i + 1] = 0;
>>>>+                       } else {
>>>>+                               page[i] = ppage->flags;
>>>>+                               page[i + 1] = atomic_read(&ppage->_count);
>>>>+                       }
>>>>+               }
>>>
>>>
>>>Not a good idea to expose raw flags in this manner - it changes at the drop
>>>of a hat.  We'd need to also expose the kernel's PG_foo-to-bitnumber
>>>mapping to make this viable.
>>
>>I don't think it is viable because that makes the flags part of the
>>userspace ABI.
> 
> 
> It *will* be viable.  If the application wants to know if a page is dirty,
> it looks up "PG_dirty" in /proc/pg_foo-to-bitnumber and uses PG_dirty's
> numerical offset when inspecting fields in /proc/kpagemap.  If correctly
> designed, such a monitoring application will be able to report upon page
> flags which we haven't even thought up yet.

Ooh, you wanted a _runtime_ mapping of flags, yeah then I guess that works.
Still seems like a basically hit and miss affair to just use flags. What if
you want to know the process mapping a page? With systemtap or something you
could walk the rmap structures. What if you want to look at pages along the
LRU list rather than per-pfn? What about connecting pages to inodes?

I thought this type of deep poking was the whole reason the probles thingies
were merged. I'm saddened that they're no good for this. I thought it would
be an ideal usage :(


>>I wonder what they are needed for.
> 
> 
> Poking deeply into the kernel to provide information about kernel state. 
> 
> There are real-world needs for this, and the people who develop tools to
> process this information will have decent kernel understanding and will
> know that the file's contents may alter across kernel versions.  It sure
> beats poking around in /dev/kmem.
> 
> I doubt if there's a sensible way in which we can prettify this interface
> without losing information.  But we should aim to make it as robust as
> possible agaisnt future kenrel changes, of course.
> 
> And we should satisfy ourselves that all the required information has been
> made available.  The fact that it will satisfy the Oracle requirement is
> encouraging.

Yeah it is close, they need page_mapcount I think. But I was going to say
that satisfying an Oracle requirement is a good reason _not_ to merge it ;)
(I joke!)

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  1:14         ` Nick Piggin
@ 2007-04-13  1:22           ` Andrew Morton
  2007-04-13  1:42             ` Nick Piggin
  0 siblings, 1 reply; 77+ messages in thread
From: Andrew Morton @ 2007-04-13  1:22 UTC (permalink / raw)
  To: Nick Piggin; +Cc: William Lee Irwin III, Matt Mackall, linux-kernel

On Fri, 13 Apr 2007 11:14:20 +1000 Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> Andrew Morton wrote:
> > On Fri, 13 Apr 2007 10:15:24 +1000 Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
> >>>>+               for (; i < 2 * chunk / KPMSIZE; i += 2, pfn++) {
> >>>>+                       ppage = pfn_to_page(pfn);
> >>>>+                       if (!ppage) {
> >>>>+                               page[i] = 0;
> >>>>+                               page[i + 1] = 0;
> >>>>+                       } else {
> >>>>+                               page[i] = ppage->flags;
> >>>>+                               page[i + 1] = atomic_read(&ppage->_count);
> >>>>+                       }
> >>>>+               }
> >>>
> >>>
> >>>Not a good idea to expose raw flags in this manner - it changes at the drop
> >>>of a hat.  We'd need to also expose the kernel's PG_foo-to-bitnumber
> >>>mapping to make this viable.
> >>
> >>I don't think it is viable because that makes the flags part of the
> >>userspace ABI.
> > 
> > 
> > It *will* be viable.  If the application wants to know if a page is dirty,
> > it looks up "PG_dirty" in /proc/pg_foo-to-bitnumber and uses PG_dirty's
> > numerical offset when inspecting fields in /proc/kpagemap.  If correctly
> > designed, such a monitoring application will be able to report upon page
> > flags which we haven't even thought up yet.
> 
> Ooh, you wanted a _runtime_ mapping of flags, yeah then I guess that works.
> Still seems like a basically hit and miss affair to just use flags. What if
> you want to know the process mapping a page? With systemtap or something you
> could walk the rmap structures. What if you want to look at pages along the
> LRU list rather than per-pfn? What about connecting pages to inodes?

Well hang on.  This isn't a tool for understanding kernel behaviour.  It's
a tool for understanding applciation behaviour.

So one doesn't ask "who is mapping that page" - that's a kernel developer
thing.

Instead, one says "what pages are being used by my application", then, for
each of those pages "what is that page's state".  So the first step is to
collect all the pfns from /proc/$(pidof my-application)/pagemap and then to
use those pfns to look the individual pages up in /proc/kpagemap.

If you really want to know "who is using page 123435" then you'd need to
search /proc/*/pagemap.  There are possibly legitimate reasons why an
application developer would want to at least pertially perform such an
operation ("who am I sharing with"), but I doubt if it's the common case.

> 
> But I was going to say
> that satisfying an Oracle requirement is a good reason _not_ to merge it ;)
>

hm, yes, there's plenty of precedent for that.

> (I joke!)

I akpm!

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  1:01         ` Nick Piggin
@ 2007-04-13  1:38           ` Matt Mackall
  2007-04-13  2:11             ` Nick Piggin
  0 siblings, 1 reply; 77+ messages in thread
From: Matt Mackall @ 2007-04-13  1:38 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, William Lee Irwin III, linux-kernel

On Fri, Apr 13, 2007 at 11:01:41AM +1000, Nick Piggin wrote:
> >Basically: to show what the hell's going on in the VM.
> 
> kprobes / systemtap isn't good enough?

It's not really a good match to the kprobes model. I'm not interested
in events, per se. I don't want to need to know about every single
alloc/free of N different varieties integrated from boot onward to
build up an image of the state of the system. Instead, I want to take
snapshots of the state of the VM.

The main goal here is to be able to answer the question "where's my
memory going?". Currently you can't really give a good answer to that
question from userspace because of shared mappings, etc.

There are lots of secondary questions that follow on very quickly from
that, like "what parts of my shared mappings are or aren't shared, and
why?", "what's actually in my application's working set?" and "how much
of this crap can I ditch?".

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  1:22           ` Andrew Morton
@ 2007-04-13  1:42             ` Nick Piggin
  2007-04-13  1:57               ` Matt Mackall
  2007-04-13  1:57               ` Andrew Morton
  0 siblings, 2 replies; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  1:42 UTC (permalink / raw)
  To: Andrew Morton; +Cc: William Lee Irwin III, Matt Mackall, linux-kernel

Andrew Morton wrote:
> On Fri, 13 Apr 2007 11:14:20 +1000 Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
> 
>>Andrew Morton wrote:

>>>It *will* be viable.  If the application wants to know if a page is dirty,
>>>it looks up "PG_dirty" in /proc/pg_foo-to-bitnumber and uses PG_dirty's
>>>numerical offset when inspecting fields in /proc/kpagemap.  If correctly
>>>designed, such a monitoring application will be able to report upon page
>>>flags which we haven't even thought up yet.
>>
>>Ooh, you wanted a _runtime_ mapping of flags, yeah then I guess that works.
>>Still seems like a basically hit and miss affair to just use flags. What if
>>you want to know the process mapping a page? With systemtap or something you
>>could walk the rmap structures. What if you want to look at pages along the
>>LRU list rather than per-pfn? What about connecting pages to inodes?
> 
> 
> Well hang on.  This isn't a tool for understanding kernel behaviour.  It's
> a tool for understanding applciation behaviour.
> 
> So one doesn't ask "who is mapping that page" - that's a kernel developer
> thing.
> 
> Instead, one says "what pages are being used by my application", then, for

That includes unmapped pagecache being used by my application, doesn't it?
Maybe that's too hard to do via /proc so we forget about it...


> each of those pages "what is that page's state".  So the first step is to
> collect all the pfns from /proc/$(pidof my-application)/pagemap and then to
> use those pfns to look the individual pages up in /proc/kpagemap.

OK I realise you could do it that way, but systemtap can definitely be
used as a tool for understanding application behaviour in the context of
the kernel, I think? The purpose for it is so that various little bits
of deep kernel internals do not have to be exposed on a case by case basis.

If kprobes is simply crappy and doesn't work properly for this, then I
could accept that. I'm not someone trying to get this info. So why can't
it be used? (not just for kpagemap, but for clear_refs and all that gunk
too).

 > If you really want to know "who is using page 123435" then you'd need to
 > search /proc/*/pagemap.  There are possibly legitimate reasons why an
 > application developer would want to at least pertially perform such an
 > operation ("who am I sharing with"), but I doubt if it's the common case.

Maybe. How about LRU? Reclaim performance is bad, and you want to work out
which pages keep going off the end of it, or which pages keep getting
written out via it, or who's pages are on the active list, forcing mine
out.

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  1:42             ` Nick Piggin
@ 2007-04-13  1:57               ` Matt Mackall
  2007-04-13  2:21                 ` Nick Piggin
  2007-04-13  1:57               ` Andrew Morton
  1 sibling, 1 reply; 77+ messages in thread
From: Matt Mackall @ 2007-04-13  1:57 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, William Lee Irwin III, linux-kernel

On Fri, Apr 13, 2007 at 11:42:29AM +1000, Nick Piggin wrote:
> >Instead, one says "what pages are being used by my application", then, for
> 
> That includes unmapped pagecache being used by my application, doesn't it?
> Maybe that's too hard to do via /proc so we forget about it...

It'd be really nice to have a window into the pagecache too. But I for
one couldn't come up with a sensible scheme for it.

> >each of those pages "what is that page's state".  So the first step is to
> >collect all the pfns from /proc/$(pidof my-application)/pagemap and then to
> >use those pfns to look the individual pages up in /proc/kpagemap.
> 
> OK I realise you could do it that way, but systemtap can definitely be
> used as a tool for understanding application behaviour in the context of
> the kernel, I think? The purpose for it is so that various little bits
> of deep kernel internals do not have to be exposed on a case by case basis.
> 
> If kprobes is simply crappy and doesn't work properly for this, then I
> could accept that. I'm not someone trying to get this info. So why can't
> it be used? (not just for kpagemap, but for clear_refs and all that gunk
> too).

kprobes is good for looking at events, but bad for looking at state.
Especially metric shitloads of state.

> > If you really want to know "who is using page 123435" then you'd need to
> > search /proc/*/pagemap.  There are possibly legitimate reasons why an
> > application developer would want to at least pertially perform such an
> > operation ("who am I sharing with"), but I doubt if it's the common case.
> 
> Maybe. How about LRU? Reclaim performance is bad, and you want to work out
> which pages keep going off the end of it, or which pages keep getting
> written out via it, or who's pages are on the active list, forcing mine
> out.

Those are actually probably a good match for systemtap as they're all events.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  1:42             ` Nick Piggin
  2007-04-13  1:57               ` Matt Mackall
@ 2007-04-13  1:57               ` Andrew Morton
  2007-04-13  2:05                 ` Matt Mackall
  2007-04-13  2:18                 ` Nick Piggin
  1 sibling, 2 replies; 77+ messages in thread
From: Andrew Morton @ 2007-04-13  1:57 UTC (permalink / raw)
  To: Nick Piggin; +Cc: William Lee Irwin III, Matt Mackall, linux-kernel

On Fri, 13 Apr 2007 11:42:29 +1000 Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> Andrew Morton wrote:
> > On Fri, 13 Apr 2007 11:14:20 +1000 Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> > 
> > 
> >>Andrew Morton wrote:
> 
> >>>It *will* be viable.  If the application wants to know if a page is dirty,
> >>>it looks up "PG_dirty" in /proc/pg_foo-to-bitnumber and uses PG_dirty's
> >>>numerical offset when inspecting fields in /proc/kpagemap.  If correctly
> >>>designed, such a monitoring application will be able to report upon page
> >>>flags which we haven't even thought up yet.
> >>
> >>Ooh, you wanted a _runtime_ mapping of flags, yeah then I guess that works.
> >>Still seems like a basically hit and miss affair to just use flags. What if
> >>you want to know the process mapping a page? With systemtap or something you
> >>could walk the rmap structures. What if you want to look at pages along the
> >>LRU list rather than per-pfn? What about connecting pages to inodes?
> > 
> > 
> > Well hang on.  This isn't a tool for understanding kernel behaviour.  It's
> > a tool for understanding applciation behaviour.
> > 
> > So one doesn't ask "who is mapping that page" - that's a kernel developer
> > thing.
> > 
> > Instead, one says "what pages are being used by my application", then, for
> 
> That includes unmapped pagecache being used by my application, doesn't it?
> Maybe that's too hard to do via /proc so we forget about it...

Yes, harder.  I'm hoping that sampling of /proc/pid/io can be used to
determine pagecache use sufficiently accurately.  I know of one large
hosting company who are using it ("BTW, we are making great use of
taskstats!!  Its GREAT!")

> 
> > each of those pages "what is that page's state".  So the first step is to
> > collect all the pfns from /proc/$(pidof my-application)/pagemap and then to
> > use those pfns to look the individual pages up in /proc/kpagemap.
> 
> OK I realise you could do it that way, but systemtap can definitely be
> used as a tool for understanding application behaviour in the context of
> the kernel, I think? The purpose for it is so that various little bits
> of deep kernel internals do not have to be exposed on a case by case basis.
> 
> If kprobes is simply crappy and doesn't work properly for this, then I
> could accept that. I'm not someone trying to get this info. So why can't
> it be used? (not just for kpagemap, but for clear_refs and all that gunk
> too).
> 
>  > If you really want to know "who is using page 123435" then you'd need to
>  > search /proc/*/pagemap.  There are possibly legitimate reasons why an
>  > application developer would want to at least pertially perform such an
>  > operation ("who am I sharing with"), but I doubt if it's the common case.
> 
> Maybe. How about LRU? Reclaim performance is bad, and you want to work out
> which pages keep going off the end of it, or which pages keep getting
> written out via it, or who's pages are on the active list, forcing mine
> out.

I guess we have static analysis versus dynamic.  The interfaces which Matt
is proposing are suited to answering the question "what is my memory being
used for" (static).  They're unlikely to be useful for answering the question
"what's happening in the VM" (dynamic).  Systemtap is probably better for the
dynamic analysis.

I guess one could generate an answer to the static question with systemtap,
by accumulating running counts across the application lifetime and then
snapshotting them.  Sounds hard though.


^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  1:57               ` Andrew Morton
@ 2007-04-13  2:05                 ` Matt Mackall
  2007-04-13  2:29                   ` Nick Piggin
  2007-04-13  2:18                 ` Nick Piggin
  1 sibling, 1 reply; 77+ messages in thread
From: Matt Mackall @ 2007-04-13  2:05 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Nick Piggin, William Lee Irwin III, linux-kernel

On Thu, Apr 12, 2007 at 06:57:23PM -0700, Andrew Morton wrote:
> I guess one could generate an answer to the static question with systemtap,
> by accumulating running counts across the application lifetime and then
> snapshotting them.  Sounds hard though.

You'd have to do it from boot onward to get a complete system image.
One way to look at it is that systemtap can give you the derivative of
the information, and you have to integrate it.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  1:38           ` Matt Mackall
@ 2007-04-13  2:11             ` Nick Piggin
  0 siblings, 0 replies; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  2:11 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Andrew Morton, William Lee Irwin III, linux-kernel

Matt Mackall wrote:
> On Fri, Apr 13, 2007 at 11:01:41AM +1000, Nick Piggin wrote:
> 
>>>Basically: to show what the hell's going on in the VM.
>>
>>kprobes / systemtap isn't good enough?
> 
> 
> It's not really a good match to the kprobes model. I'm not interested
> in events, per se. I don't want to need to know about every single
> alloc/free of N different varieties integrated from boot onward to
> build up an image of the state of the system. Instead, I want to take
> snapshots of the state of the VM.

Systemtap can't output a large set of values?

Why can't you attach a kprobe to a dummy syscall, and from there
iterate over pgdat/zone/memmap and output what you want?

Actually I'm surprised that kind of data querying facility isn't
already in there (I haven't used it seriously though).


> The main goal here is to be able to answer the question "where's my
> memory going?". Currently you can't really give a good answer to that
> question from userspace because of shared mappings, etc.
> 
> There are lots of secondary questions that follow on very quickly from
> that, like "what parts of my shared mappings are or aren't shared, and
> why?", "what's actually in my application's working set?" and "how much
> of this crap can I ditch?".

I understand roughly what you want, and that you can't easily get
it from /proc currently. My question at this point is just why can
we not use systemtap.

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  1:57               ` Andrew Morton
  2007-04-13  2:05                 ` Matt Mackall
@ 2007-04-13  2:18                 ` Nick Piggin
  2007-04-13  2:32                   ` Andrew Morton
  2007-04-13 17:13                   ` Matt Mackall
  1 sibling, 2 replies; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  2:18 UTC (permalink / raw)
  To: Andrew Morton; +Cc: William Lee Irwin III, Matt Mackall, linux-kernel

Andrew Morton wrote:
> On Fri, 13 Apr 2007 11:42:29 +1000 Nick Piggin <nickpiggin@yahoo.com.au> wrote:

>>Maybe. How about LRU? Reclaim performance is bad, and you want to work out
>>which pages keep going off the end of it, or which pages keep getting
>>written out via it, or who's pages are on the active list, forcing mine
>>out.
> 
> 
> I guess we have static analysis versus dynamic.  The interfaces which Matt
> is proposing are suited to answering the question "what is my memory being
> used for" (static).  They're unlikely to be useful for answering the question
> "what's happening in the VM" (dynamic).  Systemtap is probably better for the
> dynamic analysis.

"what is my memory being used for *now*" ;)


> I guess one could generate an answer to the static question with systemtap,
> by accumulating running counts across the application lifetime and then
> snapshotting them.  Sounds hard though.

Can't you just traverse arbitrary kernel data structures at a given point
in time, exactly like the /proc/ call is doing?

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  1:57               ` Matt Mackall
@ 2007-04-13  2:21                 ` Nick Piggin
  2007-04-13  2:23                   ` Matt Mackall
  0 siblings, 1 reply; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  2:21 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Andrew Morton, William Lee Irwin III, linux-kernel

Matt Mackall wrote:
> On Fri, Apr 13, 2007 at 11:42:29AM +1000, Nick Piggin wrote:

>>If kprobes is simply crappy and doesn't work properly for this, then I
>>could accept that. I'm not someone trying to get this info. So why can't
>>it be used? (not just for kpagemap, but for clear_refs and all that gunk
>>too).
> 
> 
> kprobes is good for looking at events, but bad for looking at state.
> Especially metric shitloads of state.

Why? Why is a kprobes trap significantly more expensive than a read
syscall?

>>Maybe. How about LRU? Reclaim performance is bad, and you want to work out
>>which pages keep going off the end of it, or which pages keep getting
>>written out via it, or who's pages are on the active list, forcing mine
>>out.
> 
> 
> Those are actually probably a good match for systemtap as they're all events.

Traverse the LRU? Which files do they belong to? What process maps them?

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  2:21                 ` Nick Piggin
@ 2007-04-13  2:23                   ` Matt Mackall
  2007-04-13  2:54                     ` Nick Piggin
  2007-04-14  8:13                     ` Maneesh Soni
  0 siblings, 2 replies; 77+ messages in thread
From: Matt Mackall @ 2007-04-13  2:23 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, William Lee Irwin III, linux-kernel

On Fri, Apr 13, 2007 at 12:21:25PM +1000, Nick Piggin wrote:
> Matt Mackall wrote:
> >On Fri, Apr 13, 2007 at 11:42:29AM +1000, Nick Piggin wrote:
> 
> >>If kprobes is simply crappy and doesn't work properly for this, then I
> >>could accept that. I'm not someone trying to get this info. So why can't
> >>it be used? (not just for kpagemap, but for clear_refs and all that gunk
> >>too).
> >
> >
> >kprobes is good for looking at events, but bad for looking at state.
> >Especially metric shitloads of state.
> 
> Why? Why is a kprobes trap significantly more expensive than a read
> syscall?

I guess I'm not clear on what you're proposing. From my understanding
of kprobes (admittedly not an expert), this is hard to do and not a
very good match.
 
> >>Maybe. How about LRU? Reclaim performance is bad, and you want to work out
> >>which pages keep going off the end of it, or which pages keep getting
> >>written out via it, or who's pages are on the active list, forcing mine
> >>out.
> >
> >
> >Those are actually probably a good match for systemtap as they're all 
> >events.
> 
> Traverse the LRU? Which files do they belong to? What process maps them?

-ENOPARSE.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  2:05                 ` Matt Mackall
@ 2007-04-13  2:29                   ` Nick Piggin
  0 siblings, 0 replies; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  2:29 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Andrew Morton, William Lee Irwin III, linux-kernel

Matt Mackall wrote:
> On Thu, Apr 12, 2007 at 06:57:23PM -0700, Andrew Morton wrote:
> 
>>I guess one could generate an answer to the static question with systemtap,
>>by accumulating running counts across the application lifetime and then
>>snapshotting them.  Sounds hard though.
> 
> 
> You'd have to do it from boot onward to get a complete system image.
> One way to look at it is that systemtap can give you the derivative of
> the information, and you have to integrate it.

So everyone keeps saying.

Would you tell me why you can't just traverse the data structures
in the same way as your proc handler? From the systemtap example
scripts it seems like you can traverse arbitrary kernel data
structures.

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  2:18                 ` Nick Piggin
@ 2007-04-13  2:32                   ` Andrew Morton
  2007-04-13  2:50                     ` Nick Piggin
  2007-04-13  3:40                     ` Nick Piggin
  2007-04-13 17:13                   ` Matt Mackall
  1 sibling, 2 replies; 77+ messages in thread
From: Andrew Morton @ 2007-04-13  2:32 UTC (permalink / raw)
  To: Nick Piggin; +Cc: William Lee Irwin III, Matt Mackall, linux-kernel

On Fri, 13 Apr 2007 12:18:56 +1000 Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> > I guess one could generate an answer to the static question with systemtap,
> > by accumulating running counts across the application lifetime and then
> > snapshotting them.  Sounds hard though.
> 
> Can't you just traverse arbitrary kernel data structures at a given point
> in time, exactly like the /proc/ call is doing?

Do a full pagetable walk, with all the associated locking from within
a systemtap script?  I'd be surprised.  Maybe if it's mostly hand-coded
in C, perhaps.  Then you just end up with the same thing, don't you?

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  2:32                   ` Andrew Morton
@ 2007-04-13  2:50                     ` Nick Piggin
  2007-04-13  3:10                       ` Nick Piggin
                                         ` (2 more replies)
  2007-04-13  3:40                     ` Nick Piggin
  1 sibling, 3 replies; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  2:50 UTC (permalink / raw)
  To: Andrew Morton; +Cc: William Lee Irwin III, Matt Mackall, linux-kernel

Andrew Morton wrote:
> On Fri, 13 Apr 2007 12:18:56 +1000 Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
> 
>>>I guess one could generate an answer to the static question with systemtap,
>>>by accumulating running counts across the application lifetime and then
>>>snapshotting them.  Sounds hard though.
>>
>>Can't you just traverse arbitrary kernel data structures at a given point
>>in time, exactly like the /proc/ call is doing?
> 
> 
> Do a full pagetable walk, with all the associated locking from within
> a systemtap script?  I'd be surprised.  Maybe if it's mostly hand-coded
> in C, perhaps.

It looks like you can traverse arbitrary data structures, yes.

It definitely seems like you can use some kernel functions, but the
ones I saw may just be systemtap facilities. But what is so surprising
about being able to call a kernel function when running in kernel
context? Perhaps there is some fundamental limitation of kprobes that
I don't understand.

>  Then you just end up with the same thing, don't you?

Well _you_ do, because that happens to be exactly what you want. Bill
ends up with something that displays page_mapcount instead. And I
end up with something that traverses LRU lists rather than pfns. And
none of it goes in /proc/ or linux-2.6/.

So it isn't really the same thing at all.

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  2:23                   ` Matt Mackall
@ 2007-04-13  2:54                     ` Nick Piggin
  2007-04-13 12:24                       ` Ananth N Mavinakayanahalli
  2007-04-14  8:13                     ` Maneesh Soni
  1 sibling, 1 reply; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  2:54 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Andrew Morton, William Lee Irwin III, linux-kernel

Matt Mackall wrote:
> On Fri, Apr 13, 2007 at 12:21:25PM +1000, Nick Piggin wrote:
> 
>>Matt Mackall wrote:
>>
>>>On Fri, Apr 13, 2007 at 11:42:29AM +1000, Nick Piggin wrote:
>>
>>>>If kprobes is simply crappy and doesn't work properly for this, then I
>>>>could accept that. I'm not someone trying to get this info. So why can't
>>>>it be used? (not just for kpagemap, but for clear_refs and all that gunk
>>>>too).
>>>
>>>
>>>kprobes is good for looking at events, but bad for looking at state.
>>>Especially metric shitloads of state.
>>
>>Why? Why is a kprobes trap significantly more expensive than a read
>>syscall?
> 
> 
> I guess I'm not clear on what you're proposing. From my understanding
> of kprobes (admittedly not an expert), this is hard to do and not a
> very good match.

But you have an idea that it is bad for exposing lots of data. Why?
(I'm not a kprobes expert either, these are not rhetorical questions)

 From what it looks like, you can traverse data structures and copy data
back to userspace. Which is what makes me think it might be suitable
(or could be made suitable).


>>>>Maybe. How about LRU? Reclaim performance is bad, and you want to work out
>>>>which pages keep going off the end of it, or which pages keep getting
>>>>written out via it, or who's pages are on the active list, forcing mine
>>>>out.
>>>
>>>
>>>Those are actually probably a good match for systemtap as they're all 
>>>events.
>>
>>Traverse the LRU? Which files do they belong to? What process maps them?
> 
> 
> -ENOPARSE.

Basically, any "stuff" other than what you're exposing.

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  2:50                     ` Nick Piggin
@ 2007-04-13  3:10                       ` Nick Piggin
  2007-04-13  6:53                       ` William Lee Irwin III
  2007-04-13 12:13                       ` Ananth N Mavinakayanahalli
  2 siblings, 0 replies; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  3:10 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, William Lee Irwin III, Matt Mackall, linux-kernel

Nick Piggin wrote:
> Andrew Morton wrote:

>>  Then you just end up with the same thing, don't you?
> 
> 
> Well _you_ do, because that happens to be exactly what you want. Bill
> ends up with something that displays page_mapcount instead. And I
> end up with something that traverses LRU lists rather than pfns. And
> none of it goes in /proc/ or linux-2.6/.

Oh, and you get to change it without recompiling and rebooting your
kernel.

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  2:32                   ` Andrew Morton
  2007-04-13  2:50                     ` Nick Piggin
@ 2007-04-13  3:40                     ` Nick Piggin
  2007-04-13  6:55                       ` William Lee Irwin III
  2007-04-13 14:08                       ` Theodore Tso
  1 sibling, 2 replies; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  3:40 UTC (permalink / raw)
  To: Andrew Morton; +Cc: William Lee Irwin III, Matt Mackall, linux-kernel

Andrew Morton wrote:
> On Fri, 13 Apr 2007 12:18:56 +1000 Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
> 
>>>I guess one could generate an answer to the static question with systemtap,
>>>by accumulating running counts across the application lifetime and then
>>>snapshotting them.  Sounds hard though.
>>
>>Can't you just traverse arbitrary kernel data structures at a given point
>>in time, exactly like the /proc/ call is doing?
> 
> 
> Do a full pagetable walk, with all the associated locking from within
> a systemtap script?  I'd be surprised.  Maybe if it's mostly hand-coded
> in C, perhaps.  Then you just end up with the same thing, don't you?

And my problem isn't with the hardcoded pagetable walker. Yeah, we'd
probably still keep the pagetable callback walker thingy with Matt's
associated cleanups (and my subsequent ones to clean it up more and
move it to mm/): there are other in-kernel users for that anyway.

The point is the proc API, and exposing random little parts of deep
kernel internals that some people happen to find useful at the time.
(which is why we have an incredible proliferation of these things).

With systemtap scripts, you could walk pagetables and print *the exact
page information you want*, or you could walk pfns, or LRU, or page_tree,
or walk the page tree then the rmap structures. And you can selectively
cull out items you don't care about if you only care about a subset of
items, based on arbitrary criteria. And you can most likely do all that
more efficiently than with a conglomeration of various /proc files
(assuming they even provide what you want in the first place).

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  2:50                     ` Nick Piggin
  2007-04-13  3:10                       ` Nick Piggin
@ 2007-04-13  6:53                       ` William Lee Irwin III
  2007-04-13  7:05                         ` Nick Piggin
  2007-04-13 12:13                       ` Ananth N Mavinakayanahalli
  2 siblings, 1 reply; 77+ messages in thread
From: William Lee Irwin III @ 2007-04-13  6:53 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Matt Mackall, linux-kernel

Andrew Morton wrote:
>> Then you just end up with the same thing, don't you?

On Fri, Apr 13, 2007 at 12:50:20PM +1000, Nick Piggin wrote:
> Well _you_ do, because that happens to be exactly what you want. Bill
> ends up with something that displays page_mapcount instead. And I
> end up with something that traverses LRU lists rather than pfns. And
> none of it goes in /proc/ or linux-2.6/.
> So it isn't really the same thing at all.

The EM guys aren't dealing with the database; they're dealing with some
enterprise management thingie that does things like control how many
client connections are allowed for each database instance. Unless
they're doing less than I expect, and are largely something like procps
on steroids and enterprise silliness.


-- wli

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  3:40                     ` Nick Piggin
@ 2007-04-13  6:55                       ` William Lee Irwin III
  2007-04-13  7:03                         ` Nick Piggin
  2007-04-13 14:08                       ` Theodore Tso
  1 sibling, 1 reply; 77+ messages in thread
From: William Lee Irwin III @ 2007-04-13  6:55 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Matt Mackall, linux-kernel

Andrew Morton wrote:
>> Do a full pagetable walk, with all the associated locking from within
>> a systemtap script?  I'd be surprised.  Maybe if it's mostly hand-coded
>> in C, perhaps.  Then you just end up with the same thing, don't you?

On Fri, Apr 13, 2007 at 01:40:08PM +1000, Nick Piggin wrote:
> And my problem isn't with the hardcoded pagetable walker. Yeah, we'd
> probably still keep the pagetable callback walker thingy with Matt's
> associated cleanups (and my subsequent ones to clean it up more and
> move it to mm/): there are other in-kernel users for that anyway.
> The point is the proc API, and exposing random little parts of deep
> kernel internals that some people happen to find useful at the time.
> (which is why we have an incredible proliferation of these things).
> With systemtap scripts, you could walk pagetables and print *the exact
> page information you want*, or you could walk pfns, or LRU, or page_tree,
> or walk the page tree then the rmap structures. And you can selectively
> cull out items you don't care about if you only care about a subset of
> items, based on arbitrary criteria. And you can most likely do all that
> more efficiently than with a conglomeration of various /proc files
> (assuming they even provide what you want in the first place).

The EM guys are unwilling or unable for support-oriented reasons to
deal with anything but unmodified kernels as shipped by distros.


-- wli

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  6:55                       ` William Lee Irwin III
@ 2007-04-13  7:03                         ` Nick Piggin
  2007-04-13  7:08                           ` William Lee Irwin III
  0 siblings, 1 reply; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  7:03 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: Andrew Morton, Matt Mackall, linux-kernel

William Lee Irwin III wrote:
> Andrew Morton wrote:
> 
>>>Do a full pagetable walk, with all the associated locking from within
>>>a systemtap script?  I'd be surprised.  Maybe if it's mostly hand-coded
>>>in C, perhaps.  Then you just end up with the same thing, don't you?
> 
> 
> On Fri, Apr 13, 2007 at 01:40:08PM +1000, Nick Piggin wrote:
> 
>>And my problem isn't with the hardcoded pagetable walker. Yeah, we'd
>>probably still keep the pagetable callback walker thingy with Matt's
>>associated cleanups (and my subsequent ones to clean it up more and
>>move it to mm/): there are other in-kernel users for that anyway.
>>The point is the proc API, and exposing random little parts of deep
>>kernel internals that some people happen to find useful at the time.
>>(which is why we have an incredible proliferation of these things).
>>With systemtap scripts, you could walk pagetables and print *the exact
>>page information you want*, or you could walk pfns, or LRU, or page_tree,
>>or walk the page tree then the rmap structures. And you can selectively
>>cull out items you don't care about if you only care about a subset of
>>items, based on arbitrary criteria. And you can most likely do all that
>>more efficiently than with a conglomeration of various /proc files
>>(assuming they even provide what you want in the first place).
> 
> 
> The EM guys are unwilling or unable for support-oriented reasons to
> deal with anything but unmodified kernels as shipped by distros.

And I think major distros ship with kprobes enabled, so that is yet
another reason why systemtap should be considered before adding these
proc interfaces.

Thanks,
Nick

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  6:53                       ` William Lee Irwin III
@ 2007-04-13  7:05                         ` Nick Piggin
  2007-04-13  7:51                           ` Christoph Hellwig
  0 siblings, 1 reply; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  7:05 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: Andrew Morton, Matt Mackall, linux-kernel

William Lee Irwin III wrote:
> Andrew Morton wrote:
> 
>>>Then you just end up with the same thing, don't you?
> 
> 
> On Fri, Apr 13, 2007 at 12:50:20PM +1000, Nick Piggin wrote:
> 
>>Well _you_ do, because that happens to be exactly what you want. Bill
>>ends up with something that displays page_mapcount instead. And I
>>end up with something that traverses LRU lists rather than pfns. And
>>none of it goes in /proc/ or linux-2.6/.
>>So it isn't really the same thing at all.
> 
> 
> The EM guys aren't dealing with the database; they're dealing with some
> enterprise management thingie that does things like control how many
> client connections are allowed for each database instance. Unless
> they're doing less than I expect, and are largely something like procps
> on steroids and enterprise silliness.

Ah, OK. Anyway, with kprobes/systemtap they can do whatever they like
and none of us need to care in the slightest ;)

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  7:03                         ` Nick Piggin
@ 2007-04-13  7:08                           ` William Lee Irwin III
  0 siblings, 0 replies; 77+ messages in thread
From: William Lee Irwin III @ 2007-04-13  7:08 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Matt Mackall, linux-kernel

William Lee Irwin III wrote:
>> The EM guys are unwilling or unable for support-oriented reasons to
>> deal with anything but unmodified kernels as shipped by distros.

On Fri, Apr 13, 2007 at 05:03:43PM +1000, Nick Piggin wrote:
> And I think major distros ship with kprobes enabled, so that is yet
> another reason why systemtap should be considered before adding these
> proc interfaces.

I'll have to check in and see if that will work for them. A lot of this
is about customer/distro/support interaction constraints on how it works
as opposed to purely technical affairs.


-- wli

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  7:05                         ` Nick Piggin
@ 2007-04-13  7:51                           ` Christoph Hellwig
  2007-04-13  8:03                             ` Nick Piggin
  2007-04-13  8:15                             ` William Lee Irwin III
  0 siblings, 2 replies; 77+ messages in thread
From: Christoph Hellwig @ 2007-04-13  7:51 UTC (permalink / raw)
  To: Nick Piggin
  Cc: William Lee Irwin III, Andrew Morton, Matt Mackall, linux-kernel

On Fri, Apr 13, 2007 at 05:05:47PM +1000, Nick Piggin wrote:
> Ah, OK. Anyway, with kprobes/systemtap they can do whatever they like
> and none of us need to care in the slightest ;)

Umm, folks.  systemtap basically means people compile kernel modules
from an odd scripting language with embedded C snipplets into kernel
modules.  The kernel modules don't use normal exported APIs but use
kallsysms and dwarf info to poke into every possible private bit.

Saying you don't care the slightest whether oracle will load huge
amounts of code into the kernel that depends on intimate implementation
details, and that you don't even have source to to debug it is not what
I'd call "none of us need to care in the slightest", at least for those
of you working for distributions that may actually have to debug this
shit in the end.

While we're at it, what happened to the idea of tainting the kernel
as soon as krpobes are placed in the kernel to at least make people
aware of it?


^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  7:51                           ` Christoph Hellwig
@ 2007-04-13  8:03                             ` Nick Piggin
  2007-04-13  8:13                               ` Christoph Hellwig
  2007-04-13  8:15                             ` William Lee Irwin III
  1 sibling, 1 reply; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  8:03 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: William Lee Irwin III, Andrew Morton, Matt Mackall, linux-kernel

Christoph Hellwig wrote:
> On Fri, Apr 13, 2007 at 05:05:47PM +1000, Nick Piggin wrote:
> 
>>Ah, OK. Anyway, with kprobes/systemtap they can do whatever they like
>>and none of us need to care in the slightest ;)
> 
> 
> Umm, folks.  systemtap basically means people compile kernel modules
> from an odd scripting language with embedded C snipplets into kernel
> modules.  The kernel modules don't use normal exported APIs but use
> kallsysms and dwarf info to poke into every possible private bit.
> 
> Saying you don't care the slightest whether oracle will load huge
> amounts of code into the kernel that depends on intimate implementation
> details, and that you don't even have source to to debug it is not what
> I'd call "none of us need to care in the slightest", at least for those
> of you working for distributions that may actually have to debug this
> shit in the end.

Yeah good point ;) I just meant the wider "we".

With all the problems kprobes has, something like poking deep into
kernel internals seems like a good thing to use it for instead of
hardcoding that stuff into the kernel. If not, then why did we even
merge it in the first place?

We could distribute some systemtap scripts, and even distribute some
basic useful ones like this sort of page info in the kernel source
tree.


> While we're at it, what happened to the idea of tainting the kernel
> as soon as krpobes are placed in the kernel to at least make people
> aware of it?

Seems like a very good idea.


-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  8:03                             ` Nick Piggin
@ 2007-04-13  8:13                               ` Christoph Hellwig
  2007-04-13  8:25                                 ` Nick Piggin
                                                   ` (2 more replies)
  0 siblings, 3 replies; 77+ messages in thread
From: Christoph Hellwig @ 2007-04-13  8:13 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Hellwig, William Lee Irwin III, Andrew Morton,
	Matt Mackall, linux-kernel

On Fri, Apr 13, 2007 at 06:03:45PM +1000, Nick Piggin wrote:
> Yeah good point ;) I just meant the wider "we".
> 
> With all the problems kprobes has, something like poking deep into
> kernel internals seems like a good thing to use it for instead of
> hardcoding that stuff into the kernel. If not, then why did we even
> merge it in the first place?

It's very nice to poke deep into the kernel for development purposes.
For example for the spu scheduler work I'm doing currently I have
a module using kprobes (note the systemtap crap because it's big, bloated,
in and odd language, and does a lot of really wrong things in it's runtime).

This module allows me to put probes into various places in the scheduler
and writes them into a ringbuffer with timestampts allowing me to
trace what's going on there.  This is really neat.  Unfortunately it
breaks as soon as I do some major reshuffling because then the points
it hooks up to are not there anymore.  That's perfectly fine for my
setup, because _I_ know what I have to change when it breaks, and can
easily fix that.  Now imagine a similar module to trace pagecache activity
used by a third-party monitoring tool.  We now get a major change to
the pagecache (say to make it lockless), and the probes just break.  In
the est case it just doesn't work anymore, in the worst case it crashes
the kernel.  Now if the app vendor at least gave me their source I
could at least fix it to not crash, but there's a fair enough chance
they poke into bits that simply aren't there anymore.

Now if we have a proper user interface with real code behing it we can
have a defined interface.  If the interface is bad enough (or just too
lowlevel) we might have the last problem of stastistic that were there
once to go away aswell, but we can deal with that gracefully by declaring
parts of the stats volatile and make sure people don't mess with them.

To summarize, I really love kprobes to ease my debugging work, but using
it for any kind of production code is a total nightmare.

> We could distribute some systemtap scripts, and even distribute some
> basic useful ones like this sort of page info in the kernel source
> tree.

We could not really distribute systemtap scripts with the kernel.
systemtap is a bloody complicated piece of shit outside the kernel
tree that breaks all the time we change kernel internals.  We could
provide useful kprobes modules, a proper tracing system (ltt-ng-lite)
and surrounding infrastucture.


^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  7:51                           ` Christoph Hellwig
  2007-04-13  8:03                             ` Nick Piggin
@ 2007-04-13  8:15                             ` William Lee Irwin III
  1 sibling, 0 replies; 77+ messages in thread
From: William Lee Irwin III @ 2007-04-13  8:15 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Nick Piggin, Andrew Morton, Matt Mackall, linux-kernel

On Fri, Apr 13, 2007 at 08:51:42AM +0100, Christoph Hellwig wrote:
> Umm, folks.  systemtap basically means people compile kernel modules
> from an odd scripting language with embedded C snipplets into kernel
> modules.  The kernel modules don't use normal exported APIs but use
> kallsysms and dwarf info to poke into every possible private bit.
> Saying you don't care the slightest whether oracle will load huge
> amounts of code into the kernel that depends on intimate implementation
> details, and that you don't even have source to to debug it is not what
> I'd call "none of us need to care in the slightest", at least for those
> of you working for distributions that may actually have to debug this
> shit in the end.
> While we're at it, what happened to the idea of tainting the kernel
> as soon as krpobes are placed in the kernel to at least make people
> aware of it?

This is for a system monitoring app outside the database proper that
just happens to be done by the same .com as makes the database. It's
got little to do with the database itself apart from knowing how to
tell the database to e.g. let fewer clients in. The part that deals
with this is sort of like a custom procps that does things rather
specifically how they need them, including being portable to other
OS's IIRC, though the scope of the app is much larger than that.

They're actually quite concerned about issues of this sort since they
want to run all the time in the background in order to respond to
system conditions, though probably not necessarily rapid-fire sorts of
responses to second-by-second changes in conditions.

Basically, they're not a debugging affair, and they need to be able to
run in supported conditions. They're rather disinterested in things
that would, say, taint the kernel or take customers out of supported
configurations. They'll fall back to the known-grossly-inaccurate
RSS-based estimates they're using now in preference to such.

They don't want omnibus back doors into the kernel and I honestly
expect them to NAK the systemtap solution. They really want the
"uniquely attributable RSS" or "proportional RSS" reported directly,
and it takes some doing to convince them that this can't be done
directly for various reasons, e.g. floating point in the kernel won't
fly. They can program sufficiently well to maintain a database of pfn's,
pid's of processes mapping them, and user virtual addresses they're
mapped at (easy enough to kick off a database instance for it if they
don't feel comfortable just hashing the triples) and do the tabulation
from there, though they're not happy having to do so much of the
calculation themselves. Actually, I promised them reporting of mapcount
which would make per-process UARSS/PRSS calculation able to be done on
a process-by-process basis, though I can probably convince them to do
whole-system pfn-by-pfn tabulation if such is lacking.


-- wli

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  8:13                               ` Christoph Hellwig
@ 2007-04-13  8:25                                 ` Nick Piggin
  2007-04-13  9:46                                   ` Christoph Hellwig
  2007-04-13 21:17                                 ` Frank Ch. Eigler
  2007-04-16 21:36                                 ` Andi Kleen
  2 siblings, 1 reply; 77+ messages in thread
From: Nick Piggin @ 2007-04-13  8:25 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: William Lee Irwin III, Andrew Morton, Matt Mackall, linux-kernel

Christoph Hellwig wrote:
> On Fri, Apr 13, 2007 at 06:03:45PM +1000, Nick Piggin wrote:
> 
>>Yeah good point ;) I just meant the wider "we".
>>
>>With all the problems kprobes has, something like poking deep into
>>kernel internals seems like a good thing to use it for instead of
>>hardcoding that stuff into the kernel. If not, then why did we even
>>merge it in the first place?
> 
> 
> It's very nice to poke deep into the kernel for development purposes.
> For example for the spu scheduler work I'm doing currently I have
> a module using kprobes (note the systemtap crap because it's big, bloated,
> in and odd language, and does a lot of really wrong things in it's runtime).

OK, I pick systemtap because I don't know any better... but kprobes
is what I mean for the kernel interface.


> This module allows me to put probes into various places in the scheduler
> and writes them into a ringbuffer with timestampts allowing me to
> trace what's going on there.  This is really neat.  Unfortunately it
> breaks as soon as I do some major reshuffling because then the points
> it hooks up to are not there anymore.  That's perfectly fine for my
> setup, because _I_ know what I have to change when it breaks, and can
> easily fix that.  Now imagine a similar module to trace pagecache activity
> used by a third-party monitoring tool.  We now get a major change to
> the pagecache (say to make it lockless), and the probes just break.  In
> the est case it just doesn't work anymore, in the worst case it crashes
> the kernel.  Now if the app vendor at least gave me their source I
> could at least fix it to not crash, but there's a fair enough chance
> they poke into bits that simply aren't there anymore.
> 
> Now if we have a proper user interface with real code behing it we can
> have a defined interface.  If the interface is bad enough (or just too
> lowlevel) we might have the last problem of stastistic that were there
> once to go away aswell, but we can deal with that gracefully by declaring
> parts of the stats volatile and make sure people don't mess with them.
> 
> To summarize, I really love kprobes to ease my debugging work, but using
> it for any kind of production code is a total nightmare.

OK, well Matt's stuff that he needs doesn't have to be kprobes at all,
and yes if lots of people want the same thing then we could distribute
it with the kernel.

But at least make it into its own module with a debugfs interface or
something. I mean, exposing a PG_name-to-nr and page count pfn and flags
as a supposedly formal proc interface doesn't sound nice to me. Page
flags does not tell you what is going on in the VM, it gives you a tiny
window into "something". Between reading a /proc/pid/ pfn and finding
the pfn's page flags, it could be used for something completely different
anyway.


>>We could distribute some systemtap scripts, and even distribute some
>>basic useful ones like this sort of page info in the kernel source
>>tree.
> 
> 
> We could not really distribute systemtap scripts with the kernel.
> systemtap is a bloody complicated piece of shit outside the kernel
> tree that breaks all the time we change kernel internals.  We could
> provide useful kprobes modules, a proper tracing system (ltt-ng-lite)
> and surrounding infrastucture.

OK ;)

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  8:25                                 ` Nick Piggin
@ 2007-04-13  9:46                                   ` Christoph Hellwig
  0 siblings, 0 replies; 77+ messages in thread
From: Christoph Hellwig @ 2007-04-13  9:46 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Hellwig, William Lee Irwin III, Andrew Morton,
	Matt Mackall, linux-kernel

On Fri, Apr 13, 2007 at 06:25:46PM +1000, Nick Piggin wrote:
> But at least make it into its own module with a debugfs interface or
> something. I mean, exposing a PG_name-to-nr and page count pfn and flags
> as a supposedly formal proc interface doesn't sound nice to me. Page
> flags does not tell you what is going on in the VM, it gives you a tiny
> window into "something". Between reading a /proc/pid/ pfn and finding
> the pfn's page flags, it could be used for something completely different
> anyway.

I agree that exposing numerical values of page flags is not a very good
idea at all.  If we really want to expose this information it should
at least be in string form, although that is quite a bit of a maintaince
horror aswell.


^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  2:50                     ` Nick Piggin
  2007-04-13  3:10                       ` Nick Piggin
  2007-04-13  6:53                       ` William Lee Irwin III
@ 2007-04-13 12:13                       ` Ananth N Mavinakayanahalli
  2007-04-13 12:46                         ` Nick Piggin
  2 siblings, 1 reply; 77+ messages in thread
From: Ananth N Mavinakayanahalli @ 2007-04-13 12:13 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, William Lee Irwin III, Matt Mackall, linux-kernel

On Fri, Apr 13, 2007 at 12:50:20PM +1000, Nick Piggin wrote:
> Andrew Morton wrote:
> >On Fri, 13 Apr 2007 12:18:56 +1000 Nick Piggin <nickpiggin@yahoo.com.au> 
> >wrote:
> >
> >
> >>>I guess one could generate an answer to the static question with 
> >>>systemtap,
> >>>by accumulating running counts across the application lifetime and then
> >>>snapshotting them.  Sounds hard though.
> >>
> >>Can't you just traverse arbitrary kernel data structures at a given point
> >>in time, exactly like the /proc/ call is doing?
> >
> >
> >Do a full pagetable walk, with all the associated locking from within
> >a systemtap script?  I'd be surprised.  Maybe if it's mostly hand-coded
> >in C, perhaps.
> 
> It looks like you can traverse arbitrary data structures, yes.
> 
> It definitely seems like you can use some kernel functions, but the
> ones I saw may just be systemtap facilities. But what is so surprising
> about being able to call a kernel function when running in kernel
> context? Perhaps there is some fundamental limitation of kprobes that
> I don't understand.

The main requirement for kprobes handlers is that they can't sleep. You
could definitely call a kernel function from kprobe handlers as long as
the function doesn't sleep.

Ananth

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  2:54                     ` Nick Piggin
@ 2007-04-13 12:24                       ` Ananth N Mavinakayanahalli
  0 siblings, 0 replies; 77+ messages in thread
From: Ananth N Mavinakayanahalli @ 2007-04-13 12:24 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Matt Mackall, Andrew Morton, William Lee Irwin III, linux-kernel

On Fri, Apr 13, 2007 at 12:54:36PM +1000, Nick Piggin wrote:
> Matt Mackall wrote:
> >On Fri, Apr 13, 2007 at 12:21:25PM +1000, Nick Piggin wrote:
> >
> >>Matt Mackall wrote:
> >>
> >>>On Fri, Apr 13, 2007 at 11:42:29AM +1000, Nick Piggin wrote:
> >>
> >>>>If kprobes is simply crappy and doesn't work properly for this, then I
> >>>>could accept that. I'm not someone trying to get this info. So why can't
> >>>>it be used? (not just for kpagemap, but for clear_refs and all that gunk
> >>>>too).
> >>>
> >>>
> >>>kprobes is good for looking at events, but bad for looking at state.
> >>>Especially metric shitloads of state.
> >>
> >>Why? Why is a kprobes trap significantly more expensive than a read
> >>syscall?
> >
> >
> >I guess I'm not clear on what you're proposing. From my understanding
> >of kprobes (admittedly not an expert), this is hard to do and not a
> >very good match.
> 
> But you have an idea that it is bad for exposing lots of data. Why?
> (I'm not a kprobes expert either, these are not rhetorical questions)

You could tie your kprobe module to use relay channels. Kprobe handlers
run lockless and using the per-cpu relay channels will provide a fast
transport mechanism for exposing lots of data.

http://relayfs.sourceforge.net/examples.html#tprintk_kprobes is an
example using the earlier relayfs interface. It shouldn't be that hard
to change it to use the newer relay stuff.

AFAIK acme is using a similar mechanism for ctracer
(http://oops.ghostprotocols.net:81/blog/?p=50)

Ananth

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13 12:13                       ` Ananth N Mavinakayanahalli
@ 2007-04-13 12:46                         ` Nick Piggin
  0 siblings, 0 replies; 77+ messages in thread
From: Nick Piggin @ 2007-04-13 12:46 UTC (permalink / raw)
  To: ananth; +Cc: Andrew Morton, William Lee Irwin III, Matt Mackall, linux-kernel

Ananth N Mavinakayanahalli wrote:
> On Fri, Apr 13, 2007 at 12:50:20PM +1000, Nick Piggin wrote:

>>It definitely seems like you can use some kernel functions, but the
>>ones I saw may just be systemtap facilities. But what is so surprising
>>about being able to call a kernel function when running in kernel
>>context? Perhaps there is some fundamental limitation of kprobes that
>>I don't understand.
> 
> 
> The main requirement for kprobes handlers is that they can't sleep. You
> could definitely call a kernel function from kprobe handlers as long as
> the function doesn't sleep.

That would be enough to access basically all the VM data structures.

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  3:40                     ` Nick Piggin
  2007-04-13  6:55                       ` William Lee Irwin III
@ 2007-04-13 14:08                       ` Theodore Tso
  2007-04-16 11:00                         ` Christoph Hellwig
  1 sibling, 1 reply; 77+ messages in thread
From: Theodore Tso @ 2007-04-13 14:08 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, William Lee Irwin III, Matt Mackall, linux-kernel

On Fri, Apr 13, 2007 at 01:40:08PM +1000, Nick Piggin wrote:
> With systemtap scripts, you could walk pagetables and print *the exact
> page information you want*, or you could walk pfns, or LRU, or page_tree,
> or walk the page tree then the rmap structures. And you can selectively
> cull out items you don't care about if you only care about a subset of
> items, based on arbitrary criteria. And you can most likely do all that
> more efficiently than with a conglomeration of various /proc files
> (assuming they even provide what you want in the first place).

Yes, but maintaining the systemtap scripts will be a nightmare, since
they would be outside the kernel, and as we change our internal data
structure, the scripts would become useless.

This is a fundamental problem with systemtap that we haven't been able
to solve yet, because solving it would freeze various internal data
structures or kernel functions.  I agree that's not acceptable; which
is why I don't think systemtap would be a good match for the problem
we're trying to solve here.

						- Ted

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  0:42       ` Andrew Morton
  2007-04-13  1:14         ` Nick Piggin
@ 2007-04-13 16:24         ` Matt Mackall
  2007-04-13 17:03           ` Andrew Morton
  1 sibling, 1 reply; 77+ messages in thread
From: Matt Mackall @ 2007-04-13 16:24 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Nick Piggin, William Lee Irwin III, linux-kernel

On Thu, Apr 12, 2007 at 05:42:01PM -0700, Andrew Morton wrote:
> On Fri, 13 Apr 2007 10:15:24 +1000 Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
> > >>+                       ((char *)page)[1] = PAGE_SHIFT;
> > > 
> > > 
> > > OK.
> > 
> > Shouldn't we just expose page size and endianness by other means? (another file or
> > syscall).
> 
> I don't think so - this file exposes fairly deep kernel internals and
> that's unavoidable, really - it's *supposed* to do that.  It is explicitly
> designed for monitoring kernel behaviour.
> 
> So it needs special handling by userspace.  Keeping the number of files
> which need such special handling to a minimum will keep the number of
> applications which are exposed to kernel changes to a minimum.
> 
> > >>+               for (; i < 2 * chunk / KPMSIZE; i += 2, pfn++) {
> > >>+                       ppage = pfn_to_page(pfn);
> > >>+                       if (!ppage) {
> > >>+                               page[i] = 0;
> > >>+                               page[i + 1] = 0;
> > >>+                       } else {
> > >>+                               page[i] = ppage->flags;
> > >>+                               page[i + 1] = atomic_read(&ppage->_count);
> > >>+                       }
> > >>+               }
> > > 
> > > 
> > > Not a good idea to expose raw flags in this manner - it changes at the drop
> > > of a hat.  We'd need to also expose the kernel's PG_foo-to-bitnumber
> > > mapping to make this viable.
> > 
> > I don't think it is viable because that makes the flags part of the
> > userspace ABI.
> 
> It *will* be viable.  If the application wants to know if a page is dirty,
> it looks up "PG_dirty" in /proc/pg_foo-to-bitnumber and uses PG_dirty's
> numerical offset when inspecting fields in /proc/kpagemap.  If correctly
> designed, such a monitoring application will be able to report upon page
> flags which we haven't even thought up yet.

We can probably fit this in the existing (variable-sized) header.
 
> > I wonder what they are needed for.
> 
> Poking deeply into the kernel to provide information about kernel state. 
> 
> There are real-world needs for this, and the people who develop tools to
> process this information will have decent kernel understanding and will
> know that the file's contents may alter across kernel versions.  It sure
> beats poking around in /dev/kmem.
> 
> I doubt if there's a sensible way in which we can prettify this interface
> without losing information.  But we should aim to make it as robust as
> possible agaisnt future kenrel changes, of course.
> 
> And we should satisfy ourselves that all the required information has been
> made available.  The fact that it will satisfy the Oracle requirement is
> encouraging.
> 
> Matt, these changes make the new field in /proc/pid/smaps redundant, don't
> they?

Which new field? From /proc/kpagemap + /proc/*/pagemap, you can
basically synthesize any statistic you want, including all the
existing ones. For some data, /proc/pid/smaps (or /proc/meminfo) will
be considerably more efficient.

But in general, most of the statistics in smaps are basically useless
for shared mappings, just like RSS. Problem is, we really don't know
what statistics we want yet, or even if it can be distilled down to
simple numbers anyway.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13 16:24         ` Matt Mackall
@ 2007-04-13 17:03           ` Andrew Morton
  2007-04-13 17:24             ` Matt Mackall
  0 siblings, 1 reply; 77+ messages in thread
From: Andrew Morton @ 2007-04-13 17:03 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Nick Piggin, William Lee Irwin III, linux-kernel

On Fri, 13 Apr 2007 11:24:36 -0500 Matt Mackall <mpm@selenic.com> wrote:

> > It *will* be viable.  If the application wants to know if a page is dirty,
> > it looks up "PG_dirty" in /proc/pg_foo-to-bitnumber and uses PG_dirty's
> > numerical offset when inspecting fields in /proc/kpagemap.  If correctly
> > designed, such a monitoring application will be able to report upon page
> > flags which we haven't even thought up yet.
> 
> We can probably fit this in the existing (variable-sized) header.

hm, OK..

> > > I wonder what they are needed for.
> > 
> > Poking deeply into the kernel to provide information about kernel state. 
> > 
> > There are real-world needs for this, and the people who develop tools to
> > process this information will have decent kernel understanding and will
> > know that the file's contents may alter across kernel versions.  It sure
> > beats poking around in /dev/kmem.
> > 
> > I doubt if there's a sensible way in which we can prettify this interface
> > without losing information.  But we should aim to make it as robust as
> > possible agaisnt future kenrel changes, of course.
> > 
> > And we should satisfy ourselves that all the required information has been
> > made available.  The fact that it will satisfy the Oracle requirement is
> > encouraging.
> > 
> > Matt, these changes make the new field in /proc/pid/smaps redundant, don't
> > they?
> 
> Which new field?

Referenced:

> From /proc/kpagemap + /proc/*/pagemap, you can
> basically synthesize any statistic you want, including all the
> existing ones. For some data, /proc/pid/smaps (or /proc/meminfo) will
> be considerably more efficient.

You'd need to poke clear_refs beforehand to make the referenced bits useful.

Actually, we also need to run around the ptes and collect the pte-referenced
bits too.  I don't think your code copes with any of that?
 
> But in general, most of the statistics in smaps are basically useless
> for shared mappings, just like RSS. Problem is, we really don't know
> what statistics we want yet, or even if it can be distilled down to
> simple numbers anyway.

yup.  But that's the whole point, really: don't prejudge what info userspace
is trying to collect.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  2:18                 ` Nick Piggin
  2007-04-13  2:32                   ` Andrew Morton
@ 2007-04-13 17:13                   ` Matt Mackall
  1 sibling, 0 replies; 77+ messages in thread
From: Matt Mackall @ 2007-04-13 17:13 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, William Lee Irwin III, linux-kernel

On Fri, Apr 13, 2007 at 12:18:56PM +1000, Nick Piggin wrote:
> Can't you just traverse arbitrary kernel data structures at a given point
> in time, exactly like the /proc/ call is doing?

Perhaps.

My understanding is that you hook a kprobe to an event. An event is a
particular instruction getting executed. Indeed, you can do whatever
poking around in the kernel you want at that point. And then you can
stuff that data in a buffer that eventually gets to userspace.

This is very different from a read/seek/syscall. Rather than just
asking the kernel for some data, we have to wait for the relevent
events. Now, of course, you can make an ugly hack like hooking
sys_getpid() and basically make your own system call. Hopefully no one
else will call getpid() while you're doing this, etc. Not really how
it's intended to work at all, and probably a bitch to use, but
possible. Then the question becomes: why don't we do this for
everything else in /proc?

And the answer of course is: we put stuff in /proc because it's
generally useful. Extra points if it's actually related to
'proc'esses. Being able to tell what's paged in in a given mapping is
useful. Being able to tell what's shared between two mappings is
useful. Being able to get an accurate, meaningful picture of how your
memory is being used is useful. Heck, I bet some people might find it
useful to be able to see what nodes the pages in their process are on.
All stuff you shouldn't need to be a kernel hacker to answer.

The flags part of /proc/kpagemap exposes some (very interesting!)
implementation details. The rest of it is completely generic to any
system with a VM. It's only deep kernel magic in the sense that it's
not yet exposed.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13 17:03           ` Andrew Morton
@ 2007-04-13 17:24             ` Matt Mackall
  2007-04-13 17:58               ` Andrew Morton
  0 siblings, 1 reply; 77+ messages in thread
From: Matt Mackall @ 2007-04-13 17:24 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Nick Piggin, William Lee Irwin III, linux-kernel

On Fri, Apr 13, 2007 at 10:03:56AM -0700, Andrew Morton wrote:
> On Fri, 13 Apr 2007 11:24:36 -0500 Matt Mackall <mpm@selenic.com> wrote:
> 
> > > It *will* be viable.  If the application wants to know if a page is dirty,
> > > it looks up "PG_dirty" in /proc/pg_foo-to-bitnumber and uses PG_dirty's
> > > numerical offset when inspecting fields in /proc/kpagemap.  If correctly
> > > designed, such a monitoring application will be able to report upon page
> > > flags which we haven't even thought up yet.
> > 
> > We can probably fit this in the existing (variable-sized) header.
> 
> hm, OK..
> 
> > > > I wonder what they are needed for.
> > > 
> > > Poking deeply into the kernel to provide information about kernel state. 
> > > 
> > > There are real-world needs for this, and the people who develop tools to
> > > process this information will have decent kernel understanding and will
> > > know that the file's contents may alter across kernel versions.  It sure
> > > beats poking around in /dev/kmem.
> > > 
> > > I doubt if there's a sensible way in which we can prettify this interface
> > > without losing information.  But we should aim to make it as robust as
> > > possible agaisnt future kenrel changes, of course.
> > > 
> > > And we should satisfy ourselves that all the required information has been
> > > made available.  The fact that it will satisfy the Oracle requirement is
> > > encouraging.
> > > 
> > > Matt, these changes make the new field in /proc/pid/smaps redundant, don't
> > > they?
> > 
> > Which new field?
> 
> Referenced:
> 
> > From /proc/kpagemap + /proc/*/pagemap, you can
> > basically synthesize any statistic you want, including all the
> > existing ones. For some data, /proc/pid/smaps (or /proc/meminfo) will
> > be considerably more efficient.
> 
> You'd need to poke clear_refs beforehand to make the referenced bits useful.
> 
> Actually, we also need to run around the ptes and collect the pte-referenced
> bits too.  I don't think your code copes with any of that?

No, and it probably should. Perhaps dirty as well, though I've kindof
lost the plot on how that works lately.

> > But in general, most of the statistics in smaps are basically useless
> > for shared mappings, just like RSS. Problem is, we really don't know
> > what statistics we want yet, or even if it can be distilled down to
> > simple numbers anyway.
> 
> yup.  But that's the whole point, really: don't prejudge what info userspace
> is trying to collect.

Right.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13 17:24             ` Matt Mackall
@ 2007-04-13 17:58               ` Andrew Morton
  0 siblings, 0 replies; 77+ messages in thread
From: Andrew Morton @ 2007-04-13 17:58 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Nick Piggin, William Lee Irwin III, linux-kernel

On Fri, 13 Apr 2007 12:24:51 -0500 Matt Mackall <mpm@selenic.com> wrote:

> > > From /proc/kpagemap + /proc/*/pagemap, you can
> > > basically synthesize any statistic you want, including all the
> > > existing ones. For some data, /proc/pid/smaps (or /proc/meminfo) will
> > > be considerably more efficient.
> > 
> > You'd need to poke clear_refs beforehand to make the referenced bits useful.
> > 
> > Actually, we also need to run around the ptes and collect the pte-referenced
> > bits too.  I don't think your code copes with any of that?
> 
> No, and it probably should. Perhaps dirty as well, though I've kindof
> lost the plot on how that works lately.

Dirty is OK: the VM keeps pte-dirtiness and page-dirtiness in sync now.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  8:13                               ` Christoph Hellwig
  2007-04-13  8:25                                 ` Nick Piggin
@ 2007-04-13 21:17                                 ` Frank Ch. Eigler
  2007-04-16 10:59                                   ` Christoph Hellwig
  2007-04-16 21:36                                 ` Andi Kleen
  2 siblings, 1 reply; 77+ messages in thread
From: Frank Ch. Eigler @ 2007-04-13 21:17 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Nick Piggin, William Lee Irwin III, Andrew Morton, Matt Mackall,
	linux-kernel


Christoph Hellwig <hch@infradead.org> writes:

> [...]
> > merge it in the first place?
> 
> It's very nice to poke deep into the kernel for development purposes.
> For example for the spu scheduler work I'm doing currently I have
> a module using kprobes (note the systemtap crap because it's big, bloated,
> in and odd language, and does a lot of really wrong things in its runtime).

It may be worthwhile to remind people that it is easy to use systemtap
only to the extent of automating the placement of kprobes: just to
perform the function-name/source-file/line-number triplet to PC
mapping.  They can use embedded-C code to do all the same stuff they'd
do with kprobes.  They are not obligated to write any odd script code
for probing logic, nor indeed use any of this really wrong runtime.

> This module allows me to put probes into various places in the scheduler
> and writes them into a ringbuffer with timestampts allowing me to
> trace what's going on there.  This is really neat.  [...]

Indeed, and we too try to make this simple & fast: a couple of lines.

> [...] To summarize, I really love kprobes to ease my debugging work,
> but using it for any kind of production code is a total nightmare.

But at some point, some interface needs to be fixed for a final
user-space tool.  Whether that interface fixing is performed by kernel
developers being more reluctant to rewrite basic things, or by
providing a proc interface, or maintaining a kprobes module does not
matter.  Someone will feel constrained, and someone will be liberated.

One neat thing about our systemtap tool is that, no matter what layer
such interfaces become fixed within, it can probably interface to
them.  If there is no fixed interface, it can go down to debugging
info.  If there are tracing hooks present, it can attach.  It can make
appear as unified the disparate standardization policies of different
subsystems.


> > We could distribute some systemtap scripts, and even distribute some
> > basic useful ones like this sort of page info in the kernel source
> > tree.
>
> We could not really distribute systemtap scripts with the kernel.
> systemtap is a bloody complicated piece of [software] 

I don't know if that should be treated a compliment to our team, for
being able to work quickly on something that a full-grown kernel
developer finds bloody complicated.  Perhaps your information is
simply outdated.  Big & bloated?  We have several times asked for
specifics rather than smears - what about it?

> outside the kernel tree that breaks all the time we change kernel
> internals. [...]

That's begging the question.  If kernel folks are willing to maintain
some included systemtap-related code, then by definition it would not
break all the time.


- FChE

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  2:23                   ` Matt Mackall
  2007-04-13  2:54                     ` Nick Piggin
@ 2007-04-14  8:13                     ` Maneesh Soni
  1 sibling, 0 replies; 77+ messages in thread
From: Maneesh Soni @ 2007-04-14  8:13 UTC (permalink / raw)
  To: Matt Mackall
  Cc: Nick Piggin, Andrew Morton, William Lee Irwin III, linux-kernel

On Thu, Apr 12, 2007 at 09:23:45PM -0500, Matt Mackall wrote:
> On Fri, Apr 13, 2007 at 12:21:25PM +1000, Nick Piggin wrote:
> > Matt Mackall wrote:
> > >On Fri, Apr 13, 2007 at 11:42:29AM +1000, Nick Piggin wrote:
> > 
> > >>If kprobes is simply crappy and doesn't work properly for this, then I
> > >>could accept that. I'm not someone trying to get this info. So why can't
> > >>it be used? (not just for kpagemap, but for clear_refs and all that gunk
> > >>too).
> > >
> > >
> > >kprobes is good for looking at events, but bad for looking at state.
> > >Especially metric shitloads of state.
> > 
> > Why? Why is a kprobes trap significantly more expensive than a read
> > syscall?
> 
> I guess I'm not clear on what you're proposing. From my understanding
> of kprobes (admittedly not an expert), this is hard to do and not a
> very good match.
> 
> > >>Maybe. How about LRU? Reclaim performance is bad, and you want to work out
> > >>which pages keep going off the end of it, or which pages keep getting
> > >>written out via it, or who's pages are on the active list, forcing mine
> > >>out.
> > >
> > >
> > >Those are actually probably a good match for systemtap as they're all 
> > >events.
> > 
> > Traverse the LRU? Which files do they belong to? What process maps them?
> 
> -ENOPARSE.
> 

For non-event based data gathering using kprobes we can have a debugfs file
like /debug/kprobes/snapshot_probe and write a kprobe module with probe at
->write() function and then the user space can trigger the data collection

echo "1" > /debug/kprobes/snapshot_probe

Thus, the actual data collection code can reside in a separate
module or a systemtap script which provides very good post-processing
capabalities, and can be used without recompiling or rebooting the kernel.

Thanks
Maneesh

-- 
Maneesh Soni
Linux Technology Center,
IBM India Systems and Technology Lab, 
Bangalore, India

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13 21:17                                 ` Frank Ch. Eigler
@ 2007-04-16 10:59                                   ` Christoph Hellwig
  0 siblings, 0 replies; 77+ messages in thread
From: Christoph Hellwig @ 2007-04-16 10:59 UTC (permalink / raw)
  To: Frank Ch. Eigler
  Cc: Christoph Hellwig, Nick Piggin, William Lee Irwin III,
	Andrew Morton, Matt Mackall, linux-kernel

On Fri, Apr 13, 2007 at 05:17:00PM -0400, Frank Ch. Eigler wrote:
> It may be worthwhile to remind people that it is easy to use systemtap
> only to the extent of automating the placement of kprobes: just to
> perform the function-name/source-file/line-number triplet to PC
> mapping.  They can use embedded-C code to do all the same stuff they'd
> do with kprobes.  They are not obligated to write any odd script code
> for probing logic, nor indeed use any of this really wrong runtime.

Umm, yes- as long as you write systemtap the runtime gets linked in
currently.  That doesn't mean you actually use a lot of it in the end,
but the maintaince horror of actually getting all the junk code to
compile still is there.

Now the actual function-name/source-file/line-number triplet to PC is
really useful functionality, and for my tracing work I could really
use this a lot.  Unfortunately systemtap doesn't have a proper layered
approach and you can't use this bit without pulling in all the
junk.  If started some dward based function-name/source-file/line-number
of my own based on acme's work, but it's stalled due to more important
issues going on.

> > We could not really distribute systemtap scripts with the kernel.
> > systemtap is a bloody complicated piece of [software] 
> 
> I don't know if that should be treated a compliment to our team, for
> being able to work quickly on something that a full-grown kernel
> developer finds bloody complicated.  Perhaps your information is
> simply outdated.  Big & bloated?  We have several times asked for
> specifics rather than smears - what about it?

There's a lot of stuff unneeded for basic tasks.  But if you want
a detailed review you could submit your runtime bits for review and
get feedback from everyone.

> > outside the kernel tree that breaks all the time we change kernel
> > internals. [...]
> 
> That's begging the question.  If kernel folks are willing to maintain
> some included systemtap-related code, then by definition it would not
> break all the time.

We'll definitly need a trace transport.  I currently use a hackish
kfifo rinbuffer derived from net/ipv4/tcp_probe.c, but it's showing
it's limitations.  Tom promised long ago to factor our the trace
code from blktrace into generic bits, but as he doesn't deliver
I suspect I'll have to do that myself soon.

The safe dereference bits are a bit questionable, but at least worth
a try to put into the tree proper, because there's no chance they'd
be properly maintained outside.

The register dumps you do would could definitly stand some integration
with the register dumps in panic messages, and would be useful library
functions for proper C language kprobes, but that means detangling
the core from the utterly horrible systemtrap pascal string handling.

Stack backtrace handling could use some integration with the stack
tracing framework in for lockdep and fault injection and be available
more genericly for C kprobes.

With a proper tracing infrastructure we'll need the timing bits for
it aswell, which should superceed the utter mess in systemtap in
that area (I'm hoping for Matthew to come up with something there
as part of lttng)

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13 14:08                       ` Theodore Tso
@ 2007-04-16 11:00                         ` Christoph Hellwig
  0 siblings, 0 replies; 77+ messages in thread
From: Christoph Hellwig @ 2007-04-16 11:00 UTC (permalink / raw)
  To: Theodore Tso, Nick Piggin, Andrew Morton, William Lee Irwin III,
	Matt Mackall, linux-kernel

On Fri, Apr 13, 2007 at 10:08:27AM -0400, Theodore Tso wrote:
> On Fri, Apr 13, 2007 at 01:40:08PM +1000, Nick Piggin wrote:
> > With systemtap scripts, you could walk pagetables and print *the exact
> > page information you want*, or you could walk pfns, or LRU, or page_tree,
> > or walk the page tree then the rmap structures. And you can selectively
> > cull out items you don't care about if you only care about a subset of
> > items, based on arbitrary criteria. And you can most likely do all that
> > more efficiently than with a conglomeration of various /proc files
> > (assuming they even provide what you want in the first place).
> 
> Yes, but maintaining the systemtap scripts will be a nightmare, since
> they would be outside the kernel, and as we change our internal data
> structure, the scripts would become useless.
> 
> This is a fundamental problem with systemtap that we haven't been able
> to solve yet, because solving it would freeze various internal data
> structures or kernel functions.  I agree that's not acceptable; which
> is why I don't think systemtap would be a good match for the problem
> we're trying to solve here.

It's also fundamentally not solveable.  Even Sun doesn't guarantee
dor dtrace scripts to be portable, because it simply means you'd
have to freeze all internals.  Of course systemtap managment with their
execute visibility and plain stupidity of copying whatever sun does will
never ever get it.  This whole mess will only be solvable if IBM fires
the right people in managment.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-16 21:36                                 ` Andi Kleen
@ 2007-04-16 21:01                                   ` Frank Ch. Eigler
  0 siblings, 0 replies; 77+ messages in thread
From: Frank Ch. Eigler @ 2007-04-16 21:01 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Christoph Hellwig, Nick Piggin, William Lee Irwin III,
	Andrew Morton, Matt Mackall, linux-kernel, fche, systemtap

Hi -

On Mon, Apr 16, 2007 at 11:36:05PM +0200, Andi Kleen wrote:
> Christoph Hellwig <hch@infradead.org> writes:
> > and [systemtap] does a lot of really wrong things in it's
> > runtime). [...]

(Thanks, Christoph, for at least a few specifics.  Some of them have
already been dealt with in the recent past.)


> I must agree with that. Perhaps it would be good if its runtime code
> was posted to l-k at some point and reviewed in the standard way
> even when it isn't merged.

I'll let the runtime's maintainers judge whether this particular venue
would be helpful.  But is the choice of venue really an obstacle?
Everyone who cares is *already* welcome to browse the code (available
on CVS, cvsweb, tarballs - would git help?), and critique it (e.g. on
our open public mailing list or via bugzilla).

http://sourceware.org/systemtap/getinvolved.html

- FChE

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups
  2007-04-13  8:13                               ` Christoph Hellwig
  2007-04-13  8:25                                 ` Nick Piggin
  2007-04-13 21:17                                 ` Frank Ch. Eigler
@ 2007-04-16 21:36                                 ` Andi Kleen
  2007-04-16 21:01                                   ` Frank Ch. Eigler
  2 siblings, 1 reply; 77+ messages in thread
From: Andi Kleen @ 2007-04-16 21:36 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Nick Piggin, William Lee Irwin III, Andrew Morton, Matt Mackall,
	linux-kernel, fche

Christoph Hellwig <hch@infradead.org> writes:

> and does a lot of really wrong things in it's runtime).

I must agree with that. Perhaps it would be good if its runtime code
was posted to l-k at some point and reviewed in the standard way
even when it isn't merged.

-Andi

^ permalink raw reply	[flat|nested] 77+ messages in thread

end of thread, other threads:[~2007-04-16 21:02 UTC | newest]

Thread overview: 77+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-04-04  2:43 [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups Matt Mackall
2007-04-04  2:43 ` [PATCH 1/13] maps: Uninline some functions in the page walker Matt Mackall
2007-04-04  2:43 ` [PATCH 2/13] maps: Eliminate the pmd_walker struct " Matt Mackall
2007-04-04  2:43 ` [PATCH 3/13] maps: Remove vma from args " Matt Mackall
2007-04-04  2:43 ` [PATCH 4/13] maps: Propagate errors from callback in " Matt Mackall
2007-04-04  2:43 ` [PATCH 5/13] maps: Add callbacks for each level to " Matt Mackall
2007-04-04  2:43 ` [PATCH 6/13] maps: Move the page walker code to lib/ Matt Mackall
2007-04-04  3:51   ` Nick Piggin
2007-04-04  5:08     ` Matt Mackall
2007-04-04  5:50       ` Nick Piggin
2007-04-04 21:48         ` Matt Mackall
2007-04-05  1:32           ` Nick Piggin
2007-04-05  1:50             ` Nick Piggin
2007-04-04  2:43 ` [PATCH 7/13] maps: Simplify interdependence of /proc/pid/maps and smaps Matt Mackall
2007-04-04  2:43 ` [PATCH 8/13] maps: Move clear_refs code to task_mmu.c Matt Mackall
2007-04-04  2:43 ` [PATCH 9/13] maps: Regroup task_mmu by interface Matt Mackall
2007-04-04  2:43 ` [PATCH 10/13] maps: Make /proc/pid/smaps optional under CONFIG_EMBEDDED Matt Mackall
2007-04-04  2:43 ` [PATCH 11/13] maps: Make /proc/pid/clear_refs option " Matt Mackall
2007-04-04  6:22   ` David Rientjes
2007-04-04  2:43 ` [PATCH 12/13] maps: Add /proc/pid/pagemap interface Matt Mackall
2007-04-04 11:18   ` Nikita Danilov
2007-04-04 16:32     ` Matt Mackall
2007-04-04 18:03       ` Nikita Danilov
2007-04-04 21:59         ` Matt Mackall
2007-04-04  2:43 ` [PATCH 13/13] maps: Add /proc/kpagemap interface Matt Mackall
2007-04-12 23:10 ` [PATCH 0/13] maps: pagemap, kpagemap, and related cleanups William Lee Irwin III
2007-04-12 23:32   ` Andrew Morton
2007-04-12 23:42     ` William Lee Irwin III
2007-04-13  0:25       ` Nick Piggin
2007-04-13  0:15     ` Nick Piggin
2007-04-13  0:25       ` Matt Mackall
2007-04-13  1:01         ` Nick Piggin
2007-04-13  1:38           ` Matt Mackall
2007-04-13  2:11             ` Nick Piggin
2007-04-13  0:42       ` Andrew Morton
2007-04-13  1:14         ` Nick Piggin
2007-04-13  1:22           ` Andrew Morton
2007-04-13  1:42             ` Nick Piggin
2007-04-13  1:57               ` Matt Mackall
2007-04-13  2:21                 ` Nick Piggin
2007-04-13  2:23                   ` Matt Mackall
2007-04-13  2:54                     ` Nick Piggin
2007-04-13 12:24                       ` Ananth N Mavinakayanahalli
2007-04-14  8:13                     ` Maneesh Soni
2007-04-13  1:57               ` Andrew Morton
2007-04-13  2:05                 ` Matt Mackall
2007-04-13  2:29                   ` Nick Piggin
2007-04-13  2:18                 ` Nick Piggin
2007-04-13  2:32                   ` Andrew Morton
2007-04-13  2:50                     ` Nick Piggin
2007-04-13  3:10                       ` Nick Piggin
2007-04-13  6:53                       ` William Lee Irwin III
2007-04-13  7:05                         ` Nick Piggin
2007-04-13  7:51                           ` Christoph Hellwig
2007-04-13  8:03                             ` Nick Piggin
2007-04-13  8:13                               ` Christoph Hellwig
2007-04-13  8:25                                 ` Nick Piggin
2007-04-13  9:46                                   ` Christoph Hellwig
2007-04-13 21:17                                 ` Frank Ch. Eigler
2007-04-16 10:59                                   ` Christoph Hellwig
2007-04-16 21:36                                 ` Andi Kleen
2007-04-16 21:01                                   ` Frank Ch. Eigler
2007-04-13  8:15                             ` William Lee Irwin III
2007-04-13 12:13                       ` Ananth N Mavinakayanahalli
2007-04-13 12:46                         ` Nick Piggin
2007-04-13  3:40                     ` Nick Piggin
2007-04-13  6:55                       ` William Lee Irwin III
2007-04-13  7:03                         ` Nick Piggin
2007-04-13  7:08                           ` William Lee Irwin III
2007-04-13 14:08                       ` Theodore Tso
2007-04-16 11:00                         ` Christoph Hellwig
2007-04-13 17:13                   ` Matt Mackall
2007-04-13 16:24         ` Matt Mackall
2007-04-13 17:03           ` Andrew Morton
2007-04-13 17:24             ` Matt Mackall
2007-04-13 17:58               ` Andrew Morton
2007-04-13  0:15     ` Matt Mackall

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox