* [PATCH] procfs: expose page cache contents
From: Nick White @ 2013-04-25 14:27 UTC (permalink / raw)
To: linux-fsdevel@vger.kernel.org, viro@zeniv.linux.org.uk
From: Nick White <nwhite@palantir.com>
This patch adds a /proc/page_cache file to the procfs. The file contains a
line for each inode belonging to a block filesystem which has one or more
of its pages in memory (as determined by PageUptodate). This line includes
a run-length-encoded bitmap of which pages are up-to-date, starting with
pages that aren't (so a string 0,4,2,3 means the first four pages are
up-to-date, the next two aren't and the final three are).
A sample output (columns are inode, super block id, total number of pages
for this inode, run-length-encoded bitmap of which pages are up-to-date,
total pages up-to-date, total pages not up-to-date):
148073 sda1 1 0,1 1 0
397793 sda1 4518 0,3,2,1,67,32,231,1,29,2,396,9,32,1,12,2,2613,1,1084 52 4466
133941 sda1 13 0,4,3,3,3 7 6
173947 sda1 43 0,5,2,1,1,4,1,1,2,1,8,6,1,10 28 15
148499 sda1 4 0,4 4 0
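
Decoding one of the run-length-encoded fields in userspace is straightforward;
here is a minimal illustrative sketch (not part of the patch, the helper name
is made up), with runs alternating and the first run counting pages that
aren't up-to-date:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Decode an RLE field such as "0,4,2,3" into page ranges; runs
 * alternate and the first run counts pages that are NOT up-to-date. */
static void decode_rle(const char *rle)
{
	char *copy = strdup(rle);
	char *save = NULL, *tok;
	unsigned long start = 0;
	int uptodate = 0;

	for (tok = strtok_r(copy, ",", &save); tok;
	     tok = strtok_r(NULL, ",", &save)) {
		unsigned long len = strtoul(tok, NULL, 10);

		if (len)
			printf("pages %lu-%lu: %s\n", start, start + len - 1,
			       uptodate ? "up-to-date" : "not up-to-date");
		start += len;
		uptodate = !uptodate;
	}
	free(copy);
}

int main(void)
{
	decode_rle("0,4,2,3");	/* pages 0-3 and 6-8 up-to-date, 4-5 not */
	return 0;
}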
It's currently possible to query this information for a specific file from
userspace using the mmap / mincore system calls; however, this patch answers
the slightly different question "What's in the cache?" (not "Is this in the
cache?").
The patch is currently missing updates to the procfs documentation, but
I'd appreciate any other comments. Thanks -
Nick
Signed-off-by: Nick White <nwhite@palantir.com>
---
 fs/proc/Makefile     |   1 +
 fs/proc/internal.h   |   1 +
 fs/proc/page_cache.c | 269 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 271 insertions(+)
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 712f24d..69cbed8 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -17,6 +17,7 @@ proc-y += devices.o
proc-y += interrupts.o
proc-y += loadavg.o
proc-y += meminfo.o
+proc-y += page_cache.o
proc-y += stat.o
proc-y += uptime.o
proc-y += version.o
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 85ff3a4..522beea 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -159,6 +159,7 @@ struct pde_opener {
void pde_users_dec(struct proc_dir_entry *pde);
extern spinlock_t proc_subdir_lock;
+extern spinlock_t inode_sb_list_lock;
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry,
unsigned int);
int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
diff --git a/fs/proc/page_cache.c b/fs/proc/page_cache.c
new file mode 100644
index 0000000..2b709b9
--- /dev/null
+++ b/fs/proc/page_cache.c
@@ -0,0 +1,269 @@
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/page-flags.h>
+#include <linux/pagevec.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+struct page_cache_proc_iter {
+ struct list_head *sb_ptr;
+ struct list_head *inode_ptr;
+};
+
+#define GET_SB(iter) list_entry(iter->sb_ptr, struct super_block, s_list)
+#define GET_INODE(iter) list_entry(iter->inode_ptr, struct inode, i_sb_list)
+
+/*
+ * prints RLE-ed bitmap for a single inode, starting with zeros & separated
+ * by commas.
+ */
+static int proc_page_cache_seq_show(struct seq_file *s, void *v)
+{
+ struct page_cache_proc_iter *state = v;
+ struct inode *inode;
+ unsigned long i, nr_read, gap, index = 0,
+ nr_uptodate = 0, nr_pages, run_length = 0;
+ struct pagevec pvec;
+ bool is_printing = false, ones = false;
+ struct super_block *sb;
+
+ if (!state)
+ return -EINVAL;
+ inode = GET_INODE(state);
+ sb = GET_SB(state);
+
+ spin_lock(&inode->i_lock);
+
+ if (!inode->i_mapping)
+ goto end;
+
+ nr_pages = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+
+ pagevec_init(&pvec, 0);
+
+ /* we have to process in ~16 page chunks */
+ while ((nr_read = pagevec_lookup(
+ &pvec,
+ inode->i_mapping,
+ index,
+ PAGEVEC_SIZE))) {
+ for (i = 0; i < nr_read; i++) {
+ struct page *page = pvec.pages[i];
+
+ /* the gap is how many pages we've skipped */
+ if (index + 1 >= page->index)
+ gap = 0;
+ else
+ gap = page->index - 1 - index;
+
+ if (PageUptodate(page)) {
+ /* we need to print a 1 */
+ if (is_printing) {
+ if (gap) {
+ if (ones) {
+ /* a gap between this one and the last */
+ seq_printf(s, ",%lu", run_length); /* ones */
+ seq_printf(s, ",%lu", gap); /* zeros */
+ run_length = 1; /* back to ones */
+ } else {
+ /* we were printing zeros */
+ seq_printf(s, ",%lu", run_length + gap);
+ run_length = 1;
+ }
+ } else {
+ if (ones) {
+ /* a consecutive one */
+ ++run_length;
+ } else {
+ seq_printf(s, ",%lu", run_length);
+ seq_printf(s, ",%lu", gap);
+ run_length = 1;
+ }
+ }
+ } else {
+ /* page->index leading zeros (as this is a one) */
+ if (gap)
+ ++gap;
+ seq_printf(
+ s,
+ "%lu\t%s\t%lu\t%lu",
+ inode->i_ino,
+ sb->s_id,
+ nr_pages,
+ gap);
+ is_printing = true;
+ run_length = 1;
+ }
+ ones = true;
+ ++nr_uptodate;
+ } else {
+ if (is_printing) {
+ if (ones) {
+ seq_printf(s, ",%lu", run_length);
+ run_length = 1;
+ } else {
+ ++run_length;
+ }
+ }
+ ones = false;
+ }
+
+ index = page->index;
+ }
+
+ pagevec_release(&pvec);
+ ++index;
+ }
+
+ if (is_printing) {
+ /* print the last run */
+ if (index == nr_pages) {
+ seq_printf(s, ",%lu", run_length);
+ } else {
+ /* gap between last page we processed and end */
+ gap = index > nr_pages ? 0 : nr_pages - index;
+ if (ones) {
+ /* terminate 1 run and add 0 run */
+ seq_printf(s, ",%lu", run_length);
+ if (gap)
+ seq_printf(s, ",%lu", gap);
+ } else {
+ /* extend 0 run */
+ seq_printf(s, ",%lu", gap + run_length);
+ }
+ }
+ seq_printf(s, "\t%lu\t%lu\n",
+ nr_uptodate,
+ nr_uptodate > nr_pages ? 0 : nr_pages - nr_uptodate);
+ }
+
+end:
+ spin_unlock(&inode->i_lock);
+ return 0;
+}
+
+static void *proc_page_cache_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct page_cache_proc_iter *state = v;
+ struct super_block *sb;
+ loff_t ignore = 0;
+
+ if (!state)
+ return NULL;
+
+ sb = GET_SB(state);
+ if (!sb)
+ return NULL;
+
+ /* first, try the next inode of this SB */
+ spin_lock(&inode_sb_list_lock);
+ state->inode_ptr = seq_list_next(state->inode_ptr, &sb->s_inodes, pos);
+ spin_unlock(&inode_sb_list_lock);
+ if (state->inode_ptr)
+ return state;
+
+ /* second, try the first inode of the next SB */
+ spin_lock(&sb_lock);
+ while (state->sb_ptr) {
+ state->sb_ptr = seq_list_next(
+ state->sb_ptr,
+ &super_blocks,
+ &ignore);
+ if (!state->sb_ptr) {
+ spin_unlock(&sb_lock);
+ return NULL;
+ }
+ sb = GET_SB(state);
+ if (sb->s_type->fs_flags & FS_REQUIRES_DEV)
+ break;
+ }
+ spin_unlock(&sb_lock);
+
+ spin_lock(&inode_sb_list_lock);
+ state->inode_ptr = seq_list_start(&sb->s_inodes, 0);
+ spin_unlock(&inode_sb_list_lock);
+ if (state->inode_ptr)
+ return state;
+
+ /* we've passed the last node of the last SB */
+ return NULL;
+}
+
+static void *proc_page_cache_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct page_cache_proc_iter *state = kmalloc(
+ sizeof(struct page_cache_proc_iter),
+ GFP_KERNEL);
+ struct super_block *sb;
+ loff_t ff = *pos, ignore = 0;
+
+ spin_lock(&sb_lock);
+ state->sb_ptr = seq_list_start(&super_blocks, 0);
+ sb = GET_SB(state);
+ while (state->sb_ptr) {
+ sb = GET_SB(state);
+ if (sb->s_type->fs_flags & FS_REQUIRES_DEV)
+ break;
+ state->sb_ptr = seq_list_next(
+ state->sb_ptr,
+ &super_blocks,
+ &ignore);
+ }
+ spin_unlock(&sb_lock);
+ if (!state->sb_ptr)
+ return NULL;
+
+ spin_lock(&inode_sb_list_lock);
+ state->inode_ptr = seq_list_start(&sb->s_inodes, 0);
+ spin_unlock(&inode_sb_list_lock);
+ if (!state->inode_ptr)
+ return NULL;
+
+ while (ff-- > 0 && state)
+ state = proc_page_cache_seq_next(s, state, &ignore);
+
+ return state;
+}
+
+static void proc_page_cache_seq_stop(struct seq_file *s, void *v)
+{
+ kfree(v);
+}
+
+static const struct seq_operations proc_page_cache_seq_ops = {
+ .start = proc_page_cache_seq_start,
+ .next = proc_page_cache_seq_next,
+ .stop = proc_page_cache_seq_stop,
+ .show = proc_page_cache_seq_show
+};
+
+static int proc_page_cache_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &proc_page_cache_seq_ops);
+}
+
+static const struct file_operations proc_page_cache_file_ops = {
+ .open = proc_page_cache_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void __exit proc_page_cache_exit(void)
+{
+ remove_proc_entry("page_cache", NULL);
+}
+module_exit(proc_page_cache_exit);
+
+static int __init proc_page_cache_init(void)
+{
+ if (proc_create("page_cache", 0, NULL, &proc_page_cache_file_ops))
+ return 0;
+ else
+ return -ENOMEM;
+}
+module_init(proc_page_cache_init);
+
* Re: [PATCH] procfs: expose page cache contents
From: Dave Chinner @ 2013-04-26 0:47 UTC (permalink / raw)
To: Nick White; +Cc: linux-fsdevel@vger.kernel.org, viro@zeniv.linux.org.uk
On Thu, Apr 25, 2013 at 02:27:10PM +0000, Nick White wrote:
> From: Nick White <nwhite@palantir.com>
>
> This patch adds a /proc/page_cache file to the procfs. The file contains a
> line for each inode belonging to a block filesystem which has one or more
> of its pages in memory (as determined by PageUptodate). This line includes
> a run-length-encoded bitmap of which pages are up-to-date, starting with
> pages that aren't (so a string 0,4,2,3 means the first four pages are
> up-to-date, the next two aren't and the final three are).
>
> A sample output (columns are inode, super block id, total number of pages
> for this inode, run-length-encoded bitmap of which pages are up-to-date,
> total pages up-to-date, total pages not up-to-date):
>
> 148073 sda1 1 0,1 1 0
> 397793 sda1 4518 0,3,2,1,67,32,231,1,29,2,396,9,32,1,12,2,2613,1,1084 52 4466
> 133941 sda1 13 0,4,3,3,3 7 6
> 173947 sda1 43 0,5,2,1,1,4,1,1,2,1,8,6,1,10 28 15
> 148499 sda1 4 0,4 4 0
>
> It's currently possible to query this information for a specific file from
> userspace using the mmap / mincore system calls; however, this patch answers
> the slightly different question "What's in the cache?" (not "Is this in the
> cache?").
Leaving aside the implementation problems in your patch (and there
are lots of them), what use does this information have?
It's out of date by the time the caller can make any sense of it,
and there's nothing the caller can do with the information, anyway.
Indeed, the act of dumping this information and storing/parsing it
in userspace will generate memory pressure and perturb the very
thing you are trying to measure....
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
* Re: [PATCH] procfs: expose page cache contents
From: Nick White @ 2013-04-28 0:05 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-fsdevel@vger.kernel.org, viro@zeniv.linux.org.uk
> what use does this information have?
There are two main ways I'd find this data (as distinct from this format)
useful:
Some applications would benefit from knowing which files are cheaper to
access. A good example would be a database's query planner, when deciding
whether to use an index or just sequentially scan a table. If the table's
blocks were resident in memory but the index's weren't, then it might be
faster just to scan the table. While mmap / mincore'ing the files would
provide this information for a specific file, when the size of the files
you're interested in exceeds the address space available (admittedly unlikely
on 64-bit machines, but easy on 32-bit machines) you'd have to start
processing the files in chunks; this would take much longer and so increase
the accuracy problems you highlight.
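
Something like the following illustrative sketch, which isn't part of the
patch; the 64 MiB window and the function name are arbitrary examples:

#define _DEFAULT_SOURCE
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

/* Count resident pages of an open file by mmap/mincore'ing it in
 * fixed-size windows, so the whole file never needs to fit into the
 * address space at once. */
static size_t count_resident_chunked(int fd, off_t file_size)
{
	const size_t window = 64 << 20;	/* 64 MiB per chunk, illustrative */
	long psz = sysconf(_SC_PAGESIZE);
	size_t resident = 0;

	for (off_t off = 0; off < file_size; off += window) {
		size_t len = (size_t)(file_size - off) < window ?
			     (size_t)(file_size - off) : window;
		size_t pages = (len + psz - 1) / psz;
		void *map = mmap(NULL, len, PROT_NONE, MAP_SHARED, fd, off);
		unsigned char *vec = malloc(pages);

		if (map != MAP_FAILED && vec && mincore(map, len, vec) == 0)
			for (size_t i = 0; i < pages; i++)
				resident += vec[i] & 1;
		if (map != MAP_FAILED)
			munmap(map, len);
		free(vec);
	}
	return resident;
}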
This scenario actually highlights an algorithmic problem with my solution -
it loops through the inodes of each (block-device) super-block, querying
whether any of their pages are resident. It'd be far more efficient to look
through the resident pages and see which inodes they point at (if any),
possibly by walking through the memory zones (like /proc/zoneinfo), iterating
over the per_cpu_pages and mapping them to inodes (if applicable) via
page->mapping->host?
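
An untested kernel-side sketch of that page-to-inode direction (done here as
a plain pfn walk rather than via per_cpu_pages; locking and reference
counting are omitted, and the callback is illustrative):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/page-flags.h>

/* Untested sketch: attribute resident, up-to-date file pages to their
 * host inodes by walking page frames instead of per-sb inode lists. */
static void walk_cached_pages(void (*account)(struct inode *inode))
{
	struct zone *zone;

	for_each_populated_zone(zone) {
		unsigned long pfn;
		unsigned long end = zone->zone_start_pfn + zone->spanned_pages;

		for (pfn = zone->zone_start_pfn; pfn < end; pfn++) {
			struct page *page;
			struct address_space *mapping;

			if (!pfn_valid(pfn))
				continue;
			page = pfn_to_page(pfn);
			mapping = page_mapping(page);
			/* only file-backed, up-to-date pages are of interest */
			if (!mapping || PageAnon(page) || !PageUptodate(page))
				continue;
			account(mapping->host);
		}
	}
}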
The other use-case I had in mind was profiling existing processes that
either use memory-mapping or otherwise rely on the kernel to cache the data
they frequently access. If I'm trying to validate a process' assumption that
the page cache will help it, I'd like to verify that the blocks it needs are
in the page cache. This is especially useful if two processes are competing
for page cache space, and is much more accurate (and definitely more
granular) than either comparing per-process major page fault counts or
indirect timing methods (such as the process' response latency).
> Indeed, the act of dumping this information and storing/parsing it in
> userspace will generate memory pressure and perturb the very thing you
> are trying to measure....
That's true, although the impact could be minimised by writing the results
out using O_DIRECT. Reducing the size of the /proc/page_cache report
(possibly even using a binary representation, as /proc/???/pagemap does)
would also help.
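
A minimal sketch of the O_DIRECT idea (not from the thread; the output path
is hypothetical and the 4096-byte size is an assumed block-size alignment):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const size_t blk = 4096;	/* assumed block size / alignment */
	int fd = open("./page_cache.dump",	/* hypothetical output file */
		      O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0644);
	void *buf;

	if (fd < 0 || posix_memalign(&buf, blk, blk))
		return 1;
	memset(buf, 0, blk);
	strcpy(buf, "148073 sda1 1 0,1 1 0\n");	/* sample line from above */
	/* O_DIRECT transfers must be a multiple of the block size */
	if (write(fd, buf, blk) != (ssize_t)blk)
		return 1;
	free(buf);
	close(fd);
	return 0;
}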
I understand your concerns, but I believe more transparency around what the
page cache is doing would be useful due to its significant impact on a
system's performance.
Thanks -
Nick White
* Re: [PATCH] procfs: expose page cache contents
From: Dave Chinner @ 2013-04-29 1:38 UTC (permalink / raw)
To: Nick White; +Cc: linux-fsdevel@vger.kernel.org, viro@zeniv.linux.org.uk
On Sun, Apr 28, 2013 at 12:05:47AM +0000, Nick White wrote:
> > what use does this information have?
>
> There are two main ways I'd find this data (as distinct from this format)
> useful:
>
> Some applications would benefit from knowing which files are cheaper to
> access. A good example would be a database's query planner, when deciding
> whether to use an index or just sequentially scan a table. If the table's
> blocks were resident in memory but the index's weren't, then it might be
> faster just to scan the table.
Sounds like a severe case of premature optimisation to me. Indeed,
most databases use direct IO, so there aren't any cached pages in
kernel memory, so nothing you do here will tell you anything about
what is the best query method.
> While mmap / mincore'ing the files would provide this information
> for a specific file, when the size of the files you're interested
> in exceeds the address space available (admittedly unlikely on 64-bit
> machines, but easy on 32-bit machines) you'd have to start
> processing the files in chunks; this would take much longer and so
> increase the accuracy problems you highlight.
And points out the silliness of attempting to use "what is cached"
as a method of determining the best algorithm to use - it simply
doesn't scale up. Further, if you optimise towards whatever method
gives the best physical IO patterns you'll end up with the most
robust and consistently performing solution.
There's nothing more irritating than a database that randomly
changes performance on the same workload for no obvious reason....
> This scenario actually highlights an algorithmic problem with my
> solution - it loops through the inodes of each (block-device)
> super-block, querying if any of their pages are resident.
Well, yes. Think of a machine with a couple of TB of RAM and tens of
millions of cached inodes....
> It'd be far more efficient to look through the resident pages, and
> see which inodes they pointed at (if any), possibly by walking
> through the memory zones (like /proc/zoneinfo), iterating over the
> per_cpu_pages and mapping them to inodes (if applicable) via
> page->mapping->host?
That doesn't make the TB of page cache case any better - it's just
as gross as your current patch....
> The other use-case I had in mind was when profiling existing
> processes that either use memory-mapping or otherwise rely on the
> kernel to cache the data they frequently rely on.
Go google for the recent hot data tracking patch series.
> I understand your concerns, but I believe more transparency around
> what the page cache is doing would be useful due to its
> significant impact on a system's performance.
You don't need to scan the page cache to understand what it is
doing. strace will tell you the IO your application is doing,
blktrace will tell you the IO that the page cache is doing, various
tracepoints will tell you what pages are being reclaimed, etc. If
this isn't sufficient for you to understand what your application is
doing and you really need fine-grained, custom information about
what is cached in the page cache, then perhaps systemtap would be a
better solution for your purposes.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com