All of lore.kernel.org
 help / color / mirror / Atom feed
From: Wu Fengguang <fengguang.wu@intel.com>
To: Chris Frost <frost@cs.ucla.edu>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	Benny Halevy <bhalevy@panasas.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org,
	Steve VanDeBogart <vandebo-lkml@nerdbox.net>,
	Andi Kleen <andi@firstfloor.org>, Matt Mackall <mpm@selenic.com>,
	Peter Zijlstra <peterz@infradead.org>
Subject: Re: [PATCH] fs: add fincore(2) (mincore(2) for file descriptors)
Date: Fri, 22 Jan 2010 09:17:09 +0800	[thread overview]
Message-ID: <20100122011709.GA6700@localhost> (raw)
In-Reply-To: <20100120215712.GO27212@frostnet.net>

On Wed, Jan 20, 2010 at 01:57:12PM -0800, Chris Frost wrote:
> Add the fincore() system call. fincore() is mincore() for file descriptors.
> 
> The functionality of fincore() can be emulated with an mmap(), mincore(),
> and munmap(), but this emulation requires more system calls and requires
> page table modifications. fincore() can provide a significant performance
> improvement for non-sequential in-core queries.

FYI, I have a seq_file-based proc file that exports the cached pages of a
file along with their various states:

root /home/wfg# echo /sbin/init > /proc/filecache
root /home/wfg# cat /proc/filecache
# file /sbin/init
# flags R:referenced A:active M:mmap U:uptodate D:dirty W:writeback X:readahead P:private O:owner b:buffer d:dirty w:writeback
# idx   len     state           refcnt
0       6       RAMU________    2
6       1       _AMU________    2
7       1       RAMU________    2
8       2       ___U________    1

It was first developed to provide information for prefetching.
Since then I've been using it as a generic page cache inspection tool.
It helped me debug vm/fs issues, eg. readahead, writeback and vmscan.
Though I'm not sure if the interface is acceptable to Linux.

Here is the code snippet if you are interested :) 

/*
 * Listing of cached page ranges of a file.
 *
 * Usage:
 * 		echo 'file name' > /proc/filecache
 * 		cat /proc/filecache
 */

/*
 * Mask ANDed with page->flags in page_flags() to select which real flag
 * bits are reported. It is not assigned anywhere in this snippet.
 * NOTE(review): presumably built from the non-faked page_flag[] masks at
 * init time - confirm against the setup code.
 */
unsigned long page_mask;
/*
 * Synthetic states borrow the bit numbers of existing page flags.
 * page_flags() ORs these bits in itself, based on page_mapped(),
 * page_has_buffers() and the mapping's radix-tree tags.
 */
#define PG_MMAP		PG_lru		/* reuse any non-relevant flag */
#define PG_BUFFER	PG_swapcache	/* ditto */
#define PG_DIRTY	PG_error	/* ditto */
#define PG_WRITEBACK	PG_buddy	/* ditto */

/*
 * Page state names, prefixed by their abbreviations.
 *
 * The first character of each name is what show_range() prints in the
 * state column when the corresponding mask bit is set. "faked" is 1 for
 * the entries whose bit is one of the borrowed PG_MMAP/PG_BUFFER/PG_DIRTY/
 * PG_WRITEBACK aliases and 0 for real page flags; how it is consumed is
 * not shown in this snippet.
 *
 * Masks are built with 1UL so the shift happens in the width of the
 * unsigned long 'mask' field; a plain int '1 << n' is undefined for
 * n >= 31 and would sign-extend into the upper bits on 64-bit.
 */
struct {
	unsigned long	mask;
	const char     *name;
	int		faked;
} page_flag [] = {
	{1UL << PG_referenced,	"R:referenced",	0},
	{1UL << PG_active,	"A:active",	0},
	{1UL << PG_MMAP,	"M:mmap",	1},

	{1UL << PG_uptodate,	"U:uptodate",	0},
	{1UL << PG_dirty,	"D:dirty",	0},
	{1UL << PG_writeback,	"W:writeback",	0},
	{1UL << PG_reclaim,	"X:readahead",	0},

	{1UL << PG_private,	"P:private",	0},
	{1UL << PG_owner_priv_1,	"O:owner",	0},

	{1UL << PG_BUFFER,	"b:buffer",	1},
	{1UL << PG_DIRTY,	"d:dirty",	1},
	{1UL << PG_WRITEBACK,	"w:writeback",	1},
};

/*
 * Compute the state bitmask reported for @page.
 *
 * Starts from the real page flags selected by page_mask, then ORs in the
 * synthetic bits:
 *   PG_MMAP       - page_mapped() is true
 *   PG_BUFFER     - page_has_buffers() is true
 *   PG_WRITEBACK  - the page's slot in mapping->page_tree carries
 *                   PAGECACHE_TAG_WRITEBACK
 *   PG_DIRTY      - the slot carries PAGECACHE_TAG_DIRTY
 * The two tag checks are skipped when page_mapping() returns NULL.
 *
 * Bits are formed with 1UL so the shift is done in unsigned long, matching
 * the type of 'flags'; an int shift is undefined for bit 31 and above.
 *
 * Returns the combined bitmask.
 */
static unsigned long page_flags(struct page* page)
{
	unsigned long flags;
	struct address_space *mapping = page_mapping(page);

	flags = page->flags & page_mask;

	if (page_mapped(page))
		flags |= (1UL << PG_MMAP);

	if (page_has_buffers(page))
		flags |= (1UL << PG_BUFFER);

	if (mapping) {
		if (radix_tree_tag_get(&mapping->page_tree,
					page_index(page),
					PAGECACHE_TAG_WRITEBACK))
			flags |= (1UL << PG_WRITEBACK);

		if (radix_tree_tag_get(&mapping->page_tree,
					page_index(page),
					PAGECACHE_TAG_DIRTY))
			flags |= (1UL << PG_DIRTY);
	}

	return flags;
}

/*
 * Decide whether two pages can be merged into one output range.
 *
 * Returns 1 when both the reference counts and the page_flags() state of
 * @page0 and @page match, 0 otherwise. The flag comparison is only made
 * when the counts are equal.
 */
static int pages_similiar(struct page* page0, struct page* page)
{
	return page_count(page0) == page_count(page) &&
	       page_flags(page0) == page_flags(page);
}

static void show_range(struct seq_file *m, struct page* page, unsigned long len)
{
	int i;
	unsigned long flags;

	if (!m || !page)
		return;

	seq_printf(m, "%lu\t%lu\t", page->index, len);

	flags = page_flags(page);
	for (i = 0; i < ARRAY_SIZE(page_flag); i++)
		seq_putc(m, (flags & page_flag[i].mask) ?
					page_flag[i].name[0] : '_');

	seq_printf(m, "\t%d\n", page_count(page));
}

/* Stop after more than this many range lines have been emitted per call. */
#define BATCH_LINES	100
/*
 * Walk @mapping's page tree from index @start and print runs of
 * consecutive, similar pages (see pages_similiar()) to @m, one line per
 * run via show_range().
 *
 * Pages are fetched PAGEVEC_SIZE at a time with radix_tree_gang_lookup().
 * 'page0' is the first page of the run being accumulated, 'start' its
 * index and 'len' the number of pages in it so far.
 *
 * Returns the index at which the next call should resume (the first page
 * of the run that was not yet printed) once the batch limit is exceeded,
 * or ULONG_MAX when the lookup finds no more pages.
 *
 * NOTE(review): no locking or page pinning is visible here; confirm the
 * caller holds rcu_read_lock() or the tree lock around this walk.
 */
static pgoff_t show_file_cache(struct seq_file *m,
				struct address_space *mapping, pgoff_t start)
{
	int i;
	int lines = 0;
	pgoff_t len = 0;
	struct pagevec pvec;
	struct page *page;
	struct page *page0 = NULL;

	for (;;) {
		pagevec_init(&pvec, 0);
		pvec.nr = radix_tree_gang_lookup(&mapping->page_tree,
				(void **)pvec.pages, start + len, PAGEVEC_SIZE);

		if (pvec.nr == 0) {
			/* Flush the pending run; a no-op if none was started. */
			show_range(m, page0, len);
			start = ULONG_MAX;
			goto out;
		}

		if (!page0)
			page0 = pvec.pages[0];

		for (i = 0; i < pvec.nr; i++) {
			page = pvec.pages[i];

			if (page->index == start + len &&
					pages_similiar(page0, page))
				len++;
			else {
				/*
				 * len is still 0 when the very first page found
				 * is not at 'start' (a hole, or the resume page
				 * went away). No run exists yet, so do not
				 * print or count a bogus zero-length range.
				 */
				if (len) {
					show_range(m, page0, len);
					lines++;
				}
				page0 = page;
				start = page->index;
				len = 1;
				if (lines > BATCH_LINES)
					goto out;
			}
		}
	}

out:
	return start;
}

Thanks,
Fengguang

WARNING: multiple messages have this Message-ID (diff)
From: Wu Fengguang <fengguang.wu@intel.com>
To: Chris Frost <frost@cs.ucla.edu>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	Benny Halevy <bhalevy@panasas.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org,
	Steve VanDeBogart <vandebo-lkml@nerdbox.net>,
	Andi Kleen <andi@firstfloor.org>, Matt Mackall <mpm@selenic.com>,
	Peter Zijlstra <peterz@infradead.org>
Subject: Re: [PATCH] fs: add fincore(2) (mincore(2) for file descriptors)
Date: Fri, 22 Jan 2010 09:17:09 +0800	[thread overview]
Message-ID: <20100122011709.GA6700@localhost> (raw)
In-Reply-To: <20100120215712.GO27212@frostnet.net>

On Wed, Jan 20, 2010 at 01:57:12PM -0800, Chris Frost wrote:
> Add the fincore() system call. fincore() is mincore() for file descriptors.
> 
> The functionality of fincore() can be emulated with an mmap(), mincore(),
> and munmap(), but this emulation requires more system calls and requires
> page table modifications. fincore() can provide a significant performance
> improvement for non-sequential in-core queries.

FYI, I have a seq_file-based proc file that exports the cached pages of a
file along with their various states:

root /home/wfg# echo /sbin/init > /proc/filecache
root /home/wfg# cat /proc/filecache
# file /sbin/init
# flags R:referenced A:active M:mmap U:uptodate D:dirty W:writeback X:readahead P:private O:owner b:buffer d:dirty w:writeback
# idx   len     state           refcnt
0       6       RAMU________    2
6       1       _AMU________    2
7       1       RAMU________    2
8       2       ___U________    1

It was first developed to provide information for prefetching.
Since then I've been using it as a generic page cache inspection tool.
It helped me debug vm/fs issues, eg. readahead, writeback and vmscan.
Though I'm not sure if the interface is acceptable to Linux.

Here is the code snippet if you are interested :) 

/*
 * Listing of cached page ranges of a file.
 *
 * Usage:
 * 		echo 'file name' > /proc/filecache
 * 		cat /proc/filecache
 */

/*
 * Mask ANDed with page->flags in page_flags() to select which real flag
 * bits are reported. It is not assigned anywhere in this snippet.
 * NOTE(review): presumably built from the non-faked page_flag[] masks at
 * init time - confirm against the setup code.
 */
unsigned long page_mask;
/*
 * Synthetic states borrow the bit numbers of existing page flags.
 * page_flags() ORs these bits in itself, based on page_mapped(),
 * page_has_buffers() and the mapping's radix-tree tags.
 */
#define PG_MMAP		PG_lru		/* reuse any non-relevant flag */
#define PG_BUFFER	PG_swapcache	/* ditto */
#define PG_DIRTY	PG_error	/* ditto */
#define PG_WRITEBACK	PG_buddy	/* ditto */

/*
 * Page state names, prefixed by their abbreviations.
 *
 * The first character of each name is what show_range() prints in the
 * state column when the corresponding mask bit is set. "faked" is 1 for
 * the entries whose bit is one of the borrowed PG_MMAP/PG_BUFFER/PG_DIRTY/
 * PG_WRITEBACK aliases and 0 for real page flags; how it is consumed is
 * not shown in this snippet.
 *
 * Masks are built with 1UL so the shift happens in the width of the
 * unsigned long 'mask' field; a plain int '1 << n' is undefined for
 * n >= 31 and would sign-extend into the upper bits on 64-bit.
 */
struct {
	unsigned long	mask;
	const char     *name;
	int		faked;
} page_flag [] = {
	{1UL << PG_referenced,	"R:referenced",	0},
	{1UL << PG_active,	"A:active",	0},
	{1UL << PG_MMAP,	"M:mmap",	1},

	{1UL << PG_uptodate,	"U:uptodate",	0},
	{1UL << PG_dirty,	"D:dirty",	0},
	{1UL << PG_writeback,	"W:writeback",	0},
	{1UL << PG_reclaim,	"X:readahead",	0},

	{1UL << PG_private,	"P:private",	0},
	{1UL << PG_owner_priv_1,	"O:owner",	0},

	{1UL << PG_BUFFER,	"b:buffer",	1},
	{1UL << PG_DIRTY,	"d:dirty",	1},
	{1UL << PG_WRITEBACK,	"w:writeback",	1},
};

/*
 * Compute the state bitmask reported for @page.
 *
 * Starts from the real page flags selected by page_mask, then ORs in the
 * synthetic bits:
 *   PG_MMAP       - page_mapped() is true
 *   PG_BUFFER     - page_has_buffers() is true
 *   PG_WRITEBACK  - the page's slot in mapping->page_tree carries
 *                   PAGECACHE_TAG_WRITEBACK
 *   PG_DIRTY      - the slot carries PAGECACHE_TAG_DIRTY
 * The two tag checks are skipped when page_mapping() returns NULL.
 *
 * Bits are formed with 1UL so the shift is done in unsigned long, matching
 * the type of 'flags'; an int shift is undefined for bit 31 and above.
 *
 * Returns the combined bitmask.
 */
static unsigned long page_flags(struct page* page)
{
	unsigned long flags;
	struct address_space *mapping = page_mapping(page);

	flags = page->flags & page_mask;

	if (page_mapped(page))
		flags |= (1UL << PG_MMAP);

	if (page_has_buffers(page))
		flags |= (1UL << PG_BUFFER);

	if (mapping) {
		if (radix_tree_tag_get(&mapping->page_tree,
					page_index(page),
					PAGECACHE_TAG_WRITEBACK))
			flags |= (1UL << PG_WRITEBACK);

		if (radix_tree_tag_get(&mapping->page_tree,
					page_index(page),
					PAGECACHE_TAG_DIRTY))
			flags |= (1UL << PG_DIRTY);
	}

	return flags;
}

/*
 * Decide whether two pages can be merged into one output range.
 *
 * Returns 1 when both the reference counts and the page_flags() state of
 * @page0 and @page match, 0 otherwise. The flag comparison is only made
 * when the counts are equal.
 */
static int pages_similiar(struct page* page0, struct page* page)
{
	return page_count(page0) == page_count(page) &&
	       page_flags(page0) == page_flags(page);
}

static void show_range(struct seq_file *m, struct page* page, unsigned long len)
{
	int i;
	unsigned long flags;

	if (!m || !page)
		return;

	seq_printf(m, "%lu\t%lu\t", page->index, len);

	flags = page_flags(page);
	for (i = 0; i < ARRAY_SIZE(page_flag); i++)
		seq_putc(m, (flags & page_flag[i].mask) ?
					page_flag[i].name[0] : '_');

	seq_printf(m, "\t%d\n", page_count(page));
}

/* Stop after more than this many range lines have been emitted per call. */
#define BATCH_LINES	100
/*
 * Walk @mapping's page tree from index @start and print runs of
 * consecutive, similar pages (see pages_similiar()) to @m, one line per
 * run via show_range().
 *
 * Pages are fetched PAGEVEC_SIZE at a time with radix_tree_gang_lookup().
 * 'page0' is the first page of the run being accumulated, 'start' its
 * index and 'len' the number of pages in it so far.
 *
 * Returns the index at which the next call should resume (the first page
 * of the run that was not yet printed) once the batch limit is exceeded,
 * or ULONG_MAX when the lookup finds no more pages.
 *
 * NOTE(review): no locking or page pinning is visible here; confirm the
 * caller holds rcu_read_lock() or the tree lock around this walk.
 */
static pgoff_t show_file_cache(struct seq_file *m,
				struct address_space *mapping, pgoff_t start)
{
	int i;
	int lines = 0;
	pgoff_t len = 0;
	struct pagevec pvec;
	struct page *page;
	struct page *page0 = NULL;

	for (;;) {
		pagevec_init(&pvec, 0);
		pvec.nr = radix_tree_gang_lookup(&mapping->page_tree,
				(void **)pvec.pages, start + len, PAGEVEC_SIZE);

		if (pvec.nr == 0) {
			/* Flush the pending run; a no-op if none was started. */
			show_range(m, page0, len);
			start = ULONG_MAX;
			goto out;
		}

		if (!page0)
			page0 = pvec.pages[0];

		for (i = 0; i < pvec.nr; i++) {
			page = pvec.pages[i];

			if (page->index == start + len &&
					pages_similiar(page0, page))
				len++;
			else {
				/*
				 * len is still 0 when the very first page found
				 * is not at 'start' (a hole, or the resume page
				 * went away). No run exists yet, so do not
				 * print or count a bogus zero-length range.
				 */
				if (len) {
					show_range(m, page0, len);
					lines++;
				}
				page0 = page;
				start = page->index;
				len = 1;
				if (lines > BATCH_LINES)
					goto out;
			}
		}
	}

out:
	return start;
}

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2010-01-22  1:17 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-01-20 21:57 [PATCH] fs: add fincore(2) (mincore(2) for file descriptors) Chris Frost
2010-01-20 21:57 ` Chris Frost
2010-01-21  1:11 ` Andi Kleen
2010-01-21  1:11   ` Andi Kleen
2010-02-16 18:13   ` Chris Frost
2010-02-21  3:02     ` Andy Isaacson
2010-02-21  3:02       ` Andy Isaacson
2010-02-21  3:25       ` Wu Fengguang
2010-02-21  3:25         ` Wu Fengguang
2010-02-23 16:39         ` Andy Isaacson
2010-02-23 16:39           ` Andy Isaacson
2010-05-07 22:46       ` Cédric Villemain
2010-05-07 22:46         ` Cédric Villemain
2010-01-22  1:17 ` Wu Fengguang [this message]
2010-01-22  1:17   ` Wu Fengguang
2010-01-22  1:29 ` Paul E. McKenney
2010-01-22  1:29   ` Paul E. McKenney
2010-01-26 22:12 ` Andrew Morton
2010-01-26 22:12   ` Andrew Morton
2010-01-28  7:42   ` Steve VanDeBogart
2010-01-28  7:42     ` Steve VanDeBogart
2010-01-28  8:23     ` Andrew Morton
2010-01-28  8:23       ` Andrew Morton
2010-01-28  8:32       ` Steve VanDeBogart
2010-01-28  8:32         ` Steve VanDeBogart
2010-01-28 23:54       ` Andres Freund
2010-01-28 23:54         ` Andres Freund
2010-01-27 18:14 ` Jamie Lokier
2010-01-27 18:14   ` Jamie Lokier
2010-01-28  8:23   ` Steve VanDeBogart
2010-01-28  8:23     ` Steve VanDeBogart
  -- strict thread matches above, loose matches on Subject: below --
2010-01-20 21:57 Chris Frost

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100122011709.GA6700@localhost \
    --to=fengguang.wu@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=andi@firstfloor.org \
    --cc=bhalevy@panasas.com \
    --cc=frost@cs.ucla.edu \
    --cc=heiko.carstens@de.ibm.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mpm@selenic.com \
    --cc=peterz@infradead.org \
    --cc=vandebo-lkml@nerdbox.net \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.