From: Mel Gorman <mel@csn.ul.ie>
To: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mm@kvack.org
Cc: Dave Chinner <david@fromorbit.com>,
	Chris Mason <chris.mason@oracle.com>,
	Nick Piggin <npiggin@suse.de>, Rik van Riel <riel@redhat.com>,
	Mel Gorman <mel@csn.ul.ie>
Subject: [PATCH 5/6] vmscan: Write out ranges of pages contiguous to the inode where possible
Date: Tue,  8 Jun 2010 10:02:24 +0100
Message-ID: <1275987745-21708-6-git-send-email-mel@csn.ul.ie>
In-Reply-To: <1275987745-21708-1-git-send-email-mel@csn.ul.ie>

Page reclaim cleans individual pages using a_ops->writepage() because,
from the VM's perspective, pages in a particular zone must be freed
soon, the target page is considered the oldest and the VM does not want
to wait while background flushers clean other pages. From a filesystem
perspective this is extremely inefficient as it generates a very seeky
IO pattern, leading to the perverse situation where it can take longer
to clean all the dirty pages than it would have otherwise.

This patch recognises that there are cases where a number of pages
belonging to the same inode are being written out. When this happens
and the filesystem implements ->writepages(), the range of pages is
written out with a_ops->writepages. The inode is pinned and the page
lock released before submitting the range to the filesystem. While this
potentially means that more pages are cleaned than strictly necessary,
the expectation is that the filesystem will be able to write the pages
out more efficiently and improve overall performance.
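
To make the mechanism concrete before the diff, here is a minimal
sketch of what the range submission boils down to once a contiguous
[range_start, range_end] interval for a single mapping has been built.
first_page and last_page are placeholder names for the first and last
page found in the range, and cursor is the locked page that anchors it;
the real clean_page_list() below additionally handles the page locks of
the other pages, the may_enter_fs checks and a single-page fallback:

	struct writeback_control wbc = {
		.sync_mode   = WB_SYNC_NONE,
		.nr_to_write = LONG_MAX,
		/* byte range covering the contiguous dirty pages */
		.range_start = page_offset(first_page),
		.range_end   = page_offset(last_page) + PAGE_CACHE_SIZE - 1,
	};
	/* pin the inode under the page lock so the mapping stays valid */
	struct inode *inode = igrab(mapping->host);

	unlock_page(cursor);
	if (inode) {
		/* let the filesystem write out the whole range at once */
		do_writepages(mapping, &wbc);
		iput(inode);
	}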

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
---
 mm/vmscan.c |  220 +++++++++++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 176 insertions(+), 44 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 58527c4..b2eb2a6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -323,6 +323,55 @@ typedef enum {
 	PAGE_CLEAN,
 } pageout_t;
 
+int write_reclaim_page(struct page *page, struct address_space *mapping,
+						enum pageout_io sync_writeback)
+{
+	int res;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE,
+		.nr_to_write = SWAP_CLUSTER_MAX,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+		.nonblocking = 1,
+		.for_reclaim = 1,
+	};
+
+	if (!clear_page_dirty_for_io(page))
+		return PAGE_CLEAN;
+
+	SetPageReclaim(page);
+	res = mapping->a_ops->writepage(page, &wbc);
+	/*
+	 * XXX: This is the Holy Hand Grenade of PotentiallyInvalidMapping. As
+	 * the page lock has been dropped by ->writepage, that mapping could
+	 * be anything
+	 */
+	if (res < 0)
+		handle_write_error(mapping, page, res);
+	if (res == AOP_WRITEPAGE_ACTIVATE) {
+		ClearPageReclaim(page);
+		return PAGE_ACTIVATE;
+	}
+
+	/*
+	 * Wait on writeback if requested to. This happens when
+	 * direct reclaiming a large contiguous area and the
+	 * first attempt to free a range of pages fails.
+	 */
+	if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
+		wait_on_page_writeback(page);
+
+	if (!PageWriteback(page)) {
+		/* synchronous write or broken a_ops? */
+		ClearPageReclaim(page);
+	}
+	trace_mm_vmscan_writepage(page,
+		sync_writeback == PAGEOUT_IO_SYNC);
+	inc_zone_page_state(page, NR_VMSCAN_WRITE);
+
+	return PAGE_SUCCESS;
+}
+
 /*
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
@@ -367,45 +416,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 	if (!may_write_to_queue(mapping->backing_dev_info))
 		return PAGE_KEEP;
 
-	if (clear_page_dirty_for_io(page)) {
-		int res;
-		struct writeback_control wbc = {
-			.sync_mode = WB_SYNC_NONE,
-			.nr_to_write = SWAP_CLUSTER_MAX,
-			.range_start = 0,
-			.range_end = LLONG_MAX,
-			.nonblocking = 1,
-			.for_reclaim = 1,
-		};
-
-		SetPageReclaim(page);
-		res = mapping->a_ops->writepage(page, &wbc);
-		if (res < 0)
-			handle_write_error(mapping, page, res);
-		if (res == AOP_WRITEPAGE_ACTIVATE) {
-			ClearPageReclaim(page);
-			return PAGE_ACTIVATE;
-		}
-
-		/*
-		 * Wait on writeback if requested to. This happens when
-		 * direct reclaiming a large contiguous area and the
-		 * first attempt to free a range of pages fails.
-		 */
-		if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
-			wait_on_page_writeback(page);
-
-		if (!PageWriteback(page)) {
-			/* synchronous write or broken a_ops? */
-			ClearPageReclaim(page);
-		}
-		trace_mm_vmscan_writepage(page,
-			sync_writeback == PAGEOUT_IO_SYNC);
-		inc_zone_page_state(page, NR_VMSCAN_WRITE);
-		return PAGE_SUCCESS;
-	}
-
-	return PAGE_CLEAN;
+	return write_reclaim_page(page, mapping, sync_writeback);
 }
 
 /*
@@ -621,20 +632,120 @@ static enum page_references page_check_references(struct page *page,
 }
 
 /*
+ * Clean a list of pages in contiguous ranges where possible. It is expected
+ * that all the pages on page_list have been locked as part of isolation from
+ * the LRU
+ *
+ * XXX: Is there a problem with holding multiple page locks like this?
+ */
+static noinline_for_stack void clean_page_list(struct list_head *page_list,
+				struct scan_control *sc)
+{
+	LIST_HEAD(ret_pages);
+	struct page *cursor, *page, *tmp;
+
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE,
+	};
+
+	if (!sc->may_writepage)
+		return;
+
+	/* Write the pages out to disk in ranges where possible */
+	while (!list_empty(page_list)) {
+		struct address_space *mapping;
+		bool may_enter_fs;
+
+		cursor = lru_to_page(page_list);
+		list_del(&cursor->lru);
+		list_add(&cursor->lru, &ret_pages);
+		mapping = page_mapping(cursor);
+		if (!mapping || !may_write_to_queue(mapping->backing_dev_info)) {
+			unlock_page(cursor);
+			continue;
+		}
+
+		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+			(PageSwapCache(cursor) && (sc->gfp_mask & __GFP_IO));
+		if (!may_enter_fs) {
+			unlock_page(cursor);
+			continue;
+		}
+
+		wbc.nr_to_write = LONG_MAX;
+		wbc.range_start = page_offset(cursor);
+		wbc.range_end = page_offset(cursor) + PAGE_CACHE_SIZE - 1;
+
+		/* Only search if there is an inode to pin the address_space with */
+		if (!mapping->host)
+			goto writeout;
+
+		/* Only search if the address_space is smart about ranges */
+		if (!mapping->a_ops->writepages)
+			goto writeout;
+
+		/* Find a range of pages to clean within this list */
+		list_for_each_entry_safe(page, tmp, page_list, lru) {
+			if (!PageDirty(page) || PageWriteback(page))
+				continue;
+			if (page_mapping(page) != mapping)
+				continue;
+
+			list_del(&page->lru);
+			unlock_page(page);
+			list_add(&page->lru, &ret_pages);
+
+			wbc.range_start = min(wbc.range_start, page_offset(page));
+			wbc.range_end = max(wbc.range_end,
+				(page_offset(page) + PAGE_CACHE_SIZE - 1));
+		}
+
+writeout:
+		if (wbc.range_start == wbc.range_end - PAGE_CACHE_SIZE + 1) {
+			/* Write single page */
+			switch (write_reclaim_page(cursor, mapping, PAGEOUT_IO_ASYNC)) {
+			case PAGE_KEEP:
+			case PAGE_ACTIVATE:
+			case PAGE_CLEAN:
+				unlock_page(cursor);
+				break;
+			case PAGE_SUCCESS:
+				break;
+			}
+		} else {
+			/* Grab inode under page lock before writing range */
+			struct inode *inode = igrab(mapping->host);
+			unlock_page(cursor);
+			if (inode) {
+				do_writepages(mapping, &wbc);
+				iput(inode);
+			}
+		}
+	}
+	list_splice(&ret_pages, page_list);
+}
+
+/*
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
 					struct scan_control *sc,
 					enum pageout_io sync_writeback)
 {
-	LIST_HEAD(ret_pages);
+	LIST_HEAD(putback_pages);
+	LIST_HEAD(dirty_pages);
+	struct list_head *ret_list = page_list;
 	struct pagevec freed_pvec;
-	int pgactivate = 0;
+	int pgactivate;
+	bool cleaned = false;
 	unsigned long nr_reclaimed = 0;
 
+	pgactivate = 0;
 	cond_resched();
 
 	pagevec_init(&freed_pvec, 1);
+
+restart_dirty:
 	while (!list_empty(page_list)) {
 		enum page_references references;
 		struct address_space *mapping;
@@ -723,7 +834,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			}
 		}
 
-		if (PageDirty(page)) {
+		if (PageDirty(page)) {
+			/*
+			 * On the first pass, dirty pages are put on a separate
+			 * list. IO is then queued based on ranges of pages for
+			 * each unique mapping in the list
+			 */
+			if (!cleaned) {
+				/* Keep locked for clean_page_list */
+				list_add(&page->lru, &dirty_pages);
+				goto keep_dirty;
+			}
+
 			if (references == PAGEREF_RECLAIM_CLEAN)
 				goto keep_locked;
 			if (!may_enter_fs)
@@ -832,10 +954,20 @@ activate_locked:
 keep_locked:
 		unlock_page(page);
 keep:
-		list_add(&page->lru, &ret_pages);
+		list_add(&page->lru, &putback_pages);
+keep_dirty:
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
-	list_splice(&ret_pages, page_list);
+
+	if (!cleaned && !list_empty(&dirty_pages)) {
+		clean_page_list(&dirty_pages, sc);
+		page_list = &dirty_pages;
+		cleaned = true;
+		goto restart_dirty;
+	}
+	BUG_ON(!list_empty(&dirty_pages));
+
+	list_splice(&putback_pages, ret_list);
 	if (pagevec_count(&freed_pvec))
 		__pagevec_free(&freed_pvec);
 	count_vm_events(PGACTIVATE, pgactivate);
-- 
1.7.1


Thread overview: 67+ messages
2010-06-08  9:02 [RFC PATCH 0/6] Do not call ->writepage[s] from direct reclaim and use a_ops->writepages() where possible Mel Gorman
2010-06-08  9:02 ` [PATCH 1/6] tracing, vmscan: Add trace events for kswapd wakeup, sleeping and direct reclaim Mel Gorman
2010-06-08  9:02 ` [PATCH 2/6] tracing, vmscan: Add trace events for LRU page isolation Mel Gorman
2010-06-08  9:02 ` [PATCH 3/6] tracing, vmscan: Add trace event when a page is written Mel Gorman
2010-06-08  9:02 ` [PATCH 4/6] tracing, vmscan: Add a postprocessing script for reclaim-related ftrace events Mel Gorman
2010-06-08  9:02 ` Mel Gorman [this message]
2010-06-11  6:10   ` [PATCH 5/6] vmscan: Write out ranges of pages contiguous to the inode where possible Andrew Morton
2010-06-11 12:49     ` Mel Gorman
2010-06-11 19:07       ` Andrew Morton
2010-06-11 20:44         ` Mel Gorman
2010-06-11 21:33           ` Andrew Morton
2010-06-12  0:17             ` Mel Gorman
2010-06-11 16:27     ` Christoph Hellwig
2010-06-08  9:02 ` [PATCH 6/6] vmscan: Do not writeback pages in direct reclaim Mel Gorman
2010-06-11  6:17   ` Andrew Morton
2010-06-11 12:54     ` Mel Gorman
2010-06-11 16:25     ` Christoph Hellwig
2010-06-11 17:43       ` Andrew Morton
2010-06-11 17:49         ` Christoph Hellwig
2010-06-11 18:13           ` Mel Gorman
2010-06-08  9:08 ` [RFC PATCH 0/6] Do not call ->writepage[s] from direct reclaim and use a_ops->writepages() where possible Christoph Hellwig
2010-06-08  9:28   ` Mel Gorman
2010-06-11 16:29     ` Christoph Hellwig
2010-06-11 18:15       ` Mel Gorman
2010-06-11 19:12       ` Chris Mason
2010-06-09  2:52 ` KAMEZAWA Hiroyuki
2010-06-09  9:52   ` Mel Gorman
2010-06-10  0:38     ` KAMEZAWA Hiroyuki
2010-06-10  1:10       ` Mel Gorman
2010-06-10  1:29         ` KAMEZAWA Hiroyuki
2010-06-11  5:57 ` Andrew Morton
2010-06-11 12:33   ` Mel Gorman
2010-06-11 16:30     ` Christoph Hellwig
2010-06-11 18:17       ` Mel Gorman
2010-06-15 14:00 ` Andrea Arcangeli
2010-06-15 14:11   ` Christoph Hellwig
2010-06-15 14:22     ` Andrea Arcangeli
2010-06-15 14:43       ` Christoph Hellwig
2010-06-15 15:08         ` Andrea Arcangeli
2010-06-15 15:25           ` Christoph Hellwig
2010-06-15 15:45             ` Andrea Arcangeli
2010-06-15 16:26               ` Christoph Hellwig
2010-06-15 16:31                 ` Andrea Arcangeli
2010-06-15 16:49                 ` Rik van Riel
2010-06-15 16:54                   ` Christoph Hellwig
2010-06-15 19:13                     ` Rik van Riel
2010-06-15 19:17                       ` Christoph Hellwig
2010-06-15 19:44                         ` Chris Mason
2010-06-16  7:57                       ` Nick Piggin
2010-06-16 16:59                         ` Rik van Riel
2010-06-16 17:04                           ` Andrea Arcangeli
2010-06-15 16:54                   ` Nick Piggin
2010-06-15 15:38           ` Mel Gorman
2010-06-15 16:14             ` Andrea Arcangeli
2010-06-15 16:22               ` Christoph Hellwig
2010-06-15 16:30               ` Mel Gorman
2010-06-15 16:34                 ` Mel Gorman
2010-06-15 16:54                   ` Andrea Arcangeli
2010-06-15 16:35                 ` Christoph Hellwig
2010-06-15 16:37                 ` Andrea Arcangeli
2010-06-15 17:43                   ` Christoph Hellwig
2010-06-15 16:45               ` Christoph Hellwig
2010-06-15 14:51   ` Mel Gorman
2010-06-15 14:55     ` Rik van Riel
2010-06-15 15:08     ` Nick Piggin
2010-06-15 15:10       ` Mel Gorman
2010-06-15 16:28     ` Andrea Arcangeli
