linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Wu Fengguang <fengguang.wu@intel.com>
To: "linux-mm@kvack.org" <linux-mm@kvack.org>
Cc: Mel Gorman <mel@csn.ul.ie>, Wu Fengguang <fengguang.wu@intel.com>,
	Rik van Riel <riel@redhat.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Minchan Kim <minchan.kim@gmail.com>,
	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>,
	KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>,
	Jan Kara <jack@suse.cz>,
	Andrew Morton <akpm@linux-foundation.org>
Subject: [PATCH 3/4] writeback: introduce bdi_start_inode_writeback()
Date: Mon, 13 Sep 2010 20:31:13 +0800	[thread overview]
Message-ID: <20100913130150.138758012@intel.com> (raw)
In-Reply-To: 20100913123110.372291929@intel.com

[-- Attachment #1: writeback-bdi_start_inode_writeback.patch --]
[-- Type: text/plain, Size: 6084 bytes --]

This is to transfer dirty pages encountered in page reclaim to the
flusher threads for writeback.

The flusher will piggyback more dirty pages onto the IO:
- it's more IO efficient
- it helps clean more pages, a good number of which may sit in the same
  LRU list that is being scanned.

To avoid memory allocations during page reclaim, a mempool is created.

Background/periodic works will quit automatically, so that the pages
under reclaim are cleaned ASAP. However, the sync work can still block us
for a long time.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c           |  103 +++++++++++++++++++++++++++++++++-
 include/linux/backing-dev.h |    2 
 2 files changed, 102 insertions(+), 3 deletions(-)

--- linux-next.orig/fs/fs-writeback.c	2010-09-13 20:06:09.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2010-09-13 20:24:03.000000000 +0800
@@ -30,11 +30,20 @@
 #include "internal.h"
 
 /*
+ * When flushing an inode page (for page reclaim), try to piggy back more
+ * nearby pages for IO efficiency. These pages will have good opportunity
+ * to be in the same LRU list.
+ */
+#define WRITE_AROUND_PAGES	(1UL << (20 - PAGE_CACHE_SHIFT))
+
+/*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
 struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
+	struct inode *inode;
+	pgoff_t offset;
 	enum writeback_sync_modes sync_mode;
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
@@ -59,6 +68,27 @@ struct wb_writeback_work {
  */
 int nr_pdflush_threads;
 
+static mempool_t *wb_work_mempool;
+
+static void *wb_work_alloc(gfp_t gfp_mask, void *pool_data)
+{
+	/*
+	 * bdi_start_inode_writeback() may be called on page reclaim
+	 */
+	if (current->flags & PF_MEMALLOC)
+		return NULL;
+
+	return kmalloc(sizeof(struct wb_writeback_work), gfp_mask);
+}
+
+static __init int wb_work_init(void)
+{
+	wb_work_mempool = mempool_create(1024,
+					 wb_work_alloc, mempool_kfree, NULL);
+	return wb_work_mempool ? 0 : -ENOMEM;
+}
+fs_initcall(wb_work_init);
+
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
@@ -101,7 +131,7 @@ __bdi_start_writeback(struct backing_dev
 	 * This is WB_SYNC_NONE writeback, so if allocation fails just
 	 * wakeup the thread for old dirty data writeback
 	 */
-	work = kzalloc(sizeof(*work), GFP_ATOMIC);
+	work = mempool_alloc(wb_work_mempool, GFP_NOWAIT);
 	if (!work) {
 		if (bdi->wb.task) {
 			trace_writeback_nowork(bdi);
@@ -110,6 +140,7 @@ __bdi_start_writeback(struct backing_dev
 		return;
 	}
 
+	memset(work, 0, sizeof(*work));
 	work->sync_mode	= WB_SYNC_NONE;
 	work->nr_pages	= nr_pages;
 	work->range_cyclic = range_cyclic;
@@ -148,6 +179,55 @@ void bdi_start_background_writeback(stru
 	__bdi_start_writeback(bdi, LONG_MAX, true, true);
 }
 
+int bdi_start_inode_writeback(struct backing_dev_info *bdi,
+			      struct inode *inode, pgoff_t offset)
+{
+	struct wb_writeback_work *work;
+
+	spin_lock_bh(&bdi->wb_lock);
+	list_for_each_entry_reverse(work, &bdi->work_list, list) {
+		unsigned long end;
+		if (work->inode != inode)
+			continue;
+		end = work->offset + work->nr_pages;
+		if (work->offset - offset < WRITE_AROUND_PAGES) {
+			work->nr_pages += work->offset - offset;
+			work->offset = offset;
+			inode = NULL;
+			break;
+		} else if (offset - end < WRITE_AROUND_PAGES) {
+			work->nr_pages += offset - end;
+			inode = NULL;
+			break;
+		} else if (offset > work->offset &&
+			   offset < end) {
+			inode = NULL;
+			break;
+		}
+	}
+	spin_unlock_bh(&bdi->wb_lock);
+
+	if (!inode)
+		return 0;
+
+	if (!igrab(inode))
+		return -ENOENT;
+
+	work = mempool_alloc(wb_work_mempool, GFP_NOWAIT);
+	if (!work)
+		return -ENOMEM;
+
+	memset(work, 0, sizeof(*work));
+	work->sync_mode		= WB_SYNC_NONE;
+	work->inode		= inode;
+	work->offset		= offset;
+	work->nr_pages		= 1;
+
+	bdi_queue_work(inode->i_sb->s_bdi, work);
+
+	return 0;
+}
+
 /*
  * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
  * furthest end of its superblock's dirty-inode list.
@@ -724,6 +804,20 @@ get_next_work_item(struct backing_dev_in
 	return work;
 }
 
+static long wb_flush_inode(struct bdi_writeback *wb,
+			   struct wb_writeback_work *work)
+{
+	pgoff_t start = round_down(work->offset, WRITE_AROUND_PAGES);
+	pgoff_t end = round_up(work->offset + work->nr_pages,
+			       WRITE_AROUND_PAGES);
+	int wrote;
+
+	wrote = __filemap_fdatawrite_range(work->inode->i_mapping,
+					   start, end, WB_SYNC_NONE);
+	iput(work->inode);
+	return wrote;
+}
+
 static long wb_check_background_flush(struct bdi_writeback *wb)
 {
 	if (over_bground_thresh()) {
@@ -796,7 +890,10 @@ long wb_do_writeback(struct bdi_writebac
 
 		trace_writeback_exec(bdi, work);
 
-		wrote += wb_writeback(wb, work);
+		if (work->inode)
+			wrote += wb_flush_inode(wb, work);
+		else
+			wrote += wb_writeback(wb, work);
 
 		/*
 		 * Notify the caller of completion if this is a synchronous
@@ -805,7 +902,7 @@ long wb_do_writeback(struct bdi_writebac
 		if (work->done)
 			complete(work->done);
 		else
-			kfree(work);
+			mempool_free(work, wb_work_mempool);
 	}
 
 	/*
--- linux-next.orig/include/linux/backing-dev.h	2010-09-13 19:48:16.000000000 +0800
+++ linux-next/include/linux/backing-dev.h	2010-09-13 20:20:59.000000000 +0800
@@ -106,6 +106,8 @@ void bdi_unregister(struct backing_dev_i
 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
+int bdi_start_inode_writeback(struct backing_dev_info *bdi,
+			      struct inode *inode, pgoff_t offset);
 int bdi_writeback_thread(void *data);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_arm_supers_timer(void);


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2010-09-13 13:08 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-09-13 12:31 [PATCH 0/4] [RFC] transfer vmscan pageout works to the flusher thread Wu Fengguang
2010-09-13 12:31 ` [PATCH 1/4] writeback: integrated background work Wu Fengguang
2010-09-13 22:46   ` Minchan Kim
2010-09-14  0:53     ` Wu Fengguang
2010-09-14 12:45   ` Jan Kara
2010-09-13 12:31 ` [PATCH 2/4] writeback: quit background/periodic work when other works are enqueued Wu Fengguang
2010-09-14 12:40   ` Jan Kara
2010-11-01 12:07     ` Wu Fengguang
2010-11-01 15:08       ` Jan Kara
2010-11-01 15:13       ` Jan Kara
2010-11-01 12:14     ` [PATCH 1/2] writeback: integrated background writeback work Wu Fengguang
2010-11-01 12:22       ` [PATCH 2/2] writeback: stop background/kupdate works from livelocking other works Wu Fengguang
2010-11-01 15:22         ` Christoph Hellwig
2010-11-02  1:57         ` Minchan Kim
2010-11-05 12:15         ` Johannes Weiner
2010-11-01 15:21       ` [PATCH 1/2] writeback: integrated background writeback work Christoph Hellwig
2010-11-01 20:37         ` Wu Fengguang
2010-11-01 20:39         ` [PATCH 1/2 v2] " Wu Fengguang
2010-11-02  1:55           ` Minchan Kim
2010-11-05 12:01           ` Johannes Weiner
2010-09-13 12:31 ` Wu Fengguang [this message]
2010-09-14 13:36   ` [PATCH 3/4] writeback: introduce bdi_start_inode_writeback() Jan Kara
2010-11-01 12:35     ` Wu Fengguang
2010-09-13 12:31 ` [PATCH 4/4] vmscan: transfer async file writeback to the flusher Wu Fengguang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100913130150.138758012@intel.com \
    --to=fengguang.wu@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=hannes@cmpxchg.org \
    --cc=jack@suse.cz \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=kosaki.motohiro@jp.fujitsu.com \
    --cc=linux-mm@kvack.org \
    --cc=mel@csn.ul.ie \
    --cc=minchan.kim@gmail.com \
    --cc=riel@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).