linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Wu Fengguang <fengguang.wu@intel.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>,
	LKML <linux-kernel@vger.kernel.org>
Cc: "linux-fsdevel@vger.kernel.org" <linux-fsdevel@vger.kernel.org>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Chris Mason <chris.mason@oracle.com>, Nick Piggin <npiggin@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>, Mel Gorman <mel@csn.ul.ie>
Cc: Minchan Kim <minchan.kim@gmail.com>
Subject: [PATCH 4/5] writeback: introduce bdi_start_inode_writeback()
Date: Thu, 29 Jul 2010 19:51:46 +0800	[thread overview]
Message-ID: <20100729121423.613727382@intel.com> (raw)
In-Reply-To: 20100729115142.102255590@intel.com

[-- Attachment #1: writeback-bdi_start_inode_writeback.patch --]
[-- Type: text/plain, Size: 5350 bytes --]

This is to transfer dirty pages encountered in page reclaim to the
flusher threads for writeback.

The flusher will piggyback more dirty pages for IO, yielding more
efficient IO.

To avoid memory allocations at page reclaim, a mempool is created.
TODO: more adaptive mempool size.

Background works will be kicked to clean the pages under reclaim ASAP.
TODO: sync_works is temporarily reused for convenience.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c           |   69 ++++++++++++++++++++++++++++++++--
 include/linux/backing-dev.h |    1 
 2 files changed, 66 insertions(+), 4 deletions(-)

--- linux-next.orig/fs/fs-writeback.c	2010-07-29 17:13:58.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2010-07-29 17:49:05.000000000 +0800
@@ -35,12 +35,15 @@
 struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
+	struct inode *inode;
+	pgoff_t offset;
 	enum writeback_sync_modes sync_mode;
 	unsigned long sync_after;
 	unsigned int for_sync:1;
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
+	unsigned int for_reclaim:1;
 
 	struct list_head list;		/* pending work list */
 	struct completion *done;	/* set if the caller waits */
@@ -61,6 +64,27 @@ struct wb_writeback_work {
  */
 int nr_pdflush_threads;
 
+static mempool_t *wb_work_mempool;
+
+static void *wb_work_alloc(gfp_t gfp_mask, void *pool_data)
+{
+	/*
+	 * bdi_start_inode_writeback() may be called on page reclaim
+	 */
+	if (current->flags & PF_MEMALLOC)
+		return NULL;
+
+	return kmalloc(sizeof(struct wb_writeback_work), gfp_mask);
+}
+
+static __init int wb_work_init(void)
+{
+	wb_work_mempool = mempool_create(10240, /* XXX: better number */
+					 wb_work_alloc, mempool_kfree, NULL);
+	return wb_work_mempool ? 0 : -ENOMEM;
+}
+fs_initcall(wb_work_init);
+
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
@@ -80,7 +104,7 @@ static void bdi_queue_work(struct backin
 
 	spin_lock(&bdi->wb_lock);
 	list_add_tail(&work->list, &bdi->work_list);
-	if (work->for_sync)
+	if (work->for_sync || work->for_reclaim)
 		atomic_inc(&bdi->wb.sync_works);
 	spin_unlock(&bdi->wb_lock);
 
@@ -109,7 +133,7 @@ __bdi_start_writeback(struct backing_dev
 	 * This is WB_SYNC_NONE writeback, so if allocation fails just
 	 * wakeup the thread for old dirty data writeback
 	 */
-	work = kzalloc(sizeof(*work), GFP_ATOMIC);
+	work = mempool_alloc(wb_work_mempool, GFP_NOWAIT);
 	if (!work) {
 		if (bdi->wb.task) {
 			trace_writeback_nowork(bdi);
@@ -118,6 +142,7 @@ __bdi_start_writeback(struct backing_dev
 		return;
 	}
 
+	memset(work, 0, sizeof(*work));
 	work->sync_mode	= WB_SYNC_NONE;
 	work->nr_pages	= nr_pages;
 	work->range_cyclic = range_cyclic;
@@ -156,6 +181,26 @@ void bdi_start_background_writeback(stru
 	__bdi_start_writeback(bdi, LONG_MAX, true, true);
 }
 
+void bdi_start_inode_writeback(struct inode *inode, pgoff_t offset)
+{
+	struct wb_writeback_work *work;
+
+	if (!igrab(inode))
+		return;
+
+	work = mempool_alloc(wb_work_mempool, GFP_NOWAIT);
+	if (!work)
+		return;
+
+	memset(work, 0, sizeof(*work));
+	work->sync_mode		= WB_SYNC_NONE;
+	work->inode		= inode;
+	work->offset		= offset;
+	work->for_reclaim	= 1;
+
+	bdi_queue_work(inode->i_sb->s_bdi, work);
+}
+
 /*
  * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
  * furthest end of its superblock's dirty-inode list.
@@ -618,6 +663,22 @@ static long wb_writeback(struct bdi_writ
 	long wrote = 0;
 	struct inode *inode;
 
+	if (work->for_reclaim) {
+		struct page *page = find_get_page(work->inode->i_mapping,
+						  work->offset);
+		wrote = __filemap_fdatawrite_range( /* XXX: write around */
+					work->inode->i_mapping,
+					work->offset,
+					work->offset + MAX_WRITEBACK_PAGES,
+					WB_SYNC_NONE);
+		if (page && PageWriteback(page))
+			SetPageReclaim(page);
+		if (page)
+			page_cache_release(page);
+		iput(work->inode);
+		return wrote;
+	}
+
 	if (!wbc.range_cyclic) {
 		wbc.range_start = 0;
 		wbc.range_end = LLONG_MAX;
@@ -771,7 +832,7 @@ long wb_do_writeback(struct bdi_writebac
 
 		wrote += wb_writeback(wb, work);
 
-		if (work->for_sync)
+		if (work->for_sync || work->for_reclaim)
 			atomic_dec(&wb->sync_works);
 
 		/*
@@ -781,7 +842,7 @@ long wb_do_writeback(struct bdi_writebac
 		if (work->done)
 			complete(work->done);
 		else
-			kfree(work);
+			mempool_free(work, wb_work_mempool);
 	}
 
 	/*
--- linux-next.orig/include/linux/backing-dev.h	2010-07-29 17:13:31.000000000 +0800
+++ linux-next/include/linux/backing-dev.h	2010-07-29 17:47:58.000000000 +0800
@@ -108,6 +108,7 @@ void bdi_unregister(struct backing_dev_i
 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
+void bdi_start_inode_writeback(struct inode *inode, pgoff_t offset);
 int bdi_writeback_thread(void *data);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_arm_supers_timer(void);


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2010-07-29 11:51 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-07-29 11:51 [PATCH 0/5] [RFC] transfer ASYNC vmscan writeback IO to the flusher threads Wu Fengguang
2010-07-29 11:51 ` [PATCH 1/5] writeback: introduce wbc.for_sync to cover the two sync stages Wu Fengguang
2010-07-29 15:04   ` Jan Kara
2010-07-30  5:10     ` Wu Fengguang
2010-07-29 11:51 ` [PATCH 2/5] writeback: stop periodic/background work on seeing sync works Wu Fengguang
2010-07-29 16:20   ` Jan Kara
2010-07-30  4:03     ` Wu Fengguang
2010-08-02 20:51       ` Jan Kara
2010-08-03  3:01         ` Wu Fengguang
2010-08-03 10:55           ` Jan Kara
2010-08-03 12:39             ` Jan Kara
2010-08-03 12:59               ` Wu Fengguang
2010-08-03 13:18                 ` Jan Kara
2010-08-03 13:22                 ` Wu Fengguang
2010-08-03 13:44                   ` Wu Fengguang
2010-08-03 13:48                     ` Wu Fengguang
2010-08-03 14:36             ` Wu Fengguang
2010-07-29 11:51 ` [PATCH 3/5] writeback: prevent sync livelock with the sync_after timestamp Wu Fengguang
2010-07-29 15:02   ` Jan Kara
2010-07-30  5:17     ` Wu Fengguang
2010-07-29 11:51 ` Wu Fengguang [this message]
2010-07-29 11:51 ` [PATCH 5/5] vmscan: transfer async file writeback to the flusher Wu Fengguang
2010-07-29 16:09 ` [PATCH 0/5] [RFC] transfer ASYNC vmscan writeback IO to the flusher threads Jan Kara
2010-07-30  5:34   ` Wu Fengguang
2010-07-29 23:23 ` Dave Chinner
2010-07-30  7:58   ` Wu Fengguang
2010-07-30  9:22     ` KOSAKI Motohiro
2010-07-30 12:25       ` Wu Fengguang
2010-07-30 11:12     ` Dave Chinner
2010-07-30 13:18       ` Wu Fengguang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100729121423.613727382@intel.com \
    --to=fengguang.wu@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).