linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Tejun Heo <tj@kernel.org>
To: axboe@kernel.dk
Cc: linux-kernel@vger.kernel.org, jack@suse.cz, hch@infradead.org,
	hannes@cmpxchg.org, linux-fsdevel@vger.kernel.org,
	vgoyal@redhat.com, lizefan@huawei.com, cgroups@vger.kernel.org,
	linux-mm@kvack.org, mhocko@suse.cz, clm@fb.com,
	fengguang.wu@intel.com, david@fromorbit.com,
	Tejun Heo <tj@kernel.org>
Subject: [PATCH 42/45] writeback: make __filemap_fdatawrite_range() croup writeback aware
Date: Tue,  6 Jan 2015 16:26:19 -0500	[thread overview]
Message-ID: <1420579582-8516-43-git-send-email-tj@kernel.org> (raw)
In-Reply-To: <1420579582-8516-1-git-send-email-tj@kernel.org>

__filemap_fdatawrite_range() and its friends are used, among other
things, to implement fsync and coordinate buffered and direct IOs.
The function directly invokes do_writepages() bypassing the usual fs
writeback mechanism and thus currently doesn't respect cgroup
writeback.

This patch adds wb_writeback_work->mapping[_range_{start|end}] which
are used to instruct wb_writeback_work item to execute do_writepages()
on a single mapping.  A new function cgwb_do_writepages() is added
which splits do_writepages() to all dirtying wb's (bdi_writeback's)
using this new work type.  __filemap_fdatawrite_range() is updated to
use cgwb_do_writepages() instead of do_writepages().

cgwb_do_writepages() first tries direct do_writepages() on the current
blkcg as it's likely that the calling cgroup is trying to flush pages
that it dirtied.  If that doesn't write out all pages, it issues
single mappign work items to all wb's with dirty pages on the target
mapping.  It currently doesn't distribute wbc->nr_to_write according
to the bandwidth proportion of each wb.  If this ever becomes
necessary, implementing it shouldn't be too difficult.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
---
 fs/fs-writeback.c           | 149 +++++++++++++++++++++++++++++++++++++++++++-
 include/linux/backing-dev.h |   8 +++
 mm/filemap.c                |   2 +-
 3 files changed, 157 insertions(+), 2 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2bb14d5..cea13fe 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -36,6 +36,9 @@
 
 struct wb_completion {
 	atomic_t		cnt;
+#ifdef CONFIG_CGROUP_WRITEBACK
+	int			mapping_ret;	/* used by works w/ ->mapping */
+#endif
 };
 
 /*
@@ -43,8 +46,19 @@ struct wb_completion {
  */
 struct wb_writeback_work {
 	long nr_pages;
+
 	struct super_block *sb;
 	unsigned long *older_than_this;
+
+	/*
+	 * If ->mapping is set, only that mapping is written out using
+	 * do_writepages().  ->mapping_range_{start|end} are meaningful
+	 * only in such cases.
+	 */
+	struct address_space *mapping;
+	loff_t mapping_range_start;
+	loff_t mapping_range_end;
+
 	enum writeback_sync_modes sync_mode;
 	unsigned int tagged_writepages:1;
 	unsigned int for_kupdate:1;
@@ -697,6 +711,133 @@ static inline bool wbc_skip_metadata(struct writeback_control *wbc)
 	return wbc->iwbl && !iwbl_is_root(wbc->iwbl);
 }
 
+static bool cgwb_do_writepages_split_work(struct inode_wb_link *iwbl,
+					  struct wb_writeback_work *base_work)
+{
+	struct bdi_writeback *wb = iwbl_to_wb(iwbl);
+
+	/* if DIRTY_PAGES isn't visible yet, neither is the dirty data */
+	if (!test_bit(IWBL_DIRTY_PAGES, &iwbl->data))
+		return true;
+
+	return wb_clone_and_queue_work(wb, base_work);
+}
+
+/**
+ * cgwb_do_writepages - cgroup-aware do_writepages()
+ * @mapping: address_space to write out
+ * @wbc: writeback_control in effect
+ *
+ * Write out pages from @mapping according to @wbc.  This function expects
+ * @mapping to be a file backed one.  If cgroup writeback is enabled, the
+ * writes are distributed across the cgroups which dirtied the pages;
+ * otherwise, this is equivalent to do_writepages().  Returns 0 on success,
+ * -errno on failre.
+ */
+int cgwb_do_writepages(struct address_space *mapping,
+		       struct writeback_control *wbc)
+{
+	DEFINE_WB_COMPLETION_ONSTACK(done);
+	struct inode *inode = mapping->host;
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct wb_writeback_work base_work = {
+		.mapping		= mapping,
+		.mapping_range_start	= wbc->range_start,
+		.mapping_range_end	= wbc->range_end,
+		.nr_pages		= wbc->nr_to_write,
+		.sync_mode		= wbc->sync_mode,
+		.tagged_writepages	= wbc->tagged_writepages,
+		.for_kupdate		= wbc->for_kupdate,
+		.range_cyclic		= wbc->range_cyclic,
+		.for_background		= wbc->for_background,
+		.for_sync		= wbc->for_sync,
+		.done			= &done,
+	};
+	struct cgroup_subsys_state *blkcg_css;
+	struct inode_wb_link *iwbl;
+	struct inode_cgwb_link *icgwbl, *n;
+	int last_blkcg_id = 0, ret;
+
+	/*
+	 * The caller is likely flushing the pages it dirtied.  First look
+	 * up the current iwbl and perform do_writepages() directly on it.
+	 * If no page is skipped due to mismatching cgroup, there's nothing
+	 * more to do.
+	 */
+	blkcg_css = task_get_css(current, blkio_cgrp_id);
+	iwbl = iwbl_lookup(inode, blkcg_css);
+	if (iwbl) {
+		wbc_set_iwbl(wbc, iwbl);
+		wbc->iwbl_mismatch = 0;
+
+		ret = do_writepages(mapping, wbc);
+
+		css_put(blkcg_css);
+		if (ret || !wbc->iwbl_mismatch)
+			return ret;
+	} else {
+		css_put(blkcg_css);
+	}
+
+	/*
+	 * Split writes to all dirty iwbl's.  We don't yet implement
+	 * bandwidth-proportional distribution of nr_pages as the only
+	 * current caller, __filemap_fdatawrite_range(), always sets it to
+	 * LONG_MAX.  Implementing proportional distribution would require
+	 * a prepatory pass over dirty iwbl's to calculate the total write
+	 * bandwidth of the involved wb's.
+	 */
+	WARN_ON_ONCE(base_work.nr_pages != LONG_MAX);
+
+	if (!cgwb_do_writepages_split_work(&inode->i_wb_link, &base_work))
+		wb_wait_for_single_work(bdi, &base_work);
+restart_split:
+	rcu_read_lock();
+	inode_for_each_icgwbl(icgwbl, n, inode) {
+		struct inode_wb_link *iwbl = &icgwbl->iwbl;
+		int blkcg_id = iwbl_to_wb(iwbl)->blkcg_css->id;
+
+		if (blkcg_id <= last_blkcg_id)
+			continue;
+
+		if (!cgwb_do_writepages_split_work(iwbl, &base_work)) {
+			rcu_read_unlock();
+			wb_wait_for_single_work(bdi, &base_work);
+			goto restart_split;
+		}
+		last_blkcg_id = blkcg_id;
+	}
+	rcu_read_unlock();
+
+	wb_wait_for_completion(bdi, &done);
+	return done.mapping_ret;
+}
+
+static bool maybe_writeback_single_mapping(struct wb_writeback_work *work)
+{
+	struct wb_completion *done = work->done;
+	struct writeback_control wbc = {
+		.range_start		= work->mapping_range_start,
+		.range_end		= work->mapping_range_end,
+		.nr_to_write		= work->nr_pages,
+		.sync_mode		= work->sync_mode,
+		.tagged_writepages	= work->tagged_writepages,
+		.for_kupdate		= work->for_kupdate,
+		.range_cyclic		= work->range_cyclic,
+		.for_background		= work->for_background,
+		.for_sync		= work->for_sync,
+	};
+	int ret;
+
+	if (!work->mapping)
+		return false;
+
+	ret = do_writepages(work->mapping, &wbc);
+	if (done && ret)
+		done->mapping_ret = ret;
+	return true;
+}
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static void init_cgwb_dirty_page_context(struct dirty_context *dctx)
@@ -809,6 +950,11 @@ static inline bool wbc_skip_metadata(struct writeback_control *wbc)
 	return false;
 }
 
+static bool maybe_writeback_single_mapping(struct wb_writeback_work *work)
+{
+	return false;
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 /**
@@ -1718,7 +1864,8 @@ static long wb_do_writeback(struct bdi_writeback *wb)
 
 		trace_writeback_exec(wb->bdi, work);
 
-		wrote += wb_writeback(wb, work);
+		if (!maybe_writeback_single_mapping(work))
+			wrote += wb_writeback(wb, work);
 
 		if (work->single_wait) {
 			WARN_ON_ONCE(work->auto_free);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 173d218..2456efb 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -281,6 +281,8 @@ int __cgwb_create(struct backing_dev_info *bdi,
 		  struct cgroup_subsys_state *blkcg_css);
 struct inode_wb_link *iwbl_create(struct inode *inode,
 				  struct bdi_writeback *wb);
+int cgwb_do_writepages(struct address_space *mapping,
+		       struct writeback_control *wbc);
 int mapping_congested(struct address_space *mapping, struct task_struct *task,
 		      int bdi_bits);
 
@@ -787,6 +789,12 @@ static inline bool wbc_skip_page(struct writeback_control *wbc,
 	return false;
 }
 
+static inline int cgwb_do_writepages(struct address_space *mapping,
+				     struct writeback_control *wbc)
+{
+	return do_writepages(mapping, wbc);
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 static inline int mapping_read_congested(struct address_space *mapping,
diff --git a/mm/filemap.c b/mm/filemap.c
index faa577d..e858cd1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -284,7 +284,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 	if (!mapping_cap_writeback_dirty(mapping))
 		return 0;
 
-	ret = do_writepages(mapping, &wbc);
+	ret = cgwb_do_writepages(mapping, &wbc);
 	return ret;
 }
 
-- 
2.1.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2015-01-06 21:27 UTC|newest]

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-01-06 21:25 [PATCHSET RFC block/for-next] writeback: cgroup writeback support Tejun Heo
2015-01-06 21:25 ` [PATCH 01/45] writeback: add struct dirty_context Tejun Heo
2015-01-06 21:25 ` [PATCH 02/45] writeback: add {CONFIG|BDI_CAP|FS}_CGROUP_WRITEBACK Tejun Heo
2015-01-06 21:25 ` [PATCH 03/45] memcg: encode page_cgflags in the lower bits of page->mem_cgroup Tejun Heo
2015-01-06 21:25 ` [PATCH 04/45] memcg, writeback: implement memcg_blkcg_ptr Tejun Heo
2015-01-06 21:25 ` [PATCH 05/45] writeback: make backing_dev_info host cgroup-specific bdi_writebacks Tejun Heo
2015-01-06 21:25 ` [PATCH 06/45] writeback, blkcg: associate each blkcg_gq with the corresponding bdi_writeback Tejun Heo
2015-01-06 21:25 ` [PATCH 07/45] writeback: attribute stats to the matching per-cgroup bdi_writeback Tejun Heo
2015-01-06 21:25 ` [PATCH 08/45] writeback: let balance_dirty_pages() work on the matching cgroup bdi_writeback Tejun Heo
2015-01-06 21:25 ` [PATCH 09/45] writeback: make congestion functions per bdi_writeback Tejun Heo
2015-01-06 21:25 ` [PATCH 10/45] writeback, blkcg: restructure blk_{set|clear}_queue_congested() Tejun Heo
2015-01-06 21:25 ` [PATCH 11/45] writeback, blkcg: propagate non-root blkcg congestion state Tejun Heo
2015-01-06 21:25 ` [PATCH 12/45] writeback: implement and use mapping_congested() Tejun Heo
2015-01-06 21:25 ` [PATCH 13/45] writeback: implement WB_has_dirty_io wb_state flag Tejun Heo
2015-01-06 21:25 ` [PATCH 14/45] writeback: implement backing_dev_info->tot_write_bandwidth Tejun Heo
2015-01-06 21:25 ` [PATCH 15/45] writeback: make bdi_has_dirty_io() take multiple bdi_writeback's into account Tejun Heo
2015-01-06 21:25 ` [PATCH 16/45] writeback: don't issue wb_writeback_work if clean Tejun Heo
2015-01-06 21:25 ` [PATCH 17/45] writeback: make bdi->min/max_ratio handling cgroup writeback aware Tejun Heo
2015-01-06 21:25 ` [PATCH 18/45] writeback: implement bdi_for_each_wb() Tejun Heo
2015-01-06 21:25 ` [PATCH 19/45] writeback: remove bdi_start_writeback() Tejun Heo
2015-01-06 21:25 ` [PATCH 20/45] writeback: make laptop_mode_timer_fn() handle multiple bdi_writeback's Tejun Heo
2015-01-06 21:25 ` [PATCH 21/45] writeback: make writeback_in_progress() take bdi_writeback instead of backing_dev_info Tejun Heo
2015-01-06 21:25 ` [PATCH 22/45] writeback: make bdi_start_background_writeback() " Tejun Heo
2015-01-06 21:26 ` [PATCH 23/45] writeback: make wakeup_flusher_threads() handle multiple bdi_writeback's Tejun Heo
2015-01-06 21:26 ` [PATCH 24/45] writeback: add wb_writeback_work->auto_free Tejun Heo
2015-01-06 21:26 ` [PATCH 25/45] writeback: implement bdi_wait_for_completion() Tejun Heo
2015-01-06 21:26 ` [PATCH 26/45] writeback: implement wb_wait_for_single_work() Tejun Heo
2015-01-06 21:26 ` [PATCH 27/45] writeback: restructure try_writeback_inodes_sb[_nr]() Tejun Heo
2015-01-06 21:26 ` [PATCH 28/45] writeback: make writeback initiation functions handle multiple bdi_writeback's Tejun Heo
2015-01-06 21:26 ` [PATCH 29/45] writeback: move i_wb_list emptiness test into inode_wb_list_del() from its caller Tejun Heo
2015-01-06 21:26 ` [PATCH 30/45] vfs, writeback: introduce struct inode_wb_link Tejun Heo
2015-01-06 21:26 ` [PATCH 31/45] vfs, writeback: add inode_wb_link->data point to the associated bdi_writeback Tejun Heo
2015-01-06 21:26 ` [PATCH 32/45] vfs, writeback: move inode->dirtied_when into inode->i_wb_link Tejun Heo
2015-01-06 21:26 ` [PATCH 33/45] writeback: minor reorganization of fs/fs-writeback.c Tejun Heo
2015-01-06 21:26 ` [PATCH 34/45] vfs, writeback: implement support for multiple inode_wb_link's Tejun Heo
2015-01-06 21:26 ` [PATCH 35/45] vfs, writeback: implement inode->i_nr_syncs Tejun Heo
2015-01-06 21:26 ` [PATCH 36/45] writeback: dirty inodes against their matching cgroup bdi_writeback's Tejun Heo
2015-01-06 21:26 ` [PATCH 37/45] writeback: make writeback_control carry the inode_wb_link being served Tejun Heo
2015-01-06 21:26 ` [PATCH 38/45] writeback: make cyclic writeback cursor cgroup writeback aware Tejun Heo
2015-01-06 21:26 ` [PATCH 39/45] writeback: make DIRTY_PAGES tracking " Tejun Heo
2015-01-06 21:26 ` [PATCH 40/45] writeback: make write_cache_pages() " Tejun Heo
2015-01-06 21:26 ` [PATCH 41/45] writeback: make __writeback_single_inode() " Tejun Heo
2015-01-06 21:26 ` Tejun Heo [this message]
2015-01-06 21:26 ` [PATCH 43/45] buffer, writeback: make __block_write_full_page() honor cgroup writeback Tejun Heo
2015-01-06 21:26 ` [PATCH 44/45] mpage: make __mpage_writepage() " Tejun Heo
2015-01-06 21:26 ` [PATCH 45/45] ext2: enable cgroup writeback support Tejun Heo
2015-01-06 21:44 ` [PATCHSET RFC block/for-next] writeback: " Tejun Heo
2015-01-07 23:45   ` Dave Chinner
2015-01-09 21:23     ` Tejun Heo
2015-01-10  0:38       ` Dave Chinner
2015-01-10 15:56         ` Tejun Heo
2015-01-10 16:05           ` Tejun Heo
2015-01-08  9:30 ` Jan Kara
2015-01-09 21:36   ` Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1420579582-8516-43-git-send-email-tj@kernel.org \
    --to=tj@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=cgroups@vger.kernel.org \
    --cc=clm@fb.com \
    --cc=david@fromorbit.com \
    --cc=fengguang.wu@intel.com \
    --cc=hannes@cmpxchg.org \
    --cc=hch@infradead.org \
    --cc=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lizefan@huawei.com \
    --cc=mhocko@suse.cz \
    --cc=vgoyal@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).