* [PATCH 1/9] writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb()
  2015-05-22 22:36 [PATCHSET 3/3 v3 block/for-4.2/core] writeback: implement foreign cgroup inode bdi_writeback switching Tejun Heo
@ 2015-05-22 22:36 ` Tejun Heo
  0 siblings, 0 replies; 11+ messages in thread
From: Tejun Heo @ 2015-05-22 22:36 UTC (permalink / raw)
  To: axboe
  Cc: linux-kernel, jack, hch, hannes, linux-fsdevel, vgoyal, lizefan,
	cgroups, linux-mm, mhocko, clm, fengguang.wu, david, gthelen,
	khlebnikov, Tejun Heo
Currently, majority of cgroup writeback support including all the
above functions are implemented in include/linux/backing-dev.h and
mm/backing-dev.c; however, the portion closely related to writeback
logic implemented in include/linux/writeback.h and mm/page-writeback.c
will expand to support foreign writeback detection and correction.
This patch moves wb[_try]_get() and wb_put() to
include/linux/backing-dev-defs.h so that they can be used from
writeback.h and inode_{attach|detach}_wb() to writeback.h and
page-writeback.c.
This is pure reorganization and doesn't introduce any functional
changes.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
---
 fs/fs-writeback.c                | 31 +++++++++++++++
 include/linux/backing-dev-defs.h | 50 ++++++++++++++++++++++++
 include/linux/backing-dev.h      | 82 ----------------------------------------
 include/linux/writeback.h        | 46 ++++++++++++++++++++++
 mm/backing-dev.c                 | 30 ---------------
 5 files changed, 127 insertions(+), 112 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index da35587..cf6ccfb 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/tracepoint.h>
 #include <linux/device.h>
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
@@ -213,6 +214,36 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
+void __inode_attach_wb(struct inode *inode, struct page *page)
+{
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb = NULL;
+
+	if (inode_cgwb_enabled(inode)) {
+		struct cgroup_subsys_state *memcg_css;
+
+		if (page) {
+			memcg_css = mem_cgroup_css_from_page(page);
+			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+		} else {
+			/* must pin memcg_css, see wb_get_create() */
+			memcg_css = task_get_css(current, memory_cgrp_id);
+			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+			css_put(memcg_css);
+		}
+	}
+
+	if (!wb)
+		wb = &bdi->wb;
+
+	/*
+	 * There may be multiple instances of this function racing to
+	 * update the same inode.  Use cmpxchg() to tell the winner.
+	 */
+	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
+		wb_put(wb);
+}
+
 /**
  * inode_congested - test whether an inode is congested
  * @inode: inode to test for congestion
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 8d470b7..e047b49 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -186,4 +186,54 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync)
 	set_wb_congested(bdi->wb.congested, sync);
 }
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/**
+ * wb_tryget - try to increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		return percpu_ref_tryget(&wb->refcnt);
+	return true;
+}
+
+/**
+ * wb_get - increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline void wb_get(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		percpu_ref_get(&wb->refcnt);
+}
+
+/**
+ * wb_put - decrement a wb's refcount
+ * @wb: bdi_writeback to put
+ */
+static inline void wb_put(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		percpu_ref_put(&wb->refcnt);
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+	return true;
+}
+
+static inline void wb_get(struct bdi_writeback *wb)
+{
+}
+
+static inline void wb_put(struct bdi_writeback *wb)
+{
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 #endif	/* __LINUX_BACKING_DEV_DEFS_H */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e9d7373..5c978a9 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -243,7 +243,6 @@ void wb_congested_put(struct bdi_writeback_congested *congested);
 struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 				    struct cgroup_subsys_state *memcg_css,
 				    gfp_t gfp);
-void __inode_attach_wb(struct inode *inode, struct page *page);
 void wb_memcg_offline(struct mem_cgroup *memcg);
 void wb_blkcg_offline(struct blkcg *blkcg);
 int inode_congested(struct inode *inode, int cong_bits);
@@ -265,37 +264,6 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
 }
 
 /**
- * wb_tryget - try to increment a wb's refcount
- * @wb: bdi_writeback to get
- */
-static inline bool wb_tryget(struct bdi_writeback *wb)
-{
-	if (wb != &wb->bdi->wb)
-		return percpu_ref_tryget(&wb->refcnt);
-	return true;
-}
-
-/**
- * wb_get - increment a wb's refcount
- * @wb: bdi_writeback to get
- */
-static inline void wb_get(struct bdi_writeback *wb)
-{
-	if (wb != &wb->bdi->wb)
-		percpu_ref_get(&wb->refcnt);
-}
-
-/**
- * wb_put - decrement a wb's refcount
- * @wb: bdi_writeback to put
- */
-static inline void wb_put(struct bdi_writeback *wb)
-{
-	if (wb != &wb->bdi->wb)
-		percpu_ref_put(&wb->refcnt);
-}
-
-/**
  * wb_find_current - find wb for %current on a bdi
  * @bdi: bdi of interest
  *
@@ -354,35 +322,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
 }
 
 /**
- * inode_attach_wb - associate an inode with its wb
- * @inode: inode of interest
- * @page: page being dirtied (may be NULL)
- *
- * If @inode doesn't have its wb, associate it with the wb matching the
- * memcg of @page or, if @page is NULL, %current.  May be called w/ or w/o
- * @inode->i_lock.
- */
-static inline void inode_attach_wb(struct inode *inode, struct page *page)
-{
-	if (!inode->i_wb)
-		__inode_attach_wb(inode, page);
-}
-
-/**
- * inode_detach_wb - disassociate an inode from its wb
- * @inode: inode of interest
- *
- * @inode is being freed.  Detach from its wb.
- */
-static inline void inode_detach_wb(struct inode *inode)
-{
-	if (inode->i_wb) {
-		wb_put(inode->i_wb);
-		inode->i_wb = NULL;
-	}
-}
-
-/**
  * inode_to_wb - determine the wb of an inode
  * @inode: inode of interest
  *
@@ -471,19 +410,6 @@ static inline void wb_congested_put(struct bdi_writeback_congested *congested)
 {
 }
 
-static inline bool wb_tryget(struct bdi_writeback *wb)
-{
-	return true;
-}
-
-static inline void wb_get(struct bdi_writeback *wb)
-{
-}
-
-static inline void wb_put(struct bdi_writeback *wb)
-{
-}
-
 static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
 {
 	return &bdi->wb;
@@ -495,14 +421,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
 	return &bdi->wb;
 }
 
-static inline void inode_attach_wb(struct inode *inode, struct page *page)
-{
-}
-
-static inline void inode_detach_wb(struct inode *inode)
-{
-}
-
 static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 {
 	return &inode_to_bdi(inode)->wb;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 3b73e97..6726b7e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -8,6 +8,7 @@
 #include <linux/workqueue.h>
 #include <linux/fs.h>
 #include <linux/flex_proportions.h>
+#include <linux/backing-dev-defs.h>
 
 DECLARE_PER_CPU(int, dirty_throttle_leaks);
 
@@ -173,6 +174,51 @@ static inline void wait_on_inode(struct inode *inode)
 	wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
 }
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+void __inode_attach_wb(struct inode *inode, struct page *page);
+
+/**
+ * inode_attach_wb - associate an inode with its wb
+ * @inode: inode of interest
+ * @page: page being dirtied (may be NULL)
+ *
+ * If @inode doesn't have its wb, associate it with the wb matching the
+ * memcg of @page or, if @page is NULL, %current.  May be called w/ or w/o
+ * @inode->i_lock.
+ */
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+	if (!inode->i_wb)
+		__inode_attach_wb(inode, page);
+}
+
+/**
+ * inode_detach_wb - disassociate an inode from its wb
+ * @inode: inode of interest
+ *
+ * @inode is being freed.  Detach from its wb.
+ */
+static inline void inode_detach_wb(struct inode *inode)
+{
+	if (inode->i_wb) {
+		wb_put(inode->i_wb);
+		inode->i_wb = NULL;
+	}
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+}
+
+static inline void inode_detach_wb(struct inode *inode)
+{
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 /*
  * mm/page-writeback.c
  */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 84ebf7c..887d72a8 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -660,36 +660,6 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 	return wb;
 }
 
-void __inode_attach_wb(struct inode *inode, struct page *page)
-{
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
-	struct bdi_writeback *wb = NULL;
-
-	if (inode_cgwb_enabled(inode)) {
-		struct cgroup_subsys_state *memcg_css;
-
-		if (page) {
-			memcg_css = mem_cgroup_css_from_page(page);
-			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
-		} else {
-			/* must pin memcg_css, see wb_get_create() */
-			memcg_css = task_get_css(current, memory_cgrp_id);
-			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
-			css_put(memcg_css);
-		}
-	}
-
-	if (!wb)
-		wb = &bdi->wb;
-
-	/*
-	 * There may be multiple instances of this function racing to
-	 * update the same inode.  Use cmpxchg() to tell the winner.
-	 */
-	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
-		wb_put(wb);
-}
-
 static void cgwb_bdi_init(struct backing_dev_info *bdi)
 {
 	bdi->wb.memcg_css = mem_cgroup_root_css;
-- 
2.4.0
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 11+ messages in thread
* [PATCHSET 3/3 v4 block/for-4.2/core] writeback: implement foreign cgroup inode bdi_writeback switching
@ 2015-05-28 18:50 Tejun Heo
       [not found] ` <1432839057-17609-1-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
                   ` (7 more replies)
  0 siblings, 8 replies; 11+ messages in thread
From: Tejun Heo @ 2015-05-28 18:50 UTC (permalink / raw)
  To: axboe-tSWWG44O7X1aa/9Udqfwiw
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jack-AlSwsSmVLrQ,
	hch-wEGCiKHe2LqWVfeAwA7xHQ, hannes-druUgvl0LCNAfugRpC6u6w,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	vgoyal-H+wXaHxf7aLQT0dZR+AlfA, lizefan-hv44wF8Li93QT0dZR+AlfA,
	cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	mhocko-AlSwsSmVLrQ, clm-b10kYP2dOMg,
	fengguang.wu-ral2JQCrhuEAvxtiuMwx3w, david-FqsqvQoI3Ljby3iVrkZq2A,
	gthelen-hpIqsD4AKlfQT0dZR+AlfA, khlebnikov-XoJtRXgx1JseBXzfvpsJ4g
Hello,
The changes from the last take[L] are
* 0002-writeback-make-writeback_control-track-the-inode-bei.patch and
  0003-writeback-implement-foreign-cgroup-inode-detection.patch
  assumed that all wbc's are attached to the inode and wb being
  written out; however, pageout() path doesn't participate in cgroup
  writeback leading to oops.
  pageout() isn't the main writeback path, so the impact on isolation
  is relatively limited and as the whole path runs on the same thread
  we don't want it to block on slow cgroups anyway.  In the long term,
  the best route seems to make the path kick off the usual writeback
  path rather than trying to write pages directly.
  Both patches updated to skip cgroup writeback related processing if
  the wbc is not associated with inode / wb.
* might_lock() on tree_lock dropped from
  0007-writeback-add-lockdep-annotation-to-inode_to_wb.patch due to
  spurious locking context warnings.  Unfortunately, there isn't a
  simple way to express _irqsave for might_lock().
The previous two patchsets [1][2] implemented cgroup writeback support
and backpressure propagation through dirty throttling mechanism;
however, the inode is assigned to the wb (bdi_writeback) matching the
first dirtied page and stays there until released.  This first-use
policy can easily lead to gross misbehaviors - a single stray dirty
page can cause gigatbytes to be written by the wrong cgroup.  Also,
while concurrently write sharing an inode is extremely rare and
unsupported, inodes jumping cgroups over time are more common.
This patchset implements foreign cgroup inode detection and wb
switching.  Each writeback run tracks the majority wb being written
using a simple but fairly robust algorithm and when an inode
persistently writes out more foreign cgroup pages than local ones, the
inode is transferred to the majority winner.
This patchset adds 8 bytes to inode making the total per-inode space
overhead of cgroup writeback support 16 bytes on 64bit systems.  The
computational overhead should be negligible.  If the writer changes
from one cgroup to another entirely, the mechanism can render the
correct switch verdict in several seconds of IO time in most cases and
it can converge on the correct answer in reasonable amount of time
even in more ambiguous cases.
This patchset contains the following 8 patches.
 0001-writeback-relocate-wb-_try-_get-wb_put-inode_-attach.patch
 0002-writeback-make-writeback_control-track-the-inode-bei.patch
 0003-writeback-implement-foreign-cgroup-inode-detection.patch
 0004-writeback-implement-locked_-inode_to_wb_and_lock_lis.patch
 0005-writeback-implement-unlocked_inode_to_wb-transaction.patch
 0006-writeback-use-unlocked_inode_to_wb-transaction-in-in.patch
 0007-writeback-add-lockdep-annotation-to-inode_to_wb.patch
 0008-writeback-implement-foreign-cgroup-inode-bdi_writeba.patch
 0009-writeback-disassociate-inodes-from-dying-bdi_writeba.patch
This patchset is on top of
  block/for-4.2/core b04a5636a665 ("block: replace trylock with mutex_lock in blkdev_reread_part()")
+ [1] [PATCHSET 1/3 v4 block/for-4.2/core] writeback: cgroup writeback support
+ [2] [PATCHSET 2/3 v3 block/for-4.2/core] writeback: cgroup writeback backpressure propagation
and available in the following git branch.
 git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git review-cgroup-writeback-switch-20150528
diffstat follows.  Thanks.
 fs/buffer.c                      |   26 -
 fs/fs-writeback.c                |  532 ++++++++++++++++++++++++++++++++++++++-
 fs/mpage.c                       |    3 
 include/linux/backing-dev-defs.h |   66 ++++
 include/linux/backing-dev.h      |  142 ++++------
 include/linux/fs.h               |   11 
 include/linux/mm.h               |    3 
 include/linux/writeback.h        |  130 +++++++++
 mm/backing-dev.c                 |   30 --
 mm/filemap.c                     |    5 
 mm/page-writeback.c              |   27 +
 11 files changed, 836 insertions(+), 139 deletions(-)
--
tejun
[L] http://lkml.kernel.org/g/1432334183-6324-1-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
[1] http://lkml.kernel.org/g/1432329245-5844-1-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
[2] http://lkml.kernel.org/g/1428350674-8303-1-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
^ permalink raw reply	[flat|nested] 11+ messages in thread
* [PATCH 1/9] writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb()
       [not found] ` <1432839057-17609-1-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
@ 2015-05-28 18:50   ` Tejun Heo
  2015-05-28 18:50   ` [PATCH 7/9] writeback: add lockdep annotation to inode_to_wb() Tejun Heo
  1 sibling, 0 replies; 11+ messages in thread
From: Tejun Heo @ 2015-05-28 18:50 UTC (permalink / raw)
  To: axboe-tSWWG44O7X1aa/9Udqfwiw
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jack-AlSwsSmVLrQ,
	hch-wEGCiKHe2LqWVfeAwA7xHQ, hannes-druUgvl0LCNAfugRpC6u6w,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	vgoyal-H+wXaHxf7aLQT0dZR+AlfA, lizefan-hv44wF8Li93QT0dZR+AlfA,
	cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	mhocko-AlSwsSmVLrQ, clm-b10kYP2dOMg,
	fengguang.wu-ral2JQCrhuEAvxtiuMwx3w, david-FqsqvQoI3Ljby3iVrkZq2A,
	gthelen-hpIqsD4AKlfQT0dZR+AlfA, khlebnikov-XoJtRXgx1JseBXzfvpsJ4g,
	Tejun Heo
Currently, majority of cgroup writeback support including all the
above functions are implemented in include/linux/backing-dev.h and
mm/backing-dev.c; however, the portion closely related to writeback
logic implemented in include/linux/writeback.h and mm/page-writeback.c
will expand to support foreign writeback detection and correction.
This patch moves wb[_try]_get() and wb_put() to
include/linux/backing-dev-defs.h so that they can be used from
writeback.h and inode_{attach|detach}_wb() to writeback.h and
page-writeback.c.
This is pure reorganization and doesn't introduce any functional
changes.
Signed-off-by: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Cc: Jens Axboe <axboe-tSWWG44O7X1aa/9Udqfwiw@public.gmane.org>
Cc: Jan Kara <jack-AlSwsSmVLrQ@public.gmane.org>
Cc: Wu Fengguang <fengguang.wu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Cc: Greg Thelen <gthelen-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
 fs/fs-writeback.c                | 31 +++++++++++++++
 include/linux/backing-dev-defs.h | 50 ++++++++++++++++++++++++
 include/linux/backing-dev.h      | 82 ----------------------------------------
 include/linux/writeback.h        | 46 ++++++++++++++++++++++
 mm/backing-dev.c                 | 30 ---------------
 5 files changed, 127 insertions(+), 112 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index da35587..cf6ccfb 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/tracepoint.h>
 #include <linux/device.h>
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
@@ -213,6 +214,36 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
+void __inode_attach_wb(struct inode *inode, struct page *page)
+{
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb = NULL;
+
+	if (inode_cgwb_enabled(inode)) {
+		struct cgroup_subsys_state *memcg_css;
+
+		if (page) {
+			memcg_css = mem_cgroup_css_from_page(page);
+			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+		} else {
+			/* must pin memcg_css, see wb_get_create() */
+			memcg_css = task_get_css(current, memory_cgrp_id);
+			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+			css_put(memcg_css);
+		}
+	}
+
+	if (!wb)
+		wb = &bdi->wb;
+
+	/*
+	 * There may be multiple instances of this function racing to
+	 * update the same inode.  Use cmpxchg() to tell the winner.
+	 */
+	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
+		wb_put(wb);
+}
+
 /**
  * inode_congested - test whether an inode is congested
  * @inode: inode to test for congestion
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 8d470b7..e047b49 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -186,4 +186,54 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync)
 	set_wb_congested(bdi->wb.congested, sync);
 }
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/**
+ * wb_tryget - try to increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		return percpu_ref_tryget(&wb->refcnt);
+	return true;
+}
+
+/**
+ * wb_get - increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline void wb_get(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		percpu_ref_get(&wb->refcnt);
+}
+
+/**
+ * wb_put - decrement a wb's refcount
+ * @wb: bdi_writeback to put
+ */
+static inline void wb_put(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		percpu_ref_put(&wb->refcnt);
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+	return true;
+}
+
+static inline void wb_get(struct bdi_writeback *wb)
+{
+}
+
+static inline void wb_put(struct bdi_writeback *wb)
+{
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 #endif	/* __LINUX_BACKING_DEV_DEFS_H */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e9d7373..5c978a9 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -243,7 +243,6 @@ void wb_congested_put(struct bdi_writeback_congested *congested);
 struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 				    struct cgroup_subsys_state *memcg_css,
 				    gfp_t gfp);
-void __inode_attach_wb(struct inode *inode, struct page *page);
 void wb_memcg_offline(struct mem_cgroup *memcg);
 void wb_blkcg_offline(struct blkcg *blkcg);
 int inode_congested(struct inode *inode, int cong_bits);
@@ -265,37 +264,6 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
 }
 
 /**
- * wb_tryget - try to increment a wb's refcount
- * @wb: bdi_writeback to get
- */
-static inline bool wb_tryget(struct bdi_writeback *wb)
-{
-	if (wb != &wb->bdi->wb)
-		return percpu_ref_tryget(&wb->refcnt);
-	return true;
-}
-
-/**
- * wb_get - increment a wb's refcount
- * @wb: bdi_writeback to get
- */
-static inline void wb_get(struct bdi_writeback *wb)
-{
-	if (wb != &wb->bdi->wb)
-		percpu_ref_get(&wb->refcnt);
-}
-
-/**
- * wb_put - decrement a wb's refcount
- * @wb: bdi_writeback to put
- */
-static inline void wb_put(struct bdi_writeback *wb)
-{
-	if (wb != &wb->bdi->wb)
-		percpu_ref_put(&wb->refcnt);
-}
-
-/**
  * wb_find_current - find wb for %current on a bdi
  * @bdi: bdi of interest
  *
@@ -354,35 +322,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
 }
 
 /**
- * inode_attach_wb - associate an inode with its wb
- * @inode: inode of interest
- * @page: page being dirtied (may be NULL)
- *
- * If @inode doesn't have its wb, associate it with the wb matching the
- * memcg of @page or, if @page is NULL, %current.  May be called w/ or w/o
- * @inode->i_lock.
- */
-static inline void inode_attach_wb(struct inode *inode, struct page *page)
-{
-	if (!inode->i_wb)
-		__inode_attach_wb(inode, page);
-}
-
-/**
- * inode_detach_wb - disassociate an inode from its wb
- * @inode: inode of interest
- *
- * @inode is being freed.  Detach from its wb.
- */
-static inline void inode_detach_wb(struct inode *inode)
-{
-	if (inode->i_wb) {
-		wb_put(inode->i_wb);
-		inode->i_wb = NULL;
-	}
-}
-
-/**
  * inode_to_wb - determine the wb of an inode
  * @inode: inode of interest
  *
@@ -471,19 +410,6 @@ static inline void wb_congested_put(struct bdi_writeback_congested *congested)
 {
 }
 
-static inline bool wb_tryget(struct bdi_writeback *wb)
-{
-	return true;
-}
-
-static inline void wb_get(struct bdi_writeback *wb)
-{
-}
-
-static inline void wb_put(struct bdi_writeback *wb)
-{
-}
-
 static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
 {
 	return &bdi->wb;
@@ -495,14 +421,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
 	return &bdi->wb;
 }
 
-static inline void inode_attach_wb(struct inode *inode, struct page *page)
-{
-}
-
-static inline void inode_detach_wb(struct inode *inode)
-{
-}
-
 static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 {
 	return &inode_to_bdi(inode)->wb;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 3b73e97..6726b7e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -8,6 +8,7 @@
 #include <linux/workqueue.h>
 #include <linux/fs.h>
 #include <linux/flex_proportions.h>
+#include <linux/backing-dev-defs.h>
 
 DECLARE_PER_CPU(int, dirty_throttle_leaks);
 
@@ -173,6 +174,51 @@ static inline void wait_on_inode(struct inode *inode)
 	wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
 }
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+void __inode_attach_wb(struct inode *inode, struct page *page);
+
+/**
+ * inode_attach_wb - associate an inode with its wb
+ * @inode: inode of interest
+ * @page: page being dirtied (may be NULL)
+ *
+ * If @inode doesn't have its wb, associate it with the wb matching the
+ * memcg of @page or, if @page is NULL, %current.  May be called w/ or w/o
+ * @inode->i_lock.
+ */
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+	if (!inode->i_wb)
+		__inode_attach_wb(inode, page);
+}
+
+/**
+ * inode_detach_wb - disassociate an inode from its wb
+ * @inode: inode of interest
+ *
+ * @inode is being freed.  Detach from its wb.
+ */
+static inline void inode_detach_wb(struct inode *inode)
+{
+	if (inode->i_wb) {
+		wb_put(inode->i_wb);
+		inode->i_wb = NULL;
+	}
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+}
+
+static inline void inode_detach_wb(struct inode *inode)
+{
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 /*
  * mm/page-writeback.c
  */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 84ebf7c..887d72a8 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -660,36 +660,6 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 	return wb;
 }
 
-void __inode_attach_wb(struct inode *inode, struct page *page)
-{
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
-	struct bdi_writeback *wb = NULL;
-
-	if (inode_cgwb_enabled(inode)) {
-		struct cgroup_subsys_state *memcg_css;
-
-		if (page) {
-			memcg_css = mem_cgroup_css_from_page(page);
-			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
-		} else {
-			/* must pin memcg_css, see wb_get_create() */
-			memcg_css = task_get_css(current, memory_cgrp_id);
-			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
-			css_put(memcg_css);
-		}
-	}
-
-	if (!wb)
-		wb = &bdi->wb;
-
-	/*
-	 * There may be multiple instances of this function racing to
-	 * update the same inode.  Use cmpxchg() to tell the winner.
-	 */
-	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
-		wb_put(wb);
-}
-
 static void cgwb_bdi_init(struct backing_dev_info *bdi)
 {
 	bdi->wb.memcg_css = mem_cgroup_root_css;
-- 
2.4.0
^ permalink raw reply related	[flat|nested] 11+ messages in thread
* [PATCH 2/9] writeback: make writeback_control track the inode being written back
  2015-05-28 18:50 [PATCHSET 3/3 v4 block/for-4.2/core] writeback: implement foreign cgroup inode bdi_writeback switching Tejun Heo
       [not found] ` <1432839057-17609-1-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
@ 2015-05-28 18:50 ` Tejun Heo
  2015-05-28 18:50 ` [PATCH 3/9] writeback: implement foreign cgroup inode detection Tejun Heo
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 11+ messages in thread
From: Tejun Heo @ 2015-05-28 18:50 UTC (permalink / raw)
  To: axboe
  Cc: linux-kernel, jack, hch, hannes, linux-fsdevel, vgoyal, lizefan,
	cgroups, linux-mm, mhocko, clm, fengguang.wu, david, gthelen,
	khlebnikov, Tejun Heo
Currently, for cgroup writeback, the IO submission paths directly
associate the bio's with the blkcg from inode_to_wb_blkcg_css();
however, it'd be necessary to keep more writeback context to implement
foreign inode writeback detection.  wbc (writeback_control) is the
natural fit for the extra context - it persists throughout the
writeback of each inode and is passed all the way down to IO
submission paths.
This patch adds wbc_attach_and_unlock_inode(), wbc_detach_inode(), and
wbc_attach_fdatawrite_inode() which are used to associate wbc with the
inode being written back.  IO submission paths now use wbc_init_bio()
instead of directly associating bio's with blkcg themselves.  This
leaves inode_to_wb_blkcg_css() w/o any user.  The function is removed.
wbc currently only tracks the associated wb (bdi_writeback).  Future
patches will add more for foreign inode detection.  The association is
established under i_lock which will be depended upon when migrating
foreign inodes to other wb's.
As currently, once established, inode to wb association never changes,
going through wbc when initializing bio's doesn't cause any behavior
changes.
v2: submit_blk_blkcg() now checks whether the wbc is associated with a
    wb before dereferencing it.  This can happen when pageout() is
    writing pages directly without going through the usual writeback
    path.  As pageout() path is single-threaded, we don't want it to
    be blocked behind a slow cgroup and ultimately want it to delegate
    actual writing to the usual writeback path.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
---
 fs/buffer.c                 | 24 ++++++++--------
 fs/fs-writeback.c           | 37 ++++++++++++++++++++++--
 fs/mpage.c                  |  2 +-
 include/linux/backing-dev.h | 12 --------
 include/linux/writeback.h   | 68 +++++++++++++++++++++++++++++++++++++++++++++
 mm/filemap.c                |  2 ++
 6 files changed, 117 insertions(+), 28 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 18cd378..8140923 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -45,9 +45,9 @@
 #include <trace/events/block.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
-static int submit_bh_blkcg(int rw, struct buffer_head *bh,
-			   unsigned long bio_flags,
-			   struct cgroup_subsys_state *blkcg_css);
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+			 unsigned long bio_flags,
+			 struct writeback_control *wbc);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
@@ -1709,7 +1709,6 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	unsigned int blocksize, bbits;
 	int nr_underway = 0;
 	int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
-	struct cgroup_subsys_state *blkcg_css = inode_to_wb_blkcg_css(inode);
 
 	head = create_page_buffers(page, inode,
 					(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1798,7 +1797,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh_blkcg(write_op, bh, 0, blkcg_css);
+			submit_bh_wbc(write_op, bh, 0, wbc);
 			nr_underway++;
 		}
 		bh = next;
@@ -1852,7 +1851,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
 			clear_buffer_dirty(bh);
-			submit_bh_blkcg(write_op, bh, 0, blkcg_css);
+			submit_bh_wbc(write_op, bh, 0, wbc);
 			nr_underway++;
 		}
 		bh = next;
@@ -3017,9 +3016,8 @@ void guard_bio_eod(int rw, struct bio *bio)
 	}
 }
 
-static int submit_bh_blkcg(int rw, struct buffer_head *bh,
-			   unsigned long bio_flags,
-			   struct cgroup_subsys_state *blkcg_css)
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+			 unsigned long bio_flags, struct writeback_control *wbc)
 {
 	struct bio *bio;
 	int ret = 0;
@@ -3042,8 +3040,8 @@ static int submit_bh_blkcg(int rw, struct buffer_head *bh,
 	 */
 	bio = bio_alloc(GFP_NOIO, 1);
 
-	if (blkcg_css)
-		bio_associate_blkcg(bio, blkcg_css);
+	if (wbc)
+		wbc_init_bio(wbc, bio);
 
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
@@ -3072,13 +3070,13 @@ static int submit_bh_blkcg(int rw, struct buffer_head *bh,
 
 int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 {
-	return submit_bh_blkcg(rw, bh, bio_flags, NULL);
+	return submit_bh_wbc(rw, bh, bio_flags, NULL);
 }
 EXPORT_SYMBOL_GPL(_submit_bh);
 
 int submit_bh(int rw, struct buffer_head *bh)
 {
-	return submit_bh_blkcg(rw, bh, 0, NULL);
+	return submit_bh_wbc(rw, bh, 0, NULL);
 }
 EXPORT_SYMBOL(submit_bh);
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index cf6ccfb..755e8ef 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -245,6 +245,37 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
 }
 
 /**
+ * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * @inode is locked and about to be written back under the control of @wbc.
+ * Record @inode's writeback context into @wbc and unlock the i_lock.  On
+ * writeback completion, wbc_detach_inode() should be called.  This is used
+ * to track the cgroup writeback context.
+ */
+void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+				 struct inode *inode)
+{
+	wbc->wb = inode_to_wb(inode);
+	wb_get(wbc->wb);
+	spin_unlock(&inode->i_lock);
+}
+
+/**
+ * wbc_detach_inode - disassociate wbc from its target inode
+ * @wbc: writeback_control of interest
+ *
+ * To be called after a writeback attempt of an inode finishes and undoes
+ * wbc_attach_and_unlock_inode().  Can be called under any context.
+ */
+void wbc_detach_inode(struct writeback_control *wbc)
+{
+	wb_put(wbc->wb);
+	wbc->wb = NULL;
+}
+
+/**
  * inode_congested - test whether an inode is congested
  * @inode: inode to test for congestion
  * @cong_bits: mask of WB_[a]sync_congested bits to test
@@ -877,10 +908,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
 		goto out;
 	inode->i_state |= I_SYNC;
-	spin_unlock(&inode->i_lock);
+	wbc_attach_and_unlock_inode(wbc, inode);
 
 	ret = __writeback_single_inode(inode, wbc);
 
+	wbc_detach_inode(wbc);
 	spin_lock(&wb->list_lock);
 	spin_lock(&inode->i_lock);
 	/*
@@ -1013,7 +1045,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 			continue;
 		}
 		inode->i_state |= I_SYNC;
-		spin_unlock(&inode->i_lock);
+		wbc_attach_and_unlock_inode(&wbc, inode);
 
 		write_chunk = writeback_chunk_size(wb, work);
 		wbc.nr_to_write = write_chunk;
@@ -1025,6 +1057,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 		 */
 		__writeback_single_inode(inode, &wbc);
 
+		wbc_detach_inode(&wbc);
 		work->nr_pages -= write_chunk - wbc.nr_to_write;
 		wrote += write_chunk - wbc.nr_to_write;
 		spin_lock(&wb->list_lock);
diff --git a/fs/mpage.c b/fs/mpage.c
index a3ccb0b..388fde6 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -606,7 +606,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
 		if (bio == NULL)
 			goto confused;
 
-		bio_associate_blkcg(bio, inode_to_wb_blkcg_css(inode));
+		wbc_init_bio(wbc, bio);
 	}
 
 	/*
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 5c978a9..b1d2489 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -332,12 +332,6 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 	return inode->i_wb;
 }
 
-static inline struct cgroup_subsys_state *
-inode_to_wb_blkcg_css(struct inode *inode)
-{
-	return inode_to_wb(inode)->blkcg_css;
-}
-
 struct wb_iter {
 	int			start_blkcg_id;
 	struct radix_tree_iter	tree_iter;
@@ -434,12 +428,6 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg)
 {
 }
 
-static inline struct cgroup_subsys_state *
-inode_to_wb_blkcg_css(struct inode *inode)
-{
-	return blkcg_root_css;
-}
-
 struct wb_iter {
 	int		next_id;
 };
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 6726b7e..8f964e5 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -86,6 +86,9 @@ struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct bdi_writeback *wb;	/* wb this writeback is issued under */
+#endif
 };
 
 /*
@@ -176,7 +179,14 @@ static inline void wait_on_inode(struct inode *inode)
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
+#include <linux/cgroup.h>
+#include <linux/bio.h>
+
 void __inode_attach_wb(struct inode *inode, struct page *page);
+void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+				 struct inode *inode)
+	__releases(&inode->i_lock);
+void wbc_detach_inode(struct writeback_control *wbc);
 
 /**
  * inode_attach_wb - associate an inode with its wb
@@ -207,6 +217,44 @@ static inline void inode_detach_wb(struct inode *inode)
 	}
 }
 
+/**
+ * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * This function is to be used by __filemap_fdatawrite_range(), which is an
+ * alternative entry point into writeback code, and first ensures @inode is
+ * associated with a bdi_writeback and attaches it to @wbc.
+ */
+static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+					       struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	inode_attach_wb(inode, NULL);
+	wbc_attach_and_unlock_inode(wbc, inode);
+}
+
+/**
+ * wbc_init_bio - writeback specific initializtion of bio
+ * @wbc: writeback_control for the writeback in progress
+ * @bio: bio to be initialized
+ *
+ * @bio is a part of the writeback in progress controlled by @wbc.  Perform
+ * writeback specific initialization.  This is used to apply the cgroup
+ * writeback context.
+ */
+static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
+{
+	/*
+	 * pageout() path doesn't attach @wbc to the inode being written
+	 * out.  This is intentional as we don't want the function to block
+	 * behind a slow cgroup.  Ultimately, we want pageout() to kick off
+	 * regular writeback instead of writing things out itself.
+	 */
+	if (wbc->wb)
+		bio_associate_blkcg(bio, wbc->wb->blkcg_css);
+}
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static inline void inode_attach_wb(struct inode *inode, struct page *page)
@@ -217,6 +265,26 @@ static inline void inode_detach_wb(struct inode *inode)
 {
 }
 
+static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+					       struct inode *inode)
+	__releases(&inode->i_lock)
+{
+	spin_unlock(&inode->i_lock);
+}
+
+static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+					       struct inode *inode)
+{
+}
+
+static inline void wbc_detach_inode(struct writeback_control *wbc)
+{
+}
+
+static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
+{
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 7b1443d..2f065b1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -290,7 +290,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 	if (!mapping_cap_writeback_dirty(mapping))
 		return 0;
 
+	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
 	ret = do_writepages(mapping, &wbc);
+	wbc_detach_inode(&wbc);
 	return ret;
 }
 
-- 
2.4.0
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 11+ messages in thread
* [PATCH 3/9] writeback: implement foreign cgroup inode detection
  2015-05-28 18:50 [PATCHSET 3/3 v4 block/for-4.2/core] writeback: implement foreign cgroup inode bdi_writeback switching Tejun Heo
       [not found] ` <1432839057-17609-1-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
  2015-05-28 18:50 ` [PATCH 2/9] writeback: make writeback_control track the inode being written back Tejun Heo
@ 2015-05-28 18:50 ` Tejun Heo
  2015-05-28 18:50 ` [PATCH 4/9] writeback: implement [locked_]inode_to_wb_and_lock_list() Tejun Heo
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 11+ messages in thread
From: Tejun Heo @ 2015-05-28 18:50 UTC (permalink / raw)
  To: axboe
  Cc: linux-kernel, jack, hch, hannes, linux-fsdevel, vgoyal, lizefan,
	cgroups, linux-mm, mhocko, clm, fengguang.wu, david, gthelen,
	khlebnikov, Tejun Heo
As concurrent write sharing of an inode is expected to be very rare
and memcg only tracks page ownership on first-use basis severely
confining the usefulness of such sharing, cgroup writeback tracks
ownership per-inode.  While the support for concurrent write sharing
of an inode is deemed unnecessary, an inode being written to by
different cgroups at different points in time is a lot more common,
and, more importantly, charging only by first-use can too readily lead
to grossly incorrect behaviors (single foreign page can lead to
gigabytes of writeback to be incorrectly attributed).
To resolve this issue, cgroup writeback detects the majority dirtier
of an inode and will transfer the ownership to it.  To avoid
unnnecessary oscillation, the detection mechanism keeps track of
history and gives out the switch verdict only if the foreign usage
pattern is stable over a certain amount of time and/or writeback
attempts.
The detection mechanism has fairly low space and computation overhead.
It adds 8 bytes to struct inode (one int and two u16's) and minimal
amount of calculation per IO.  The detection mechanism converges to
the correct answer usually in several seconds of IO time when there's
a clear majority dirtier.  Even when there isn't, it can reach an
acceptable answer fairly quickly under most circumstances.
Please see wb_detach_inode() for more details.
This patch only implements detection.  Following patches will
implement actual switching.
v2: wbc_account_io() now checks whether the wbc is associated with a
    wb before dereferencing it.  This can happen when pageout() is
    writing pages directly without going through the usual writeback
    path.  As pageout() path is single-threaded, we don't want it to
    be blocked behind a slow cgroup and ultimately want it to delegate
    actual writing to the usual writeback path.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
---
 fs/buffer.c               |   4 +-
 fs/fs-writeback.c         | 177 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/mpage.c                |   1 +
 include/linux/fs.h        |   5 ++
 include/linux/writeback.h |  16 +++++
 5 files changed, 200 insertions(+), 3 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 8140923..32d33b7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3040,8 +3040,10 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh,
 	 */
 	bio = bio_alloc(GFP_NOIO, 1);
 
-	if (wbc)
+	if (wbc) {
 		wbc_init_bio(wbc, bio);
+		wbc_account_io(wbc, bh->b_page, bh->b_size);
+	}
 
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 755e8ef..f98d403 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -214,6 +214,20 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
+/* parameters for foreign inode detection, see wb_detach_inode() */
+#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, upto 8 secs w/ 16bit */
+#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
+#define WB_FRN_TIME_CUT_DIV	2	/* ignore rounds < avg / 2 */
+#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */
+
+#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
+#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
+					/* each slot's duration is 2s / 16 */
+#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
+					/* if foreign slots >= 8, switch */
+#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
+					/* one round can affect upto 5 slots */
+
 void __inode_attach_wb(struct inode *inode, struct page *page)
 {
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -258,24 +272,183 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 				 struct inode *inode)
 {
 	wbc->wb = inode_to_wb(inode);
+	wbc->inode = inode;
+
+	wbc->wb_id = wbc->wb->memcg_css->id;
+	wbc->wb_lcand_id = inode->i_wb_frn_winner;
+	wbc->wb_tcand_id = 0;
+	wbc->wb_bytes = 0;
+	wbc->wb_lcand_bytes = 0;
+	wbc->wb_tcand_bytes = 0;
+
 	wb_get(wbc->wb);
 	spin_unlock(&inode->i_lock);
 }
 
 /**
- * wbc_detach_inode - disassociate wbc from its target inode
- * @wbc: writeback_control of interest
+ * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
+ * @wbc: writeback_control of the just finished writeback
  *
  * To be called after a writeback attempt of an inode finishes and undoes
  * wbc_attach_and_unlock_inode().  Can be called under any context.
+ *
+ * As concurrent write sharing of an inode is expected to be very rare and
+ * memcg only tracks page ownership on first-use basis severely confining
+ * the usefulness of such sharing, cgroup writeback tracks ownership
+ * per-inode.  While the support for concurrent write sharing of an inode
+ * is deemed unnecessary, an inode being written to by different cgroups at
+ * different points in time is a lot more common, and, more importantly,
+ * charging only by first-use can too readily lead to grossly incorrect
+ * behaviors (single foreign page can lead to gigabytes of writeback to be
+ * incorrectly attributed).
+ *
+ * To resolve this issue, cgroup writeback detects the majority dirtier of
+ * an inode and transfers the ownership to it.  To avoid unnnecessary
+ * oscillation, the detection mechanism keeps track of history and gives
+ * out the switch verdict only if the foreign usage pattern is stable over
+ * a certain amount of time and/or writeback attempts.
+ *
+ * On each writeback attempt, @wbc tries to detect the majority writer
+ * using Boyer-Moore majority vote algorithm.  In addition to the byte
+ * count from the majority voting, it also counts the bytes written for the
+ * current wb and the last round's winner wb (max of last round's current
+ * wb, the winner from two rounds ago, and the last round's majority
+ * candidate).  Keeping track of the historical winner helps the algorithm
+ * to semi-reliably detect the most active writer even when it's not the
+ * absolute majority.
+ *
+ * Once the winner of the round is determined, whether the winner is
+ * foreign or not and how much IO time the round consumed is recorded in
+ * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
+ * over a certain threshold, the switch verdict is given.
  */
 void wbc_detach_inode(struct writeback_control *wbc)
 {
+	struct bdi_writeback *wb = wbc->wb;
+	struct inode *inode = wbc->inode;
+	u16 history = inode->i_wb_frn_history;
+	unsigned long avg_time = inode->i_wb_frn_avg_time;
+	unsigned long max_bytes, max_time;
+	int max_id;
+
+	/* pick the winner of this round */
+	if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
+	    wbc->wb_bytes >= wbc->wb_tcand_bytes) {
+		max_id = wbc->wb_id;
+		max_bytes = wbc->wb_bytes;
+	} else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
+		max_id = wbc->wb_lcand_id;
+		max_bytes = wbc->wb_lcand_bytes;
+	} else {
+		max_id = wbc->wb_tcand_id;
+		max_bytes = wbc->wb_tcand_bytes;
+	}
+
+	/*
+	 * Calculate the amount of IO time the winner consumed and fold it
+	 * into the running average kept per inode.  If the consumed IO
+	 * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for
+	 * deciding whether to switch or not.  This is to prevent one-off
+	 * small dirtiers from skewing the verdict.
+	 */
+	max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
+				wb->avg_write_bandwidth);
+	if (avg_time)
+		avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
+			    (avg_time >> WB_FRN_TIME_AVG_SHIFT);
+	else
+		avg_time = max_time;	/* immediate catch up on first run */
+
+	if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
+		int slots;
+
+		/*
+		 * The switch verdict is reached if foreign wb's consume
+		 * more than a certain proportion of IO time in a
+		 * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
+		 * history mask where each bit represents one sixteenth of
+		 * the period.  Determine the number of slots to shift into
+		 * history from @max_time.
+		 */
+		slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
+			    (unsigned long)WB_FRN_HIST_MAX_SLOTS);
+		history <<= slots;
+		if (wbc->wb_id != max_id)
+			history |= (1U << slots) - 1;
+
+		/*
+		 * Switch if the current wb isn't the consistent winner.
+		 * If there are multiple closely competing dirtiers, the
+		 * inode may switch across them repeatedly over time, which
+		 * is okay.  The main goal is avoiding keeping an inode on
+		 * the wrong wb for an extended period of time.
+		 */
+		if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) {
+			/* switch */
+			max_id = 0;
+			avg_time = 0;
+			history = 0;
+		}
+	}
+
+	/*
+	 * Multiple instances of this function may race to update the
+	 * following fields but we don't mind occassional inaccuracies.
+	 */
+	inode->i_wb_frn_winner = max_id;
+	inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
+	inode->i_wb_frn_history = history;
+
 	wb_put(wbc->wb);
 	wbc->wb = NULL;
 }
 
 /**
+ * wbc_account_io - account IO issued during writeback
+ * @wbc: writeback_control of the writeback in progress
+ * @page: page being written out
+ * @bytes: number of bytes being written out
+ *
+ * @bytes from @page are about to written out during the writeback
+ * controlled by @wbc.  Keep the book for foreign inode detection.  See
+ * wbc_detach_inode().
+ */
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+		    size_t bytes)
+{
+	int id;
+
+	/*
+	 * pageout() path doesn't attach @wbc to the inode being written
+	 * out.  This is intentional as we don't want the function to block
+	 * behind a slow cgroup.  Ultimately, we want pageout() to kick off
+	 * regular writeback instead of writing things out itself.
+	 */
+	if (!wbc->wb)
+		return;
+
+	rcu_read_lock();
+	id = mem_cgroup_css_from_page(page)->id;
+	rcu_read_unlock();
+
+	if (id == wbc->wb_id) {
+		wbc->wb_bytes += bytes;
+		return;
+	}
+
+	if (id == wbc->wb_lcand_id)
+		wbc->wb_lcand_bytes += bytes;
+
+	/* Boyer-Moore majority vote algorithm */
+	if (!wbc->wb_tcand_bytes)
+		wbc->wb_tcand_id = id;
+	if (id == wbc->wb_tcand_id)
+		wbc->wb_tcand_bytes += bytes;
+	else
+		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
+}
+
+/**
  * inode_congested - test whether an inode is congested
  * @inode: inode to test for congestion
  * @cong_bits: mask of WB_[a]sync_congested bits to test
diff --git a/fs/mpage.c b/fs/mpage.c
index 388fde6..ca0244b 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -614,6 +614,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
 	 * the confused fail path above (OOM) will be very confused when
 	 * it finds all bh marked clean (i.e. it will not write anything)
 	 */
+	wbc_account_io(wbc, page, PAGE_SIZE);
 	length = first_unmapped << blkbits;
 	if (bio_add_page(bio, page, length, 0) < length) {
 		bio = mpage_bio_submit(WRITE, bio);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 67a42ec..740126d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -638,6 +638,11 @@ struct inode {
 	struct list_head	i_wb_list;	/* backing dev IO list */
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback	*i_wb;		/* the associated cgroup wb */
+
+	/* foreign inode detection, see wbc_detach_inode() */
+	int			i_wb_frn_winner;
+	u16			i_wb_frn_avg_time;
+	u16			i_wb_frn_history;
 #endif
 	struct list_head	i_lru;		/* inode LRU list */
 	struct list_head	i_sb_list;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 8f964e5..b333c94 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -88,6 +88,15 @@ struct writeback_control {
 	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback *wb;	/* wb this writeback is issued under */
+	struct inode *inode;		/* inode being written out */
+
+	/* foreign inode detection, see wbc_detach_inode() */
+	int wb_id;			/* current wb id */
+	int wb_lcand_id;		/* last foreign candidate wb id */
+	int wb_tcand_id;		/* this foreign candidate wb id */
+	size_t wb_bytes;		/* bytes written by current wb */
+	size_t wb_lcand_bytes;		/* bytes written by last candidate */
+	size_t wb_tcand_bytes;		/* bytes written by this candidate */
 #endif
 };
 
@@ -187,6 +196,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 				 struct inode *inode)
 	__releases(&inode->i_lock);
 void wbc_detach_inode(struct writeback_control *wbc);
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+		    size_t bytes);
 
 /**
  * inode_attach_wb - associate an inode with its wb
@@ -285,6 +296,11 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
 {
 }
 
+static inline void wbc_account_io(struct writeback_control *wbc,
+				  struct page *page, size_t bytes)
+{
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 /*
-- 
2.4.0
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 11+ messages in thread
* [PATCH 4/9] writeback: implement [locked_]inode_to_wb_and_lock_list()
  2015-05-28 18:50 [PATCHSET 3/3 v4 block/for-4.2/core] writeback: implement foreign cgroup inode bdi_writeback switching Tejun Heo
                   ` (2 preceding siblings ...)
  2015-05-28 18:50 ` [PATCH 3/9] writeback: implement foreign cgroup inode detection Tejun Heo
@ 2015-05-28 18:50 ` Tejun Heo
  2015-05-28 18:50 ` [PATCH 5/9] writeback: implement unlocked_inode_to_wb transaction and use it for stat updates Tejun Heo
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 11+ messages in thread
From: Tejun Heo @ 2015-05-28 18:50 UTC (permalink / raw)
  To: axboe
  Cc: linux-kernel, jack, hch, hannes, linux-fsdevel, vgoyal, lizefan,
	cgroups, linux-mm, mhocko, clm, fengguang.wu, david, gthelen,
	khlebnikov, Tejun Heo
cgroup writeback currently assumes that inode to wb association
doesn't change; however, with the planned foreign inode wb switching
mechanism, the association will change dynamically.
When an inode needs to be put on one of the IO lists of its wb, the
current code simply calls inode_to_wb() and locks the returned wb;
however, with the planned wb switching, the association may change
before locking the wb and may even get released.
This patch implements [locked_]inode_to_wb_and_lock_list() which pins
the associated wb while holding i_lock, releases it, acquires
wb->list_lock and verifies that the association hasn't changed
inbetween.  As the association will be protected by both locks among
other things, this guarantees that the wb is the inode's associated wb
until the list_lock is released.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
---
 fs/fs-writeback.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 75 insertions(+), 5 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f98d403..f67b956 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -259,6 +259,56 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
 }
 
 /**
+ * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
+ * @inode: inode of interest with i_lock held
+ *
+ * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
+ * held on entry and is released on return.  The returned wb is guaranteed
+ * to stay @inode's associated wb until its list_lock is released.
+ */
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+	__releases(&inode->i_lock)
+	__acquires(&wb->list_lock)
+{
+	while (true) {
+		struct bdi_writeback *wb = inode_to_wb(inode);
+
+		/*
+		 * inode_to_wb() association is protected by both
+		 * @inode->i_lock and @wb->list_lock but list_lock nests
+		 * outside i_lock.  Drop i_lock and verify that the
+		 * association hasn't changed after acquiring list_lock.
+		 */
+		wb_get(wb);
+		spin_unlock(&inode->i_lock);
+		spin_lock(&wb->list_lock);
+		wb_put(wb);		/* not gonna deref it anymore */
+
+		if (likely(wb == inode_to_wb(inode)))
+			return wb;	/* @inode already has ref */
+
+		spin_unlock(&wb->list_lock);
+		cpu_relax();
+		spin_lock(&inode->i_lock);
+	}
+}
+
+/**
+ * inode_to_wb_and_lock_list - determine an inode's wb and lock it
+ * @inode: inode of interest
+ *
+ * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
+ * on entry.
+ */
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+	__acquires(&wb->list_lock)
+{
+	spin_lock(&inode->i_lock);
+	return locked_inode_to_wb_and_lock_list(inode);
+}
+
+/**
  * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
  * @wbc: writeback_control of interest
  * @inode: target inode
@@ -603,6 +653,27 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+	__releases(&inode->i_lock)
+	__acquires(&wb->list_lock)
+{
+	struct bdi_writeback *wb = inode_to_wb(inode);
+
+	spin_unlock(&inode->i_lock);
+	spin_lock(&wb->list_lock);
+	return wb;
+}
+
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+	__acquires(&wb->list_lock)
+{
+	struct bdi_writeback *wb = inode_to_wb(inode);
+
+	spin_lock(&wb->list_lock);
+	return wb;
+}
+
 static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
 {
 	return nr_pages;
@@ -678,9 +749,9 @@ void wb_start_background_writeback(struct bdi_writeback *wb)
  */
 void inode_wb_list_del(struct inode *inode)
 {
-	struct bdi_writeback *wb = inode_to_wb(inode);
+	struct bdi_writeback *wb;
 
-	spin_lock(&wb->list_lock);
+	wb = inode_to_wb_and_lock_list(inode);
 	inode_wb_list_del_locked(inode, wb);
 	spin_unlock(&wb->list_lock);
 }
@@ -1784,12 +1855,11 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
-			struct bdi_writeback *wb = inode_to_wb(inode);
+			struct bdi_writeback *wb;
 			struct list_head *dirty_list;
 			bool wakeup_bdi = false;
 
-			spin_unlock(&inode->i_lock);
-			spin_lock(&wb->list_lock);
+			wb = locked_inode_to_wb_and_lock_list(inode);
 
 			WARN(bdi_cap_writeback_dirty(wb->bdi) &&
 			     !test_bit(WB_registered, &wb->state),
-- 
2.4.0
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 11+ messages in thread
* [PATCH 5/9] writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
  2015-05-28 18:50 [PATCHSET 3/3 v4 block/for-4.2/core] writeback: implement foreign cgroup inode bdi_writeback switching Tejun Heo
                   ` (3 preceding siblings ...)
  2015-05-28 18:50 ` [PATCH 4/9] writeback: implement [locked_]inode_to_wb_and_lock_list() Tejun Heo
@ 2015-05-28 18:50 ` Tejun Heo
  2015-05-28 18:50 ` [PATCH 6/9] writeback: use unlocked_inode_to_wb transaction in inode_congested() Tejun Heo
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 11+ messages in thread
From: Tejun Heo @ 2015-05-28 18:50 UTC (permalink / raw)
  To: axboe
  Cc: linux-kernel, jack, hch, hannes, linux-fsdevel, vgoyal, lizefan,
	cgroups, linux-mm, mhocko, clm, fengguang.wu, david, gthelen,
	khlebnikov, Tejun Heo
The mechanism for detecting whether an inode should switch its wb
(bdi_writeback) association is now in place.  This patch build the
framework for the actual switching.
This patch adds a new inode flag I_WB_SWITCHING, which has two
functions.  First, the easy one, it ensures that there's only one
switching in progress for a give inode.  Second, it's used as a
mechanism to synchronize wb stat updates.
The two stats, WB_RECLAIMABLE and WB_WRITEBACK, aren't event counters
but track the current number of dirty pages and pages under writeback
respectively.  As such, when an inode is moved from one wb to another,
the inode's portion of those stats have to be transferred together;
unfortunately, this is a bit tricky as those stat updates are percpu
operations which are performed without holding any lock in some
places.
This patch solves the problem in a similar way as memcg.  Each such
lockless stat updates are wrapped in transaction surrounded by
unlocked_inode_to_wb_begin/end().  During normal operation, they map
to rcu_read_lock/unlock(); however, if I_WB_SWITCHING is asserted,
mapping->tree_lock is grabbed across the transaction.
In turn, the switching path sets I_WB_SWITCHING and waits for a RCU
grace period to pass before actually starting to switch, which
guarantees that all stat update paths are synchronizing against
mapping->tree_lock.
This patch still doesn't implement the actual switching.
v3: Updated on top of the recent cancel_dirty_page() updates.
    unlocked_inode_to_wb_begin() now nests inside
    mem_cgroup_begin_page_stat() to match the locking order.
v2: The i_wb access transaction will be used for !stat accesses too.
    Function names and comments updated accordingly.
    s/inode_wb_stat_unlocked_{begin|end}/unlocked_inode_to_wb_{begin|end}/
    s/switch_wb/switch_wbs/
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
---
 fs/fs-writeback.c           | 117 +++++++++++++++++++++++++++++++++++++++++---
 include/linux/backing-dev.h |  54 ++++++++++++++++++++
 include/linux/fs.h          |   6 +++
 include/linux/mm.h          |   3 +-
 mm/filemap.c                |   3 +-
 mm/page-writeback.c         |  27 +++++++---
 6 files changed, 196 insertions(+), 14 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f67b956..08f5496 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -308,6 +308,115 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
 	return locked_inode_to_wb_and_lock_list(inode);
 }
 
+struct inode_switch_wbs_context {
+	struct inode		*inode;
+	struct bdi_writeback	*new_wb;
+
+	struct rcu_head		rcu_head;
+	struct work_struct	work;
+};
+
+static void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+	struct inode_switch_wbs_context *isw =
+		container_of(work, struct inode_switch_wbs_context, work);
+	struct inode *inode = isw->inode;
+	struct bdi_writeback *new_wb = isw->new_wb;
+
+	/*
+	 * By the time control reaches here, RCU grace period has passed
+	 * since I_WB_SWITCH assertion and all wb stat update transactions
+	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+	 * synchronizing against mapping->tree_lock.
+	 */
+	spin_lock(&inode->i_lock);
+
+	inode->i_wb_frn_winner = 0;
+	inode->i_wb_frn_avg_time = 0;
+	inode->i_wb_frn_history = 0;
+
+	/*
+	 * Paired with load_acquire in unlocked_inode_to_wb_begin() and
+	 * ensures that the new wb is visible if they see !I_WB_SWITCH.
+	 */
+	smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
+
+	spin_unlock(&inode->i_lock);
+
+	iput(inode);
+	wb_put(new_wb);
+	kfree(isw);
+}
+
+static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
+{
+	struct inode_switch_wbs_context *isw = container_of(rcu_head,
+				struct inode_switch_wbs_context, rcu_head);
+
+	/* needs to grab bh-unsafe locks, bounce to work item */
+	INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
+	schedule_work(&isw->work);
+}
+
+/**
+ * inode_switch_wbs - change the wb association of an inode
+ * @inode: target inode
+ * @new_wb_id: ID of the new wb
+ *
+ * Switch @inode's wb association to the wb identified by @new_wb_id.  The
+ * switching is performed asynchronously and may fail silently.
+ */
+static void inode_switch_wbs(struct inode *inode, int new_wb_id)
+{
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct cgroup_subsys_state *memcg_css;
+	struct inode_switch_wbs_context *isw;
+
+	/* noop if seems to be already in progress */
+	if (inode->i_state & I_WB_SWITCH)
+		return;
+
+	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+	if (!isw)
+		return;
+
+	/* find and pin the new wb */
+	rcu_read_lock();
+	memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
+	if (memcg_css)
+		isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+	rcu_read_unlock();
+	if (!isw->new_wb)
+		goto out_free;
+
+	/* while holding I_WB_SWITCH, no one else can update the association */
+	spin_lock(&inode->i_lock);
+	if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+	    inode_to_wb(inode) == isw->new_wb) {
+		spin_unlock(&inode->i_lock);
+		goto out_free;
+	}
+	inode->i_state |= I_WB_SWITCH;
+	spin_unlock(&inode->i_lock);
+
+	ihold(inode);
+	isw->inode = inode;
+
+	/*
+	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
+	 * the RCU protected stat update paths to grab the mapping's
+	 * tree_lock so that stat transfer can synchronize against them.
+	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+	 */
+	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
+	return;
+
+out_free:
+	if (isw->new_wb)
+		wb_put(isw->new_wb);
+	kfree(isw);
+}
+
 /**
  * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
  * @wbc: writeback_control of interest
@@ -433,12 +542,8 @@ void wbc_detach_inode(struct writeback_control *wbc)
 		 * is okay.  The main goal is avoiding keeping an inode on
 		 * the wrong wb for an extended period of time.
 		 */
-		if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) {
-			/* switch */
-			max_id = 0;
-			avg_time = 0;
-			history = 0;
-		}
+		if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
+			inode_switch_wbs(inode, max_id);
 	}
 
 	/*
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b1d2489..73ffa32 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -332,6 +332,50 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 	return inode->i_wb;
 }
 
+/**
+ * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
+ * @inode: target inode
+ * @lockedp: temp bool output param, to be passed to the end function
+ *
+ * The caller wants to access the wb associated with @inode but isn't
+ * holding inode->i_lock, mapping->tree_lock or wb->list_lock.  This
+ * function determines the wb associated with @inode and ensures that the
+ * association doesn't change until the transaction is finished with
+ * unlocked_inode_to_wb_end().
+ *
+ * The caller must call unlocked_inode_to_wb_end() with *@lockdep
+ * afterwards and can't sleep during transaction.  IRQ may or may not be
+ * disabled on return.
+ */
+static inline struct bdi_writeback *
+unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+{
+	rcu_read_lock();
+
+	/*
+	 * Paired with store_release in inode_switch_wb_work_fn() and
+	 * ensures that we see the new wb if we see cleared I_WB_SWITCH.
+	 */
+	*lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+
+	if (unlikely(*lockedp))
+		spin_lock_irq(&inode->i_mapping->tree_lock);
+	return inode_to_wb(inode);
+}
+
+/**
+ * unlocked_inode_to_wb_end - end inode wb access transaction
+ * @inode: target inode
+ * @locked: *@lockedp from unlocked_inode_to_wb_begin()
+ */
+static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+{
+	if (unlikely(locked))
+		spin_unlock_irq(&inode->i_mapping->tree_lock);
+
+	rcu_read_unlock();
+}
+
 struct wb_iter {
 	int			start_blkcg_id;
 	struct radix_tree_iter	tree_iter;
@@ -420,6 +464,16 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 	return &inode_to_bdi(inode)->wb;
 }
 
+static inline struct bdi_writeback *
+unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+{
+	return inode_to_wb(inode);
+}
+
+static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+{
+}
+
 static inline void wb_memcg_offline(struct mem_cgroup *memcg)
 {
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 740126d..b5e1dcf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1815,6 +1815,11 @@ struct super_operations {
  *
  * I_DIO_WAKEUP		Never set.  Only used as a key for wait_on_bit().
  *
+ * I_WB_SWITCH		Cgroup bdi_writeback switching in progress.  Used to
+ *			synchronize competing switching instances and to tell
+ *			wb stat updates to grab mapping->tree_lock.  See
+ *			inode_switch_wb_work_fn() for details.
+ *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
  */
 #define I_DIRTY_SYNC		(1 << 0)
@@ -1834,6 +1839,7 @@ struct super_operations {
 #define I_DIRTY_TIME		(1 << 11)
 #define __I_DIRTY_TIME_EXPIRED	12
 #define I_DIRTY_TIME_EXPIRED	(1 << __I_DIRTY_TIME_EXPIRED)
+#define I_WB_SWITCH		(1 << 13)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f48d979..4024543 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -27,6 +27,7 @@ struct anon_vma_chain;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
+struct bdi_writeback;
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES	/* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
@@ -1214,7 +1215,7 @@ int redirty_page_for_writepage(struct writeback_control *wbc,
 void account_page_dirtied(struct page *page, struct address_space *mapping,
 			  struct mem_cgroup *memcg);
 void account_page_cleaned(struct page *page, struct address_space *mapping,
-			  struct mem_cgroup *memcg);
+			  struct mem_cgroup *memcg, struct bdi_writeback *wb);
 int set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
 void cancel_dirty_page(struct page *page);
diff --git a/mm/filemap.c b/mm/filemap.c
index 2f065b1..bfc1ab0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -213,7 +213,8 @@ void __delete_from_page_cache(struct page *page, void *shadow,
 	 * anyway will be cleared before returning page into buddy allocator.
 	 */
 	if (WARN_ON_ONCE(PageDirty(page)))
-		account_page_cleaned(page, mapping, memcg);
+		account_page_cleaned(page, mapping, memcg,
+				     inode_to_wb(mapping->host));
 }
 
 /**
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e890335..e1514d5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2422,12 +2422,12 @@ EXPORT_SYMBOL(account_page_dirtied);
  * Caller must hold mem_cgroup_begin_page_stat().
  */
 void account_page_cleaned(struct page *page, struct address_space *mapping,
-			  struct mem_cgroup *memcg)
+			  struct mem_cgroup *memcg, struct bdi_writeback *wb)
 {
 	if (mapping_cap_account_dirty(mapping)) {
 		mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 		dec_zone_page_state(page, NR_FILE_DIRTY);
-		dec_wb_stat(inode_to_wb(mapping->host), WB_RECLAIMABLE);
+		dec_wb_stat(wb, WB_RECLAIMABLE);
 		task_io_account_cancelled_write(PAGE_CACHE_SIZE);
 	}
 }
@@ -2490,11 +2490,15 @@ void account_page_redirty(struct page *page)
 	struct address_space *mapping = page->mapping;
 
 	if (mapping && mapping_cap_account_dirty(mapping)) {
-		struct bdi_writeback *wb = inode_to_wb(mapping->host);
+		struct inode *inode = mapping->host;
+		struct bdi_writeback *wb;
+		bool locked;
 
+		wb = unlocked_inode_to_wb_begin(inode, &locked);
 		current->nr_dirtied--;
 		dec_zone_page_state(page, NR_DIRTIED);
 		dec_wb_stat(wb, WB_DIRTIED);
+		unlocked_inode_to_wb_end(inode, locked);
 	}
 }
 EXPORT_SYMBOL(account_page_redirty);
@@ -2597,13 +2601,18 @@ void cancel_dirty_page(struct page *page)
 	struct address_space *mapping = page_mapping(page);
 
 	if (mapping_cap_account_dirty(mapping)) {
+		struct inode *inode = mapping->host;
+		struct bdi_writeback *wb;
 		struct mem_cgroup *memcg;
+		bool locked;
 
 		memcg = mem_cgroup_begin_page_stat(page);
+		wb = unlocked_inode_to_wb_begin(inode, &locked);
 
 		if (TestClearPageDirty(page))
-			account_page_cleaned(page, mapping, memcg);
+			account_page_cleaned(page, mapping, memcg, wb);
 
+		unlocked_inode_to_wb_end(inode, locked);
 		mem_cgroup_end_page_stat(memcg);
 	} else {
 		ClearPageDirty(page);
@@ -2628,12 +2637,16 @@ EXPORT_SYMBOL(cancel_dirty_page);
 int clear_page_dirty_for_io(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
-	struct mem_cgroup *memcg;
 	int ret = 0;
 
 	BUG_ON(!PageLocked(page));
 
 	if (mapping && mapping_cap_account_dirty(mapping)) {
+		struct inode *inode = mapping->host;
+		struct bdi_writeback *wb;
+		struct mem_cgroup *memcg;
+		bool locked;
+
 		/*
 		 * Yes, Virginia, this is indeed insane.
 		 *
@@ -2670,12 +2683,14 @@ int clear_page_dirty_for_io(struct page *page)
 		 * exclusion.
 		 */
 		memcg = mem_cgroup_begin_page_stat(page);
+		wb = unlocked_inode_to_wb_begin(inode, &locked);
 		if (TestClearPageDirty(page)) {
 			mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 			dec_zone_page_state(page, NR_FILE_DIRTY);
-			dec_wb_stat(inode_to_wb(mapping->host), WB_RECLAIMABLE);
+			dec_wb_stat(wb, WB_RECLAIMABLE);
 			ret = 1;
 		}
+		unlocked_inode_to_wb_end(inode, locked);
 		mem_cgroup_end_page_stat(memcg);
 		return ret;
 	}
-- 
2.4.0
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 11+ messages in thread
* [PATCH 6/9] writeback: use unlocked_inode_to_wb transaction in inode_congested()
  2015-05-28 18:50 [PATCHSET 3/3 v4 block/for-4.2/core] writeback: implement foreign cgroup inode bdi_writeback switching Tejun Heo
                   ` (4 preceding siblings ...)
  2015-05-28 18:50 ` [PATCH 5/9] writeback: implement unlocked_inode_to_wb transaction and use it for stat updates Tejun Heo
@ 2015-05-28 18:50 ` Tejun Heo
  2015-05-28 18:50 ` [PATCH 8/9] writeback: implement foreign cgroup inode bdi_writeback switching Tejun Heo
  2015-05-28 18:50 ` [PATCH 9/9] writeback: disassociate inodes from dying bdi_writebacks Tejun Heo
  7 siblings, 0 replies; 11+ messages in thread
From: Tejun Heo @ 2015-05-28 18:50 UTC (permalink / raw)
  To: axboe
  Cc: linux-kernel, jack, hch, hannes, linux-fsdevel, vgoyal, lizefan,
	cgroups, linux-mm, mhocko, clm, fengguang.wu, david, gthelen,
	khlebnikov, Tejun Heo
Similar to wb stat updates, inode_congested() accesses the associated
wb of an inode locklessly, which will break with foreign inode wb
switching.  This path updates inode_congested() to use unlocked inode
wb access transaction introduced by the previous patch.
Combined with the previous two patches, this makes all wb list and
access operations to be protected by either of inode->i_lock,
wb->list_lock, or mapping->tree_lock while wb switching is in
progress.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
---
 fs/fs-writeback.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 08f5496..25458fa 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -618,10 +618,18 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
  */
 int inode_congested(struct inode *inode, int cong_bits)
 {
-	if (inode) {
-		struct bdi_writeback *wb = inode_to_wb(inode);
-		if (wb)
-			return wb_congested(wb, cong_bits);
+	/*
+	 * Once set, ->i_wb never becomes NULL while the inode is alive.
+	 * Start transaction iff ->i_wb is visible.
+	 */
+	if (inode && inode_to_wb(inode)) {
+		struct bdi_writeback *wb;
+		bool locked, congested;
+
+		wb = unlocked_inode_to_wb_begin(inode, &locked);
+		congested = wb_congested(wb, cong_bits);
+		unlocked_inode_to_wb_end(inode, locked);
+		return congested;
 	}
 
 	return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
-- 
2.4.0
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 11+ messages in thread
* [PATCH 7/9] writeback: add lockdep annotation to inode_to_wb()
       [not found] ` <1432839057-17609-1-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
  2015-05-28 18:50   ` [PATCH 1/9] writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb() Tejun Heo
@ 2015-05-28 18:50   ` Tejun Heo
  1 sibling, 0 replies; 11+ messages in thread
From: Tejun Heo @ 2015-05-28 18:50 UTC (permalink / raw)
  To: axboe-tSWWG44O7X1aa/9Udqfwiw
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jack-AlSwsSmVLrQ,
	hch-wEGCiKHe2LqWVfeAwA7xHQ, hannes-druUgvl0LCNAfugRpC6u6w,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	vgoyal-H+wXaHxf7aLQT0dZR+AlfA, lizefan-hv44wF8Li93QT0dZR+AlfA,
	cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	mhocko-AlSwsSmVLrQ, clm-b10kYP2dOMg,
	fengguang.wu-ral2JQCrhuEAvxtiuMwx3w, david-FqsqvQoI3Ljby3iVrkZq2A,
	gthelen-hpIqsD4AKlfQT0dZR+AlfA, khlebnikov-XoJtRXgx1JseBXzfvpsJ4g,
	Tejun Heo
With the previous three patches, all operations which acquire wb from
inode are either under one of inode->i_lock, mapping->tree_lock or
wb->list_lock or protected by unlocked_inode_to_wb transaction.  This
will be depended upon by foreign inode wb switching.
This patch adds lockdep assertion to inode_to_wb() so that usages
outside the above list locks can be caught easily.  There are three
exceptions.
* locked_inode_to_wb_and_lock_list() is holding wb->list_lock but the
  wb may not be the inode's.  Ensuring that is the function's role
  after all.  Updated to deref inode->i_wb directly.
* inode_wb_stat_unlocked_begin() is usually protected by combination
  of !I_WB_SWITCH and rcu_read_lock().  Updated to deref inode->i_wb
  directly.
* inode_congested() wants to test whether inode->i_wb is set before
  starting the transaction.  Added inode_to_wb_is_valid() which tests
  inode->i_wb directly.
v5: might_lock() removed.  It annotates that the lock is grabbed w/
    irq enabled which isn't the case and triggering lockdep warning
    spuriously.
v4: might_lock() added to unlocked_inode_to_wb_begin().
v3: inode_congested() conversion added.
v2: locked_inode_to_wb_and_lock_list() was missing in the first
    version.
Signed-off-by: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Cc: Jens Axboe <axboe-tSWWG44O7X1aa/9Udqfwiw@public.gmane.org>
Cc: Jan Kara <jack-AlSwsSmVLrQ@public.gmane.org>
Cc: Wu Fengguang <fengguang.wu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Cc: Greg Thelen <gthelen-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
 fs/fs-writeback.c           |  5 +++--
 include/linux/backing-dev.h | 34 ++++++++++++++++++++++++++++++++--
 2 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 25458fa..6b99dee 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -285,7 +285,8 @@ locked_inode_to_wb_and_lock_list(struct inode *inode)
 		spin_lock(&wb->list_lock);
 		wb_put(wb);		/* not gonna deref it anymore */
 
-		if (likely(wb == inode_to_wb(inode)))
+		/* i_wb may have changed inbetween, can't use inode_to_wb() */
+		if (likely(wb == inode->i_wb))
 			return wb;	/* @inode already has ref */
 
 		spin_unlock(&wb->list_lock);
@@ -622,7 +623,7 @@ int inode_congested(struct inode *inode, int cong_bits)
 	 * Once set, ->i_wb never becomes NULL while the inode is alive.
 	 * Start transaction iff ->i_wb is visible.
 	 */
-	if (inode && inode_to_wb(inode)) {
+	if (inode && inode_to_wb_is_valid(inode)) {
 		struct bdi_writeback *wb;
 		bool locked, congested;
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 73ffa32..dfce808 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -322,13 +322,33 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
 }
 
 /**
+ * inode_to_wb_is_valid - test whether an inode has a wb associated
+ * @inode: inode of interest
+ *
+ * Returns %true if @inode has a wb associated.  May be called without any
+ * locking.
+ */
+static inline bool inode_to_wb_is_valid(struct inode *inode)
+{
+	return inode->i_wb;
+}
+
+/**
  * inode_to_wb - determine the wb of an inode
  * @inode: inode of interest
  *
- * Returns the wb @inode is currently associated with.
+ * Returns the wb @inode is currently associated with.  The caller must be
+ * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the
+ * associated wb's list_lock.
  */
 static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 {
+#ifdef CONFIG_LOCKDEP
+	WARN_ON_ONCE(debug_locks &&
+		     (!lockdep_is_held(&inode->i_lock) &&
+		      !lockdep_is_held(&inode->i_mapping->tree_lock) &&
+		      !lockdep_is_held(&inode->i_wb->list_lock)));
+#endif
 	return inode->i_wb;
 }
 
@@ -360,7 +380,12 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
 
 	if (unlikely(*lockedp))
 		spin_lock_irq(&inode->i_mapping->tree_lock);
-	return inode_to_wb(inode);
+
+	/*
+	 * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock.
+	 * inode_to_wb() will bark.  Deref directly.
+	 */
+	return inode->i_wb;
 }
 
 /**
@@ -459,6 +484,11 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
 	return &bdi->wb;
 }
 
+static inline bool inode_to_wb_is_valid(struct inode *inode)
+{
+	return true;
+}
+
 static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 {
 	return &inode_to_bdi(inode)->wb;
-- 
2.4.0
^ permalink raw reply related	[flat|nested] 11+ messages in thread
* [PATCH 8/9] writeback: implement foreign cgroup inode bdi_writeback switching
  2015-05-28 18:50 [PATCHSET 3/3 v4 block/for-4.2/core] writeback: implement foreign cgroup inode bdi_writeback switching Tejun Heo
                   ` (5 preceding siblings ...)
  2015-05-28 18:50 ` [PATCH 6/9] writeback: use unlocked_inode_to_wb transaction in inode_congested() Tejun Heo
@ 2015-05-28 18:50 ` Tejun Heo
  2015-05-28 18:50 ` [PATCH 9/9] writeback: disassociate inodes from dying bdi_writebacks Tejun Heo
  7 siblings, 0 replies; 11+ messages in thread
From: Tejun Heo @ 2015-05-28 18:50 UTC (permalink / raw)
  To: axboe
  Cc: linux-kernel, jack, hch, hannes, linux-fsdevel, vgoyal, lizefan,
	cgroups, linux-mm, mhocko, clm, fengguang.wu, david, gthelen,
	khlebnikov, Tejun Heo
As concurrent write sharing of an inode is expected to be very rare
and memcg only tracks page ownership on first-use basis severely
confining the usefulness of such sharing, cgroup writeback tracks
ownership per-inode.  While the support for concurrent write sharing
of an inode is deemed unnecessary, an inode being written to by
different cgroups at different points in time is a lot more common,
and, more importantly, charging only by first-use can too readily lead
to grossly incorrect behaviors (single foreign page can lead to
gigabytes of writeback to be incorrectly attributed).
To resolve this issue, cgroup writeback detects the majority dirtier
of an inode and transfers the ownership to it.  The previous patches
implemented the foreign condition detection mechanism and laid the
groundwork.  This patch implements the actual switching.
With the previously implemented [unlocked_]inode_to_wb_and_list_lock()
and wb stat transaction, grabbing wb->list_lock, inode->i_lock and
mapping->tree_lock gives us full exclusion against all wb operations
on the target inode.  inode_switch_wb_work_fn() grabs all the locks
and transfers the inode atomically along with its RECLAIMABLE and
WRITEBACK stats.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
---
 fs/fs-writeback.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 84 insertions(+), 2 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6b99dee..5eeb24a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -322,30 +322,112 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
 	struct inode_switch_wbs_context *isw =
 		container_of(work, struct inode_switch_wbs_context, work);
 	struct inode *inode = isw->inode;
+	struct address_space *mapping = inode->i_mapping;
+	struct bdi_writeback *old_wb = inode->i_wb;
 	struct bdi_writeback *new_wb = isw->new_wb;
+	struct radix_tree_iter iter;
+	bool switched = false;
+	void **slot;
 
 	/*
 	 * By the time control reaches here, RCU grace period has passed
 	 * since I_WB_SWITCH assertion and all wb stat update transactions
 	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
 	 * synchronizing against mapping->tree_lock.
+	 *
+	 * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
+	 * gives us exclusion against all wb related operations on @inode
+	 * including IO list manipulations and stat updates.
 	 */
+	if (old_wb < new_wb) {
+		spin_lock(&old_wb->list_lock);
+		spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+	} else {
+		spin_lock(&new_wb->list_lock);
+		spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+	}
 	spin_lock(&inode->i_lock);
+	spin_lock_irq(&mapping->tree_lock);
+
+	/*
+	 * Once I_FREEING is visible under i_lock, the eviction path owns
+	 * the inode and we shouldn't modify ->i_wb_list.
+	 */
+	if (unlikely(inode->i_state & I_FREEING))
+		goto skip_switch;
 
+	/*
+	 * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
+	 * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
+	 * pages actually under underwriteback.
+	 */
+	radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+				   PAGECACHE_TAG_DIRTY) {
+		struct page *page = radix_tree_deref_slot_protected(slot,
+							&mapping->tree_lock);
+		if (likely(page) && PageDirty(page)) {
+			__dec_wb_stat(old_wb, WB_RECLAIMABLE);
+			__inc_wb_stat(new_wb, WB_RECLAIMABLE);
+		}
+	}
+
+	radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+				   PAGECACHE_TAG_WRITEBACK) {
+		struct page *page = radix_tree_deref_slot_protected(slot,
+							&mapping->tree_lock);
+		if (likely(page)) {
+			WARN_ON_ONCE(!PageWriteback(page));
+			__dec_wb_stat(old_wb, WB_WRITEBACK);
+			__inc_wb_stat(new_wb, WB_WRITEBACK);
+		}
+	}
+
+	wb_get(new_wb);
+
+	/*
+	 * Transfer to @new_wb's IO list if necessary.  The specific list
+	 * @inode was on is ignored and the inode is put on ->b_dirty which
+	 * is always correct including from ->b_dirty_time.  The transfer
+	 * preserves @inode->dirtied_when ordering.
+	 */
+	if (!list_empty(&inode->i_wb_list)) {
+		struct inode *pos;
+
+		inode_wb_list_del_locked(inode, old_wb);
+		inode->i_wb = new_wb;
+		list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list)
+			if (time_after_eq(inode->dirtied_when,
+					  pos->dirtied_when))
+				break;
+		inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev);
+	} else {
+		inode->i_wb = new_wb;
+	}
+
+	/* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
 	inode->i_wb_frn_winner = 0;
 	inode->i_wb_frn_avg_time = 0;
 	inode->i_wb_frn_history = 0;
-
+	switched = true;
+skip_switch:
 	/*
 	 * Paired with load_acquire in unlocked_inode_to_wb_begin() and
 	 * ensures that the new wb is visible if they see !I_WB_SWITCH.
 	 */
 	smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
 
+	spin_unlock_irq(&mapping->tree_lock);
 	spin_unlock(&inode->i_lock);
+	spin_unlock(&new_wb->list_lock);
+	spin_unlock(&old_wb->list_lock);
 
-	iput(inode);
+	if (switched) {
+		wb_wakeup(new_wb);
+		wb_put(old_wb);
+	}
 	wb_put(new_wb);
+
+	iput(inode);
 	kfree(isw);
 }
 
-- 
2.4.0
^ permalink raw reply related	[flat|nested] 11+ messages in thread
* [PATCH 9/9] writeback: disassociate inodes from dying bdi_writebacks
  2015-05-28 18:50 [PATCHSET 3/3 v4 block/for-4.2/core] writeback: implement foreign cgroup inode bdi_writeback switching Tejun Heo
                   ` (6 preceding siblings ...)
  2015-05-28 18:50 ` [PATCH 8/9] writeback: implement foreign cgroup inode bdi_writeback switching Tejun Heo
@ 2015-05-28 18:50 ` Tejun Heo
  7 siblings, 0 replies; 11+ messages in thread
From: Tejun Heo @ 2015-05-28 18:50 UTC (permalink / raw)
  To: axboe
  Cc: linux-kernel, jack, hch, hannes, linux-fsdevel, vgoyal, lizefan,
	cgroups, linux-mm, mhocko, clm, fengguang.wu, david, gthelen,
	khlebnikov, Tejun Heo
For the purpose of foreign inode detection, wb's (bdi_writeback's) are
identified by the associated memcg ID.  As we create a separate wb for
each memcg, this is enough to identify the active wb's; however, when
blkcg is enabled or disabled higher up in the hierarchy, the mapping
between memcg and blkcg changes which in turn creates a new wb to
service the new mapping.  The old wb is unlinked from index and
released after all references are drained.  The foreign inode
detection logic can't detect this condition because both the old and
new wb's point to the same memcg and thus never decides to move inodes
attached to the old wb to the new one.
This patch adds logic to initiate switching immediately in
wbc_attach_and_unlock_inode() if the associated wb is dying.  We can
make the usual foreign detection logic to distinguish the different
wb's mapped to the memcg but the dying wb is never gonna be in active
service again and there's no point in tracking the usage history and
reaching the switch verdict after enough data points are collected.
It's already known that the wb has to be switched.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
---
 fs/fs-writeback.c                |  7 +++++++
 include/linux/backing-dev-defs.h | 16 ++++++++++++++++
 2 files changed, 23 insertions(+)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5eeb24a..f60de54 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -525,6 +525,13 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 
 	wb_get(wbc->wb);
 	spin_unlock(&inode->i_lock);
+
+	/*
+	 * A dying wb indicates that the memcg-blkcg mapping has changed
+	 * and a new wb is already serving the memcg.  Switch immediately.
+	 */
+	if (unlikely(wb_dying(wbc->wb)))
+		inode_switch_wbs(inode, wbc->wb_id);
 }
 
 /**
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index e047b49..a48d90e 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -219,6 +219,17 @@ static inline void wb_put(struct bdi_writeback *wb)
 		percpu_ref_put(&wb->refcnt);
 }
 
+/**
+ * wb_dying - is a wb dying?
+ * @wb: bdi_writeback of interest
+ *
+ * Returns whether @wb is unlinked and being drained.
+ */
+static inline bool wb_dying(struct bdi_writeback *wb)
+{
+	return percpu_ref_is_dying(&wb->refcnt);
+}
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static inline bool wb_tryget(struct bdi_writeback *wb)
@@ -234,6 +245,11 @@ static inline void wb_put(struct bdi_writeback *wb)
 {
 }
 
+static inline bool wb_dying(struct bdi_writeback *wb)
+{
+	return false;
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 #endif	/* __LINUX_BACKING_DEV_DEFS_H */
-- 
2.4.0
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 11+ messages in thread
end of thread, other threads:[~2015-05-28 18:50 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-05-28 18:50 [PATCHSET 3/3 v4 block/for-4.2/core] writeback: implement foreign cgroup inode bdi_writeback switching Tejun Heo
     [not found] ` <1432839057-17609-1-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
2015-05-28 18:50   ` [PATCH 1/9] writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb() Tejun Heo
2015-05-28 18:50   ` [PATCH 7/9] writeback: add lockdep annotation to inode_to_wb() Tejun Heo
2015-05-28 18:50 ` [PATCH 2/9] writeback: make writeback_control track the inode being written back Tejun Heo
2015-05-28 18:50 ` [PATCH 3/9] writeback: implement foreign cgroup inode detection Tejun Heo
2015-05-28 18:50 ` [PATCH 4/9] writeback: implement [locked_]inode_to_wb_and_lock_list() Tejun Heo
2015-05-28 18:50 ` [PATCH 5/9] writeback: implement unlocked_inode_to_wb transaction and use it for stat updates Tejun Heo
2015-05-28 18:50 ` [PATCH 6/9] writeback: use unlocked_inode_to_wb transaction in inode_congested() Tejun Heo
2015-05-28 18:50 ` [PATCH 8/9] writeback: implement foreign cgroup inode bdi_writeback switching Tejun Heo
2015-05-28 18:50 ` [PATCH 9/9] writeback: disassociate inodes from dying bdi_writebacks Tejun Heo
  -- strict thread matches above, loose matches on Subject: below --
2015-05-22 22:36 [PATCHSET 3/3 v3 block/for-4.2/core] writeback: implement foreign cgroup inode bdi_writeback switching Tejun Heo
2015-05-22 22:36 ` [PATCH 1/9] writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb() Tejun Heo
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).