All of lore.kernel.org
 help / color / mirror / Atom feed
From: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
To: Tahsin Erdogan <tahsin-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Cc: Jan Kara <jack-AlSwsSmVLrQ@public.gmane.org>,
	Jens Axboe <axboe-tSWWG44O7X1aa/9Udqfwiw@public.gmane.org>,
	cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Theodore Ts'o <tytso-3s7WtUTddSA@public.gmane.org>,
	Nauman Rafique <nauman-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Jan Kara <jack-IBi9RG/b67k@public.gmane.org>,
	Al Viro <viro-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
Subject: [PATCH v2 block/for-linus] writeback: flush inode cgroup wb switches instead of pinning super_block
Date: Mon, 29 Feb 2016 18:28:53 -0500	[thread overview]
Message-ID: <20160229232853.GD3965@htj.duckdns.org> (raw)
In-Reply-To: <20160229204724.GV3965-piEFEHQLUPpN0TnZuCh8vA@public.gmane.org>

If cgroup writeback is in use, inodes can be scheduled for
asynchronous wb switching.  Before 5ff8eaac1636 ("writeback: keep
superblock pinned during cgroup writeback association switches"), this
could race with umount leading to super_block being destroyed while
inodes are pinned for wb switching.  5ff8eaac1636 fixed it by bumping
s_active while wb switches are in flight; however, this allowed
in-flight wb switches to make umounts asynchronous when the userland
expected synchronosity - e.g. fsck immediately following umount may
fail because the device is still busy.

This patch removes the problematic super_block pinning and instead
makes generic_shutdown_super() flush in-flight wb switches.  wb
switches are now executed on a dedicated isw_wq so that they can be
flushed and isw_nr_in_flight keeps track of the number of in-flight wb
switches so that flushing can be avoided in most cases.

v2: Move cgroup_writeback_umount() further below and add MS_ACTIVE
    check in inode_switch_wbs() as Jan an Al suggested.

Signed-off-by: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Reported-by: Tahsin Erdogan <tahsin-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Cc: Jan Kara <jack-AlSwsSmVLrQ@public.gmane.org>
Cc: Al Viro <viro-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
Link: http://lkml.kernel.org/g/CAAeU0aNCq7LGODvVGRU-oU_o-6enii5ey0p1c26D1ZzYwkDc5A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org
Fixes: 5ff8eaac1636 ("writeback: keep superblock pinned during cgroup writeback association switches")
Cc: stable-u79uwXL29TY76Z2rM5mHXA@public.gmane.org #v4.5
---
 fs/fs-writeback.c         |   54 ++++++++++++++++++++++++++++++++++------------
 fs/super.c                |    1 
 include/linux/writeback.h |    5 ++++
 3 files changed, 47 insertions(+), 13 deletions(-)

--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -223,6 +223,9 @@ static void wb_wait_for_completion(struc
 #define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
 					/* one round can affect upto 5 slots */
 
+static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
+static struct workqueue_struct *isw_wq;
+
 void __inode_attach_wb(struct inode *inode, struct page *page)
 {
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -317,7 +320,6 @@ static void inode_switch_wbs_work_fn(str
 	struct inode_switch_wbs_context *isw =
 		container_of(work, struct inode_switch_wbs_context, work);
 	struct inode *inode = isw->inode;
-	struct super_block *sb = inode->i_sb;
 	struct address_space *mapping = inode->i_mapping;
 	struct bdi_writeback *old_wb = inode->i_wb;
 	struct bdi_writeback *new_wb = isw->new_wb;
@@ -424,8 +426,9 @@ skip_switch:
 	wb_put(new_wb);
 
 	iput(inode);
-	deactivate_super(sb);
 	kfree(isw);
+
+	atomic_dec(&isw_nr_in_flight);
 }
 
 static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -435,7 +438,7 @@ static void inode_switch_wbs_rcu_fn(stru
 
 	/* needs to grab bh-unsafe locks, bounce to work item */
 	INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
-	schedule_work(&isw->work);
+	queue_work(isw_wq, &isw->work);
 }
 
 /**
@@ -471,20 +474,20 @@ static void inode_switch_wbs(struct inod
 
 	/* while holding I_WB_SWITCH, no one else can update the association */
 	spin_lock(&inode->i_lock);
-
-	if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
-	    inode_to_wb(inode) == isw->new_wb)
-		goto out_unlock;
-
-	if (!atomic_inc_not_zero(&inode->i_sb->s_active))
-		goto out_unlock;
-
+	if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+	    inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+	    inode_to_wb(inode) == isw->new_wb) {
+		spin_unlock(&inode->i_lock);
+		goto out_free;
+	}
 	inode->i_state |= I_WB_SWITCH;
 	spin_unlock(&inode->i_lock);
 
 	ihold(inode);
 	isw->inode = inode;
 
+	atomic_inc(&isw_nr_in_flight);
+
 	/*
 	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
 	 * the RCU protected stat update paths to grab the mapping's
@@ -494,8 +497,6 @@ static void inode_switch_wbs(struct inod
 	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
 	return;
 
-out_unlock:
-	spin_unlock(&inode->i_lock);
 out_free:
 	if (isw->new_wb)
 		wb_put(isw->new_wb);
@@ -847,6 +848,33 @@ restart:
 		wb_put(last_wb);
 }
 
+/**
+ * cgroup_writeback_umount - flush inode wb switches for umount
+ *
+ * This function is called when a super_block is about to be destroyed and
+ * flushes in-flight inode wb switches.  An inode wb switch goes through
+ * RCU and then workqueue, so the two need to be flushed in order to ensure
+ * that all previously scheduled switches are finished.  As wb switches are
+ * rare occurrences and synchronize_rcu() can take a while, perform
+ * flushing iff wb switches are in flight.
+ */
+void cgroup_writeback_umount(void)
+{
+	if (atomic_read(&isw_nr_in_flight)) {
+		synchronize_rcu();
+		flush_workqueue(isw_wq);
+	}
+}
+
+static int __init cgroup_writeback_init(void)
+{
+	isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+	if (!isw_wq)
+		return -ENOMEM;
+	return 0;
+}
+fs_initcall(cgroup_writeback_init);
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static struct bdi_writeback *
--- a/fs/super.c
+++ b/fs/super.c
@@ -415,6 +415,7 @@ void generic_shutdown_super(struct super
 		sb->s_flags &= ~MS_ACTIVE;
 
 		fsnotify_unmount_inodes(sb);
+		cgroup_writeback_umount();
 
 		evict_inodes(sb);
 
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -198,6 +198,7 @@ void wbc_attach_and_unlock_inode(struct
 void wbc_detach_inode(struct writeback_control *wbc);
 void wbc_account_io(struct writeback_control *wbc, struct page *page,
 		    size_t bytes);
+void cgroup_writeback_umount(void);
 
 /**
  * inode_attach_wb - associate an inode with its wb
@@ -301,6 +302,10 @@ static inline void wbc_account_io(struct
 {
 }
 
+static inline void cgroup_writeback_umount(void)
+{
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 /*

WARNING: multiple messages have this Message-ID (diff)
From: Tejun Heo <tj@kernel.org>
To: Tahsin Erdogan <tahsin@google.com>
Cc: Jan Kara <jack@suse.cz>, Jens Axboe <axboe@kernel.dk>,
	cgroups@vger.kernel.org, "Theodore Ts'o" <tytso@mit.edu>,
	Nauman Rafique <nauman@google.com>,
	linux-kernel@vger.kernel.org, Jan Kara <jack@suse.com>,
	Al Viro <viro@ZenIV.linux.org.uk>
Subject: [PATCH v2 block/for-linus] writeback: flush inode cgroup wb switches instead of pinning super_block
Date: Mon, 29 Feb 2016 18:28:53 -0500	[thread overview]
Message-ID: <20160229232853.GD3965@htj.duckdns.org> (raw)
In-Reply-To: <20160229204724.GV3965@htj.duckdns.org>

If cgroup writeback is in use, inodes can be scheduled for
asynchronous wb switching.  Before 5ff8eaac1636 ("writeback: keep
superblock pinned during cgroup writeback association switches"), this
could race with umount leading to super_block being destroyed while
inodes are pinned for wb switching.  5ff8eaac1636 fixed it by bumping
s_active while wb switches are in flight; however, this allowed
in-flight wb switches to make umounts asynchronous when the userland
expected synchronosity - e.g. fsck immediately following umount may
fail because the device is still busy.

This patch removes the problematic super_block pinning and instead
makes generic_shutdown_super() flush in-flight wb switches.  wb
switches are now executed on a dedicated isw_wq so that they can be
flushed and isw_nr_in_flight keeps track of the number of in-flight wb
switches so that flushing can be avoided in most cases.

v2: Move cgroup_writeback_umount() further below and add MS_ACTIVE
    check in inode_switch_wbs() as Jan an Al suggested.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Tahsin Erdogan <tahsin@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Link: http://lkml.kernel.org/g/CAAeU0aNCq7LGODvVGRU-oU_o-6enii5ey0p1c26D1ZzYwkDc5A@mail.gmail.com
Fixes: 5ff8eaac1636 ("writeback: keep superblock pinned during cgroup writeback association switches")
Cc: stable@vger.kernel.org #v4.5
---
 fs/fs-writeback.c         |   54 ++++++++++++++++++++++++++++++++++------------
 fs/super.c                |    1 
 include/linux/writeback.h |    5 ++++
 3 files changed, 47 insertions(+), 13 deletions(-)

--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -223,6 +223,9 @@ static void wb_wait_for_completion(struc
 #define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
 					/* one round can affect upto 5 slots */
 
+static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
+static struct workqueue_struct *isw_wq;
+
 void __inode_attach_wb(struct inode *inode, struct page *page)
 {
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -317,7 +320,6 @@ static void inode_switch_wbs_work_fn(str
 	struct inode_switch_wbs_context *isw =
 		container_of(work, struct inode_switch_wbs_context, work);
 	struct inode *inode = isw->inode;
-	struct super_block *sb = inode->i_sb;
 	struct address_space *mapping = inode->i_mapping;
 	struct bdi_writeback *old_wb = inode->i_wb;
 	struct bdi_writeback *new_wb = isw->new_wb;
@@ -424,8 +426,9 @@ skip_switch:
 	wb_put(new_wb);
 
 	iput(inode);
-	deactivate_super(sb);
 	kfree(isw);
+
+	atomic_dec(&isw_nr_in_flight);
 }
 
 static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -435,7 +438,7 @@ static void inode_switch_wbs_rcu_fn(stru
 
 	/* needs to grab bh-unsafe locks, bounce to work item */
 	INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
-	schedule_work(&isw->work);
+	queue_work(isw_wq, &isw->work);
 }
 
 /**
@@ -471,20 +474,20 @@ static void inode_switch_wbs(struct inod
 
 	/* while holding I_WB_SWITCH, no one else can update the association */
 	spin_lock(&inode->i_lock);
-
-	if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
-	    inode_to_wb(inode) == isw->new_wb)
-		goto out_unlock;
-
-	if (!atomic_inc_not_zero(&inode->i_sb->s_active))
-		goto out_unlock;
-
+	if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+	    inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+	    inode_to_wb(inode) == isw->new_wb) {
+		spin_unlock(&inode->i_lock);
+		goto out_free;
+	}
 	inode->i_state |= I_WB_SWITCH;
 	spin_unlock(&inode->i_lock);
 
 	ihold(inode);
 	isw->inode = inode;
 
+	atomic_inc(&isw_nr_in_flight);
+
 	/*
 	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
 	 * the RCU protected stat update paths to grab the mapping's
@@ -494,8 +497,6 @@ static void inode_switch_wbs(struct inod
 	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
 	return;
 
-out_unlock:
-	spin_unlock(&inode->i_lock);
 out_free:
 	if (isw->new_wb)
 		wb_put(isw->new_wb);
@@ -847,6 +848,33 @@ restart:
 		wb_put(last_wb);
 }
 
+/**
+ * cgroup_writeback_umount - flush inode wb switches for umount
+ *
+ * This function is called when a super_block is about to be destroyed and
+ * flushes in-flight inode wb switches.  An inode wb switch goes through
+ * RCU and then workqueue, so the two need to be flushed in order to ensure
+ * that all previously scheduled switches are finished.  As wb switches are
+ * rare occurrences and synchronize_rcu() can take a while, perform
+ * flushing iff wb switches are in flight.
+ */
+void cgroup_writeback_umount(void)
+{
+	if (atomic_read(&isw_nr_in_flight)) {
+		synchronize_rcu();
+		flush_workqueue(isw_wq);
+	}
+}
+
+static int __init cgroup_writeback_init(void)
+{
+	isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+	if (!isw_wq)
+		return -ENOMEM;
+	return 0;
+}
+fs_initcall(cgroup_writeback_init);
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static struct bdi_writeback *
--- a/fs/super.c
+++ b/fs/super.c
@@ -415,6 +415,7 @@ void generic_shutdown_super(struct super
 		sb->s_flags &= ~MS_ACTIVE;
 
 		fsnotify_unmount_inodes(sb);
+		cgroup_writeback_umount();
 
 		evict_inodes(sb);
 
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -198,6 +198,7 @@ void wbc_attach_and_unlock_inode(struct
 void wbc_detach_inode(struct writeback_control *wbc);
 void wbc_account_io(struct writeback_control *wbc, struct page *page,
 		    size_t bytes);
+void cgroup_writeback_umount(void);
 
 /**
  * inode_attach_wb - associate an inode with its wb
@@ -301,6 +302,10 @@ static inline void wbc_account_io(struct
 {
 }
 
+static inline void cgroup_writeback_umount(void)
+{
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 /*

  parent reply	other threads:[~2016-02-29 23:28 UTC|newest]

Thread overview: 52+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-02-12 19:32 [BUG] cgroup writeback crash Tahsin Erdogan
     [not found] ` <CAAeU0aNCq7LGODvVGRU-oU_o-6enii5ey0p1c26D1ZzYwkDc5A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2016-02-15 21:00   ` Tejun Heo
     [not found]     ` <20160215210047.GN3965-piEFEHQLUPpN0TnZuCh8vA@public.gmane.org>
2016-02-16  7:56       ` Tahsin Erdogan
     [not found]         ` <CAAeU0aNAd1Ra6LXmWwq8row4MD_BpVHiSXOwHx07m86UWREvHw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2016-02-16 18:24           ` [PATCH block/for-4.5-fixes] writeback: keep superblock pinned during cgroup writeback association switches Tejun Heo
2016-02-16 18:24             ` Tejun Heo
     [not found]             ` <20160216182457.GO3741-qYNAdHglDFBN0TnZuCh8vA@public.gmane.org>
2016-02-16 18:34               ` Jens Axboe
2016-02-16 18:34                 ` Jens Axboe
2016-02-17 20:57               ` Jan Kara
2016-02-17 20:57                 ` Jan Kara
     [not found]                 ` <20160217205721.GE14140-+0h/O2h83AeN3ZZ/Hiejyg@public.gmane.org>
2016-02-17 21:07                   ` Tejun Heo
2016-02-17 21:07                     ` Tejun Heo
2016-02-17 22:30                     ` Jan Kara
2016-02-17 22:41                       ` Tahsin Erdogan
     [not found]                         ` <CAAeU0aOvSwPbLPU0=20D1RExNj8VsbB38hUnyso2L8xNSQC0XA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2016-02-17 23:02                           ` Tejun Heo
2016-02-17 23:02                             ` Tejun Heo
2016-02-18  9:55                             ` Jan Kara
     [not found]                               ` <20160218095538.GA4338-+0h/O2h83AeN3ZZ/Hiejyg@public.gmane.org>
2016-02-18 13:00                                 ` Tejun Heo
2016-02-18 13:00                                   ` Tejun Heo
2016-02-18 13:20                                   ` Jan Kara
2016-02-19 20:18                                   ` Al Viro
2016-02-19 20:51                                     ` Tejun Heo
     [not found]                                       ` <20160219205147.GN13177-qYNAdHglDFBN0TnZuCh8vA@public.gmane.org>
2016-02-19 21:58                                         ` Al Viro
2016-02-19 21:58                                           ` Al Viro
     [not found]                                           ` <20160219215811.GA17997-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
2016-02-19 22:15                                             ` Tejun Heo
2016-02-19 22:15                                               ` Tejun Heo
2016-02-19 22:26                                               ` Al Viro
     [not found]                                                 ` <20160219222609.GC17997-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
2016-02-28 21:53                                                   ` Tejun Heo
2016-02-28 21:53                                                     ` Tejun Heo
     [not found]                             ` <20160217230231.GC6479-qYNAdHglDFBN0TnZuCh8vA@public.gmane.org>
2016-02-29 20:47                               ` [PATCH block/for-linus] writeback: flush inode cgroup wb switches instead of pinning super_block Tejun Heo
2016-02-29 20:47                                 ` Tejun Heo
     [not found]                                 ` <20160229204724.GV3965-piEFEHQLUPpN0TnZuCh8vA@public.gmane.org>
2016-02-29 20:54                                   ` Al Viro
2016-02-29 20:54                                     ` Al Viro
     [not found]                                     ` <20160229205428.GB17997-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
2016-02-29 20:58                                       ` Tejun Heo
2016-02-29 20:58                                         ` Tejun Heo
     [not found]                                         ` <20160229205837.GX3965-piEFEHQLUPpN0TnZuCh8vA@public.gmane.org>
2016-02-29 21:06                                           ` Al Viro
2016-02-29 21:06                                             ` Al Viro
     [not found]                                             ` <20160229210614.GC17997-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
2016-02-29 21:08                                               ` Tejun Heo
2016-02-29 21:08                                                 ` Tejun Heo
     [not found]                                                 ` <20160229210800.GY3965-piEFEHQLUPpN0TnZuCh8vA@public.gmane.org>
2016-02-29 21:21                                                   ` Jan Kara
2016-02-29 21:21                                                     ` Jan Kara
2016-02-29 23:28                                   ` Tejun Heo [this message]
2016-02-29 23:28                                     ` [PATCH v2 " Tejun Heo
2016-03-01  9:20                                     ` Jan Kara
2016-03-01 17:46                                     ` Jens Axboe
     [not found]                                       ` <56D5D592.2020800-tSWWG44O7X1aa/9Udqfwiw@public.gmane.org>
2016-03-01 17:50                                         ` Tejun Heo
2016-03-01 17:50                                           ` Tejun Heo
     [not found]                                           ` <CAOS58YO5vTBnM561np7gpXKGQELrT169bYqmcfvAvsquBJK5yw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2016-03-02 10:29                                             ` Jan Kara
2016-03-02 10:29                                               ` Jan Kara
2016-03-01 13:39                                 ` [PATCH " Tahsin Erdogan
2016-02-18 10:12               ` [PATCH block/for-4.5-fixes] writeback: keep superblock pinned during cgroup writeback association switches Nikolay Borisov
2016-02-18 10:12                 ` Nikolay Borisov
2016-02-18 12:57                 ` Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160229232853.GD3965@htj.duckdns.org \
    --to=tj-dgejt+ai2ygdnm+yrofe0a@public.gmane.org \
    --cc=axboe-tSWWG44O7X1aa/9Udqfwiw@public.gmane.org \
    --cc=cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=jack-AlSwsSmVLrQ@public.gmane.org \
    --cc=jack-IBi9RG/b67k@public.gmane.org \
    --cc=linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=nauman-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org \
    --cc=tahsin-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org \
    --cc=tytso-3s7WtUTddSA@public.gmane.org \
    --cc=viro-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.