All of lore.kernel.org
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org, Tejun Heo <tj@kernel.org>,
	Tahsin Erdogan <tahsin@google.com>, Jan Kara <jack@suse.cz>,
	Al Viro <viro@ZenIV.linux.org.uk>, Jens Axboe <axboe@fb.com>
Subject: [PATCH 4.4 32/74] writeback: flush inode cgroup wb switches instead of pinning super_block
Date: Mon,  7 Mar 2016 16:02:57 -0800	[thread overview]
Message-ID: <20160308000316.324754192@linuxfoundation.org> (raw)
In-Reply-To: <20160308000315.294406921@linuxfoundation.org>

4.4-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Tejun Heo <tj@kernel.org>

commit a1a0e23e49037c23ea84bc8cc146a03584d13577 upstream.

If cgroup writeback is in use, inodes can be scheduled for
asynchronous wb switching.  Before 5ff8eaac1636 ("writeback: keep
superblock pinned during cgroup writeback association switches"), this
could race with umount leading to super_block being destroyed while
inodes are pinned for wb switching.  5ff8eaac1636 fixed it by bumping
s_active while wb switches are in flight; however, this allowed
in-flight wb switches to make umounts asynchronous when the userland
expected synchronosity - e.g. fsck immediately following umount may
fail because the device is still busy.

This patch removes the problematic super_block pinning and instead
makes generic_shutdown_super() flush in-flight wb switches.  wb
switches are now executed on a dedicated isw_wq so that they can be
flushed and isw_nr_in_flight keeps track of the number of in-flight wb
switches so that flushing can be avoided in most cases.

v2: Move cgroup_writeback_umount() further below and add MS_ACTIVE
    check in inode_switch_wbs() as Jan an Al suggested.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Tahsin Erdogan <tahsin@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Link: http://lkml.kernel.org/g/CAAeU0aNCq7LGODvVGRU-oU_o-6enii5ey0p1c26D1ZzYwkDc5A@mail.gmail.com
Fixes: 5ff8eaac1636 ("writeback: keep superblock pinned during cgroup writeback association switches")
Reviewed-by: Jan Kara <jack@suse.cz>
Tested-by: Tahsin Erdogan <tahsin@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 fs/fs-writeback.c         |   54 ++++++++++++++++++++++++++++++++++------------
 fs/super.c                |    1 
 include/linux/writeback.h |    5 ++++
 3 files changed, 47 insertions(+), 13 deletions(-)

--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -223,6 +223,9 @@ static void wb_wait_for_completion(struc
 #define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
 					/* one round can affect upto 5 slots */
 
+static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
+static struct workqueue_struct *isw_wq;
+
 void __inode_attach_wb(struct inode *inode, struct page *page)
 {
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -317,7 +320,6 @@ static void inode_switch_wbs_work_fn(str
 	struct inode_switch_wbs_context *isw =
 		container_of(work, struct inode_switch_wbs_context, work);
 	struct inode *inode = isw->inode;
-	struct super_block *sb = inode->i_sb;
 	struct address_space *mapping = inode->i_mapping;
 	struct bdi_writeback *old_wb = inode->i_wb;
 	struct bdi_writeback *new_wb = isw->new_wb;
@@ -424,8 +426,9 @@ skip_switch:
 	wb_put(new_wb);
 
 	iput(inode);
-	deactivate_super(sb);
 	kfree(isw);
+
+	atomic_dec(&isw_nr_in_flight);
 }
 
 static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -435,7 +438,7 @@ static void inode_switch_wbs_rcu_fn(stru
 
 	/* needs to grab bh-unsafe locks, bounce to work item */
 	INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
-	schedule_work(&isw->work);
+	queue_work(isw_wq, &isw->work);
 }
 
 /**
@@ -471,20 +474,20 @@ static void inode_switch_wbs(struct inod
 
 	/* while holding I_WB_SWITCH, no one else can update the association */
 	spin_lock(&inode->i_lock);
-
-	if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
-	    inode_to_wb(inode) == isw->new_wb)
-		goto out_unlock;
-
-	if (!atomic_inc_not_zero(&inode->i_sb->s_active))
-		goto out_unlock;
-
+	if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+	    inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+	    inode_to_wb(inode) == isw->new_wb) {
+		spin_unlock(&inode->i_lock);
+		goto out_free;
+	}
 	inode->i_state |= I_WB_SWITCH;
 	spin_unlock(&inode->i_lock);
 
 	ihold(inode);
 	isw->inode = inode;
 
+	atomic_inc(&isw_nr_in_flight);
+
 	/*
 	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
 	 * the RCU protected stat update paths to grab the mapping's
@@ -494,8 +497,6 @@ static void inode_switch_wbs(struct inod
 	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
 	return;
 
-out_unlock:
-	spin_unlock(&inode->i_lock);
 out_free:
 	if (isw->new_wb)
 		wb_put(isw->new_wb);
@@ -849,6 +850,33 @@ restart:
 		wb_put(last_wb);
 }
 
+/**
+ * cgroup_writeback_umount - flush inode wb switches for umount
+ *
+ * This function is called when a super_block is about to be destroyed and
+ * flushes in-flight inode wb switches.  An inode wb switch goes through
+ * RCU and then workqueue, so the two need to be flushed in order to ensure
+ * that all previously scheduled switches are finished.  As wb switches are
+ * rare occurrences and synchronize_rcu() can take a while, perform
+ * flushing iff wb switches are in flight.
+ */
+void cgroup_writeback_umount(void)
+{
+	if (atomic_read(&isw_nr_in_flight)) {
+		synchronize_rcu();
+		flush_workqueue(isw_wq);
+	}
+}
+
+static int __init cgroup_writeback_init(void)
+{
+	isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+	if (!isw_wq)
+		return -ENOMEM;
+	return 0;
+}
+fs_initcall(cgroup_writeback_init);
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static struct bdi_writeback *
--- a/fs/super.c
+++ b/fs/super.c
@@ -415,6 +415,7 @@ void generic_shutdown_super(struct super
 		sb->s_flags &= ~MS_ACTIVE;
 
 		fsnotify_unmount_inodes(sb);
+		cgroup_writeback_umount();
 
 		evict_inodes(sb);
 
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -198,6 +198,7 @@ void wbc_attach_and_unlock_inode(struct
 void wbc_detach_inode(struct writeback_control *wbc);
 void wbc_account_io(struct writeback_control *wbc, struct page *page,
 		    size_t bytes);
+void cgroup_writeback_umount(void);
 
 /**
  * inode_attach_wb - associate an inode with its wb
@@ -301,6 +302,10 @@ static inline void wbc_account_io(struct
 {
 }
 
+static inline void cgroup_writeback_umount(void)
+{
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 /*

  parent reply	other threads:[~2016-03-08  0:18 UTC|newest]

Thread overview: 81+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-03-08  0:02 [PATCH 4.4 00/74] 4.4.5-stable review Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 01/74] use ->d_seq to get coherency between ->d_inode and ->d_flags Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 02/74] drivers: sh: Restore legacy clock domain on SuperH platforms Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 03/74] Btrfs: fix deadlock running delayed iputs at transaction commit time Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 04/74] btrfs: Fix no_space in write and rm loop Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 05/74] btrfs: async-thread: Fix a use-after-free error for trace Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 07/74] block: Initialize max_dev_sectors to 0 Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 08/74] PCI: keystone: Fix MSI code that retrieves struct pcie_port pointer Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 09/74] parisc: Fix ptrace syscall number and return value modification Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 10/74] mips/kvm: fix ioctl error handling Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 11/74] kvm: x86: Update tsc multiplier on change Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 12/74] fbcon: set a default value to blink interval Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 13/74] cifs: fix out-of-bounds access in lease parsing Greg Kroah-Hartman
2016-03-09  3:47   ` Ben Hutchings
2016-03-09  4:23     ` Steve French
2016-03-09 16:17       ` Ben Hutchings
2016-03-08  0:02 ` [PATCH 4.4 14/74] CIFS: Fix SMB2+ interim response processing for read requests Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 15/74] Fix cifs_uniqueid_to_ino_t() function for s390x Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 16/74] vfio: fix ioctl error handling Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 17/74] KVM: x86: fix root cause for missed hardware breakpoints Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 18/74] arm/arm64: KVM: Fix ioctl error handling Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 19/74] iommu/amd: Apply workaround for ATS write permission check Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 20/74] iommu/amd: Fix boot warning when device 00:00.0 is not iommu covered Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 21/74] iommu/vt-d: Use BUS_NOTIFY_REMOVED_DEVICE in hotplug path Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 22/74] target: Fix WRITE_SAME/DISCARD conversion to linux 512b sectors Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 23/74] drm/ast: Fix incorrect register check for DRAM width Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 28/74] drm/amdgpu: return from atombios_dp_get_dpcd only when error Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 29/74] libata: fix HDIO_GET_32BIT ioctl Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 30/74] libata: Align ata_devices id on a cacheline Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 31/74] block: bio: introduce helpers to get the 1st and last bvec Greg Kroah-Hartman
2016-03-08  0:02 ` Greg Kroah-Hartman [this message]
2016-03-08  0:02 ` [PATCH 4.4 33/74] Adding Intel Lewisburg device IDs for SATA Greg Kroah-Hartman
2016-03-08  0:02 ` [PATCH 4.4 34/74] arm64: vmemmap: use virtual projection of linear region Greg Kroah-Hartman
2016-03-08 10:40   ` Ard Biesheuvel
2016-03-08 13:44     ` Greg Kroah-Hartman
2016-03-08 13:45       ` Ard Biesheuvel
2016-03-12  1:51         ` Ard Biesheuvel
2016-03-12  5:50           ` Greg Kroah-Hartman
2016-03-12  5:55             ` Ard Biesheuvel
2016-03-12  6:05               ` Greg Kroah-Hartman
2016-03-12  8:14                 ` Ard Biesheuvel
2016-03-08  0:03 ` [PATCH 4.4 35/74] PM / sleep / x86: Fix crash on graph trace through x86 suspend Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 36/74] ata: ahci: dont mark HotPlugCapable Ports as external/removable Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 37/74] tracing: Do not have comm filter override event comm field Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 38/74] pata-rb532-cf: get rid of the irq_to_gpio() call Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 39/74] Btrfs: fix loading of orphan roots leading to BUG_ON Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 40/74] Revert "jffs2: Fix lock acquisition order bug in jffs2_write_begin" Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 43/74] dmaengine: pxa_dma: fix cyclic transfers Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 44/74] [media] adv7604: fix tx 5v detect regression Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 45/74] ALSA: usb-audio: Add a quirk for Plantronics DA45 Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 46/74] ALSA: ctl: Fix ioctls for X32 ABI Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 47/74] ALSA: hda - Fix mic issues on Acer Aspire E1-472 Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 48/74] ALSA: rawmidi: Fix ioctls X32 ABI Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 49/74] ALSA: timer: Fix ioctls for " Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 50/74] ALSA: pcm: " Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 51/74] ALSA: seq: oss: Dont drain at closing a client Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 52/74] ALSA: hdspm: Fix wrong boolean ctl value accesses Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 53/74] ALSA: hdsp: " Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 54/74] ALSA: hdspm: Fix zero-division Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 55/74] ALSA: timer: Fix broken compat timer user status ioctl Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 56/74] usb: chipidea: otg: change workqueue ci_otg as freezable Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 57/74] USB: cp210x: Add ID for Parrot NMEA GPS Flight Recorder Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 60/74] USB: serial: option: add support for Telit LE922 PID 0x1045 Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 61/74] USB: serial: option: add support for Quectel UC20 Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 62/74] MIPS: scache: Fix scache init with invalid line size Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 63/74] MIPS: traps: Fix SIGFPE information leak from `do_ov and `do_trap_or_bp Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 64/74] cxl: Fix PSL timebase synchronization detection Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 65/74] ubi: Fix out of bounds write in volume update code Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 66/74] i2c: brcmstb: allocate correct amount of memory for regmap Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 67/74] thermal: cpu_cooling: fix out of bounds access in time_in_idle Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 69/74] block: check virt boundary in bio_will_gap() Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 70/74] block: get the 1st and last bvec via helpers Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 71/74] drm/i915: more virtual south bridge detection Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 73/74] modules: fix longstanding /proc/kallsyms vs module insertion race Greg Kroah-Hartman
2016-03-08  0:03 ` [PATCH 4.4 74/74] drm/amdgpu: fix topaz/tonga gmc assignment in 4.4 stable Greg Kroah-Hartman
2016-03-08 11:45 ` [PATCH 4.4 00/74] 4.4.5-stable review Guenter Roeck
2016-03-08 14:19   ` Greg Kroah-Hartman
     [not found] ` <56dea53c.a3f6c20a.71577.ffff9660@mx.google.com>
2016-03-08 14:34   ` Greg Kroah-Hartman
2016-03-09  5:32     ` Kevin Hilman
2016-03-08 16:24 ` Shuah Khan
2016-03-09  2:07   ` Greg Kroah-Hartman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160308000316.324754192@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=axboe@fb.com \
    --cc=jack@suse.cz \
    --cc=linux-kernel@vger.kernel.org \
    --cc=stable@vger.kernel.org \
    --cc=tahsin@google.com \
    --cc=tj@kernel.org \
    --cc=viro@ZenIV.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.