public inbox for linux-xfs@vger.kernel.org
 help / color / mirror / Atom feed
From: Catherine Hoang <catherine.hoang@oracle.com>
To: linux-xfs@vger.kernel.org
Subject: [PATCH 6.6 CANDIDATE v2 10/21] xfs: allow read IO and FICLONE to run concurrently
Date: Mon,  5 Feb 2024 14:20:00 -0800	[thread overview]
Message-ID: <20240205222011.95476-11-catherine.hoang@oracle.com> (raw)
In-Reply-To: <20240205222011.95476-1-catherine.hoang@oracle.com>

commit 14a537983b228cb050ceca3a5b743d01315dc4aa upstream.

One of our VM cluster management products needs to snapshot KVM image
files so that they can be restored in case of failure. Snapshotting is
done by redirecting VM disk writes to a sidecar file and using reflink
on the disk image, specifically the FICLONE ioctl as used by
"cp --reflink". Reflink locks the source and destination files while it
operates, which means that reads from the main vm disk image are blocked,
causing the vm to stall. When an image file is heavily fragmented, the
copy process could take several minutes. Some of the vm image files have
50-100 million extent records, and duplicating that much metadata locks
the file for 30 minutes or more. Having activities suspended for such
a long time in a cluster node could result in node eviction.

Clone operations and read IO do not change any data in the source file,
so they should be able to run concurrently. Demote the exclusive locks
taken by FICLONE to shared locks to allow reads while cloning. While a
clone is in progress, writes will take the IOLOCK_EXCL, so they block
until the clone completes.

Link: https://lore.kernel.org/linux-xfs/8911B94D-DD29-4D6E-B5BC-32EAF1866245@oracle.com/
Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/xfs_file.c    | 63 +++++++++++++++++++++++++++++++++++---------
 fs/xfs/xfs_inode.c   | 17 ++++++++++++
 fs/xfs/xfs_inode.h   |  9 +++++++
 fs/xfs/xfs_reflink.c |  4 +++
 4 files changed, 80 insertions(+), 13 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 203700278ddb..e33e5e13b95f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -214,6 +214,43 @@ xfs_ilock_iocb(
 	return 0;
 }
 
+static int
+xfs_ilock_iocb_for_write(
+	struct kiocb		*iocb,
+	unsigned int		*lock_mode)
+{
+	ssize_t			ret;
+	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
+
+	ret = xfs_ilock_iocb(iocb, *lock_mode);
+	if (ret)
+		return ret;
+
+	if (*lock_mode == XFS_IOLOCK_EXCL)
+		return 0;
+	if (!xfs_iflags_test(ip, XFS_IREMAPPING))
+		return 0;
+
+	xfs_iunlock(ip, *lock_mode);
+	*lock_mode = XFS_IOLOCK_EXCL;
+	return xfs_ilock_iocb(iocb, *lock_mode);
+}
+
+static unsigned int
+xfs_ilock_for_write_fault(
+	struct xfs_inode	*ip)
+{
+	/* get a shared lock if no remapping in progress */
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	if (!xfs_iflags_test(ip, XFS_IREMAPPING))
+		return XFS_MMAPLOCK_SHARED;
+
+	/* wait for remapping to complete */
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+	return XFS_MMAPLOCK_EXCL;
+}
+
 STATIC ssize_t
 xfs_file_dio_read(
 	struct kiocb		*iocb,
@@ -551,7 +588,7 @@ xfs_file_dio_write_aligned(
 	unsigned int		iolock = XFS_IOLOCK_SHARED;
 	ssize_t			ret;
 
-	ret = xfs_ilock_iocb(iocb, iolock);
+	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
 	if (ret)
 		return ret;
 	ret = xfs_file_write_checks(iocb, from, &iolock);
@@ -618,7 +655,7 @@ xfs_file_dio_write_unaligned(
 		flags = IOMAP_DIO_FORCE_WAIT;
 	}
 
-	ret = xfs_ilock_iocb(iocb, iolock);
+	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
 	if (ret)
 		return ret;
 
@@ -1180,7 +1217,7 @@ xfs_file_remap_range(
 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
 		xfs_log_force_inode(dest);
 out_unlock:
-	xfs_iunlock2_io_mmap(src, dest);
+	xfs_iunlock2_remapping(src, dest);
 	if (ret)
 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
 	return remapped > 0 ? remapped : ret;
@@ -1328,6 +1365,7 @@ __xfs_filemap_fault(
 	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
 	vm_fault_t		ret;
+	unsigned int		lock_mode = 0;
 
 	trace_xfs_filemap_fault(ip, order, write_fault);
 
@@ -1336,25 +1374,24 @@ __xfs_filemap_fault(
 		file_update_time(vmf->vma->vm_file);
 	}
 
+	if (IS_DAX(inode) || write_fault)
+		lock_mode = xfs_ilock_for_write_fault(XFS_I(inode));
+
 	if (IS_DAX(inode)) {
 		pfn_t pfn;
 
-		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 		ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
 		if (ret & VM_FAULT_NEEDDSYNC)
 			ret = dax_finish_sync_fault(vmf, order, pfn);
-		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	} else if (write_fault) {
+		ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
 	} else {
-		if (write_fault) {
-			xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-			ret = iomap_page_mkwrite(vmf,
-					&xfs_page_mkwrite_iomap_ops);
-			xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-		} else {
-			ret = filemap_fault(vmf);
-		}
+		ret = filemap_fault(vmf);
 	}
 
+	if (lock_mode)
+		xfs_iunlock(XFS_I(inode), lock_mode);
+
 	if (write_fault)
 		sb_end_pagefault(inode->i_sb);
 	return ret;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index fb85c5c81745..f9d29acd72b9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3628,6 +3628,23 @@ xfs_iunlock2_io_mmap(
 		inode_unlock(VFS_I(ip1));
 }
 
+/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
+void
+xfs_iunlock2_remapping(
+	struct xfs_inode	*ip1,
+	struct xfs_inode	*ip2)
+{
+	xfs_iflags_clear(ip1, XFS_IREMAPPING);
+
+	if (ip1 != ip2)
+		xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
+	xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+
+	if (ip1 != ip2)
+		inode_unlock_shared(VFS_I(ip1));
+	inode_unlock(VFS_I(ip2));
+}
+
 /*
  * Reload the incore inode list for this inode.  Caller should ensure that
  * the link count cannot change, either by taking ILOCK_SHARED or otherwise
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0c5bdb91152e..3dc47937da5d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -347,6 +347,14 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
 /* Quotacheck is running but inode has not been added to quota counts. */
 #define XFS_IQUOTAUNCHECKED	(1 << 14)
 
+/*
+ * Remap in progress. Callers that wish to update file data while
+ * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake
+ * the lock in exclusive mode. Relocking the file will block until
+ * IREMAPPING is cleared.
+ */
+#define XFS_IREMAPPING		(1U << 15)
+
 /* All inode state flags related to inode reclaim. */
 #define XFS_ALL_IRECLAIM_FLAGS	(XFS_IRECLAIMABLE | \
 				 XFS_IRECLAIM | \
@@ -595,6 +603,7 @@ void xfs_end_io(struct work_struct *work);
 
 int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
 void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2);
 
 static inline bool
 xfs_inode_unlinked_incomplete(
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index eb9102453aff..658edee8381d 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1540,6 +1540,10 @@ xfs_reflink_remap_prep(
 	if (ret)
 		goto out_unlock;
 
+	xfs_iflags_set(src, XFS_IREMAPPING);
+	if (inode_in != inode_out)
+		xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
+
 	return 0;
 out_unlock:
 	xfs_iunlock2_io_mmap(src, dest);
-- 
2.39.3


  parent reply	other threads:[~2024-02-05 22:20 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-02-05 22:19 [PATCH 6.6 CANDIDATE v2 00/21] xfs backports for 6.6.y (from v6.7) Catherine Hoang
2024-02-05 22:19 ` [PATCH 6.6 CANDIDATE v2 01/21] MAINTAINERS: add Catherine as xfs maintainer for 6.6.y Catherine Hoang
2024-02-05 22:19 ` [PATCH 6.6 CANDIDATE v2 02/21] xfs: bump max fsgeom struct version Catherine Hoang
2024-02-05 22:19 ` [PATCH 6.6 CANDIDATE v2 03/21] xfs: hoist freeing of rt data fork extent mappings Catherine Hoang
2024-02-05 22:19 ` [PATCH 6.6 CANDIDATE v2 04/21] xfs: prevent rt growfs when quota is enabled Catherine Hoang
2024-02-05 22:19 ` [PATCH 6.6 CANDIDATE v2 05/21] xfs: rt stubs should return negative errnos when rt disabled Catherine Hoang
2024-02-05 22:19 ` [PATCH 6.6 CANDIDATE v2 06/21] xfs: fix units conversion error in xfs_bmap_del_extent_delay Catherine Hoang
2024-02-05 22:19 ` [PATCH 6.6 CANDIDATE v2 07/21] xfs: make sure maxlen is still congruent with prod when rounding down Catherine Hoang
2024-02-05 22:19 ` [PATCH 6.6 CANDIDATE v2 08/21] xfs: introduce protection for drop nlink Catherine Hoang
2024-02-05 22:19 ` [PATCH 6.6 CANDIDATE v2 09/21] xfs: handle nimaps=0 from xfs_bmapi_write in xfs_alloc_file_space Catherine Hoang
2024-02-05 22:20 ` Catherine Hoang [this message]
2024-02-05 22:20 ` [PATCH 6.6 CANDIDATE v2 11/21] xfs: factor out xfs_defer_pending_abort Catherine Hoang
2024-02-05 22:20 ` [PATCH 6.6 CANDIDATE v2 12/21] xfs: abort intent items when recovery intents fail Catherine Hoang
2024-02-05 22:20 ` [PATCH 6.6 CANDIDATE v2 13/21] xfs: only remap the written blocks in xfs_reflink_end_cow_extent Catherine Hoang
2024-02-05 22:20 ` [PATCH 6.6 CANDIDATE v2 14/21] xfs: up(ic_sema) if flushing data device fails Catherine Hoang
2024-02-05 22:20 ` [PATCH 6.6 CANDIDATE v2 15/21] xfs: fix internal error from AGFL exhaustion Catherine Hoang
2024-02-05 22:20 ` [PATCH 6.6 CANDIDATE v2 16/21] xfs: fix again select in kconfig XFS_ONLINE_SCRUB_STATS Catherine Hoang
2024-02-05 22:20 ` [PATCH 6.6 CANDIDATE v2 17/21] xfs: inode recovery does not validate the recovered inode Catherine Hoang
2024-02-05 22:20 ` [PATCH 6.6 CANDIDATE v2 18/21] xfs: clean up dqblk extraction Catherine Hoang
2024-02-05 22:20 ` [PATCH 6.6 CANDIDATE v2 19/21] xfs: dquot recovery does not validate the recovered dquot Catherine Hoang
2024-02-05 22:20 ` [PATCH 6.6 CANDIDATE v2 20/21] xfs: clean up FS_XFLAG_REALTIME handling in xfs_ioctl_setattr_xflags Catherine Hoang
2024-02-05 22:20 ` [PATCH 6.6 CANDIDATE v2 21/21] xfs: respect the stable writes flag on the RT device Catherine Hoang
2024-02-06  5:23 ` [PATCH 6.6 CANDIDATE v2 00/21] xfs backports for 6.6.y (from v6.7) Chandan Babu R

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240205222011.95476-11-catherine.hoang@oracle.com \
    --to=catherine.hoang@oracle.com \
    --cc=linux-xfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox