* fall back from direct to buffered I/O when stable writes are required
@ 2025-10-29 7:15 Christoph Hellwig
2025-10-29 7:15 ` [PATCH 1/4] fs: replace FOP_DIO_PARALLEL_WRITE with a fmode bits Christoph Hellwig
` (5 more replies)
0 siblings, 6 replies; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-29 7:15 UTC (permalink / raw)
To: Carlos Maiolino, Christian Brauner
Cc: Jan Kara, Martin K. Petersen, linux-kernel, linux-xfs,
linux-fsdevel, linux-raid, linux-block
Hi all,
we've had a long-standing issue that direct I/O to and from devices that
require stable writes can corrupt data, because the user memory can be
modified while the I/O is in flight. This series tries to address this by
falling back to uncached buffered I/O. Given that this requires an extra
copy it is usually going to be a slowdown, especially for very high
bandwidth use cases, so I'm not exactly happy about it.
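For illustration, a minimal userspace sketch of the kind of access
pattern that triggers this (the file path is made up, error handling is
omitted, and this is not part of the series):

#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define BUF_SIZE	4096

static char *buf;

/* keeps modifying the buffer, possibly while a write is in flight */
static void *scribbler(void *arg)
{
	for (;;)
		memset(buf, rand() & 0xff, BUF_SIZE);
	return NULL;
}

int main(void)
{
	/* assumed: a file on a filesystem backed by a stable-writes device */
	int fd = open("/mnt/test/file", O_RDWR | O_CREAT | O_DIRECT, 0644);
	pthread_t t;
	int i;

	posix_memalign((void **)&buf, BUF_SIZE, BUF_SIZE);
	memset(buf, 0xaa, BUF_SIZE);
	pthread_create(&t, NULL, scribbler, NULL);

	/*
	 * The driver below (e.g. MD RAID5 or a checksumming target) may
	 * read the user pages more than once; concurrent modification
	 * means the checksum or parity it computes no longer matches the
	 * data that ends up on the media.
	 */
	for (i = 0; i < 1000; i++)
		pwrite(fd, buf, BUF_SIZE, 0);

	close(fd);
	return 0;
}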
I suspect we need a way to opt out of this for applications that know
what they are doing, and I can think of a few ways to do that:
1a) Allow a mount option to override the behavior
This allows the sysadmin to get back to the previous state.
This is fairly easy to implement, but the scope might be too wide.
1b) Sysfs attribute
Same as above. Slightly easier to modify, but a more unusual
interface.
2) Have a per-inode attribute
Allows setting it on a specific file. Would require an on-disk
format change for the usual attr options.
3) Have an fcntl or similar to allow an application to override it
Fine granularity. Requires application changes. We might not
want to allow arbitrary applications to force this, as it could be
used to inject corruption.
In other words, they are all kinda horrible.
Diffstat:
fs/ext4/file.c | 2 -
fs/xfs/xfs_file.c | 59 +++++++++++++++++++++++++++++++++++++++++++---------
fs/xfs/xfs_iops.c | 6 +++++
include/linux/fs.h | 11 +++++----
io_uring/io_uring.c | 2 -
5 files changed, 63 insertions(+), 17 deletions(-)
* [PATCH 1/4] fs: replace FOP_DIO_PARALLEL_WRITE with a fmode bits
2025-10-29 7:15 fall back from direct to buffered I/O when stable writes are required Christoph Hellwig
@ 2025-10-29 7:15 ` Christoph Hellwig
2025-10-29 16:01 ` Darrick J. Wong
` (2 more replies)
2025-10-29 7:15 ` [PATCH 2/4] fs: return writeback errors for IOCB_DONTCACHE in generic_write_sync Christoph Hellwig
` (4 subsequent siblings)
5 siblings, 3 replies; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-29 7:15 UTC (permalink / raw)
To: Carlos Maiolino, Christian Brauner
Cc: Jan Kara, Martin K. Petersen, linux-kernel, linux-xfs,
linux-fsdevel, linux-raid, linux-block
To properly handle the direct to buffered I/O fallback for devices that
require stable writes, we need to be able to set the DIO_PARALLEL_WRITE
flag on a per-file basis and not statically for a given file_operations
instance.
This effectively reverts a part of 210a03c9d51a ("fs: claw back a few
FMODE_* bits").
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/ext4/file.c | 2 +-
fs/xfs/xfs_file.c | 4 ++--
include/linux/fs.h | 7 ++-----
io_uring/io_uring.c | 2 +-
4 files changed, 6 insertions(+), 9 deletions(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7a8b30932189..b484e98b9c78 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -924,6 +924,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
+ filp->f_mode |= FMODE_DIO_PARALLEL_WRITE;
return dquot_file_open(inode, filp);
}
@@ -978,7 +979,6 @@ const struct file_operations ext4_file_operations = {
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
.fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
- FOP_DIO_PARALLEL_WRITE |
FOP_DONTCACHE,
};
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 2702fef2c90c..5703b6681b1d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1553,6 +1553,7 @@ xfs_file_open(
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
+ file->f_mode |= FMODE_DIO_PARALLEL_WRITE;
if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
return generic_file_open(inode, file);
@@ -1951,8 +1952,7 @@ const struct file_operations xfs_file_operations = {
.fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
.fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
- FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
- FOP_DONTCACHE,
+ FOP_BUFFER_WASYNC | FOP_DONTCACHE,
};
const struct file_operations xfs_dir_file_operations = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..09b47effc55e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -128,9 +128,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
#define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6))
/* File supports atomic writes */
#define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7))
-
-/* FMODE_* bit 8 */
-
+/* Supports non-exclusive O_DIRECT writes from multiple threads */
+#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)(1 << 8))
/* 32bit hashes as llseek() offset (for directories) */
#define FMODE_32BITHASH ((__force fmode_t)(1 << 9))
/* 64bit hashes as llseek() offset (for directories) */
@@ -2317,8 +2316,6 @@ struct file_operations {
#define FOP_BUFFER_WASYNC ((__force fop_flags_t)(1 << 1))
/* Supports synchronous page faults for mappings */
#define FOP_MMAP_SYNC ((__force fop_flags_t)(1 << 2))
-/* Supports non-exclusive O_DIRECT writes from multiple threads */
-#define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3))
/* Contains huge pages */
#define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4))
/* Treat loff_t as unsigned (e.g., /dev/mem) */
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 296667ba712c..668937da27e8 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -469,7 +469,7 @@ static void io_prep_async_work(struct io_kiocb *req)
/* don't serialize this request if the fs doesn't need it */
if (should_hash && (req->file->f_flags & O_DIRECT) &&
- (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE))
+ (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE))
should_hash = false;
if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
io_wq_hash_work(&req->work, file_inode(req->file));
--
2.47.3
* [PATCH 2/4] fs: return writeback errors for IOCB_DONTCACHE in generic_write_sync
2025-10-29 7:15 fall back from direct to buffered I/O when stable writes are required Christoph Hellwig
2025-10-29 7:15 ` [PATCH 1/4] fs: replace FOP_DIO_PARALLEL_WRITE with a fmode bits Christoph Hellwig
@ 2025-10-29 7:15 ` Christoph Hellwig
2025-10-29 16:01 ` Darrick J. Wong
2025-10-29 7:15 ` [PATCH 3/4] xfs: use IOCB_DONTCACHE when falling back to buffered writes Christoph Hellwig
` (3 subsequent siblings)
5 siblings, 1 reply; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-29 7:15 UTC (permalink / raw)
To: Carlos Maiolino, Christian Brauner
Cc: Jan Kara, Martin K. Petersen, linux-kernel, linux-xfs,
linux-fsdevel, linux-raid, linux-block
Currently generic_write_sync only kicks off writeback for IOCB_DONTCACHE
writes, but never looks at the writeback errors. When using
IOCB_DONTCACHE as a fallback for IOCB_DIRECT for devcies that require
stable writes, this breaks a few xfstests test cases that expect instant
errors like removed devices to be directly propagated to the writer.
While I don't know of real applications that would expect this, trying to
keep the behavior as similar as possible sounds useful and can be
trivially done by checking for and returning writeback errors in
generic_write_sync.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
include/linux/fs.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 09b47effc55e..34a843cf4c1c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3047,9 +3047,13 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
return ret;
} else if (iocb->ki_flags & IOCB_DONTCACHE) {
struct address_space *mapping = iocb->ki_filp->f_mapping;
+ int err;
filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count,
iocb->ki_pos - 1);
+ err = file_check_and_advance_wb_err(iocb->ki_filp);
+ if (err)
+ return err;
}
return count;
--
2.47.3
* [PATCH 3/4] xfs: use IOCB_DONTCACHE when falling back to buffered writes
2025-10-29 7:15 fall back from direct to buffered I/O when stable writes are required Christoph Hellwig
2025-10-29 7:15 ` [PATCH 1/4] fs: replace FOP_DIO_PARALLEL_WRITE with a fmode bits Christoph Hellwig
2025-10-29 7:15 ` [PATCH 2/4] fs: return writeback errors for IOCB_DONTCACHE in generic_write_sync Christoph Hellwig
@ 2025-10-29 7:15 ` Christoph Hellwig
2025-10-29 15:57 ` Darrick J. Wong
2025-11-04 12:33 ` Nirjhar Roy (IBM)
2025-10-29 7:15 ` [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required Christoph Hellwig
` (2 subsequent siblings)
5 siblings, 2 replies; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-29 7:15 UTC (permalink / raw)
To: Carlos Maiolino, Christian Brauner
Cc: Jan Kara, Martin K. Petersen, linux-kernel, linux-xfs,
linux-fsdevel, linux-raid, linux-block
Doing sub-block direct writes to COW inodes is not supported by XFS,
because new blocks need to be allocated as a whole. Such writes
fall back to buffered I/O, and really should be using IOCB_DONTCACHE,
which didn't exist when the code was added, to mimic direct I/O
semantics as closely as possible. Also clear the IOCB_DIRECT flag so
that later code can't get confused by it being set for something that
at this point is no longer a direct I/O operation.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_file.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5703b6681b1d..e09ae86e118e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1119,6 +1119,9 @@ xfs_file_write_iter(
ret = xfs_file_dio_write(iocb, from);
if (ret != -ENOTBLK)
return ret;
+
+ iocb->ki_flags &= ~IOCB_DIRECT;
+ iocb->ki_flags |= IOCB_DONTCACHE;
}
if (xfs_is_zoned_inode(ip))
--
2.47.3
* [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required
2025-10-29 7:15 fall back from direct to buffered I/O when stable writes are required Christoph Hellwig
` (2 preceding siblings ...)
2025-10-29 7:15 ` [PATCH 3/4] xfs: use IOCB_DONTCACHE when falling back to buffered writes Christoph Hellwig
@ 2025-10-29 7:15 ` Christoph Hellwig
2025-10-29 15:53 ` Darrick J. Wong
2025-11-10 13:38 ` Nirjhar Roy (IBM)
2025-10-29 15:58 ` fall back from direct to buffered " Bart Van Assche
2025-10-30 11:20 ` Dave Chinner
5 siblings, 2 replies; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-29 7:15 UTC (permalink / raw)
To: Carlos Maiolino, Christian Brauner
Cc: Jan Kara, Martin K. Petersen, linux-kernel, linux-xfs,
linux-fsdevel, linux-raid, linux-block
Inodes can be marked as requiring stable writes, which is a setting
usually inherited from block devices that require stable writes. Block
devices require stable writes when the drivers have to sample the data
more than once, e.g. to calculate a checksum or parity in one pass, and
then send the data on to a hardware device, and modifying the data
in-flight can lead to inconsistent checksums or parity.
For buffered I/O, the writeback code implements this by not allowing
modifications while folios are marked as under writeback, but for
direct I/O, the kernel currently does not have any way to prevent the
user application from modifying the in-flight memory, so modifications
can easily corrupt data despite the block driver setting the stable
write flag. Even worse, corruption can happen on reads as well,
where concurrent modifications can cause checksum mismatches, or
failures to rebuild parity. One application known to trigger this
behavior is Qemu when running Windows VMs, but there might be many
others as well. xfstests can also hit this behavior, not only in the
test specifically crafted for it (generic/761), but also in various
other tests that mostly stress races between different I/O modes, with
generic/095 being the most trivial and easiest to hit.
Fix XFS to fall back to uncached buffered I/O when the block device
requires stable writes, closing these races.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_file.c | 54 +++++++++++++++++++++++++++++++++++++++--------
fs/xfs/xfs_iops.c | 6 ++++++
2 files changed, 51 insertions(+), 9 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e09ae86e118e..0668af07966a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -230,6 +230,12 @@ xfs_file_dio_read(
struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
ssize_t ret;
+ if (mapping_stable_writes(iocb->ki_filp->f_mapping)) {
+ xfs_info_once(ip->i_mount,
+ "falling back from direct to buffered I/O for read");
+ return -ENOTBLK;
+ }
+
trace_xfs_file_direct_read(iocb, to);
if (!iov_iter_count(to))
@@ -302,13 +308,22 @@ xfs_file_read_iter(
if (xfs_is_shutdown(mp))
return -EIO;
- if (IS_DAX(inode))
+ if (IS_DAX(inode)) {
ret = xfs_file_dax_read(iocb, to);
- else if (iocb->ki_flags & IOCB_DIRECT)
+ goto done;
+ }
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
ret = xfs_file_dio_read(iocb, to);
- else
- ret = xfs_file_buffered_read(iocb, to);
+ if (ret != -ENOTBLK)
+ goto done;
+
+ iocb->ki_flags &= ~IOCB_DIRECT;
+ iocb->ki_flags |= IOCB_DONTCACHE;
+ }
+ ret = xfs_file_buffered_read(iocb, to);
+done:
if (ret > 0)
XFS_STATS_ADD(mp, xs_read_bytes, ret);
return ret;
@@ -883,6 +898,7 @@ xfs_file_dio_write(
struct iov_iter *from)
{
struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+ struct xfs_mount *mp = ip->i_mount;
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
size_t count = iov_iter_count(from);
@@ -890,15 +906,21 @@ xfs_file_dio_write(
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
+ if (mapping_stable_writes(iocb->ki_filp->f_mapping)) {
+ xfs_info_once(mp,
+ "falling back from direct to buffered I/O for write");
+ return -ENOTBLK;
+ }
+
/*
* For always COW inodes we also must check the alignment of each
* individual iovec segment, as they could end up with different
* I/Os due to the way bio_iov_iter_get_pages works, and we'd
* then overwrite an already written block.
*/
- if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
+ if (((iocb->ki_pos | count) & mp->m_blockmask) ||
(xfs_is_always_cow_inode(ip) &&
- (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
+ (iov_iter_alignment(from) & mp->m_blockmask)))
return xfs_file_dio_write_unaligned(ip, iocb, from);
if (xfs_is_zoned_inode(ip))
return xfs_file_dio_write_zoned(ip, iocb, from);
@@ -1555,10 +1577,24 @@ xfs_file_open(
{
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
+
+ /*
+ * If the underlying devices requires stable writes, we have to fall
+ * back to (uncached) buffered I/O for direct I/O reads and writes, as
+ * the kernel can't prevent applications from modifying the memory under
+ * I/O. We still claim to support O_DIRECT as we want opens for that to
+ * succeed and fall back.
+ *
+ * As atomic writes are only supported for direct I/O, they can't be
+ * supported either in this case.
+ */
file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
- file->f_mode |= FMODE_DIO_PARALLEL_WRITE;
- if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
- file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+ if (!mapping_stable_writes(file->f_mapping)) {
+ file->f_mode |= FMODE_DIO_PARALLEL_WRITE;
+ if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
+ file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+ }
+
return generic_file_open(inode, file);
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index caff0125faea..bd49ac6b31de 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -672,6 +672,12 @@ xfs_report_atomic_write(
struct xfs_inode *ip,
struct kstat *stat)
{
+ /*
+ * If the stable writes flag is set, we have to fall back to buffered
+ * I/O, which doesn't support atomic writes.
+ */
+ if (mapping_stable_writes(VFS_I(ip)->i_mapping))
+ return;
generic_fill_statx_atomic_writes(stat,
xfs_get_atomic_write_min(ip),
xfs_get_atomic_write_max(ip),
--
2.47.3
* Re: [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required
2025-10-29 7:15 ` [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required Christoph Hellwig
@ 2025-10-29 15:53 ` Darrick J. Wong
2025-10-29 16:35 ` Christoph Hellwig
2025-11-10 13:38 ` Nirjhar Roy (IBM)
1 sibling, 1 reply; 53+ messages in thread
From: Darrick J. Wong @ 2025-10-29 15:53 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Christian Brauner, Jan Kara, Martin K. Petersen,
linux-kernel, linux-xfs, linux-fsdevel, linux-raid, linux-block
On Wed, Oct 29, 2025 at 08:15:05AM +0100, Christoph Hellwig wrote:
> Inodes can be marked as requiring stable writes, which is a setting
> usually inherited from block devices that require stable writes. Block
> devices require stable writes when the drivers have to sample the data
> more than once, e.g. to calculate a checksum or parity in one pass, and
> then send the data on to a hardware device, and modifying the data
> in-flight can lead to inconsistent checksums or parity.
>
> For buffered I/O, the writeback code implements this by not allowing
> modifications while folios are marked as under writeback, but for
> direct I/O, the kernel currently does not have any way to prevent the
> user application from modifying the in-flight memory, so modifications
> can easily corrupt data despite the block driver setting the stable
> write flag. Even worse, corruption can happen on reads as well,
> where concurrent modifications can cause checksum mismatches, or
> failures to rebuild parity. One application known to trigger this
> behavior is Qemu when running Windows VMs, but there might be many
> others as well. xfstests can also hit this behavior, not only in the
> specifically crafted patch for this (generic/761), but also in
> various other tests that mostly stress races between different I/O
> modes, which generic/095 being the most trivial and easy to hit
> one.
>
> Fix XFS to fall back to uncached buffered I/O when the block device
> requires stable writes to fix these races.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_file.c | 54 +++++++++++++++++++++++++++++++++++++++--------
> fs/xfs/xfs_iops.c | 6 ++++++
> 2 files changed, 51 insertions(+), 9 deletions(-)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index e09ae86e118e..0668af07966a 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -230,6 +230,12 @@ xfs_file_dio_read(
> struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
> ssize_t ret;
>
> + if (mapping_stable_writes(iocb->ki_filp->f_mapping)) {
> + xfs_info_once(ip->i_mount,
> + "falling back from direct to buffered I/O for read");
> + return -ENOTBLK;
> + }
> +
> trace_xfs_file_direct_read(iocb, to);
>
> if (!iov_iter_count(to))
> @@ -302,13 +308,22 @@ xfs_file_read_iter(
> if (xfs_is_shutdown(mp))
> return -EIO;
>
> - if (IS_DAX(inode))
> + if (IS_DAX(inode)) {
> ret = xfs_file_dax_read(iocb, to);
> - else if (iocb->ki_flags & IOCB_DIRECT)
> + goto done;
> + }
> +
> + if (iocb->ki_flags & IOCB_DIRECT) {
> ret = xfs_file_dio_read(iocb, to);
> - else
> - ret = xfs_file_buffered_read(iocb, to);
> + if (ret != -ENOTBLK)
> + goto done;
> +
> + iocb->ki_flags &= ~IOCB_DIRECT;
> + iocb->ki_flags |= IOCB_DONTCACHE;
> + }
>
> + ret = xfs_file_buffered_read(iocb, to);
> +done:
> if (ret > 0)
> XFS_STATS_ADD(mp, xs_read_bytes, ret);
> return ret;
> @@ -883,6 +898,7 @@ xfs_file_dio_write(
> struct iov_iter *from)
> {
> struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
> + struct xfs_mount *mp = ip->i_mount;
> struct xfs_buftarg *target = xfs_inode_buftarg(ip);
> size_t count = iov_iter_count(from);
>
> @@ -890,15 +906,21 @@ xfs_file_dio_write(
> if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
> return -EINVAL;
>
> + if (mapping_stable_writes(iocb->ki_filp->f_mapping)) {
> + xfs_info_once(mp,
> + "falling back from direct to buffered I/O for write");
> + return -ENOTBLK;
> + }
/me wonders if the other filesystems will have to implement this same
fallback and hence this should be a common helper ala
dio_warn_stale_pagecache? But we'll get there when we get there.
> +
> /*
> * For always COW inodes we also must check the alignment of each
> * individual iovec segment, as they could end up with different
> * I/Os due to the way bio_iov_iter_get_pages works, and we'd
> * then overwrite an already written block.
> */
> - if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
> + if (((iocb->ki_pos | count) & mp->m_blockmask) ||
> (xfs_is_always_cow_inode(ip) &&
> - (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
> + (iov_iter_alignment(from) & mp->m_blockmask)))
> return xfs_file_dio_write_unaligned(ip, iocb, from);
> if (xfs_is_zoned_inode(ip))
> return xfs_file_dio_write_zoned(ip, iocb, from);
> @@ -1555,10 +1577,24 @@ xfs_file_open(
> {
> if (xfs_is_shutdown(XFS_M(inode->i_sb)))
> return -EIO;
> +
> + /*
> + * If the underlying devices requires stable writes, we have to fall
> + * back to (uncached) buffered I/O for direct I/O reads and writes, as
> + * the kernel can't prevent applications from modifying the memory under
> + * I/O. We still claim to support O_DIRECT as we want opens for that to
> + * succeed and fall back.
> + *
> + * As atomic writes are only supported for direct I/O, they can't be
> + * supported either in this case.
> + */
> file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
> - file->f_mode |= FMODE_DIO_PARALLEL_WRITE;
> - if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
> - file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
> + if (!mapping_stable_writes(file->f_mapping)) {
> + file->f_mode |= FMODE_DIO_PARALLEL_WRITE;
Hrm. So parallel directio writes are disabled for writes to files on
stable_pages devices because we have to fall back to buffered writes.
Those serialize on i_rwsem so that's why we don't set
FMODE_DIO_PARALLEL_WRITE, correct? There's not some more subtle reason
for not supporting it, right?
If the answers are {yes, yes} then I've understood this well enough for
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> + if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
> + file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
> + }
> +
> return generic_file_open(inode, file);
> }
>
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index caff0125faea..bd49ac6b31de 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -672,6 +672,12 @@ xfs_report_atomic_write(
> struct xfs_inode *ip,
> struct kstat *stat)
> {
> + /*
> + * If the stable writes flag is set, we have to fall back to buffered
> + * I/O, which doesn't support atomic writes.
> + */
> + if (mapping_stable_writes(VFS_I(ip)->i_mapping))
> + return;
> generic_fill_statx_atomic_writes(stat,
> xfs_get_atomic_write_min(ip),
> xfs_get_atomic_write_max(ip),
> --
> 2.47.3
>
>
* Re: [PATCH 3/4] xfs: use IOCB_DONTCACHE when falling back to buffered writes
2025-10-29 7:15 ` [PATCH 3/4] xfs: use IOCB_DONTCACHE when falling back to buffered writes Christoph Hellwig
@ 2025-10-29 15:57 ` Darrick J. Wong
2025-11-04 12:33 ` Nirjhar Roy (IBM)
1 sibling, 0 replies; 53+ messages in thread
From: Darrick J. Wong @ 2025-10-29 15:57 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Christian Brauner, Jan Kara, Martin K. Petersen,
linux-kernel, linux-xfs, linux-fsdevel, linux-raid, linux-block
On Wed, Oct 29, 2025 at 08:15:04AM +0100, Christoph Hellwig wrote:
> Doing sub-block direct writes to COW inodes is not supported by XFS,
> because new blocks need to be allocated as a whole. Such writes
> fall back to buffered I/O, and really should be using the
> IOCB_DONTCACHE that didn't exist when the code was added to mimic
> direct I/O semantics as closely as possible. Also clear the
> IOCB_DIRECT flags so that later code can't get confused by it being
> set for something that at this point is not a direct I/O operation
> any more.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Seems like a reasonable fallback now that we have dontcache
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_file.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 5703b6681b1d..e09ae86e118e 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1119,6 +1119,9 @@ xfs_file_write_iter(
> ret = xfs_file_dio_write(iocb, from);
> if (ret != -ENOTBLK)
> return ret;
> +
> + iocb->ki_flags &= ~IOCB_DIRECT;
> + iocb->ki_flags |= IOCB_DONTCACHE;
> }
>
> if (xfs_is_zoned_inode(ip))
> --
> 2.47.3
>
>
* Re: fall back from direct to buffered I/O when stable writes are required
2025-10-29 7:15 fall back from direct to buffered I/O when stable writes are required Christoph Hellwig
` (3 preceding siblings ...)
2025-10-29 7:15 ` [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required Christoph Hellwig
@ 2025-10-29 15:58 ` Bart Van Assche
2025-10-29 16:14 ` Darrick J. Wong
2025-10-29 16:33 ` Christoph Hellwig
2025-10-30 11:20 ` Dave Chinner
5 siblings, 2 replies; 53+ messages in thread
From: Bart Van Assche @ 2025-10-29 15:58 UTC (permalink / raw)
To: Christoph Hellwig, Carlos Maiolino, Christian Brauner
Cc: Jan Kara, Martin K. Petersen, linux-kernel, linux-xfs,
linux-fsdevel, linux-raid, linux-block
On 10/29/25 12:15 AM, Christoph Hellwig wrote:
> we've had a long standing issue that direct I/O to and from devices that
> require stable writes can corrupt data because the user memory can be
> modified while in flight. This series tries to address this by falling
> back to uncached buffered I/O. Given that this requires an extra copy it
> is usually going to be a slow down, especially for very high bandwith
> use cases, so I'm not exactly happy about.
>
> I suspect we need a way to opt out of this for applications that know
> what they are doing, and I can think of a few ways to do that:
>
> 1a) Allow a mount option to override the behavior
>
> This allows the sysadmin to get back to the previous state.
> This is fairly easy to implement, but the scope might be to wide.
>
> 1b) Sysfs attribute
>
> Same as above. Slightly easier to modify, but a more unusual
> interface.
>
> 2) Have a per-inode attribute
>
> Allows to set it on a specific file. Would require an on-disk
> format change for the usual attr options.
>
> 3) Have a fcntl or similar to allow an application to override it
>
> Fine granularity. Requires application change. We might not
> allow any application to force this as it could be used to inject
> corruption.
>
> In other words, they are all kinda horrible.
Hi Christoph,
Has the opposite been considered: only fall back to buffered I/O for
buggy software that modifies direct I/O buffers before I/O has
completed?
Regarding selecting the direct I/O behavior for a process, how about
introducing a new prctl() flag and a new command-line utility that
follows the style of ionice and sets the flag before any code runs in
the started process?
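For illustration only, such a wrapper could look roughly like this;
PR_SET_DIO_NO_STABLE_FALLBACK and its value are entirely made up for
this sketch and do not exist in any kernel:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

/* hypothetical prctl command, value made up for this sketch */
#define PR_SET_DIO_NO_STABLE_FALLBACK	78

int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s command [args...]\n", argv[0]);
		return 1;
	}

	/* set the per-process opt-out before any code of the target runs */
	if (prctl(PR_SET_DIO_NO_STABLE_FALLBACK, 1, 0, 0, 0) < 0)
		perror("prctl");

	execvp(argv[1], &argv[1]);
	perror("execvp");
	return 1;
}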
Thanks,
Bart.
* Re: [PATCH 2/4] fs: return writeback errors for IOCB_DONTCACHE in generic_write_sync
2025-10-29 7:15 ` [PATCH 2/4] fs: return writeback errors for IOCB_DONTCACHE in generic_write_sync Christoph Hellwig
@ 2025-10-29 16:01 ` Darrick J. Wong
2025-10-29 16:37 ` Christoph Hellwig
0 siblings, 1 reply; 53+ messages in thread
From: Darrick J. Wong @ 2025-10-29 16:01 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Christian Brauner, Jan Kara, Martin K. Petersen,
linux-kernel, linux-xfs, linux-fsdevel, linux-raid, linux-block
On Wed, Oct 29, 2025 at 08:15:03AM +0100, Christoph Hellwig wrote:
> Currently generic_write_sync only kicks of writeback for IOCB_DONTCACHE
> writes, but never looks at the writeback errors. When using
> IOCB_DONTCACHE as a fallback for IOCB_DIRECT for devcies that require
devices
> stable writes, this breaks a few xfstests test cases that expect instant
> errors like removed devices to be directly propagated to the writer.
> While I don't know of real applications that would expect this, trying to
> keep the behavior as similar as possible sounds useful and can be
> trivially done by checking for and returning writeback errors in
> generic_write_sync.
Hum. So we kick writeback but don't wait for any of it to start, and
immediately sample wberr. Does that mean that in the "bdev died" case,
the newly initiated writeback will have failed so quickly that
file_check_and_advance_wb_err will see that? Or are we only reflecting
past write failures back to userspace on the *second* write after the
device dies?
It would be helpful to know which fstests break, btw.
--D
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> include/linux/fs.h | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 09b47effc55e..34a843cf4c1c 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -3047,9 +3047,13 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
> return ret;
> } else if (iocb->ki_flags & IOCB_DONTCACHE) {
> struct address_space *mapping = iocb->ki_filp->f_mapping;
> + int err;
>
> filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count,
> iocb->ki_pos - 1);
> + err = file_check_and_advance_wb_err(iocb->ki_filp);
> + if (err)
> + return err;
> }
>
> return count;
> --
> 2.47.3
>
>
* Re: [PATCH 1/4] fs: replace FOP_DIO_PARALLEL_WRITE with a fmode bits
2025-10-29 7:15 ` [PATCH 1/4] fs: replace FOP_DIO_PARALLEL_WRITE with a fmode bits Christoph Hellwig
@ 2025-10-29 16:01 ` Darrick J. Wong
2025-11-04 7:00 ` Nirjhar Roy (IBM)
2025-11-11 9:44 ` Christian Brauner
2 siblings, 0 replies; 53+ messages in thread
From: Darrick J. Wong @ 2025-10-29 16:01 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Christian Brauner, Jan Kara, Martin K. Petersen,
linux-kernel, linux-xfs, linux-fsdevel, linux-raid, linux-block
On Wed, Oct 29, 2025 at 08:15:02AM +0100, Christoph Hellwig wrote:
> To properly handle the direct to buffered I/O fallback for devices that
> require stable writes, we need to be able to set the DIO_PARALLEL_WRITE
> on a per-file basis and no statically for a given file_operations
> instance.
>
> This effectively reverts a part of 210a03c9d51a ("fs: claw back a few
> FMODE_* bits").
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Looks ok,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/ext4/file.c | 2 +-
> fs/xfs/xfs_file.c | 4 ++--
> include/linux/fs.h | 7 ++-----
> io_uring/io_uring.c | 2 +-
> 4 files changed, 6 insertions(+), 9 deletions(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 7a8b30932189..b484e98b9c78 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -924,6 +924,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
> filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
>
> filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
> + filp->f_mode |= FMODE_DIO_PARALLEL_WRITE;
> return dquot_file_open(inode, filp);
> }
>
> @@ -978,7 +979,6 @@ const struct file_operations ext4_file_operations = {
> .splice_write = iter_file_splice_write,
> .fallocate = ext4_fallocate,
> .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
> - FOP_DIO_PARALLEL_WRITE |
> FOP_DONTCACHE,
> };
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 2702fef2c90c..5703b6681b1d 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1553,6 +1553,7 @@ xfs_file_open(
> if (xfs_is_shutdown(XFS_M(inode->i_sb)))
> return -EIO;
> file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
> + file->f_mode |= FMODE_DIO_PARALLEL_WRITE;
> if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
> file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
> return generic_file_open(inode, file);
> @@ -1951,8 +1952,7 @@ const struct file_operations xfs_file_operations = {
> .fadvise = xfs_file_fadvise,
> .remap_file_range = xfs_file_remap_range,
> .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
> - FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
> - FOP_DONTCACHE,
> + FOP_BUFFER_WASYNC | FOP_DONTCACHE,
> };
>
> const struct file_operations xfs_dir_file_operations = {
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index c895146c1444..09b47effc55e 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -128,9 +128,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> #define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6))
> /* File supports atomic writes */
> #define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7))
> -
> -/* FMODE_* bit 8 */
> -
> +/* Supports non-exclusive O_DIRECT writes from multiple threads */
> +#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)(1 << 8))
> /* 32bit hashes as llseek() offset (for directories) */
> #define FMODE_32BITHASH ((__force fmode_t)(1 << 9))
> /* 64bit hashes as llseek() offset (for directories) */
> @@ -2317,8 +2316,6 @@ struct file_operations {
> #define FOP_BUFFER_WASYNC ((__force fop_flags_t)(1 << 1))
> /* Supports synchronous page faults for mappings */
> #define FOP_MMAP_SYNC ((__force fop_flags_t)(1 << 2))
> -/* Supports non-exclusive O_DIRECT writes from multiple threads */
> -#define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3))
> /* Contains huge pages */
> #define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4))
> /* Treat loff_t as unsigned (e.g., /dev/mem) */
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 296667ba712c..668937da27e8 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -469,7 +469,7 @@ static void io_prep_async_work(struct io_kiocb *req)
>
> /* don't serialize this request if the fs doesn't need it */
> if (should_hash && (req->file->f_flags & O_DIRECT) &&
> - (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE))
> + (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE))
> should_hash = false;
> if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
> io_wq_hash_work(&req->work, file_inode(req->file));
> --
> 2.47.3
>
>
* Re: fall back from direct to buffered I/O when stable writes are required
2025-10-29 15:58 ` fall back from direct to buffered " Bart Van Assche
@ 2025-10-29 16:14 ` Darrick J. Wong
2025-10-29 16:33 ` Christoph Hellwig
1 sibling, 0 replies; 53+ messages in thread
From: Darrick J. Wong @ 2025-10-29 16:14 UTC (permalink / raw)
To: Bart Van Assche
Cc: Christoph Hellwig, Carlos Maiolino, Christian Brauner, Jan Kara,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block
On Wed, Oct 29, 2025 at 08:58:52AM -0700, Bart Van Assche wrote:
> On 10/29/25 12:15 AM, Christoph Hellwig wrote:
> > we've had a long standing issue that direct I/O to and from devices that
> > require stable writes can corrupt data because the user memory can be
> > modified while in flight. This series tries to address this by falling
> > back to uncached buffered I/O. Given that this requires an extra copy it
> > is usually going to be a slow down, especially for very high bandwith
> > use cases, so I'm not exactly happy about.
> >
> > I suspect we need a way to opt out of this for applications that know
> > what they are doing, and I can think of a few ways to do that:
> >
> > 1a) Allow a mount option to override the behavior
> >
> > This allows the sysadmin to get back to the previous state.
> > This is fairly easy to implement, but the scope might be to wide.
/me dislikes mount options because getting rid of them is hard.
> > 1b) Sysfs attribute
> >
> > Same as above. Slightly easier to modify, but a more unusual
> > interface.
> >
> > 2) Have a per-inode attribute
> >
> > Allows to set it on a specific file. Would require an on-disk
> > format change for the usual attr options.
> >
> > 3) Have a fcntl or similar to allow an application to override it
> >
> > Fine granularity. Requires application change. We might not
> > allow any application to force this as it could be used to inject
> > corruption.
> >
> > In other words, they are all kinda horrible.
Yeah, I don't like the choices either. Bart's prctl sounds the least
annoying but even then I still don't like "I KNOW WHAT I'M DOING!!"
flags.
> Hi Christoph,
>
> Has the opposite been considered: only fall back to buffered I/O for buggy
> software that modifies direct I/O buffers before I/O has
> completed?
How would xfs detect that? For all we know the dio buffer is actually a
piece of device memory or something, and some hardware changed the
memory without the kernel knowing that. Later on the raid scrub fails a
parity check and it's far too late to do anything about it.
--D
> Regarding selecting the direct I/O behavior for a process, how about
> introducing a new prctl() flag and introducing a new command-line
> utility that follows the style of ionice and sets the new flag before
> any code runs in the started process?
>
> Thanks,
>
> Bart.
>
* Re: fall back from direct to buffered I/O when stable writes are required
2025-10-29 15:58 ` fall back from direct to buffered " Bart Van Assche
2025-10-29 16:14 ` Darrick J. Wong
@ 2025-10-29 16:33 ` Christoph Hellwig
1 sibling, 0 replies; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-29 16:33 UTC (permalink / raw)
To: Bart Van Assche
Cc: Christoph Hellwig, Carlos Maiolino, Christian Brauner, Jan Kara,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block
On Wed, Oct 29, 2025 at 08:58:52AM -0700, Bart Van Assche wrote:
> Has the opposite been considered: only fall back to buffered I/O for buggy
> software that modifies direct I/O buffers before I/O has
> completed?
Given that we never claimed that you can't modify the buffer I would
not call it buggy, even if the behavior is unfortunate. Also with
file systems and block drivers driving work off I/O errors or checksum
failures (RAID rebuild, or Darrick's xfs healthmon / healer work recently
reposted), applications could also be malicious and cause unintended
actions. Note that this is an issue with all non-privileged ways to
signal this.
> Regarding selecting the direct I/O behavior for a process, how about
> introducing a new prctl() flag and introducing a new command-line
> utility that follows the style of ionice and sets the new flag before
> any code runs in the started process?
I suspect most of this code isn't run from the command line, but modulo
all the other concerns about unprivileged ways to opt out of the bounce
buffering this does sound like a reasonable idea.
* Re: [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required
2025-10-29 15:53 ` Darrick J. Wong
@ 2025-10-29 16:35 ` Christoph Hellwig
2025-10-29 21:23 ` Qu Wenruo
0 siblings, 1 reply; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-29 16:35 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Christian Brauner, Jan Kara,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block, Qu Wenruo, linux-btrfs
[Adding Qu and the btrfs list]
On Wed, Oct 29, 2025 at 08:53:06AM -0700, Darrick J. Wong wrote:
> > + if (mapping_stable_writes(iocb->ki_filp->f_mapping)) {
> > + xfs_info_once(mp,
> > + "falling back from direct to buffered I/O for write");
> > + return -ENOTBLK;
> > + }
>
> /me wonders if the other filesystems will have to implement this same
> fallback and hence this should be a common helper ala
> dio_warn_stale_pagecache? But we'll get there when we get there.
As far as I'm concerned they should. Btrfs in fact has recently done
that, as they are even more exposed due to the integrated checksumming.
So yes, a common helper might make sense. Especially if we want common
configuration for opt-outs eventually.
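Roughly something like the following, as a sketch only (the name,
location and message are made up here, not what this series implements):

/*
 * Sketch of a possible shared helper: decide whether a direct I/O
 * request has to fall back to (uncached) buffered I/O because the
 * backing device requires stable writes.  Returning -ENOTBLK lets the
 * existing per-filesystem fallback paths take over.
 */
static inline int dio_stable_writes_fallback(struct kiocb *iocb)
{
	if (!mapping_stable_writes(iocb->ki_filp->f_mapping))
		return 0;

	pr_info_once("falling back from direct to buffered I/O: backing device requires stable writes\n");
	return -ENOTBLK;
}

Callers would invoke it before taking any direct I/O specific locks and
treat a -ENOTBLK return as the signal to go down the buffered path.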
> > file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
> > - file->f_mode |= FMODE_DIO_PARALLEL_WRITE;
> > - if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
> > - file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
> > + if (!mapping_stable_writes(file->f_mapping)) {
> > + file->f_mode |= FMODE_DIO_PARALLEL_WRITE;
>
> Hrm. So parallel directio writes are disabled for writes to files on
> stable_pages devices because we have to fall back to buffered writes.
> Those serialize on i_rwsem so that's why we don't set
> FMODE_DIO_PARALLEL_WRITE, correct?
Yes.
> There's not some more subtle reason
> for not supporting it, right?
Not that I know of anyway.
* Re: [PATCH 2/4] fs: return writeback errors for IOCB_DONTCACHE in generic_write_sync
2025-10-29 16:01 ` Darrick J. Wong
@ 2025-10-29 16:37 ` Christoph Hellwig
2025-10-29 18:12 ` Darrick J. Wong
2025-11-04 12:04 ` Nirjhar Roy (IBM)
0 siblings, 2 replies; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-29 16:37 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Christian Brauner, Jan Kara,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block
On Wed, Oct 29, 2025 at 09:01:01AM -0700, Darrick J. Wong wrote:
> Hum. So we kick writeback but don't wait for any of it to start, and
> immediately sample wberr. Does that mean that in the "bdev died" case,
> the newly initiated writeback will have failed so quickly that
> file_check_and_advance_wb_err will see that?
Yes, this is primarily about catching errors in the submission path
before it reaches the device, which are returned synchronously.
> Or are we only reflecting
> past write failures back to userspace on the *second* write after the
> device dies?
>
> It would be helpful to know which fstests break, btw.
generic/252 generic/329 xfs/237
* Re: [PATCH 2/4] fs: return writeback errors for IOCB_DONTCACHE in generic_write_sync
2025-10-29 16:37 ` Christoph Hellwig
@ 2025-10-29 18:12 ` Darrick J. Wong
2025-10-30 5:59 ` Christoph Hellwig
2025-11-04 12:04 ` Nirjhar Roy (IBM)
1 sibling, 1 reply; 53+ messages in thread
From: Darrick J. Wong @ 2025-10-29 18:12 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Christian Brauner, Jan Kara, Martin K. Petersen,
linux-kernel, linux-xfs, linux-fsdevel, linux-raid, linux-block
On Wed, Oct 29, 2025 at 05:37:08PM +0100, Christoph Hellwig wrote:
> On Wed, Oct 29, 2025 at 09:01:01AM -0700, Darrick J. Wong wrote:
> > Hum. So we kick writeback but don't wait for any of it to start, and
> > immediately sample wberr. Does that mean that in the "bdev died" case,
> > the newly initiated writeback will have failed so quickly that
> > file_check_and_advance_wb_err will see that?
>
> Yes, this is primarily about catching errors in the submission path
> before it reaches the device, which are returned synchronously.
Ah, ok.
> > Or are we only reflecting
> > past write failures back to userspace on the *second* write after the
> > device dies?
> >
> > It would be helpful to know which fstests break, btw.
>
> generic/252 generic/329 xfs/237
Would you mind putting that in the commit message as a breadcrumb for
anyone who comes looking later?
--D
* Re: [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required
2025-10-29 16:35 ` Christoph Hellwig
@ 2025-10-29 21:23 ` Qu Wenruo
2025-10-30 5:58 ` Christoph Hellwig
0 siblings, 1 reply; 53+ messages in thread
From: Qu Wenruo @ 2025-10-29 21:23 UTC (permalink / raw)
To: Christoph Hellwig, Darrick J. Wong
Cc: Carlos Maiolino, Christian Brauner, Jan Kara, Martin K. Petersen,
linux-kernel, linux-xfs, linux-fsdevel, linux-raid, linux-block,
linux-btrfs
On 2025/10/30 03:05, Christoph Hellwig wrote:
> [Adding Qu and the btrfs list]
>
> On Wed, Oct 29, 2025 at 08:53:06AM -0700, Darrick J. Wong wrote:
>>> + if (mapping_stable_writes(iocb->ki_filp->f_mapping)) {
>>> + xfs_info_once(mp,
>>> + "falling back from direct to buffered I/O for write");
>>> + return -ENOTBLK;
>>> + }
>>
>> /me wonders if the other filesystems will have to implement this same
>> fallback and hence this should be a common helper ala
>> dio_warn_stale_pagecache? But we'll get there when we get there.
>
> As far as I'm concerned they should. Btrfs in fact has recently done
> that, as they are even more exposed due to the integrated checksumming.
>
> So yes, a common helper might make sense. Especially if we want common
> configuration for opt-outs eventually.
Yep, a common helper will help, or even integrate the check into
__iomap_dio_rw().
Although btrfs currently uses some btrfs-specific flags to do the check,
we also set the stable-writes flag on the address space, so we can
share the same check.
However I'm not sure a warning will be that useful.
If the warning is only output once, like here, it doesn't show the inode
number to tell which file is affected.
If it is shown every time, it will flood dmesg.
It would be much more straightforward if there were some flag allowing
us to return an error directly when true zero-copy direct I/O cannot be
executed.
Thanks,
Qu
>
>>> file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
>>> - file->f_mode |= FMODE_DIO_PARALLEL_WRITE;
>>> - if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
>>> - file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
>>> + if (!mapping_stable_writes(file->f_mapping)) {
>>> + file->f_mode |= FMODE_DIO_PARALLEL_WRITE;
>>
>> Hrm. So parallel directio writes are disabled for writes to files on
>> stable_pages devices because we have to fall back to buffered writes.
>> Those serialize on i_rwsem so that's why we don't set
>> FMODE_DIO_PARALLEL_WRITE, correct?
>
> Yes.
>
>> There's not some more subtle reason
>> for not supporting it, right?
>
> Not that I know of anyway.
>
* Re: [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required
2025-10-29 21:23 ` Qu Wenruo
@ 2025-10-30 5:58 ` Christoph Hellwig
2025-10-30 6:37 ` Qu Wenruo
0 siblings, 1 reply; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-30 5:58 UTC (permalink / raw)
To: Qu Wenruo
Cc: Christoph Hellwig, Darrick J. Wong, Carlos Maiolino,
Christian Brauner, Jan Kara, Martin K. Petersen, linux-kernel,
linux-xfs, linux-fsdevel, linux-raid, linux-block, linux-btrfs
On Thu, Oct 30, 2025 at 07:53:30AM +1030, Qu Wenruo wrote:
> Yep, a common helper will help, or even integrate the check into
> __iomap_dio_rw().
Having the check in __iomap_dio_rw would be a last resort, because at
that point we've already done the direct I/O specific locking that we'd
need to unwind from, making the fallback slower than it has to be.
> However I'm not sure if a warning will be that useful.
>
> If the warning is only outputted once like here, it doesn't show the ino
> number to tell which file is affected.
> If the warning is shown every time, it will flood the dmesg.
While the flag is set on the address_space, it is global (or semi-global
for separate storage pools like the XFS RT device), so the inode number
doesn't really matter too much.
> It will be much straightforward if there is some flag allowing us to return
> error directly if true zero-copy direct IO can not be executed.
I don't really understand this part.
* Re: [PATCH 2/4] fs: return writeback errors for IOCB_DONTCACHE in generic_write_sync
2025-10-29 18:12 ` Darrick J. Wong
@ 2025-10-30 5:59 ` Christoph Hellwig
0 siblings, 0 replies; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-30 5:59 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Christian Brauner, Jan Kara,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block
On Wed, Oct 29, 2025 at 11:12:13AM -0700, Darrick J. Wong wrote:
> Would you mind putting that in the commit message as a breadcrumb for
> anyone who comes looking later?
Sure, I'll add it.
* Re: [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required
2025-10-30 5:58 ` Christoph Hellwig
@ 2025-10-30 6:37 ` Qu Wenruo
2025-10-30 6:49 ` Christoph Hellwig
0 siblings, 1 reply; 53+ messages in thread
From: Qu Wenruo @ 2025-10-30 6:37 UTC (permalink / raw)
To: Christoph Hellwig, Qu Wenruo
Cc: Darrick J. Wong, Carlos Maiolino, Christian Brauner, Jan Kara,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block, linux-btrfs
On 2025/10/30 16:28, Christoph Hellwig wrote:
[...]
>> It will be much straightforward if there is some flag allowing us to return
>> error directly if true zero-copy direct IO can not be executed.
>
> I don't really understand this part.
I mean some open flag like O_DIRECT_NO_FALLBACK, so that we can directly
return -ENOTBLK without falling back to buffered I/O (and without
needing to bother with the fallback warning).
This would provide true zero-copy for the programs that really require
zero-copy.
And we won't need to bother with falling back to buffered I/O in the
kernel; it will be something for user space to deal with.
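As a rough sketch of what I mean (O_DIRECT_NO_FALLBACK is purely
hypothetical and does not exist; it is defined as 0 here only so the
sketch compiles):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>

/* hypothetical flag, for illustration only */
#define O_DIRECT_NO_FALLBACK	0

static int open_for_io(const char *path, int need_zero_copy)
{
	int fd = open(path, O_RDWR | O_DIRECT | O_DIRECT_NO_FALLBACK);

	if (fd >= 0 || errno != ENOTBLK)
		return fd;
	if (need_zero_copy)
		return -1;	/* caller insists on zero-copy, give up */

	/* otherwise the fallback is a user space decision: go buffered */
	return open(path, O_RDWR);
}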
Thanks,
Qu
* Re: [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required
2025-10-30 6:37 ` Qu Wenruo
@ 2025-10-30 6:49 ` Christoph Hellwig
2025-10-30 6:53 ` Qu Wenruo
0 siblings, 1 reply; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-30 6:49 UTC (permalink / raw)
To: Qu Wenruo
Cc: Christoph Hellwig, Qu Wenruo, Darrick J. Wong, Carlos Maiolino,
Christian Brauner, Jan Kara, Martin K. Petersen, linux-kernel,
linux-xfs, linux-fsdevel, linux-raid, linux-block, linux-btrfs
On Thu, Oct 30, 2025 at 05:07:44PM +1030, Qu Wenruo wrote:
> I mean some open flag like O_DIRECT_NO_FALLBACK, then we can directly
> reutrn -ENOBLK without falling back to buffered IO (and no need to bother
> the warning of falling back).
>
> This will provide the most accurate, true zero-copy for those programs that
> really require zero-copy.
>
> And we won't need to bother falling back to buffered IO, it will be
> something for the user space to bother.
So what is your application going to do if the open fails?
>
> Thanks,
> Qu
---end quoted text---
* Re: [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required
2025-10-30 6:49 ` Christoph Hellwig
@ 2025-10-30 6:53 ` Qu Wenruo
2025-10-30 6:55 ` Christoph Hellwig
0 siblings, 1 reply; 53+ messages in thread
From: Qu Wenruo @ 2025-10-30 6:53 UTC (permalink / raw)
To: Christoph Hellwig, Qu Wenruo
Cc: Darrick J. Wong, Carlos Maiolino, Christian Brauner, Jan Kara,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block, linux-btrfs
On 2025/10/30 17:19, Christoph Hellwig wrote:
> On Thu, Oct 30, 2025 at 05:07:44PM +1030, Qu Wenruo wrote:
>> I mean some open flag like O_DIRECT_NO_FALLBACK, then we can directly
>> reutrn -ENOBLK without falling back to buffered IO (and no need to bother
>> the warning of falling back).
>>
>> This will provide the most accurate, true zero-copy for those programs that
>> really require zero-copy.
>>
>> And we won't need to bother falling back to buffered IO, it will be
>> something for the user space to bother.
>
> So what is your application going to do if the open fails?
If it cannot accept the buffered fallback, error out.
If it can, do a regular open without direct I/O flags, and maybe even
open a bug report against the project, questioning whether it really
needs direct I/O in the first place.
Thanks,
Qu
>
>>
>> Thanks,
>> Qu
> ---end quoted text---
* Re: [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required
2025-10-30 6:53 ` Qu Wenruo
@ 2025-10-30 6:55 ` Christoph Hellwig
2025-10-30 7:14 ` Qu Wenruo
0 siblings, 1 reply; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-30 6:55 UTC (permalink / raw)
To: Qu Wenruo
Cc: Christoph Hellwig, Qu Wenruo, Darrick J. Wong, Carlos Maiolino,
Christian Brauner, Jan Kara, Martin K. Petersen, linux-kernel,
linux-xfs, linux-fsdevel, linux-raid, linux-block, linux-btrfs
On Thu, Oct 30, 2025 at 05:23:32PM +1030, Qu Wenruo wrote:
>> So what is your application going to do if the open fails?
>
> If it can not accept buffered fallback, error out.
Why would it not be able to accept that?
* Re: [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required
2025-10-30 6:55 ` Christoph Hellwig
@ 2025-10-30 7:14 ` Qu Wenruo
2025-10-30 7:17 ` Christoph Hellwig
0 siblings, 1 reply; 53+ messages in thread
From: Qu Wenruo @ 2025-10-30 7:14 UTC (permalink / raw)
To: Christoph Hellwig, Qu Wenruo
Cc: Darrick J. Wong, Carlos Maiolino, Christian Brauner, Jan Kara,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block, linux-btrfs
On 2025/10/30 17:25, Christoph Hellwig wrote:
> On Thu, Oct 30, 2025 at 05:23:32PM +1030, Qu Wenruo wrote:
>>> So what is your application going to do if the open fails?
>>
>> If it can not accept buffered fallback, error out.
>
> Why would it not be able to accept that?
>
Because of whatever reason it has, although the only one I can come up
with is performance.
I thought the old kernel principle is to provide mechanism, not policy.
But the fallback to buffered I/O looks more like a policy, and if that's
the case user space would be the more suitable place for it.
Thanks,
Qu
* Re: [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required
2025-10-30 7:14 ` Qu Wenruo
@ 2025-10-30 7:17 ` Christoph Hellwig
0 siblings, 0 replies; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-30 7:17 UTC (permalink / raw)
To: Qu Wenruo
Cc: Christoph Hellwig, Darrick J. Wong, Carlos Maiolino,
Christian Brauner, Jan Kara, Martin K. Petersen, linux-kernel,
linux-xfs, linux-fsdevel, linux-raid, linux-block, linux-btrfs
On Thu, Oct 30, 2025 at 05:44:22PM +1030, Qu Wenruo wrote:
> Because for whatever reasons, although the only reason I can come up with
> is performance.
>
> I thought the old kernel principle is, providing the mechanism not the
> policy.
> But the fallback-to-buffered looks more like a policy, and if that's the
> case user space should be more suitable.
I don't think so. O_DIRECT really is a hint. We already do fallbacks
for various reasons (for XFS e.g. unaligned writes on COW files), and
btrfs in fact falls back for any alignment mismatch already. And there's
really nothing an application can do when the most optimal way is not
available except for using a less optimal one. So there's really no
value add for an option to fail.
* Re: fall back from direct to buffered I/O when stable writes are required
2025-10-29 7:15 fall back from direct to buffered I/O when stable writes are required Christoph Hellwig
` (4 preceding siblings ...)
2025-10-29 15:58 ` fall back from direct to buffered " Bart Van Assche
@ 2025-10-30 11:20 ` Dave Chinner
2025-10-30 12:00 ` Geoff Back
2025-10-30 14:33 ` Christoph Hellwig
5 siblings, 2 replies; 53+ messages in thread
From: Dave Chinner @ 2025-10-30 11:20 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Christian Brauner, Jan Kara, Martin K. Petersen,
linux-kernel, linux-xfs, linux-fsdevel, linux-raid, linux-block
On Wed, Oct 29, 2025 at 08:15:01AM +0100, Christoph Hellwig wrote:
> Hi all,
>
> we've had a long standing issue that direct I/O to and from devices that
> require stable writes can corrupt data because the user memory can be
> modified while in flight. This series tries to address this by falling
> back to uncached buffered I/O. Given that this requires an extra copy it
> is usually going to be a slow down, especially for very high bandwith
> use cases, so I'm not exactly happy about.
How many applications actually have this problem? I've not heard of
anyone encountering such RAID corruption problems on production
XFS filesystems -ever-, so it cannot be a common thing.
So, what applications are actually tripping over this, and why can't
these rare instances be fixed instead of penalising the vast
majority of users who -don't have a problem to begin with-?
> I suspect we need a way to opt out of this for applications that know
> what they are doing, and I can think of a few ways to do that:
....
> In other words, they are all kinda horrible.
Forcing a performance regression on users, then telling them "you
need to work around the performance regression" is a pretty horrible
thing to do in the first place. Given that none of the workarounds
are any better, perhaps this approach should be discarded and some
other way of addressing the problem be considered?
How about we do it the other way around? If the application is known
to corrupt stable page based block devices, then perhaps they should
be setting a "DIO is not supported" option somewhere. None of them
are pretty, but instead of affecting the whole world, it only
affects the rare applications that trigger this DIO issue.
That seems like a much better way to deal with the issue to me;
most users are completely unaffected, and never have to worry about
(or even know about) this workaround for a very specific type of
weird application behaviour...
-Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-10-30 11:20 ` Dave Chinner
@ 2025-10-30 12:00 ` Geoff Back
2025-10-30 12:54 ` Jan Kara
` (2 more replies)
2025-10-30 14:33 ` Christoph Hellwig
1 sibling, 3 replies; 53+ messages in thread
From: Geoff Back @ 2025-10-30 12:00 UTC (permalink / raw)
To: Dave Chinner, Christoph Hellwig
Cc: Carlos Maiolino, Christian Brauner, Jan Kara, Martin K. Petersen,
linux-kernel, linux-xfs, linux-fsdevel, linux-raid, linux-block
On 30/10/2025 11:20, Dave Chinner wrote:
> On Wed, Oct 29, 2025 at 08:15:01AM +0100, Christoph Hellwig wrote:
>> Hi all,
>>
>> we've had a long standing issue that direct I/O to and from devices that
>> require stable writes can corrupt data because the user memory can be
>> modified while in flight. This series tries to address this by falling
>> back to uncached buffered I/O. Given that this requires an extra copy it
>> is usually going to be a slow down, especially for very high bandwith
>> use cases, so I'm not exactly happy about.
> How many applications actually have this problem? I've not heard of
> anyone encoutnering such RAID corruption problems on production
> XFS filesystems -ever-, so it cannot be a common thing.
>
> So, what applications are actually tripping over this, and why can't
> these rare instances be fixed instead of penalising the vast
> majority of users who -don't have a problem to begin with-?
I don't claim to have deep knowledge of what's going on here, but if I
understand correctly the problem occurs only if the process submitting
the direct I/O is breaking the semantic "contract" by modifying the page
after submitting the I/O but before it completes. Since the page
referenced by the I/O is supposed to be immutable until the I/O
completes, what about marking the page read only at time of submission
and restoring the original page permissions after the I/O completes?
Then if the process writes to the page (triggering a fault) make a copy
of the page that can be mapped back as writeable for the process - i.e.
normal copy-on-write behaviour - and write a once-per-process dmesg
warning that the process broke the direct I/O "contract". And maybe tag
the process with a flag that forces all future "direct I/O" requests
made by that process to be automatically made buffered?
That way, processes that behave correctly still get direct I/O, and
those that do break the rules get degraded to buffered I/O.
Unfortunately I don't know enough to know what the performance impact of
changing the page permissions for every direct I/O would be.
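As a rough userspace illustration of the detection half of this idea (all
names here are made up, only the warn-and-unprotect step is shown rather
than a full copy-on-write, and mprotect() from a signal handler is fine
for a demo but obviously not what the kernel would do), something like
the following catches a write to a buffer that has been frozen for the
duration of an "in-flight" I/O:

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static char *io_buf;
static size_t io_len;
static volatile sig_atomic_t contract_broken;

static void segv_handler(int sig, siginfo_t *si, void *uctx)
{
	char *addr = si->si_addr;

	(void)sig; (void)uctx;
	if (addr >= io_buf && addr < io_buf + io_len) {
		/*
		 * A real implementation would COW the page and tag the
		 * task; here we just note the violation and unprotect.
		 */
		contract_broken = 1;
		mprotect(io_buf, io_len, PROT_READ | PROT_WRITE);
		return;
	}
	_exit(1);	/* unrelated fault */
}

int main(void)
{
	struct sigaction sa = { 0 };
	long pagesz = sysconf(_SC_PAGESIZE);

	io_len = pagesz;
	io_buf = mmap(NULL, io_len, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(io_buf, 'A', io_len);

	sa.sa_sigaction = segv_handler;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGSEGV, &sa, NULL);

	/* "Submit" the I/O: freeze the buffer until completion. */
	mprotect(io_buf, io_len, PROT_READ);

	/* The application breaks the rules and scribbles on the buffer. */
	io_buf[0] = 'B';

	/* "Complete" the I/O. */
	printf("buffer modified while I/O was in flight: %s\n",
	       contract_broken ? "yes" : "no");
	return 0;
}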
>
>> I suspect we need a way to opt out of this for applications that know
>> what they are doing, and I can think of a few ways to do that:
> ....
>
>> In other words, they are all kinda horrible.
> Forcing a performance regression on users, then telling them "you
> need to work around the performance regression" is a pretty horrible
> thing to do in the first place. Given that none of the workarounds
> are any better, perhaps this approach should be discarded and some
> other way of addressin the problem be considered?
>
> How about we do it the other way around? If the application is known
> to corrupt stable page based block devices, then perhaps they should
> be setting a "DIO is not supported" option somewhere. None of them
> are pretty, but instead of affecting the whole world, it only
> affects the rare applications that trigger this DIO issue.
>
> That seems like a much better way to deal with the issue to me;
> most users are completely unaffected, and never have to worry about
> (or even know about) this workaround for a very specific type of
> weird application behaviour...
Yes, I completely agree that we should not be penalising processes that
obey the direct I/O rules for the benefit of those that do not.
>
> -Dave.
>
Regards,
Geoff.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-10-30 12:00 ` Geoff Back
@ 2025-10-30 12:54 ` Jan Kara
2025-10-30 14:35 ` Christoph Hellwig
2025-10-30 22:02 ` Dave Chinner
2 siblings, 0 replies; 53+ messages in thread
From: Jan Kara @ 2025-10-30 12:54 UTC (permalink / raw)
To: Geoff Back
Cc: Dave Chinner, Christoph Hellwig, Carlos Maiolino,
Christian Brauner, Jan Kara, Martin K. Petersen, linux-kernel,
linux-xfs, linux-fsdevel, linux-raid, linux-block
On Thu 30-10-25 12:00:26, Geoff Back wrote:
> On 30/10/2025 11:20, Dave Chinner wrote:
> > On Wed, Oct 29, 2025 at 08:15:01AM +0100, Christoph Hellwig wrote:
> >> Hi all,
> >>
> >> we've had a long standing issue that direct I/O to and from devices that
> >> require stable writes can corrupt data because the user memory can be
> >> modified while in flight. This series tries to address this by falling
> >> back to uncached buffered I/O. Given that this requires an extra copy it
> >> is usually going to be a slow down, especially for very high bandwith
> >> use cases, so I'm not exactly happy about.
> > How many applications actually have this problem? I've not heard of
> > anyone encoutnering such RAID corruption problems on production
> > XFS filesystems -ever-, so it cannot be a common thing.
> >
> > So, what applications are actually tripping over this, and why can't
> > these rare instances be fixed instead of penalising the vast
> > majority of users who -don't have a problem to begin with-?
> I don't claim to have deep knowledge of what's going on here, but if I
> understand correctly the problem occurs only if the process submitting
> the direct I/O is breaking the semantic "contract" by modifying the page
> after submitting the I/O but before it completes. Since the page
> referenced by the I/O is supposed to be immutable until the I/O
> completes, what about marking the page read only at time of submission
> and restoring the original page permissions after the I/O completes?
> Then if the process writes to the page (triggering a fault) make a copy
> of the page that can be mapped back as writeable for the process - i.e.
> normal copy-on-write behaviour - and write a once-per-process dmesg
> warning that the process broke the direct I/O "contract". And maybe tag
> the process with a flag that forces all future "direct I/O" requests
> made by that process to be automatically made buffered?
>
> That way, processes that behave correctly still get direct I/O, and
> those that do break the rules get degraded to buffered I/O.
>
> Unfortunately I don't know enough to know what the performance impact of
> changing the page permissions for every direct I/O would be.
That is a fine idea and we've considered that. The trouble is this gets
quite complex because buffers may be modified not only through the
application directly writing to the buffer while the IO is in flight but
also by setting up another IO to the same buffer. As soon as you let the
first IO use the buffer, the kernel would need to block all the other IOs
to the same buffer, and it would have to do all of this without giving
malicious apps a way to deadlock the kernel by cleverly chaining IOs &
buffers.
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-10-30 11:20 ` Dave Chinner
2025-10-30 12:00 ` Geoff Back
@ 2025-10-30 14:33 ` Christoph Hellwig
2025-10-30 23:18 ` Dave Chinner
1 sibling, 1 reply; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-30 14:33 UTC (permalink / raw)
To: Dave Chinner
Cc: Christoph Hellwig, Carlos Maiolino, Christian Brauner, Jan Kara,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block
On Thu, Oct 30, 2025 at 10:20:02PM +1100, Dave Chinner wrote:
> > use cases, so I'm not exactly happy about.
>
> How many applications actually have this problem? I've not heard of
> anyone encoutnering such RAID corruption problems on production
> XFS filesystems -ever-, so it cannot be a common thing.
The most common application to hit this is probably the most common
user of O_DIRECT: qemu. Look up the btrfs errors with PI caused by
the interaction with checksumming. Btrfs finally fixed this a short
while ago, and there are reports for other applications as well.
For RAID you probably won't see too many reports, as with RAID the
problem will only show up as silent corruption long after a rebuild
happened that made use of the racy data. With checksums
it is much easier to reproduce and trivially shown by various xfstests.
With increasing storage capacities checksums are becoming more and
more important, and I'm trying to get Linux in general and XFS
specifically to use them well. Right now I don't think anyone is
using PI with XFS or any Linux file system given the amount of work
I had to put in to make it work well, and how often I see regressions
with it.
> Forcing a performance regression on users, then telling them "you
> need to work around the performance regression" is a pretty horrible
> thing to do in the first place.
I disagree. Not corrupting user data for applications that use the
interface correctly per all documentation is a top priority.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-10-30 12:00 ` Geoff Back
2025-10-30 12:54 ` Jan Kara
@ 2025-10-30 14:35 ` Christoph Hellwig
2025-10-30 22:02 ` Dave Chinner
2 siblings, 0 replies; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-30 14:35 UTC (permalink / raw)
To: Geoff Back
Cc: Dave Chinner, Christoph Hellwig, Carlos Maiolino,
Christian Brauner, Jan Kara, Martin K. Petersen, linux-kernel,
linux-xfs, linux-fsdevel, linux-raid, linux-block
On Thu, Oct 30, 2025 at 12:00:26PM +0000, Geoff Back wrote:
> I don't claim to have deep knowledge of what's going on here, but if I
> understand correctly the problem occurs only if the process submitting
> the direct I/O is breaking the semantic "contract" by modifying the page
> after submitting the I/O but before it completes.
Except that there never has been any such "contract", written or
unwritten. Modifying in-flight I/O is perfectly fine IFF the data
is sampled once as in the usual non-checksum non-RAID mode, and nothing
ever told applications not to do it.
> Since the page
> referenced by the I/O is supposed to be immutable until the I/O
> completes, what about marking the page read only at time of submission
> and restoring the original page permissions after the I/O completes?
> Then if the process writes to the page (triggering a fault) make a copy
> of the page that can be mapped back as writeable for the process - i.e.
> normal copy-on-write behaviour - and write a once-per-process dmesg
> warning that the process broke the direct I/O "contract".
That would be nice, but also pretty hard. See various previous
discussions on this topic. Also, at least for small I/O it probably
is more expensive than bounce buffering, while for large enough I/O,
especially using PMD mappings or similar, avoiding the copy will be
very beneficial.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-10-30 12:00 ` Geoff Back
2025-10-30 12:54 ` Jan Kara
2025-10-30 14:35 ` Christoph Hellwig
@ 2025-10-30 22:02 ` Dave Chinner
2 siblings, 0 replies; 53+ messages in thread
From: Dave Chinner @ 2025-10-30 22:02 UTC (permalink / raw)
To: Geoff Back
Cc: Christoph Hellwig, Carlos Maiolino, Christian Brauner, Jan Kara,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block
On Thu, Oct 30, 2025 at 12:00:26PM +0000, Geoff Back wrote:
> On 30/10/2025 11:20, Dave Chinner wrote:
> > On Wed, Oct 29, 2025 at 08:15:01AM +0100, Christoph Hellwig wrote:
> >> Hi all,
> >>
> >> we've had a long standing issue that direct I/O to and from devices that
> >> require stable writes can corrupt data because the user memory can be
> >> modified while in flight. This series tries to address this by falling
> >> back to uncached buffered I/O. Given that this requires an extra copy it
> >> is usually going to be a slow down, especially for very high bandwith
> >> use cases, so I'm not exactly happy about.
> > How many applications actually have this problem? I've not heard of
> > anyone encoutnering such RAID corruption problems on production
> > XFS filesystems -ever-, so it cannot be a common thing.
> >
> > So, what applications are actually tripping over this, and why can't
> > these rare instances be fixed instead of penalising the vast
> > majority of users who -don't have a problem to begin with-?
> I don't claim to have deep knowledge of what's going on here, but if I
> understand correctly the problem occurs only if the process submitting
> the direct I/O is breaking the semantic "contract" by modifying the page
> after submitting the I/O but before it completes. Since the page
> referenced by the I/O is supposed to be immutable until the I/O
> completes, what about marking the page read only at time of submission
> and restoring the original page permissions after the I/O completes?
> Then if the process writes to the page (triggering a fault) make a copy
> of the page that can be mapped back as writeable for the process - i.e.
> normal copy-on-write behaviour
There's nothing new in this world - this is pretty much how the IO
paths in Irix worked back in the mid 1990s. The transparent
zero-copy buffered read and zero-copy network send paths that this
enabled were one of the reasons why Irix was always at the top of the
IO performance charts, even though the CPUs were underpowered
compared to the competition...
> - and write a once-per-process dmesg
> warning that the process broke the direct I/O "contract".
Yes, there was occasionally an application that tried to re-use
buffers before the kernel was finished with them and triggered the
COW path. However, these were easily identified and generally fixed
pretty quickly by the application vendors because performance was
the very reason they were deploying IO intensive applications on
SGI/Irix platforms in the first place....
> And maybe tag
> the process with a flag that forces all future "direct I/O" requests
> made by that process to be automatically made buffered?
>
> That way, processes that behave correctly still get direct I/O, and
> those that do break the rules get degraded to buffered I/O.
Why? The cost of the COW for the user page is the same as copying
the data in the first place. However, if COW faults are rare, then
allowing DIO to continue will result in better performance overall.
The other side of this is that falling back to buffered IO for AIO
is an -awful thing to do-. You no longer get AIO behaviour - reads
will block on IO, and writes will block on reads and other writes...
> Unfortunately I don't know enough to know what the performance impact of
> changing the page permissions for every direct I/O would be.
On high performance storage, it will almost certainly be less of an
impact than forcing all IO to be bounce buffered through the page
cache.
-Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-10-30 14:33 ` Christoph Hellwig
@ 2025-10-30 23:18 ` Dave Chinner
2025-10-31 13:00 ` Christoph Hellwig
0 siblings, 1 reply; 53+ messages in thread
From: Dave Chinner @ 2025-10-30 23:18 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Christian Brauner, Jan Kara, Martin K. Petersen,
linux-kernel, linux-xfs, linux-fsdevel, linux-raid, linux-block
On Thu, Oct 30, 2025 at 03:33:24PM +0100, Christoph Hellwig wrote:
> On Thu, Oct 30, 2025 at 10:20:02PM +1100, Dave Chinner wrote:
> > > use cases, so I'm not exactly happy about.
> >
> > How many applications actually have this problem? I've not heard of
> > anyone encoutnering such RAID corruption problems on production
> > XFS filesystems -ever-, so it cannot be a common thing.
>
> The most common application to hit this is probably the most common
> use of O_DIRECT: qemu. Look up for btrfs errors with PI, caused by
> the interaction of checksumming. Btrfs finally fixed this a short
> while ago, and there are reports for other applications a swell.
I'm not asking about btrfs - I'm asking about actual, real world
problems reported in production XFS environments.
> For RAID you probably won't see too many reports, as with RAID the
> problem will only show up as silent corruption long after a rebuild
> rebuild happened that made use of the racy data.
Yet we are not hearing about this, either. Nobody is reporting that
their data is being found to be corrupt days/weeks/months/years down
the track.
This is important, because software RAID5 is pretty much the -only-
common usage of BLK_FEAT_STABLE_WRITES that users are exposed to.
This patch set is effectively disallowing direct IO for anyone
using software RAID5.
That is simply not an acceptable outcome here.
> With checksums
> it is much easier to reproduce and trivially shown by various xfstests.
Such as?
> With increasing storage capacities checksums are becoming more and
> more important, and I'm trying to get Linux in general and XFS
> specifically to use them well.
So when XFS implements checksums and that implementation is
incompatible with Direct IO, then we can talk about disabling Direct
IO on XFS when that feature is enabled. But right now, that feature
does not exist, and ....
> Right now I don't think anyone is
> using PI with XFS or any Linux file system given the amount of work
> I had to put in to make it work well, and how often I see regressions
> with it.
.... as you say, "nobody is using PI with XFS".
So this patchset is a "fix" for a problem that no-one is actually having
right now.
> > Forcing a performance regression on users, then telling them "you
> > need to work around the performance regression" is a pretty horrible
> > thing to do in the first place.
>
> I disagree. Not corruption user data for applications that use the
> interface correctly per all documentation is a prime priority.
Modifying an IO buffer whilst a DIO is in flight on that buffer has
-always- been an application bug. It is a vector for torn writes
that don't get detected until the next read. It is a vector for
in-memory data corruption of read buffers.
Indeed, it does not matter if the underlying storage asserts
BLK_FEAT_STABLE_WRITES or not, modifying DIO buffers that are under
IO will (eventually) result in data corruption. Hence, by your
logic, we should disable Direct IO for everyone.
That's just .... insane.
Remember: O_DIRECT means the application takes full responsibility
for ensuring IO concurrency semantics are correctly implemented.
Modifying IO buffers whilst the IO buffer is being read from or
written to by the hardware has always been an IO concurrency bug in
the application.
The behaviour being talked about here is, and always has been, an
application IO concurrency bug, regardless of PI, stable writes,
etc. Such an application bug existing is *not a valid reason for the
kernel or filesystem to disable Direct IO*.
-Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-10-30 23:18 ` Dave Chinner
@ 2025-10-31 13:00 ` Christoph Hellwig
2025-10-31 15:57 ` Keith Busch
0 siblings, 1 reply; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-31 13:00 UTC (permalink / raw)
To: Dave Chinner
Cc: Christoph Hellwig, Carlos Maiolino, Christian Brauner, Jan Kara,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block
On Fri, Oct 31, 2025 at 10:18:46AM +1100, Dave Chinner wrote:
> I'm not asking about btrfs - I'm asking about actual, real world
> problems reported in production XFS environments.
The same thing applies once we have checksums with PI. But it seems
like you don't want to listen anyway.
> > For RAID you probably won't see too many reports, as with RAID the
> > problem will only show up as silent corruption long after a rebuild
> > rebuild happened that made use of the racy data.
>
> Yet we are not hearing about this, either. Nobody is reporting that
> their data is being found to be corrupt days/weeks/months/years down
> the track.
>
> This is important, because software RAID5 is pretty much the -only-
> common usage of BLK_FEAT_STABLE_WRITES that users are exposed to.
RAID5 bounce buffers by default. It has a tunable to disable that:
https://sbsfaq.com/qnap-fails-to-reveal-data-corruption-bug-that-affects-all-4-bay-and-higher-nas-devices/
and once that was turned on it pretty much immediately caused data
corruption:
https://sbsfaq.com/qnap-fails-to-reveal-data-corruption-bug-that-affects-all-4-bay-and-higher-nas-devices/
https://sbsfaq.com/synology-nas-confirmed-to-have-same-data-corruption-bug-as-qnap/
> This patch set is effectively disallowing direct IO for anyone
> using software RAID5. That is simply not an acceptible outcome here.
Quite the contrary: fixing this properly allows STABLE_WRITES to actually
work without bouncing in lower layers and to at least get efficient
buffered I/O.
>
> > With checksums
> > it is much easier to reproduce and trivially shown by various xfstests.
>
> Such as?
Basically anything using fsstress long enough plus a few others.
>
> > With increasing storage capacities checksums are becoming more and
> > more important, and I'm trying to get Linux in general and XFS
> > specifically to use them well.
>
> So when XFS implements checksums and that implementation is
> incompatible with Direct IO, then we can talk about disabling Direct
> IO on XFS when that feature is enabled. But right now, that feature
> does not exist, and ....
Every Linux file system supports checksums with a PI-capable device.
I've been trying to make that actually work for all cases and perform
well for a while now.
>
> > Right now I don't think anyone is
> > using PI with XFS or any Linux file system given the amount of work
> > I had to put in to make it work well, and how often I see regressions
> > with it.
>
> .... as you say, "nobody is using PI with XFS".
>
> So patchset is a "fix" for a problem that no-one is actually having
> right now.
I'm making it work.
> Modifying an IO buffer whilst a DIO is in flight on that buffer has
> -always- been an application bug.
Says who?
> It is a vector for torn writes
> that don't get detected until the next read. It is a vector for
> in-memory data corruption of read buffers.
That assumes that particular use case cares about torn writes. We've
never ever documented any such requirement. We can't just make that
up 20+ years later.
> Indeed, it does not matter if the underlying storage asserts
> BLK_FEAT_STABLE_WRITES or not, modifying DIO buffers that are under
> IO will (eventually) result in data corruption.
It doesn't if that's not your assumption. But more importantly, with
RAID5 if you modify them you do not primarily corrupt your own data,
but other data in the stripe. It is a way for a malicious user to
corrupt other users' data.
> Hence, by your
> logic, we should disable Direct IO for everyone.
That's your weird logic, not mine.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-10-31 13:00 ` Christoph Hellwig
@ 2025-10-31 15:57 ` Keith Busch
2025-10-31 16:47 ` Christoph Hellwig
0 siblings, 1 reply; 53+ messages in thread
From: Keith Busch @ 2025-10-31 15:57 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Dave Chinner, Carlos Maiolino, Christian Brauner, Jan Kara,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block
On Fri, Oct 31, 2025 at 02:00:50PM +0100, Christoph Hellwig wrote:
> On Fri, Oct 31, 2025 at 10:18:46AM +1100, Dave Chinner wrote:
>
> > Modifying an IO buffer whilst a DIO is in flight on that buffer has
> > -always- been an application bug.
>
> Says who?
Not sure of any official statement to that effect, but storage in
general always says the behavior of modifying data concurrently with
in-flight operations on that data produces non-deterministic results. An
application with such behavior sounds like a bug to me as I can't
imagine anyone purposefully choosing to persist data with a random
outcome. If PI is enabled, I think they'd rather get a deterministic
guard check error so they know they did something with undefined
behavior.
It's like having reads and writes to overlapping LBA and/or memory
ranges concurrently outstanding. There's no guaranteed result there
either; specs just say it's the host's responsibility to not do that.
The kernel doesn't stop an application from trying that on raw block
direct-io, but I'd say that's an application bug.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-10-31 15:57 ` Keith Busch
@ 2025-10-31 16:47 ` Christoph Hellwig
2025-11-03 11:14 ` Jan Kara
0 siblings, 1 reply; 53+ messages in thread
From: Christoph Hellwig @ 2025-10-31 16:47 UTC (permalink / raw)
To: Keith Busch
Cc: Christoph Hellwig, Dave Chinner, Carlos Maiolino,
Christian Brauner, Jan Kara, Martin K. Petersen, linux-kernel,
linux-xfs, linux-fsdevel, linux-raid, linux-block
On Fri, Oct 31, 2025 at 09:57:35AM -0600, Keith Busch wrote:
> Not sure of any official statement to that effect, but storage in
> general always says the behavior of modifying data concurrently with
> in-flight operations on that data produces non-deterministic results.
Yes, it's pretty clear that the result is non-deterministic in what you
get. But that still does not result in corruption, because
there is a clear boundary (either the sector size, or for NVMe
optionally even a larger boundary) that designates the atomicity boundary.
> An
> application with such behavior sounds like a bug to me as I can't
> imagine anyone purposefully choosing to persist data with a random
> outcome. If PI is enabled, I think they'd rather get a deterministic
> guard check error so they know they did something with undefined
> behavior.
As long as you clearly define your transaction boundaries, that
non-atomicity is not a problem per se.
> It's like having reads and writes to overlapping LBA and/or memory
> ranges concurrently outstanding. There's no guaranteed result there
> either; specs just say it's the host's responsibilty to not do that.
There is no guaranteed result as in an enforced ordering. But there
is a pretty clear model that you get either the old or new at a
well defined boundary.
> The kernel doesn't stop an application from trying that on raw block
> direct-io, but I'd say that's an application bug.
If it corrupts other applications' data as in the RAID case it's
pretty clearly not an application bug. It's also pretty clear that
at least some applications (qemu and other VMs) have been doing this
for 20+ years.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-10-31 16:47 ` Christoph Hellwig
@ 2025-11-03 11:14 ` Jan Kara
2025-11-03 12:21 ` Christoph Hellwig
0 siblings, 1 reply; 53+ messages in thread
From: Jan Kara @ 2025-11-03 11:14 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Keith Busch, Dave Chinner, Carlos Maiolino, Christian Brauner,
Jan Kara, Martin K. Petersen, linux-kernel, linux-xfs,
linux-fsdevel, linux-raid, linux-block
On Fri 31-10-25 17:47:01, Christoph Hellwig wrote:
> On Fri, Oct 31, 2025 at 09:57:35AM -0600, Keith Busch wrote:
> > Not sure of any official statement to that effect, but storage in
> > general always says the behavior of modifying data concurrently with
> > in-flight operations on that data produces non-deterministic results.
>
> Yes, it's pretty clear that the result in non-deterministic in what you
> get. But that result still does not result in corruption, because
> there is a clear boundary ( either the sector size, or for NVMe
> optionally even a larger bodunary) that designates the atomicy boundary.
Well, is that boundary really guaranteed? I mean if you modify the buffer
under IO couldn't it happen that the DMA sees part of the sector new and
part of the sector old? I agree the window is small but I think the real
guarantee is architecture dependent and likely cacheline granularity or
something like that.
> > An
> > application with such behavior sounds like a bug to me as I can't
> > imagine anyone purposefully choosing to persist data with a random
> > outcome. If PI is enabled, I think they'd rather get a deterministic
> > guard check error so they know they did something with undefined
> > behavior.
>
> As long as your clearly define your transaction boundaries that
> non-atomicy is not a problem per se.
>
> > It's like having reads and writes to overlapping LBA and/or memory
> > ranges concurrently outstanding. There's no guaranteed result there
> > either; specs just say it's the host's responsibilty to not do that.
>
> There is no guaranteed result as in an enforced ordering. But there
> is a pretty clear model that you get either the old or new at a
> well defined boundary.
>
> > The kernel doesn't stop an application from trying that on raw block
> > direct-io, but I'd say that's an application bug.
>
> If it corrupts other applications data as in the RAID case it's
> pretty clearly not an application bug. It's also pretty clear that
> at least some applications (qemu and other VMs) have been doings this
> for 20+ years.
Well, I'm mostly of the opinion that modifying IO buffers in flight is an
application bug (as much as most current storage stacks tolerate it) but on
the other hand returning IO errors later or even corrupting RAID5 on resync
is, in my opinion, not a sane error handling on the kernel side either so I
think we need to do better.
I also think the performance cost of the unconditional bounce buffering is
so heavy that it's just a polite way of pushing the app to do proper IO
buffer synchronization itself (assuming it cares about IO performance but
given it bothered with direct IO it presumably does).
So the question is how to get out of this mess with the least disruption
possible which IMO also means providing easy way for well-behaved apps to
avoid the overhead.
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-11-03 11:14 ` Jan Kara
@ 2025-11-03 12:21 ` Christoph Hellwig
2025-11-03 22:47 ` Keith Busch
2025-11-04 23:38 ` Darrick J. Wong
0 siblings, 2 replies; 53+ messages in thread
From: Christoph Hellwig @ 2025-11-03 12:21 UTC (permalink / raw)
To: Jan Kara
Cc: Christoph Hellwig, Keith Busch, Dave Chinner, Carlos Maiolino,
Christian Brauner, Martin K. Petersen, linux-kernel, linux-xfs,
linux-fsdevel, linux-raid, linux-block
On Mon, Nov 03, 2025 at 12:14:06PM +0100, Jan Kara wrote:
> > Yes, it's pretty clear that the result in non-deterministic in what you
> > get. But that result still does not result in corruption, because
> > there is a clear boundary ( either the sector size, or for NVMe
> > optionally even a larger bodunary) that designates the atomicy boundary.
>
> Well, is that boundary really guaranteed? I mean if you modify the buffer
> under IO couldn't it happen that the DMA sees part of the sector new and
> part of the sector old? I agree the window is small but I think the real
> guarantee is architecture dependent and likely cacheline granularity or
> something like that.
If you actually modify it: yes. But I think Keith's argument was just
about regular racing reads vs writes.
> > pretty clearly not an application bug. It's also pretty clear that
> > at least some applications (qemu and other VMs) have been doings this
> > for 20+ years.
>
> Well, I'm mostly of the opinion that modifying IO buffers in flight is an
> application bug (as much as most current storage stacks tolerate it) but on
> the other hand returning IO errors later or even corrupting RAID5 on resync
> is, in my opinion, not a sane error handling on the kernel side either so I
> think we need to do better.
Yes. Also if you look at the man page, which is about as official as it
gets for the semantics, you can't find anything requiring the buffers to
be stable (but all kinds of other odd rants).
> I also think the performance cost of the unconditional bounce buffering is
> so heavy that it's just a polite way of pushing the app to do proper IO
> buffer synchronization itself (assuming it cares about IO performance but
> given it bothered with direct IO it presumably does).
>
> So the question is how to get out of this mess with the least disruption
> possible which IMO also means providing easy way for well-behaved apps to
> avoid the overhead.
Remember the cases where this matters are checksumming and parity, where
we touch all the cache lines anyway and consume the DRAM bandwidth,
although bounce buffering upgrades this from pure reads to also writes.
So the overhead is heavy, but if we handle it the right way, that is
doing the checksum/parity calculation while the cache line is still hot,
it should not be prohibitive. And getting this right in the direct
I/O code means that the low-level code could stop bounce buffering
for buffered I/O, providing a major speedup there.
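To make that point concrete, here is a minimal userspace sketch (not
kernel code; the naive bitwise crc32c is only there to keep it
self-contained) of folding the bounce copy and the checksum into a
single pass so each byte is touched while the cache line is hot:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Naive bitwise crc32c, only to keep the sketch self-contained. */
static uint32_t crc32c_byte(uint32_t crc, uint8_t byte)
{
	crc ^= byte;
	for (int i = 0; i < 8; i++)
		crc = (crc & 1) ? (crc >> 1) ^ 0x82F63B78u : crc >> 1;
	return crc;
}

/* Copy @len bytes from @src to @dst and return the crc32c of the data. */
static uint32_t copy_and_crc32c(void *dst, const void *src, size_t len)
{
	const uint8_t *s = src;
	uint8_t *d = dst;
	uint32_t crc = ~0u;

	for (size_t i = 0; i < len; i++) {
		d[i] = s[i];			/* the bounce-buffer copy */
		crc = crc32c_byte(crc, s[i]);	/* checksum the hot byte */
	}
	return ~crc;
}

int main(void)
{
	char src[512], bounce[512];

	memset(src, 0xab, sizeof(src));
	printf("crc32c = 0x%08x\n",
	       (unsigned)copy_and_crc32c(bounce, src, sizeof(src)));
	return 0;
}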
I've been thinking a bit more on how to better get the copy close to the
checksumming at least for PI, and to avoid the extra copies for RAID5
buffered I/O. Maybe a better way is to mark a bio as trusted/untrusted
so that the checksumming/raid code can bounce buffer it, and I'm starting
to like that idea. A complication is that PI could relax that requirement
if we support PI passthrough from userspace (currently only for block
devices, but I plan to add file system support), where the device checks
it, but we can't do that for parity RAID.
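A purely hypothetical toy model of that trusted/untrusted idea (none of
these names exist in the kernel; the producer marks whether the pages
backing the "bio" may change under I/O, and the consumer such as PI
checksumming or parity RAID decides whether it has to bounce) could look
roughly like this:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_bio {
	void *data;
	size_t len;
	bool untrusted_pages;	/* pages may be modified while in flight */
};

/* Consumer side: bounce only when the data could change under us. */
static void *toy_get_stable_data(struct toy_bio *bio)
{
	if (!bio->untrusted_pages)
		return bio->data;

	void *bounce = malloc(bio->len);
	memcpy(bounce, bio->data, bio->len);	/* snapshot for checksum/parity */
	return bounce;
}

int main(void)
{
	char payload[16] = "hello";
	struct toy_bio bio = {
		.data = payload,
		.len = sizeof(payload),
		.untrusted_pages = true,	/* e.g. user pages from O_DIRECT */
	};
	void *stable = toy_get_stable_data(&bio);

	printf("bounced: %s\n", stable == payload ? "no" : "yes");
	if (stable != payload)
		free(stable);
	return 0;
}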
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-11-03 12:21 ` Christoph Hellwig
@ 2025-11-03 22:47 ` Keith Busch
2025-11-04 23:38 ` Darrick J. Wong
1 sibling, 0 replies; 53+ messages in thread
From: Keith Busch @ 2025-11-03 22:47 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jan Kara, Dave Chinner, Carlos Maiolino, Christian Brauner,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block
On Mon, Nov 03, 2025 at 01:21:11PM +0100, Christoph Hellwig wrote:
> On Mon, Nov 03, 2025 at 12:14:06PM +0100, Jan Kara wrote:
> > > Yes, it's pretty clear that the result in non-deterministic in what you
> > > get. But that result still does not result in corruption, because
> > > there is a clear boundary ( either the sector size, or for NVMe
> > > optionally even a larger bodunary) that designates the atomicy boundary.
> >
> > Well, is that boundary really guaranteed? I mean if you modify the buffer
> > under IO couldn't it happen that the DMA sees part of the sector new and
> > part of the sector old? I agree the window is small but I think the real
> > guarantee is architecture dependent and likely cacheline granularity or
> > something like that.
>
> If you actually modify it: yes. But I think Keith' argument was just
> about regular racing reads vs writes.
I was seeking documented behavior about concurrently modifying and
using any part of a host data buffer, so I looked to storage specs. The
general guidance there aligns with "the repercussions are your fault".
Linux DIO didn't say that, but I'm just saying there's precedent lower
down.
I'm not even sure how you handle the read side when multiple entities
are concurrently modifying the buffer. That has to be an application
bug, even if bouncing it defeats the guard checks before the completion
overwrites the application's conflicting changes from the bounce buffer.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 1/4] fs: replace FOP_DIO_PARALLEL_WRITE with a fmode bits
2025-10-29 7:15 ` [PATCH 1/4] fs: replace FOP_DIO_PARALLEL_WRITE with a fmode bits Christoph Hellwig
2025-10-29 16:01 ` Darrick J. Wong
@ 2025-11-04 7:00 ` Nirjhar Roy (IBM)
2025-11-05 14:04 ` Christoph Hellwig
2025-11-11 9:44 ` Christian Brauner
2 siblings, 1 reply; 53+ messages in thread
From: Nirjhar Roy (IBM) @ 2025-11-04 7:00 UTC (permalink / raw)
To: Christoph Hellwig, Carlos Maiolino, Christian Brauner
Cc: Jan Kara, Martin K. Petersen, linux-kernel, linux-xfs,
linux-fsdevel, linux-raid, linux-block
On Wed, 2025-10-29 at 08:15 +0100, Christoph Hellwig wrote:
> To properly handle the direct to buffered I/O fallback for devices that
> require stable writes, we need to be able to set the DIO_PARALLEL_WRITE
> on a per-file basis and no statically for a given file_operations
> instance.
So, is the fallback configurable (i.e. can we turn it on/off)? Looking at the code it seems like it
is not. Any reason for not making it configurable?
--NR
>
> This effectively reverts a part of 210a03c9d51a ("fs: claw back a few
> FMODE_* bits").
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/ext4/file.c | 2 +-
> fs/xfs/xfs_file.c | 4 ++--
> include/linux/fs.h | 7 ++-----
> io_uring/io_uring.c | 2 +-
> 4 files changed, 6 insertions(+), 9 deletions(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 7a8b30932189..b484e98b9c78 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -924,6 +924,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
> filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
>
> filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
> + filp->f_mode |= FMODE_DIO_PARALLEL_WRITE;
> return dquot_file_open(inode, filp);
> }
>
> @@ -978,7 +979,6 @@ const struct file_operations ext4_file_operations = {
> .splice_write = iter_file_splice_write,
> .fallocate = ext4_fallocate,
> .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
> - FOP_DIO_PARALLEL_WRITE |
> FOP_DONTCACHE,
> };
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 2702fef2c90c..5703b6681b1d 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1553,6 +1553,7 @@ xfs_file_open(
> if (xfs_is_shutdown(XFS_M(inode->i_sb)))
> return -EIO;
> file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
> + file->f_mode |= FMODE_DIO_PARALLEL_WRITE;
> if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
> file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
> return generic_file_open(inode, file);
> @@ -1951,8 +1952,7 @@ const struct file_operations xfs_file_operations = {
> .fadvise = xfs_file_fadvise,
> .remap_file_range = xfs_file_remap_range,
> .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
> - FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
> - FOP_DONTCACHE,
> + FOP_BUFFER_WASYNC | FOP_DONTCACHE,
> };
>
> const struct file_operations xfs_dir_file_operations = {
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index c895146c1444..09b47effc55e 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -128,9 +128,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> #define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6))
> /* File supports atomic writes */
> #define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7))
> -
> -/* FMODE_* bit 8 */
> -
> +/* Supports non-exclusive O_DIRECT writes from multiple threads */
> +#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)(1 << 8))
> /* 32bit hashes as llseek() offset (for directories) */
> #define FMODE_32BITHASH ((__force fmode_t)(1 << 9))
> /* 64bit hashes as llseek() offset (for directories) */
> @@ -2317,8 +2316,6 @@ struct file_operations {
> #define FOP_BUFFER_WASYNC ((__force fop_flags_t)(1 << 1))
> /* Supports synchronous page faults for mappings */
> #define FOP_MMAP_SYNC ((__force fop_flags_t)(1 << 2))
> -/* Supports non-exclusive O_DIRECT writes from multiple threads */
> -#define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3))
> /* Contains huge pages */
> #define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4))
> /* Treat loff_t as unsigned (e.g., /dev/mem) */
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 296667ba712c..668937da27e8 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -469,7 +469,7 @@ static void io_prep_async_work(struct io_kiocb *req)
>
> /* don't serialize this request if the fs doesn't need it */
> if (should_hash && (req->file->f_flags & O_DIRECT) &&
> - (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE))
> + (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE))
> should_hash = false;
> if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
> io_wq_hash_work(&req->work, file_inode(req->file));
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 2/4] fs: return writeback errors for IOCB_DONTCACHE in generic_write_sync
2025-10-29 16:37 ` Christoph Hellwig
2025-10-29 18:12 ` Darrick J. Wong
@ 2025-11-04 12:04 ` Nirjhar Roy (IBM)
2025-11-04 15:53 ` Christoph Hellwig
1 sibling, 1 reply; 53+ messages in thread
From: Nirjhar Roy (IBM) @ 2025-11-04 12:04 UTC (permalink / raw)
To: Christoph Hellwig, Darrick J. Wong
Cc: Carlos Maiolino, Christian Brauner, Jan Kara, Martin K. Petersen,
linux-kernel, linux-xfs, linux-fsdevel, linux-raid, linux-block
On Wed, 2025-10-29 at 17:37 +0100, Christoph Hellwig wrote:
> On Wed, Oct 29, 2025 at 09:01:01AM -0700, Darrick J. Wong wrote:
> > Hum. So we kick writeback but don't wait for any of it to start, and
> > immediately sample wberr. Does that mean that in the "bdev died" case,
> > the newly initiated writeback will have failed so quickly that
> > file_check_and_advance_wb_err will see that?
>
> Yes, this is primarily about catching errors in the submission path
> before it reaches the device, which are returned synchronously.
So, what you are saying is that file_check_and_advance_wb_err() will wait/block until the writeback
request kicked off by filemap_fdatawrite_range_kick() is completely submitted and there are no more
chances of a writeback failure?
--NR
>
> > Or are we only reflecting
> > past write failures back to userspace on the *second* write after the
> > device dies?
> >
> > It would be helpful to know which fstests break, btw.
>
> generic/252 generic/329 xfs/237
>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 3/4] xfs: use IOCB_DONTCACHE when falling back to buffered writes
2025-10-29 7:15 ` [PATCH 3/4] xfs: use IOCB_DONTCACHE when falling back to buffered writes Christoph Hellwig
2025-10-29 15:57 ` Darrick J. Wong
@ 2025-11-04 12:33 ` Nirjhar Roy (IBM)
2025-11-04 15:52 ` Christoph Hellwig
1 sibling, 1 reply; 53+ messages in thread
From: Nirjhar Roy (IBM) @ 2025-11-04 12:33 UTC (permalink / raw)
To: Christoph Hellwig, Carlos Maiolino, Christian Brauner
Cc: Jan Kara, Martin K. Petersen, linux-kernel, linux-xfs,
linux-fsdevel, linux-raid, linux-block
On Wed, 2025-10-29 at 08:15 +0100, Christoph Hellwig wrote:
> Doing sub-block direct writes to COW inodes is not supported by XFS,
> because new blocks need to be allocated as a whole. Such writes
Okay, so since the allocation of new blocks involves a whole lot of metadata updates/transactions etc.,
that would consume a lot of time, and in this large window the user buffer (for direct I/O) can be
re-used/freed, which would cause corruption?
Just thinking out loud: what if we supported sub-block direct IO in XFS and indeed allocated new
blocks + updated the metadata structures and then directly wrote the user data to the newly allocated
blocks instead of using the page cache? Assuming the application doesn't modify the user data buffer
- can we (at least theoretically) do that kind of sub-block DIO?
--NR
> fall back to buffered I/O, and really should be using the
> IOCB_DONTCACHE that didn't exist when the code was added to mimic
Just curious: How was it mimicked?
> direct I/O semantics as closely as possible. Also clear the
> IOCB_DIRECT flags so that later code can't get confused by it being
> set for something that at this point is not a direct I/O operation
> any more.
This makes sense to me.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_file.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 5703b6681b1d..e09ae86e118e 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1119,6 +1119,9 @@ xfs_file_write_iter(
> ret = xfs_file_dio_write(iocb, from);
> if (ret != -ENOTBLK)
> return ret;
> +
> + iocb->ki_flags &= ~IOCB_DIRECT;
> + iocb->ki_flags |= IOCB_DONTCACHE;
> }
>
> if (xfs_is_zoned_inode(ip))
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 3/4] xfs: use IOCB_DONTCACHE when falling back to buffered writes
2025-11-04 12:33 ` Nirjhar Roy (IBM)
@ 2025-11-04 15:52 ` Christoph Hellwig
0 siblings, 0 replies; 53+ messages in thread
From: Christoph Hellwig @ 2025-11-04 15:52 UTC (permalink / raw)
To: Nirjhar Roy (IBM)
Cc: Christoph Hellwig, Carlos Maiolino, Christian Brauner, Jan Kara,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block
On Tue, Nov 04, 2025 at 06:03:35PM +0530, Nirjhar Roy (IBM) wrote:
> > Doing sub-block direct writes to COW inodes is not supported by XFS,
> > because new blocks need to be allocated as a whole. Such writes
>
> Okay, since allocation of new blocks involves whole lot of metatdata
> updates/transactions etc and that would consume a lot of time and in
> this large window the user buffer(for direct I/O) can be re-used/freed
> which would cause corruptions?
I don't understand what you're trying to say here.
> Just thinking out loud: What if we supported sub-block direct IO in XFS
> and indeed allocated new blocks+ update the metadata structures and then
> directly write the user data to the newly allocated blocks instead of
> using the page cache?
>
> Assuming the application doesn't modify the user data buffer - can we
> (at least theoritically) do such kind of sub-block DIO?
Regular XFS does that. Zoned XFS or the always-COW debug mode can't do
that (except maybe for appends) as it requires a read-modify-write
cycle that is not implemented in iomap. Yes, we could implement that,
but it's not going to perform any better than the fallback, and would
also require full serialization.
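For reference, the read-modify-write cycle mentioned above is conceptually
just the following userspace sketch (not how iomap would implement it;
file name, block size and the single-block assumption are all just for
the illustration):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define BLOCK_SIZE 4096

/*
 * Write @len bytes at @off where neither is block aligned, assuming the
 * range fits inside a single block.
 */
static int rmw_write(int fd, const void *buf, size_t len, off_t off)
{
	off_t block_start = off & ~(off_t)(BLOCK_SIZE - 1);
	char block[BLOCK_SIZE];
	ssize_t n;

	/* Read the existing block, zero-filling anything past EOF. */
	n = pread(fd, block, BLOCK_SIZE, block_start);
	if (n < 0)
		return -1;
	if ((size_t)n < BLOCK_SIZE)
		memset(block + n, 0, BLOCK_SIZE - n);

	/* Modify the sub-block range... */
	memcpy(block + (off - block_start), buf, len);

	/* ...and write the whole block back. */
	return pwrite(fd, block, BLOCK_SIZE, block_start) == BLOCK_SIZE ? 0 : -1;
}

int main(void)
{
	int fd = open("rmw-demo.dat", O_RDWR | O_CREAT, 0600);

	if (fd < 0)
		return 1;
	/* A deliberately unaligned sub-block write. */
	int ret = rmw_write(fd, "hello", 5, 100);
	printf("rmw_write: %d\n", ret);
	close(fd);
	return ret ? 1 : 0;
}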
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 2/4] fs: return writeback errors for IOCB_DONTCACHE in generic_write_sync
2025-11-04 12:04 ` Nirjhar Roy (IBM)
@ 2025-11-04 15:53 ` Christoph Hellwig
0 siblings, 0 replies; 53+ messages in thread
From: Christoph Hellwig @ 2025-11-04 15:53 UTC (permalink / raw)
To: Nirjhar Roy (IBM)
Cc: Christoph Hellwig, Darrick J. Wong, Carlos Maiolino,
Christian Brauner, Jan Kara, Martin K. Petersen, linux-kernel,
linux-xfs, linux-fsdevel, linux-raid, linux-block
On Tue, Nov 04, 2025 at 05:34:50PM +0530, Nirjhar Roy (IBM) wrote:
> So, what you are saying is file_check_and_advance_wb_err() will
> wait/block till the write back request done in
> filemap_fdatawrite_range_kick() is completely submitted
No, it won't wait. But filemap_fdatawrite_range_kick isn't asynchronous,
so it doesn't have to wait either.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-11-03 12:21 ` Christoph Hellwig
2025-11-03 22:47 ` Keith Busch
@ 2025-11-04 23:38 ` Darrick J. Wong
2025-11-05 14:11 ` Christoph Hellwig
1 sibling, 1 reply; 53+ messages in thread
From: Darrick J. Wong @ 2025-11-04 23:38 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jan Kara, Keith Busch, Dave Chinner, Carlos Maiolino,
Christian Brauner, Martin K. Petersen, linux-kernel, linux-xfs,
linux-fsdevel, linux-raid, linux-block
On Mon, Nov 03, 2025 at 01:21:11PM +0100, Christoph Hellwig wrote:
> On Mon, Nov 03, 2025 at 12:14:06PM +0100, Jan Kara wrote:
> > > Yes, it's pretty clear that the result in non-deterministic in what you
> > > get. But that result still does not result in corruption, because
> > > there is a clear boundary ( either the sector size, or for NVMe
> > > optionally even a larger bodunary) that designates the atomicy boundary.
> >
> > Well, is that boundary really guaranteed? I mean if you modify the buffer
> > under IO couldn't it happen that the DMA sees part of the sector new and
> > part of the sector old? I agree the window is small but I think the real
> > guarantee is architecture dependent and likely cacheline granularity or
> > something like that.
>
> If you actually modify it: yes. But I think Keith' argument was just
> about regular racing reads vs writes.
>
> > > pretty clearly not an application bug. It's also pretty clear that
> > > at least some applications (qemu and other VMs) have been doings this
> > > for 20+ years.
> >
> > Well, I'm mostly of the opinion that modifying IO buffers in flight is an
> > application bug (as much as most current storage stacks tolerate it) but on
> > the other hand returning IO errors later or even corrupting RAID5 on resync
> > is, in my opinion, not a sane error handling on the kernel side either so I
> > think we need to do better.
>
> Yes. Also if you look at the man page which is about official as it gets
> for the semantics you can't find anything requiring the buffers to be
> stable (but all kinds of other odd rants).
>
> > I also think the performance cost of the unconditional bounce buffering is
> > so heavy that it's just a polite way of pushing the app to do proper IO
> > buffer synchronization itself (assuming it cares about IO performance but
> > given it bothered with direct IO it presumably does).
> >
> > So the question is how to get out of this mess with the least disruption
> > possible which IMO also means providing easy way for well-behaved apps to
> > avoid the overhead.
>
> Remember the cases where this matters is checksumming and parity, where
> we touch all the cache lines anyway and consume the DRAM bandwidth,
> although bounce buffering upgrades this from pure reads to also writes.
> So the overhead is heavy, but if we handle it the right way, that is
> doing the checksum/parity calculation while the cache line is still hot
> it should not be prohibitive. And getting this right in the direct
> I/O code means that the low-level code could stop bounce buffering
> for buffered I/O, providing a major speedup there.
>
> I've been thinking a bit more on how to better get the copy close to the
> checksumming at least for PI, and to avoid the extra copies for RAID5
> buffered I/O. M maybe a better way is to mark a bio as trusted/untrusted
> so that the checksumming/raid code can bounce buffer it, and I start to
> like that idea. A complication is that PI could relax that requirement
> if we support PI passthrough from userspace (currently only for block
> device, but I plan to add file system support), where the device checks
> it, but we can't do that for parity RAID.
IIRC, a PI disk is supposed to check the supplied CRC against the
supplied data, and fail the write if there's a discrepancy, right? In
that case, an application can't actually corrupt its own data because
hardware will catch it.
For reads, the kernel will check the supplied CRC against the data
buffer, right? So a program can blow itself up, but that only affects
the buggy program.
I think that means the following:
A. We can allow mutant directio to non-PI devices because buggy programs
can only screw themselves over. Not great but we've allowed this
forever.
B. We can also allow it to PI devices because those buggy programs will
get hit with EIOs immediately.
C. Mutant directio reads from a RAID1/5 on non-PI devices are ok-ish
because the broken application can decide to retry and that's just
wasting resources.
D. Mutant directio reads from a RAID1/5 on PI devices are not good
because the read failure will result in an unnecessary rebuild, which
could turn really bad if the other disks are corrupt.
E. Mutant directio writes to a RAID5 are bad bad bad because you corrupt
the stripe and now unsuspecting users on other strips lose data.
I think the btrfs corruption problems are akin to a RAID5 where you can
persist the wrong CRC to storage and you'll only see it on re-read; but
at least the blast is contained to the buggy application's file.
I wonder if that means we really need a way to convey the potential
damage of a mutant write through the block layer / address space so that
the filesystem can do the right thing? IOWs, instead of a single
stable-pages flag, something along the lines of:
enum mutation_blast_radius {
/* nobody will notice a thing */
MBR_UNCHECKED,
/* program doing the corruption will notice */
MBR_BADAPP,
/* everyone else's data get corrupted too */
MBR_EVERYONE,
};
AS_STABLE_WRITES is set for MBR_BADAPP and MBR_EVERYONE, and the
directio -> dontcache flag change is done for a write to a MBR_EVERYONE
bdev.
Hm?
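If it helps, here is a toy distillation of that policy (the enum is copied
from above; the helper names are made up and nothing like them exists yet):

#include <stdbool.h>
#include <stdio.h>

enum mutation_blast_radius {
	MBR_UNCHECKED,	/* nobody will notice a thing */
	MBR_BADAPP,	/* the program doing the corruption will notice */
	MBR_EVERYONE,	/* everyone else's data gets corrupted too */
};

/* AS_STABLE_WRITES would be set for MBR_BADAPP and MBR_EVERYONE... */
static bool wants_stable_writes(enum mutation_blast_radius r)
{
	return r != MBR_UNCHECKED;
}

/* ...but the directio -> dontcache fallback only for MBR_EVERYONE. */
static bool dio_falls_back_to_dontcache(enum mutation_blast_radius r)
{
	return r == MBR_EVERYONE;
}

int main(void)
{
	enum mutation_blast_radius r = MBR_EVERYONE;

	printf("stable writes: %d, dio->dontcache: %d\n",
	       wants_stable_writes(r), dio_falls_back_to_dontcache(r));
	return 0;
}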
--D
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 1/4] fs: replace FOP_DIO_PARALLEL_WRITE with a fmode bits
2025-11-04 7:00 ` Nirjhar Roy (IBM)
@ 2025-11-05 14:04 ` Christoph Hellwig
0 siblings, 0 replies; 53+ messages in thread
From: Christoph Hellwig @ 2025-11-05 14:04 UTC (permalink / raw)
To: Nirjhar Roy (IBM)
Cc: Christoph Hellwig, Carlos Maiolino, Christian Brauner, Jan Kara,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block
On Tue, Nov 04, 2025 at 12:30:06PM +0530, Nirjhar Roy (IBM) wrote:
> On Wed, 2025-10-29 at 08:15 +0100, Christoph Hellwig wrote:
> > To properly handle the direct to buffered I/O fallback for devices that
> > require stable writes, we need to be able to set the DIO_PARALLEL_WRITE
> > on a per-file basis and no statically for a given file_operations
> > instance.
> So, is the fallback configurable(like we can turn it on/off)? Looking at
> the code it seems like it is not. Any reason for not making it
> configurable?
Please read the cover letter.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-11-04 23:38 ` Darrick J. Wong
@ 2025-11-05 14:11 ` Christoph Hellwig
2025-11-05 21:44 ` Darrick J. Wong
0 siblings, 1 reply; 53+ messages in thread
From: Christoph Hellwig @ 2025-11-05 14:11 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Jan Kara, Keith Busch, Dave Chinner,
Carlos Maiolino, Christian Brauner, Martin K. Petersen,
linux-kernel, linux-xfs, linux-fsdevel, linux-raid, linux-block
On Tue, Nov 04, 2025 at 03:38:24PM -0800, Darrick J. Wong wrote:
> IIRC, a PI disk is supposed to check the supplied CRC against the
> supplied data, and fail the write if there's a discrepancy, right?
Yes.
> In
> that case, an application can't actually corrupt its own data because
> hardware will catch it.
Yes.
> A. We can allow mutant directio to non-PI devices because buggy programs
> can only screw themselves over. Not great but we've allowed this
> forever.
>
> B. We can also allow it to PI devices because those buggy programs will
> get hit with EIOs immediately.
Well, those "buggy programs" include qemu and probably others. Which
immediately limits the usefulness of operating with PI.
This also does not help with non-PI checksums - one thing my RFC series
did is to allow storing checksums in non-PI metadata, which is useful
for devices that are too cheap for PI, but still provide metadata. These
do exist, although are not very wide spread, and this will require an
on-disk flag in XFS, so it's not right there. But compared to all the
others methods to provide checksums, block metdata is by far the best,
so I'll keep it on the agenda in the hope that such devices become
more prevalent.
> I wonder if that means we really need a way to convey the potential
> damage of a mutant write through the block layer / address space so that
> the filesystem can do the right thing? IOWs, instead of a single
> stable-pages flag, something along the lines of:
Maybe, I actually suggested this earlier. But breaking the biggest user
of direct I/O (qemu) by default once we have checksums still feels like a
losing proposition.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-11-05 14:11 ` Christoph Hellwig
@ 2025-11-05 21:44 ` Darrick J. Wong
2025-11-06 9:50 ` Johannes Thumshirn
0 siblings, 1 reply; 53+ messages in thread
From: Darrick J. Wong @ 2025-11-05 21:44 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jan Kara, Keith Busch, Dave Chinner, Carlos Maiolino,
Christian Brauner, Martin K. Petersen, linux-kernel, linux-xfs,
linux-fsdevel, linux-raid, linux-block
On Wed, Nov 05, 2025 at 03:11:30PM +0100, Christoph Hellwig wrote:
> On Tue, Nov 04, 2025 at 03:38:24PM -0800, Darrick J. Wong wrote:
> > IIRC, a PI disk is supposed to check the supplied CRC against the
> > supplied data, and fail the write if there's a discrepancy, right?
>
> Yes.
>
> > In
> > that case, an application can't actually corrupt its own data because
> > hardware will catch it.
>
> Yes.
>
> > A. We can allow mutant directio to non-PI devices because buggy programs
> > can only screw themselves over. Not great but we've allowed this
> > forever.
> >
> > B. We can also allow it to PI devices because those buggy programs will
> > get hit with EIOs immediately.
>
> Well, those "buggy programs" include qemu and probably others. Which
> immediately limits the usefulness of operating with PI.
>
> This also does not help with non-PI checksums - one thing my RFC series
> did is to allow storing checksums in non-PI metadata, which is useful
> for devices that are too cheap for PI, but still provide metadata. These
> do exist, although they are not very widespread, and this will require an
> on-disk flag in XFS, so it's not right there. But compared to all the
> other methods to provide checksums, block metadata is by far the best,
> so I'll keep it on the agenda in the hope that such devices become
> more prevalent.
>
> > I wonder if that means we really need a way to convey the potential
> > damage of a mutant write through the block layer / address space so that
> > the filesystem can do the right thing? IOWs, instead of a single
> > stable-pages flag, something along the lines of:
>
> Maybe, I actually suggested this earlier. But breaking the biggest user
> of direct I/O (qemu) by default once we have checksums still feels like a
> losing proposition.
Just out of curiosity -- is qemu itself mutating the buffers that it is
passing down to the lower levels via dio? Or is it a program in the
guest that's mutating buffers that are submitted for dio, which then get
zerocopied all the way down to the hypervisor?
(But yeah, I'm coming back around to the notion that the dio ->
dontcache transition is needed for all of the PI/raid cases...)
--D
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-11-05 21:44 ` Darrick J. Wong
@ 2025-11-06 9:50 ` Johannes Thumshirn
2025-11-06 12:49 ` hch
0 siblings, 1 reply; 53+ messages in thread
From: Johannes Thumshirn @ 2025-11-06 9:50 UTC (permalink / raw)
To: Darrick J. Wong, hch
Cc: Jan Kara, Keith Busch, Dave Chinner, Carlos Maiolino,
Christian Brauner, Martin K. Petersen,
linux-kernel@vger.kernel.org, linux-xfs@vger.kernel.org,
linux-fsdevel@vger.kernel.org, linux-raid@vger.kernel.org,
linux-block@vger.kernel.org
On 11/5/25 10:44 PM, Darrick J. Wong wrote:
> Just out of curiosity -- is qemu itself mutating the buffers that it is
> passing down to the lower levels via dio? Or is it a program in the
> guest that's mutating buffers that are submitted for dio, which then get
> zerocopied all the way down to the hypervisor?
If my memory serves me right it is the guest (or at least can be). I
remember a bug report on btrfs where a Windows guest had messed up
checksums because of modifying inflight I/O.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-11-06 9:50 ` Johannes Thumshirn
@ 2025-11-06 12:49 ` hch
2025-11-12 14:18 ` Ming Lei
0 siblings, 1 reply; 53+ messages in thread
From: hch @ 2025-11-06 12:49 UTC (permalink / raw)
To: Johannes Thumshirn
Cc: Darrick J. Wong, hch, Jan Kara, Keith Busch, Dave Chinner,
Carlos Maiolino, Christian Brauner, Martin K. Petersen,
linux-kernel@vger.kernel.org, linux-xfs@vger.kernel.org,
linux-fsdevel@vger.kernel.org, linux-raid@vger.kernel.org,
linux-block@vger.kernel.org
On Thu, Nov 06, 2025 at 09:50:10AM +0000, Johannes Thumshirn wrote:
> On 11/5/25 10:44 PM, Darrick J. Wong wrote:
> > Just out of curiosity -- is qemu itself mutating the buffers that it is
> > passing down to the lower levels via dio? Or is it a program in the
> > guest that's mutating buffers that are submitted for dio, which then get
> > zerocopied all the way down to the hypervisor?
>
> If my memory serves me right it is the guest (or at least can be). I
> remember a bug report on btrfs where a Windows guest had messed up
> checksums because of modifying inflight I/O.
qemu passes I/O through, so yes it is guest-controlled. Windows is the most
famous, but the Linux swap code can trigger it easily too.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required
2025-10-29 7:15 ` [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required Christoph Hellwig
2025-10-29 15:53 ` Darrick J. Wong
@ 2025-11-10 13:38 ` Nirjhar Roy (IBM)
2025-11-10 13:59 ` Christoph Hellwig
1 sibling, 1 reply; 53+ messages in thread
From: Nirjhar Roy (IBM) @ 2025-11-10 13:38 UTC (permalink / raw)
To: Christoph Hellwig, Carlos Maiolino, Christian Brauner
Cc: Jan Kara, Martin K. Petersen, linux-kernel, linux-xfs,
linux-fsdevel, linux-raid, linux-block
On Wed, 2025-10-29 at 08:15 +0100, Christoph Hellwig wrote:
> Inodes can be marked as requiring stable writes, which is a setting
> usually inherited from block devices that require stable writes. Block
> devices require stable writes when the drivers have to sample the data
> more than once, e.g. to calculate a checksum or parity in one pass, and
> then send the data on to a hardware device, and modifying the data
> in-flight can lead to inconsistent checksums or parity.
>
> For buffered I/O, the writeback code implements this by not allowing
> modifications while folios are marked as under writeback, but for
> direct I/O, the kernel currently does not have any way to prevent the
> user application from modifying the in-flight memory, so modifications
> can easily corrupt data despite the block driver setting the stable
> write flag. Even worse, corruption can happen on reads as well,
> where concurrent modifications can cause checksum mismatches, or
> failures to rebuild parity. One application known to trigger this
> behavior is Qemu when running Windows VMs, but there might be many
> others as well. xfstests can also hit this behavior, not only in the
> specifically crafted patch for this (generic/761), but also in
> various other tests that mostly stress races between different I/O
> modes, with generic/095 being the most trivial and easy to hit
> one.
>
> Make XFS fall back to uncached buffered I/O when the block device
> requires stable writes to avoid these races.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_file.c | 54 +++++++++++++++++++++++++++++++++++++++--------
> fs/xfs/xfs_iops.c | 6 ++++++
> 2 files changed, 51 insertions(+), 9 deletions(-)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index e09ae86e118e..0668af07966a 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -230,6 +230,12 @@ xfs_file_dio_read(
> struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
> ssize_t ret;
>
> + if (mapping_stable_writes(iocb->ki_filp->f_mapping)) {
> + xfs_info_once(ip->i_mount,
> + "falling back from direct to buffered I/O for read");
> + return -ENOTBLK;
> + }
> +
> trace_xfs_file_direct_read(iocb, to);
>
> if (!iov_iter_count(to))
> @@ -302,13 +308,22 @@ xfs_file_read_iter(
> if (xfs_is_shutdown(mp))
> return -EIO;
>
> - if (IS_DAX(inode))
> + if (IS_DAX(inode)) {
> ret = xfs_file_dax_read(iocb, to);
> - else if (iocb->ki_flags & IOCB_DIRECT)
> + goto done;
> + }
> +
> + if (iocb->ki_flags & IOCB_DIRECT) {
> ret = xfs_file_dio_read(iocb, to);
> - else
> - ret = xfs_file_buffered_read(iocb, to);
> + if (ret != -ENOTBLK)
> + goto done;
> +
> + iocb->ki_flags &= ~IOCB_DIRECT;
> + iocb->ki_flags |= IOCB_DONTCACHE;
> + }
>
> + ret = xfs_file_buffered_read(iocb, to);
> +done:
> if (ret > 0)
> XFS_STATS_ADD(mp, xs_read_bytes, ret);
> return ret;
> @@ -883,6 +898,7 @@ xfs_file_dio_write(
> struct iov_iter *from)
> {
> struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
> + struct xfs_mount *mp = ip->i_mount;
> struct xfs_buftarg *target = xfs_inode_buftarg(ip);
> size_t count = iov_iter_count(from);
>
> @@ -890,15 +906,21 @@ xfs_file_dio_write(
> if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
> return -EINVAL;
>
> + if (mapping_stable_writes(iocb->ki_filp->f_mapping)) {
> + xfs_info_once(mp,
> + "falling back from direct to buffered I/O for write");
Minor: Let us say that a user opens a file with O_DIRECT on an atomic-write-enabled device (requiring
stable writes), and we get this warning once. Now the same or a different user/application opens another
file with O_DIRECT on the same atomic-write-enabled device and expects atomic writes to be enabled - but
they will not be enabled (since the kernel has fallen back to the uncached buffered write path)
without any warning message. Won't that be a bit confusing for the user (of course unless the user
is totally aware of the kernel's exact behavior)?
--NR
> + return -ENOTBLK;
> + }
> +
> /*
> * For always COW inodes we also must check the alignment of each
> * individual iovec segment, as they could end up with different
> * I/Os due to the way bio_iov_iter_get_pages works, and we'd
> * then overwrite an already written block.
> */
> - if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
> + if (((iocb->ki_pos | count) & mp->m_blockmask) ||
> (xfs_is_always_cow_inode(ip) &&
> - (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
> + (iov_iter_alignment(from) & mp->m_blockmask)))
> return xfs_file_dio_write_unaligned(ip, iocb, from);
> if (xfs_is_zoned_inode(ip))
> return xfs_file_dio_write_zoned(ip, iocb, from);
> @@ -1555,10 +1577,24 @@ xfs_file_open(
> {
> if (xfs_is_shutdown(XFS_M(inode->i_sb)))
> return -EIO;
> +
> + /*
> + * If the underlying device requires stable writes, we have to fall
> + * back to (uncached) buffered I/O for direct I/O reads and writes, as
> + * the kernel can't prevent applications from modifying the memory under
> + * I/O. We still claim to support O_DIRECT as we want opens for that to
> + * succeed and fall back.
> + *
> + * As atomic writes are only supported for direct I/O, they can't be
> + * supported either in this case.
> + */
> file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
> - file->f_mode |= FMODE_DIO_PARALLEL_WRITE;
> - if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
> - file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
> + if (!mapping_stable_writes(file->f_mapping)) {
> + file->f_mode |= FMODE_DIO_PARALLEL_WRITE;
> + if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
> + file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
> + }
> +
> return generic_file_open(inode, file);
> }
>
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index caff0125faea..bd49ac6b31de 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -672,6 +672,12 @@ xfs_report_atomic_write(
> struct xfs_inode *ip,
> struct kstat *stat)
> {
> + /*
> + * If the stable writes flag is set, we have to fall back to buffered
> + * I/O, which doesn't support atomic writes.
> + */
> + if (mapping_stable_writes(VFS_I(ip)->i_mapping))
> + return;
> generic_fill_statx_atomic_writes(stat,
> xfs_get_atomic_write_min(ip),
> xfs_get_atomic_write_max(ip),
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required
2025-11-10 13:38 ` Nirjhar Roy (IBM)
@ 2025-11-10 13:59 ` Christoph Hellwig
2025-11-12 7:13 ` Nirjhar Roy (IBM)
0 siblings, 1 reply; 53+ messages in thread
From: Christoph Hellwig @ 2025-11-10 13:59 UTC (permalink / raw)
To: Nirjhar Roy (IBM)
Cc: Christoph Hellwig, Carlos Maiolino, Christian Brauner, Jan Kara,
Martin K. Petersen, linux-kernel, linux-xfs, linux-fsdevel,
linux-raid, linux-block
On Mon, Nov 10, 2025 at 07:08:05PM +0530, Nirjhar Roy (IBM) wrote:
> Minor: Let us say that a user opens a file with O_DIRECT on an
> atomic-write-enabled device (requiring stable writes), and we get this
> warning once. Now the same or a different user/application opens
> another file with O_DIRECT on the same atomic-write-enabled device and
> expects atomic writes to be enabled - but they will not be enabled
> (since the kernel has fallen back to the uncached buffered write path)
> without any warning message. Won't that be a bit confusing for the
> user (of course unless the user is totally aware of the kernel's exact
> behavior)?
The kernel with this patch should reject IOCB_ATOMIC writes because
FMODE_CAN_ATOMIC_WRITE is not set when we need to fall back.
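For reference, the shape of the check that enforces that on the write path is
roughly the following (a paraphrase, not a verbatim copy of the kernel source):
	/* RWF_ATOMIC validation while setting up the kiocb */
	if (flags & RWF_ATOMIC) {
		if (!(file->f_mode & FMODE_CAN_ATOMIC_WRITE))
			return -EOPNOTSUPP;	/* cleared when we must fall back */
		ki_flags |= IOCB_ATOMIC;
	}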
But anyway, based on the feedback in this thread I plan to revisit the
approach so that the I/O issuer can declare I/O stable (initially just
for buffered I/O, but things like nvmet and nfsd might be able to
guarantee that for direct I/O as well), and then bounce buffer in lower
layers. This should then also support parallel writes, async I/O and
atomic writes.
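Conceptually the bounce buffering would look something like the sketch below;
every name in it is hypothetical and only illustrates the direction, not an
actual interface:
	static struct bio *maybe_bounce(struct bio *bio)
	{
		/* the submitter promised not to touch the pages in flight */
		if (bio->bi_opf & REQ_STABLE_DATA)	/* hypothetical flag */
			return bio;
		/*
		 * Otherwise snapshot the payload into pages userspace cannot
		 * reach before checksums or parity are calculated from it.
		 */
		return bio_snapshot_data(bio);		/* hypothetical helper */
	}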
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 1/4] fs: replace FOP_DIO_PARALLEL_WRITE with a fmode bits
2025-10-29 7:15 ` [PATCH 1/4] fs: replace FOP_DIO_PARALLEL_WRITE with a fmode bits Christoph Hellwig
2025-10-29 16:01 ` Darrick J. Wong
2025-11-04 7:00 ` Nirjhar Roy (IBM)
@ 2025-11-11 9:44 ` Christian Brauner
2 siblings, 0 replies; 53+ messages in thread
From: Christian Brauner @ 2025-11-11 9:44 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Jan Kara, Martin K. Petersen, linux-kernel,
linux-xfs, linux-fsdevel, linux-raid, linux-block
On Wed, Oct 29, 2025 at 08:15:02AM +0100, Christoph Hellwig wrote:
> To properly handle the direct to buffered I/O fallback for devices that
> require stable writes, we need to be able to set the DIO_PARALLEL_WRITE
> on a per-file basis and not statically for a given file_operations
> instance.
Groan...
>
> This effectively reverts a part of 210a03c9d51a ("fs: claw back a few
> FMODE_* bits").
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/ext4/file.c | 2 +-
> fs/xfs/xfs_file.c | 4 ++--
> include/linux/fs.h | 7 ++-----
> io_uring/io_uring.c | 2 +-
> 4 files changed, 6 insertions(+), 9 deletions(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 7a8b30932189..b484e98b9c78 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -924,6 +924,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
> filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
>
> filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
> + filp->f_mode |= FMODE_DIO_PARALLEL_WRITE;
> return dquot_file_open(inode, filp);
> }
>
> @@ -978,7 +979,6 @@ const struct file_operations ext4_file_operations = {
> .splice_write = iter_file_splice_write,
> .fallocate = ext4_fallocate,
> .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
> - FOP_DIO_PARALLEL_WRITE |
> FOP_DONTCACHE,
> };
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 2702fef2c90c..5703b6681b1d 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1553,6 +1553,7 @@ xfs_file_open(
> if (xfs_is_shutdown(XFS_M(inode->i_sb)))
> return -EIO;
> file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
> + file->f_mode |= FMODE_DIO_PARALLEL_WRITE;
> if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
> file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
> return generic_file_open(inode, file);
> @@ -1951,8 +1952,7 @@ const struct file_operations xfs_file_operations = {
> .fadvise = xfs_file_fadvise,
> .remap_file_range = xfs_file_remap_range,
> .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
> - FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
> - FOP_DONTCACHE,
> + FOP_BUFFER_WASYNC | FOP_DONTCACHE,
> };
>
> const struct file_operations xfs_dir_file_operations = {
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index c895146c1444..09b47effc55e 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -128,9 +128,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> #define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6))
> /* File supports atomic writes */
> #define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7))
> -
> -/* FMODE_* bit 8 */
> -
> +/* Supports non-exclusive O_DIRECT writes from multiple threads */
> +#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)(1 << 8))
> /* 32bit hashes as llseek() offset (for directories) */
> #define FMODE_32BITHASH ((__force fmode_t)(1 << 9))
> /* 64bit hashes as llseek() offset (for directories) */
> @@ -2317,8 +2316,6 @@ struct file_operations {
> #define FOP_BUFFER_WASYNC ((__force fop_flags_t)(1 << 1))
> /* Supports synchronous page faults for mappings */
> #define FOP_MMAP_SYNC ((__force fop_flags_t)(1 << 2))
> -/* Supports non-exclusive O_DIRECT writes from multiple threads */
> -#define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3))
> /* Contains huge pages */
> #define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4))
> /* Treat loff_t as unsigned (e.g., /dev/mem) */
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 296667ba712c..668937da27e8 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -469,7 +469,7 @@ static void io_prep_async_work(struct io_kiocb *req)
>
> /* don't serialize this request if the fs doesn't need it */
> if (should_hash && (req->file->f_flags & O_DIRECT) &&
> - (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE))
> + (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE))
> should_hash = false;
> if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
> io_wq_hash_work(&req->work, file_inode(req->file));
> --
> 2.47.3
>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 4/4] xfs: fallback to buffered I/O for direct I/O when stable writes are required
2025-11-10 13:59 ` Christoph Hellwig
@ 2025-11-12 7:13 ` Nirjhar Roy (IBM)
0 siblings, 0 replies; 53+ messages in thread
From: Nirjhar Roy (IBM) @ 2025-11-12 7:13 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Christian Brauner, Jan Kara, Martin K. Petersen,
linux-kernel, linux-xfs, linux-fsdevel, linux-raid, linux-block
On 11/10/25 19:29, Christoph Hellwig wrote:
> On Mon, Nov 10, 2025 at 07:08:05PM +0530, Nirjhar Roy (IBM) wrote:
>> Minor: Let us say that a user opens a file with O_DIRECT on an
>> atomic-write-enabled device (requiring stable writes), and we get this
>> warning once. Now the same or a different user/application opens
>> another file with O_DIRECT on the same atomic-write-enabled device and
>> expects atomic writes to be enabled - but they will not be enabled
>> (since the kernel has fallen back to the uncached buffered write path)
>> without any warning message. Won't that be a bit confusing for the
>> user (of course unless the user is totally aware of the kernel's exact
>> behavior)?
> The kernel with this patch should reject IOCB_ATOMIC writes because
> FMODE_CAN_ATOMIC_WRITE is not set when we need to fall back.
Okay, makes sense.
>
> But anyway, based on the feedback in this thread I plan to revisit the
> approach so that the I/O issuer can declare I/O stable (initially just
> for buffered I/O, but things like nvmet and nfsd might be able to
> guarantee that for direct I/O as well), and then bounce buffer in lower
> layers. This should then also support parallel writes, async I/O and
> atomic writes.
Okay.
--NR
>
--
Nirjhar Roy
Linux Kernel Developer
IBM, Bangalore
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: fall back from direct to buffered I/O when stable writes are required
2025-11-06 12:49 ` hch
@ 2025-11-12 14:18 ` Ming Lei
0 siblings, 0 replies; 53+ messages in thread
From: Ming Lei @ 2025-11-12 14:18 UTC (permalink / raw)
To: hch
Cc: Johannes Thumshirn, Darrick J. Wong, Jan Kara, Keith Busch,
Dave Chinner, Carlos Maiolino, Christian Brauner,
Martin K. Petersen, linux-kernel@vger.kernel.org,
linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org,
linux-raid@vger.kernel.org, linux-block@vger.kernel.org
On Thu, Nov 06, 2025 at 01:49:00PM +0100, hch wrote:
> On Thu, Nov 06, 2025 at 09:50:10AM +0000, Johannes Thumshirn wrote:
> > On 11/5/25 10:44 PM, Darrick J. Wong wrote:
> > > Just out of curiosity -- is qemu itself mutating the buffers that it is
> > > passing down to the lower levels via dio? Or is it a program in the
> > > guest that's mutating buffers that are submitted for dio, which then get
> > > zerocopied all the way down to the hypervisor?
> >
> > If my memory serves me right it is the guest (or at least can be). I
> > remember a bug report on btrfs where a Windows guest had messed up
> > checksums because of modifying inflight I/O.
>
> qemu passes I/O through, so yes it is guest controller. Windows is most
> famous, but the Linux swap code can trigger it easily too.
It looks like the buffer overwrite is actually done by buggy software on the
guest side, so why is it qemu's trouble? Or will qemu's I/O emulator write to
the I/O buffer while guest I/O is in flight?
Thanks,
Ming
^ permalink raw reply [flat|nested] 53+ messages in thread