From: Ojaswin Mujoo <ojaswin@linux.ibm.com>
To: Christian Brauner <brauner@kernel.org>,
djwong@kernel.org, ritesh.list@gmail.com,
john.g.garry@oracle.com, tytso@mit.edu, willy@infradead.org,
dchinner@redhat.com, hch@lst.de
Cc: linux-xfs@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org,
linux-mm@kvack.org, jack@suse.cz, nilay@linux.ibm.com,
martin.petersen@oracle.com, rostedt@goodmis.org, axboe@kernel.dk,
linux-block@vger.kernel.org, linux-trace-kernel@vger.kernel.org
Subject: [RFC PATCH 4/8] iomap: buffered atomic write support
Date: Wed, 12 Nov 2025 16:36:07 +0530
Message-ID: <8229fb9bcd2504b80caf0e763b1984d7ee6178b0.1762945505.git.ojaswin@linux.ibm.com>
In-Reply-To: <cover.1762945505.git.ojaswin@linux.ibm.com>
Add special handling of the PG_atomic flag to the iomap buffered write
path. To flag an iomap iter for an atomic write, set IOMAP_ATOMIC. For a
folio associated with a write that has IOMAP_ATOMIC set, set PG_atomic;
otherwise, when IOMAP_ATOMIC is unset, clear PG_atomic.
This means that an "atomic" folio which has not yet been written back
loses its "atomicity" if it is overwritten non-atomically. So if
userspace issues a write with RWF_ATOMIC set and another write with
RWF_ATOMIC unset, that folio is not written back atomically. Such a
scenario is considered a userspace usage error.
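For illustration, a hedged userspace sketch of this misuse (fd, and a
page-aligned, block-sized buf/blksz, are assumed to be set up already;
pwritev2() needs _GNU_SOURCE and <sys/uio.h>, and RWF_ATOMIC comes from
recent uapi headers):

	struct iovec iov = { .iov_base = buf, .iov_len = blksz };

	/* marks the folio backing this range as atomic */
	pwritev2(fd, &iov, 1, 0, RWF_ATOMIC);

	/*
	 * Overlapping plain write: the folio loses its atomicity, so its
	 * eventual writeback is no longer guaranteed to be atomic.
	 */
	pwritev2(fd, &iov, 1, 0, 0);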
To ensure that a buffered atomic write is written back atomically when
the write syscall returns, RWF_SYNC or similar needs to be used (in
conjunction with RWF_ATOMIC).
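Reusing the sketch above, a hedged example of a buffered atomic write
whose atomic writeback has completed by the time the syscall returns:

	/*
	 * RWF_SYNC forces the dirty atomic folio to be written back (as
	 * a single REQ_ATOMIC bio) before pwritev2() returns.
	 */
	pwritev2(fd, &iov, 1, 0, RWF_ATOMIC | RWF_SYNC);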
Only a single BIO should ever be submitted for an atomic write. So
modify iomap_add_to_ioend() to ensure that we don't try to write back an
atomic folio as part of a larger mixed-atomicity BIO.
In iomap_alloc_ioend(), handle an atomic write by setting REQ_ATOMIC for
the allocated BIO. When a folio is written back, clear PG_atomic again,
as it is no longer required.
Currently, RWF_ATOMIC with buffered IO is limited to writes of a single
block, and has two main restrictions:
1. Only blocksize == pagesize is supported
2. Writes where the user buffer is not aligned to PAGE_SIZE are not
supported
For more details, refer to the comment in generic_atomic_write_valid().
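As an illustration of both restrictions, a hedged sketch of a user
buffer allocation for a single-block buffered atomic write (the helper
name is made up; posix_memalign()/getpagesize() are standard libc):

	#include <stdlib.h>
	#include <unistd.h>

	static void *alloc_atomic_write_buf(size_t *lenp)
	{
		/* restriction 1: fs block size == page size is assumed */
		size_t blksz = (size_t)getpagesize();
		void *buf;

		/*
		 * Restriction 2: a PAGE_SIZE-aligned buffer of one block
		 * cannot straddle a page boundary, so the copy-in cannot
		 * be cut short by a fault on a second user page.
		 */
		if (posix_memalign(&buf, blksz, blksz) != 0)
			return NULL;

		*lenp = blksz;
		return buf;
	}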
Co-developed-by: John Garry <john.g.garry@oracle.com>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
---
fs/iomap/buffered-io.c | 48 ++++++++++++++++++++++++++++++++++++------
fs/iomap/ioend.c | 18 ++++++++++++----
fs/read_write.c | 34 ++++++++++++++++++++++++++++--
include/linux/iomap.h | 2 ++
4 files changed, 89 insertions(+), 13 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index f099c086cbe8..947c76c2688a 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -850,11 +850,13 @@ static int iomap_write_begin(struct iomap_iter *iter,
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos;
- u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
+ u64 orig_len = min_t(u64, SIZE_MAX, iomap_length(iter));
+ u64 len;
struct folio *folio;
int status = 0;
+ bool is_atomic = iter->flags & IOMAP_ATOMIC;
- len = min_not_zero(len, *plen);
+ len = min_not_zero(orig_len, *plen);
*foliop = NULL;
*plen = 0;
@@ -922,6 +924,11 @@ static int iomap_write_begin(struct iomap_iter *iter,
if (unlikely(status))
goto out_unlock;
+ if (is_atomic && (len != orig_len)) {
+ status = -EINVAL;
+ goto out_unlock;
+ }
+
*foliop = folio;
*plen = len;
return 0;
@@ -931,7 +938,7 @@ static int iomap_write_begin(struct iomap_iter *iter,
return status;
}
-static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
+static bool __iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
size_t copied, struct folio *folio)
{
flush_dcache_folio(folio);
@@ -951,7 +958,27 @@ static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
return false;
iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
- filemap_dirty_folio(inode->i_mapping, folio);
+ filemap_dirty_folio(iter->inode->i_mapping, folio);
+
+ /*
+ * Policy: a non-atomic write over a previously atomic range makes the
+ * range non-atomic. Handle this here.
+ */
+ if (iter->flags & IOMAP_ATOMIC) {
+ if (copied < len) {
+ /*
+ * A short atomic write is only okay as long as nothing
+ * is written at all. If we have a partial write, there
+ * is a bug in our code.
+ */
+ WARN_ON_ONCE(copied != 0);
+
+ return false;
+ }
+ folio_set_atomic(folio);
+ } else
+ folio_clear_atomic(folio);
+
return true;
}
@@ -997,7 +1024,7 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
return bh_written == copied;
}
- return __iomap_write_end(iter->inode, pos, len, copied, folio);
+ return __iomap_write_end(iter, pos, len, copied, folio);
}
static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i,
@@ -1124,6 +1151,8 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
iter.flags |= IOMAP_NOWAIT;
if (iocb->ki_flags & IOCB_DONTCACHE)
iter.flags |= IOMAP_DONTCACHE;
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ iter.flags |= IOMAP_ATOMIC;
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.status = iomap_write_iter(&iter, i, write_ops);
@@ -1588,6 +1617,7 @@ static int iomap_folio_mkwrite_iter(struct iomap_iter *iter,
} else {
WARN_ON_ONCE(!folio_test_uptodate(folio));
folio_mark_dirty(folio);
+ folio_clear_atomic(folio);
}
return iomap_iter_advance(iter, length);
@@ -1642,8 +1672,10 @@ void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
- if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
+ if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) {
+ folio_clear_atomic(folio);
folio_end_writeback(folio);
+ }
}
EXPORT_SYMBOL_GPL(iomap_finish_folio_write);
@@ -1807,8 +1839,10 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
if (atomic_dec_and_test(&ifs->write_bytes_pending))
folio_end_writeback(folio);
} else {
- if (!wb_pending)
+ if (!wb_pending) {
+ folio_clear_atomic(folio);
folio_end_writeback(folio);
+ }
}
mapping_set_error(inode->i_mapping, error);
return error;
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index b49fa75eab26..c129a695ceca 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -98,13 +98,17 @@ int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error)
EXPORT_SYMBOL_GPL(iomap_ioend_writeback_submit);
static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
- loff_t pos, u16 ioend_flags)
+ loff_t pos, u16 ioend_flags,
+ bool atomic)
{
struct bio *bio;
+ blk_opf_t opf = REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc);
+
+ if (atomic)
+ opf |= REQ_ATOMIC;
bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
- REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc),
- GFP_NOFS, &iomap_ioend_bioset);
+ opf, GFP_NOFS, &iomap_ioend_bioset);
bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
bio->bi_write_hint = wpc->inode->i_write_hint;
wbc_init_bio(wpc->wbc, bio);
@@ -122,6 +126,9 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
(ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
return false;
+ if ((ioend_flags & IOMAP_IOEND_ATOMIC) ||
+ (ioend->io_flags & IOMAP_IOEND_ATOMIC))
+ return false;
if (pos != ioend->io_offset + ioend->io_size)
return false;
if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
@@ -156,6 +163,7 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
unsigned int ioend_flags = 0;
unsigned int map_len = min_t(u64, dirty_len,
wpc->iomap.offset + wpc->iomap.length - pos);
+ bool is_atomic = folio_test_atomic(folio);
int error;
trace_iomap_add_to_ioend(wpc->inode, pos, dirty_len, &wpc->iomap);
@@ -180,6 +188,8 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
ioend_flags |= IOMAP_IOEND_DONTCACHE;
if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
ioend_flags |= IOMAP_IOEND_BOUNDARY;
+ if (is_atomic)
+ ioend_flags |= IOMAP_IOEND_ATOMIC;
if (!ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
new_ioend:
@@ -188,7 +198,7 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
if (error)
return error;
}
- wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags);
+ wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags, is_atomic);
}
if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
diff --git a/fs/read_write.c b/fs/read_write.c
index 833bae068770..37546aa40f0d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1802,6 +1802,8 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
{
+ struct super_block *sb = iocb->ki_filp->f_mapping->host->i_sb;
+
size_t len = iov_iter_count(iter);
if (!iter_is_ubuf(iter))
@@ -1813,8 +1815,36 @@ int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
if (!IS_ALIGNED(iocb->ki_pos, len))
return -EINVAL;
- if (!(iocb->ki_flags & IOCB_DIRECT))
- return -EOPNOTSUPP;
+ if (!(iocb->ki_flags & IOCB_DIRECT)) {
+ /* Some restrictions for buffered IO */
+
+ /*
+ * We only support block size == page size
+ * right now. This is to avoid the following:
+ * 1. 4kb block atomic write marks the complete 64kb folio as
+ * atomic.
+ * 2. Other writes dirty the whole 64kb folio.
+ * 3. Writeback sees the whole folio dirty and atomic and tries
+ * to send a 64kb atomic write, which might exceed the
+ * allowed size and fail.
+ *
+ * Once we support sub-page atomic write tracking, we can remove
+ * this restriction.
+ */
+ if (sb->s_blocksize != PAGE_SIZE)
+ return -EOPNOTSUPP;
+
+ /*
+ * If the user buffer of an atomic write crosses a page boundary,
+ * a short write is possible, e.g. if one user page could not be
+ * faulted in or got reclaimed before the copy operation. For now,
+ * disallow such a scenario by requiring the user buffer to be
+ * page aligned.
+ */
+ if (!PAGE_ALIGNED(iov_iter_alignment(iter)))
+ return -EOPNOTSUPP;
+
+ }
return 0;
}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 8b1ac08c7474..693f3e5ad03c 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -390,6 +390,8 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
#define IOMAP_IOEND_DIRECT (1U << 3)
/* is DONTCACHE I/O */
#define IOMAP_IOEND_DONTCACHE (1U << 4)
+/* is atomic I/O. These are never merged */
+#define IOMAP_IOEND_ATOMIC (1U << 5)
/*
* Flags that if set on either ioend prevent the merge of two ioends.
--
2.51.0