* [PATCH v2 1/3] pNFS: report clora_changed in the cb_layoutrecall_file tracepoint
2026-06-25 12:05 [PATCH v2 0/3] pNFS/flexfiles: honor clora_changed and report cancelled I/O Benjamin Coddington
@ 2026-06-25 12:05 ` Benjamin Coddington
2026-06-25 12:05 ` [PATCH v2 2/3] pNFS: honor clora_changed when recalling a layout Benjamin Coddington
2026-06-25 12:05 ` [PATCH v2 3/3] NFSv4/flexfiles: report cancelled I/O as a layout error Benjamin Coddington
2 siblings, 0 replies; 4+ messages in thread
From: Benjamin Coddington @ 2026-06-25 12:05 UTC (permalink / raw)
To: Trond Myklebust, Anna Schumaker; +Cc: linux-nfs
A CB_LAYOUTRECALL carries the clora_changed flag (RFC 8881, Section
20.3.3), which tells the client whether the server is changing the
layout (and therefore whether the client should flush modified data to
the storage devices before returning, or stop writing to them and go
through the metadata server). The client decodes this into
cbl_layoutchanged, but it is otherwise invisible.
Give nfs4_cb_layoutrecall_file its own event definition and report
clora_changed, so the intent of a recall can be observed in a trace.
Signed-off-by: Benjamin Coddington <bcodding@hammerspace.com>
---
fs/nfs/callback_proc.c | 2 +-
fs/nfs/nfs4trace.h | 55 +++++++++++++++++++++++++++++++++++++++++-
2 files changed, 55 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 10f2354ba304..f5cf76d36367 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -317,7 +317,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
nfs_iput_and_deactive(ino);
out_noput:
trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino,
- &args->cbl_stateid, -rv);
+ &args->cbl_stateid, args->cbl_layoutchanged, -rv);
return rv;
}
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 1ed677810d9d..e679507eccb6 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1515,7 +1515,60 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
), \
TP_ARGS(clp, fhandle, inode, stateid, error))
DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall);
-DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file);
+
+TRACE_EVENT(nfs4_cb_layoutrecall_file,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const struct nfs_fh *fhandle,
+ const struct inode *inode,
+ const nfs4_stateid *stateid,
+ unsigned int changed,
+ int error
+ ),
+
+ TP_ARGS(clp, fhandle, inode, stateid, changed, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __string(dstaddr, clp ? clp->cl_hostname : "unknown")
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(unsigned int, changed)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error < 0 ? -error : 0;
+ __entry->fhandle = nfs_fhandle_hash(fhandle);
+ if (!IS_ERR_OR_NULL(inode)) {
+ __entry->fileid = inode->i_ino;
+ __entry->dev = inode->i_sb->s_dev;
+ } else {
+ __entry->fileid = 0;
+ __entry->dev = 0;
+ }
+ __assign_str(dstaddr);
+ __entry->stateid_seq =
+ be32_to_cpu(stateid->seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(stateid);
+ __entry->changed = changed;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x dstaddr=%s clora_changed=%u",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __get_str(dstaddr), __entry->changed
+ )
+);
#define show_stateid_type(type) \
__print_symbolic(type, \
--
2.53.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH v2 2/3] pNFS: honor clora_changed when recalling a layout
2026-06-25 12:05 [PATCH v2 0/3] pNFS/flexfiles: honor clora_changed and report cancelled I/O Benjamin Coddington
2026-06-25 12:05 ` [PATCH v2 1/3] pNFS: report clora_changed in the cb_layoutrecall_file tracepoint Benjamin Coddington
@ 2026-06-25 12:05 ` Benjamin Coddington
2026-06-25 12:05 ` [PATCH v2 3/3] NFSv4/flexfiles: report cancelled I/O as a layout error Benjamin Coddington
2 siblings, 0 replies; 4+ messages in thread
From: Benjamin Coddington @ 2026-06-25 12:05 UTC (permalink / raw)
To: Trond Myklebust, Anna Schumaker; +Cc: linux-nfs
When the metadata server recalls a layout with clora_changed FALSE, the
layout is not changing and the client may complete its modified writes to
the storage devices before returning the layout (RFC 8881, Section
20.3.3). Only when clora_changed is TRUE -- the server is restriping, or
a storage device has failed -- should the client stop writing to the
storage devices and redirect through the metadata server.
Since commit b739a5bd9d9f ("NFSv4/flexfiles: Cancel I/O if the layout is
recalled or revoked") the client cancels in-flight I/O on every recall,
regardless of clora_changed. For an unchanged recall this abandons
writes whose data may already have reached the storage device; such a
write can then land after the LAYOUTRETURN, which the server sees as a
write without a layout.
Pass the recall's clora_changed value through
pnfs_mark_matching_lsegs_return() and only cancel in-flight I/O when the
layout is actually changing. When it is not, the existing deferred
return path waits for the in-flight writes to drain before sending the
LAYOUTRETURN. Other callers, which are tearing down or returning the
layout for their own reasons, continue to cancel as before.
Signed-off-by: Benjamin Coddington <bcodding@hammerspace.com>
---
fs/nfs/callback_proc.c | 3 ++-
fs/nfs/pnfs.c | 22 +++++++++++++---------
fs/nfs/pnfs.h | 2 +-
3 files changed, 16 insertions(+), 11 deletions(-)
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index f5cf76d36367..3fb10c8e4271 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -291,7 +291,8 @@ static u32 initiate_file_draining(struct nfs_client *clp,
pnfs_set_layout_stateid(lo, &args->cbl_stateid, NULL, true);
switch (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
&args->cbl_range,
- be32_to_cpu(args->cbl_stateid.seqid))) {
+ be32_to_cpu(args->cbl_stateid.seqid),
+ args->cbl_layoutchanged)) {
case 0:
case -EBUSY:
/* There are layout segments that need to be returned */
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 7715e2bd5871..5becc70af16e 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -432,7 +432,8 @@ bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
goto out;
}
/* Try to update the seqid to the most recent */
- err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0);
+ err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0,
+ true);
if (err != -EBUSY) {
dst->seqid = lo->plh_stateid.seqid;
*dst_range = range;
@@ -486,7 +487,7 @@ static int pnfs_mark_layout_stateid_return(struct pnfs_layout_hdr *lo,
.length = NFS4_MAX_UINT64,
};
- return pnfs_mark_matching_lsegs_return(lo, lseg_list, &range, seq);
+ return pnfs_mark_matching_lsegs_return(lo, lseg_list, &range, seq, true);
}
static int
@@ -524,7 +525,7 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
spin_lock(&inode->i_lock);
pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
- pnfs_mark_matching_lsegs_return(lo, &head, &range, 0);
+ pnfs_mark_matching_lsegs_return(lo, &head, &range, 0, true);
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head);
dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
@@ -1461,7 +1462,7 @@ _pnfs_return_layout(struct inode *ino)
}
valid_layout = pnfs_layout_is_valid(lo);
pnfs_clear_layoutcommit(ino, &tmp_list);
- pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0);
+ pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0, true);
/* Don't send a LAYOUTRETURN if list was initially empty */
@@ -2621,7 +2622,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
.iomode = IOMODE_ANY,
.length = NFS4_MAX_UINT64,
};
- pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0);
+ pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0, true);
goto out_forget;
} else {
/* We have a completely new layout */
@@ -2652,6 +2653,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
* @tmp_list: list header to be used with pnfs_free_lseg_list()
* @return_range: describe layout segment ranges to be returned
* @seq: stateid seqid to match
+ * @cancel_io: if true, cancel in-flight I/O on each segment marked for return
*
* This function is mainly intended for use by layoutrecall. It attempts
* to free the layout segment immediately, or else to mark it for return
@@ -2666,7 +2668,7 @@ int
pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
const struct pnfs_layout_range *return_range,
- u32 seq)
+ u32 seq, bool cancel_io)
{
struct pnfs_layout_segment *lseg, *next;
struct nfs_server *server = NFS_SERVER(lo->plh_inode);
@@ -2692,7 +2694,8 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
continue;
remaining++;
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
- pnfs_lseg_cancel_io(server, lseg);
+ if (cancel_io)
+ pnfs_lseg_cancel_io(server, lseg);
}
if (remaining) {
@@ -2727,7 +2730,8 @@ pnfs_mark_layout_for_return(struct inode *inode,
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
* for how it works.
*/
- if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) {
+ if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0,
+ true) != -EBUSY) {
const struct cred *cred;
nfs4_stateid stateid;
enum pnfs_iomode iomode;
@@ -2842,7 +2846,7 @@ static int pnfs_layout_return_unused_byserver(struct nfs_server *server,
pnfs_get_layout_hdr(lo);
pnfs_set_plh_return_info(lo, range->iomode, 0);
if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
- range, 0) != 0 ||
+ range, 0, true) != 0 ||
!pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode)) {
spin_unlock(&inode->i_lock);
rcu_read_unlock();
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index eb39859c216c..673c2b244978 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -300,7 +300,7 @@ int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
const struct pnfs_layout_range *recall_range,
- u32 seq);
+ u32 seq, bool cancel_io);
int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
struct list_head *lseg_list);
bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args,
--
2.53.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH v2 3/3] NFSv4/flexfiles: report cancelled I/O as a layout error
2026-06-25 12:05 [PATCH v2 0/3] pNFS/flexfiles: honor clora_changed and report cancelled I/O Benjamin Coddington
2026-06-25 12:05 ` [PATCH v2 1/3] pNFS: report clora_changed in the cb_layoutrecall_file tracepoint Benjamin Coddington
2026-06-25 12:05 ` [PATCH v2 2/3] pNFS: honor clora_changed when recalling a layout Benjamin Coddington
@ 2026-06-25 12:05 ` Benjamin Coddington
2 siblings, 0 replies; 4+ messages in thread
From: Benjamin Coddington @ 2026-06-25 12:05 UTC (permalink / raw)
To: Trond Myklebust, Anna Schumaker; +Cc: linux-nfs
When a layout is recalled or revoked the client cancels its in-flight I/O
so the layout can be returned. The metadata server needs to learn that
this I/O to the storage device did not complete, so that it can reconcile
the affected mirror instance (or, if none remains, take other action).
The cancellation completes with -EAGAIN, which ff_layout_io_track_ds_error()
does not recognise: it falls through the switch and records nothing, so no
error is reported to the server.
-EAGAIN is overloaded in the RPC layer, so rather than key the reporting on
it, cancel the I/O with -ECANCELED and map that to NFS4ERR_NXIO in
ff_layout_io_track_ds_error() -- the status the client already reports for
the transport errors that leave an in-flight write incomplete. The
cancelled I/O is then reported to the server via LAYOUTERROR / LAYOUTRETURN.
Unlike a genuine transport error, though, we aborted the I/O ourselves and
have no evidence the device is at fault, so once the error is recorded we
skip marking the device unreachable and forcing a further layout return.
The retry disposition is unchanged from the original -EAGAIN cancellation:
both NFS4ERR_NXIO and -ECANCELED are no-ops in ff_layout_async_handle_error(),
which still resets the I/O to pNFS (or the MDS), so it is re-driven as before.
Signed-off-by: Benjamin Coddington <bcodding@hammerspace.com>
---
fs/nfs/flexfilelayout/flexfilelayout.c | 23 ++++++++++++++++++++++-
1 file changed, 22 insertions(+), 1 deletion(-)
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index c4aa995026f6..c8072f333236 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1543,6 +1543,17 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
case -EACCES:
*op_status = status = NFS4ERR_ACCESS;
break;
+ case -ECANCELED:
+ /*
+ * In-flight I/O we cancelled to return a recalled or
+ * revoked layout. Report it as a failure to reach the
+ * device (NFS4ERR_NXIO), like the transport errors
+ * above, so the server can reconcile the affected mirror
+ * instance. We aborted the I/O ourselves rather than
+ * observe the device fail, so don't condemn it below.
+ */
+ *op_status = status = NFS4ERR_NXIO;
+ break;
default:
return;
}
@@ -1553,6 +1564,15 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
mirror, dss_id, offset, length, status, opnum,
nfs_io_gfp_mask());
+ /*
+ * I/O we cancelled ourselves to return a recalled or revoked layout
+ * is reported above so the server can reconcile the mirror, but we
+ * have no evidence the device is at fault: don't mark it unreachable
+ * or force a return.
+ */
+ if (error == -ECANCELED)
+ goto out;
+
switch (status) {
case NFS4ERR_DELAY:
case NFS4ERR_GRACE:
@@ -1572,6 +1592,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
lseg);
}
+out:
dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
}
@@ -2462,7 +2483,7 @@ static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg)
clnt = ds_clp->cl_rpcclient;
if (!clnt)
continue;
- if (!rpc_cancel_tasks(clnt, -EAGAIN,
+ if (!rpc_cancel_tasks(clnt, -ECANCELED,
ff_layout_match_io, lseg))
continue;
rpc_clnt_disconnect(clnt);
--
2.53.0
^ permalink raw reply related [flat|nested] 4+ messages in thread