* [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing
@ 2026-01-27 0:50 Dai Ngo
2026-01-27 17:11 ` Chuck Lever
2026-01-27 17:54 ` Jeff Layton
0 siblings, 2 replies; 12+ messages in thread
From: Dai Ngo @ 2026-01-27 0:50 UTC (permalink / raw)
To: chuck.lever, jlayton, neil, okorniev, tom, hch, alex.aring, viro,
brauner, jack
Cc: linux-fsdevel, linux-nfs
When a layout conflict triggers a recall, enforcing a timeout is
necessary to prevent an excessive number of nfsd threads from being
blocked in __break_lease, ensuring the server continues servicing
incoming requests efficiently.
This patch introduces a new function to lease_manager_operations:
lm_breaker_timedout: Invoked when a lease recall times out and is
about to be disposed of. This function enables the lease manager
to inform the caller whether the file_lease should remain on the
flc_list or be disposed of.
For the NFSD lease manager, this function now handles layout recall
timeouts. If the layout type supports fencing and the client has not
been fenced, a fence operation is triggered to prevent the client
from accessing the block device.
While the fencing operation is in progress, the conflicting file_lease
remains on the flc_list until fencing is complete. This guarantees
that no other clients can access the file, and the client with
exclusive access is properly blocked before disposal.
Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
---
Documentation/filesystems/locking.rst | 2 +
fs/locks.c | 10 +++-
fs/nfsd/blocklayout.c | 38 ++++++-------
fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++--
fs/nfsd/nfs4state.c | 1 +
fs/nfsd/state.h | 6 ++
include/linux/filelock.h | 1 +
7 files changed, 110 insertions(+), 27 deletions(-)
v2:
. Update Subject line to include fencing operation.
. Allow conflicting lease to remain on flc_list until fencing
is complete.
. Use system worker to perform fencing operation asynchronously.
. Use nfs4_stid.sc_count to ensure layout stateid remains
valid before starting the fencing operation, nfs4_stid.sc_count
is released after fencing operation is complete.
. Rework nfsd4_scsi_fence_client to:
. wait for fencing to complete before exiting.
. wait for any in-progress fencing to complete before
checking the NFSD_MDS_PR_FENCED flag.
. Remove lm_need_to_retry from lease_manager_operations.
v3:
. correct locking requirement in locking.rst.
. add max retry count to fencing operation.
. add missing nfs4_put_stid in nfsd4_layout_fence_worker.
. remove special-casing of FL_LAYOUT in lease_modify.
. remove lease_want_dispose.
. move lm_breaker_timedout call to time_out_leases.
v4:
. only increment ls_fence_retry_cnt after successfully
scheduling new work in nfsd4_layout_lm_breaker_timedout.
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 04c7691e50e0..a339491f02e4 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -403,6 +403,7 @@ prototypes::
bool (*lm_breaker_owns_lease)(struct file_lock *);
bool (*lm_lock_expirable)(struct file_lock *);
void (*lm_expire_lock)(void);
+ void (*lm_breaker_timedout)(struct file_lease *);
locking rules:
@@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no
lm_lock_expirable yes no no
lm_expire_lock no no yes
lm_open_conflict yes no no
+lm_breaker_timedout yes no no
====================== ============= ================= =========
buffer_head
diff --git a/fs/locks.c b/fs/locks.c
index 46f229f740c8..1b63aa704598 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
{
struct file_lock_context *ctx = inode->i_flctx;
struct file_lease *fl, *tmp;
+ bool remove = true;
lockdep_assert_held(&ctx->flc_lock);
@@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
trace_time_out_leases(inode, fl);
if (past_time(fl->fl_downgrade_time))
lease_modify(fl, F_RDLCK, dispose);
- if (past_time(fl->fl_break_time))
- lease_modify(fl, F_UNLCK, dispose);
+
+ if (past_time(fl->fl_break_time)) {
+ if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout)
+ remove = fl->fl_lmops->lm_breaker_timedout(fl);
+ if (remove)
+ lease_modify(fl, F_UNLCK, dispose);
+ }
}
}
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 7ba9e2dd0875..69d3889df302 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp,
return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
}
+/*
+ * Perform the fence operation to prevent the client from accessing the
+ * block device. If a fence operation is already in progress, wait for
+ * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the
+ * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set,
+ * update the layout stateid by setting the ls_fenced flag to indicate
+ * that the client has been fenced.
+ */
static void
nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
{
@@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev;
int status;
- if (nfsd4_scsi_fence_set(clp, bdev->bd_dev))
+ mutex_lock(&clp->cl_fence_mutex);
+ if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) {
+ ls->ls_fenced = true;
+ mutex_unlock(&clp->cl_fence_mutex);
+ nfs4_put_stid(&ls->ls_stid);
return;
+ }
status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
nfsd4_scsi_pr_key(clp),
PR_EXCLUSIVE_ACCESS_REG_ONLY, true);
- /*
- * Reset to allow retry only when the command could not have
- * reached the device. Negative status means a local error
- * (e.g., -ENOMEM) prevented the command from being sent.
- * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and
- * PR_STS_RETRY_PATH_FAILURE indicate transport path failures
- * before device delivery.
- *
- * For all other errors, the command may have reached the device
- * and the preempt may have succeeded. Avoid resetting, since
- * retrying a successful preempt returns PR_STS_IOERR or
- * PR_STS_RESERVATION_CONFLICT, which would cause an infinite
- * retry loop.
- */
- if (status < 0 ||
- status == PR_STS_PATH_FAILED ||
- status == PR_STS_PATH_FAST_FAILED ||
- status == PR_STS_RETRY_PATH_FAILURE)
+ if (status)
nfsd4_scsi_fence_clear(clp, bdev->bd_dev);
+ else
+ ls->ls_fenced = true;
+ mutex_unlock(&clp->cl_fence_mutex);
+ nfs4_put_stid(&ls->ls_stid);
trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status);
}
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ad7af8cfcf1f..1c498f3cd059 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
return 0;
}
+static void
+nfsd4_layout_fence_worker(struct work_struct *work)
+{
+ struct nfsd_file *nf;
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct nfs4_layout_stateid *ls = container_of(dwork,
+ struct nfs4_layout_stateid, ls_fence_work);
+ u32 type;
+
+ rcu_read_lock();
+ nf = nfsd_file_get(ls->ls_file);
+ rcu_read_unlock();
+ if (!nf) {
+ nfs4_put_stid(&ls->ls_stid);
+ return;
+ }
+
+ type = ls->ls_layout_type;
+ if (nfsd4_layout_ops[type]->fence_client)
+ nfsd4_layout_ops[type]->fence_client(ls, nf);
+ nfsd_file_put(nf);
+}
+
static struct nfs4_layout_stateid *
nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
struct nfs4_stid *parent, u32 layout_type)
@@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
list_add(&ls->ls_perfile, &fp->fi_lo_states);
spin_unlock(&fp->fi_lock);
+ INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker);
+ ls->ls_fenced = false;
+ ls->ls_fence_retry_cnt = 0;
+
trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid);
return ls;
}
@@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
rcu_read_unlock();
if (fl) {
ops = nfsd4_layout_ops[ls->ls_layout_type];
- if (ops->fence_client)
+ if (ops->fence_client) {
+ refcount_inc(&ls->ls_stid.sc_count);
ops->fence_client(ls, fl);
- else
+ } else
nfsd4_cb_layout_fail(ls, fl);
nfsd_file_put(fl);
}
@@ -747,11 +775,9 @@ static bool
nfsd4_layout_lm_break(struct file_lease *fl)
{
/*
- * We don't want the locks code to timeout the lease for us;
- * we'll remove it ourself if a layout isn't returned
- * in time:
+ * Enforce break lease timeout to prevent NFSD
+ * thread from hanging in __break_lease.
*/
- fl->fl_break_time = 0;
nfsd4_recall_file_layout(fl->c.flc_owner);
return false;
}
@@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg)
return 0;
}
+/**
+ * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out.
+ * If the layout type supports a fence operation, schedule a worker to
+ * fence the client from accessing the block device.
+ *
+ * @fl: the file_lease whose recall has timed out
+ *
+ * Return true if the file lease should be disposed of by the caller;
+ * otherwise, return false.
+ */
+static bool
+nfsd4_layout_lm_breaker_timedout(struct file_lease *fl)
+{
+ struct nfs4_layout_stateid *ls = fl->c.flc_owner;
+ bool ret;
+
+ if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client)
+ return true;
+ if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY)
+ return true;
+
+ if (work_busy(&ls->ls_fence_work.work))
+ return false;
+ /* Schedule work to do the fence operation */
+ ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
+ if (!ret) {
+ /*
+ * If there is no pending work, mod_delayed_work queues
+ * new task. While fencing is in progress, a reference
+ * count is added to the layout stateid to ensure its
+ * validity. This reference count is released once fencing
+ * has been completed.
+ */
+ refcount_inc(&ls->ls_stid.sc_count);
+ ++ls->ls_fence_retry_cnt;
+ return true;
+ }
+ return false;
+}
+
static const struct lease_manager_operations nfsd4_layouts_lm_ops = {
.lm_break = nfsd4_layout_lm_break,
.lm_change = nfsd4_layout_lm_change,
.lm_open_conflict = nfsd4_layout_lm_open_conflict,
+ .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout,
};
int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 583c13b5aaf3..a57fa3318362 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name,
#endif
#ifdef CONFIG_NFSD_SCSILAYOUT
xa_init(&clp->cl_dev_fences);
+ mutex_init(&clp->cl_fence_mutex);
#endif
INIT_LIST_HEAD(&clp->async_copies);
spin_lock_init(&clp->async_lock);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 713f55ef6554..57e54dfb406c 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -529,6 +529,7 @@ struct nfs4_client {
time64_t cl_ra_time;
#ifdef CONFIG_NFSD_SCSILAYOUT
struct xarray cl_dev_fences;
+ struct mutex cl_fence_mutex;
#endif
};
@@ -738,8 +739,13 @@ struct nfs4_layout_stateid {
stateid_t ls_recall_sid;
bool ls_recalled;
struct mutex ls_mutex;
+ struct delayed_work ls_fence_work;
+ bool ls_fenced;
+ int ls_fence_retry_cnt;
};
+#define LO_MAX_FENCE_RETRY 5
+
static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
{
return container_of(s, struct nfs4_layout_stateid, ls_stid);
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index 2f5e5588ee07..13b9c9f04589 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -50,6 +50,7 @@ struct lease_manager_operations {
void (*lm_setup)(struct file_lease *, void **);
bool (*lm_breaker_owns_lease)(struct file_lease *);
int (*lm_open_conflict)(struct file *, int);
+ bool (*lm_breaker_timedout)(struct file_lease *fl);
};
struct lock_manager {
--
2.47.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 0:50 [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing Dai Ngo @ 2026-01-27 17:11 ` Chuck Lever 2026-01-27 18:14 ` Jeff Layton 2026-01-27 20:36 ` Dai Ngo 2026-01-27 17:54 ` Jeff Layton 1 sibling, 2 replies; 12+ messages in thread From: Chuck Lever @ 2026-01-27 17:11 UTC (permalink / raw) To: Dai Ngo, Chuck Lever, Jeff Layton, NeilBrown, Olga Kornievskaia, Tom Talpey, Christoph Hellwig, Alexander Aring, Alexander Viro, Christian Brauner, Jan Kara Cc: linux-fsdevel, linux-nfs On Mon, Jan 26, 2026, at 7:50 PM, Dai Ngo wrote: > When a layout conflict triggers a recall, enforcing a timeout is > necessary to prevent excessive nfsd threads from being blocked in > __break_lease ensuring the server continues servicing incoming > requests efficiently. > > This patch introduces a new function to lease_manager_operations: > > lm_breaker_timedout: Invoked when a lease recall times out and is > about to be disposed of. This function enables the lease manager > to inform the caller whether the file_lease should remain on the > flc_list or be disposed of. > > For the NFSD lease manager, this function now handles layout recall > timeouts. If the layout type supports fencing and the client has not > been fenced, a fence operation is triggered to prevent the client > from accessing the block device. > > While the fencing operation is in progress, the conflicting file_lease > remains on the flc_list until fencing is complete. This guarantees > that no other clients can access the file, and the client with > exclusive access is properly blocked before disposal. 
> > Signed-off-by: Dai Ngo <dai.ngo@oracle.com> > --- > Documentation/filesystems/locking.rst | 2 + > fs/locks.c | 10 +++- > fs/nfsd/blocklayout.c | 38 ++++++------- > fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- > fs/nfsd/nfs4state.c | 1 + > fs/nfsd/state.h | 6 ++ > include/linux/filelock.h | 1 + > 7 files changed, 110 insertions(+), 27 deletions(-) > > v2: > . Update Subject line to include fencing operation. > . Allow conflicting lease to remain on flc_list until fencing > is complete. > . Use system worker to perform fencing operation asynchronously. > . Use nfs4_stid.sc_count to ensure layout stateid remains > valid before starting the fencing operation, nfs4_stid.sc_count > is released after fencing operation is complete. > . Rework nfsd4_scsi_fence_client to: > . wait until fencing to complete before exiting. > . wait until fencing in progress to complete before > checking the NFSD_MDS_PR_FENCED flag. > . Remove lm_need_to_retry from lease_manager_operations. > v3: > . correct locking requirement in locking.rst. > . add max retry count to fencing operation. > . add missing nfs4_put_stid in nfsd4_layout_fence_worker. > . remove special-casing of FL_LAYOUT in lease_modify. > . remove lease_want_dispose. > . move lm_breaker_timedout call to time_out_leases. > v4: > . only increment ls_fence_retry_cnt after successfully > schedule new work in nfsd4_layout_lm_breaker_timedout. 
> > diff --git a/Documentation/filesystems/locking.rst > b/Documentation/filesystems/locking.rst > index 04c7691e50e0..a339491f02e4 100644 > --- a/Documentation/filesystems/locking.rst > +++ b/Documentation/filesystems/locking.rst > @@ -403,6 +403,7 @@ prototypes:: > bool (*lm_breaker_owns_lease)(struct file_lock *); > bool (*lm_lock_expirable)(struct file_lock *); > void (*lm_expire_lock)(void); > + void (*lm_breaker_timedout)(struct file_lease *); > > locking rules: > > @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no > lm_lock_expirable yes no no > lm_expire_lock no no yes > lm_open_conflict yes no no > +lm_breaker_timedout yes no no > ====================== ============= ================= ========= > > buffer_head > diff --git a/fs/locks.c b/fs/locks.c > index 46f229f740c8..1b63aa704598 100644 > --- a/fs/locks.c > +++ b/fs/locks.c > @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, > struct list_head *dispose) > { > struct file_lock_context *ctx = inode->i_flctx; > struct file_lease *fl, *tmp; > + bool remove = true; > > lockdep_assert_held(&ctx->flc_lock); > > @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, > struct list_head *dispose) > trace_time_out_leases(inode, fl); > if (past_time(fl->fl_downgrade_time)) > lease_modify(fl, F_RDLCK, dispose); > - if (past_time(fl->fl_break_time)) > - lease_modify(fl, F_UNLCK, dispose); > + > + if (past_time(fl->fl_break_time)) { > + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) > + remove = fl->fl_lmops->lm_breaker_timedout(fl); > + if (remove) > + lease_modify(fl, F_UNLCK, dispose); > + } > } > } > > diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c > index 7ba9e2dd0875..69d3889df302 100644 > --- a/fs/nfsd/blocklayout.c > +++ b/fs/nfsd/blocklayout.c > @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, > struct svc_rqst *rqstp, > return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); > } > > +/* > + * Perform the fence 
operation to prevent the client from accessing the > + * block device. If a fence operation is already in progress, wait for > + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the > + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, > + * update the layout stateid by setting the ls_fenced flag to indicate > + * that the client has been fenced. > + */ > static void > nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct > nfsd_file *file) > { > @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct > nfs4_layout_stateid *ls, struct nfsd_file *file) > struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; > int status; > > - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) > + mutex_lock(&clp->cl_fence_mutex); > + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { > + ls->ls_fenced = true; > + mutex_unlock(&clp->cl_fence_mutex); > + nfs4_put_stid(&ls->ls_stid); > return; > + } > > status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, > nfsd4_scsi_pr_key(clp), > PR_EXCLUSIVE_ACCESS_REG_ONLY, true); > - /* > - * Reset to allow retry only when the command could not have > - * reached the device. Negative status means a local error > - * (e.g., -ENOMEM) prevented the command from being sent. > - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and > - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures > - * before device delivery. > - * > - * For all other errors, the command may have reached the device > - * and the preempt may have succeeded. Avoid resetting, since > - * retrying a successful preempt returns PR_STS_IOERR or > - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite > - * retry loop. 
> - */ > - if (status < 0 || > - status == PR_STS_PATH_FAILED || > - status == PR_STS_PATH_FAST_FAILED || > - status == PR_STS_RETRY_PATH_FAILURE) > + if (status) > nfsd4_scsi_fence_clear(clp, bdev->bd_dev); > + else > + ls->ls_fenced = true; The removed code distinguishes retryable errors (such as path failures) from permanent errors (PR_STS_IOERR, PR_STS_RESERVATION_CONFLICT), but the new code clears the fence on any error, potentially causing infinite retry loops as warned in the removed comment. Either the comment and error distinction should remain, or some rationale for removing the distinction between temporary and permanent errors should be added to the commit message. > + mutex_unlock(&clp->cl_fence_mutex); > + nfs4_put_stid(&ls->ls_stid); > > trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); > } > diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c > index ad7af8cfcf1f..1c498f3cd059 100644 > --- a/fs/nfsd/nfs4layouts.c > +++ b/fs/nfsd/nfs4layouts.c > @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) > return 0; > } > > +static void > +nfsd4_layout_fence_worker(struct work_struct *work) > +{ > + struct nfsd_file *nf; > + struct delayed_work *dwork = to_delayed_work(work); > + struct nfs4_layout_stateid *ls = container_of(dwork, > + struct nfs4_layout_stateid, ls_fence_work); > + u32 type; > + > + rcu_read_lock(); > + nf = nfsd_file_get(ls->ls_file); > + rcu_read_unlock(); > + if (!nf) { > + nfs4_put_stid(&ls->ls_stid); > + return; > + } > + > + type = ls->ls_layout_type; > + if (nfsd4_layout_ops[type]->fence_client) > + nfsd4_layout_ops[type]->fence_client(ls, nf); > + nfsd_file_put(nf); > +} > + > static struct nfs4_layout_stateid * > nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, > struct nfs4_stid *parent, u32 layout_type) > @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct > nfsd4_compound_state *cstate, > list_add(&ls->ls_perfile, &fp->fi_lo_states); > spin_unlock(&fp->fi_lock); > > + 
INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); > + ls->ls_fenced = false; > + ls->ls_fence_retry_cnt = 0; > + > trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); > return ls; > } > @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, > struct rpc_task *task) > rcu_read_unlock(); > if (fl) { > ops = nfsd4_layout_ops[ls->ls_layout_type]; > - if (ops->fence_client) > + if (ops->fence_client) { > + refcount_inc(&ls->ls_stid.sc_count); > ops->fence_client(ls, fl); > - else > + } else > nfsd4_cb_layout_fail(ls, fl); > nfsd_file_put(fl); > } > @@ -747,11 +775,9 @@ static bool > nfsd4_layout_lm_break(struct file_lease *fl) > { > /* > - * We don't want the locks code to timeout the lease for us; > - * we'll remove it ourself if a layout isn't returned > - * in time: > + * Enforce break lease timeout to prevent NFSD > + * thread from hanging in __break_lease. > */ > - fl->fl_break_time = 0; > nfsd4_recall_file_layout(fl->c.flc_owner); > return false; > } > @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, > int arg) > return 0; > } > > +/** > + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. Nit: Move the description of @fl here. > + * If the layout type supports a fence operation, schedule a worker to > + * fence the client from accessing the block device. > + * > + * @fl: file to check > + * > + * Return true if the file lease should be disposed of by the caller; > + * otherwise, return false. > + */ > +static bool > +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) > +{ > + struct nfs4_layout_stateid *ls = fl->c.flc_owner; > + bool ret; > + > + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) > + return true; > + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) > + return true; Since these two variables are accessed while the fence mutex is held in other places, they need similar protection here to prevent races. 
You explained that the mutex cannot be taken here, so some other form of serialization will be needed to protect these fields. If I'm reading this correctly, when the fence fails after all 5 retries, the client retains block device access but server believes recall succeeded ? > + > + if (work_busy(&ls->ls_fence_work.work)) > + return false; > + /* Schedule work to do the fence operation */ > + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); > + if (!ret) { > + /* > + * If there is no pending work, mod_delayed_work queues > + * new task. While fencing is in progress, a reference > + * count is added to the layout stateid to ensure its > + * validity. This reference count is released once fencing > + * has been completed. > + */ > + refcount_inc(&ls->ls_stid.sc_count); > + ++ls->ls_fence_retry_cnt; > + return true; > + } Incrementing after the mod_delayed_work() call is racy. Instead: refcount_inc(&ls->ls_stid.sc_count); ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); if (ret) { refcount_dec(&ls->ls_stid.sc_count); .... 
} > + return false; > +} > + > static const struct lease_manager_operations nfsd4_layouts_lm_ops = { > .lm_break = nfsd4_layout_lm_break, > .lm_change = nfsd4_layout_lm_change, > .lm_open_conflict = nfsd4_layout_lm_open_conflict, > + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, > }; > > int > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > index 583c13b5aaf3..a57fa3318362 100644 > --- a/fs/nfsd/nfs4state.c > +++ b/fs/nfsd/nfs4state.c > @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct > xdr_netobj name, > #endif > #ifdef CONFIG_NFSD_SCSILAYOUT > xa_init(&clp->cl_dev_fences); > + mutex_init(&clp->cl_fence_mutex); > #endif > INIT_LIST_HEAD(&clp->async_copies); > spin_lock_init(&clp->async_lock); > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > index 713f55ef6554..57e54dfb406c 100644 > --- a/fs/nfsd/state.h > +++ b/fs/nfsd/state.h > @@ -529,6 +529,7 @@ struct nfs4_client { > time64_t cl_ra_time; > #ifdef CONFIG_NFSD_SCSILAYOUT > struct xarray cl_dev_fences; > + struct mutex cl_fence_mutex; > #endif > }; > > @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { > stateid_t ls_recall_sid; > bool ls_recalled; > struct mutex ls_mutex; > + struct delayed_work ls_fence_work; Still missing cancel_delayed_work_sync() when freeing the layout stateid. > + bool ls_fenced; > + int ls_fence_retry_cnt; > }; > > +#define LO_MAX_FENCE_RETRY 5 The value of 5 needs some justification here in a comment. Actually, 5 might be too low, but I can't really tell. 
> + > static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) > { > return container_of(s, struct nfs4_layout_stateid, ls_stid); > diff --git a/include/linux/filelock.h b/include/linux/filelock.h > index 2f5e5588ee07..13b9c9f04589 100644 > --- a/include/linux/filelock.h > +++ b/include/linux/filelock.h > @@ -50,6 +50,7 @@ struct lease_manager_operations { > void (*lm_setup)(struct file_lease *, void **); > bool (*lm_breaker_owns_lease)(struct file_lease *); > int (*lm_open_conflict)(struct file *, int); > + bool (*lm_breaker_timedout)(struct file_lease *fl); > }; > > struct lock_manager { > -- > 2.47.3 -- Chuck Lever ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 17:11 ` Chuck Lever @ 2026-01-27 18:14 ` Jeff Layton 2026-01-27 20:38 ` Dai Ngo 2026-01-27 20:36 ` Dai Ngo 1 sibling, 1 reply; 12+ messages in thread From: Jeff Layton @ 2026-01-27 18:14 UTC (permalink / raw) To: Chuck Lever, Dai Ngo, Chuck Lever, NeilBrown, Olga Kornievskaia, Tom Talpey, Christoph Hellwig, Alexander Aring, Alexander Viro, Christian Brauner, Jan Kara Cc: linux-fsdevel, linux-nfs On Tue, 2026-01-27 at 12:11 -0500, Chuck Lever wrote: > > > + > > + if (work_busy(&ls->ls_fence_work.work)) > > + return false; > > + /* Schedule work to do the fence operation */ > > + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); > > + if (!ret) { > > + /* > > + * If there is no pending work, mod_delayed_work queues > > + * new task. While fencing is in progress, a reference > > + * count is added to the layout stateid to ensure its > > + * validity. This reference count is released once fencing > > + * has been completed. > > + */ > > + refcount_inc(&ls->ls_stid.sc_count); > > + ++ls->ls_fence_retry_cnt; > > + return true; > > + } > > Incrementing after the mod_delayed_work() call is racy. Instead: > > refcount_inc(&ls->ls_stid.sc_count); > ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); > if (ret) { > refcount_dec(&ls->ls_stid.sc_count); > .... > } > Is that safe? That's done in nfsd_break_one_deleg() for similar reasons, but there is a comment over that function explaining why it's OK. We'll need to validate that the lease is always released before the sc_count goes to 0 here too. At first glance, it doesn't look like that is the case. -- Jeff Layton <jlayton@kernel.org> ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 18:14 ` Jeff Layton @ 2026-01-27 20:38 ` Dai Ngo 0 siblings, 0 replies; 12+ messages in thread From: Dai Ngo @ 2026-01-27 20:38 UTC (permalink / raw) To: Jeff Layton, Chuck Lever, Chuck Lever, NeilBrown, Olga Kornievskaia, Tom Talpey, Christoph Hellwig, Alexander Aring, Alexander Viro, Christian Brauner, Jan Kara Cc: linux-fsdevel, linux-nfs On 1/27/26 10:14 AM, Jeff Layton wrote: > On Tue, 2026-01-27 at 12:11 -0500, Chuck Lever wrote: >>> + >>> + if (work_busy(&ls->ls_fence_work.work)) >>> + return false; >>> + /* Schedule work to do the fence operation */ >>> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >>> + if (!ret) { >>> + /* >>> + * If there is no pending work, mod_delayed_work queues >>> + * new task. While fencing is in progress, a reference >>> + * count is added to the layout stateid to ensure its >>> + * validity. This reference count is released once fencing >>> + * has been completed. >>> + */ >>> + refcount_inc(&ls->ls_stid.sc_count); >>> + ++ls->ls_fence_retry_cnt; >>> + return true; >>> + } >> Incrementing after the mod_delayed_work() call is racy. Instead: >> >> refcount_inc(&ls->ls_stid.sc_count); >> ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >> if (ret) { >> refcount_dec(&ls->ls_stid.sc_count); >> .... >> } >> > Is that safe? That's done in nfsd_break_one_deleg() for similar > reasons, but there is a comment over that function explaining why it's > OK. > > We'll need to validate that the lease is always released before the > sc_count goes to 0 here too. At first glance, it doesn't look like that > is the case. No, it's not safe. I have to think of other ways to make sure the layout stateid is valid when the fence worker runs. -Dai > > ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 17:11 ` Chuck Lever 2026-01-27 18:14 ` Jeff Layton @ 2026-01-27 20:36 ` Dai Ngo 2026-01-27 21:53 ` Chuck Lever 1 sibling, 1 reply; 12+ messages in thread From: Dai Ngo @ 2026-01-27 20:36 UTC (permalink / raw) To: Chuck Lever, Chuck Lever, Jeff Layton, NeilBrown, Olga Kornievskaia, Tom Talpey, Christoph Hellwig, Alexander Aring, Alexander Viro, Christian Brauner, Jan Kara Cc: linux-fsdevel, linux-nfs On 1/27/26 9:11 AM, Chuck Lever wrote: > > On Mon, Jan 26, 2026, at 7:50 PM, Dai Ngo wrote: >> When a layout conflict triggers a recall, enforcing a timeout is >> necessary to prevent excessive nfsd threads from being blocked in >> __break_lease ensuring the server continues servicing incoming >> requests efficiently. >> >> This patch introduces a new function to lease_manager_operations: >> >> lm_breaker_timedout: Invoked when a lease recall times out and is >> about to be disposed of. This function enables the lease manager >> to inform the caller whether the file_lease should remain on the >> flc_list or be disposed of. >> >> For the NFSD lease manager, this function now handles layout recall >> timeouts. If the layout type supports fencing and the client has not >> been fenced, a fence operation is triggered to prevent the client >> from accessing the block device. >> >> While the fencing operation is in progress, the conflicting file_lease >> remains on the flc_list until fencing is complete. This guarantees >> that no other clients can access the file, and the client with >> exclusive access is properly blocked before disposal. 
>> >> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >> --- >> Documentation/filesystems/locking.rst | 2 + >> fs/locks.c | 10 +++- >> fs/nfsd/blocklayout.c | 38 ++++++------- >> fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- >> fs/nfsd/nfs4state.c | 1 + >> fs/nfsd/state.h | 6 ++ >> include/linux/filelock.h | 1 + >> 7 files changed, 110 insertions(+), 27 deletions(-) >> >> v2: >> . Update Subject line to include fencing operation. >> . Allow conflicting lease to remain on flc_list until fencing >> is complete. >> . Use system worker to perform fencing operation asynchronously. >> . Use nfs4_stid.sc_count to ensure layout stateid remains >> valid before starting the fencing operation, nfs4_stid.sc_count >> is released after fencing operation is complete. >> . Rework nfsd4_scsi_fence_client to: >> . wait until fencing to complete before exiting. >> . wait until fencing in progress to complete before >> checking the NFSD_MDS_PR_FENCED flag. >> . Remove lm_need_to_retry from lease_manager_operations. >> v3: >> . correct locking requirement in locking.rst. >> . add max retry count to fencing operation. >> . add missing nfs4_put_stid in nfsd4_layout_fence_worker. >> . remove special-casing of FL_LAYOUT in lease_modify. >> . remove lease_want_dispose. >> . move lm_breaker_timedout call to time_out_leases. >> v4: >> . only increment ls_fence_retry_cnt after successfully >> schedule new work in nfsd4_layout_lm_breaker_timedout. 
>> >> diff --git a/Documentation/filesystems/locking.rst >> b/Documentation/filesystems/locking.rst >> index 04c7691e50e0..a339491f02e4 100644 >> --- a/Documentation/filesystems/locking.rst >> +++ b/Documentation/filesystems/locking.rst >> @@ -403,6 +403,7 @@ prototypes:: >> bool (*lm_breaker_owns_lease)(struct file_lock *); >> bool (*lm_lock_expirable)(struct file_lock *); >> void (*lm_expire_lock)(void); >> + void (*lm_breaker_timedout)(struct file_lease *); >> >> locking rules: >> >> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no >> lm_lock_expirable yes no no >> lm_expire_lock no no yes >> lm_open_conflict yes no no >> +lm_breaker_timedout yes no no >> ====================== ============= ================= ========= >> >> buffer_head >> diff --git a/fs/locks.c b/fs/locks.c >> index 46f229f740c8..1b63aa704598 100644 >> --- a/fs/locks.c >> +++ b/fs/locks.c >> @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, >> struct list_head *dispose) >> { >> struct file_lock_context *ctx = inode->i_flctx; >> struct file_lease *fl, *tmp; >> + bool remove = true; >> >> lockdep_assert_held(&ctx->flc_lock); >> >> @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, >> struct list_head *dispose) >> trace_time_out_leases(inode, fl); >> if (past_time(fl->fl_downgrade_time)) >> lease_modify(fl, F_RDLCK, dispose); >> - if (past_time(fl->fl_break_time)) >> - lease_modify(fl, F_UNLCK, dispose); >> + >> + if (past_time(fl->fl_break_time)) { >> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) >> + remove = fl->fl_lmops->lm_breaker_timedout(fl); >> + if (remove) >> + lease_modify(fl, F_UNLCK, dispose); >> + } >> } >> } >> >> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c >> index 7ba9e2dd0875..69d3889df302 100644 >> --- a/fs/nfsd/blocklayout.c >> +++ b/fs/nfsd/blocklayout.c >> @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, >> struct svc_rqst *rqstp, >> return nfsd4_block_commit_blocks(inode, lcp, 
iomaps, nr_iomaps); >> } >> >> +/* >> + * Perform the fence operation to prevent the client from accessing the >> + * block device. If a fence operation is already in progress, wait for >> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the >> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, >> + * update the layout stateid by setting the ls_fenced flag to indicate >> + * that the client has been fenced. >> + */ >> static void >> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct >> nfsd_file *file) >> { >> @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct >> nfs4_layout_stateid *ls, struct nfsd_file *file) >> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; >> int status; >> >> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) >> + mutex_lock(&clp->cl_fence_mutex); >> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { >> + ls->ls_fenced = true; >> + mutex_unlock(&clp->cl_fence_mutex); >> + nfs4_put_stid(&ls->ls_stid); >> return; >> + } >> >> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, >> nfsd4_scsi_pr_key(clp), >> PR_EXCLUSIVE_ACCESS_REG_ONLY, true); >> - /* >> - * Reset to allow retry only when the command could not have >> - * reached the device. Negative status means a local error >> - * (e.g., -ENOMEM) prevented the command from being sent. >> - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and >> - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures >> - * before device delivery. >> - * >> - * For all other errors, the command may have reached the device >> - * and the preempt may have succeeded. Avoid resetting, since >> - * retrying a successful preempt returns PR_STS_IOERR or >> - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite >> - * retry loop. 
>> - */ >> - if (status < 0 || >> - status == PR_STS_PATH_FAILED || >> - status == PR_STS_PATH_FAST_FAILED || >> - status == PR_STS_RETRY_PATH_FAILURE) >> + if (status) >> nfsd4_scsi_fence_clear(clp, bdev->bd_dev); >> + else >> + ls->ls_fenced = true; > The removed code distinguishes retryable errors (such as path failures) > from permanent errors (PR_STS_IOERR, PR_STS_RESERVATION_CONFLICT), but > the new code clears the fence on any error, In the v2 and v3 patches, I left the comment and the code that distinguishes retryable errors intact. And you commented this: > In nfsd4_scsi_fence_client(), when a device error occurs (e.g., > PR_STS_IOERR), ls->ls_fenced is set even though the client may > still have storage access. So I'm not sure how to treat permanent errors now. Please advise. > potentially causing infinite > retry loops as warned in the removed comment. Infinite loops cannot happen due to the maximum retry count. > > Either the comment and error distinction should remain, As mentioned above, I can put the comment and code that distinguishes temporary error codes back. Then what do we need to do for the other permanent errors: PR_STS_IOERR, PR_STS_RESERVATION_CONFLICT? > or some rationale > for removing the distinction between temporary and permanent errors should > be added to the commit message.
> > >> + mutex_unlock(&clp->cl_fence_mutex); >> + nfs4_put_stid(&ls->ls_stid); >> >> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); >> } >> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c >> index ad7af8cfcf1f..1c498f3cd059 100644 >> --- a/fs/nfsd/nfs4layouts.c >> +++ b/fs/nfsd/nfs4layouts.c >> @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) >> return 0; >> } >> >> +static void >> +nfsd4_layout_fence_worker(struct work_struct *work) >> +{ >> + struct nfsd_file *nf; >> + struct delayed_work *dwork = to_delayed_work(work); >> + struct nfs4_layout_stateid *ls = container_of(dwork, >> + struct nfs4_layout_stateid, ls_fence_work); >> + u32 type; >> + >> + rcu_read_lock(); >> + nf = nfsd_file_get(ls->ls_file); >> + rcu_read_unlock(); >> + if (!nf) { >> + nfs4_put_stid(&ls->ls_stid); >> + return; >> + } >> + >> + type = ls->ls_layout_type; >> + if (nfsd4_layout_ops[type]->fence_client) >> + nfsd4_layout_ops[type]->fence_client(ls, nf); >> + nfsd_file_put(nf); >> +} >> + >> static struct nfs4_layout_stateid * >> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >> struct nfs4_stid *parent, u32 layout_type) >> @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct >> nfsd4_compound_state *cstate, >> list_add(&ls->ls_perfile, &fp->fi_lo_states); >> spin_unlock(&fp->fi_lock); >> >> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); >> + ls->ls_fenced = false; >> + ls->ls_fence_retry_cnt = 0; >> + >> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); >> return ls; >> } >> @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, >> struct rpc_task *task) >> rcu_read_unlock(); >> if (fl) { >> ops = nfsd4_layout_ops[ls->ls_layout_type]; >> - if (ops->fence_client) >> + if (ops->fence_client) { >> + refcount_inc(&ls->ls_stid.sc_count); >> ops->fence_client(ls, fl); >> - else >> + } else >> nfsd4_cb_layout_fail(ls, fl); >> nfsd_file_put(fl); >> } >> @@ -747,11 +775,9 @@ static bool 
>> nfsd4_layout_lm_break(struct file_lease *fl) >> { >> /* >> - * We don't want the locks code to timeout the lease for us; >> - * we'll remove it ourself if a layout isn't returned >> - * in time: >> + * Enforce break lease timeout to prevent NFSD >> + * thread from hanging in __break_lease. >> */ >> - fl->fl_break_time = 0; >> nfsd4_recall_file_layout(fl->c.flc_owner); >> return false; >> } >> @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, >> int arg) >> return 0; >> } >> >> +/** >> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. > Nit: Move the description of @fl here. Fix in v5. > >> + * If the layout type supports a fence operation, schedule a worker to >> + * fence the client from accessing the block device. >> + * >> + * @fl: file to check >> + * >> + * Return true if the file lease should be disposed of by the caller; >> + * otherwise, return false. >> + */ >> +static bool >> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) >> +{ >> + struct nfs4_layout_stateid *ls = fl->c.flc_owner; >> + bool ret; >> + >> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) >> + return true; >> + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) >> + return true; > Since these two variables are accessed while the fence mutex > is held in other places, they need similar protection here > to prevent races. You explained that the mutex cannot be > taken here, so some other form of serialization will be > needed to protect these fields. I do not think we need protection here to access ls_fenced and ls_fence_retry_cnt. For ls_fenced, it's a read operation from nfsd4_layout_lm_breaker_timedout. The value is either true or false, it does not matter. If it's true then the device was fenced. If it's not true then nfsd4_layout_lm_breaker_timedout will be called again from time_out_leases on next check. ls_fence_retry_cnt is incremented and checked only by nfsd4_layout_lm_breaker_timedout. 
> > If I'm reading this correctly, when the fence fails after all > 5 retries, the client retains block device access but server > believes recall succeeded ? What should we do if the maximum retries is reached, perhaps a warning message in the system log for the admin to take action? > > >> + >> + if (work_busy(&ls->ls_fence_work.work)) >> + return false; >> + /* Schedule work to do the fence operation */ >> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >> + if (!ret) { >> + /* >> + * If there is no pending work, mod_delayed_work queues >> + * new task. While fencing is in progress, a reference >> + * count is added to the layout stateid to ensure its >> + * validity. This reference count is released once fencing >> + * has been completed. >> + */ >> + refcount_inc(&ls->ls_stid.sc_count); >> + ++ls->ls_fence_retry_cnt; >> + return true; >> + } > Incrementing after the mod_delayed_work() call is racy. Instead: > > refcount_inc(&ls->ls_stid.sc_count); > ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); > if (ret) { > refcount_dec(&ls->ls_stid.sc_count); > .... > } Yes, it's racy, I thought about this. But we cannot just do a refcount_dec here; we need to do nfs4_put_stid. But we cannot do nfs4_put_stid here since nfsd4_layout_lm_breaker_timedout runs under the spin_lock flc_lock and nfs4_put_stid might cause the flc_lock to be acquired again from generic_delete_lease if this is the last reference count on the stid. We might need to add the reference count when the file_lease is inserted on the flc_list and decrement when it's removed from the flc_list. I need to think about this more. 
> > >> + return false; >> +} >> + >> static const struct lease_manager_operations nfsd4_layouts_lm_ops = { >> .lm_break = nfsd4_layout_lm_break, >> .lm_change = nfsd4_layout_lm_change, >> .lm_open_conflict = nfsd4_layout_lm_open_conflict, >> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, >> }; >> >> int >> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >> index 583c13b5aaf3..a57fa3318362 100644 >> --- a/fs/nfsd/nfs4state.c >> +++ b/fs/nfsd/nfs4state.c >> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct >> xdr_netobj name, >> #endif >> #ifdef CONFIG_NFSD_SCSILAYOUT >> xa_init(&clp->cl_dev_fences); >> + mutex_init(&clp->cl_fence_mutex); >> #endif >> INIT_LIST_HEAD(&clp->async_copies); >> spin_lock_init(&clp->async_lock); >> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h >> index 713f55ef6554..57e54dfb406c 100644 >> --- a/fs/nfsd/state.h >> +++ b/fs/nfsd/state.h >> @@ -529,6 +529,7 @@ struct nfs4_client { >> time64_t cl_ra_time; >> #ifdef CONFIG_NFSD_SCSILAYOUT >> struct xarray cl_dev_fences; >> + struct mutex cl_fence_mutex; >> #endif >> }; >> >> @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { >> stateid_t ls_recall_sid; >> bool ls_recalled; >> struct mutex ls_mutex; >> + struct delayed_work ls_fence_work; > Still missing cancel_delayed_work_sync() when freeing the > layout stateid. I don't think the layout stateid can be freed while a fence worker is scheduled, due to the reference count added to the stid. > > >> + bool ls_fenced; >> + int ls_fence_retry_cnt; >> }; >> >> +#define LO_MAX_FENCE_RETRY 5 > The value of 5 needs some justification here in a comment. > Actually, 5 might be too low, but I can't really tell. At the minimum, each retry happens after 1 sec, and it can be more depending on what entry is at the front of the flc_list. So if retrying for 5 seconds (minimum) is too low, then would 20 retries be sufficient? 
-Dai > > >> + >> static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) >> { >> return container_of(s, struct nfs4_layout_stateid, ls_stid); >> diff --git a/include/linux/filelock.h b/include/linux/filelock.h >> index 2f5e5588ee07..13b9c9f04589 100644 >> --- a/include/linux/filelock.h >> +++ b/include/linux/filelock.h >> @@ -50,6 +50,7 @@ struct lease_manager_operations { >> void (*lm_setup)(struct file_lease *, void **); >> bool (*lm_breaker_owns_lease)(struct file_lease *); >> int (*lm_open_conflict)(struct file *, int); >> + bool (*lm_breaker_timedout)(struct file_lease *fl); >> }; >> >> struct lock_manager { >> -- >> 2.47.3 ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 20:36 ` Dai Ngo @ 2026-01-27 21:53 ` Chuck Lever 2026-01-28 0:50 ` Dai Ngo 0 siblings, 1 reply; 12+ messages in thread From: Chuck Lever @ 2026-01-27 21:53 UTC (permalink / raw) To: Dai Ngo, Chuck Lever, Jeff Layton, NeilBrown, Olga Kornievskaia, Tom Talpey, Christoph Hellwig, Alexander Aring, Alexander Viro, Christian Brauner, Jan Kara Cc: linux-fsdevel, linux-nfs On 1/27/26 3:36 PM, Dai Ngo wrote: > > On 1/27/26 9:11 AM, Chuck Lever wrote: >> >> On Mon, Jan 26, 2026, at 7:50 PM, Dai Ngo wrote: >>> When a layout conflict triggers a recall, enforcing a timeout is >>> necessary to prevent excessive nfsd threads from being blocked in >>> __break_lease ensuring the server continues servicing incoming >>> requests efficiently. >>> >>> This patch introduces a new function to lease_manager_operations: >>> >>> lm_breaker_timedout: Invoked when a lease recall times out and is >>> about to be disposed of. This function enables the lease manager >>> to inform the caller whether the file_lease should remain on the >>> flc_list or be disposed of. >>> >>> For the NFSD lease manager, this function now handles layout recall >>> timeouts. If the layout type supports fencing and the client has not >>> been fenced, a fence operation is triggered to prevent the client >>> from accessing the block device. >>> >>> While the fencing operation is in progress, the conflicting file_lease >>> remains on the flc_list until fencing is complete. This guarantees >>> that no other clients can access the file, and the client with >>> exclusive access is properly blocked before disposal. 
>>> >>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >>> --- >>> Documentation/filesystems/locking.rst | 2 + >>> fs/locks.c | 10 +++- >>> fs/nfsd/blocklayout.c | 38 ++++++------- >>> fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- >>> fs/nfsd/nfs4state.c | 1 + >>> fs/nfsd/state.h | 6 ++ >>> include/linux/filelock.h | 1 + >>> 7 files changed, 110 insertions(+), 27 deletions(-) >>> >>> v2: >>> . Update Subject line to include fencing operation. >>> . Allow conflicting lease to remain on flc_list until fencing >>> is complete. >>> . Use system worker to perform fencing operation asynchronously. >>> . Use nfs4_stid.sc_count to ensure layout stateid remains >>> valid before starting the fencing operation, nfs4_stid.sc_count >>> is released after fencing operation is complete. >>> . Rework nfsd4_scsi_fence_client to: >>> . wait until fencing to complete before exiting. >>> . wait until fencing in progress to complete before >>> checking the NFSD_MDS_PR_FENCED flag. >>> . Remove lm_need_to_retry from lease_manager_operations. >>> v3: >>> . correct locking requirement in locking.rst. >>> . add max retry count to fencing operation. >>> . add missing nfs4_put_stid in nfsd4_layout_fence_worker. >>> . remove special-casing of FL_LAYOUT in lease_modify. >>> . remove lease_want_dispose. >>> . move lm_breaker_timedout call to time_out_leases. >>> v4: >>> . only increment ls_fence_retry_cnt after successfully >>> schedule new work in nfsd4_layout_lm_breaker_timedout. 
>>> >>> diff --git a/Documentation/filesystems/locking.rst >>> b/Documentation/filesystems/locking.rst >>> index 04c7691e50e0..a339491f02e4 100644 >>> --- a/Documentation/filesystems/locking.rst >>> +++ b/Documentation/filesystems/locking.rst >>> @@ -403,6 +403,7 @@ prototypes:: >>> bool (*lm_breaker_owns_lease)(struct file_lock *); >>> bool (*lm_lock_expirable)(struct file_lock *); >>> void (*lm_expire_lock)(void); >>> + void (*lm_breaker_timedout)(struct file_lease *); >>> >>> locking rules: >>> >>> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes >>> no no >>> lm_lock_expirable yes no no >>> lm_expire_lock no no yes >>> lm_open_conflict yes no no >>> +lm_breaker_timedout yes no no >>> ====================== ============= ================= >>> ========= >>> >>> buffer_head >>> diff --git a/fs/locks.c b/fs/locks.c >>> index 46f229f740c8..1b63aa704598 100644 >>> --- a/fs/locks.c >>> +++ b/fs/locks.c >>> @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, >>> struct list_head *dispose) >>> { >>> struct file_lock_context *ctx = inode->i_flctx; >>> struct file_lease *fl, *tmp; >>> + bool remove = true; >>> >>> lockdep_assert_held(&ctx->flc_lock); >>> >>> @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, >>> struct list_head *dispose) >>> trace_time_out_leases(inode, fl); >>> if (past_time(fl->fl_downgrade_time)) >>> lease_modify(fl, F_RDLCK, dispose); >>> - if (past_time(fl->fl_break_time)) >>> - lease_modify(fl, F_UNLCK, dispose); >>> + >>> + if (past_time(fl->fl_break_time)) { >>> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) >>> + remove = fl->fl_lmops->lm_breaker_timedout(fl); >>> + if (remove) >>> + lease_modify(fl, F_UNLCK, dispose); >>> + } >>> } >>> } >>> >>> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c >>> index 7ba9e2dd0875..69d3889df302 100644 >>> --- a/fs/nfsd/blocklayout.c >>> +++ b/fs/nfsd/blocklayout.c >>> @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, >>> struct 
svc_rqst *rqstp, >>> return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); >>> } >>> >>> +/* >>> + * Perform the fence operation to prevent the client from accessing the >>> + * block device. If a fence operation is already in progress, wait for >>> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the >>> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, >>> + * update the layout stateid by setting the ls_fenced flag to indicate >>> + * that the client has been fenced. >>> + */ >>> static void >>> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct >>> nfsd_file *file) >>> { >>> @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct >>> nfs4_layout_stateid *ls, struct nfsd_file *file) >>> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb- >>> >s_bdev; >>> int status; >>> >>> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) >>> + mutex_lock(&clp->cl_fence_mutex); >>> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { >>> + ls->ls_fenced = true; >>> + mutex_unlock(&clp->cl_fence_mutex); >>> + nfs4_put_stid(&ls->ls_stid); >>> return; >>> + } >>> >>> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, >>> NFSD_MDS_PR_KEY, >>> nfsd4_scsi_pr_key(clp), >>> PR_EXCLUSIVE_ACCESS_REG_ONLY, true); >>> - /* >>> - * Reset to allow retry only when the command could not have >>> - * reached the device. Negative status means a local error >>> - * (e.g., -ENOMEM) prevented the command from being sent. >>> - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and >>> - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures >>> - * before device delivery. >>> - * >>> - * For all other errors, the command may have reached the device >>> - * and the preempt may have succeeded. Avoid resetting, since >>> - * retrying a successful preempt returns PR_STS_IOERR or >>> - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite >>> - * retry loop. 
>>> - */ >>> - if (status < 0 || >>> - status == PR_STS_PATH_FAILED || >>> - status == PR_STS_PATH_FAST_FAILED || >>> - status == PR_STS_RETRY_PATH_FAILURE) >>> + if (status) >>> nfsd4_scsi_fence_clear(clp, bdev->bd_dev); >>> + else >>> + ls->ls_fenced = true; >> The removed code distinguishes retryable errors (such as path failures) >> from permanent errors (PR_STS_IOERR, PR_STS_RESERVATION_CONFLICT), but >> the new code clears the fence on any error, > > In v2 and v3 patch, I left the comment and the code distinguishes retryable > errors intact. And you commented this: > >> In nfsd4_scsi_fence_client(), when a device error occurs (e.g., >> PR_STS_IOERR), ls->ls_fenced is set even though the client may >> still have storage access. > > So I'm not sure how to treat permanent errors now. Please advise. The comment is important and explains some of the subtleties. I think that needs to be preserved even if we decide to set ls_fenced in every error case. If the correct behavior now is to leave ls_fenced clear in all error cases, then something in the patch needs to provide the rationale for making this code change and for helping us remember the design when it needs to be altered or fixed later. The rationale is different for permanent errors and temporary errors, I would think. >> potentially causing infinite >> retry loops as warned in the removed comment. > > Infinite loops can not happen due to the maximum retry count. Fine, but with the new retry escape clause, as mentioned below, the server and client can be left in opposite states, it looks like: One believes the fence succeeded, and one believes it still has full device access. Am I incorrect about that? This is the bane of every timeout-based scheme I know about, and why we have the "hard" mount option. >> Either the comment and error distinction should remain, > > As mentioned above, I can put the comment and code that distinguishes > temporarily error codes back. 
Then what do we need to do for other > permanent errors; PR_STS_IOERR, PR_STS_RESERVATION_CONFLICT? > >> or some rationale >> for removing the distinction between temporary and permanent errors >> should >> be added to the commit message. >> >> >>> + mutex_unlock(&clp->cl_fence_mutex); >>> + nfs4_put_stid(&ls->ls_stid); >>> >>> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); >>> } >>> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c >>> index ad7af8cfcf1f..1c498f3cd059 100644 >>> --- a/fs/nfsd/nfs4layouts.c >>> +++ b/fs/nfsd/nfs4layouts.c >>> @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid >>> *ls) >>> return 0; >>> } >>> >>> +static void >>> +nfsd4_layout_fence_worker(struct work_struct *work) >>> +{ >>> + struct nfsd_file *nf; >>> + struct delayed_work *dwork = to_delayed_work(work); >>> + struct nfs4_layout_stateid *ls = container_of(dwork, >>> + struct nfs4_layout_stateid, ls_fence_work); >>> + u32 type; >>> + >>> + rcu_read_lock(); >>> + nf = nfsd_file_get(ls->ls_file); >>> + rcu_read_unlock(); >>> + if (!nf) { >>> + nfs4_put_stid(&ls->ls_stid); >>> + return; >>> + } >>> + >>> + type = ls->ls_layout_type; >>> + if (nfsd4_layout_ops[type]->fence_client) >>> + nfsd4_layout_ops[type]->fence_client(ls, nf); >>> + nfsd_file_put(nf); >>> +} >>> + >>> static struct nfs4_layout_stateid * >>> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >>> struct nfs4_stid *parent, u32 layout_type) >>> @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct >>> nfsd4_compound_state *cstate, >>> list_add(&ls->ls_perfile, &fp->fi_lo_states); >>> spin_unlock(&fp->fi_lock); >>> >>> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); >>> + ls->ls_fenced = false; >>> + ls->ls_fence_retry_cnt = 0; >>> + >>> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); >>> return ls; >>> } >>> @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, >>> struct rpc_task *task) >>> rcu_read_unlock(); >>> if 
(fl) { >>> ops = nfsd4_layout_ops[ls->ls_layout_type]; >>> - if (ops->fence_client) >>> + if (ops->fence_client) { >>> + refcount_inc(&ls->ls_stid.sc_count); >>> ops->fence_client(ls, fl); >>> - else >>> + } else >>> nfsd4_cb_layout_fail(ls, fl); >>> nfsd_file_put(fl); >>> } >>> @@ -747,11 +775,9 @@ static bool >>> nfsd4_layout_lm_break(struct file_lease *fl) >>> { >>> /* >>> - * We don't want the locks code to timeout the lease for us; >>> - * we'll remove it ourself if a layout isn't returned >>> - * in time: >>> + * Enforce break lease timeout to prevent NFSD >>> + * thread from hanging in __break_lease. >>> */ >>> - fl->fl_break_time = 0; >>> nfsd4_recall_file_layout(fl->c.flc_owner); >>> return false; >>> } >>> @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, >>> int arg) >>> return 0; >>> } >>> >>> +/** >>> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. >> Nit: Move the description of @fl here. > > Fix in v5. > >> >>> + * If the layout type supports a fence operation, schedule a worker to >>> + * fence the client from accessing the block device. >>> + * >>> + * @fl: file to check >>> + * >>> + * Return true if the file lease should be disposed of by the caller; >>> + * otherwise, return false. >>> + */ >>> +static bool >>> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) >>> +{ >>> + struct nfs4_layout_stateid *ls = fl->c.flc_owner; >>> + bool ret; >>> + >>> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) >>> + return true; >>> + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) >>> + return true; >> Since these two variables are accessed while the fence mutex >> is held in other places, they need similar protection here >> to prevent races. You explained that the mutex cannot be >> taken here, so some other form of serialization will be >> needed to protect these fields. > > I do not think we need protection here to access ls_fenced and > ls_fence_retry_cnt. 
For ls_fenced, it's a read operation from > nfsd4_layout_lm_breaker_timedout. The value is either true or > false, it does not matter. If it's true then the device was > fenced. If it's not true then nfsd4_layout_lm_breaker_timedout > will be called again from time_out_leases on next check. Fair enough for ls_fence_retry_cnt, but are you sure there isn't a TOCTOU issue here for ls_fenced ? Isn't there a race possible with the server receiving the client's callback while lm_breaker_timeout is running? > ls_fence_retry_cnt is incremented and checked only by > nfsd4_layout_lm_breaker_timedout. Then the kdoc comment could be clearer that only one copy of the callback is allowed to run concurrently. Something like "Caller serializes". >> If I'm reading this correctly, when the fence fails after all >> 5 retries, the client retains block device access but server >> believes recall succeeded ? > > What should we do if the maximum retries is reached, perhaps a > warning message in the system log for the admin to take action? A warning doesn't seem like it would prevent data corruption. (I'm not against a warning, in and of itself... it just does not seem to me to be sufficient). I would really like this mechanism to be more data-safe. It needs to deal properly with network partitions and other problems. I don't have any magic answer yet, sorry! >>> + >>> + if (work_busy(&ls->ls_fence_work.work)) >>> + return false; >>> + /* Schedule work to do the fence operation */ >>> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >>> + if (!ret) { >>> + /* >>> + * If there is no pending work, mod_delayed_work queues >>> + * new task. While fencing is in progress, a reference >>> + * count is added to the layout stateid to ensure its >>> + * validity. This reference count is released once fencing >>> + * has been completed. 
>>> + */ >>> + refcount_inc(&ls->ls_stid.sc_count); >>> + ++ls->ls_fence_retry_cnt; >>> + return true; >>> + } >> Incrementing after the mod_delayed_work() call is racy. Instead: >> >> refcount_inc(&ls->ls_stid.sc_count); >> ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >> if (ret) { >> refcount_dec(&ls->ls_stid.sc_count); >> .... >> } > > Yes, it's racy, I thought about this. But we can not just do a > refcount_dec here, we need to do nfs4_put_stid. But we can not > do nfs4_put_stid here since nfsd4_layout_lm_breaker_timedout runs > under the spin_lock flc_lock and nfs4_put_stid might causes the > flc_lock to be acquired again from generic_delete_lease if this > is the last reference count on stid. > > We might need to add the reference count when the file_lease is > inserted on the flc_list and decrement when it's removed from the > flc_list. I need to think about this more. If I may, why is flc_lock being held? That seems to be the root of several implementation problems. 
>>> + return false; >>> +} >>> + >>> static const struct lease_manager_operations nfsd4_layouts_lm_ops = { >>> .lm_break = nfsd4_layout_lm_break, >>> .lm_change = nfsd4_layout_lm_change, >>> .lm_open_conflict = nfsd4_layout_lm_open_conflict, >>> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, >>> }; >>> >>> int >>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >>> index 583c13b5aaf3..a57fa3318362 100644 >>> --- a/fs/nfsd/nfs4state.c >>> +++ b/fs/nfsd/nfs4state.c >>> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct >>> xdr_netobj name, >>> #endif >>> #ifdef CONFIG_NFSD_SCSILAYOUT >>> xa_init(&clp->cl_dev_fences); >>> + mutex_init(&clp->cl_fence_mutex); >>> #endif >>> INIT_LIST_HEAD(&clp->async_copies); >>> spin_lock_init(&clp->async_lock); >>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h >>> index 713f55ef6554..57e54dfb406c 100644 >>> --- a/fs/nfsd/state.h >>> +++ b/fs/nfsd/state.h >>> @@ -529,6 +529,7 @@ struct nfs4_client { >>> time64_t cl_ra_time; >>> #ifdef CONFIG_NFSD_SCSILAYOUT >>> struct xarray cl_dev_fences; >>> + struct mutex cl_fence_mutex; >>> #endif >>> }; >>> >>> @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { >>> stateid_t ls_recall_sid; >>> bool ls_recalled; >>> struct mutex ls_mutex; >>> + struct delayed_work ls_fence_work; >> Still missing cancel_delayed_work_sync() when freeing the >> layout stateid. > > I don't think the layout stateid can be freed while a fence > worker was scheduled due to the added reference count to the > stid. Then add a comment to that effect, or if possible, some kind of assertion, in nfsd4_free_layout_stateid(). >>> + bool ls_fenced; >>> + int ls_fence_retry_cnt; >>> }; >>> >>> +#define LO_MAX_FENCE_RETRY 5 >> The value of 5 needs some justification here in a comment. >> Actually, 5 might be too low, but I can't really tell. > > At the minimal each retry happens after 1 sec, and it can be > more depending what entry is at the front of flc_list. 
So if > retry for 5 seconds (minimum) is too low then is 20 retries is > sufficient? As I said above: here's why we have the "hard" mount option for NFS. It might be impossible to choose a safe maximum retry count and still guarantee that the server and clients will eventually agree on the device reservation state. In that case we will need to consider some other mechanism for detecting and preventing infinite PR retry loops. Maybe the retry maximum could depend on the lease expiry time? Just remember, I'm thinking about this and learning as I go along, just like you are. I know I'm possibly contradicting myself here. -- Chuck Lever ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 21:53 ` Chuck Lever @ 2026-01-28 0:50 ` Dai Ngo 0 siblings, 0 replies; 12+ messages in thread From: Dai Ngo @ 2026-01-28 0:50 UTC (permalink / raw) To: Chuck Lever, Chuck Lever, Jeff Layton, NeilBrown, Olga Kornievskaia, Tom Talpey, Christoph Hellwig, Alexander Aring, Alexander Viro, Christian Brauner, Jan Kara Cc: linux-fsdevel, linux-nfs On 1/27/26 1:53 PM, Chuck Lever wrote: > On 1/27/26 3:36 PM, Dai Ngo wrote: >> On 1/27/26 9:11 AM, Chuck Lever wrote: >>> On Mon, Jan 26, 2026, at 7:50 PM, Dai Ngo wrote: >>>> When a layout conflict triggers a recall, enforcing a timeout is >>>> necessary to prevent excessive nfsd threads from being blocked in >>>> __break_lease ensuring the server continues servicing incoming >>>> requests efficiently. >>>> >>>> This patch introduces a new function to lease_manager_operations: >>>> >>>> lm_breaker_timedout: Invoked when a lease recall times out and is >>>> about to be disposed of. This function enables the lease manager >>>> to inform the caller whether the file_lease should remain on the >>>> flc_list or be disposed of. >>>> >>>> For the NFSD lease manager, this function now handles layout recall >>>> timeouts. If the layout type supports fencing and the client has not >>>> been fenced, a fence operation is triggered to prevent the client >>>> from accessing the block device. >>>> >>>> While the fencing operation is in progress, the conflicting file_lease >>>> remains on the flc_list until fencing is complete. This guarantees >>>> that no other clients can access the file, and the client with >>>> exclusive access is properly blocked before disposal. 
>>>> >>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >>>> --- >>>> Documentation/filesystems/locking.rst | 2 + >>>> fs/locks.c | 10 +++- >>>> fs/nfsd/blocklayout.c | 38 ++++++------- >>>> fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- >>>> fs/nfsd/nfs4state.c | 1 + >>>> fs/nfsd/state.h | 6 ++ >>>> include/linux/filelock.h | 1 + >>>> 7 files changed, 110 insertions(+), 27 deletions(-) >>>> >>>> v2: >>>> . Update Subject line to include fencing operation. >>>> . Allow conflicting lease to remain on flc_list until fencing >>>> is complete. >>>> . Use system worker to perform fencing operation asynchronously. >>>> . Use nfs4_stid.sc_count to ensure layout stateid remains >>>> valid before starting the fencing operation, nfs4_stid.sc_count >>>> is released after fencing operation is complete. >>>> . Rework nfsd4_scsi_fence_client to: >>>> . wait until fencing to complete before exiting. >>>> . wait until fencing in progress to complete before >>>> checking the NFSD_MDS_PR_FENCED flag. >>>> . Remove lm_need_to_retry from lease_manager_operations. >>>> v3: >>>> . correct locking requirement in locking.rst. >>>> . add max retry count to fencing operation. >>>> . add missing nfs4_put_stid in nfsd4_layout_fence_worker. >>>> . remove special-casing of FL_LAYOUT in lease_modify. >>>> . remove lease_want_dispose. >>>> . move lm_breaker_timedout call to time_out_leases. >>>> v4: >>>> . only increment ls_fence_retry_cnt after successfully >>>> schedule new work in nfsd4_layout_lm_breaker_timedout. 
>>>> >>>> diff --git a/Documentation/filesystems/locking.rst >>>> b/Documentation/filesystems/locking.rst >>>> index 04c7691e50e0..a339491f02e4 100644 >>>> --- a/Documentation/filesystems/locking.rst >>>> +++ b/Documentation/filesystems/locking.rst >>>> @@ -403,6 +403,7 @@ prototypes:: >>>> bool (*lm_breaker_owns_lease)(struct file_lock *); >>>> bool (*lm_lock_expirable)(struct file_lock *); >>>> void (*lm_expire_lock)(void); >>>> + void (*lm_breaker_timedout)(struct file_lease *); >>>> >>>> locking rules: >>>> >>>> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes >>>> no no >>>> lm_lock_expirable yes no no >>>> lm_expire_lock no no yes >>>> lm_open_conflict yes no no >>>> +lm_breaker_timedout yes no no >>>> ====================== ============= ================= >>>> ========= >>>> >>>> buffer_head >>>> diff --git a/fs/locks.c b/fs/locks.c >>>> index 46f229f740c8..1b63aa704598 100644 >>>> --- a/fs/locks.c >>>> +++ b/fs/locks.c >>>> @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, >>>> struct list_head *dispose) >>>> { >>>> struct file_lock_context *ctx = inode->i_flctx; >>>> struct file_lease *fl, *tmp; >>>> + bool remove = true; >>>> >>>> lockdep_assert_held(&ctx->flc_lock); >>>> >>>> @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, >>>> struct list_head *dispose) >>>> trace_time_out_leases(inode, fl); >>>> if (past_time(fl->fl_downgrade_time)) >>>> lease_modify(fl, F_RDLCK, dispose); >>>> - if (past_time(fl->fl_break_time)) >>>> - lease_modify(fl, F_UNLCK, dispose); >>>> + >>>> + if (past_time(fl->fl_break_time)) { >>>> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) >>>> + remove = fl->fl_lmops->lm_breaker_timedout(fl); >>>> + if (remove) >>>> + lease_modify(fl, F_UNLCK, dispose); >>>> + } >>>> } >>>> } >>>> >>>> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c >>>> index 7ba9e2dd0875..69d3889df302 100644 >>>> --- a/fs/nfsd/blocklayout.c >>>> +++ b/fs/nfsd/blocklayout.c >>>> @@ -443,6 +443,14 @@ 
nfsd4_scsi_proc_layoutcommit(struct inode *inode, >>>> struct svc_rqst *rqstp, >>>> return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); >>>> } >>>> >>>> +/* >>>> + * Perform the fence operation to prevent the client from accessing the >>>> + * block device. If a fence operation is already in progress, wait for >>>> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the >>>> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, >>>> + * update the layout stateid by setting the ls_fenced flag to indicate >>>> + * that the client has been fenced. >>>> + */ >>>> static void >>>> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct >>>> nfsd_file *file) >>>> { >>>> @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct >>>> nfs4_layout_stateid *ls, struct nfsd_file *file) >>>> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb- >>>>> s_bdev; >>>> int status; >>>> >>>> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) >>>> + mutex_lock(&clp->cl_fence_mutex); >>>> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { >>>> + ls->ls_fenced = true; >>>> + mutex_unlock(&clp->cl_fence_mutex); >>>> + nfs4_put_stid(&ls->ls_stid); >>>> return; >>>> + } >>>> >>>> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, >>>> NFSD_MDS_PR_KEY, >>>> nfsd4_scsi_pr_key(clp), >>>> PR_EXCLUSIVE_ACCESS_REG_ONLY, true); >>>> - /* >>>> - * Reset to allow retry only when the command could not have >>>> - * reached the device. Negative status means a local error >>>> - * (e.g., -ENOMEM) prevented the command from being sent. >>>> - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and >>>> - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures >>>> - * before device delivery. >>>> - * >>>> - * For all other errors, the command may have reached the device >>>> - * and the preempt may have succeeded. 
Avoid resetting, since >>>> - * retrying a successful preempt returns PR_STS_IOERR or >>>> - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite >>>> - * retry loop. >>>> - */ >>>> - if (status < 0 || >>>> - status == PR_STS_PATH_FAILED || >>>> - status == PR_STS_PATH_FAST_FAILED || >>>> - status == PR_STS_RETRY_PATH_FAILURE) >>>> + if (status) >>>> nfsd4_scsi_fence_clear(clp, bdev->bd_dev); >>>> + else >>>> + ls->ls_fenced = true; >>> The removed code distinguishes retryable errors (such as path failures) >>> from permanent errors (PR_STS_IOERR, PR_STS_RESERVATION_CONFLICT), but >>> the new code clears the fence on any error, >> In v2 and v3 patch, I left the comment and the code distinguishes retryable >> errors intact. And you commented this: >> >>> In nfsd4_scsi_fence_client(), when a device error occurs (e.g., >>> PR_STS_IOERR), ls->ls_fenced is set even though the client may >>> still have storage access. >> So I'm not sure how to treat permanent errors now. Please advise. > The comment is important and explains some of the subtleties. I think > that needs to be preserved even if we decide to set ls_fenced in every > error case. > > If the correct behavior now is to leave ls_fenced clear in all error > cases, then something in the patch needs to provide the rationale for > making this code change and for helping us remember the design when it > needs to be altered or fixed later. > > The rationale is different for permanent errors and temporary errors, > I would think. I restored the previous comment and treat PR_STS_IOERR and PR_STS_RESERVATION_CONFLICT as non-error case. The rest of the errors, temporarily and other errors; ENOMEM, as retryable error. > > >>> potentially causing infinite >>> retry loops as warned in the removed comment. >> Infinite loops can not happen due to the maximum retry count. 
> Fine, but with the new retry escape clause, as mentioned below, the > server and client can be left in opposite states, it looks like: One > believes the fence succeeded, and one believes it still has full device > access. Am I incorrect about that? You're right, the server and client can be left in opposite states if we still can not fence the client after so many retries. If we want to be safe, then pr_warn the admin and retry forever, which effectively prevents all other clients from accessing the conflicting file until it's manually resolved by the admin. > > This is the bane of every timeout-based scheme I know about, and why we > have the "hard" mount option. > > >>> Either the comment and error distinction should remain, >> As mentioned above, I can put the comment and code that distinguishes >> temporarily error codes back. Then what do we need to do for other >> permanent errors; PR_STS_IOERR, PR_STS_RESERVATION_CONFLICT? >> >>> or some rationale >>> for removing the distinction between temporary and permanent errors >>> should >>> be added to the commit message.
>>> >>> >>>> + mutex_unlock(&clp->cl_fence_mutex); >>>> + nfs4_put_stid(&ls->ls_stid); >>>> >>>> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); >>>> } >>>> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c >>>> index ad7af8cfcf1f..1c498f3cd059 100644 >>>> --- a/fs/nfsd/nfs4layouts.c >>>> +++ b/fs/nfsd/nfs4layouts.c >>>> @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid >>>> *ls) >>>> return 0; >>>> } >>>> >>>> +static void >>>> +nfsd4_layout_fence_worker(struct work_struct *work) >>>> +{ >>>> + struct nfsd_file *nf; >>>> + struct delayed_work *dwork = to_delayed_work(work); >>>> + struct nfs4_layout_stateid *ls = container_of(dwork, >>>> + struct nfs4_layout_stateid, ls_fence_work); >>>> + u32 type; >>>> + >>>> + rcu_read_lock(); >>>> + nf = nfsd_file_get(ls->ls_file); >>>> + rcu_read_unlock(); >>>> + if (!nf) { >>>> + nfs4_put_stid(&ls->ls_stid); >>>> + return; >>>> + } >>>> + >>>> + type = ls->ls_layout_type; >>>> + if (nfsd4_layout_ops[type]->fence_client) >>>> + nfsd4_layout_ops[type]->fence_client(ls, nf); >>>> + nfsd_file_put(nf); >>>> +} >>>> + >>>> static struct nfs4_layout_stateid * >>>> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >>>> struct nfs4_stid *parent, u32 layout_type) >>>> @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct >>>> nfsd4_compound_state *cstate, >>>> list_add(&ls->ls_perfile, &fp->fi_lo_states); >>>> spin_unlock(&fp->fi_lock); >>>> >>>> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); >>>> + ls->ls_fenced = false; >>>> + ls->ls_fence_retry_cnt = 0; >>>> + >>>> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); >>>> return ls; >>>> } >>>> @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, >>>> struct rpc_task *task) >>>> rcu_read_unlock(); >>>> if (fl) { >>>> ops = nfsd4_layout_ops[ls->ls_layout_type]; >>>> - if (ops->fence_client) >>>> + if (ops->fence_client) { >>>> + refcount_inc(&ls->ls_stid.sc_count); >>>> 
ops->fence_client(ls, fl); >>>> - else >>>> + } else >>>> nfsd4_cb_layout_fail(ls, fl); >>>> nfsd_file_put(fl); >>>> } >>>> @@ -747,11 +775,9 @@ static bool >>>> nfsd4_layout_lm_break(struct file_lease *fl) >>>> { >>>> /* >>>> - * We don't want the locks code to timeout the lease for us; >>>> - * we'll remove it ourself if a layout isn't returned >>>> - * in time: >>>> + * Enforce break lease timeout to prevent NFSD >>>> + * thread from hanging in __break_lease. >>>> */ >>>> - fl->fl_break_time = 0; >>>> nfsd4_recall_file_layout(fl->c.flc_owner); >>>> return false; >>>> } >>>> @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, >>>> int arg) >>>> return 0; >>>> } >>>> >>>> +/** >>>> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. >>> Nit: Move the description of @fl here. >> Fix in v5. >> >>>> + * If the layout type supports a fence operation, schedule a worker to >>>> + * fence the client from accessing the block device. >>>> + * >>>> + * @fl: file to check >>>> + * >>>> + * Return true if the file lease should be disposed of by the caller; >>>> + * otherwise, return false. >>>> + */ >>>> +static bool >>>> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) >>>> +{ >>>> + struct nfs4_layout_stateid *ls = fl->c.flc_owner; >>>> + bool ret; >>>> + >>>> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) >>>> + return true; >>>> + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) >>>> + return true; >>> Since these two variables are accessed while the fence mutex >>> is held in other places, they need similar protection here >>> to prevent races. You explained that the mutex cannot be >>> taken here, so some other form of serialization will be >>> needed to protect these fields. >> I do not think we need protection here to access ls_fenced and >> ls_fence_retry_cnt. For ls_fenced, it's a read operation from >> nfsd4_layout_lm_breaker_timedout. 
The value is either true or >> false, it does not matter. If it's true then the device was >> fenced. If it's not true then nfsd4_layout_lm_breaker_timedout >> will be called again from time_out_leases on next check. > Fair enough for ls_fence_retry_cnt, but are you sure there isn't a > TOCTOU issue here for ls_fenced ? Isn't there a race possible with > the server receiving the client's callback while lm_breaker_timeout > is running? I will rework this logic to prevent a race with LAYOUTRETURN and also to allow nfsd4_layout_lm_breaker_timedout to release the reference count safely. > > >> ls_fence_retry_cnt is incremented and checked only by >> nfsd4_layout_lm_breaker_timedout. > Then the kdoc comment could be clearer that only one copy of the > callback is allowed to run concurrently. Something like "Caller > serializes". > > >>> If I'm reading this correctly, when the fence fails after all >>> 5 retries, the client retains block device access but server >>> believes recall succeeded ? >> What should we do if the maximum retries is reached, perhaps a >> warning message in the system log for the admin to take action? > A warning doesn't seem like it would prevent data corruption. (I'm > not against a warning, in and of itself... it just does not seem to > me to be sufficient). > > I would really like this mechanism to be more data-safe. It needs to > deal properly with network partitions and other problems. I don't have > any magic answer yet, sorry! I propose issuing a pr_warn to alert the admin and retrying forever for now until we can find a better option. I expect this to be a very rare condition, so it should not affect most environments. 
While fencing is in progress, a reference >>>> + * count is added to the layout stateid to ensure its >>>> + * validity. This reference count is released once fencing >>>> + * has been completed. >>>> + */ >>>> + refcount_inc(&ls->ls_stid.sc_count); >>>> + ++ls->ls_fence_retry_cnt; >>>> + return true; >>>> + } >>> Incrementing after the mod_delayed_work() call is racy. Instead: >>> >>> refcount_inc(&ls->ls_stid.sc_count); >>> ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >>> if (ret) { >>> refcount_dec(&ls->ls_stid.sc_count); >>> .... >>> } >> Yes, it's racy, I thought about this. But we can not just do a >> refcount_dec here, we need to do nfs4_put_stid. But we can not >> do nfs4_put_stid here since nfsd4_layout_lm_breaker_timedout runs >> under the spin_lock flc_lock and nfs4_put_stid might causes the >> flc_lock to be acquired again from generic_delete_lease if this >> is the last reference count on stid. >> >> We might need to add the reference count when the file_lease is >> inserted on the flc_list and decrement when it's removed from the >> flc_list. I need to think about this more. > If I may, why is flc_lock being held? That seems to be the root of > several implementation problems. Jeff probably has a better answer, but AFAIK time_out_leases holds the spin lock flc_lock to prevent the flc_list from being corrupted if the underlying code removes the file_lease via kernel_setlease while it's working on the list. 
> > >>>> + return false; >>>> +} >>>> + >>>> static const struct lease_manager_operations nfsd4_layouts_lm_ops = { >>>> .lm_break = nfsd4_layout_lm_break, >>>> .lm_change = nfsd4_layout_lm_change, >>>> .lm_open_conflict = nfsd4_layout_lm_open_conflict, >>>> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, >>>> }; >>>> >>>> int >>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >>>> index 583c13b5aaf3..a57fa3318362 100644 >>>> --- a/fs/nfsd/nfs4state.c >>>> +++ b/fs/nfsd/nfs4state.c >>>> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct >>>> xdr_netobj name, >>>> #endif >>>> #ifdef CONFIG_NFSD_SCSILAYOUT >>>> xa_init(&clp->cl_dev_fences); >>>> + mutex_init(&clp->cl_fence_mutex); >>>> #endif >>>> INIT_LIST_HEAD(&clp->async_copies); >>>> spin_lock_init(&clp->async_lock); >>>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h >>>> index 713f55ef6554..57e54dfb406c 100644 >>>> --- a/fs/nfsd/state.h >>>> +++ b/fs/nfsd/state.h >>>> @@ -529,6 +529,7 @@ struct nfs4_client { >>>> time64_t cl_ra_time; >>>> #ifdef CONFIG_NFSD_SCSILAYOUT >>>> struct xarray cl_dev_fences; >>>> + struct mutex cl_fence_mutex; >>>> #endif >>>> }; >>>> >>>> @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { >>>> stateid_t ls_recall_sid; >>>> bool ls_recalled; >>>> struct mutex ls_mutex; >>>> + struct delayed_work ls_fence_work; >>> Still missing cancel_delayed_work_sync() when freeing the >>> layout stateid. >> I don't think the layout stateid can be freed while a fence >> worker was scheduled due to the added reference count to the >> stid. > Then add a comment to that effect, or if possible, some kind of > assertion, in nfsd4_free_layout_stateid(). Will do. > > >>>> + bool ls_fenced; >>>> + int ls_fence_retry_cnt; >>>> }; >>>> >>>> +#define LO_MAX_FENCE_RETRY 5 >>> The value of 5 needs some justification here in a comment. >>> Actually, 5 might be too low, but I can't really tell. 
>> At the minimal each retry happens after 1 sec, and it can be >> more depending what entry is at the front of flc_list. So if >> retry for 5 seconds (minimum) is too low then is 20 retries is >> sufficient? > As I said above: here's why we have the "hard" mount option for NFS. It > might be impossible to choose a safe maximum retry count and still > guarantee that the server and clients will eventually agree on the > device reservation state. In that case we will need to consider some > other mechanism for detecting and preventing infinite PR retry loops. > > Maybe the retry maximum could depend on the lease expiry time? The problem with using the client's lease as max retries is the client might be healthy but the NFS server is having problems fencing the block device. If we give up after max retries and mark the device as fenced then there is a possibility of data corruption if the original client continues to access the block device. I think to be on the safe side, we should retry forever while pr_warn the admin to fix the problem. > > > Just remember, I'm thinking about this and learning as I go along, > just like you are. I know I'm possibly contradicting myself here. I greatly appreciate your and Jeff's comments. We're working on the same goal to build a reliable NFS server. Thanks, -Dai ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 0:50 [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing Dai Ngo 2026-01-27 17:11 ` Chuck Lever @ 2026-01-27 17:54 ` Jeff Layton 2026-01-27 20:52 ` Dai Ngo 2026-01-28 17:25 ` Dai Ngo 1 sibling, 2 replies; 12+ messages in thread From: Jeff Layton @ 2026-01-27 17:54 UTC (permalink / raw) To: Dai Ngo, chuck.lever, neil, okorniev, tom, hch, alex.aring, viro, brauner, jack Cc: linux-fsdevel, linux-nfs On Mon, 2026-01-26 at 16:50 -0800, Dai Ngo wrote: > When a layout conflict triggers a recall, enforcing a timeout is > necessary to prevent excessive nfsd threads from being blocked in > __break_lease ensuring the server continues servicing incoming > requests efficiently. > > This patch introduces a new function to lease_manager_operations: > > lm_breaker_timedout: Invoked when a lease recall times out and is > about to be disposed of. This function enables the lease manager > to inform the caller whether the file_lease should remain on the > flc_list or be disposed of. > > For the NFSD lease manager, this function now handles layout recall > timeouts. If the layout type supports fencing and the client has not > been fenced, a fence operation is triggered to prevent the client > from accessing the block device. > > While the fencing operation is in progress, the conflicting file_lease > remains on the flc_list until fencing is complete. This guarantees > that no other clients can access the file, and the client with > exclusive access is properly blocked before disposal. 
> > Signed-off-by: Dai Ngo <dai.ngo@oracle.com> > --- > Documentation/filesystems/locking.rst | 2 + > fs/locks.c | 10 +++- > fs/nfsd/blocklayout.c | 38 ++++++------- > fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- > fs/nfsd/nfs4state.c | 1 + > fs/nfsd/state.h | 6 ++ > include/linux/filelock.h | 1 + > 7 files changed, 110 insertions(+), 27 deletions(-) > > v2: > . Update Subject line to include fencing operation. > . Allow conflicting lease to remain on flc_list until fencing > is complete. > . Use system worker to perform fencing operation asynchronously. > . Use nfs4_stid.sc_count to ensure layout stateid remains > valid before starting the fencing operation, nfs4_stid.sc_count > is released after fencing operation is complete. > . Rework nfsd4_scsi_fence_client to: > . wait until fencing to complete before exiting. > . wait until fencing in progress to complete before > checking the NFSD_MDS_PR_FENCED flag. > . Remove lm_need_to_retry from lease_manager_operations. > v3: > . correct locking requirement in locking.rst. > . add max retry count to fencing operation. > . add missing nfs4_put_stid in nfsd4_layout_fence_worker. > . remove special-casing of FL_LAYOUT in lease_modify. > . remove lease_want_dispose. > . move lm_breaker_timedout call to time_out_leases. > v4: > . only increment ls_fence_retry_cnt after successfully > schedule new work in nfsd4_layout_lm_breaker_timedout. 
> > diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst > index 04c7691e50e0..a339491f02e4 100644 > --- a/Documentation/filesystems/locking.rst > +++ b/Documentation/filesystems/locking.rst > @@ -403,6 +403,7 @@ prototypes:: > bool (*lm_breaker_owns_lease)(struct file_lock *); > bool (*lm_lock_expirable)(struct file_lock *); > void (*lm_expire_lock)(void); > + void (*lm_breaker_timedout)(struct file_lease *); > > locking rules: > > @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no > lm_lock_expirable yes no no > lm_expire_lock no no yes > lm_open_conflict yes no no > +lm_breaker_timedout yes no no > ====================== ============= ================= ========= > > buffer_head > diff --git a/fs/locks.c b/fs/locks.c > index 46f229f740c8..1b63aa704598 100644 > --- a/fs/locks.c > +++ b/fs/locks.c > @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) > { > struct file_lock_context *ctx = inode->i_flctx; > struct file_lease *fl, *tmp; > + bool remove = true; > > lockdep_assert_held(&ctx->flc_lock); > > @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) > trace_time_out_leases(inode, fl); > if (past_time(fl->fl_downgrade_time)) > lease_modify(fl, F_RDLCK, dispose); > - if (past_time(fl->fl_break_time)) > - lease_modify(fl, F_UNLCK, dispose); > + > + if (past_time(fl->fl_break_time)) { > + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) > + remove = fl->fl_lmops->lm_breaker_timedout(fl); > + if (remove) > + lease_modify(fl, F_UNLCK, dispose); I'd not bother with the return code to lm_breaker_timedout. Make it void return and have it call lease_modify itself before returning in the cases where you have it returning true. If the operation isn't defined then just do the lease_modify here like we always have. 
> + } > } > } > > diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c > index 7ba9e2dd0875..69d3889df302 100644 > --- a/fs/nfsd/blocklayout.c > +++ b/fs/nfsd/blocklayout.c > @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, > return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); > } > > +/* > + * Perform the fence operation to prevent the client from accessing the > + * block device. If a fence operation is already in progress, wait for > + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the > + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, > + * update the layout stateid by setting the ls_fenced flag to indicate > + * that the client has been fenced. > + */ > static void > nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) > { > @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) > struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; > int status; > > - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) > + mutex_lock(&clp->cl_fence_mutex); > + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { > + ls->ls_fenced = true; > + mutex_unlock(&clp->cl_fence_mutex); > + nfs4_put_stid(&ls->ls_stid); > return; > + } > I don't understand what this new mutex is protecting, and this all seems overly complex. If feels kind of like you want nfsd to be driving the fencing retries, but I don't think we really do. Here's what I'd do. I'd just make ->fence_client a bool or int return, and have it indicate whether the client was successfully fenced or not. If it was successfully fenced, then have the caller call lease_modify() to remove the lease. If it wasn't successfully fenced, have the caller (the workqueue job) requeue itself if you want to retry. If the caller is ready to give up, then call lease_modify() on it and remove it (and probably throw a pr_warn()). 
> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, > nfsd4_scsi_pr_key(clp), > PR_EXCLUSIVE_ACCESS_REG_ONLY, true); > - /* > - * Reset to allow retry only when the command could not have > - * reached the device. Negative status means a local error > - * (e.g., -ENOMEM) prevented the command from being sent. > - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and > - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures > - * before device delivery. > - * > - * For all other errors, the command may have reached the device > - * and the preempt may have succeeded. Avoid resetting, since > - * retrying a successful preempt returns PR_STS_IOERR or > - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite > - * retry loop. > - */ > - if (status < 0 || > - status == PR_STS_PATH_FAILED || > - status == PR_STS_PATH_FAST_FAILED || > - status == PR_STS_RETRY_PATH_FAILURE) > + if (status) > nfsd4_scsi_fence_clear(clp, bdev->bd_dev); > + else > + ls->ls_fenced = true; > + mutex_unlock(&clp->cl_fence_mutex); > + nfs4_put_stid(&ls->ls_stid); > > trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); > } > diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c > index ad7af8cfcf1f..1c498f3cd059 100644 > --- a/fs/nfsd/nfs4layouts.c > +++ b/fs/nfsd/nfs4layouts.c > @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) > return 0; > } > > +static void > +nfsd4_layout_fence_worker(struct work_struct *work) > +{ > + struct nfsd_file *nf; > + struct delayed_work *dwork = to_delayed_work(work); > + struct nfs4_layout_stateid *ls = container_of(dwork, > + struct nfs4_layout_stateid, ls_fence_work); > + u32 type; > + > + rcu_read_lock(); > + nf = nfsd_file_get(ls->ls_file); > + rcu_read_unlock(); > + if (!nf) { > + nfs4_put_stid(&ls->ls_stid); > + return; > + } > + > + type = ls->ls_layout_type; > + if (nfsd4_layout_ops[type]->fence_client) > + nfsd4_layout_ops[type]->fence_client(ls, nf); If you make fence_client an 
int/bool return, then you could just requeue this job to try it again. > + nfsd_file_put(nf); > +} > + > static struct nfs4_layout_stateid * > nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, > struct nfs4_stid *parent, u32 layout_type) > @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, > list_add(&ls->ls_perfile, &fp->fi_lo_states); > spin_unlock(&fp->fi_lock); > > + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); > + ls->ls_fenced = false; > + ls->ls_fence_retry_cnt = 0; > + > trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); > return ls; > } > @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) > rcu_read_unlock(); > if (fl) { > ops = nfsd4_layout_ops[ls->ls_layout_type]; > - if (ops->fence_client) > + if (ops->fence_client) { > + refcount_inc(&ls->ls_stid.sc_count); > ops->fence_client(ls, fl); > - else > + } else > nfsd4_cb_layout_fail(ls, fl); > nfsd_file_put(fl); > } > @@ -747,11 +775,9 @@ static bool > nfsd4_layout_lm_break(struct file_lease *fl) > { > /* > - * We don't want the locks code to timeout the lease for us; > - * we'll remove it ourself if a layout isn't returned > - * in time: > + * Enforce break lease timeout to prevent NFSD > + * thread from hanging in __break_lease. > */ > - fl->fl_break_time = 0; > nfsd4_recall_file_layout(fl->c.flc_owner); > return false; > } > @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg) > return 0; > } > > +/** > + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. > + * If the layout type supports a fence operation, schedule a worker to > + * fence the client from accessing the block device. > + * > + * @fl: file to check > + * > + * Return true if the file lease should be disposed of by the caller; > + * otherwise, return false. 
> + */ > +static bool > +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) > +{ > + struct nfs4_layout_stateid *ls = fl->c.flc_owner; > + bool ret; > + > + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) > + return true; > + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) > + return true; > + > + if (work_busy(&ls->ls_fence_work.work)) > + return false; > + /* Schedule work to do the fence operation */ > + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); > + if (!ret) { > + /* > + * If there is no pending work, mod_delayed_work queues > + * new task. While fencing is in progress, a reference > + * count is added to the layout stateid to ensure its > + * validity. This reference count is released once fencing > + * has been completed. > + */ > + refcount_inc(&ls->ls_stid.sc_count); > + ++ls->ls_fence_retry_cnt; > + return true; The cases where the fencing didn't work after too many retries, or the job couldn't be queued should probably get a pr_warn or something. The admin needs to know that data corruption is possible and that they might need to nuke the client manually. 
> + } > + return false; > +} > + > static const struct lease_manager_operations nfsd4_layouts_lm_ops = { > .lm_break = nfsd4_layout_lm_break, > .lm_change = nfsd4_layout_lm_change, > .lm_open_conflict = nfsd4_layout_lm_open_conflict, > + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, > }; > > int > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > index 583c13b5aaf3..a57fa3318362 100644 > --- a/fs/nfsd/nfs4state.c > +++ b/fs/nfsd/nfs4state.c > @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, > #endif > #ifdef CONFIG_NFSD_SCSILAYOUT > xa_init(&clp->cl_dev_fences); > + mutex_init(&clp->cl_fence_mutex); > #endif > INIT_LIST_HEAD(&clp->async_copies); > spin_lock_init(&clp->async_lock); > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > index 713f55ef6554..57e54dfb406c 100644 > --- a/fs/nfsd/state.h > +++ b/fs/nfsd/state.h > @@ -529,6 +529,7 @@ struct nfs4_client { > time64_t cl_ra_time; > #ifdef CONFIG_NFSD_SCSILAYOUT > struct xarray cl_dev_fences; > + struct mutex cl_fence_mutex; > #endif > }; > > @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { > stateid_t ls_recall_sid; > bool ls_recalled; > struct mutex ls_mutex; > + struct delayed_work ls_fence_work; > + bool ls_fenced; > + int ls_fence_retry_cnt; > }; > > +#define LO_MAX_FENCE_RETRY 5 > + > static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) > { > return container_of(s, struct nfs4_layout_stateid, ls_stid); > diff --git a/include/linux/filelock.h b/include/linux/filelock.h > index 2f5e5588ee07..13b9c9f04589 100644 > --- a/include/linux/filelock.h > +++ b/include/linux/filelock.h > @@ -50,6 +50,7 @@ struct lease_manager_operations { > void (*lm_setup)(struct file_lease *, void **); > bool (*lm_breaker_owns_lease)(struct file_lease *); > int (*lm_open_conflict)(struct file *, int); > + bool (*lm_breaker_timedout)(struct file_lease *fl); > }; > > struct lock_manager { -- Jeff Layton <jlayton@kernel.org> ^ permalink raw reply 
[flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 17:54 ` Jeff Layton @ 2026-01-27 20:52 ` Dai Ngo 2026-01-28 17:25 ` Dai Ngo 1 sibling, 0 replies; 12+ messages in thread From: Dai Ngo @ 2026-01-27 20:52 UTC (permalink / raw) To: Jeff Layton, chuck.lever, neil, okorniev, tom, hch, alex.aring, viro, brauner, jack Cc: linux-fsdevel, linux-nfs On 1/27/26 9:54 AM, Jeff Layton wrote: > On Mon, 2026-01-26 at 16:50 -0800, Dai Ngo wrote: >> When a layout conflict triggers a recall, enforcing a timeout is >> necessary to prevent excessive nfsd threads from being blocked in >> __break_lease ensuring the server continues servicing incoming >> requests efficiently. >> >> This patch introduces a new function to lease_manager_operations: >> >> lm_breaker_timedout: Invoked when a lease recall times out and is >> about to be disposed of. This function enables the lease manager >> to inform the caller whether the file_lease should remain on the >> flc_list or be disposed of. >> >> For the NFSD lease manager, this function now handles layout recall >> timeouts. If the layout type supports fencing and the client has not >> been fenced, a fence operation is triggered to prevent the client >> from accessing the block device. >> >> While the fencing operation is in progress, the conflicting file_lease >> remains on the flc_list until fencing is complete. This guarantees >> that no other clients can access the file, and the client with >> exclusive access is properly blocked before disposal. >> >> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >> --- >> Documentation/filesystems/locking.rst | 2 + >> fs/locks.c | 10 +++- >> fs/nfsd/blocklayout.c | 38 ++++++------- >> fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- >> fs/nfsd/nfs4state.c | 1 + >> fs/nfsd/state.h | 6 ++ >> include/linux/filelock.h | 1 + >> 7 files changed, 110 insertions(+), 27 deletions(-) >> >> v2: >> . Update Subject line to include fencing operation. >> . 
Allow conflicting lease to remain on flc_list until fencing >> is complete. >> . Use system worker to perform fencing operation asynchronously. >> . Use nfs4_stid.sc_count to ensure layout stateid remains >> valid before starting the fencing operation, nfs4_stid.sc_count >> is released after fencing operation is complete. >> . Rework nfsd4_scsi_fence_client to: >> . wait until fencing to complete before exiting. >> . wait until fencing in progress to complete before >> checking the NFSD_MDS_PR_FENCED flag. >> . Remove lm_need_to_retry from lease_manager_operations. >> v3: >> . correct locking requirement in locking.rst. >> . add max retry count to fencing operation. >> . add missing nfs4_put_stid in nfsd4_layout_fence_worker. >> . remove special-casing of FL_LAYOUT in lease_modify. >> . remove lease_want_dispose. >> . move lm_breaker_timedout call to time_out_leases. >> v4: >> . only increment ls_fence_retry_cnt after successfully >> schedule new work in nfsd4_layout_lm_breaker_timedout. 
>> >> diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst >> index 04c7691e50e0..a339491f02e4 100644 >> --- a/Documentation/filesystems/locking.rst >> +++ b/Documentation/filesystems/locking.rst >> @@ -403,6 +403,7 @@ prototypes:: >> bool (*lm_breaker_owns_lease)(struct file_lock *); >> bool (*lm_lock_expirable)(struct file_lock *); >> void (*lm_expire_lock)(void); >> + void (*lm_breaker_timedout)(struct file_lease *); >> >> locking rules: >> >> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no >> lm_lock_expirable yes no no >> lm_expire_lock no no yes >> lm_open_conflict yes no no >> +lm_breaker_timedout yes no no >> ====================== ============= ================= ========= >> >> buffer_head >> diff --git a/fs/locks.c b/fs/locks.c >> index 46f229f740c8..1b63aa704598 100644 >> --- a/fs/locks.c >> +++ b/fs/locks.c >> @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) >> { >> struct file_lock_context *ctx = inode->i_flctx; >> struct file_lease *fl, *tmp; >> + bool remove = true; >> >> lockdep_assert_held(&ctx->flc_lock); >> >> @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) >> trace_time_out_leases(inode, fl); >> if (past_time(fl->fl_downgrade_time)) >> lease_modify(fl, F_RDLCK, dispose); >> - if (past_time(fl->fl_break_time)) >> - lease_modify(fl, F_UNLCK, dispose); >> + >> + if (past_time(fl->fl_break_time)) { >> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) >> + remove = fl->fl_lmops->lm_breaker_timedout(fl); >> + if (remove) >> + lease_modify(fl, F_UNLCK, dispose); > I'd not bother with the return code to lm_breaker_timedout. > > Make it void return and have it call lease_modify itself before > returning in the cases where you have it returning true. If the > operation isn't defined then just do the lease_modify here like we > always have. Ok, I will try this approach. 
> >> + } >> } >> } >> >> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c >> index 7ba9e2dd0875..69d3889df302 100644 >> --- a/fs/nfsd/blocklayout.c >> +++ b/fs/nfsd/blocklayout.c >> @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, >> return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); >> } >> >> +/* >> + * Perform the fence operation to prevent the client from accessing the >> + * block device. If a fence operation is already in progress, wait for >> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the >> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, >> + * update the layout stateid by setting the ls_fenced flag to indicate >> + * that the client has been fenced. >> + */ >> static void >> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) >> { >> @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) >> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; >> int status; >> >> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) >> + mutex_lock(&clp->cl_fence_mutex); >> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { >> + ls->ls_fenced = true; >> + mutex_unlock(&clp->cl_fence_mutex); >> + nfs4_put_stid(&ls->ls_stid); >> return; >> + } >> > I don't understand what this new mutex is protecting, The purpose of this mutex is to ensure that once this function returns, either the client was fenced or not fence and not fencing in progress (some one is in the process of doing it). > and this all > seems overly complex. If feels kind of like you want nfsd to be driving > the fencing retries, but I don't think we really do. Here's what I'd > do. > > I'd just make ->fence_client a bool or int return, and have it indicate > whether the client was successfully fenced or not. If it was > successfully fenced, then have the caller call lease_modify() to remove > the lease. 
If it wasn't successfully fenced, have the caller (the > workqueue job) requeue itself if you want to retry. If the caller is > ready to give up, then call lease_modify() on it and remove it (and > probably throw a pr_warn()). Ok, I will try this approach: have the fence worker doing the retry instead of nfsd so that the fence worker is only scheduled once to do the fencing. > >> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, >> nfsd4_scsi_pr_key(clp), >> PR_EXCLUSIVE_ACCESS_REG_ONLY, true); >> - /* >> - * Reset to allow retry only when the command could not have >> - * reached the device. Negative status means a local error >> - * (e.g., -ENOMEM) prevented the command from being sent. >> - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and >> - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures >> - * before device delivery. >> - * >> - * For all other errors, the command may have reached the device >> - * and the preempt may have succeeded. Avoid resetting, since >> - * retrying a successful preempt returns PR_STS_IOERR or >> - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite >> - * retry loop. 
>> - */ >> - if (status < 0 || >> - status == PR_STS_PATH_FAILED || >> - status == PR_STS_PATH_FAST_FAILED || >> - status == PR_STS_RETRY_PATH_FAILURE) >> + if (status) >> nfsd4_scsi_fence_clear(clp, bdev->bd_dev); >> + else >> + ls->ls_fenced = true; >> + mutex_unlock(&clp->cl_fence_mutex); >> + nfs4_put_stid(&ls->ls_stid); >> >> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); >> } >> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c >> index ad7af8cfcf1f..1c498f3cd059 100644 >> --- a/fs/nfsd/nfs4layouts.c >> +++ b/fs/nfsd/nfs4layouts.c >> @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) >> return 0; >> } >> >> +static void >> +nfsd4_layout_fence_worker(struct work_struct *work) >> +{ >> + struct nfsd_file *nf; >> + struct delayed_work *dwork = to_delayed_work(work); >> + struct nfs4_layout_stateid *ls = container_of(dwork, >> + struct nfs4_layout_stateid, ls_fence_work); >> + u32 type; >> + >> + rcu_read_lock(); >> + nf = nfsd_file_get(ls->ls_file); >> + rcu_read_unlock(); >> + if (!nf) { >> + nfs4_put_stid(&ls->ls_stid); >> + return; >> + } >> + >> + type = ls->ls_layout_type; >> + if (nfsd4_layout_ops[type]->fence_client) >> + nfsd4_layout_ops[type]->fence_client(ls, nf); > If you make fence_client an int/bool return, then you could just > requeue this job to try it again. Yes. 
> >> + nfsd_file_put(nf); >> +} >> + >> static struct nfs4_layout_stateid * >> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >> struct nfs4_stid *parent, u32 layout_type) >> @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >> list_add(&ls->ls_perfile, &fp->fi_lo_states); >> spin_unlock(&fp->fi_lock); >> >> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); >> + ls->ls_fenced = false; >> + ls->ls_fence_retry_cnt = 0; >> + >> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); >> return ls; >> } >> @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) >> rcu_read_unlock(); >> if (fl) { >> ops = nfsd4_layout_ops[ls->ls_layout_type]; >> - if (ops->fence_client) >> + if (ops->fence_client) { >> + refcount_inc(&ls->ls_stid.sc_count); >> ops->fence_client(ls, fl); >> - else >> + } else >> nfsd4_cb_layout_fail(ls, fl); >> nfsd_file_put(fl); >> } >> @@ -747,11 +775,9 @@ static bool >> nfsd4_layout_lm_break(struct file_lease *fl) >> { >> /* >> - * We don't want the locks code to timeout the lease for us; >> - * we'll remove it ourself if a layout isn't returned >> - * in time: >> + * Enforce break lease timeout to prevent NFSD >> + * thread from hanging in __break_lease. >> */ >> - fl->fl_break_time = 0; >> nfsd4_recall_file_layout(fl->c.flc_owner); >> return false; >> } >> @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg) >> return 0; >> } >> >> +/** >> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. >> + * If the layout type supports a fence operation, schedule a worker to >> + * fence the client from accessing the block device. >> + * >> + * @fl: file to check >> + * >> + * Return true if the file lease should be disposed of by the caller; >> + * otherwise, return false. 
>> + */ >> +static bool >> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) >> +{ >> + struct nfs4_layout_stateid *ls = fl->c.flc_owner; >> + bool ret; >> + >> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) >> + return true; >> + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) >> + return true; >> + >> + if (work_busy(&ls->ls_fence_work.work)) >> + return false; >> + /* Schedule work to do the fence operation */ >> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >> + if (!ret) { >> + /* >> + * If there is no pending work, mod_delayed_work queues >> + * new task. While fencing is in progress, a reference >> + * count is added to the layout stateid to ensure its >> + * validity. This reference count is released once fencing >> + * has been completed. >> + */ >> + refcount_inc(&ls->ls_stid.sc_count); >> + ++ls->ls_fence_retry_cnt; >> + return true; > The cases where the fencing didn't work after too many retries, or the > job couldn't be queued should probably get a pr_warn or something. The > admin needs to know that data corruption is possible and that they > might need to nuke the client manually. Yes, will do. 
Thanks, -Dai > >> + } >> + return false; >> +} >> + >> static const struct lease_manager_operations nfsd4_layouts_lm_ops = { >> .lm_break = nfsd4_layout_lm_break, >> .lm_change = nfsd4_layout_lm_change, >> .lm_open_conflict = nfsd4_layout_lm_open_conflict, >> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, >> }; >> >> int >> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >> index 583c13b5aaf3..a57fa3318362 100644 >> --- a/fs/nfsd/nfs4state.c >> +++ b/fs/nfsd/nfs4state.c >> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, >> #endif >> #ifdef CONFIG_NFSD_SCSILAYOUT >> xa_init(&clp->cl_dev_fences); >> + mutex_init(&clp->cl_fence_mutex); >> #endif >> INIT_LIST_HEAD(&clp->async_copies); >> spin_lock_init(&clp->async_lock); >> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h >> index 713f55ef6554..57e54dfb406c 100644 >> --- a/fs/nfsd/state.h >> +++ b/fs/nfsd/state.h >> @@ -529,6 +529,7 @@ struct nfs4_client { >> time64_t cl_ra_time; >> #ifdef CONFIG_NFSD_SCSILAYOUT >> struct xarray cl_dev_fences; >> + struct mutex cl_fence_mutex; >> #endif >> }; >> >> @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { >> stateid_t ls_recall_sid; >> bool ls_recalled; >> struct mutex ls_mutex; >> + struct delayed_work ls_fence_work; >> + bool ls_fenced; >> + int ls_fence_retry_cnt; >> }; >> >> +#define LO_MAX_FENCE_RETRY 5 >> + >> static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) >> { >> return container_of(s, struct nfs4_layout_stateid, ls_stid); >> diff --git a/include/linux/filelock.h b/include/linux/filelock.h >> index 2f5e5588ee07..13b9c9f04589 100644 >> --- a/include/linux/filelock.h >> +++ b/include/linux/filelock.h >> @@ -50,6 +50,7 @@ struct lease_manager_operations { >> void (*lm_setup)(struct file_lease *, void **); >> bool (*lm_breaker_owns_lease)(struct file_lease *); >> int (*lm_open_conflict)(struct file *, int); >> + bool (*lm_breaker_timedout)(struct file_lease *fl); >> }; >> >> struct 
lock_manager { ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 17:54 ` Jeff Layton 2026-01-27 20:52 ` Dai Ngo @ 2026-01-28 17:25 ` Dai Ngo 2026-01-28 18:01 ` Jeff Layton 1 sibling, 1 reply; 12+ messages in thread From: Dai Ngo @ 2026-01-28 17:25 UTC (permalink / raw) To: Jeff Layton, chuck.lever, neil, okorniev, tom, hch, alex.aring, viro, brauner, jack Cc: linux-fsdevel, linux-nfs On 1/27/26 9:54 AM, Jeff Layton wrote: > On Mon, 2026-01-26 at 16:50 -0800, Dai Ngo wrote: >> When a layout conflict triggers a recall, enforcing a timeout is >> necessary to prevent excessive nfsd threads from being blocked in >> __break_lease ensuring the server continues servicing incoming >> requests efficiently. >> >> This patch introduces a new function to lease_manager_operations: >> >> lm_breaker_timedout: Invoked when a lease recall times out and is >> about to be disposed of. This function enables the lease manager >> to inform the caller whether the file_lease should remain on the >> flc_list or be disposed of. >> >> For the NFSD lease manager, this function now handles layout recall >> timeouts. If the layout type supports fencing and the client has not >> been fenced, a fence operation is triggered to prevent the client >> from accessing the block device. >> >> While the fencing operation is in progress, the conflicting file_lease >> remains on the flc_list until fencing is complete. This guarantees >> that no other clients can access the file, and the client with >> exclusive access is properly blocked before disposal. >> >> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >> --- >> Documentation/filesystems/locking.rst | 2 + >> fs/locks.c | 10 +++- >> fs/nfsd/blocklayout.c | 38 ++++++------- >> fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- >> fs/nfsd/nfs4state.c | 1 + >> fs/nfsd/state.h | 6 ++ >> include/linux/filelock.h | 1 + >> 7 files changed, 110 insertions(+), 27 deletions(-) >> >> v2: >> . 
Update Subject line to include fencing operation. >> . Allow conflicting lease to remain on flc_list until fencing >> is complete. >> . Use system worker to perform fencing operation asynchronously. >> . Use nfs4_stid.sc_count to ensure layout stateid remains >> valid before starting the fencing operation, nfs4_stid.sc_count >> is released after fencing operation is complete. >> . Rework nfsd4_scsi_fence_client to: >> . wait until fencing to complete before exiting. >> . wait until fencing in progress to complete before >> checking the NFSD_MDS_PR_FENCED flag. >> . Remove lm_need_to_retry from lease_manager_operations. >> v3: >> . correct locking requirement in locking.rst. >> . add max retry count to fencing operation. >> . add missing nfs4_put_stid in nfsd4_layout_fence_worker. >> . remove special-casing of FL_LAYOUT in lease_modify. >> . remove lease_want_dispose. >> . move lm_breaker_timedout call to time_out_leases. >> v4: >> . only increment ls_fence_retry_cnt after successfully >> schedule new work in nfsd4_layout_lm_breaker_timedout. 
>> >> diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst >> index 04c7691e50e0..a339491f02e4 100644 >> --- a/Documentation/filesystems/locking.rst >> +++ b/Documentation/filesystems/locking.rst >> @@ -403,6 +403,7 @@ prototypes:: >> bool (*lm_breaker_owns_lease)(struct file_lock *); >> bool (*lm_lock_expirable)(struct file_lock *); >> void (*lm_expire_lock)(void); >> + void (*lm_breaker_timedout)(struct file_lease *); >> >> locking rules: >> >> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no >> lm_lock_expirable yes no no >> lm_expire_lock no no yes >> lm_open_conflict yes no no >> +lm_breaker_timedout yes no no >> ====================== ============= ================= ========= >> >> buffer_head >> diff --git a/fs/locks.c b/fs/locks.c >> index 46f229f740c8..1b63aa704598 100644 >> --- a/fs/locks.c >> +++ b/fs/locks.c >> @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) >> { >> struct file_lock_context *ctx = inode->i_flctx; >> struct file_lease *fl, *tmp; >> + bool remove = true; >> >> lockdep_assert_held(&ctx->flc_lock); >> >> @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) >> trace_time_out_leases(inode, fl); >> if (past_time(fl->fl_downgrade_time)) >> lease_modify(fl, F_RDLCK, dispose); >> - if (past_time(fl->fl_break_time)) >> - lease_modify(fl, F_UNLCK, dispose); >> + >> + if (past_time(fl->fl_break_time)) { >> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) >> + remove = fl->fl_lmops->lm_breaker_timedout(fl); >> + if (remove) >> + lease_modify(fl, F_UNLCK, dispose); > I'd not bother with the return code to lm_breaker_timedout. > > Make it void return and have it call lease_modify itself before > returning in the cases where you have it returning true. If the > operation isn't defined then just do the lease_modify here like we > always have. 
The first call to lm_breaker_timedout schedules the fence worker so fencing is not done yet. If lm_breaker_timedout returns to time_out_leases without any return value then lease_modify is called to remove the lease. Am I missing something? -Dai > >> + } >> } >> } >> >> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c >> index 7ba9e2dd0875..69d3889df302 100644 >> --- a/fs/nfsd/blocklayout.c >> +++ b/fs/nfsd/blocklayout.c >> @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, >> return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); >> } >> >> +/* >> + * Perform the fence operation to prevent the client from accessing the >> + * block device. If a fence operation is already in progress, wait for >> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the >> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, >> + * update the layout stateid by setting the ls_fenced flag to indicate >> + * that the client has been fenced. >> + */ >> static void >> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) >> { >> @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) >> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; >> int status; >> >> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) >> + mutex_lock(&clp->cl_fence_mutex); >> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { >> + ls->ls_fenced = true; >> + mutex_unlock(&clp->cl_fence_mutex); >> + nfs4_put_stid(&ls->ls_stid); >> return; >> + } >> > I don't understand what this new mutex is protecting, and this all > seems overly complex. If feels kind of like you want nfsd to be driving > the fencing retries, but I don't think we really do. Here's what I'd > do. > > I'd just make ->fence_client a bool or int return, and have it indicate > whether the client was successfully fenced or not. 
If it was > successfully fenced, then have the caller call lease_modify() to remove > the lease. If it wasn't successfully fenced, have the caller (the > workqueue job) requeue itself if you want to retry. If the caller is > ready to give up, then call lease_modify() on it and remove it (and > probably throw a pr_warn()). > >> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, >> nfsd4_scsi_pr_key(clp), >> PR_EXCLUSIVE_ACCESS_REG_ONLY, true); >> - /* >> - * Reset to allow retry only when the command could not have >> - * reached the device. Negative status means a local error >> - * (e.g., -ENOMEM) prevented the command from being sent. >> - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and >> - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures >> - * before device delivery. >> - * >> - * For all other errors, the command may have reached the device >> - * and the preempt may have succeeded. Avoid resetting, since >> - * retrying a successful preempt returns PR_STS_IOERR or >> - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite >> - * retry loop. 
>> - */ >> - if (status < 0 || >> - status == PR_STS_PATH_FAILED || >> - status == PR_STS_PATH_FAST_FAILED || >> - status == PR_STS_RETRY_PATH_FAILURE) >> + if (status) >> nfsd4_scsi_fence_clear(clp, bdev->bd_dev); >> + else >> + ls->ls_fenced = true; >> + mutex_unlock(&clp->cl_fence_mutex); >> + nfs4_put_stid(&ls->ls_stid); >> >> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); >> } >> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c >> index ad7af8cfcf1f..1c498f3cd059 100644 >> --- a/fs/nfsd/nfs4layouts.c >> +++ b/fs/nfsd/nfs4layouts.c >> @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) >> return 0; >> } >> >> +static void >> +nfsd4_layout_fence_worker(struct work_struct *work) >> +{ >> + struct nfsd_file *nf; >> + struct delayed_work *dwork = to_delayed_work(work); >> + struct nfs4_layout_stateid *ls = container_of(dwork, >> + struct nfs4_layout_stateid, ls_fence_work); >> + u32 type; >> + >> + rcu_read_lock(); >> + nf = nfsd_file_get(ls->ls_file); >> + rcu_read_unlock(); >> + if (!nf) { >> + nfs4_put_stid(&ls->ls_stid); >> + return; >> + } >> + >> + type = ls->ls_layout_type; >> + if (nfsd4_layout_ops[type]->fence_client) >> + nfsd4_layout_ops[type]->fence_client(ls, nf); > If you make fence_client an int/bool return, then you could just > requeue this job to try it again. 
> >> + nfsd_file_put(nf); >> +} >> + >> static struct nfs4_layout_stateid * >> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >> struct nfs4_stid *parent, u32 layout_type) >> @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >> list_add(&ls->ls_perfile, &fp->fi_lo_states); >> spin_unlock(&fp->fi_lock); >> >> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); >> + ls->ls_fenced = false; >> + ls->ls_fence_retry_cnt = 0; >> + >> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); >> return ls; >> } >> @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) >> rcu_read_unlock(); >> if (fl) { >> ops = nfsd4_layout_ops[ls->ls_layout_type]; >> - if (ops->fence_client) >> + if (ops->fence_client) { >> + refcount_inc(&ls->ls_stid.sc_count); >> ops->fence_client(ls, fl); >> - else >> + } else >> nfsd4_cb_layout_fail(ls, fl); >> nfsd_file_put(fl); >> } >> @@ -747,11 +775,9 @@ static bool >> nfsd4_layout_lm_break(struct file_lease *fl) >> { >> /* >> - * We don't want the locks code to timeout the lease for us; >> - * we'll remove it ourself if a layout isn't returned >> - * in time: >> + * Enforce break lease timeout to prevent NFSD >> + * thread from hanging in __break_lease. >> */ >> - fl->fl_break_time = 0; >> nfsd4_recall_file_layout(fl->c.flc_owner); >> return false; >> } >> @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg) >> return 0; >> } >> >> +/** >> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. >> + * If the layout type supports a fence operation, schedule a worker to >> + * fence the client from accessing the block device. >> + * >> + * @fl: file to check >> + * >> + * Return true if the file lease should be disposed of by the caller; >> + * otherwise, return false. 
>> + */ >> +static bool >> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) >> +{ >> + struct nfs4_layout_stateid *ls = fl->c.flc_owner; >> + bool ret; >> + >> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) >> + return true; >> + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) >> + return true; >> + >> + if (work_busy(&ls->ls_fence_work.work)) >> + return false; >> + /* Schedule work to do the fence operation */ >> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >> + if (!ret) { >> + /* >> + * If there is no pending work, mod_delayed_work queues >> + * new task. While fencing is in progress, a reference >> + * count is added to the layout stateid to ensure its >> + * validity. This reference count is released once fencing >> + * has been completed. >> + */ >> + refcount_inc(&ls->ls_stid.sc_count); >> + ++ls->ls_fence_retry_cnt; >> + return true; > The cases where the fencing didn't work after too many retries, or the > job couldn't be queued should probably get a pr_warn or something. The > admin needs to know that data corruption is possible and that they > might need to nuke the client manually. 
> >> + } >> + return false; >> +} >> + >> static const struct lease_manager_operations nfsd4_layouts_lm_ops = { >> .lm_break = nfsd4_layout_lm_break, >> .lm_change = nfsd4_layout_lm_change, >> .lm_open_conflict = nfsd4_layout_lm_open_conflict, >> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, >> }; >> >> int >> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >> index 583c13b5aaf3..a57fa3318362 100644 >> --- a/fs/nfsd/nfs4state.c >> +++ b/fs/nfsd/nfs4state.c >> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, >> #endif >> #ifdef CONFIG_NFSD_SCSILAYOUT >> xa_init(&clp->cl_dev_fences); >> + mutex_init(&clp->cl_fence_mutex); >> #endif >> INIT_LIST_HEAD(&clp->async_copies); >> spin_lock_init(&clp->async_lock); >> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h >> index 713f55ef6554..57e54dfb406c 100644 >> --- a/fs/nfsd/state.h >> +++ b/fs/nfsd/state.h >> @@ -529,6 +529,7 @@ struct nfs4_client { >> time64_t cl_ra_time; >> #ifdef CONFIG_NFSD_SCSILAYOUT >> struct xarray cl_dev_fences; >> + struct mutex cl_fence_mutex; >> #endif >> }; >> >> @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { >> stateid_t ls_recall_sid; >> bool ls_recalled; >> struct mutex ls_mutex; >> + struct delayed_work ls_fence_work; >> + bool ls_fenced; >> + int ls_fence_retry_cnt; >> }; >> >> +#define LO_MAX_FENCE_RETRY 5 >> + >> static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) >> { >> return container_of(s, struct nfs4_layout_stateid, ls_stid); >> diff --git a/include/linux/filelock.h b/include/linux/filelock.h >> index 2f5e5588ee07..13b9c9f04589 100644 >> --- a/include/linux/filelock.h >> +++ b/include/linux/filelock.h >> @@ -50,6 +50,7 @@ struct lease_manager_operations { >> void (*lm_setup)(struct file_lease *, void **); >> bool (*lm_breaker_owns_lease)(struct file_lease *); >> int (*lm_open_conflict)(struct file *, int); >> + bool (*lm_breaker_timedout)(struct file_lease *fl); >> }; >> >> struct lock_manager { ^ 
permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-28 17:25 ` Dai Ngo @ 2026-01-28 18:01 ` Jeff Layton 2026-01-28 18:27 ` Dai Ngo 0 siblings, 1 reply; 12+ messages in thread From: Jeff Layton @ 2026-01-28 18:01 UTC (permalink / raw) To: Dai Ngo, chuck.lever, neil, okorniev, tom, hch, alex.aring, viro, brauner, jack Cc: linux-fsdevel, linux-nfs On Wed, 2026-01-28 at 09:25 -0800, Dai Ngo wrote: > On 1/27/26 9:54 AM, Jeff Layton wrote: > > On Mon, 2026-01-26 at 16:50 -0800, Dai Ngo wrote: > > > When a layout conflict triggers a recall, enforcing a timeout is > > > necessary to prevent excessive nfsd threads from being blocked in > > > __break_lease ensuring the server continues servicing incoming > > > requests efficiently. > > > > > > This patch introduces a new function to lease_manager_operations: > > > > > > lm_breaker_timedout: Invoked when a lease recall times out and is > > > about to be disposed of. This function enables the lease manager > > > to inform the caller whether the file_lease should remain on the > > > flc_list or be disposed of. > > > > > > For the NFSD lease manager, this function now handles layout recall > > > timeouts. If the layout type supports fencing and the client has not > > > been fenced, a fence operation is triggered to prevent the client > > > from accessing the block device. > > > > > > While the fencing operation is in progress, the conflicting file_lease > > > remains on the flc_list until fencing is complete. This guarantees > > > that no other clients can access the file, and the client with > > > exclusive access is properly blocked before disposal. 
> > > > > > Signed-off-by: Dai Ngo <dai.ngo@oracle.com> > > > --- > > > Documentation/filesystems/locking.rst | 2 + > > > fs/locks.c | 10 +++- > > > fs/nfsd/blocklayout.c | 38 ++++++------- > > > fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- > > > fs/nfsd/nfs4state.c | 1 + > > > fs/nfsd/state.h | 6 ++ > > > include/linux/filelock.h | 1 + > > > 7 files changed, 110 insertions(+), 27 deletions(-) > > > > > > v2: > > > . Update Subject line to include fencing operation. > > > . Allow conflicting lease to remain on flc_list until fencing > > > is complete. > > > . Use system worker to perform fencing operation asynchronously. > > > . Use nfs4_stid.sc_count to ensure layout stateid remains > > > valid before starting the fencing operation, nfs4_stid.sc_count > > > is released after fencing operation is complete. > > > . Rework nfsd4_scsi_fence_client to: > > > . wait until fencing to complete before exiting. > > > . wait until fencing in progress to complete before > > > checking the NFSD_MDS_PR_FENCED flag. > > > . Remove lm_need_to_retry from lease_manager_operations. > > > v3: > > > . correct locking requirement in locking.rst. > > > . add max retry count to fencing operation. > > > . add missing nfs4_put_stid in nfsd4_layout_fence_worker. > > > . remove special-casing of FL_LAYOUT in lease_modify. > > > . remove lease_want_dispose. > > > . move lm_breaker_timedout call to time_out_leases. > > > v4: > > > . only increment ls_fence_retry_cnt after successfully > > > schedule new work in nfsd4_layout_lm_breaker_timedout. 
> > > > > > diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst > > > index 04c7691e50e0..a339491f02e4 100644 > > > --- a/Documentation/filesystems/locking.rst > > > +++ b/Documentation/filesystems/locking.rst > > > @@ -403,6 +403,7 @@ prototypes:: > > > bool (*lm_breaker_owns_lease)(struct file_lock *); > > > bool (*lm_lock_expirable)(struct file_lock *); > > > void (*lm_expire_lock)(void); > > > + void (*lm_breaker_timedout)(struct file_lease *); > > > > > > locking rules: > > > > > > @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no > > > lm_lock_expirable yes no no > > > lm_expire_lock no no yes > > > lm_open_conflict yes no no > > > +lm_breaker_timedout yes no no > > > ====================== ============= ================= ========= > > > > > > buffer_head > > > diff --git a/fs/locks.c b/fs/locks.c > > > index 46f229f740c8..1b63aa704598 100644 > > > --- a/fs/locks.c > > > +++ b/fs/locks.c > > > @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) > > > { > > > struct file_lock_context *ctx = inode->i_flctx; > > > struct file_lease *fl, *tmp; > > > + bool remove = true; > > > > > > lockdep_assert_held(&ctx->flc_lock); > > > > > > @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) > > > trace_time_out_leases(inode, fl); > > > if (past_time(fl->fl_downgrade_time)) > > > lease_modify(fl, F_RDLCK, dispose); > > > - if (past_time(fl->fl_break_time)) > > > - lease_modify(fl, F_UNLCK, dispose); > > > + > > > + if (past_time(fl->fl_break_time)) { > > > + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) > > > + remove = fl->fl_lmops->lm_breaker_timedout(fl); > > > + if (remove) > > > + lease_modify(fl, F_UNLCK, dispose); > > I'd not bother with the return code to lm_breaker_timedout. > > > > Make it void return and have it call lease_modify itself before > > returning in the cases where you have it returning true. 
If the > > operation isn't defined then just do the lease_modify here like we > > always have. > > The first call to lm_breaker_timedout schedules the fence worker so > fencing is not done yet. If lm_breaker_timedout return to time_out_leases > without any return value then lease_modify is called to remove the lease. > > Am i missing something? > > -Dai > if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) fl->fl_lmops->lm_breaker_timedout(fl); else lease_modify(fl, F_UNLCK, dispose); ... the lm_breaker_timedout() function (or the workqueue job it spawns) is then responsible for calling lease_modify() at the appropriate time. In most cases you're going to need to queue a workqueue job to fence, and you don't want to clear out the lease until after it's done. > > > > > + } > > > } > > > } > > > > > > diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c > > > index 7ba9e2dd0875..69d3889df302 100644 > > > --- a/fs/nfsd/blocklayout.c > > > +++ b/fs/nfsd/blocklayout.c > > > @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, > > > return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); > > > } > > > > > > +/* > > > + * Perform the fence operation to prevent the client from accessing the > > > + * block device. If a fence operation is already in progress, wait for > > > + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the > > > + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, > > > + * update the layout stateid by setting the ls_fenced flag to indicate > > > + * that the client has been fenced. 
> > > + */ > > > static void > > > nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) > > > { > > > @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) > > > struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; > > > int status; > > > > > > - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) > > > + mutex_lock(&clp->cl_fence_mutex); > > > + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { > > > + ls->ls_fenced = true; > > > + mutex_unlock(&clp->cl_fence_mutex); > > > + nfs4_put_stid(&ls->ls_stid); > > > return; > > > + } > > > > > I don't understand what this new mutex is protecting, and this all > > seems overly complex. If feels kind of like you want nfsd to be driving > > the fencing retries, but I don't think we really do. Here's what I'd > > do. > > > > I'd just make ->fence_client a bool or int return, and have it indicate > > whether the client was successfully fenced or not. If it was > > successfully fenced, then have the caller call lease_modify() to remove > > the lease. If it wasn't successfully fenced, have the caller (the > > workqueue job) requeue itself if you want to retry. If the caller is > > ready to give up, then call lease_modify() on it and remove it (and > > probably throw a pr_warn()). > > > > > status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, > > > nfsd4_scsi_pr_key(clp), > > > PR_EXCLUSIVE_ACCESS_REG_ONLY, true); > > > - /* > > > - * Reset to allow retry only when the command could not have > > > - * reached the device. Negative status means a local error > > > - * (e.g., -ENOMEM) prevented the command from being sent. > > > - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and > > > - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures > > > - * before device delivery. > > > - * > > > - * For all other errors, the command may have reached the device > > > - * and the preempt may have succeeded. 
Avoid resetting, since > > > - * retrying a successful preempt returns PR_STS_IOERR or > > > - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite > > > - * retry loop. > > > - */ > > > - if (status < 0 || > > > - status == PR_STS_PATH_FAILED || > > > - status == PR_STS_PATH_FAST_FAILED || > > > - status == PR_STS_RETRY_PATH_FAILURE) > > > + if (status) > > > nfsd4_scsi_fence_clear(clp, bdev->bd_dev); > > > + else > > > + ls->ls_fenced = true; > > > + mutex_unlock(&clp->cl_fence_mutex); > > > + nfs4_put_stid(&ls->ls_stid); > > > > > > trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); > > > } > > > diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c > > > index ad7af8cfcf1f..1c498f3cd059 100644 > > > --- a/fs/nfsd/nfs4layouts.c > > > +++ b/fs/nfsd/nfs4layouts.c > > > @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) > > > return 0; > > > } > > > > > > +static void > > > +nfsd4_layout_fence_worker(struct work_struct *work) > > > +{ > > > + struct nfsd_file *nf; > > > + struct delayed_work *dwork = to_delayed_work(work); > > > + struct nfs4_layout_stateid *ls = container_of(dwork, > > > + struct nfs4_layout_stateid, ls_fence_work); > > > + u32 type; > > > + > > > + rcu_read_lock(); > > > + nf = nfsd_file_get(ls->ls_file); > > > + rcu_read_unlock(); > > > + if (!nf) { > > > + nfs4_put_stid(&ls->ls_stid); > > > + return; > > > + } > > > + > > > + type = ls->ls_layout_type; > > > + if (nfsd4_layout_ops[type]->fence_client) > > > + nfsd4_layout_ops[type]->fence_client(ls, nf); > > If you make fence_client an int/bool return, then you could just > > requeue this job to try it again. 
> > > > > + nfsd_file_put(nf); > > > +} > > > + > > > static struct nfs4_layout_stateid * > > > nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, > > > struct nfs4_stid *parent, u32 layout_type) > > > @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, > > > list_add(&ls->ls_perfile, &fp->fi_lo_states); > > > spin_unlock(&fp->fi_lock); > > > > > > + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); > > > + ls->ls_fenced = false; > > > + ls->ls_fence_retry_cnt = 0; > > > + > > > trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); > > > return ls; > > > } > > > @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) > > > rcu_read_unlock(); > > > if (fl) { > > > ops = nfsd4_layout_ops[ls->ls_layout_type]; > > > - if (ops->fence_client) > > > + if (ops->fence_client) { > > > + refcount_inc(&ls->ls_stid.sc_count); > > > ops->fence_client(ls, fl); > > > - else > > > + } else > > > nfsd4_cb_layout_fail(ls, fl); > > > nfsd_file_put(fl); > > > } > > > @@ -747,11 +775,9 @@ static bool > > > nfsd4_layout_lm_break(struct file_lease *fl) > > > { > > > /* > > > - * We don't want the locks code to timeout the lease for us; > > > - * we'll remove it ourself if a layout isn't returned > > > - * in time: > > > + * Enforce break lease timeout to prevent NFSD > > > + * thread from hanging in __break_lease. > > > */ > > > - fl->fl_break_time = 0; > > > nfsd4_recall_file_layout(fl->c.flc_owner); > > > return false; > > > } > > > @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg) > > > return 0; > > > } > > > > > > +/** > > > + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. > > > + * If the layout type supports a fence operation, schedule a worker to > > > + * fence the client from accessing the block device. 
> > > + * > > > + * @fl: file to check > > > + * > > > + * Return true if the file lease should be disposed of by the caller; > > > + * otherwise, return false. > > > + */ > > > +static bool > > > +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) > > > +{ > > > + struct nfs4_layout_stateid *ls = fl->c.flc_owner; > > > + bool ret; > > > + > > > + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) > > > + return true; > > > + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) > > > + return true; > > > + > > > + if (work_busy(&ls->ls_fence_work.work)) > > > + return false; > > > + /* Schedule work to do the fence operation */ > > > + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); > > > + if (!ret) { > > > + /* > > > + * If there is no pending work, mod_delayed_work queues > > > + * new task. While fencing is in progress, a reference > > > + * count is added to the layout stateid to ensure its > > > + * validity. This reference count is released once fencing > > > + * has been completed. > > > + */ > > > + refcount_inc(&ls->ls_stid.sc_count); > > > + ++ls->ls_fence_retry_cnt; > > > + return true; > > The cases where the fencing didn't work after too many retries, or the > > job couldn't be queued should probably get a pr_warn or something. The > > admin needs to know that data corruption is possible and that they > > might need to nuke the client manually. 
> > > > > + } > > > + return false; > > > +} > > > + > > > static const struct lease_manager_operations nfsd4_layouts_lm_ops = { > > > .lm_break = nfsd4_layout_lm_break, > > > .lm_change = nfsd4_layout_lm_change, > > > .lm_open_conflict = nfsd4_layout_lm_open_conflict, > > > + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, > > > }; > > > > > > int > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > > > index 583c13b5aaf3..a57fa3318362 100644 > > > --- a/fs/nfsd/nfs4state.c > > > +++ b/fs/nfsd/nfs4state.c > > > @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, > > > #endif > > > #ifdef CONFIG_NFSD_SCSILAYOUT > > > xa_init(&clp->cl_dev_fences); > > > + mutex_init(&clp->cl_fence_mutex); > > > #endif > > > INIT_LIST_HEAD(&clp->async_copies); > > > spin_lock_init(&clp->async_lock); > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > > > index 713f55ef6554..57e54dfb406c 100644 > > > --- a/fs/nfsd/state.h > > > +++ b/fs/nfsd/state.h > > > @@ -529,6 +529,7 @@ struct nfs4_client { > > > time64_t cl_ra_time; > > > #ifdef CONFIG_NFSD_SCSILAYOUT > > > struct xarray cl_dev_fences; > > > + struct mutex cl_fence_mutex; > > > #endif > > > }; > > > > > > @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { > > > stateid_t ls_recall_sid; > > > bool ls_recalled; > > > struct mutex ls_mutex; > > > + struct delayed_work ls_fence_work; > > > + bool ls_fenced; > > > + int ls_fence_retry_cnt; > > > }; > > > > > > +#define LO_MAX_FENCE_RETRY 5 > > > + > > > static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) > > > { > > > return container_of(s, struct nfs4_layout_stateid, ls_stid); > > > diff --git a/include/linux/filelock.h b/include/linux/filelock.h > > > index 2f5e5588ee07..13b9c9f04589 100644 > > > --- a/include/linux/filelock.h > > > +++ b/include/linux/filelock.h > > > @@ -50,6 +50,7 @@ struct lease_manager_operations { > > > void (*lm_setup)(struct file_lease *, void **); > > > bool 
(*lm_breaker_owns_lease)(struct file_lease *); > > > int (*lm_open_conflict)(struct file *, int); > > > + bool (*lm_breaker_timedout)(struct file_lease *fl); > > > }; > > > > > > struct lock_manager { -- Jeff Layton <jlayton@kernel.org> ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-28 18:01 ` Jeff Layton @ 2026-01-28 18:27 ` Dai Ngo 0 siblings, 0 replies; 12+ messages in thread From: Dai Ngo @ 2026-01-28 18:27 UTC (permalink / raw) To: Jeff Layton, chuck.lever, neil, okorniev, tom, hch, alex.aring, viro, brauner, jack Cc: linux-fsdevel, linux-nfs On 1/28/26 10:01 AM, Jeff Layton wrote: > On Wed, 2026-01-28 at 09:25 -0800, Dai Ngo wrote: >> On 1/27/26 9:54 AM, Jeff Layton wrote: >>> On Mon, 2026-01-26 at 16:50 -0800, Dai Ngo wrote: >>>> When a layout conflict triggers a recall, enforcing a timeout is >>>> necessary to prevent excessive nfsd threads from being blocked in >>>> __break_lease ensuring the server continues servicing incoming >>>> requests efficiently. >>>> >>>> This patch introduces a new function to lease_manager_operations: >>>> >>>> lm_breaker_timedout: Invoked when a lease recall times out and is >>>> about to be disposed of. This function enables the lease manager >>>> to inform the caller whether the file_lease should remain on the >>>> flc_list or be disposed of. >>>> >>>> For the NFSD lease manager, this function now handles layout recall >>>> timeouts. If the layout type supports fencing and the client has not >>>> been fenced, a fence operation is triggered to prevent the client >>>> from accessing the block device. >>>> >>>> While the fencing operation is in progress, the conflicting file_lease >>>> remains on the flc_list until fencing is complete. This guarantees >>>> that no other clients can access the file, and the client with >>>> exclusive access is properly blocked before disposal. 
>>>> >>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >>>> --- >>>> Documentation/filesystems/locking.rst | 2 + >>>> fs/locks.c | 10 +++- >>>> fs/nfsd/blocklayout.c | 38 ++++++------- >>>> fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- >>>> fs/nfsd/nfs4state.c | 1 + >>>> fs/nfsd/state.h | 6 ++ >>>> include/linux/filelock.h | 1 + >>>> 7 files changed, 110 insertions(+), 27 deletions(-) >>>> >>>> v2: >>>> . Update Subject line to include fencing operation. >>>> . Allow conflicting lease to remain on flc_list until fencing >>>> is complete. >>>> . Use system worker to perform fencing operation asynchronously. >>>> . Use nfs4_stid.sc_count to ensure layout stateid remains >>>> valid before starting the fencing operation, nfs4_stid.sc_count >>>> is released after fencing operation is complete. >>>> . Rework nfsd4_scsi_fence_client to: >>>> . wait until fencing to complete before exiting. >>>> . wait until fencing in progress to complete before >>>> checking the NFSD_MDS_PR_FENCED flag. >>>> . Remove lm_need_to_retry from lease_manager_operations. >>>> v3: >>>> . correct locking requirement in locking.rst. >>>> . add max retry count to fencing operation. >>>> . add missing nfs4_put_stid in nfsd4_layout_fence_worker. >>>> . remove special-casing of FL_LAYOUT in lease_modify. >>>> . remove lease_want_dispose. >>>> . move lm_breaker_timedout call to time_out_leases. >>>> v4: >>>> . only increment ls_fence_retry_cnt after successfully >>>> schedule new work in nfsd4_layout_lm_breaker_timedout. 
>>>> >>>> diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst >>>> index 04c7691e50e0..a339491f02e4 100644 >>>> --- a/Documentation/filesystems/locking.rst >>>> +++ b/Documentation/filesystems/locking.rst >>>> @@ -403,6 +403,7 @@ prototypes:: >>>> bool (*lm_breaker_owns_lease)(struct file_lock *); >>>> bool (*lm_lock_expirable)(struct file_lock *); >>>> void (*lm_expire_lock)(void); >>>> + void (*lm_breaker_timedout)(struct file_lease *); >>>> >>>> locking rules: >>>> >>>> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no >>>> lm_lock_expirable yes no no >>>> lm_expire_lock no no yes >>>> lm_open_conflict yes no no >>>> +lm_breaker_timedout yes no no >>>> ====================== ============= ================= ========= >>>> >>>> buffer_head >>>> diff --git a/fs/locks.c b/fs/locks.c >>>> index 46f229f740c8..1b63aa704598 100644 >>>> --- a/fs/locks.c >>>> +++ b/fs/locks.c >>>> @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) >>>> { >>>> struct file_lock_context *ctx = inode->i_flctx; >>>> struct file_lease *fl, *tmp; >>>> + bool remove = true; >>>> >>>> lockdep_assert_held(&ctx->flc_lock); >>>> >>>> @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) >>>> trace_time_out_leases(inode, fl); >>>> if (past_time(fl->fl_downgrade_time)) >>>> lease_modify(fl, F_RDLCK, dispose); >>>> - if (past_time(fl->fl_break_time)) >>>> - lease_modify(fl, F_UNLCK, dispose); >>>> + >>>> + if (past_time(fl->fl_break_time)) { >>>> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) >>>> + remove = fl->fl_lmops->lm_breaker_timedout(fl); >>>> + if (remove) >>>> + lease_modify(fl, F_UNLCK, dispose); >>> I'd not bother with the return code to lm_breaker_timedout. >>> >>> Make it void return and have it call lease_modify itself before >>> returning in the cases where you have it returning true. 
If the >>> operation isn't defined then just do the lease_modify here like we >>> always have. >> The first call to lm_breaker_timedout schedules the fence worker so >> fencing is not done yet. If lm_breaker_timedout return to time_out_leases >> without any return value then lease_modify is called to remove the lease. >> >> Am i missing something? >> >> -Dai >> > if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) > fl->fl_lmops->lm_breaker_timedout(fl); > else > lease_modify(fl, F_UNLCK, dispose); > > ... the lm_breaker_timedout() function (or the workqueue job it spawns) > is then responsible for calling lease_modify() at the appropriate time. > > In most cases you're going to need to queue a workqueue job to fence, > and you don't want to clear out the lease until after it's done. Oh I see, thank you! -Dai > > >>>> + } >>>> } >>>> } >>>> >>>> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c >>>> index 7ba9e2dd0875..69d3889df302 100644 >>>> --- a/fs/nfsd/blocklayout.c >>>> +++ b/fs/nfsd/blocklayout.c >>>> @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, >>>> return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); >>>> } >>>> >>>> +/* >>>> + * Perform the fence operation to prevent the client from accessing the >>>> + * block device. If a fence operation is already in progress, wait for >>>> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the >>>> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, >>>> + * update the layout stateid by setting the ls_fenced flag to indicate >>>> + * that the client has been fenced. 
>>>> + */ >>>> static void >>>> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) >>>> { >>>> @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) >>>> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; >>>> int status; >>>> >>>> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) >>>> + mutex_lock(&clp->cl_fence_mutex); >>>> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { >>>> + ls->ls_fenced = true; >>>> + mutex_unlock(&clp->cl_fence_mutex); >>>> + nfs4_put_stid(&ls->ls_stid); >>>> return; >>>> + } >>>> >>> I don't understand what this new mutex is protecting, and this all >>> seems overly complex. If feels kind of like you want nfsd to be driving >>> the fencing retries, but I don't think we really do. Here's what I'd >>> do. >>> >>> I'd just make ->fence_client a bool or int return, and have it indicate >>> whether the client was successfully fenced or not. If it was >>> successfully fenced, then have the caller call lease_modify() to remove >>> the lease. If it wasn't successfully fenced, have the caller (the >>> workqueue job) requeue itself if you want to retry. If the caller is >>> ready to give up, then call lease_modify() on it and remove it (and >>> probably throw a pr_warn()). >>> >>>> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, >>>> nfsd4_scsi_pr_key(clp), >>>> PR_EXCLUSIVE_ACCESS_REG_ONLY, true); >>>> - /* >>>> - * Reset to allow retry only when the command could not have >>>> - * reached the device. Negative status means a local error >>>> - * (e.g., -ENOMEM) prevented the command from being sent. >>>> - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and >>>> - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures >>>> - * before device delivery. >>>> - * >>>> - * For all other errors, the command may have reached the device >>>> - * and the preempt may have succeeded. 
Avoid resetting, since >>>> - * retrying a successful preempt returns PR_STS_IOERR or >>>> - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite >>>> - * retry loop. >>>> - */ >>>> - if (status < 0 || >>>> - status == PR_STS_PATH_FAILED || >>>> - status == PR_STS_PATH_FAST_FAILED || >>>> - status == PR_STS_RETRY_PATH_FAILURE) >>>> + if (status) >>>> nfsd4_scsi_fence_clear(clp, bdev->bd_dev); >>>> + else >>>> + ls->ls_fenced = true; >>>> + mutex_unlock(&clp->cl_fence_mutex); >>>> + nfs4_put_stid(&ls->ls_stid); >>>> >>>> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); >>>> } >>>> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c >>>> index ad7af8cfcf1f..1c498f3cd059 100644 >>>> --- a/fs/nfsd/nfs4layouts.c >>>> +++ b/fs/nfsd/nfs4layouts.c >>>> @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) >>>> return 0; >>>> } >>>> >>>> +static void >>>> +nfsd4_layout_fence_worker(struct work_struct *work) >>>> +{ >>>> + struct nfsd_file *nf; >>>> + struct delayed_work *dwork = to_delayed_work(work); >>>> + struct nfs4_layout_stateid *ls = container_of(dwork, >>>> + struct nfs4_layout_stateid, ls_fence_work); >>>> + u32 type; >>>> + >>>> + rcu_read_lock(); >>>> + nf = nfsd_file_get(ls->ls_file); >>>> + rcu_read_unlock(); >>>> + if (!nf) { >>>> + nfs4_put_stid(&ls->ls_stid); >>>> + return; >>>> + } >>>> + >>>> + type = ls->ls_layout_type; >>>> + if (nfsd4_layout_ops[type]->fence_client) >>>> + nfsd4_layout_ops[type]->fence_client(ls, nf); >>> If you make fence_client an int/bool return, then you could just >>> requeue this job to try it again. 
>>> >>>> + nfsd_file_put(nf); >>>> +} >>>> + >>>> static struct nfs4_layout_stateid * >>>> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >>>> struct nfs4_stid *parent, u32 layout_type) >>>> @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >>>> list_add(&ls->ls_perfile, &fp->fi_lo_states); >>>> spin_unlock(&fp->fi_lock); >>>> >>>> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); >>>> + ls->ls_fenced = false; >>>> + ls->ls_fence_retry_cnt = 0; >>>> + >>>> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); >>>> return ls; >>>> } >>>> @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) >>>> rcu_read_unlock(); >>>> if (fl) { >>>> ops = nfsd4_layout_ops[ls->ls_layout_type]; >>>> - if (ops->fence_client) >>>> + if (ops->fence_client) { >>>> + refcount_inc(&ls->ls_stid.sc_count); >>>> ops->fence_client(ls, fl); >>>> - else >>>> + } else >>>> nfsd4_cb_layout_fail(ls, fl); >>>> nfsd_file_put(fl); >>>> } >>>> @@ -747,11 +775,9 @@ static bool >>>> nfsd4_layout_lm_break(struct file_lease *fl) >>>> { >>>> /* >>>> - * We don't want the locks code to timeout the lease for us; >>>> - * we'll remove it ourself if a layout isn't returned >>>> - * in time: >>>> + * Enforce break lease timeout to prevent NFSD >>>> + * thread from hanging in __break_lease. >>>> */ >>>> - fl->fl_break_time = 0; >>>> nfsd4_recall_file_layout(fl->c.flc_owner); >>>> return false; >>>> } >>>> @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg) >>>> return 0; >>>> } >>>> >>>> +/** >>>> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. >>>> + * If the layout type supports a fence operation, schedule a worker to >>>> + * fence the client from accessing the block device. >>>> + * >>>> + * @fl: file to check >>>> + * >>>> + * Return true if the file lease should be disposed of by the caller; >>>> + * otherwise, return false. 
>>>> + */ >>>> +static bool >>>> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) >>>> +{ >>>> + struct nfs4_layout_stateid *ls = fl->c.flc_owner; >>>> + bool ret; >>>> + >>>> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) >>>> + return true; >>>> + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) >>>> + return true; >>>> + >>>> + if (work_busy(&ls->ls_fence_work.work)) >>>> + return false; >>>> + /* Schedule work to do the fence operation */ >>>> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >>>> + if (!ret) { >>>> + /* >>>> + * If there is no pending work, mod_delayed_work queues >>>> + * new task. While fencing is in progress, a reference >>>> + * count is added to the layout stateid to ensure its >>>> + * validity. This reference count is released once fencing >>>> + * has been completed. >>>> + */ >>>> + refcount_inc(&ls->ls_stid.sc_count); >>>> + ++ls->ls_fence_retry_cnt; >>>> + return true; >>> The cases where the fencing didn't work after too many retries, or the >>> job couldn't be queued should probably get a pr_warn or something. The >>> admin needs to know that data corruption is possible and that they >>> might need to nuke the client manually. 
>>> >>>> + } >>>> + return false; >>>> +} >>>> + >>>> static const struct lease_manager_operations nfsd4_layouts_lm_ops = { >>>> .lm_break = nfsd4_layout_lm_break, >>>> .lm_change = nfsd4_layout_lm_change, >>>> .lm_open_conflict = nfsd4_layout_lm_open_conflict, >>>> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, >>>> }; >>>> >>>> int >>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >>>> index 583c13b5aaf3..a57fa3318362 100644 >>>> --- a/fs/nfsd/nfs4state.c >>>> +++ b/fs/nfsd/nfs4state.c >>>> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, >>>> #endif >>>> #ifdef CONFIG_NFSD_SCSILAYOUT >>>> xa_init(&clp->cl_dev_fences); >>>> + mutex_init(&clp->cl_fence_mutex); >>>> #endif >>>> INIT_LIST_HEAD(&clp->async_copies); >>>> spin_lock_init(&clp->async_lock); >>>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h >>>> index 713f55ef6554..57e54dfb406c 100644 >>>> --- a/fs/nfsd/state.h >>>> +++ b/fs/nfsd/state.h >>>> @@ -529,6 +529,7 @@ struct nfs4_client { >>>> time64_t cl_ra_time; >>>> #ifdef CONFIG_NFSD_SCSILAYOUT >>>> struct xarray cl_dev_fences; >>>> + struct mutex cl_fence_mutex; >>>> #endif >>>> }; >>>> >>>> @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { >>>> stateid_t ls_recall_sid; >>>> bool ls_recalled; >>>> struct mutex ls_mutex; >>>> + struct delayed_work ls_fence_work; >>>> + bool ls_fenced; >>>> + int ls_fence_retry_cnt; >>>> }; >>>> >>>> +#define LO_MAX_FENCE_RETRY 5 >>>> + >>>> static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) >>>> { >>>> return container_of(s, struct nfs4_layout_stateid, ls_stid); >>>> diff --git a/include/linux/filelock.h b/include/linux/filelock.h >>>> index 2f5e5588ee07..13b9c9f04589 100644 >>>> --- a/include/linux/filelock.h >>>> +++ b/include/linux/filelock.h >>>> @@ -50,6 +50,7 @@ struct lease_manager_operations { >>>> void (*lm_setup)(struct file_lease *, void **); >>>> bool (*lm_breaker_owns_lease)(struct file_lease *); >>>> int 
(*lm_open_conflict)(struct file *, int); >>>> + bool (*lm_breaker_timedout)(struct file_lease *fl); >>>> }; >>>> >>>> struct lock_manager { ^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2026-01-28 18:28 UTC | newest] Thread overview: 12+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2026-01-27 0:50 [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing Dai Ngo 2026-01-27 17:11 ` Chuck Lever 2026-01-27 18:14 ` Jeff Layton 2026-01-27 20:38 ` Dai Ngo 2026-01-27 20:36 ` Dai Ngo 2026-01-27 21:53 ` Chuck Lever 2026-01-28 0:50 ` Dai Ngo 2026-01-27 17:54 ` Jeff Layton 2026-01-27 20:52 ` Dai Ngo 2026-01-28 17:25 ` Dai Ngo 2026-01-28 18:01 ` Jeff Layton 2026-01-28 18:27 ` Dai Ngo
This is a public inbox; see mirroring instructions for how to clone and mirror all data and code used by this inbox