* [PATCH v2 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing
@ 2026-01-25 22:21 Dai Ngo
2026-01-26 16:11 ` Chuck Lever
2026-01-26 18:50 ` Jeff Layton
0 siblings, 2 replies; 5+ messages in thread
From: Dai Ngo @ 2026-01-25 22:21 UTC (permalink / raw)
To: chuck.lever, jlayton, neil, okorniev, tom, hch, alex.aring, viro,
brauner, jack
Cc: linux-fsdevel, linux-nfs
When a layout conflict triggers a recall, enforcing a timeout is
necessary to prevent excessive nfsd threads from being blocked in
__break_lease, ensuring the server continues servicing incoming
requests efficiently.
This patch introduces a new function to lease_manager_operations:
lm_breaker_timedout: Invoked when a lease recall times out and is
about to be disposed of. This function enables the lease manager
to inform the caller whether the file_lease should remain on the
flc_list or be disposed of.
For the NFSD lease manager, this function now handles layout recall
timeouts. If the layout type supports fencing and the client has not
been fenced, a fence operation is triggered to prevent the client
from accessing the block device.
The fencing operation is done asynchronously using a system worker. This
allows lease_modify to trigger the fencing operation when a layout
recall times out.
To ensure the layout stateid remains valid while the fencing operation is
in progress, a reference is taken on the layout stateid before
scheduling the system worker to do the fencing operation. The reference
is released after the fencing operation is complete.
While the fencing operation is in progress, the conflicting file_lease
remains on the flc_list until fencing is complete. This guarantees
that no other clients can access the file, and the client with exclusive
access is properly blocked before disposal.
Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
---
Documentation/filesystems/locking.rst | 2 +
fs/locks.c | 29 ++++++++++-
fs/nfsd/blocklayout.c | 19 ++++++-
fs/nfsd/nfs4layouts.c | 74 ++++++++++++++++++++++++---
fs/nfsd/nfs4state.c | 1 +
fs/nfsd/state.h | 3 ++
include/linux/filelock.h | 2 +
7 files changed, 122 insertions(+), 8 deletions(-)
v2:
. Update Subject line to include fencing operation.
. Allow conflicting lease to remain on flc_list until fencing
is complete.
. Use system worker to perform fencing operation asynchronously.
. Use nfs4_stid.sc_count to ensure layout stateid remains
valid before starting the fencing operation, nfs4_stid.sc_count
is released after fencing operation is complete.
. Rework nfsd4_scsi_fence_client to:
. wait for fencing to complete before exiting.
. wait for any fencing in progress to complete before
checking the NFSD_MDS_PR_FENCED flag.
. Remove lm_need_to_retry from lease_manager_operations.
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 04c7691e50e0..f7fe2c1ee32b 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -403,6 +403,7 @@ prototypes::
bool (*lm_breaker_owns_lease)(struct file_lock *);
bool (*lm_lock_expirable)(struct file_lock *);
void (*lm_expire_lock)(void);
+ bool (*lm_breaker_timedout)(struct file_lease *);
locking rules:
@@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no
lm_lock_expirable yes no no
lm_expire_lock no no yes
lm_open_conflict yes no no
+lm_breaker_timedout yes no no
====================== ============= ================= =========
buffer_head
diff --git a/fs/locks.c b/fs/locks.c
index 46f229f740c8..28e63aa87f74 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1487,6 +1487,25 @@ static void lease_clear_pending(struct file_lease *fl, int arg)
}
}
+/*
+ * A layout lease is about to be disposed. If the disposal is
+ * due to a layout recall timeout, consult the lease manager
+ * to see whether the operation should continue.
+ *
+ * Return true if the lease should be disposed else return
+ * false.
+ */
+static bool lease_want_dispose(struct file_lease *fl)
+{
+ if (!(fl->c.flc_flags & FL_BREAKER_TIMEDOUT))
+ return true;
+
+ if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout &&
+ (!fl->fl_lmops->lm_breaker_timedout(fl)))
+ return false;
+ return true;
+}
+
/* We already had a lease on this file; just change its type */
int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose)
{
@@ -1494,6 +1513,11 @@ int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose)
if (error)
return error;
+
+ if ((fl->c.flc_flags & FL_LAYOUT) && (arg == F_UNLCK) &&
+ (!lease_want_dispose(fl)))
+ return 0;
+
lease_clear_pending(fl, arg);
locks_wake_up_blocks(&fl->c);
if (arg == F_UNLCK) {
@@ -1531,8 +1555,11 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
trace_time_out_leases(inode, fl);
if (past_time(fl->fl_downgrade_time))
lease_modify(fl, F_RDLCK, dispose);
- if (past_time(fl->fl_break_time))
+
+ if (past_time(fl->fl_break_time)) {
+ fl->c.flc_flags |= FL_BREAKER_TIMEDOUT;
lease_modify(fl, F_UNLCK, dispose);
+ }
}
}
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 7ba9e2dd0875..05ddff5a4005 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp,
return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
}
+/*
+ * Perform the fence operation to prevent the client from accessing the
+ * block device. If a fence operation is already in progress, wait for
+ * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the
+ * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set,
+ * update the layout stateid by setting the ls_fenced flag to indicate
+ * that the client has been fenced.
+ */
static void
nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
{
@@ -450,8 +458,13 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev;
int status;
- if (nfsd4_scsi_fence_set(clp, bdev->bd_dev))
+ mutex_lock(&clp->cl_fence_mutex);
+ if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) {
+ ls->ls_fenced = true;
+ mutex_unlock(&clp->cl_fence_mutex);
+ nfs4_put_stid(&ls->ls_stid);
return;
+ }
status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
nfsd4_scsi_pr_key(clp),
@@ -475,6 +488,10 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
status == PR_STS_PATH_FAST_FAILED ||
status == PR_STS_RETRY_PATH_FAILURE)
nfsd4_scsi_fence_clear(clp, bdev->bd_dev);
+ else
+ ls->ls_fenced = true;
+ mutex_unlock(&clp->cl_fence_mutex);
+ nfs4_put_stid(&ls->ls_stid);
trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status);
}
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ad7af8cfcf1f..4a11ccd5b0a5 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -222,6 +222,27 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
return 0;
}
+static void
+nfsd4_layout_fence_worker(struct work_struct *work)
+{
+ struct nfsd_file *nf;
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct nfs4_layout_stateid *ls = container_of(dwork,
+ struct nfs4_layout_stateid, ls_fence_work);
+ u32 type;
+
+ rcu_read_lock();
+ nf = nfsd_file_get(ls->ls_file);
+ rcu_read_unlock();
+ if (!nf)
+ return;
+
+ type = ls->ls_layout_type;
+ if (nfsd4_layout_ops[type]->fence_client)
+ nfsd4_layout_ops[type]->fence_client(ls, nf);
+ nfsd_file_put(nf);
+}
+
static struct nfs4_layout_stateid *
nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
struct nfs4_stid *parent, u32 layout_type)
@@ -271,6 +292,9 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
list_add(&ls->ls_perfile, &fp->fi_lo_states);
spin_unlock(&fp->fi_lock);
+ INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker);
+ ls->ls_fenced = false;
+
trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid);
return ls;
}
@@ -708,9 +732,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
rcu_read_unlock();
if (fl) {
ops = nfsd4_layout_ops[ls->ls_layout_type];
- if (ops->fence_client)
+ if (ops->fence_client) {
+ refcount_inc(&ls->ls_stid.sc_count);
ops->fence_client(ls, fl);
- else
+ } else
nfsd4_cb_layout_fail(ls, fl);
nfsd_file_put(fl);
}
@@ -747,11 +772,9 @@ static bool
nfsd4_layout_lm_break(struct file_lease *fl)
{
/*
- * We don't want the locks code to timeout the lease for us;
- * we'll remove it ourself if a layout isn't returned
- * in time:
+ * Enforce break lease timeout to prevent NFSD
+ * thread from hanging in __break_lease.
*/
- fl->fl_break_time = 0;
nfsd4_recall_file_layout(fl->c.flc_owner);
return false;
}
@@ -782,10 +805,49 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg)
return 0;
}
+/**
+ * nfsd4_layout_lm_breaker_timedout - handle a timed-out layout recall
+ * @fl: file lease whose layout recall has timed out
+ *
+ * If the layout type supports a fence operation, schedule a worker to
+ * fence the client from accessing the block device.
+ *
+ * Return: true if the file lease should be disposed of by the caller;
+ * otherwise, false.
+ */
+static bool
+nfsd4_layout_lm_breaker_timedout(struct file_lease *fl)
+{
+ struct nfs4_layout_stateid *ls = fl->c.flc_owner;
+ bool ret;
+
+ if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client)
+ return true;
+ if (ls->ls_fenced)
+ return true;
+
+ if (work_busy(&ls->ls_fence_work.work))
+ return false;
+ /* Schedule work to do the fence operation */
+ ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
+ if (!ret) {
+ /*
+ * If there is no pending work, mod_delayed_work queues
+ * a new task. While fencing is in progress, a reference
+ * count is added to the layout stateid to ensure its
+ * validity. This reference count is released once fencing
+ * has been completed.
+ */
+ refcount_inc(&ls->ls_stid.sc_count);
+ }
+ return false;
+}
+
static const struct lease_manager_operations nfsd4_layouts_lm_ops = {
.lm_break = nfsd4_layout_lm_break,
.lm_change = nfsd4_layout_lm_change,
.lm_open_conflict = nfsd4_layout_lm_open_conflict,
+ .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout,
};
int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 583c13b5aaf3..a57fa3318362 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name,
#endif
#ifdef CONFIG_NFSD_SCSILAYOUT
xa_init(&clp->cl_dev_fences);
+ mutex_init(&clp->cl_fence_mutex);
#endif
INIT_LIST_HEAD(&clp->async_copies);
spin_lock_init(&clp->async_lock);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 713f55ef6554..d9a3c966a35f 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -529,6 +529,7 @@ struct nfs4_client {
time64_t cl_ra_time;
#ifdef CONFIG_NFSD_SCSILAYOUT
struct xarray cl_dev_fences;
+ struct mutex cl_fence_mutex;
#endif
};
@@ -738,6 +739,8 @@ struct nfs4_layout_stateid {
stateid_t ls_recall_sid;
bool ls_recalled;
struct mutex ls_mutex;
+ struct delayed_work ls_fence_work;
+ bool ls_fenced;
};
static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index 2f5e5588ee07..6939952f2088 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -17,6 +17,7 @@
#define FL_OFDLCK 1024 /* lock is "owned" by struct file */
#define FL_LAYOUT 2048 /* outstanding pNFS layout */
#define FL_RECLAIM 4096 /* reclaiming from a reboot server */
+#define FL_BREAKER_TIMEDOUT 8192 /* lease breaker timed out */
#define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)
@@ -50,6 +51,7 @@ struct lease_manager_operations {
void (*lm_setup)(struct file_lease *, void **);
bool (*lm_breaker_owns_lease)(struct file_lease *);
int (*lm_open_conflict)(struct file *, int);
+ bool (*lm_breaker_timedout)(struct file_lease *fl);
};
struct lock_manager {
--
2.47.3
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH v2 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing
2026-01-25 22:21 [PATCH v2 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing Dai Ngo
@ 2026-01-26 16:11 ` Chuck Lever
2026-01-26 17:44 ` Dai Ngo
2026-01-26 18:50 ` Jeff Layton
1 sibling, 1 reply; 5+ messages in thread
From: Chuck Lever @ 2026-01-26 16:11 UTC (permalink / raw)
To: Dai Ngo, Chuck Lever, Jeff Layton, NeilBrown, Olga Kornievskaia,
Tom Talpey, Christoph Hellwig, Alexander Aring, Alexander Viro,
Christian Brauner, Jan Kara
Cc: linux-fsdevel, linux-nfs
On Sun, Jan 25, 2026, at 5:21 PM, Dai Ngo wrote:
> When a layout conflict triggers a recall, enforcing a timeout is
> necessary to prevent excessive nfsd threads from being blocked in
> __break_lease ensuring the server continues servicing incoming
> requests efficiently.
>
> This patch introduces a new function to lease_manager_operations:
>
> lm_breaker_timedout: Invoked when a lease recall times out and is
> about to be disposed of. This function enables the lease manager
> to inform the caller whether the file_lease should remain on the
> flc_list or be disposed of.
>
> For the NFSD lease manager, this function now handles layout recall
> timeouts. If the layout type supports fencing and the client has not
> been fenced, a fence operation is triggered to prevent the client
> from accessing the block device.
>
> Fencing operation is done asynchronously using a system worker. This
> is to allow lease_modify to trigger the fencing opeation when layout
> recall timed out.
>
> To ensure layout stateid remains valid while the fencing operation is
> in progress, a reference count is added to layout stateid before
> schedule the system worker to do the fencing operation. The reference
> count is released after the fencing operation is complete.
>
> While the fencing operation is in progress, the conflicting file_lease
> remains on the flc_list until fencing is complete. This guarantees
> that no other clients can access the file, and the client with exclusive
> access is properly blocked before disposal.
>
> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> ---
> Documentation/filesystems/locking.rst | 2 +
> fs/locks.c | 29 ++++++++++-
> fs/nfsd/blocklayout.c | 19 ++++++-
> fs/nfsd/nfs4layouts.c | 74 ++++++++++++++++++++++++---
> fs/nfsd/nfs4state.c | 1 +
> fs/nfsd/state.h | 3 ++
> include/linux/filelock.h | 2 +
> 7 files changed, 122 insertions(+), 8 deletions(-)
>
> v2:
> . Update Subject line to include fencing operation.
> . Allow conflicting lease to remain on flc_list until fencing
> is complete.
> . Use system worker to perform fencing operation asynchronously.
> . Use nfs4_stid.sc_count to ensure layout stateid remains
> valid before starting the fencing operation, nfs4_stid.sc_count
> is released after fencing operation is complete.
> . Rework nfsd4_scsi_fence_client to:
> . wait until fencing to complete before exiting.
> . wait until fencing in progress to complete before
> checking the NFSD_MDS_PR_FENCED flag.
> . Remove lm_need_to_retry from lease_manager_operations.
>
> diff --git a/Documentation/filesystems/locking.rst
> b/Documentation/filesystems/locking.rst
> index 04c7691e50e0..f7fe2c1ee32b 100644
> --- a/Documentation/filesystems/locking.rst
> +++ b/Documentation/filesystems/locking.rst
> @@ -403,6 +403,7 @@ prototypes::
> bool (*lm_breaker_owns_lease)(struct file_lock *);
> bool (*lm_lock_expirable)(struct file_lock *);
> void (*lm_expire_lock)(void);
> + void (*lm_breaker_timedout)(struct file_lease *);
>
> locking rules:
>
> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no
> lm_lock_expirable yes no no
> lm_expire_lock no no yes
> lm_open_conflict yes no no
> +lm_breaker_timedout no no yes
This might not be consistent with other comments that state
lm_breaker_timedout runs with no locks held and may block.
But looks like the flc_lock spinlock IS held. Documentation
should say flc_lock = yes, may block = no.
> ====================== ============= ================= =========
>
> buffer_head
> diff --git a/fs/locks.c b/fs/locks.c
> index 46f229f740c8..28e63aa87f74 100644
> --- a/fs/locks.c
> +++ b/fs/locks.c
> @@ -1487,6 +1487,25 @@ static void lease_clear_pending(struct
> file_lease *fl, int arg)
> }
> }
>
> +/*
> + * A layout lease is about to be disposed. If the disposal is
> + * due to a layout recall timeout, consult the lease manager
> + * to see whether the operation should continue.
> + *
> + * Return true if the lease should be disposed else return
> + * false.
> + */
> +static bool lease_want_dispose(struct file_lease *fl)
> +{
> + if (!(fl->c.flc_flags & FL_BREAKER_TIMEDOUT))
> + return true;
> +
> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout &&
> + (!fl->fl_lmops->lm_breaker_timedout(fl)))
> + return false;
> + return true;
> +}
> +
> /* We already had a lease on this file; just change its type */
> int lease_modify(struct file_lease *fl, int arg, struct list_head
> *dispose)
> {
> @@ -1494,6 +1513,11 @@ int lease_modify(struct file_lease *fl, int arg,
> struct list_head *dispose)
>
> if (error)
> return error;
> +
> + if ((fl->c.flc_flags & FL_LAYOUT) && (arg == F_UNLCK) &&
> + (!lease_want_dispose(fl)))
> + return 0;
> +
> lease_clear_pending(fl, arg);
> locks_wake_up_blocks(&fl->c);
> if (arg == F_UNLCK) {
> @@ -1531,8 +1555,11 @@ static void time_out_leases(struct inode *inode,
> struct list_head *dispose)
> trace_time_out_leases(inode, fl);
> if (past_time(fl->fl_downgrade_time))
> lease_modify(fl, F_RDLCK, dispose);
> - if (past_time(fl->fl_break_time))
> +
> + if (past_time(fl->fl_break_time)) {
> + fl->c.flc_flags |= FL_BREAKER_TIMEDOUT;
> lease_modify(fl, F_UNLCK, dispose);
> + }
> }
> }
>
> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
> index 7ba9e2dd0875..05ddff5a4005 100644
> --- a/fs/nfsd/blocklayout.c
> +++ b/fs/nfsd/blocklayout.c
> @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode,
> struct svc_rqst *rqstp,
> return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
> }
>
> +/*
> + * Perform the fence operation to prevent the client from accessing the
> + * block device. If a fence operation is already in progress, wait for
> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the
> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set,
> + * update the layout stateid by setting the ls_fenced flag to indicate
> + * that the client has been fenced.
> + */
> static void
> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct
> nfsd_file *file)
> {
> @@ -450,8 +458,13 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid
> *ls, struct nfsd_file *file)
> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev;
> int status;
>
> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev))
> + mutex_lock(&clp->cl_fence_mutex);
> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) {
> + ls->ls_fenced = true;
> + mutex_unlock(&clp->cl_fence_mutex);
> + nfs4_put_stid(&ls->ls_stid);
> return;
> + }
>
> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev,
> NFSD_MDS_PR_KEY,
> nfsd4_scsi_pr_key(clp),
> @@ -475,6 +488,10 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid
> *ls, struct nfsd_file *file)
> status == PR_STS_PATH_FAST_FAILED ||
> status == PR_STS_RETRY_PATH_FAILURE)
> nfsd4_scsi_fence_clear(clp, bdev->bd_dev);
> + else
> + ls->ls_fenced = true;
> + mutex_unlock(&clp->cl_fence_mutex);
> + nfs4_put_stid(&ls->ls_stid);
>
> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status);
> }
> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
> index ad7af8cfcf1f..4a11ccd5b0a5 100644
> --- a/fs/nfsd/nfs4layouts.c
> +++ b/fs/nfsd/nfs4layouts.c
> @@ -222,6 +222,27 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
> return 0;
> }
>
> +static void
> +nfsd4_layout_fence_worker(struct work_struct *work)
> +{
> + struct nfsd_file *nf;
> + struct delayed_work *dwork = to_delayed_work(work);
> + struct nfs4_layout_stateid *ls = container_of(dwork,
> + struct nfs4_layout_stateid, ls_fence_work);
> + u32 type;
> +
> + rcu_read_lock();
> + nf = nfsd_file_get(ls->ls_file);
> + rcu_read_unlock();
> + if (!nf)
> + return;
The refcount was incremented when scheduling the work but is never
released for this early return. Do you need:
if (!nf) {
nfs4_put_stid(&ls->ls_stid);
return;
}
here?
> +
> + type = ls->ls_layout_type;
> + if (nfsd4_layout_ops[type]->fence_client)
> + nfsd4_layout_ops[type]->fence_client(ls, nf);
> + nfsd_file_put(nf);
> +}
> +
> static struct nfs4_layout_stateid *
> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
> struct nfs4_stid *parent, u32 layout_type)
> @@ -271,6 +292,9 @@ nfsd4_alloc_layout_stateid(struct
> nfsd4_compound_state *cstate,
> list_add(&ls->ls_perfile, &fp->fi_lo_states);
> spin_unlock(&fp->fi_lock);
>
> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker);
> + ls->ls_fenced = false;
> +
> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid);
> return ls;
> }
> @@ -708,9 +732,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb,
> struct rpc_task *task)
> rcu_read_unlock();
> if (fl) {
> ops = nfsd4_layout_ops[ls->ls_layout_type];
> - if (ops->fence_client)
> + if (ops->fence_client) {
> + refcount_inc(&ls->ls_stid.sc_count);
> ops->fence_client(ls, fl);
> - else
> + } else
> nfsd4_cb_layout_fail(ls, fl);
> nfsd_file_put(fl);
> }
> @@ -747,11 +772,9 @@ static bool
> nfsd4_layout_lm_break(struct file_lease *fl)
> {
> /*
> - * We don't want the locks code to timeout the lease for us;
> - * we'll remove it ourself if a layout isn't returned
> - * in time:
> + * Enforce break lease timeout to prevent NFSD
> + * thread from hanging in __break_lease.
> */
> - fl->fl_break_time = 0;
> nfsd4_recall_file_layout(fl->c.flc_owner);
> return false;
> }
> @@ -782,10 +805,49 @@ nfsd4_layout_lm_open_conflict(struct file *filp,
> int arg)
> return 0;
> }
>
> +/**
> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out.
> + * If the layout type supports a fence operation, schedule a worker to
> + * fence the client from accessing the block device.
> + *
> + * @fl: file to check
> + *
> + * Return true if the file lease should be disposed of by the caller;
> + * otherwise, return false.
> + */
> +static bool
> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl)
> +{
> + struct nfs4_layout_stateid *ls = fl->c.flc_owner;
> + bool ret;
> +
> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client)
> + return true;
> + if (ls->ls_fenced)
> + return true;
Between this check and mod_delayed_work(), another thread could
complete fencing. This can cause duplicate worker scheduling and
reference count leaks. Should you hold cl_fence_mutex while
checking ls_fenced?
> +
> + if (work_busy(&ls->ls_fence_work.work))
> + return false;
> + /* Schedule work to do the fence operation */
> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
> + if (!ret) {
> + /*
> + * If there is no pending work, mod_delayed_work queues
> + * new task. While fencing is in progress, a reference
> + * count is added to the layout stateid to ensure its
> + * validity. This reference count is released once fencing
> + * has been completed.
> + */
> + refcount_inc(&ls->ls_stid.sc_count);
> + }
> + return false;
> +}
> +
> static const struct lease_manager_operations nfsd4_layouts_lm_ops = {
> .lm_break = nfsd4_layout_lm_break,
> .lm_change = nfsd4_layout_lm_change,
> .lm_open_conflict = nfsd4_layout_lm_open_conflict,
> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout,
> };
>
> int
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 583c13b5aaf3..a57fa3318362 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct
> xdr_netobj name,
> #endif
> #ifdef CONFIG_NFSD_SCSILAYOUT
> xa_init(&clp->cl_dev_fences);
> + mutex_init(&clp->cl_fence_mutex);
> #endif
> INIT_LIST_HEAD(&clp->async_copies);
> spin_lock_init(&clp->async_lock);
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 713f55ef6554..d9a3c966a35f 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -529,6 +529,7 @@ struct nfs4_client {
> time64_t cl_ra_time;
> #ifdef CONFIG_NFSD_SCSILAYOUT
> struct xarray cl_dev_fences;
> + struct mutex cl_fence_mutex;
> #endif
> };
>
> @@ -738,6 +739,8 @@ struct nfs4_layout_stateid {
> stateid_t ls_recall_sid;
> bool ls_recalled;
> struct mutex ls_mutex;
> + struct delayed_work ls_fence_work;
nfsd4_free_layout_stateid() needs to cancel ls_fence_work
explicitly before freeing stateid memory. Otherwise if the
fence worker runs after the stateid is freed, it will
access the freed stateid memory.
> + bool ls_fenced;
> };
>
> static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
> diff --git a/include/linux/filelock.h b/include/linux/filelock.h
> index 2f5e5588ee07..6939952f2088 100644
> --- a/include/linux/filelock.h
> +++ b/include/linux/filelock.h
> @@ -17,6 +17,7 @@
> #define FL_OFDLCK 1024 /* lock is "owned" by struct file */
> #define FL_LAYOUT 2048 /* outstanding pNFS layout */
> #define FL_RECLAIM 4096 /* reclaiming from a reboot server */
> +#define FL_BREAKER_TIMEDOUT 8192 /* lease breaker timed out */
>
> #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)
>
> @@ -50,6 +51,7 @@ struct lease_manager_operations {
> void (*lm_setup)(struct file_lease *, void **);
> bool (*lm_breaker_owns_lease)(struct file_lease *);
> int (*lm_open_conflict)(struct file *, int);
> + bool (*lm_breaker_timedout)(struct file_lease *fl);
> };
>
> struct lock_manager {
> --
> 2.47.3
In nfsd4_scsi_fence_client(), when a device error occurs (e.g.,
PR_STS_IOERR), ls->ls_fenced is set even though the client may
still have storage access.
Also, if fencing consistently fails with retryable errors,
fencing is continually retried with no maximum retry limit. Is
this new lease breaker timeout designed to take care of that
cleanly, or is it something that needs some explicit logic?
I don't immediately spot a mechanism to force lease disposal
if fencing never completes.
--
Chuck Lever
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v2 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing
2026-01-26 16:11 ` Chuck Lever
@ 2026-01-26 17:44 ` Dai Ngo
0 siblings, 0 replies; 5+ messages in thread
From: Dai Ngo @ 2026-01-26 17:44 UTC (permalink / raw)
To: Chuck Lever, Chuck Lever, Jeff Layton, NeilBrown,
Olga Kornievskaia, Tom Talpey, Christoph Hellwig, Alexander Aring,
Alexander Viro, Christian Brauner, Jan Kara
Cc: linux-fsdevel, linux-nfs
On 1/26/26 8:11 AM, Chuck Lever wrote:
>
> On Sun, Jan 25, 2026, at 5:21 PM, Dai Ngo wrote:
>> When a layout conflict triggers a recall, enforcing a timeout is
>> necessary to prevent excessive nfsd threads from being blocked in
>> __break_lease ensuring the server continues servicing incoming
>> requests efficiently.
>>
>> This patch introduces a new function to lease_manager_operations:
>>
>> lm_breaker_timedout: Invoked when a lease recall times out and is
>> about to be disposed of. This function enables the lease manager
>> to inform the caller whether the file_lease should remain on the
>> flc_list or be disposed of.
>>
>> For the NFSD lease manager, this function now handles layout recall
>> timeouts. If the layout type supports fencing and the client has not
>> been fenced, a fence operation is triggered to prevent the client
>> from accessing the block device.
>>
>> Fencing operation is done asynchronously using a system worker. This
>> is to allow lease_modify to trigger the fencing operation when layout
>> recall timed out.
>>
>> To ensure layout stateid remains valid while the fencing operation is
>> in progress, a reference count is added to layout stateid before
>> schedule the system worker to do the fencing operation. The reference
>> count is released after the fencing operation is complete.
>>
>> While the fencing operation is in progress, the conflicting file_lease
>> remains on the flc_list until fencing is complete. This guarantees
>> that no other clients can access the file, and the client with exclusive
>> access is properly blocked before disposal.
>>
>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>> ---
>> Documentation/filesystems/locking.rst | 2 +
>> fs/locks.c | 29 ++++++++++-
>> fs/nfsd/blocklayout.c | 19 ++++++-
>> fs/nfsd/nfs4layouts.c | 74 ++++++++++++++++++++++++---
>> fs/nfsd/nfs4state.c | 1 +
>> fs/nfsd/state.h | 3 ++
>> include/linux/filelock.h | 2 +
>> 7 files changed, 122 insertions(+), 8 deletions(-)
>>
>> v2:
>> . Update Subject line to include fencing operation.
>> . Allow conflicting lease to remain on flc_list until fencing
>> is complete.
>> . Use system worker to perform fencing operation asynchronously.
>> . Use nfs4_stid.sc_count to ensure layout stateid remains
>> valid before starting the fencing operation, nfs4_stid.sc_count
>> is released after fencing operation is complete.
>> . Rework nfsd4_scsi_fence_client to:
>> . wait until fencing to complete before exiting.
>> . wait until fencing in progress to complete before
>> checking the NFSD_MDS_PR_FENCED flag.
>> . Remove lm_need_to_retry from lease_manager_operations.
>>
>> diff --git a/Documentation/filesystems/locking.rst
>> b/Documentation/filesystems/locking.rst
>> index 04c7691e50e0..f7fe2c1ee32b 100644
>> --- a/Documentation/filesystems/locking.rst
>> +++ b/Documentation/filesystems/locking.rst
>> @@ -403,6 +403,7 @@ prototypes::
>> bool (*lm_breaker_owns_lease)(struct file_lock *);
>> bool (*lm_lock_expirable)(struct file_lock *);
>> void (*lm_expire_lock)(void);
>> + void (*lm_breaker_timedout)(struct file_lease *);
>>
>> locking rules:
>>
>> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no
>> lm_lock_expirable yes no no
>> lm_expire_lock no no yes
>> lm_open_conflict yes no no
>> +lm_breaker_timedout no no yes
> This might not be consistent with other comments that state
> lm_breaker_timedout runs with no locks held and may block.
> But looks like the flc_lock spinlock IS held. Documentation
> should say flc_lock = yes, may block = no.
Yes, I forgot to update this doc. Fix in v3.
>
>
>> ====================== ============= ================= =========
>>
>> buffer_head
>> diff --git a/fs/locks.c b/fs/locks.c
>> index 46f229f740c8..28e63aa87f74 100644
>> --- a/fs/locks.c
>> +++ b/fs/locks.c
>> @@ -1487,6 +1487,25 @@ static void lease_clear_pending(struct
>> file_lease *fl, int arg)
>> }
>> }
>>
>> +/*
>> + * A layout lease is about to be disposed. If the disposal is
>> + * due to a layout recall timeout, consult the lease manager
>> + * to see whether the operation should continue.
>> + *
>> + * Return true if the lease should be disposed else return
>> + * false.
>> + */
>> +static bool lease_want_dispose(struct file_lease *fl)
>> +{
>> + if (!(fl->c.flc_flags & FL_BREAKER_TIMEDOUT))
>> + return true;
>> +
>> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout &&
>> + (!fl->fl_lmops->lm_breaker_timedout(fl)))
>> + return false;
>> + return true;
>> +}
>> +
>> /* We already had a lease on this file; just change its type */
>> int lease_modify(struct file_lease *fl, int arg, struct list_head
>> *dispose)
>> {
>> @@ -1494,6 +1513,11 @@ int lease_modify(struct file_lease *fl, int arg,
>> struct list_head *dispose)
>>
>> if (error)
>> return error;
>> +
>> + if ((fl->c.flc_flags & FL_LAYOUT) && (arg == F_UNLCK) &&
>> + (!lease_want_dispose(fl)))
>> + return 0;
>> +
>> lease_clear_pending(fl, arg);
>> locks_wake_up_blocks(&fl->c);
>> if (arg == F_UNLCK) {
>> @@ -1531,8 +1555,11 @@ static void time_out_leases(struct inode *inode,
>> struct list_head *dispose)
>> trace_time_out_leases(inode, fl);
>> if (past_time(fl->fl_downgrade_time))
>> lease_modify(fl, F_RDLCK, dispose);
>> - if (past_time(fl->fl_break_time))
>> +
>> + if (past_time(fl->fl_break_time)) {
>> + fl->c.flc_flags |= FL_BREAKER_TIMEDOUT;
>> lease_modify(fl, F_UNLCK, dispose);
>> + }
>> }
>> }
>>
>> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
>> index 7ba9e2dd0875..05ddff5a4005 100644
>> --- a/fs/nfsd/blocklayout.c
>> +++ b/fs/nfsd/blocklayout.c
>> @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode,
>> struct svc_rqst *rqstp,
>> return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
>> }
>>
>> +/*
>> + * Perform the fence operation to prevent the client from accessing the
>> + * block device. If a fence operation is already in progress, wait for
>> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the
>> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set,
>> + * update the layout stateid by setting the ls_fenced flag to indicate
>> + * that the client has been fenced.
>> + */
>> static void
>> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct
>> nfsd_file *file)
>> {
>> @@ -450,8 +458,13 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid
>> *ls, struct nfsd_file *file)
>> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev;
>> int status;
>>
>> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev))
>> + mutex_lock(&clp->cl_fence_mutex);
>> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) {
>> + ls->ls_fenced = true;
>> + mutex_unlock(&clp->cl_fence_mutex);
>> + nfs4_put_stid(&ls->ls_stid);
>> return;
>> + }
>>
>> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev,
>> NFSD_MDS_PR_KEY,
>> nfsd4_scsi_pr_key(clp),
>> @@ -475,6 +488,10 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid
>> *ls, struct nfsd_file *file)
>> status == PR_STS_PATH_FAST_FAILED ||
>> status == PR_STS_RETRY_PATH_FAILURE)
>> nfsd4_scsi_fence_clear(clp, bdev->bd_dev);
>> + else
>> + ls->ls_fenced = true;
>> + mutex_unlock(&clp->cl_fence_mutex);
>> + nfs4_put_stid(&ls->ls_stid);
>>
>> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status);
>> }
>> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
>> index ad7af8cfcf1f..4a11ccd5b0a5 100644
>> --- a/fs/nfsd/nfs4layouts.c
>> +++ b/fs/nfsd/nfs4layouts.c
>> @@ -222,6 +222,27 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
>> return 0;
>> }
>>
>> +static void
>> +nfsd4_layout_fence_worker(struct work_struct *work)
>> +{
>> + struct nfsd_file *nf;
>> + struct delayed_work *dwork = to_delayed_work(work);
>> + struct nfs4_layout_stateid *ls = container_of(dwork,
>> + struct nfs4_layout_stateid, ls_fence_work);
>> + u32 type;
>> +
>> + rcu_read_lock();
>> + nf = nfsd_file_get(ls->ls_file);
>> + rcu_read_unlock();
>> + if (!nf)
>> + return;
> The refcount was incremented when scheduling the work but is never
> released for this early return. Do you need:
>
> if (!nf) {
> nfs4_put_stid(&ls->ls_stid);
> return;
> }
>
> here?
yes, we do. Fix in v3.
>
>
>> +
>> + type = ls->ls_layout_type;
>> + if (nfsd4_layout_ops[type]->fence_client)
>> + nfsd4_layout_ops[type]->fence_client(ls, nf);
>> + nfsd_file_put(nf);
>> +}
>> +
>> static struct nfs4_layout_stateid *
>> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
>> struct nfs4_stid *parent, u32 layout_type)
>> @@ -271,6 +292,9 @@ nfsd4_alloc_layout_stateid(struct
>> nfsd4_compound_state *cstate,
>> list_add(&ls->ls_perfile, &fp->fi_lo_states);
>> spin_unlock(&fp->fi_lock);
>>
>> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker);
>> + ls->ls_fenced = false;
>> +
>> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid);
>> return ls;
>> }
>> @@ -708,9 +732,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb,
>> struct rpc_task *task)
>> rcu_read_unlock();
>> if (fl) {
>> ops = nfsd4_layout_ops[ls->ls_layout_type];
>> - if (ops->fence_client)
>> + if (ops->fence_client) {
>> + refcount_inc(&ls->ls_stid.sc_count);
>> ops->fence_client(ls, fl);
>> - else
>> + } else
>> nfsd4_cb_layout_fail(ls, fl);
>> nfsd_file_put(fl);
>> }
>> @@ -747,11 +772,9 @@ static bool
>> nfsd4_layout_lm_break(struct file_lease *fl)
>> {
>> /*
>> - * We don't want the locks code to timeout the lease for us;
>> - * we'll remove it ourself if a layout isn't returned
>> - * in time:
>> + * Enforce break lease timeout to prevent NFSD
>> + * thread from hanging in __break_lease.
>> */
>> - fl->fl_break_time = 0;
>> nfsd4_recall_file_layout(fl->c.flc_owner);
>> return false;
>> }
>> @@ -782,10 +805,49 @@ nfsd4_layout_lm_open_conflict(struct file *filp,
>> int arg)
>> return 0;
>> }
>>
>> +/**
>> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out.
>> + * If the layout type supports a fence operation, schedule a worker to
>> + * fence the client from accessing the block device.
>> + *
>> + * @fl: file to check
>> + *
>> + * Return true if the file lease should be disposed of by the caller;
>> + * otherwise, return false.
>> + */
>> +static bool
>> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl)
>> +{
>> + struct nfs4_layout_stateid *ls = fl->c.flc_owner;
>> + bool ret;
>> +
>> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client)
>> + return true;
>> + if (ls->ls_fenced)
>> + return true;
> Between this check and mod_delayed_work(), another thread could
> complete fencing. This can cause duplicate worker scheduling and
> reference count leaks.
This could cause another worker to be scheduled after fencing has already
completed. But the new worker will detect that the device has been
fenced and do nothing. Each worker increments and decrements the reference
count separately, so I don't see how it can leak?
> Should you hold cl_fence_mutex while
> checking ls_fenced?
We can't hold the cl_fence_mutex here since this function runs under
the spin_lock flc_lock.
>
>
>> +
>> + if (work_busy(&ls->ls_fence_work.work))
>> + return false;
>> + /* Schedule work to do the fence operation */
>> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
>> + if (!ret) {
>> + /*
>> + * If there is no pending work, mod_delayed_work queues
>> + * new task. While fencing is in progress, a reference
>> + * count is added to the layout stateid to ensure its
>> + * validity. This reference count is released once fencing
>> + * has been completed.
>> + */
>> + refcount_inc(&ls->ls_stid.sc_count);
>> + }
>> + return false;
>> +}
>> +
>> static const struct lease_manager_operations nfsd4_layouts_lm_ops = {
>> .lm_break = nfsd4_layout_lm_break,
>> .lm_change = nfsd4_layout_lm_change,
>> .lm_open_conflict = nfsd4_layout_lm_open_conflict,
>> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout,
>> };
>>
>> int
>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>> index 583c13b5aaf3..a57fa3318362 100644
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct
>> xdr_netobj name,
>> #endif
>> #ifdef CONFIG_NFSD_SCSILAYOUT
>> xa_init(&clp->cl_dev_fences);
>> + mutex_init(&clp->cl_fence_mutex);
>> #endif
>> INIT_LIST_HEAD(&clp->async_copies);
>> spin_lock_init(&clp->async_lock);
>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>> index 713f55ef6554..d9a3c966a35f 100644
>> --- a/fs/nfsd/state.h
>> +++ b/fs/nfsd/state.h
>> @@ -529,6 +529,7 @@ struct nfs4_client {
>> time64_t cl_ra_time;
>> #ifdef CONFIG_NFSD_SCSILAYOUT
>> struct xarray cl_dev_fences;
>> + struct mutex cl_fence_mutex;
>> #endif
>> };
>>
>> @@ -738,6 +739,8 @@ struct nfs4_layout_stateid {
>> stateid_t ls_recall_sid;
>> bool ls_recalled;
>> struct mutex ls_mutex;
>> + struct delayed_work ls_fence_work;
> nfsd4_free_layout_stateid() needs to cancel ls_fence_work
> explicitly before freeing stateid memory. Otherwise if the
> fence worker runs after the stateid is freed, it will
> access the freed stateid memory.
The layout stateid cannot be freed by nfs4_put_stid once the
fence worker has been scheduled, because the layout stateid's
reference count is incremented at that point.
>
>
>> + bool ls_fenced;
>> };
>>
>> static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
>> diff --git a/include/linux/filelock.h b/include/linux/filelock.h
>> index 2f5e5588ee07..6939952f2088 100644
>> --- a/include/linux/filelock.h
>> +++ b/include/linux/filelock.h
>> @@ -17,6 +17,7 @@
>> #define FL_OFDLCK 1024 /* lock is "owned" by struct file */
>> #define FL_LAYOUT 2048 /* outstanding pNFS layout */
>> #define FL_RECLAIM 4096 /* reclaiming from a reboot server */
>> +#define FL_BREAKER_TIMEDOUT 8192 /* lease breaker timed out */
>>
>> #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)
>>
>> @@ -50,6 +51,7 @@ struct lease_manager_operations {
>> void (*lm_setup)(struct file_lease *, void **);
>> bool (*lm_breaker_owns_lease)(struct file_lease *);
>> int (*lm_open_conflict)(struct file *, int);
>> + bool (*lm_breaker_timedout)(struct file_lease *fl);
>> };
>>
>> struct lock_manager {
>> --
>> 2.47.3
> In nfsd4_scsi_fence_client(), when a device error occurs (e.g.,
> PR_STS_IOERR), ls->ls_fenced is set even though the client may
> still have storage access.
I will revert this check back to (status != 0) as the error condition
instead of checking for selected error codes as it does now.
Fix in v3.
>
> Also, if fencing consistently fails with retryable errors,
> fencing is continually retried with no maximum retry limit. Is
> this new lease breaker timeout designed to take care of that
> cleanly, or is it something that needs some explicit logic?
> I don't immediately spot a mechanism to force lease disposal
> if fencing never completes.
Currently there is no mechanism that prevents the fencing operation
from retrying on errors forever. Would it be sufficient to add a max
count for retries, like 5 times?
Thanks,
-Dai
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v2 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing
2026-01-25 22:21 [PATCH v2 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing Dai Ngo
2026-01-26 16:11 ` Chuck Lever
@ 2026-01-26 18:50 ` Jeff Layton
2026-01-26 20:33 ` Dai Ngo
1 sibling, 1 reply; 5+ messages in thread
From: Jeff Layton @ 2026-01-26 18:50 UTC (permalink / raw)
To: Dai Ngo, chuck.lever, neil, okorniev, tom, hch, alex.aring, viro,
brauner, jack
Cc: linux-fsdevel, linux-nfs
On Sun, 2026-01-25 at 14:21 -0800, Dai Ngo wrote:
> When a layout conflict triggers a recall, enforcing a timeout is
> necessary to prevent excessive nfsd threads from being blocked in
> __break_lease ensuring the server continues servicing incoming
> requests efficiently.
>
> This patch introduces a new function to lease_manager_operations:
>
> lm_breaker_timedout: Invoked when a lease recall times out and is
> about to be disposed of. This function enables the lease manager
> to inform the caller whether the file_lease should remain on the
> flc_list or be disposed of.
>
> For the NFSD lease manager, this function now handles layout recall
> timeouts. If the layout type supports fencing and the client has not
> been fenced, a fence operation is triggered to prevent the client
> from accessing the block device.
>
> Fencing operation is done asynchronously using a system worker. This
> is to allow lease_modify to trigger the fencing operation when layout
> recall timed out.
>
> To ensure layout stateid remains valid while the fencing operation is
> in progress, a reference count is added to layout stateid before
> schedule the system worker to do the fencing operation. The reference
> count is released after the fencing operation is complete.
>
> While the fencing operation is in progress, the conflicting file_lease
> remains on the flc_list until fencing is complete. This guarantees
> that no other clients can access the file, and the client with exclusive
> access is properly blocked before disposal.
>
> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> ---
> Documentation/filesystems/locking.rst | 2 +
> fs/locks.c | 29 ++++++++++-
> fs/nfsd/blocklayout.c | 19 ++++++-
> fs/nfsd/nfs4layouts.c | 74 ++++++++++++++++++++++++---
> fs/nfsd/nfs4state.c | 1 +
> fs/nfsd/state.h | 3 ++
> include/linux/filelock.h | 2 +
> 7 files changed, 122 insertions(+), 8 deletions(-)
>
> v2:
> . Update Subject line to include fencing operation.
> . Allow conflicting lease to remain on flc_list until fencing
> is complete.
> . Use system worker to perform fencing operation asynchronously.
> . Use nfs4_stid.sc_count to ensure layout stateid remains
> valid before starting the fencing operation, nfs4_stid.sc_count
> is released after fencing operation is complete.
> . Rework nfsd4_scsi_fence_client to:
> . wait until fencing to complete before exiting.
> . wait until fencing in progress to complete before
> checking the NFSD_MDS_PR_FENCED flag.
> . Remove lm_need_to_retry from lease_manager_operations.
>
> diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
> index 04c7691e50e0..f7fe2c1ee32b 100644
> --- a/Documentation/filesystems/locking.rst
> +++ b/Documentation/filesystems/locking.rst
> @@ -403,6 +403,7 @@ prototypes::
> bool (*lm_breaker_owns_lease)(struct file_lock *);
> bool (*lm_lock_expirable)(struct file_lock *);
> void (*lm_expire_lock)(void);
> + void (*lm_breaker_timedout)(struct file_lease *);
>
> locking rules:
>
> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no
> lm_lock_expirable yes no no
> lm_expire_lock no no yes
> lm_open_conflict yes no no
> +lm_breaker_timedout no no yes
> ====================== ============= ================= =========
>
> buffer_head
> diff --git a/fs/locks.c b/fs/locks.c
> index 46f229f740c8..28e63aa87f74 100644
> --- a/fs/locks.c
> +++ b/fs/locks.c
> @@ -1487,6 +1487,25 @@ static void lease_clear_pending(struct file_lease *fl, int arg)
> }
> }
>
> +/*
> + * A layout lease is about to be disposed. If the disposal is
> + * due to a layout recall timeout, consult the lease manager
> + * to see whether the operation should continue.
> + *
> + * Return true if the lease should be disposed else return
> + * false.
> + */
> +static bool lease_want_dispose(struct file_lease *fl)
> +{
> + if (!(fl->c.flc_flags & FL_BREAKER_TIMEDOUT))
> + return true;
> +
> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout &&
> + (!fl->fl_lmops->lm_breaker_timedout(fl)))
> + return false;
> + return true;
> +}
> +
> /* We already had a lease on this file; just change its type */
> int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose)
> {
> @@ -1494,6 +1513,11 @@ int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose)
>
> if (error)
> return error;
> +
> + if ((fl->c.flc_flags & FL_LAYOUT) && (arg == F_UNLCK) &&
> + (!lease_want_dispose(fl)))
> + return 0;
> +
Barf. I really don't care for this special-casing inside of
lease_modify(). That's a fairly low-level lease handling primitive.
This needs to be handled at a higher level.
I also don't quite get what you're doing here. If you skip unlocking
the lease here, when does it get unlocked?
> lease_clear_pending(fl, arg);
> locks_wake_up_blocks(&fl->c);
> if (arg == F_UNLCK) {
> @@ -1531,8 +1555,11 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
> trace_time_out_leases(inode, fl);
> if (past_time(fl->fl_downgrade_time))
> lease_modify(fl, F_RDLCK, dispose);
> - if (past_time(fl->fl_break_time))
> +
> + if (past_time(fl->fl_break_time)) {
> + fl->c.flc_flags |= FL_BREAKER_TIMEDOUT;
> lease_modify(fl, F_UNLCK, dispose);
ISTM that this function is the right place to call lm_breaker_timedout.
Also, it might be best to define a lm_breaker_timedout() for all lease
types, and just have the common case call lease_modify() as above.
> + }
> }
> }
>
> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
> index 7ba9e2dd0875..05ddff5a4005 100644
> --- a/fs/nfsd/blocklayout.c
> +++ b/fs/nfsd/blocklayout.c
> @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp,
> return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
> }
>
> +/*
> + * Perform the fence operation to prevent the client from accessing the
> + * block device. If a fence operation is already in progress, wait for
> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the
> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set,
> + * update the layout stateid by setting the ls_fenced flag to indicate
> + * that the client has been fenced.
> + */
> static void
> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
> {
> @@ -450,8 +458,13 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev;
> int status;
>
> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev))
> + mutex_lock(&clp->cl_fence_mutex);
> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) {
> + ls->ls_fenced = true;
> + mutex_unlock(&clp->cl_fence_mutex);
> + nfs4_put_stid(&ls->ls_stid);
> return;
> + }
>
> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
> nfsd4_scsi_pr_key(clp),
> @@ -475,6 +488,10 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
> status == PR_STS_PATH_FAST_FAILED ||
> status == PR_STS_RETRY_PATH_FAILURE)
> nfsd4_scsi_fence_clear(clp, bdev->bd_dev);
> + else
> + ls->ls_fenced = true;
> + mutex_unlock(&clp->cl_fence_mutex);
> + nfs4_put_stid(&ls->ls_stid);
>
> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status);
> }
> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
> index ad7af8cfcf1f..4a11ccd5b0a5 100644
> --- a/fs/nfsd/nfs4layouts.c
> +++ b/fs/nfsd/nfs4layouts.c
> @@ -222,6 +222,27 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
> return 0;
> }
>
> +static void
> +nfsd4_layout_fence_worker(struct work_struct *work)
> +{
> + struct nfsd_file *nf;
> + struct delayed_work *dwork = to_delayed_work(work);
> + struct nfs4_layout_stateid *ls = container_of(dwork,
> + struct nfs4_layout_stateid, ls_fence_work);
> + u32 type;
> +
> + rcu_read_lock();
> + nf = nfsd_file_get(ls->ls_file);
> + rcu_read_unlock();
> + if (!nf)
> + return;
> +
> + type = ls->ls_layout_type;
> + if (nfsd4_layout_ops[type]->fence_client)
> + nfsd4_layout_ops[type]->fence_client(ls, nf);
> + nfsd_file_put(nf);
> +}
> +
> static struct nfs4_layout_stateid *
> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
> struct nfs4_stid *parent, u32 layout_type)
> @@ -271,6 +292,9 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
> list_add(&ls->ls_perfile, &fp->fi_lo_states);
> spin_unlock(&fp->fi_lock);
>
> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker);
> + ls->ls_fenced = false;
> +
> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid);
> return ls;
> }
> @@ -708,9 +732,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
> rcu_read_unlock();
> if (fl) {
> ops = nfsd4_layout_ops[ls->ls_layout_type];
> - if (ops->fence_client)
> + if (ops->fence_client) {
> + refcount_inc(&ls->ls_stid.sc_count);
> ops->fence_client(ls, fl);
> - else
> + } else
> nfsd4_cb_layout_fail(ls, fl);
> nfsd_file_put(fl);
> }
> @@ -747,11 +772,9 @@ static bool
> nfsd4_layout_lm_break(struct file_lease *fl)
> {
> /*
> - * We don't want the locks code to timeout the lease for us;
> - * we'll remove it ourself if a layout isn't returned
> - * in time:
> + * Enforce break lease timeout to prevent NFSD
> + * thread from hanging in __break_lease.
> */
> - fl->fl_break_time = 0;
> nfsd4_recall_file_layout(fl->c.flc_owner);
> return false;
> }
> @@ -782,10 +805,49 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg)
> return 0;
> }
>
> +/**
> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out.
> + * If the layout type supports a fence operation, schedule a worker to
> + * fence the client from accessing the block device.
> + *
> + * @fl: file to check
> + *
> + * Return true if the file lease should be disposed of by the caller;
> + * otherwise, return false.
> + */
> +static bool
> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl)
> +{
> + struct nfs4_layout_stateid *ls = fl->c.flc_owner;
> + bool ret;
> +
> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client)
> + return true;
> + if (ls->ls_fenced)
> + return true;
> +
> + if (work_busy(&ls->ls_fence_work.work))
> + return false;
> + /* Schedule work to do the fence operation */
> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
> + if (!ret) {
> + /*
> + * If there is no pending work, mod_delayed_work queues
> + * new task. While fencing is in progress, a reference
> + * count is added to the layout stateid to ensure its
> + * validity. This reference count is released once fencing
> + * has been completed.
> + */
> + refcount_inc(&ls->ls_stid.sc_count);
> + }
> + return false;
> +}
> +
> static const struct lease_manager_operations nfsd4_layouts_lm_ops = {
> .lm_break = nfsd4_layout_lm_break,
> .lm_change = nfsd4_layout_lm_change,
> .lm_open_conflict = nfsd4_layout_lm_open_conflict,
> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout,
> };
>
> int
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 583c13b5aaf3..a57fa3318362 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name,
> #endif
> #ifdef CONFIG_NFSD_SCSILAYOUT
> xa_init(&clp->cl_dev_fences);
> + mutex_init(&clp->cl_fence_mutex);
> #endif
> INIT_LIST_HEAD(&clp->async_copies);
> spin_lock_init(&clp->async_lock);
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 713f55ef6554..d9a3c966a35f 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -529,6 +529,7 @@ struct nfs4_client {
> time64_t cl_ra_time;
> #ifdef CONFIG_NFSD_SCSILAYOUT
> struct xarray cl_dev_fences;
> + struct mutex cl_fence_mutex;
> #endif
> };
>
> @@ -738,6 +739,8 @@ struct nfs4_layout_stateid {
> stateid_t ls_recall_sid;
> bool ls_recalled;
> struct mutex ls_mutex;
> + struct delayed_work ls_fence_work;
> + bool ls_fenced;
> };
>
> static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
> diff --git a/include/linux/filelock.h b/include/linux/filelock.h
> index 2f5e5588ee07..6939952f2088 100644
> --- a/include/linux/filelock.h
> +++ b/include/linux/filelock.h
> @@ -17,6 +17,7 @@
> #define FL_OFDLCK 1024 /* lock is "owned" by struct file */
> #define FL_LAYOUT 2048 /* outstanding pNFS layout */
> #define FL_RECLAIM 4096 /* reclaiming from a reboot server */
> +#define FL_BREAKER_TIMEDOUT 8192 /* lease breaker timed out */
>
> #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)
>
> @@ -50,6 +51,7 @@ struct lease_manager_operations {
> void (*lm_setup)(struct file_lease *, void **);
> bool (*lm_breaker_owns_lease)(struct file_lease *);
> int (*lm_open_conflict)(struct file *, int);
> + bool (*lm_breaker_timedout)(struct file_lease *fl);
> };
>
> struct lock_manager {
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v2 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing
2026-01-26 18:50 ` Jeff Layton
@ 2026-01-26 20:33 ` Dai Ngo
0 siblings, 0 replies; 5+ messages in thread
From: Dai Ngo @ 2026-01-26 20:33 UTC (permalink / raw)
To: Jeff Layton, chuck.lever, neil, okorniev, tom, hch, alex.aring,
viro, brauner, jack
Cc: linux-fsdevel, linux-nfs
On 1/26/26 10:50 AM, Jeff Layton wrote:
> On Sun, 2026-01-25 at 14:21 -0800, Dai Ngo wrote:
>> When a layout conflict triggers a recall, enforcing a timeout is
>> necessary to prevent excessive nfsd threads from being blocked in
>> __break_lease ensuring the server continues servicing incoming
>> requests efficiently.
>>
>> This patch introduces a new function to lease_manager_operations:
>>
>> lm_breaker_timedout: Invoked when a lease recall times out and is
>> about to be disposed of. This function enables the lease manager
>> to inform the caller whether the file_lease should remain on the
>> flc_list or be disposed of.
>>
>> For the NFSD lease manager, this function now handles layout recall
>> timeouts. If the layout type supports fencing and the client has not
>> been fenced, a fence operation is triggered to prevent the client
>> from accessing the block device.
>>
>> Fencing operation is done asynchronously using a system worker. This
>> is to allow lease_modify to trigger the fencing operation when layout
>> recall timed out.
>>
>> To ensure layout stateid remains valid while the fencing operation is
>> in progress, a reference count is added to layout stateid before
>> scheduling the system worker to do the fencing operation. The reference
>> count is released after the fencing operation is complete.
>>
>> While the fencing operation is in progress, the conflicting file_lease
>> remains on the flc_list until fencing is complete. This guarantees
>> that no other clients can access the file, and the client with exclusive
>> access is properly blocked before disposal.
>>
>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>> ---
>> Documentation/filesystems/locking.rst | 2 +
>> fs/locks.c | 29 ++++++++++-
>> fs/nfsd/blocklayout.c | 19 ++++++-
>> fs/nfsd/nfs4layouts.c | 74 ++++++++++++++++++++++++---
>> fs/nfsd/nfs4state.c | 1 +
>> fs/nfsd/state.h | 3 ++
>> include/linux/filelock.h | 2 +
>> 7 files changed, 122 insertions(+), 8 deletions(-)
>>
>> v2:
>> . Update Subject line to include fencing operation.
>> . Allow conflicting lease to remain on flc_list until fencing
>> is complete.
>> . Use system worker to perform fencing operation asynchronously.
>> . Use nfs4_stid.sc_count to ensure layout stateid remains
>> valid before starting the fencing operation, nfs4_stid.sc_count
>> is released after fencing operation is complete.
>> . Rework nfsd4_scsi_fence_client to:
>> . wait for fencing to complete before exiting.
>> . wait for any in-progress fencing to complete before
>> checking the NFSD_MDS_PR_FENCED flag.
>> . Remove lm_need_to_retry from lease_manager_operations.
>>
>> diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
>> index 04c7691e50e0..f7fe2c1ee32b 100644
>> --- a/Documentation/filesystems/locking.rst
>> +++ b/Documentation/filesystems/locking.rst
>> @@ -403,6 +403,7 @@ prototypes::
>> bool (*lm_breaker_owns_lease)(struct file_lock *);
>> bool (*lm_lock_expirable)(struct file_lock *);
>> void (*lm_expire_lock)(void);
>> + bool (*lm_breaker_timedout)(struct file_lease *);
>>
>> locking rules:
>>
>> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no
>> lm_lock_expirable yes no no
>> lm_expire_lock no no yes
>> lm_open_conflict yes no no
>> +lm_breaker_timedout no no yes
>> ====================== ============= ================= =========
>>
>> buffer_head
>> diff --git a/fs/locks.c b/fs/locks.c
>> index 46f229f740c8..28e63aa87f74 100644
>> --- a/fs/locks.c
>> +++ b/fs/locks.c
>> @@ -1487,6 +1487,25 @@ static void lease_clear_pending(struct file_lease *fl, int arg)
>> }
>> }
>>
>> +/*
>> + * A layout lease is about to be disposed. If the disposal is
>> + * due to a layout recall timeout, consult the lease manager
>> + * to see whether the operation should continue.
>> + *
>> + * Return true if the lease should be disposed else return
>> + * false.
>> + */
>> +static bool lease_want_dispose(struct file_lease *fl)
>> +{
>> + if (!(fl->c.flc_flags & FL_BREAKER_TIMEDOUT))
>> + return true;
>> +
>> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout &&
>> + (!fl->fl_lmops->lm_breaker_timedout(fl)))
>> + return false;
>> + return true;
>> +}
>> +
>> /* We already had a lease on this file; just change its type */
>> int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose)
>> {
>> @@ -1494,6 +1513,11 @@ int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose)
>>
>> if (error)
>> return error;
>> +
>> + if ((fl->c.flc_flags & FL_LAYOUT) && (arg == F_UNLCK) &&
>> + (!lease_want_dispose(fl)))
>> + return 0;
>> +
> Barf. I really don't care for this special-casing inside of
> lease_modify(). That's a fairly low-level lease handling primitive.
> This needs to be handled at a higher level.
I will remove this special-casing in v3.
>
> I also don't quite get what you're doing here. If you skip unlocking
> the lease here, when does it get unlocked?
The lease is removed once ls_fenced is set, i.e. after the fencing operation
has completed. That flag is currently checked in lease_want_dispose/lm_breaker_timedout.
>
>> lease_clear_pending(fl, arg);
>> locks_wake_up_blocks(&fl->c);
>> if (arg == F_UNLCK) {
>> @@ -1531,8 +1555,11 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
>> trace_time_out_leases(inode, fl);
>> if (past_time(fl->fl_downgrade_time))
>> lease_modify(fl, F_RDLCK, dispose);
>> - if (past_time(fl->fl_break_time))
>> +
>> + if (past_time(fl->fl_break_time)) {
>> + fl->c.flc_flags |= FL_BREAKER_TIMEDOUT;
>> lease_modify(fl, F_UNLCK, dispose);
> ISTM that this function is the right place to call lm_breaker_timedout.
I will move lm_breaker_timedout to here in v3.
> Also, it might be best to define a lm_breaker_timedout() for all lease
> types, and just have the common case call lease_modify() as above.
ok, let me send out v3 and see if we still need to do this.
Thanks,
-Dai
>
>> + }
>> }
>> }
>>
>> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
>> index 7ba9e2dd0875..05ddff5a4005 100644
>> --- a/fs/nfsd/blocklayout.c
>> +++ b/fs/nfsd/blocklayout.c
>> @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp,
>> return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
>> }
>>
>> +/*
>> + * Perform the fence operation to prevent the client from accessing the
>> + * block device. If a fence operation is already in progress, wait for
>> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the
>> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set,
>> + * update the layout stateid by setting the ls_fenced flag to indicate
>> + * that the client has been fenced.
>> + */
>> static void
>> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
>> {
>> @@ -450,8 +458,13 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
>> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev;
>> int status;
>>
>> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev))
>> + mutex_lock(&clp->cl_fence_mutex);
>> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) {
>> + ls->ls_fenced = true;
>> + mutex_unlock(&clp->cl_fence_mutex);
>> + nfs4_put_stid(&ls->ls_stid);
>> return;
>> + }
>>
>> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
>> nfsd4_scsi_pr_key(clp),
>> @@ -475,6 +488,10 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
>> status == PR_STS_PATH_FAST_FAILED ||
>> status == PR_STS_RETRY_PATH_FAILURE)
>> nfsd4_scsi_fence_clear(clp, bdev->bd_dev);
>> + else
>> + ls->ls_fenced = true;
>> + mutex_unlock(&clp->cl_fence_mutex);
>> + nfs4_put_stid(&ls->ls_stid);
>>
>> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status);
>> }
>> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
>> index ad7af8cfcf1f..4a11ccd5b0a5 100644
>> --- a/fs/nfsd/nfs4layouts.c
>> +++ b/fs/nfsd/nfs4layouts.c
>> @@ -222,6 +222,27 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
>> return 0;
>> }
>>
>> +static void
>> +nfsd4_layout_fence_worker(struct work_struct *work)
>> +{
>> + struct nfsd_file *nf;
>> + struct delayed_work *dwork = to_delayed_work(work);
>> + struct nfs4_layout_stateid *ls = container_of(dwork,
>> + struct nfs4_layout_stateid, ls_fence_work);
>> + u32 type;
>> +
>> + rcu_read_lock();
>> + nf = nfsd_file_get(ls->ls_file);
>> + rcu_read_unlock();
>> + if (!nf)
>> + return;
>> +
>> + type = ls->ls_layout_type;
>> + if (nfsd4_layout_ops[type]->fence_client)
>> + nfsd4_layout_ops[type]->fence_client(ls, nf);
>> + nfsd_file_put(nf);
>> +}
>> +
>> static struct nfs4_layout_stateid *
>> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
>> struct nfs4_stid *parent, u32 layout_type)
>> @@ -271,6 +292,9 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
>> list_add(&ls->ls_perfile, &fp->fi_lo_states);
>> spin_unlock(&fp->fi_lock);
>>
>> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker);
>> + ls->ls_fenced = false;
>> +
>> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid);
>> return ls;
>> }
>> @@ -708,9 +732,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
>> rcu_read_unlock();
>> if (fl) {
>> ops = nfsd4_layout_ops[ls->ls_layout_type];
>> - if (ops->fence_client)
>> + if (ops->fence_client) {
>> + refcount_inc(&ls->ls_stid.sc_count);
>> ops->fence_client(ls, fl);
>> - else
>> + } else
>> nfsd4_cb_layout_fail(ls, fl);
>> nfsd_file_put(fl);
>> }
>> @@ -747,11 +772,9 @@ static bool
>> nfsd4_layout_lm_break(struct file_lease *fl)
>> {
>> /*
>> - * We don't want the locks code to timeout the lease for us;
>> - * we'll remove it ourself if a layout isn't returned
>> - * in time:
>> + * Enforce break lease timeout to prevent NFSD
>> + * thread from hanging in __break_lease.
>> */
>> - fl->fl_break_time = 0;
>> nfsd4_recall_file_layout(fl->c.flc_owner);
>> return false;
>> }
>> @@ -782,10 +805,49 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg)
>> return 0;
>> }
>>
>> +/**
>> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out.
>> + * If the layout type supports a fence operation, schedule a worker to
>> + * fence the client from accessing the block device.
>> + *
>> + * @fl: file to check
>> + *
>> + * Return true if the file lease should be disposed of by the caller;
>> + * otherwise, return false.
>> + */
>> +static bool
>> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl)
>> +{
>> + struct nfs4_layout_stateid *ls = fl->c.flc_owner;
>> + bool ret;
>> +
>> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client)
>> + return true;
>> + if (ls->ls_fenced)
>> + return true;
>> +
>> + if (work_busy(&ls->ls_fence_work.work))
>> + return false;
>> + /* Schedule work to do the fence operation */
>> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
>> + if (!ret) {
>> + /*
>> + * If there is no pending work, mod_delayed_work queues
>> + * new task. While fencing is in progress, a reference
>> + * count is added to the layout stateid to ensure its
>> + * validity. This reference count is released once fencing
>> + * has been completed.
>> + */
>> + refcount_inc(&ls->ls_stid.sc_count);
>> + }
>> + return false;
>> +}
>> +
>> static const struct lease_manager_operations nfsd4_layouts_lm_ops = {
>> .lm_break = nfsd4_layout_lm_break,
>> .lm_change = nfsd4_layout_lm_change,
>> .lm_open_conflict = nfsd4_layout_lm_open_conflict,
>> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout,
>> };
>>
>> int
>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>> index 583c13b5aaf3..a57fa3318362 100644
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name,
>> #endif
>> #ifdef CONFIG_NFSD_SCSILAYOUT
>> xa_init(&clp->cl_dev_fences);
>> + mutex_init(&clp->cl_fence_mutex);
>> #endif
>> INIT_LIST_HEAD(&clp->async_copies);
>> spin_lock_init(&clp->async_lock);
>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>> index 713f55ef6554..d9a3c966a35f 100644
>> --- a/fs/nfsd/state.h
>> +++ b/fs/nfsd/state.h
>> @@ -529,6 +529,7 @@ struct nfs4_client {
>> time64_t cl_ra_time;
>> #ifdef CONFIG_NFSD_SCSILAYOUT
>> struct xarray cl_dev_fences;
>> + struct mutex cl_fence_mutex;
>> #endif
>> };
>>
>> @@ -738,6 +739,8 @@ struct nfs4_layout_stateid {
>> stateid_t ls_recall_sid;
>> bool ls_recalled;
>> struct mutex ls_mutex;
>> + struct delayed_work ls_fence_work;
>> + bool ls_fenced;
>> };
>>
>> static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
>> diff --git a/include/linux/filelock.h b/include/linux/filelock.h
>> index 2f5e5588ee07..6939952f2088 100644
>> --- a/include/linux/filelock.h
>> +++ b/include/linux/filelock.h
>> @@ -17,6 +17,7 @@
>> #define FL_OFDLCK 1024 /* lock is "owned" by struct file */
>> #define FL_LAYOUT 2048 /* outstanding pNFS layout */
>> #define FL_RECLAIM 4096 /* reclaiming from a reboot server */
>> +#define FL_BREAKER_TIMEDOUT 8192 /* lease breaker timed out */
>>
>> #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)
>>
>> @@ -50,6 +51,7 @@ struct lease_manager_operations {
>> void (*lm_setup)(struct file_lease *, void **);
>> bool (*lm_breaker_owns_lease)(struct file_lease *);
>> int (*lm_open_conflict)(struct file *, int);
>> + bool (*lm_breaker_timedout)(struct file_lease *fl);
>> };
>>
>> struct lock_manager {
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2026-01-26 20:34 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-25 22:21 [PATCH v2 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing Dai Ngo
2026-01-26 16:11 ` Chuck Lever
2026-01-26 17:44 ` Dai Ngo
2026-01-26 18:50 ` Jeff Layton
2026-01-26 20:33 ` Dai Ngo
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox