* [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing
@ 2026-01-27 0:50 Dai Ngo
2026-01-27 17:11 ` Chuck Lever
2026-01-27 17:54 ` Jeff Layton
0 siblings, 2 replies; 12+ messages in thread
From: Dai Ngo @ 2026-01-27 0:50 UTC (permalink / raw)
To: chuck.lever, jlayton, neil, okorniev, tom, hch, alex.aring, viro,
brauner, jack
Cc: linux-fsdevel, linux-nfs
When a layout conflict triggers a recall, enforcing a timeout is
necessary to prevent an excessive number of nfsd threads from being
blocked in __break_lease, ensuring the server continues servicing
incoming requests efficiently.
This patch introduces a new function to lease_manager_operations:
lm_breaker_timedout: Invoked when a lease recall times out and is
about to be disposed of. This function enables the lease manager
to inform the caller whether the file_lease should remain on the
flc_list or be disposed of.
For the NFSD lease manager, this function now handles layout recall
timeouts. If the layout type supports fencing and the client has not
been fenced, a fence operation is triggered to prevent the client
from accessing the block device.
While the fencing operation is in progress, the conflicting file_lease
remains on the flc_list until fencing is complete. This guarantees
that no other clients can access the file, and the client with
exclusive access is properly blocked before disposal.
Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
---
Documentation/filesystems/locking.rst | 2 +
fs/locks.c | 10 +++-
fs/nfsd/blocklayout.c | 38 ++++++-------
fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++--
fs/nfsd/nfs4state.c | 1 +
fs/nfsd/state.h | 6 ++
include/linux/filelock.h | 1 +
7 files changed, 110 insertions(+), 27 deletions(-)
v2:
. Update Subject line to include fencing operation.
. Allow conflicting lease to remain on flc_list until fencing
is complete.
. Use system worker to perform fencing operation asynchronously.
. Use nfs4_stid.sc_count to ensure layout stateid remains
valid before starting the fencing operation, nfs4_stid.sc_count
is released after fencing operation is complete.
. Rework nfsd4_scsi_fence_client to:
. wait for fencing to complete before exiting.
. wait for any in-progress fencing to complete before
checking the NFSD_MDS_PR_FENCED flag.
. Remove lm_need_to_retry from lease_manager_operations.
v3:
. correct locking requirement in locking.rst.
. add max retry count to fencing operation.
. add missing nfs4_put_stid in nfsd4_layout_fence_worker.
. remove special-casing of FL_LAYOUT in lease_modify.
. remove lease_want_dispose.
. move lm_breaker_timedout call to time_out_leases.
v4:
. only increment ls_fence_retry_cnt after successfully
scheduling new work in nfsd4_layout_lm_breaker_timedout.
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 04c7691e50e0..a339491f02e4 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -403,6 +403,7 @@ prototypes::
bool (*lm_breaker_owns_lease)(struct file_lock *);
bool (*lm_lock_expirable)(struct file_lock *);
void (*lm_expire_lock)(void);
+ void (*lm_breaker_timedout)(struct file_lease *);
locking rules:
@@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no
lm_lock_expirable yes no no
lm_expire_lock no no yes
lm_open_conflict yes no no
+lm_breaker_timedout yes no no
====================== ============= ================= =========
buffer_head
diff --git a/fs/locks.c b/fs/locks.c
index 46f229f740c8..1b63aa704598 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
{
struct file_lock_context *ctx = inode->i_flctx;
struct file_lease *fl, *tmp;
+ bool remove = true;
lockdep_assert_held(&ctx->flc_lock);
@@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
trace_time_out_leases(inode, fl);
if (past_time(fl->fl_downgrade_time))
lease_modify(fl, F_RDLCK, dispose);
- if (past_time(fl->fl_break_time))
- lease_modify(fl, F_UNLCK, dispose);
+
+ if (past_time(fl->fl_break_time)) {
+ if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout)
+ remove = fl->fl_lmops->lm_breaker_timedout(fl);
+ if (remove)
+ lease_modify(fl, F_UNLCK, dispose);
+ }
}
}
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 7ba9e2dd0875..69d3889df302 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp,
return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
}
+/*
+ * Perform the fence operation to prevent the client from accessing the
+ * block device. If a fence operation is already in progress, wait for
+ * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the
+ * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set,
+ * update the layout stateid by setting the ls_fenced flag to indicate
+ * that the client has been fenced.
+ */
static void
nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
{
@@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev;
int status;
- if (nfsd4_scsi_fence_set(clp, bdev->bd_dev))
+ mutex_lock(&clp->cl_fence_mutex);
+ if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) {
+ ls->ls_fenced = true;
+ mutex_unlock(&clp->cl_fence_mutex);
+ nfs4_put_stid(&ls->ls_stid);
return;
+ }
status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
nfsd4_scsi_pr_key(clp),
PR_EXCLUSIVE_ACCESS_REG_ONLY, true);
- /*
- * Reset to allow retry only when the command could not have
- * reached the device. Negative status means a local error
- * (e.g., -ENOMEM) prevented the command from being sent.
- * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and
- * PR_STS_RETRY_PATH_FAILURE indicate transport path failures
- * before device delivery.
- *
- * For all other errors, the command may have reached the device
- * and the preempt may have succeeded. Avoid resetting, since
- * retrying a successful preempt returns PR_STS_IOERR or
- * PR_STS_RESERVATION_CONFLICT, which would cause an infinite
- * retry loop.
- */
- if (status < 0 ||
- status == PR_STS_PATH_FAILED ||
- status == PR_STS_PATH_FAST_FAILED ||
- status == PR_STS_RETRY_PATH_FAILURE)
+ if (status)
nfsd4_scsi_fence_clear(clp, bdev->bd_dev);
+ else
+ ls->ls_fenced = true;
+ mutex_unlock(&clp->cl_fence_mutex);
+ nfs4_put_stid(&ls->ls_stid);
trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status);
}
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ad7af8cfcf1f..1c498f3cd059 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
return 0;
}
+static void
+nfsd4_layout_fence_worker(struct work_struct *work)
+{
+ struct nfsd_file *nf;
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct nfs4_layout_stateid *ls = container_of(dwork,
+ struct nfs4_layout_stateid, ls_fence_work);
+ u32 type;
+
+ rcu_read_lock();
+ nf = nfsd_file_get(ls->ls_file);
+ rcu_read_unlock();
+ if (!nf) {
+ nfs4_put_stid(&ls->ls_stid);
+ return;
+ }
+
+ type = ls->ls_layout_type;
+ if (nfsd4_layout_ops[type]->fence_client)
+ nfsd4_layout_ops[type]->fence_client(ls, nf);
+ nfsd_file_put(nf);
+}
+
static struct nfs4_layout_stateid *
nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
struct nfs4_stid *parent, u32 layout_type)
@@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
list_add(&ls->ls_perfile, &fp->fi_lo_states);
spin_unlock(&fp->fi_lock);
+ INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker);
+ ls->ls_fenced = false;
+ ls->ls_fence_retry_cnt = 0;
+
trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid);
return ls;
}
@@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
rcu_read_unlock();
if (fl) {
ops = nfsd4_layout_ops[ls->ls_layout_type];
- if (ops->fence_client)
+ if (ops->fence_client) {
+ refcount_inc(&ls->ls_stid.sc_count);
ops->fence_client(ls, fl);
- else
+ } else
nfsd4_cb_layout_fail(ls, fl);
nfsd_file_put(fl);
}
@@ -747,11 +775,9 @@ static bool
nfsd4_layout_lm_break(struct file_lease *fl)
{
/*
- * We don't want the locks code to timeout the lease for us;
- * we'll remove it ourself if a layout isn't returned
- * in time:
+ * Enforce break lease timeout to prevent NFSD
+ * thread from hanging in __break_lease.
*/
- fl->fl_break_time = 0;
nfsd4_recall_file_layout(fl->c.flc_owner);
return false;
}
@@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg)
return 0;
}
+/**
+ * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out.
+ * If the layout type supports a fence operation, schedule a worker to
+ * fence the client from accessing the block device.
+ *
+ * @fl: the file_lease whose recall has timed out
+ *
+ * Return true if the file lease should be disposed of by the caller;
+ * otherwise, return false.
+ */
+static bool
+nfsd4_layout_lm_breaker_timedout(struct file_lease *fl)
+{
+ struct nfs4_layout_stateid *ls = fl->c.flc_owner;
+ bool ret;
+
+ if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client)
+ return true;
+ if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY)
+ return true;
+
+ if (work_busy(&ls->ls_fence_work.work))
+ return false;
+ /* Schedule work to do the fence operation */
+ ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
+ if (!ret) {
+ /*
+ * If there is no pending work, mod_delayed_work queues
+ * new task. While fencing is in progress, a reference
+ * count is added to the layout stateid to ensure its
+ * validity. This reference count is released once fencing
+ * has been completed.
+ */
+ refcount_inc(&ls->ls_stid.sc_count);
+ ++ls->ls_fence_retry_cnt;
+ return true;
+ }
+ return false;
+}
+
static const struct lease_manager_operations nfsd4_layouts_lm_ops = {
.lm_break = nfsd4_layout_lm_break,
.lm_change = nfsd4_layout_lm_change,
.lm_open_conflict = nfsd4_layout_lm_open_conflict,
+ .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout,
};
int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 583c13b5aaf3..a57fa3318362 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name,
#endif
#ifdef CONFIG_NFSD_SCSILAYOUT
xa_init(&clp->cl_dev_fences);
+ mutex_init(&clp->cl_fence_mutex);
#endif
INIT_LIST_HEAD(&clp->async_copies);
spin_lock_init(&clp->async_lock);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 713f55ef6554..57e54dfb406c 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -529,6 +529,7 @@ struct nfs4_client {
time64_t cl_ra_time;
#ifdef CONFIG_NFSD_SCSILAYOUT
struct xarray cl_dev_fences;
+ struct mutex cl_fence_mutex;
#endif
};
@@ -738,8 +739,13 @@ struct nfs4_layout_stateid {
stateid_t ls_recall_sid;
bool ls_recalled;
struct mutex ls_mutex;
+ struct delayed_work ls_fence_work;
+ bool ls_fenced;
+ int ls_fence_retry_cnt;
};
+#define LO_MAX_FENCE_RETRY 5
+
static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
{
return container_of(s, struct nfs4_layout_stateid, ls_stid);
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index 2f5e5588ee07..13b9c9f04589 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -50,6 +50,7 @@ struct lease_manager_operations {
void (*lm_setup)(struct file_lease *, void **);
bool (*lm_breaker_owns_lease)(struct file_lease *);
int (*lm_open_conflict)(struct file *, int);
+ bool (*lm_breaker_timedout)(struct file_lease *fl);
};
struct lock_manager {
--
2.47.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 0:50 [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing Dai Ngo @ 2026-01-27 17:11 ` Chuck Lever 2026-01-27 18:14 ` Jeff Layton 2026-01-27 20:36 ` Dai Ngo 2026-01-27 17:54 ` Jeff Layton 1 sibling, 2 replies; 12+ messages in thread From: Chuck Lever @ 2026-01-27 17:11 UTC (permalink / raw) To: Dai Ngo, Chuck Lever, Jeff Layton, NeilBrown, Olga Kornievskaia, Tom Talpey, Christoph Hellwig, Alexander Aring, Alexander Viro, Christian Brauner, Jan Kara Cc: linux-fsdevel, linux-nfs On Mon, Jan 26, 2026, at 7:50 PM, Dai Ngo wrote: > When a layout conflict triggers a recall, enforcing a timeout is > necessary to prevent excessive nfsd threads from being blocked in > __break_lease ensuring the server continues servicing incoming > requests efficiently. > > This patch introduces a new function to lease_manager_operations: > > lm_breaker_timedout: Invoked when a lease recall times out and is > about to be disposed of. This function enables the lease manager > to inform the caller whether the file_lease should remain on the > flc_list or be disposed of. > > For the NFSD lease manager, this function now handles layout recall > timeouts. If the layout type supports fencing and the client has not > been fenced, a fence operation is triggered to prevent the client > from accessing the block device. > > While the fencing operation is in progress, the conflicting file_lease > remains on the flc_list until fencing is complete. This guarantees > that no other clients can access the file, and the client with > exclusive access is properly blocked before disposal. 
> > Signed-off-by: Dai Ngo <dai.ngo@oracle.com> > --- > Documentation/filesystems/locking.rst | 2 + > fs/locks.c | 10 +++- > fs/nfsd/blocklayout.c | 38 ++++++------- > fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- > fs/nfsd/nfs4state.c | 1 + > fs/nfsd/state.h | 6 ++ > include/linux/filelock.h | 1 + > 7 files changed, 110 insertions(+), 27 deletions(-) > > v2: > . Update Subject line to include fencing operation. > . Allow conflicting lease to remain on flc_list until fencing > is complete. > . Use system worker to perform fencing operation asynchronously. > . Use nfs4_stid.sc_count to ensure layout stateid remains > valid before starting the fencing operation, nfs4_stid.sc_count > is released after fencing operation is complete. > . Rework nfsd4_scsi_fence_client to: > . wait until fencing to complete before exiting. > . wait until fencing in progress to complete before > checking the NFSD_MDS_PR_FENCED flag. > . Remove lm_need_to_retry from lease_manager_operations. > v3: > . correct locking requirement in locking.rst. > . add max retry count to fencing operation. > . add missing nfs4_put_stid in nfsd4_layout_fence_worker. > . remove special-casing of FL_LAYOUT in lease_modify. > . remove lease_want_dispose. > . move lm_breaker_timedout call to time_out_leases. > v4: > . only increment ls_fence_retry_cnt after successfully > schedule new work in nfsd4_layout_lm_breaker_timedout. 
> > diff --git a/Documentation/filesystems/locking.rst > b/Documentation/filesystems/locking.rst > index 04c7691e50e0..a339491f02e4 100644 > --- a/Documentation/filesystems/locking.rst > +++ b/Documentation/filesystems/locking.rst > @@ -403,6 +403,7 @@ prototypes:: > bool (*lm_breaker_owns_lease)(struct file_lock *); > bool (*lm_lock_expirable)(struct file_lock *); > void (*lm_expire_lock)(void); > + void (*lm_breaker_timedout)(struct file_lease *); > > locking rules: > > @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no > lm_lock_expirable yes no no > lm_expire_lock no no yes > lm_open_conflict yes no no > +lm_breaker_timedout yes no no > ====================== ============= ================= ========= > > buffer_head > diff --git a/fs/locks.c b/fs/locks.c > index 46f229f740c8..1b63aa704598 100644 > --- a/fs/locks.c > +++ b/fs/locks.c > @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, > struct list_head *dispose) > { > struct file_lock_context *ctx = inode->i_flctx; > struct file_lease *fl, *tmp; > + bool remove = true; > > lockdep_assert_held(&ctx->flc_lock); > > @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, > struct list_head *dispose) > trace_time_out_leases(inode, fl); > if (past_time(fl->fl_downgrade_time)) > lease_modify(fl, F_RDLCK, dispose); > - if (past_time(fl->fl_break_time)) > - lease_modify(fl, F_UNLCK, dispose); > + > + if (past_time(fl->fl_break_time)) { > + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) > + remove = fl->fl_lmops->lm_breaker_timedout(fl); > + if (remove) > + lease_modify(fl, F_UNLCK, dispose); > + } > } > } > > diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c > index 7ba9e2dd0875..69d3889df302 100644 > --- a/fs/nfsd/blocklayout.c > +++ b/fs/nfsd/blocklayout.c > @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, > struct svc_rqst *rqstp, > return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); > } > > +/* > + * Perform the fence 
operation to prevent the client from accessing the > + * block device. If a fence operation is already in progress, wait for > + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the > + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, > + * update the layout stateid by setting the ls_fenced flag to indicate > + * that the client has been fenced. > + */ > static void > nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct > nfsd_file *file) > { > @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct > nfs4_layout_stateid *ls, struct nfsd_file *file) > struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; > int status; > > - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) > + mutex_lock(&clp->cl_fence_mutex); > + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { > + ls->ls_fenced = true; > + mutex_unlock(&clp->cl_fence_mutex); > + nfs4_put_stid(&ls->ls_stid); > return; > + } > > status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, > nfsd4_scsi_pr_key(clp), > PR_EXCLUSIVE_ACCESS_REG_ONLY, true); > - /* > - * Reset to allow retry only when the command could not have > - * reached the device. Negative status means a local error > - * (e.g., -ENOMEM) prevented the command from being sent. > - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and > - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures > - * before device delivery. > - * > - * For all other errors, the command may have reached the device > - * and the preempt may have succeeded. Avoid resetting, since > - * retrying a successful preempt returns PR_STS_IOERR or > - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite > - * retry loop. 
> - */ > - if (status < 0 || > - status == PR_STS_PATH_FAILED || > - status == PR_STS_PATH_FAST_FAILED || > - status == PR_STS_RETRY_PATH_FAILURE) > + if (status) > nfsd4_scsi_fence_clear(clp, bdev->bd_dev); > + else > + ls->ls_fenced = true; The removed code distinguishes retryable errors (such as path failures) from permanent errors (PR_STS_IOERR, PR_STS_RESERVATION_CONFLICT), but the new code clears the fence on any error, potentially causing infinite retry loops as warned in the removed comment. Either the comment and error distinction should remain, or some rationale for removing the distinction between temporary and permanent errors should be added to the commit message. > + mutex_unlock(&clp->cl_fence_mutex); > + nfs4_put_stid(&ls->ls_stid); > > trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); > } > diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c > index ad7af8cfcf1f..1c498f3cd059 100644 > --- a/fs/nfsd/nfs4layouts.c > +++ b/fs/nfsd/nfs4layouts.c > @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) > return 0; > } > > +static void > +nfsd4_layout_fence_worker(struct work_struct *work) > +{ > + struct nfsd_file *nf; > + struct delayed_work *dwork = to_delayed_work(work); > + struct nfs4_layout_stateid *ls = container_of(dwork, > + struct nfs4_layout_stateid, ls_fence_work); > + u32 type; > + > + rcu_read_lock(); > + nf = nfsd_file_get(ls->ls_file); > + rcu_read_unlock(); > + if (!nf) { > + nfs4_put_stid(&ls->ls_stid); > + return; > + } > + > + type = ls->ls_layout_type; > + if (nfsd4_layout_ops[type]->fence_client) > + nfsd4_layout_ops[type]->fence_client(ls, nf); > + nfsd_file_put(nf); > +} > + > static struct nfs4_layout_stateid * > nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, > struct nfs4_stid *parent, u32 layout_type) > @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct > nfsd4_compound_state *cstate, > list_add(&ls->ls_perfile, &fp->fi_lo_states); > spin_unlock(&fp->fi_lock); > > + 
INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); > + ls->ls_fenced = false; > + ls->ls_fence_retry_cnt = 0; > + > trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); > return ls; > } > @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, > struct rpc_task *task) > rcu_read_unlock(); > if (fl) { > ops = nfsd4_layout_ops[ls->ls_layout_type]; > - if (ops->fence_client) > + if (ops->fence_client) { > + refcount_inc(&ls->ls_stid.sc_count); > ops->fence_client(ls, fl); > - else > + } else > nfsd4_cb_layout_fail(ls, fl); > nfsd_file_put(fl); > } > @@ -747,11 +775,9 @@ static bool > nfsd4_layout_lm_break(struct file_lease *fl) > { > /* > - * We don't want the locks code to timeout the lease for us; > - * we'll remove it ourself if a layout isn't returned > - * in time: > + * Enforce break lease timeout to prevent NFSD > + * thread from hanging in __break_lease. > */ > - fl->fl_break_time = 0; > nfsd4_recall_file_layout(fl->c.flc_owner); > return false; > } > @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, > int arg) > return 0; > } > > +/** > + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. Nit: Move the description of @fl here. > + * If the layout type supports a fence operation, schedule a worker to > + * fence the client from accessing the block device. > + * > + * @fl: file to check > + * > + * Return true if the file lease should be disposed of by the caller; > + * otherwise, return false. > + */ > +static bool > +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) > +{ > + struct nfs4_layout_stateid *ls = fl->c.flc_owner; > + bool ret; > + > + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) > + return true; > + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) > + return true; Since these two variables are accessed while the fence mutex is held in other places, they need similar protection here to prevent races. 
You explained that the mutex cannot be taken here, so some other form of serialization will be needed to protect these fields. If I'm reading this correctly, when the fence fails after all 5 retries, the client retains block device access but server believes recall succeeded ? > + > + if (work_busy(&ls->ls_fence_work.work)) > + return false; > + /* Schedule work to do the fence operation */ > + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); > + if (!ret) { > + /* > + * If there is no pending work, mod_delayed_work queues > + * new task. While fencing is in progress, a reference > + * count is added to the layout stateid to ensure its > + * validity. This reference count is released once fencing > + * has been completed. > + */ > + refcount_inc(&ls->ls_stid.sc_count); > + ++ls->ls_fence_retry_cnt; > + return true; > + } Incrementing after the mod_delayed_work() call is racy. Instead: refcount_inc(&ls->ls_stid.sc_count); ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); if (ret) { refcount_dec(&ls->ls_stid.sc_count); .... 
} > + return false; > +} > + > static const struct lease_manager_operations nfsd4_layouts_lm_ops = { > .lm_break = nfsd4_layout_lm_break, > .lm_change = nfsd4_layout_lm_change, > .lm_open_conflict = nfsd4_layout_lm_open_conflict, > + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, > }; > > int > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > index 583c13b5aaf3..a57fa3318362 100644 > --- a/fs/nfsd/nfs4state.c > +++ b/fs/nfsd/nfs4state.c > @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct > xdr_netobj name, > #endif > #ifdef CONFIG_NFSD_SCSILAYOUT > xa_init(&clp->cl_dev_fences); > + mutex_init(&clp->cl_fence_mutex); > #endif > INIT_LIST_HEAD(&clp->async_copies); > spin_lock_init(&clp->async_lock); > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > index 713f55ef6554..57e54dfb406c 100644 > --- a/fs/nfsd/state.h > +++ b/fs/nfsd/state.h > @@ -529,6 +529,7 @@ struct nfs4_client { > time64_t cl_ra_time; > #ifdef CONFIG_NFSD_SCSILAYOUT > struct xarray cl_dev_fences; > + struct mutex cl_fence_mutex; > #endif > }; > > @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { > stateid_t ls_recall_sid; > bool ls_recalled; > struct mutex ls_mutex; > + struct delayed_work ls_fence_work; Still missing cancel_delayed_work_sync() when freeing the layout stateid. > + bool ls_fenced; > + int ls_fence_retry_cnt; > }; > > +#define LO_MAX_FENCE_RETRY 5 The value of 5 needs some justification here in a comment. Actually, 5 might be too low, but I can't really tell. 
> + > static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) > { > return container_of(s, struct nfs4_layout_stateid, ls_stid); > diff --git a/include/linux/filelock.h b/include/linux/filelock.h > index 2f5e5588ee07..13b9c9f04589 100644 > --- a/include/linux/filelock.h > +++ b/include/linux/filelock.h > @@ -50,6 +50,7 @@ struct lease_manager_operations { > void (*lm_setup)(struct file_lease *, void **); > bool (*lm_breaker_owns_lease)(struct file_lease *); > int (*lm_open_conflict)(struct file *, int); > + bool (*lm_breaker_timedout)(struct file_lease *fl); > }; > > struct lock_manager { > -- > 2.47.3 -- Chuck Lever ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 17:11 ` Chuck Lever @ 2026-01-27 18:14 ` Jeff Layton 2026-01-27 20:38 ` Dai Ngo 2026-01-27 20:36 ` Dai Ngo 1 sibling, 1 reply; 12+ messages in thread From: Jeff Layton @ 2026-01-27 18:14 UTC (permalink / raw) To: Chuck Lever, Dai Ngo, Chuck Lever, NeilBrown, Olga Kornievskaia, Tom Talpey, Christoph Hellwig, Alexander Aring, Alexander Viro, Christian Brauner, Jan Kara Cc: linux-fsdevel, linux-nfs On Tue, 2026-01-27 at 12:11 -0500, Chuck Lever wrote: > > > + > > + if (work_busy(&ls->ls_fence_work.work)) > > + return false; > > + /* Schedule work to do the fence operation */ > > + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); > > + if (!ret) { > > + /* > > + * If there is no pending work, mod_delayed_work queues > > + * new task. While fencing is in progress, a reference > > + * count is added to the layout stateid to ensure its > > + * validity. This reference count is released once fencing > > + * has been completed. > > + */ > > + refcount_inc(&ls->ls_stid.sc_count); > > + ++ls->ls_fence_retry_cnt; > > + return true; > > + } > > Incrementing after the mod_delayed_work() call is racy. Instead: > > refcount_inc(&ls->ls_stid.sc_count); > ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); > if (ret) { > refcount_dec(&ls->ls_stid.sc_count); > .... > } > Is that safe? That's done in nfsd_break_one_deleg() for similar reasons, but there is a comment over that function explaining why it's OK. We'll need to validate that the lease is always released before the sc_count goes to 0 here too. At first glance, it doesn't look like that is the case. -- Jeff Layton <jlayton@kernel.org> ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 18:14 ` Jeff Layton @ 2026-01-27 20:38 ` Dai Ngo 0 siblings, 0 replies; 12+ messages in thread From: Dai Ngo @ 2026-01-27 20:38 UTC (permalink / raw) To: Jeff Layton, Chuck Lever, Chuck Lever, NeilBrown, Olga Kornievskaia, Tom Talpey, Christoph Hellwig, Alexander Aring, Alexander Viro, Christian Brauner, Jan Kara Cc: linux-fsdevel, linux-nfs On 1/27/26 10:14 AM, Jeff Layton wrote: > On Tue, 2026-01-27 at 12:11 -0500, Chuck Lever wrote: >>> + >>> + if (work_busy(&ls->ls_fence_work.work)) >>> + return false; >>> + /* Schedule work to do the fence operation */ >>> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >>> + if (!ret) { >>> + /* >>> + * If there is no pending work, mod_delayed_work queues >>> + * new task. While fencing is in progress, a reference >>> + * count is added to the layout stateid to ensure its >>> + * validity. This reference count is released once fencing >>> + * has been completed. >>> + */ >>> + refcount_inc(&ls->ls_stid.sc_count); >>> + ++ls->ls_fence_retry_cnt; >>> + return true; >>> + } >> Incrementing after the mod_delayed_work() call is racy. Instead: >> >> refcount_inc(&ls->ls_stid.sc_count); >> ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >> if (ret) { >> refcount_dec(&ls->ls_stid.sc_count); >> .... >> } >> > Is that safe? That's done in nfsd_break_one_deleg() for similar > reasons, but there is a comment over that function explaining why it's > OK. > > We'll need to validate that the lease is always released before the > sc_count goes to 0 here too. At first glance, it doesn't look like that > is the case. No, it's not safe. I have to think of other ways to make sure the layout stateid is valid when the fence worker runs. -Dai > > ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 17:11 ` Chuck Lever 2026-01-27 18:14 ` Jeff Layton @ 2026-01-27 20:36 ` Dai Ngo 2026-01-27 21:53 ` Chuck Lever 1 sibling, 1 reply; 12+ messages in thread From: Dai Ngo @ 2026-01-27 20:36 UTC (permalink / raw) To: Chuck Lever, Chuck Lever, Jeff Layton, NeilBrown, Olga Kornievskaia, Tom Talpey, Christoph Hellwig, Alexander Aring, Alexander Viro, Christian Brauner, Jan Kara Cc: linux-fsdevel, linux-nfs On 1/27/26 9:11 AM, Chuck Lever wrote: > > On Mon, Jan 26, 2026, at 7:50 PM, Dai Ngo wrote: >> When a layout conflict triggers a recall, enforcing a timeout is >> necessary to prevent excessive nfsd threads from being blocked in >> __break_lease ensuring the server continues servicing incoming >> requests efficiently. >> >> This patch introduces a new function to lease_manager_operations: >> >> lm_breaker_timedout: Invoked when a lease recall times out and is >> about to be disposed of. This function enables the lease manager >> to inform the caller whether the file_lease should remain on the >> flc_list or be disposed of. >> >> For the NFSD lease manager, this function now handles layout recall >> timeouts. If the layout type supports fencing and the client has not >> been fenced, a fence operation is triggered to prevent the client >> from accessing the block device. >> >> While the fencing operation is in progress, the conflicting file_lease >> remains on the flc_list until fencing is complete. This guarantees >> that no other clients can access the file, and the client with >> exclusive access is properly blocked before disposal. 
>> >> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >> --- >> Documentation/filesystems/locking.rst | 2 + >> fs/locks.c | 10 +++- >> fs/nfsd/blocklayout.c | 38 ++++++------- >> fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- >> fs/nfsd/nfs4state.c | 1 + >> fs/nfsd/state.h | 6 ++ >> include/linux/filelock.h | 1 + >> 7 files changed, 110 insertions(+), 27 deletions(-) >> >> v2: >> . Update Subject line to include fencing operation. >> . Allow conflicting lease to remain on flc_list until fencing >> is complete. >> . Use system worker to perform fencing operation asynchronously. >> . Use nfs4_stid.sc_count to ensure layout stateid remains >> valid before starting the fencing operation, nfs4_stid.sc_count >> is released after fencing operation is complete. >> . Rework nfsd4_scsi_fence_client to: >> . wait until fencing to complete before exiting. >> . wait until fencing in progress to complete before >> checking the NFSD_MDS_PR_FENCED flag. >> . Remove lm_need_to_retry from lease_manager_operations. >> v3: >> . correct locking requirement in locking.rst. >> . add max retry count to fencing operation. >> . add missing nfs4_put_stid in nfsd4_layout_fence_worker. >> . remove special-casing of FL_LAYOUT in lease_modify. >> . remove lease_want_dispose. >> . move lm_breaker_timedout call to time_out_leases. >> v4: >> . only increment ls_fence_retry_cnt after successfully >> schedule new work in nfsd4_layout_lm_breaker_timedout. 
>> >> diff --git a/Documentation/filesystems/locking.rst >> b/Documentation/filesystems/locking.rst >> index 04c7691e50e0..a339491f02e4 100644 >> --- a/Documentation/filesystems/locking.rst >> +++ b/Documentation/filesystems/locking.rst >> @@ -403,6 +403,7 @@ prototypes:: >> bool (*lm_breaker_owns_lease)(struct file_lock *); >> bool (*lm_lock_expirable)(struct file_lock *); >> void (*lm_expire_lock)(void); >> + void (*lm_breaker_timedout)(struct file_lease *); >> >> locking rules: >> >> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no >> lm_lock_expirable yes no no >> lm_expire_lock no no yes >> lm_open_conflict yes no no >> +lm_breaker_timedout yes no no >> ====================== ============= ================= ========= >> >> buffer_head >> diff --git a/fs/locks.c b/fs/locks.c >> index 46f229f740c8..1b63aa704598 100644 >> --- a/fs/locks.c >> +++ b/fs/locks.c >> @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, >> struct list_head *dispose) >> { >> struct file_lock_context *ctx = inode->i_flctx; >> struct file_lease *fl, *tmp; >> + bool remove = true; >> >> lockdep_assert_held(&ctx->flc_lock); >> >> @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, >> struct list_head *dispose) >> trace_time_out_leases(inode, fl); >> if (past_time(fl->fl_downgrade_time)) >> lease_modify(fl, F_RDLCK, dispose); >> - if (past_time(fl->fl_break_time)) >> - lease_modify(fl, F_UNLCK, dispose); >> + >> + if (past_time(fl->fl_break_time)) { >> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) >> + remove = fl->fl_lmops->lm_breaker_timedout(fl); >> + if (remove) >> + lease_modify(fl, F_UNLCK, dispose); >> + } >> } >> } >> >> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c >> index 7ba9e2dd0875..69d3889df302 100644 >> --- a/fs/nfsd/blocklayout.c >> +++ b/fs/nfsd/blocklayout.c >> @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, >> struct svc_rqst *rqstp, >> return nfsd4_block_commit_blocks(inode, lcp, 
iomaps, nr_iomaps); >> } >> >> +/* >> + * Perform the fence operation to prevent the client from accessing the >> + * block device. If a fence operation is already in progress, wait for >> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the >> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, >> + * update the layout stateid by setting the ls_fenced flag to indicate >> + * that the client has been fenced. >> + */ >> static void >> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct >> nfsd_file *file) >> { >> @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct >> nfs4_layout_stateid *ls, struct nfsd_file *file) >> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; >> int status; >> >> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) >> + mutex_lock(&clp->cl_fence_mutex); >> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { >> + ls->ls_fenced = true; >> + mutex_unlock(&clp->cl_fence_mutex); >> + nfs4_put_stid(&ls->ls_stid); >> return; >> + } >> >> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, >> nfsd4_scsi_pr_key(clp), >> PR_EXCLUSIVE_ACCESS_REG_ONLY, true); >> - /* >> - * Reset to allow retry only when the command could not have >> - * reached the device. Negative status means a local error >> - * (e.g., -ENOMEM) prevented the command from being sent. >> - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and >> - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures >> - * before device delivery. >> - * >> - * For all other errors, the command may have reached the device >> - * and the preempt may have succeeded. Avoid resetting, since >> - * retrying a successful preempt returns PR_STS_IOERR or >> - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite >> - * retry loop. 
>> - */ >> - if (status < 0 || >> - status == PR_STS_PATH_FAILED || >> - status == PR_STS_PATH_FAST_FAILED || >> - status == PR_STS_RETRY_PATH_FAILURE) >> + if (status) >> nfsd4_scsi_fence_clear(clp, bdev->bd_dev); >> + else >> + ls->ls_fenced = true; > The removed code distinguishes retryable errors (such as path failures) > from permanent errors (PR_STS_IOERR, PR_STS_RESERVATION_CONFLICT), but > the new code clears the fence on any error, In the v2 and v3 patches, I left the comment and the code that distinguishes retryable errors intact. And you commented this: > In nfsd4_scsi_fence_client(), when a device error occurs (e.g., > PR_STS_IOERR), ls->ls_fenced is set even though the client may > still have storage access. So I'm not sure how to treat permanent errors now. Please advise. > potentially causing infinite > retry loops as warned in the removed comment. Infinite loops cannot happen due to the maximum retry count. > > Either the comment and error distinction should remain, As mentioned above, I can put the comment and code that distinguishes temporary error codes back. Then what do we need to do for the other permanent errors: PR_STS_IOERR, PR_STS_RESERVATION_CONFLICT? > or some rationale > for removing the distinction between temporary and permanent errors should > be added to the commit message.
> > >> + mutex_unlock(&clp->cl_fence_mutex); >> + nfs4_put_stid(&ls->ls_stid); >> >> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); >> } >> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c >> index ad7af8cfcf1f..1c498f3cd059 100644 >> --- a/fs/nfsd/nfs4layouts.c >> +++ b/fs/nfsd/nfs4layouts.c >> @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) >> return 0; >> } >> >> +static void >> +nfsd4_layout_fence_worker(struct work_struct *work) >> +{ >> + struct nfsd_file *nf; >> + struct delayed_work *dwork = to_delayed_work(work); >> + struct nfs4_layout_stateid *ls = container_of(dwork, >> + struct nfs4_layout_stateid, ls_fence_work); >> + u32 type; >> + >> + rcu_read_lock(); >> + nf = nfsd_file_get(ls->ls_file); >> + rcu_read_unlock(); >> + if (!nf) { >> + nfs4_put_stid(&ls->ls_stid); >> + return; >> + } >> + >> + type = ls->ls_layout_type; >> + if (nfsd4_layout_ops[type]->fence_client) >> + nfsd4_layout_ops[type]->fence_client(ls, nf); >> + nfsd_file_put(nf); >> +} >> + >> static struct nfs4_layout_stateid * >> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >> struct nfs4_stid *parent, u32 layout_type) >> @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct >> nfsd4_compound_state *cstate, >> list_add(&ls->ls_perfile, &fp->fi_lo_states); >> spin_unlock(&fp->fi_lock); >> >> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); >> + ls->ls_fenced = false; >> + ls->ls_fence_retry_cnt = 0; >> + >> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); >> return ls; >> } >> @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, >> struct rpc_task *task) >> rcu_read_unlock(); >> if (fl) { >> ops = nfsd4_layout_ops[ls->ls_layout_type]; >> - if (ops->fence_client) >> + if (ops->fence_client) { >> + refcount_inc(&ls->ls_stid.sc_count); >> ops->fence_client(ls, fl); >> - else >> + } else >> nfsd4_cb_layout_fail(ls, fl); >> nfsd_file_put(fl); >> } >> @@ -747,11 +775,9 @@ static bool 
>> nfsd4_layout_lm_break(struct file_lease *fl) >> { >> /* >> - * We don't want the locks code to timeout the lease for us; >> - * we'll remove it ourself if a layout isn't returned >> - * in time: >> + * Enforce break lease timeout to prevent NFSD >> + * thread from hanging in __break_lease. >> */ >> - fl->fl_break_time = 0; >> nfsd4_recall_file_layout(fl->c.flc_owner); >> return false; >> } >> @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, >> int arg) >> return 0; >> } >> >> +/** >> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. > Nit: Move the description of @fl here. Fix in v5. > >> + * If the layout type supports a fence operation, schedule a worker to >> + * fence the client from accessing the block device. >> + * >> + * @fl: file to check >> + * >> + * Return true if the file lease should be disposed of by the caller; >> + * otherwise, return false. >> + */ >> +static bool >> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) >> +{ >> + struct nfs4_layout_stateid *ls = fl->c.flc_owner; >> + bool ret; >> + >> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) >> + return true; >> + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) >> + return true; > Since these two variables are accessed while the fence mutex > is held in other places, they need similar protection here > to prevent races. You explained that the mutex cannot be > taken here, so some other form of serialization will be > needed to protect these fields. I do not think we need protection here to access ls_fenced and ls_fence_retry_cnt. For ls_fenced, it's a read operation from nfsd4_layout_lm_breaker_timedout. The value is either true or false, it does not matter. If it's true then the device was fenced. If it's not true then nfsd4_layout_lm_breaker_timedout will be called again from time_out_leases on next check. ls_fence_retry_cnt is incremented and checked only by nfsd4_layout_lm_breaker_timedout. 
> > If I'm reading this correctly, when the fence fails after all > 5 retries, the client retains block device access but server > believes recall succeeded ? What should we do if the maximum retries is reached, perhaps a warning message in the system log for the admin to take action? > > >> + >> + if (work_busy(&ls->ls_fence_work.work)) >> + return false; >> + /* Schedule work to do the fence operation */ >> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >> + if (!ret) { >> + /* >> + * If there is no pending work, mod_delayed_work queues >> + * new task. While fencing is in progress, a reference >> + * count is added to the layout stateid to ensure its >> + * validity. This reference count is released once fencing >> + * has been completed. >> + */ >> + refcount_inc(&ls->ls_stid.sc_count); >> + ++ls->ls_fence_retry_cnt; >> + return true; >> + } > Incrementing after the mod_delayed_work() call is racy. Instead: > > refcount_inc(&ls->ls_stid.sc_count); > ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); > if (ret) { > refcount_dec(&ls->ls_stid.sc_count); > .... > } Yes, it's racy, I thought about this. But we cannot just do a refcount_dec here; we need to do nfs4_put_stid. But we cannot do nfs4_put_stid here since nfsd4_layout_lm_breaker_timedout runs under the spin_lock flc_lock and nfs4_put_stid might cause the flc_lock to be acquired again from generic_delete_lease if this is the last reference count on the stid. We might need to add the reference count when the file_lease is inserted on the flc_list and decrement when it's removed from the flc_list. I need to think about this more. 
> > >> + return false; >> +} >> + >> static const struct lease_manager_operations nfsd4_layouts_lm_ops = { >> .lm_break = nfsd4_layout_lm_break, >> .lm_change = nfsd4_layout_lm_change, >> .lm_open_conflict = nfsd4_layout_lm_open_conflict, >> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, >> }; >> >> int >> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >> index 583c13b5aaf3..a57fa3318362 100644 >> --- a/fs/nfsd/nfs4state.c >> +++ b/fs/nfsd/nfs4state.c >> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct >> xdr_netobj name, >> #endif >> #ifdef CONFIG_NFSD_SCSILAYOUT >> xa_init(&clp->cl_dev_fences); >> + mutex_init(&clp->cl_fence_mutex); >> #endif >> INIT_LIST_HEAD(&clp->async_copies); >> spin_lock_init(&clp->async_lock); >> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h >> index 713f55ef6554..57e54dfb406c 100644 >> --- a/fs/nfsd/state.h >> +++ b/fs/nfsd/state.h >> @@ -529,6 +529,7 @@ struct nfs4_client { >> time64_t cl_ra_time; >> #ifdef CONFIG_NFSD_SCSILAYOUT >> struct xarray cl_dev_fences; >> + struct mutex cl_fence_mutex; >> #endif >> }; >> >> @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { >> stateid_t ls_recall_sid; >> bool ls_recalled; >> struct mutex ls_mutex; >> + struct delayed_work ls_fence_work; > Still missing cancel_delayed_work_sync() when freeing the > layout stateid. I don't think the layout stateid can be freed while a fence worker is scheduled, due to the reference count added to the stid. > > >> + bool ls_fenced; >> + int ls_fence_retry_cnt; >> }; >> >> +#define LO_MAX_FENCE_RETRY 5 > The value of 5 needs some justification here in a comment. > Actually, 5 might be too low, but I can't really tell. At the minimum, each retry happens after 1 sec, and it can be more depending on what entry is at the front of the flc_list. So if retrying for 5 seconds (minimum) is too low, then would 20 retries be sufficient? 
-Dai > > >> + >> static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) >> { >> return container_of(s, struct nfs4_layout_stateid, ls_stid); >> diff --git a/include/linux/filelock.h b/include/linux/filelock.h >> index 2f5e5588ee07..13b9c9f04589 100644 >> --- a/include/linux/filelock.h >> +++ b/include/linux/filelock.h >> @@ -50,6 +50,7 @@ struct lease_manager_operations { >> void (*lm_setup)(struct file_lease *, void **); >> bool (*lm_breaker_owns_lease)(struct file_lease *); >> int (*lm_open_conflict)(struct file *, int); >> + bool (*lm_breaker_timedout)(struct file_lease *fl); >> }; >> >> struct lock_manager { >> -- >> 2.47.3 ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 20:36 ` Dai Ngo @ 2026-01-27 21:53 ` Chuck Lever 2026-01-28 0:50 ` Dai Ngo 0 siblings, 1 reply; 12+ messages in thread From: Chuck Lever @ 2026-01-27 21:53 UTC (permalink / raw) To: Dai Ngo, Chuck Lever, Jeff Layton, NeilBrown, Olga Kornievskaia, Tom Talpey, Christoph Hellwig, Alexander Aring, Alexander Viro, Christian Brauner, Jan Kara Cc: linux-fsdevel, linux-nfs On 1/27/26 3:36 PM, Dai Ngo wrote: > > On 1/27/26 9:11 AM, Chuck Lever wrote: >> >> On Mon, Jan 26, 2026, at 7:50 PM, Dai Ngo wrote: >>> When a layout conflict triggers a recall, enforcing a timeout is >>> necessary to prevent excessive nfsd threads from being blocked in >>> __break_lease ensuring the server continues servicing incoming >>> requests efficiently. >>> >>> This patch introduces a new function to lease_manager_operations: >>> >>> lm_breaker_timedout: Invoked when a lease recall times out and is >>> about to be disposed of. This function enables the lease manager >>> to inform the caller whether the file_lease should remain on the >>> flc_list or be disposed of. >>> >>> For the NFSD lease manager, this function now handles layout recall >>> timeouts. If the layout type supports fencing and the client has not >>> been fenced, a fence operation is triggered to prevent the client >>> from accessing the block device. >>> >>> While the fencing operation is in progress, the conflicting file_lease >>> remains on the flc_list until fencing is complete. This guarantees >>> that no other clients can access the file, and the client with >>> exclusive access is properly blocked before disposal. 
>>> >>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >>> --- >>> Documentation/filesystems/locking.rst | 2 + >>> fs/locks.c | 10 +++- >>> fs/nfsd/blocklayout.c | 38 ++++++------- >>> fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- >>> fs/nfsd/nfs4state.c | 1 + >>> fs/nfsd/state.h | 6 ++ >>> include/linux/filelock.h | 1 + >>> 7 files changed, 110 insertions(+), 27 deletions(-) >>> >>> v2: >>> . Update Subject line to include fencing operation. >>> . Allow conflicting lease to remain on flc_list until fencing >>> is complete. >>> . Use system worker to perform fencing operation asynchronously. >>> . Use nfs4_stid.sc_count to ensure layout stateid remains >>> valid before starting the fencing operation, nfs4_stid.sc_count >>> is released after fencing operation is complete. >>> . Rework nfsd4_scsi_fence_client to: >>> . wait until fencing to complete before exiting. >>> . wait until fencing in progress to complete before >>> checking the NFSD_MDS_PR_FENCED flag. >>> . Remove lm_need_to_retry from lease_manager_operations. >>> v3: >>> . correct locking requirement in locking.rst. >>> . add max retry count to fencing operation. >>> . add missing nfs4_put_stid in nfsd4_layout_fence_worker. >>> . remove special-casing of FL_LAYOUT in lease_modify. >>> . remove lease_want_dispose. >>> . move lm_breaker_timedout call to time_out_leases. >>> v4: >>> . only increment ls_fence_retry_cnt after successfully >>> schedule new work in nfsd4_layout_lm_breaker_timedout. 
>>> >>> diff --git a/Documentation/filesystems/locking.rst >>> b/Documentation/filesystems/locking.rst >>> index 04c7691e50e0..a339491f02e4 100644 >>> --- a/Documentation/filesystems/locking.rst >>> +++ b/Documentation/filesystems/locking.rst >>> @@ -403,6 +403,7 @@ prototypes:: >>> bool (*lm_breaker_owns_lease)(struct file_lock *); >>> bool (*lm_lock_expirable)(struct file_lock *); >>> void (*lm_expire_lock)(void); >>> + void (*lm_breaker_timedout)(struct file_lease *); >>> >>> locking rules: >>> >>> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes >>> no no >>> lm_lock_expirable yes no no >>> lm_expire_lock no no yes >>> lm_open_conflict yes no no >>> +lm_breaker_timedout yes no no >>> ====================== ============= ================= >>> ========= >>> >>> buffer_head >>> diff --git a/fs/locks.c b/fs/locks.c >>> index 46f229f740c8..1b63aa704598 100644 >>> --- a/fs/locks.c >>> +++ b/fs/locks.c >>> @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, >>> struct list_head *dispose) >>> { >>> struct file_lock_context *ctx = inode->i_flctx; >>> struct file_lease *fl, *tmp; >>> + bool remove = true; >>> >>> lockdep_assert_held(&ctx->flc_lock); >>> >>> @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, >>> struct list_head *dispose) >>> trace_time_out_leases(inode, fl); >>> if (past_time(fl->fl_downgrade_time)) >>> lease_modify(fl, F_RDLCK, dispose); >>> - if (past_time(fl->fl_break_time)) >>> - lease_modify(fl, F_UNLCK, dispose); >>> + >>> + if (past_time(fl->fl_break_time)) { >>> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) >>> + remove = fl->fl_lmops->lm_breaker_timedout(fl); >>> + if (remove) >>> + lease_modify(fl, F_UNLCK, dispose); >>> + } >>> } >>> } >>> >>> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c >>> index 7ba9e2dd0875..69d3889df302 100644 >>> --- a/fs/nfsd/blocklayout.c >>> +++ b/fs/nfsd/blocklayout.c >>> @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, >>> struct 
svc_rqst *rqstp, >>> return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); >>> } >>> >>> +/* >>> + * Perform the fence operation to prevent the client from accessing the >>> + * block device. If a fence operation is already in progress, wait for >>> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the >>> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, >>> + * update the layout stateid by setting the ls_fenced flag to indicate >>> + * that the client has been fenced. >>> + */ >>> static void >>> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct >>> nfsd_file *file) >>> { >>> @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct >>> nfs4_layout_stateid *ls, struct nfsd_file *file) >>> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb- >>> >s_bdev; >>> int status; >>> >>> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) >>> + mutex_lock(&clp->cl_fence_mutex); >>> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { >>> + ls->ls_fenced = true; >>> + mutex_unlock(&clp->cl_fence_mutex); >>> + nfs4_put_stid(&ls->ls_stid); >>> return; >>> + } >>> >>> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, >>> NFSD_MDS_PR_KEY, >>> nfsd4_scsi_pr_key(clp), >>> PR_EXCLUSIVE_ACCESS_REG_ONLY, true); >>> - /* >>> - * Reset to allow retry only when the command could not have >>> - * reached the device. Negative status means a local error >>> - * (e.g., -ENOMEM) prevented the command from being sent. >>> - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and >>> - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures >>> - * before device delivery. >>> - * >>> - * For all other errors, the command may have reached the device >>> - * and the preempt may have succeeded. Avoid resetting, since >>> - * retrying a successful preempt returns PR_STS_IOERR or >>> - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite >>> - * retry loop. 
>>> - */ >>> - if (status < 0 || >>> - status == PR_STS_PATH_FAILED || >>> - status == PR_STS_PATH_FAST_FAILED || >>> - status == PR_STS_RETRY_PATH_FAILURE) >>> + if (status) >>> nfsd4_scsi_fence_clear(clp, bdev->bd_dev); >>> + else >>> + ls->ls_fenced = true; >> The removed code distinguishes retryable errors (such as path failures) >> from permanent errors (PR_STS_IOERR, PR_STS_RESERVATION_CONFLICT), but >> the new code clears the fence on any error, > > In v2 and v3 patch, I left the comment and the code distinguishes retryable > errors intact. And you commented this: > >> In nfsd4_scsi_fence_client(), when a device error occurs (e.g., >> PR_STS_IOERR), ls->ls_fenced is set even though the client may >> still have storage access. > > So I'm not sure how to treat permanent errors now. Please advise. The comment is important and explains some of the subtleties. I think that needs to be preserved even if we decide to set ls_fenced in every error case. If the correct behavior now is to leave ls_fenced clear in all error cases, then something in the patch needs to provide the rationale for making this code change and for helping us remember the design when it needs to be altered or fixed later. The rationale is different for permanent errors and temporary errors, I would think. >> potentially causing infinite >> retry loops as warned in the removed comment. > > Infinite loops can not happen due to the maximum retry count. Fine, but with the new retry escape clause, as mentioned below, the server and client can be left in opposite states, it looks like: One believes the fence succeeded, and one believes it still has full device access. Am I incorrect about that? This is the bane of every timeout-based scheme I know about, and why we have the "hard" mount option. >> Either the comment and error distinction should remain, > > As mentioned above, I can put the comment and code that distinguishes > temporarily error codes back. 
Then what do we need to do for other > permanent errors; PR_STS_IOERR, PR_STS_RESERVATION_CONFLICT? > >> or some rationale >> for removing the distinction between temporary and permanent errors >> should >> be added to the commit message. >> >> >>> + mutex_unlock(&clp->cl_fence_mutex); >>> + nfs4_put_stid(&ls->ls_stid); >>> >>> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); >>> } >>> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c >>> index ad7af8cfcf1f..1c498f3cd059 100644 >>> --- a/fs/nfsd/nfs4layouts.c >>> +++ b/fs/nfsd/nfs4layouts.c >>> @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid >>> *ls) >>> return 0; >>> } >>> >>> +static void >>> +nfsd4_layout_fence_worker(struct work_struct *work) >>> +{ >>> + struct nfsd_file *nf; >>> + struct delayed_work *dwork = to_delayed_work(work); >>> + struct nfs4_layout_stateid *ls = container_of(dwork, >>> + struct nfs4_layout_stateid, ls_fence_work); >>> + u32 type; >>> + >>> + rcu_read_lock(); >>> + nf = nfsd_file_get(ls->ls_file); >>> + rcu_read_unlock(); >>> + if (!nf) { >>> + nfs4_put_stid(&ls->ls_stid); >>> + return; >>> + } >>> + >>> + type = ls->ls_layout_type; >>> + if (nfsd4_layout_ops[type]->fence_client) >>> + nfsd4_layout_ops[type]->fence_client(ls, nf); >>> + nfsd_file_put(nf); >>> +} >>> + >>> static struct nfs4_layout_stateid * >>> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >>> struct nfs4_stid *parent, u32 layout_type) >>> @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct >>> nfsd4_compound_state *cstate, >>> list_add(&ls->ls_perfile, &fp->fi_lo_states); >>> spin_unlock(&fp->fi_lock); >>> >>> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); >>> + ls->ls_fenced = false; >>> + ls->ls_fence_retry_cnt = 0; >>> + >>> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); >>> return ls; >>> } >>> @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, >>> struct rpc_task *task) >>> rcu_read_unlock(); >>> if 
(fl) { >>> ops = nfsd4_layout_ops[ls->ls_layout_type]; >>> - if (ops->fence_client) >>> + if (ops->fence_client) { >>> + refcount_inc(&ls->ls_stid.sc_count); >>> ops->fence_client(ls, fl); >>> - else >>> + } else >>> nfsd4_cb_layout_fail(ls, fl); >>> nfsd_file_put(fl); >>> } >>> @@ -747,11 +775,9 @@ static bool >>> nfsd4_layout_lm_break(struct file_lease *fl) >>> { >>> /* >>> - * We don't want the locks code to timeout the lease for us; >>> - * we'll remove it ourself if a layout isn't returned >>> - * in time: >>> + * Enforce break lease timeout to prevent NFSD >>> + * thread from hanging in __break_lease. >>> */ >>> - fl->fl_break_time = 0; >>> nfsd4_recall_file_layout(fl->c.flc_owner); >>> return false; >>> } >>> @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, >>> int arg) >>> return 0; >>> } >>> >>> +/** >>> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. >> Nit: Move the description of @fl here. > > Fix in v5. > >> >>> + * If the layout type supports a fence operation, schedule a worker to >>> + * fence the client from accessing the block device. >>> + * >>> + * @fl: file to check >>> + * >>> + * Return true if the file lease should be disposed of by the caller; >>> + * otherwise, return false. >>> + */ >>> +static bool >>> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) >>> +{ >>> + struct nfs4_layout_stateid *ls = fl->c.flc_owner; >>> + bool ret; >>> + >>> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) >>> + return true; >>> + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) >>> + return true; >> Since these two variables are accessed while the fence mutex >> is held in other places, they need similar protection here >> to prevent races. You explained that the mutex cannot be >> taken here, so some other form of serialization will be >> needed to protect these fields. > > I do not think we need protection here to access ls_fenced and > ls_fence_retry_cnt. 
For ls_fenced, it's a read operation from > nfsd4_layout_lm_breaker_timedout. The value is either true or > false, it does not matter. If it's true then the device was > fenced. If it's not true then nfsd4_layout_lm_breaker_timedout > will be called again from time_out_leases on next check. Fair enough for ls_fence_retry_cnt, but are you sure there isn't a TOCTOU issue here for ls_fenced ? Isn't there a race possible with the server receiving the client's callback while lm_breaker_timeout is running? > ls_fence_retry_cnt is incremented and checked only by > nfsd4_layout_lm_breaker_timedout. Then the kdoc comment could be clearer that only one copy of the callback is allowed to run concurrently. Something like "Caller serializes". >> If I'm reading this correctly, when the fence fails after all >> 5 retries, the client retains block device access but server >> believes recall succeeded ? > > What should we do if the maximum retries is reached, perhaps a > warning message in the system log for the admin to take action? A warning doesn't seem like it would prevent data corruption. (I'm not against a warning, in and of itself... it just does not seem to me to be sufficient). I would really like this mechanism to be more data-safe. It needs to deal properly with network partitions and other problems. I don't have any magic answer yet, sorry! >>> + >>> + if (work_busy(&ls->ls_fence_work.work)) >>> + return false; >>> + /* Schedule work to do the fence operation */ >>> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >>> + if (!ret) { >>> + /* >>> + * If there is no pending work, mod_delayed_work queues >>> + * new task. While fencing is in progress, a reference >>> + * count is added to the layout stateid to ensure its >>> + * validity. This reference count is released once fencing >>> + * has been completed. 
>>> + */ >>> + refcount_inc(&ls->ls_stid.sc_count); >>> + ++ls->ls_fence_retry_cnt; >>> + return true; >>> + } >> Incrementing after the mod_delayed_work() call is racy. Instead: >> >> refcount_inc(&ls->ls_stid.sc_count); >> ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >> if (ret) { >> refcount_dec(&ls->ls_stid.sc_count); >> .... >> } > > Yes, it's racy, I thought about this. But we can not just do a > refcount_dec here, we need to do nfs4_put_stid. But we can not > do nfs4_put_stid here since nfsd4_layout_lm_breaker_timedout runs > under the spin_lock flc_lock and nfs4_put_stid might causes the > flc_lock to be acquired again from generic_delete_lease if this > is the last reference count on stid. > > We might need to add the reference count when the file_lease is > inserted on the flc_list and decrement when it's removed from the > flc_list. I need to think about this more. If I may, why is flc_lock being held? That seems to be the root of several implementation problems. 
>>> + return false; >>> +} >>> + >>> static const struct lease_manager_operations nfsd4_layouts_lm_ops = { >>> .lm_break = nfsd4_layout_lm_break, >>> .lm_change = nfsd4_layout_lm_change, >>> .lm_open_conflict = nfsd4_layout_lm_open_conflict, >>> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, >>> }; >>> >>> int >>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >>> index 583c13b5aaf3..a57fa3318362 100644 >>> --- a/fs/nfsd/nfs4state.c >>> +++ b/fs/nfsd/nfs4state.c >>> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct >>> xdr_netobj name, >>> #endif >>> #ifdef CONFIG_NFSD_SCSILAYOUT >>> xa_init(&clp->cl_dev_fences); >>> + mutex_init(&clp->cl_fence_mutex); >>> #endif >>> INIT_LIST_HEAD(&clp->async_copies); >>> spin_lock_init(&clp->async_lock); >>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h >>> index 713f55ef6554..57e54dfb406c 100644 >>> --- a/fs/nfsd/state.h >>> +++ b/fs/nfsd/state.h >>> @@ -529,6 +529,7 @@ struct nfs4_client { >>> time64_t cl_ra_time; >>> #ifdef CONFIG_NFSD_SCSILAYOUT >>> struct xarray cl_dev_fences; >>> + struct mutex cl_fence_mutex; >>> #endif >>> }; >>> >>> @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { >>> stateid_t ls_recall_sid; >>> bool ls_recalled; >>> struct mutex ls_mutex; >>> + struct delayed_work ls_fence_work; >> Still missing cancel_delayed_work_sync() when freeing the >> layout stateid. > > I don't think the layout stateid can be freed while a fence > worker was scheduled due to the added reference count to the > stid. Then add a comment to that effect, or if possible, some kind of assertion, in nfsd4_free_layout_stateid(). >>> + bool ls_fenced; >>> + int ls_fence_retry_cnt; >>> }; >>> >>> +#define LO_MAX_FENCE_RETRY 5 >> The value of 5 needs some justification here in a comment. >> Actually, 5 might be too low, but I can't really tell. > > At the minimal each retry happens after 1 sec, and it can be > more depending what entry is at the front of flc_list. 
So if > retry for 5 seconds (minimum) is too low then is 20 retries is > sufficient? As I said above: here's why we have the "hard" mount option for NFS. It might be impossible to choose a safe maximum retry count and still guarantee that the server and clients will eventually agree on the device reservation state. In that case we will need to consider some other mechanism for detecting and preventing infinite PR retry loops. Maybe the retry maximum could depend on the lease expiry time? Just remember, I'm thinking about this and learning as I go along, just like you are. I know I'm possibly contradicting myself here. -- Chuck Lever ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 21:53 ` Chuck Lever @ 2026-01-28 0:50 ` Dai Ngo 0 siblings, 0 replies; 12+ messages in thread From: Dai Ngo @ 2026-01-28 0:50 UTC (permalink / raw) To: Chuck Lever, Chuck Lever, Jeff Layton, NeilBrown, Olga Kornievskaia, Tom Talpey, Christoph Hellwig, Alexander Aring, Alexander Viro, Christian Brauner, Jan Kara Cc: linux-fsdevel, linux-nfs On 1/27/26 1:53 PM, Chuck Lever wrote: > On 1/27/26 3:36 PM, Dai Ngo wrote: >> On 1/27/26 9:11 AM, Chuck Lever wrote: >>> On Mon, Jan 26, 2026, at 7:50 PM, Dai Ngo wrote: >>>> When a layout conflict triggers a recall, enforcing a timeout is >>>> necessary to prevent excessive nfsd threads from being blocked in >>>> __break_lease ensuring the server continues servicing incoming >>>> requests efficiently. >>>> >>>> This patch introduces a new function to lease_manager_operations: >>>> >>>> lm_breaker_timedout: Invoked when a lease recall times out and is >>>> about to be disposed of. This function enables the lease manager >>>> to inform the caller whether the file_lease should remain on the >>>> flc_list or be disposed of. >>>> >>>> For the NFSD lease manager, this function now handles layout recall >>>> timeouts. If the layout type supports fencing and the client has not >>>> been fenced, a fence operation is triggered to prevent the client >>>> from accessing the block device. >>>> >>>> While the fencing operation is in progress, the conflicting file_lease >>>> remains on the flc_list until fencing is complete. This guarantees >>>> that no other clients can access the file, and the client with >>>> exclusive access is properly blocked before disposal. 
>>>> >>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >>>> --- >>>> Documentation/filesystems/locking.rst | 2 + >>>> fs/locks.c | 10 +++- >>>> fs/nfsd/blocklayout.c | 38 ++++++------- >>>> fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- >>>> fs/nfsd/nfs4state.c | 1 + >>>> fs/nfsd/state.h | 6 ++ >>>> include/linux/filelock.h | 1 + >>>> 7 files changed, 110 insertions(+), 27 deletions(-) >>>> >>>> v2: >>>> . Update Subject line to include fencing operation. >>>> . Allow conflicting lease to remain on flc_list until fencing >>>> is complete. >>>> . Use system worker to perform fencing operation asynchronously. >>>> . Use nfs4_stid.sc_count to ensure layout stateid remains >>>> valid before starting the fencing operation, nfs4_stid.sc_count >>>> is released after fencing operation is complete. >>>> . Rework nfsd4_scsi_fence_client to: >>>> . wait until fencing to complete before exiting. >>>> . wait until fencing in progress to complete before >>>> checking the NFSD_MDS_PR_FENCED flag. >>>> . Remove lm_need_to_retry from lease_manager_operations. >>>> v3: >>>> . correct locking requirement in locking.rst. >>>> . add max retry count to fencing operation. >>>> . add missing nfs4_put_stid in nfsd4_layout_fence_worker. >>>> . remove special-casing of FL_LAYOUT in lease_modify. >>>> . remove lease_want_dispose. >>>> . move lm_breaker_timedout call to time_out_leases. >>>> v4: >>>> . only increment ls_fence_retry_cnt after successfully >>>> schedule new work in nfsd4_layout_lm_breaker_timedout. 
>>>> >>>> diff --git a/Documentation/filesystems/locking.rst >>>> b/Documentation/filesystems/locking.rst >>>> index 04c7691e50e0..a339491f02e4 100644 >>>> --- a/Documentation/filesystems/locking.rst >>>> +++ b/Documentation/filesystems/locking.rst >>>> @@ -403,6 +403,7 @@ prototypes:: >>>> bool (*lm_breaker_owns_lease)(struct file_lock *); >>>> bool (*lm_lock_expirable)(struct file_lock *); >>>> void (*lm_expire_lock)(void); >>>> + void (*lm_breaker_timedout)(struct file_lease *); >>>> >>>> locking rules: >>>> >>>> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes >>>> no no >>>> lm_lock_expirable yes no no >>>> lm_expire_lock no no yes >>>> lm_open_conflict yes no no >>>> +lm_breaker_timedout yes no no >>>> ====================== ============= ================= >>>> ========= >>>> >>>> buffer_head >>>> diff --git a/fs/locks.c b/fs/locks.c >>>> index 46f229f740c8..1b63aa704598 100644 >>>> --- a/fs/locks.c >>>> +++ b/fs/locks.c >>>> @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, >>>> struct list_head *dispose) >>>> { >>>> struct file_lock_context *ctx = inode->i_flctx; >>>> struct file_lease *fl, *tmp; >>>> + bool remove = true; >>>> >>>> lockdep_assert_held(&ctx->flc_lock); >>>> >>>> @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, >>>> struct list_head *dispose) >>>> trace_time_out_leases(inode, fl); >>>> if (past_time(fl->fl_downgrade_time)) >>>> lease_modify(fl, F_RDLCK, dispose); >>>> - if (past_time(fl->fl_break_time)) >>>> - lease_modify(fl, F_UNLCK, dispose); >>>> + >>>> + if (past_time(fl->fl_break_time)) { >>>> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) >>>> + remove = fl->fl_lmops->lm_breaker_timedout(fl); >>>> + if (remove) >>>> + lease_modify(fl, F_UNLCK, dispose); >>>> + } >>>> } >>>> } >>>> >>>> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c >>>> index 7ba9e2dd0875..69d3889df302 100644 >>>> --- a/fs/nfsd/blocklayout.c >>>> +++ b/fs/nfsd/blocklayout.c >>>> @@ -443,6 +443,14 @@ 
nfsd4_scsi_proc_layoutcommit(struct inode *inode, >>>> struct svc_rqst *rqstp, >>>> return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); >>>> } >>>> >>>> +/* >>>> + * Perform the fence operation to prevent the client from accessing the >>>> + * block device. If a fence operation is already in progress, wait for >>>> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the >>>> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, >>>> + * update the layout stateid by setting the ls_fenced flag to indicate >>>> + * that the client has been fenced. >>>> + */ >>>> static void >>>> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct >>>> nfsd_file *file) >>>> { >>>> @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct >>>> nfs4_layout_stateid *ls, struct nfsd_file *file) >>>> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb- >>>>> s_bdev; >>>> int status; >>>> >>>> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) >>>> + mutex_lock(&clp->cl_fence_mutex); >>>> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { >>>> + ls->ls_fenced = true; >>>> + mutex_unlock(&clp->cl_fence_mutex); >>>> + nfs4_put_stid(&ls->ls_stid); >>>> return; >>>> + } >>>> >>>> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, >>>> NFSD_MDS_PR_KEY, >>>> nfsd4_scsi_pr_key(clp), >>>> PR_EXCLUSIVE_ACCESS_REG_ONLY, true); >>>> - /* >>>> - * Reset to allow retry only when the command could not have >>>> - * reached the device. Negative status means a local error >>>> - * (e.g., -ENOMEM) prevented the command from being sent. >>>> - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and >>>> - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures >>>> - * before device delivery. >>>> - * >>>> - * For all other errors, the command may have reached the device >>>> - * and the preempt may have succeeded. 
Avoid resetting, since >>>> - * retrying a successful preempt returns PR_STS_IOERR or >>>> - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite >>>> - * retry loop. >>>> - */ >>>> - if (status < 0 || >>>> - status == PR_STS_PATH_FAILED || >>>> - status == PR_STS_PATH_FAST_FAILED || >>>> - status == PR_STS_RETRY_PATH_FAILURE) >>>> + if (status) >>>> nfsd4_scsi_fence_clear(clp, bdev->bd_dev); >>>> + else >>>> + ls->ls_fenced = true; >>> The removed code distinguishes retryable errors (such as path failures) >>> from permanent errors (PR_STS_IOERR, PR_STS_RESERVATION_CONFLICT), but >>> the new code clears the fence on any error, >> In v2 and v3 patch, I left the comment and the code distinguishes retryable >> errors intact. And you commented this: >> >>> In nfsd4_scsi_fence_client(), when a device error occurs (e.g., >>> PR_STS_IOERR), ls->ls_fenced is set even though the client may >>> still have storage access. >> So I'm not sure how to treat permanent errors now. Please advise. > The comment is important and explains some of the subtleties. I think > that needs to be preserved even if we decide to set ls_fenced in every > error case. > > If the correct behavior now is to leave ls_fenced clear in all error > cases, then something in the patch needs to provide the rationale for > making this code change and for helping us remember the design when it > needs to be altered or fixed later. > > The rationale is different for permanent errors and temporary errors, > I would think. I restored the previous comment and treat PR_STS_IOERR and PR_STS_RESERVATION_CONFLICT as non-error case. The rest of the errors, temporarily and other errors; ENOMEM, as retryable error. > > >>> potentially causing infinite >>> retry loops as warned in the removed comment. >> Infinite loops can not happen due to the maximum retry count. 
> Fine, but with the new retry escape clause, as mentioned below, the > server and client can be left in opposite states, it looks like: One > believes the fence succeeded, and one believes it still has full device > access. Am I incorrect about that? You're right, the server and client can be left in opposite states if we still can not fence the client after so many retries. If we want to be safe, then pr_warn the admin and retry forever, which effectively prevents all other clients from accessing the conflicting file until it's manually resolved by the admin. > > This is the bane of every timeout-based scheme I know about, and why we > have the "hard" mount option. > > >>> Either the comment and error distinction should remain, >> As mentioned above, I can put the comment and code that distinguishes >> temporarily error codes back. Then what do we need to do for other >> permanent errors; PR_STS_IOERR, PR_STS_RESERVATION_CONFLICT? >> >>> or some rationale >>> for removing the distinction between temporary and permanent errors >>> should >>> be added to the commit message.
>>> >>> >>>> + mutex_unlock(&clp->cl_fence_mutex); >>>> + nfs4_put_stid(&ls->ls_stid); >>>> >>>> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); >>>> } >>>> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c >>>> index ad7af8cfcf1f..1c498f3cd059 100644 >>>> --- a/fs/nfsd/nfs4layouts.c >>>> +++ b/fs/nfsd/nfs4layouts.c >>>> @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid >>>> *ls) >>>> return 0; >>>> } >>>> >>>> +static void >>>> +nfsd4_layout_fence_worker(struct work_struct *work) >>>> +{ >>>> + struct nfsd_file *nf; >>>> + struct delayed_work *dwork = to_delayed_work(work); >>>> + struct nfs4_layout_stateid *ls = container_of(dwork, >>>> + struct nfs4_layout_stateid, ls_fence_work); >>>> + u32 type; >>>> + >>>> + rcu_read_lock(); >>>> + nf = nfsd_file_get(ls->ls_file); >>>> + rcu_read_unlock(); >>>> + if (!nf) { >>>> + nfs4_put_stid(&ls->ls_stid); >>>> + return; >>>> + } >>>> + >>>> + type = ls->ls_layout_type; >>>> + if (nfsd4_layout_ops[type]->fence_client) >>>> + nfsd4_layout_ops[type]->fence_client(ls, nf); >>>> + nfsd_file_put(nf); >>>> +} >>>> + >>>> static struct nfs4_layout_stateid * >>>> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >>>> struct nfs4_stid *parent, u32 layout_type) >>>> @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct >>>> nfsd4_compound_state *cstate, >>>> list_add(&ls->ls_perfile, &fp->fi_lo_states); >>>> spin_unlock(&fp->fi_lock); >>>> >>>> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); >>>> + ls->ls_fenced = false; >>>> + ls->ls_fence_retry_cnt = 0; >>>> + >>>> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); >>>> return ls; >>>> } >>>> @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, >>>> struct rpc_task *task) >>>> rcu_read_unlock(); >>>> if (fl) { >>>> ops = nfsd4_layout_ops[ls->ls_layout_type]; >>>> - if (ops->fence_client) >>>> + if (ops->fence_client) { >>>> + refcount_inc(&ls->ls_stid.sc_count); >>>> 
ops->fence_client(ls, fl); >>>> - else >>>> + } else >>>> nfsd4_cb_layout_fail(ls, fl); >>>> nfsd_file_put(fl); >>>> } >>>> @@ -747,11 +775,9 @@ static bool >>>> nfsd4_layout_lm_break(struct file_lease *fl) >>>> { >>>> /* >>>> - * We don't want the locks code to timeout the lease for us; >>>> - * we'll remove it ourself if a layout isn't returned >>>> - * in time: >>>> + * Enforce break lease timeout to prevent NFSD >>>> + * thread from hanging in __break_lease. >>>> */ >>>> - fl->fl_break_time = 0; >>>> nfsd4_recall_file_layout(fl->c.flc_owner); >>>> return false; >>>> } >>>> @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, >>>> int arg) >>>> return 0; >>>> } >>>> >>>> +/** >>>> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. >>> Nit: Move the description of @fl here. >> Fix in v5. >> >>>> + * If the layout type supports a fence operation, schedule a worker to >>>> + * fence the client from accessing the block device. >>>> + * >>>> + * @fl: file to check >>>> + * >>>> + * Return true if the file lease should be disposed of by the caller; >>>> + * otherwise, return false. >>>> + */ >>>> +static bool >>>> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) >>>> +{ >>>> + struct nfs4_layout_stateid *ls = fl->c.flc_owner; >>>> + bool ret; >>>> + >>>> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) >>>> + return true; >>>> + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) >>>> + return true; >>> Since these two variables are accessed while the fence mutex >>> is held in other places, they need similar protection here >>> to prevent races. You explained that the mutex cannot be >>> taken here, so some other form of serialization will be >>> needed to protect these fields. >> I do not think we need protection here to access ls_fenced and >> ls_fence_retry_cnt. For ls_fenced, it's a read operation from >> nfsd4_layout_lm_breaker_timedout. 
The value is either true or >> false, it does not matter. If it's true then the device was >> fenced. If it's not true then nfsd4_layout_lm_breaker_timedout >> will be called again from time_out_leases on next check. > Fair enough for ls_fence_retry_cnt, but are you sure there isn't a > TOCTOU issue here for ls_fenced ? Isn't there a race possible with > the server receiving the client's callback while lm_breaker_timeout > is running? I will rework this logic to prevent a race with LAYOUTRETURN and also to allow nfsd4_layout_lm_breaker_timedout to release the reference count safely. > > >> ls_fence_retry_cnt is incremented and checked only by >> nfsd4_layout_lm_breaker_timedout. > Then the kdoc comment could be clearer that only one copy of the > callback is allowed to run concurrently. Something like "Caller > serializes". > > >>> If I'm reading this correctly, when the fence fails after all >>> 5 retries, the client retains block device access but server >>> believes recall succeeded ? >> What should we do if the maximum retries is reached, perhaps a >> warning message in the system log for the admin to take action? > A warning doesn't seem like it would prevent data corruption. (I'm > not against a warning, in and of itself... it just does not seem to > me to be sufficient). > > I would really like this mechanism to be more data-safe. It needs to > deal properly with network partitions and other problems. I don't have > any magic answer yet, sorry! I propose issuing a pr_warn to alert the admin and retrying forever for now until we can find a better option. I expect this to be a very rare condition, so it should not affect most environments. 
While fencing is in progress, a reference >>>> + * count is added to the layout stateid to ensure its >>>> + * validity. This reference count is released once fencing >>>> + * has been completed. >>>> + */ >>>> + refcount_inc(&ls->ls_stid.sc_count); >>>> + ++ls->ls_fence_retry_cnt; >>>> + return true; >>>> + } >>> Incrementing after the mod_delayed_work() call is racy. Instead: >>> >>> refcount_inc(&ls->ls_stid.sc_count); >>> ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >>> if (ret) { >>> refcount_dec(&ls->ls_stid.sc_count); >>> .... >>> } >> Yes, it's racy, I thought about this. But we can not just do a >> refcount_dec here, we need to do nfs4_put_stid. But we can not >> do nfs4_put_stid here since nfsd4_layout_lm_breaker_timedout runs >> under the spin_lock flc_lock and nfs4_put_stid might causes the >> flc_lock to be acquired again from generic_delete_lease if this >> is the last reference count on stid. >> >> We might need to add the reference count when the file_lease is >> inserted on the flc_list and decrement when it's removed from the >> flc_list. I need to think about this more. > If I may, why is flc_lock being held? That seems to be the root of > several implementation problems. Jeff probably has a better answer, but AFAIK time_out_leases holds the spin lock flc_lock to prevent the flc_list from being corrupted if the underlying code removes the file_lease via kernel_setlease while it's working on the list. 
> > >>>> + return false; >>>> +} >>>> + >>>> static const struct lease_manager_operations nfsd4_layouts_lm_ops = { >>>> .lm_break = nfsd4_layout_lm_break, >>>> .lm_change = nfsd4_layout_lm_change, >>>> .lm_open_conflict = nfsd4_layout_lm_open_conflict, >>>> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, >>>> }; >>>> >>>> int >>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >>>> index 583c13b5aaf3..a57fa3318362 100644 >>>> --- a/fs/nfsd/nfs4state.c >>>> +++ b/fs/nfsd/nfs4state.c >>>> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct >>>> xdr_netobj name, >>>> #endif >>>> #ifdef CONFIG_NFSD_SCSILAYOUT >>>> xa_init(&clp->cl_dev_fences); >>>> + mutex_init(&clp->cl_fence_mutex); >>>> #endif >>>> INIT_LIST_HEAD(&clp->async_copies); >>>> spin_lock_init(&clp->async_lock); >>>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h >>>> index 713f55ef6554..57e54dfb406c 100644 >>>> --- a/fs/nfsd/state.h >>>> +++ b/fs/nfsd/state.h >>>> @@ -529,6 +529,7 @@ struct nfs4_client { >>>> time64_t cl_ra_time; >>>> #ifdef CONFIG_NFSD_SCSILAYOUT >>>> struct xarray cl_dev_fences; >>>> + struct mutex cl_fence_mutex; >>>> #endif >>>> }; >>>> >>>> @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { >>>> stateid_t ls_recall_sid; >>>> bool ls_recalled; >>>> struct mutex ls_mutex; >>>> + struct delayed_work ls_fence_work; >>> Still missing cancel_delayed_work_sync() when freeing the >>> layout stateid. >> I don't think the layout stateid can be freed while a fence >> worker was scheduled due to the added reference count to the >> stid. > Then add a comment to that effect, or if possible, some kind of > assertion, in nfsd4_free_layout_stateid(). Will do. > > >>>> + bool ls_fenced; >>>> + int ls_fence_retry_cnt; >>>> }; >>>> >>>> +#define LO_MAX_FENCE_RETRY 5 >>> The value of 5 needs some justification here in a comment. >>> Actually, 5 might be too low, but I can't really tell. 
>> At the minimal each retry happens after 1 sec, and it can be >> more depending what entry is at the front of flc_list. So if >> retry for 5 seconds (minimum) is too low then is 20 retries is >> sufficient? > As I said above: here's why we have the "hard" mount option for NFS. It > might be impossible to choose a safe maximum retry count and still > guarantee that the server and clients will eventually agree on the > device reservation state. In that case we will need to consider some > other mechanism for detecting and preventing infinite PR retry loops. > > Maybe the retry maximum could depend on the lease expiry time? The problem with using the client's lease as max retries is the client might be healthy but the NFS server is having problems fencing the block device. If we give up after max retries and mark the device as fenced then there is a possibility of data corruption if the original client continues to access the block device. I think to be on the safe side, we should retry forever while pr_warn the admin to fix the problem. > > > Just remember, I'm thinking about this and learning as I go along, > just like you are. I know I'm possibly contradicting myself here. I greatly appreciate your and Jeff's comments. We're working on the same goal to build a reliable NFS server. Thanks, -Dai ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 0:50 [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing Dai Ngo 2026-01-27 17:11 ` Chuck Lever @ 2026-01-27 17:54 ` Jeff Layton 2026-01-27 20:52 ` Dai Ngo 2026-01-28 17:25 ` Dai Ngo 1 sibling, 2 replies; 12+ messages in thread From: Jeff Layton @ 2026-01-27 17:54 UTC (permalink / raw) To: Dai Ngo, chuck.lever, neil, okorniev, tom, hch, alex.aring, viro, brauner, jack Cc: linux-fsdevel, linux-nfs On Mon, 2026-01-26 at 16:50 -0800, Dai Ngo wrote: > When a layout conflict triggers a recall, enforcing a timeout is > necessary to prevent excessive nfsd threads from being blocked in > __break_lease ensuring the server continues servicing incoming > requests efficiently. > > This patch introduces a new function to lease_manager_operations: > > lm_breaker_timedout: Invoked when a lease recall times out and is > about to be disposed of. This function enables the lease manager > to inform the caller whether the file_lease should remain on the > flc_list or be disposed of. > > For the NFSD lease manager, this function now handles layout recall > timeouts. If the layout type supports fencing and the client has not > been fenced, a fence operation is triggered to prevent the client > from accessing the block device. > > While the fencing operation is in progress, the conflicting file_lease > remains on the flc_list until fencing is complete. This guarantees > that no other clients can access the file, and the client with > exclusive access is properly blocked before disposal. 
> > Signed-off-by: Dai Ngo <dai.ngo@oracle.com> > --- > Documentation/filesystems/locking.rst | 2 + > fs/locks.c | 10 +++- > fs/nfsd/blocklayout.c | 38 ++++++------- > fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- > fs/nfsd/nfs4state.c | 1 + > fs/nfsd/state.h | 6 ++ > include/linux/filelock.h | 1 + > 7 files changed, 110 insertions(+), 27 deletions(-) > > v2: > . Update Subject line to include fencing operation. > . Allow conflicting lease to remain on flc_list until fencing > is complete. > . Use system worker to perform fencing operation asynchronously. > . Use nfs4_stid.sc_count to ensure layout stateid remains > valid before starting the fencing operation, nfs4_stid.sc_count > is released after fencing operation is complete. > . Rework nfsd4_scsi_fence_client to: > . wait until fencing to complete before exiting. > . wait until fencing in progress to complete before > checking the NFSD_MDS_PR_FENCED flag. > . Remove lm_need_to_retry from lease_manager_operations. > v3: > . correct locking requirement in locking.rst. > . add max retry count to fencing operation. > . add missing nfs4_put_stid in nfsd4_layout_fence_worker. > . remove special-casing of FL_LAYOUT in lease_modify. > . remove lease_want_dispose. > . move lm_breaker_timedout call to time_out_leases. > v4: > . only increment ls_fence_retry_cnt after successfully > schedule new work in nfsd4_layout_lm_breaker_timedout. 
> > diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst > index 04c7691e50e0..a339491f02e4 100644 > --- a/Documentation/filesystems/locking.rst > +++ b/Documentation/filesystems/locking.rst > @@ -403,6 +403,7 @@ prototypes:: > bool (*lm_breaker_owns_lease)(struct file_lock *); > bool (*lm_lock_expirable)(struct file_lock *); > void (*lm_expire_lock)(void); > + void (*lm_breaker_timedout)(struct file_lease *); > > locking rules: > > @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no > lm_lock_expirable yes no no > lm_expire_lock no no yes > lm_open_conflict yes no no > +lm_breaker_timedout yes no no > ====================== ============= ================= ========= > > buffer_head > diff --git a/fs/locks.c b/fs/locks.c > index 46f229f740c8..1b63aa704598 100644 > --- a/fs/locks.c > +++ b/fs/locks.c > @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) > { > struct file_lock_context *ctx = inode->i_flctx; > struct file_lease *fl, *tmp; > + bool remove = true; > > lockdep_assert_held(&ctx->flc_lock); > > @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) > trace_time_out_leases(inode, fl); > if (past_time(fl->fl_downgrade_time)) > lease_modify(fl, F_RDLCK, dispose); > - if (past_time(fl->fl_break_time)) > - lease_modify(fl, F_UNLCK, dispose); > + > + if (past_time(fl->fl_break_time)) { > + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) > + remove = fl->fl_lmops->lm_breaker_timedout(fl); > + if (remove) > + lease_modify(fl, F_UNLCK, dispose); I'd not bother with the return code to lm_breaker_timedout. Make it void return and have it call lease_modify itself before returning in the cases where you have it returning true. If the operation isn't defined then just do the lease_modify here like we always have. 
> + } > } > } > > diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c > index 7ba9e2dd0875..69d3889df302 100644 > --- a/fs/nfsd/blocklayout.c > +++ b/fs/nfsd/blocklayout.c > @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, > return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); > } > > +/* > + * Perform the fence operation to prevent the client from accessing the > + * block device. If a fence operation is already in progress, wait for > + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the > + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, > + * update the layout stateid by setting the ls_fenced flag to indicate > + * that the client has been fenced. > + */ > static void > nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) > { > @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) > struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; > int status; > > - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) > + mutex_lock(&clp->cl_fence_mutex); > + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { > + ls->ls_fenced = true; > + mutex_unlock(&clp->cl_fence_mutex); > + nfs4_put_stid(&ls->ls_stid); > return; > + } > I don't understand what this new mutex is protecting, and this all seems overly complex. If feels kind of like you want nfsd to be driving the fencing retries, but I don't think we really do. Here's what I'd do. I'd just make ->fence_client a bool or int return, and have it indicate whether the client was successfully fenced or not. If it was successfully fenced, then have the caller call lease_modify() to remove the lease. If it wasn't successfully fenced, have the caller (the workqueue job) requeue itself if you want to retry. If the caller is ready to give up, then call lease_modify() on it and remove it (and probably throw a pr_warn()). 
> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, > nfsd4_scsi_pr_key(clp), > PR_EXCLUSIVE_ACCESS_REG_ONLY, true); > - /* > - * Reset to allow retry only when the command could not have > - * reached the device. Negative status means a local error > - * (e.g., -ENOMEM) prevented the command from being sent. > - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and > - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures > - * before device delivery. > - * > - * For all other errors, the command may have reached the device > - * and the preempt may have succeeded. Avoid resetting, since > - * retrying a successful preempt returns PR_STS_IOERR or > - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite > - * retry loop. > - */ > - if (status < 0 || > - status == PR_STS_PATH_FAILED || > - status == PR_STS_PATH_FAST_FAILED || > - status == PR_STS_RETRY_PATH_FAILURE) > + if (status) > nfsd4_scsi_fence_clear(clp, bdev->bd_dev); > + else > + ls->ls_fenced = true; > + mutex_unlock(&clp->cl_fence_mutex); > + nfs4_put_stid(&ls->ls_stid); > > trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); > } > diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c > index ad7af8cfcf1f..1c498f3cd059 100644 > --- a/fs/nfsd/nfs4layouts.c > +++ b/fs/nfsd/nfs4layouts.c > @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) > return 0; > } > > +static void > +nfsd4_layout_fence_worker(struct work_struct *work) > +{ > + struct nfsd_file *nf; > + struct delayed_work *dwork = to_delayed_work(work); > + struct nfs4_layout_stateid *ls = container_of(dwork, > + struct nfs4_layout_stateid, ls_fence_work); > + u32 type; > + > + rcu_read_lock(); > + nf = nfsd_file_get(ls->ls_file); > + rcu_read_unlock(); > + if (!nf) { > + nfs4_put_stid(&ls->ls_stid); > + return; > + } > + > + type = ls->ls_layout_type; > + if (nfsd4_layout_ops[type]->fence_client) > + nfsd4_layout_ops[type]->fence_client(ls, nf); If you make fence_client an 
int/bool return, then you could just requeue this job to try it again. > + nfsd_file_put(nf); > +} > + > static struct nfs4_layout_stateid * > nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, > struct nfs4_stid *parent, u32 layout_type) > @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, > list_add(&ls->ls_perfile, &fp->fi_lo_states); > spin_unlock(&fp->fi_lock); > > + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); > + ls->ls_fenced = false; > + ls->ls_fence_retry_cnt = 0; > + > trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); > return ls; > } > @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) > rcu_read_unlock(); > if (fl) { > ops = nfsd4_layout_ops[ls->ls_layout_type]; > - if (ops->fence_client) > + if (ops->fence_client) { > + refcount_inc(&ls->ls_stid.sc_count); > ops->fence_client(ls, fl); > - else > + } else > nfsd4_cb_layout_fail(ls, fl); > nfsd_file_put(fl); > } > @@ -747,11 +775,9 @@ static bool > nfsd4_layout_lm_break(struct file_lease *fl) > { > /* > - * We don't want the locks code to timeout the lease for us; > - * we'll remove it ourself if a layout isn't returned > - * in time: > + * Enforce break lease timeout to prevent NFSD > + * thread from hanging in __break_lease. > */ > - fl->fl_break_time = 0; > nfsd4_recall_file_layout(fl->c.flc_owner); > return false; > } > @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg) > return 0; > } > > +/** > + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. > + * If the layout type supports a fence operation, schedule a worker to > + * fence the client from accessing the block device. > + * > + * @fl: file to check > + * > + * Return true if the file lease should be disposed of by the caller; > + * otherwise, return false. 
> + */ > +static bool > +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) > +{ > + struct nfs4_layout_stateid *ls = fl->c.flc_owner; > + bool ret; > + > + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) > + return true; > + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) > + return true; > + > + if (work_busy(&ls->ls_fence_work.work)) > + return false; > + /* Schedule work to do the fence operation */ > + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); > + if (!ret) { > + /* > + * If there is no pending work, mod_delayed_work queues > + * new task. While fencing is in progress, a reference > + * count is added to the layout stateid to ensure its > + * validity. This reference count is released once fencing > + * has been completed. > + */ > + refcount_inc(&ls->ls_stid.sc_count); > + ++ls->ls_fence_retry_cnt; > + return true; The cases where the fencing didn't work after too many retries, or the job couldn't be queued should probably get a pr_warn or something. The admin needs to know that data corruption is possible and that they might need to nuke the client manually. 
> + } > + return false; > +} > + > static const struct lease_manager_operations nfsd4_layouts_lm_ops = { > .lm_break = nfsd4_layout_lm_break, > .lm_change = nfsd4_layout_lm_change, > .lm_open_conflict = nfsd4_layout_lm_open_conflict, > + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, > }; > > int > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > index 583c13b5aaf3..a57fa3318362 100644 > --- a/fs/nfsd/nfs4state.c > +++ b/fs/nfsd/nfs4state.c > @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, > #endif > #ifdef CONFIG_NFSD_SCSILAYOUT > xa_init(&clp->cl_dev_fences); > + mutex_init(&clp->cl_fence_mutex); > #endif > INIT_LIST_HEAD(&clp->async_copies); > spin_lock_init(&clp->async_lock); > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > index 713f55ef6554..57e54dfb406c 100644 > --- a/fs/nfsd/state.h > +++ b/fs/nfsd/state.h > @@ -529,6 +529,7 @@ struct nfs4_client { > time64_t cl_ra_time; > #ifdef CONFIG_NFSD_SCSILAYOUT > struct xarray cl_dev_fences; > + struct mutex cl_fence_mutex; > #endif > }; > > @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { > stateid_t ls_recall_sid; > bool ls_recalled; > struct mutex ls_mutex; > + struct delayed_work ls_fence_work; > + bool ls_fenced; > + int ls_fence_retry_cnt; > }; > > +#define LO_MAX_FENCE_RETRY 5 > + > static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) > { > return container_of(s, struct nfs4_layout_stateid, ls_stid); > diff --git a/include/linux/filelock.h b/include/linux/filelock.h > index 2f5e5588ee07..13b9c9f04589 100644 > --- a/include/linux/filelock.h > +++ b/include/linux/filelock.h > @@ -50,6 +50,7 @@ struct lease_manager_operations { > void (*lm_setup)(struct file_lease *, void **); > bool (*lm_breaker_owns_lease)(struct file_lease *); > int (*lm_open_conflict)(struct file *, int); > + bool (*lm_breaker_timedout)(struct file_lease *fl); > }; > > struct lock_manager { -- Jeff Layton <jlayton@kernel.org> ^ permalink raw reply 
[flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 17:54 ` Jeff Layton @ 2026-01-27 20:52 ` Dai Ngo 2026-01-28 17:25 ` Dai Ngo 1 sibling, 0 replies; 12+ messages in thread From: Dai Ngo @ 2026-01-27 20:52 UTC (permalink / raw) To: Jeff Layton, chuck.lever, neil, okorniev, tom, hch, alex.aring, viro, brauner, jack Cc: linux-fsdevel, linux-nfs On 1/27/26 9:54 AM, Jeff Layton wrote: > On Mon, 2026-01-26 at 16:50 -0800, Dai Ngo wrote: >> When a layout conflict triggers a recall, enforcing a timeout is >> necessary to prevent excessive nfsd threads from being blocked in >> __break_lease ensuring the server continues servicing incoming >> requests efficiently. >> >> This patch introduces a new function to lease_manager_operations: >> >> lm_breaker_timedout: Invoked when a lease recall times out and is >> about to be disposed of. This function enables the lease manager >> to inform the caller whether the file_lease should remain on the >> flc_list or be disposed of. >> >> For the NFSD lease manager, this function now handles layout recall >> timeouts. If the layout type supports fencing and the client has not >> been fenced, a fence operation is triggered to prevent the client >> from accessing the block device. >> >> While the fencing operation is in progress, the conflicting file_lease >> remains on the flc_list until fencing is complete. This guarantees >> that no other clients can access the file, and the client with >> exclusive access is properly blocked before disposal. >> >> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >> --- >> Documentation/filesystems/locking.rst | 2 + >> fs/locks.c | 10 +++- >> fs/nfsd/blocklayout.c | 38 ++++++------- >> fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- >> fs/nfsd/nfs4state.c | 1 + >> fs/nfsd/state.h | 6 ++ >> include/linux/filelock.h | 1 + >> 7 files changed, 110 insertions(+), 27 deletions(-) >> >> v2: >> . Update Subject line to include fencing operation. >> . 
Allow conflicting lease to remain on flc_list until fencing >> is complete. >> . Use system worker to perform fencing operation asynchronously. >> . Use nfs4_stid.sc_count to ensure layout stateid remains >> valid before starting the fencing operation, nfs4_stid.sc_count >> is released after fencing operation is complete. >> . Rework nfsd4_scsi_fence_client to: >> . wait until fencing to complete before exiting. >> . wait until fencing in progress to complete before >> checking the NFSD_MDS_PR_FENCED flag. >> . Remove lm_need_to_retry from lease_manager_operations. >> v3: >> . correct locking requirement in locking.rst. >> . add max retry count to fencing operation. >> . add missing nfs4_put_stid in nfsd4_layout_fence_worker. >> . remove special-casing of FL_LAYOUT in lease_modify. >> . remove lease_want_dispose. >> . move lm_breaker_timedout call to time_out_leases. >> v4: >> . only increment ls_fence_retry_cnt after successfully >> schedule new work in nfsd4_layout_lm_breaker_timedout. 
>> >> diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst >> index 04c7691e50e0..a339491f02e4 100644 >> --- a/Documentation/filesystems/locking.rst >> +++ b/Documentation/filesystems/locking.rst >> @@ -403,6 +403,7 @@ prototypes:: >> bool (*lm_breaker_owns_lease)(struct file_lock *); >> bool (*lm_lock_expirable)(struct file_lock *); >> void (*lm_expire_lock)(void); >> + void (*lm_breaker_timedout)(struct file_lease *); >> >> locking rules: >> >> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no >> lm_lock_expirable yes no no >> lm_expire_lock no no yes >> lm_open_conflict yes no no >> +lm_breaker_timedout yes no no >> ====================== ============= ================= ========= >> >> buffer_head >> diff --git a/fs/locks.c b/fs/locks.c >> index 46f229f740c8..1b63aa704598 100644 >> --- a/fs/locks.c >> +++ b/fs/locks.c >> @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) >> { >> struct file_lock_context *ctx = inode->i_flctx; >> struct file_lease *fl, *tmp; >> + bool remove = true; >> >> lockdep_assert_held(&ctx->flc_lock); >> >> @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) >> trace_time_out_leases(inode, fl); >> if (past_time(fl->fl_downgrade_time)) >> lease_modify(fl, F_RDLCK, dispose); >> - if (past_time(fl->fl_break_time)) >> - lease_modify(fl, F_UNLCK, dispose); >> + >> + if (past_time(fl->fl_break_time)) { >> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) >> + remove = fl->fl_lmops->lm_breaker_timedout(fl); >> + if (remove) >> + lease_modify(fl, F_UNLCK, dispose); > I'd not bother with the return code to lm_breaker_timedout. > > Make it void return and have it call lease_modify itself before > returning in the cases where you have it returning true. If the > operation isn't defined then just do the lease_modify here like we > always have. Ok, I will try this approach. 
> >> + } >> } >> } >> >> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c >> index 7ba9e2dd0875..69d3889df302 100644 >> --- a/fs/nfsd/blocklayout.c >> +++ b/fs/nfsd/blocklayout.c >> @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, >> return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); >> } >> >> +/* >> + * Perform the fence operation to prevent the client from accessing the >> + * block device. If a fence operation is already in progress, wait for >> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the >> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, >> + * update the layout stateid by setting the ls_fenced flag to indicate >> + * that the client has been fenced. >> + */ >> static void >> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) >> { >> @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) >> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; >> int status; >> >> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) >> + mutex_lock(&clp->cl_fence_mutex); >> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { >> + ls->ls_fenced = true; >> + mutex_unlock(&clp->cl_fence_mutex); >> + nfs4_put_stid(&ls->ls_stid); >> return; >> + } >> > I don't understand what this new mutex is protecting, The purpose of this mutex is to ensure that once this function returns, either the client was fenced or not fence and not fencing in progress (some one is in the process of doing it). > and this all > seems overly complex. If feels kind of like you want nfsd to be driving > the fencing retries, but I don't think we really do. Here's what I'd > do. > > I'd just make ->fence_client a bool or int return, and have it indicate > whether the client was successfully fenced or not. If it was > successfully fenced, then have the caller call lease_modify() to remove > the lease. 
If it wasn't successfully fenced, have the caller (the > workqueue job) requeue itself if you want to retry. If the caller is > ready to give up, then call lease_modify() on it and remove it (and > probably throw a pr_warn()). Ok, I will try this approach: have the fence worker doing the retry instead of nfsd so that the fence worker is only scheduled once to do the fencing. > >> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, >> nfsd4_scsi_pr_key(clp), >> PR_EXCLUSIVE_ACCESS_REG_ONLY, true); >> - /* >> - * Reset to allow retry only when the command could not have >> - * reached the device. Negative status means a local error >> - * (e.g., -ENOMEM) prevented the command from being sent. >> - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and >> - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures >> - * before device delivery. >> - * >> - * For all other errors, the command may have reached the device >> - * and the preempt may have succeeded. Avoid resetting, since >> - * retrying a successful preempt returns PR_STS_IOERR or >> - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite >> - * retry loop. 
>> - */ >> - if (status < 0 || >> - status == PR_STS_PATH_FAILED || >> - status == PR_STS_PATH_FAST_FAILED || >> - status == PR_STS_RETRY_PATH_FAILURE) >> + if (status) >> nfsd4_scsi_fence_clear(clp, bdev->bd_dev); >> + else >> + ls->ls_fenced = true; >> + mutex_unlock(&clp->cl_fence_mutex); >> + nfs4_put_stid(&ls->ls_stid); >> >> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); >> } >> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c >> index ad7af8cfcf1f..1c498f3cd059 100644 >> --- a/fs/nfsd/nfs4layouts.c >> +++ b/fs/nfsd/nfs4layouts.c >> @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) >> return 0; >> } >> >> +static void >> +nfsd4_layout_fence_worker(struct work_struct *work) >> +{ >> + struct nfsd_file *nf; >> + struct delayed_work *dwork = to_delayed_work(work); >> + struct nfs4_layout_stateid *ls = container_of(dwork, >> + struct nfs4_layout_stateid, ls_fence_work); >> + u32 type; >> + >> + rcu_read_lock(); >> + nf = nfsd_file_get(ls->ls_file); >> + rcu_read_unlock(); >> + if (!nf) { >> + nfs4_put_stid(&ls->ls_stid); >> + return; >> + } >> + >> + type = ls->ls_layout_type; >> + if (nfsd4_layout_ops[type]->fence_client) >> + nfsd4_layout_ops[type]->fence_client(ls, nf); > If you make fence_client an int/bool return, then you could just > requeue this job to try it again. Yes. 
> >> + nfsd_file_put(nf); >> +} >> + >> static struct nfs4_layout_stateid * >> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >> struct nfs4_stid *parent, u32 layout_type) >> @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >> list_add(&ls->ls_perfile, &fp->fi_lo_states); >> spin_unlock(&fp->fi_lock); >> >> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); >> + ls->ls_fenced = false; >> + ls->ls_fence_retry_cnt = 0; >> + >> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); >> return ls; >> } >> @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) >> rcu_read_unlock(); >> if (fl) { >> ops = nfsd4_layout_ops[ls->ls_layout_type]; >> - if (ops->fence_client) >> + if (ops->fence_client) { >> + refcount_inc(&ls->ls_stid.sc_count); >> ops->fence_client(ls, fl); >> - else >> + } else >> nfsd4_cb_layout_fail(ls, fl); >> nfsd_file_put(fl); >> } >> @@ -747,11 +775,9 @@ static bool >> nfsd4_layout_lm_break(struct file_lease *fl) >> { >> /* >> - * We don't want the locks code to timeout the lease for us; >> - * we'll remove it ourself if a layout isn't returned >> - * in time: >> + * Enforce break lease timeout to prevent NFSD >> + * thread from hanging in __break_lease. >> */ >> - fl->fl_break_time = 0; >> nfsd4_recall_file_layout(fl->c.flc_owner); >> return false; >> } >> @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg) >> return 0; >> } >> >> +/** >> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. >> + * If the layout type supports a fence operation, schedule a worker to >> + * fence the client from accessing the block device. >> + * >> + * @fl: file to check >> + * >> + * Return true if the file lease should be disposed of by the caller; >> + * otherwise, return false. 
>> + */ >> +static bool >> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) >> +{ >> + struct nfs4_layout_stateid *ls = fl->c.flc_owner; >> + bool ret; >> + >> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) >> + return true; >> + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) >> + return true; >> + >> + if (work_busy(&ls->ls_fence_work.work)) >> + return false; >> + /* Schedule work to do the fence operation */ >> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >> + if (!ret) { >> + /* >> + * If there is no pending work, mod_delayed_work queues >> + * new task. While fencing is in progress, a reference >> + * count is added to the layout stateid to ensure its >> + * validity. This reference count is released once fencing >> + * has been completed. >> + */ >> + refcount_inc(&ls->ls_stid.sc_count); >> + ++ls->ls_fence_retry_cnt; >> + return true; > The cases where the fencing didn't work after too many retries, or the > job couldn't be queued should probably get a pr_warn or something. The > admin needs to know that data corruption is possible and that they > might need to nuke the client manually. Yes, will do. 
Thanks, -Dai > >> + } >> + return false; >> +} >> + >> static const struct lease_manager_operations nfsd4_layouts_lm_ops = { >> .lm_break = nfsd4_layout_lm_break, >> .lm_change = nfsd4_layout_lm_change, >> .lm_open_conflict = nfsd4_layout_lm_open_conflict, >> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, >> }; >> >> int >> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >> index 583c13b5aaf3..a57fa3318362 100644 >> --- a/fs/nfsd/nfs4state.c >> +++ b/fs/nfsd/nfs4state.c >> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, >> #endif >> #ifdef CONFIG_NFSD_SCSILAYOUT >> xa_init(&clp->cl_dev_fences); >> + mutex_init(&clp->cl_fence_mutex); >> #endif >> INIT_LIST_HEAD(&clp->async_copies); >> spin_lock_init(&clp->async_lock); >> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h >> index 713f55ef6554..57e54dfb406c 100644 >> --- a/fs/nfsd/state.h >> +++ b/fs/nfsd/state.h >> @@ -529,6 +529,7 @@ struct nfs4_client { >> time64_t cl_ra_time; >> #ifdef CONFIG_NFSD_SCSILAYOUT >> struct xarray cl_dev_fences; >> + struct mutex cl_fence_mutex; >> #endif >> }; >> >> @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { >> stateid_t ls_recall_sid; >> bool ls_recalled; >> struct mutex ls_mutex; >> + struct delayed_work ls_fence_work; >> + bool ls_fenced; >> + int ls_fence_retry_cnt; >> }; >> >> +#define LO_MAX_FENCE_RETRY 5 >> + >> static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) >> { >> return container_of(s, struct nfs4_layout_stateid, ls_stid); >> diff --git a/include/linux/filelock.h b/include/linux/filelock.h >> index 2f5e5588ee07..13b9c9f04589 100644 >> --- a/include/linux/filelock.h >> +++ b/include/linux/filelock.h >> @@ -50,6 +50,7 @@ struct lease_manager_operations { >> void (*lm_setup)(struct file_lease *, void **); >> bool (*lm_breaker_owns_lease)(struct file_lease *); >> int (*lm_open_conflict)(struct file *, int); >> + bool (*lm_breaker_timedout)(struct file_lease *fl); >> }; >> >> struct 
lock_manager { ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-27 17:54 ` Jeff Layton 2026-01-27 20:52 ` Dai Ngo @ 2026-01-28 17:25 ` Dai Ngo 2026-01-28 18:01 ` Jeff Layton 1 sibling, 1 reply; 12+ messages in thread From: Dai Ngo @ 2026-01-28 17:25 UTC (permalink / raw) To: Jeff Layton, chuck.lever, neil, okorniev, tom, hch, alex.aring, viro, brauner, jack Cc: linux-fsdevel, linux-nfs On 1/27/26 9:54 AM, Jeff Layton wrote: > On Mon, 2026-01-26 at 16:50 -0800, Dai Ngo wrote: >> When a layout conflict triggers a recall, enforcing a timeout is >> necessary to prevent excessive nfsd threads from being blocked in >> __break_lease ensuring the server continues servicing incoming >> requests efficiently. >> >> This patch introduces a new function to lease_manager_operations: >> >> lm_breaker_timedout: Invoked when a lease recall times out and is >> about to be disposed of. This function enables the lease manager >> to inform the caller whether the file_lease should remain on the >> flc_list or be disposed of. >> >> For the NFSD lease manager, this function now handles layout recall >> timeouts. If the layout type supports fencing and the client has not >> been fenced, a fence operation is triggered to prevent the client >> from accessing the block device. >> >> While the fencing operation is in progress, the conflicting file_lease >> remains on the flc_list until fencing is complete. This guarantees >> that no other clients can access the file, and the client with >> exclusive access is properly blocked before disposal. >> >> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >> --- >> Documentation/filesystems/locking.rst | 2 + >> fs/locks.c | 10 +++- >> fs/nfsd/blocklayout.c | 38 ++++++------- >> fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- >> fs/nfsd/nfs4state.c | 1 + >> fs/nfsd/state.h | 6 ++ >> include/linux/filelock.h | 1 + >> 7 files changed, 110 insertions(+), 27 deletions(-) >> >> v2: >> . 
Update Subject line to include fencing operation. >> . Allow conflicting lease to remain on flc_list until fencing >> is complete. >> . Use system worker to perform fencing operation asynchronously. >> . Use nfs4_stid.sc_count to ensure layout stateid remains >> valid before starting the fencing operation, nfs4_stid.sc_count >> is released after fencing operation is complete. >> . Rework nfsd4_scsi_fence_client to: >> . wait until fencing to complete before exiting. >> . wait until fencing in progress to complete before >> checking the NFSD_MDS_PR_FENCED flag. >> . Remove lm_need_to_retry from lease_manager_operations. >> v3: >> . correct locking requirement in locking.rst. >> . add max retry count to fencing operation. >> . add missing nfs4_put_stid in nfsd4_layout_fence_worker. >> . remove special-casing of FL_LAYOUT in lease_modify. >> . remove lease_want_dispose. >> . move lm_breaker_timedout call to time_out_leases. >> v4: >> . only increment ls_fence_retry_cnt after successfully >> schedule new work in nfsd4_layout_lm_breaker_timedout. 
>> >> diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst >> index 04c7691e50e0..a339491f02e4 100644 >> --- a/Documentation/filesystems/locking.rst >> +++ b/Documentation/filesystems/locking.rst >> @@ -403,6 +403,7 @@ prototypes:: >> bool (*lm_breaker_owns_lease)(struct file_lock *); >> bool (*lm_lock_expirable)(struct file_lock *); >> void (*lm_expire_lock)(void); >> + void (*lm_breaker_timedout)(struct file_lease *); >> >> locking rules: >> >> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no >> lm_lock_expirable yes no no >> lm_expire_lock no no yes >> lm_open_conflict yes no no >> +lm_breaker_timedout yes no no >> ====================== ============= ================= ========= >> >> buffer_head >> diff --git a/fs/locks.c b/fs/locks.c >> index 46f229f740c8..1b63aa704598 100644 >> --- a/fs/locks.c >> +++ b/fs/locks.c >> @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) >> { >> struct file_lock_context *ctx = inode->i_flctx; >> struct file_lease *fl, *tmp; >> + bool remove = true; >> >> lockdep_assert_held(&ctx->flc_lock); >> >> @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) >> trace_time_out_leases(inode, fl); >> if (past_time(fl->fl_downgrade_time)) >> lease_modify(fl, F_RDLCK, dispose); >> - if (past_time(fl->fl_break_time)) >> - lease_modify(fl, F_UNLCK, dispose); >> + >> + if (past_time(fl->fl_break_time)) { >> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) >> + remove = fl->fl_lmops->lm_breaker_timedout(fl); >> + if (remove) >> + lease_modify(fl, F_UNLCK, dispose); > I'd not bother with the return code to lm_breaker_timedout. > > Make it void return and have it call lease_modify itself before > returning in the cases where you have it returning true. If the > operation isn't defined then just do the lease_modify here like we > always have. 
The first call to lm_breaker_timedout schedules the fence worker so fencing is not done yet. If lm_breaker_timedout returns to time_out_leases without any return value then lease_modify is called to remove the lease. Am I missing something? -Dai > >> + } >> } >> } >> >> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c >> index 7ba9e2dd0875..69d3889df302 100644 >> --- a/fs/nfsd/blocklayout.c >> +++ b/fs/nfsd/blocklayout.c >> @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, >> return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); >> } >> >> +/* >> + * Perform the fence operation to prevent the client from accessing the >> + * block device. If a fence operation is already in progress, wait for >> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the >> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, >> + * update the layout stateid by setting the ls_fenced flag to indicate >> + * that the client has been fenced. >> + */ >> static void >> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) >> { >> @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) >> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; >> int status; >> >> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) >> + mutex_lock(&clp->cl_fence_mutex); >> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { >> + ls->ls_fenced = true; >> + mutex_unlock(&clp->cl_fence_mutex); >> + nfs4_put_stid(&ls->ls_stid); >> return; >> + } >> > I don't understand what this new mutex is protecting, and this all > seems overly complex. If feels kind of like you want nfsd to be driving > the fencing retries, but I don't think we really do. Here's what I'd > do. > > I'd just make ->fence_client a bool or int return, and have it indicate > whether the client was successfully fenced or not. 
If it was > successfully fenced, then have the caller call lease_modify() to remove > the lease. If it wasn't successfully fenced, have the caller (the > workqueue job) requeue itself if you want to retry. If the caller is > ready to give up, then call lease_modify() on it and remove it (and > probably throw a pr_warn()). > >> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, >> nfsd4_scsi_pr_key(clp), >> PR_EXCLUSIVE_ACCESS_REG_ONLY, true); >> - /* >> - * Reset to allow retry only when the command could not have >> - * reached the device. Negative status means a local error >> - * (e.g., -ENOMEM) prevented the command from being sent. >> - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and >> - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures >> - * before device delivery. >> - * >> - * For all other errors, the command may have reached the device >> - * and the preempt may have succeeded. Avoid resetting, since >> - * retrying a successful preempt returns PR_STS_IOERR or >> - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite >> - * retry loop. 
>> - */ >> - if (status < 0 || >> - status == PR_STS_PATH_FAILED || >> - status == PR_STS_PATH_FAST_FAILED || >> - status == PR_STS_RETRY_PATH_FAILURE) >> + if (status) >> nfsd4_scsi_fence_clear(clp, bdev->bd_dev); >> + else >> + ls->ls_fenced = true; >> + mutex_unlock(&clp->cl_fence_mutex); >> + nfs4_put_stid(&ls->ls_stid); >> >> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); >> } >> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c >> index ad7af8cfcf1f..1c498f3cd059 100644 >> --- a/fs/nfsd/nfs4layouts.c >> +++ b/fs/nfsd/nfs4layouts.c >> @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) >> return 0; >> } >> >> +static void >> +nfsd4_layout_fence_worker(struct work_struct *work) >> +{ >> + struct nfsd_file *nf; >> + struct delayed_work *dwork = to_delayed_work(work); >> + struct nfs4_layout_stateid *ls = container_of(dwork, >> + struct nfs4_layout_stateid, ls_fence_work); >> + u32 type; >> + >> + rcu_read_lock(); >> + nf = nfsd_file_get(ls->ls_file); >> + rcu_read_unlock(); >> + if (!nf) { >> + nfs4_put_stid(&ls->ls_stid); >> + return; >> + } >> + >> + type = ls->ls_layout_type; >> + if (nfsd4_layout_ops[type]->fence_client) >> + nfsd4_layout_ops[type]->fence_client(ls, nf); > If you make fence_client an int/bool return, then you could just > requeue this job to try it again. 
> >> + nfsd_file_put(nf); >> +} >> + >> static struct nfs4_layout_stateid * >> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >> struct nfs4_stid *parent, u32 layout_type) >> @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >> list_add(&ls->ls_perfile, &fp->fi_lo_states); >> spin_unlock(&fp->fi_lock); >> >> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); >> + ls->ls_fenced = false; >> + ls->ls_fence_retry_cnt = 0; >> + >> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); >> return ls; >> } >> @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) >> rcu_read_unlock(); >> if (fl) { >> ops = nfsd4_layout_ops[ls->ls_layout_type]; >> - if (ops->fence_client) >> + if (ops->fence_client) { >> + refcount_inc(&ls->ls_stid.sc_count); >> ops->fence_client(ls, fl); >> - else >> + } else >> nfsd4_cb_layout_fail(ls, fl); >> nfsd_file_put(fl); >> } >> @@ -747,11 +775,9 @@ static bool >> nfsd4_layout_lm_break(struct file_lease *fl) >> { >> /* >> - * We don't want the locks code to timeout the lease for us; >> - * we'll remove it ourself if a layout isn't returned >> - * in time: >> + * Enforce break lease timeout to prevent NFSD >> + * thread from hanging in __break_lease. >> */ >> - fl->fl_break_time = 0; >> nfsd4_recall_file_layout(fl->c.flc_owner); >> return false; >> } >> @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg) >> return 0; >> } >> >> +/** >> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. >> + * If the layout type supports a fence operation, schedule a worker to >> + * fence the client from accessing the block device. >> + * >> + * @fl: file to check >> + * >> + * Return true if the file lease should be disposed of by the caller; >> + * otherwise, return false. 
>> + */ >> +static bool >> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) >> +{ >> + struct nfs4_layout_stateid *ls = fl->c.flc_owner; >> + bool ret; >> + >> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) >> + return true; >> + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) >> + return true; >> + >> + if (work_busy(&ls->ls_fence_work.work)) >> + return false; >> + /* Schedule work to do the fence operation */ >> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >> + if (!ret) { >> + /* >> + * If there is no pending work, mod_delayed_work queues >> + * new task. While fencing is in progress, a reference >> + * count is added to the layout stateid to ensure its >> + * validity. This reference count is released once fencing >> + * has been completed. >> + */ >> + refcount_inc(&ls->ls_stid.sc_count); >> + ++ls->ls_fence_retry_cnt; >> + return true; > The cases where the fencing didn't work after too many retries, or the > job couldn't be queued should probably get a pr_warn or something. The > admin needs to know that data corruption is possible and that they > might need to nuke the client manually. 
> >> + } >> + return false; >> +} >> + >> static const struct lease_manager_operations nfsd4_layouts_lm_ops = { >> .lm_break = nfsd4_layout_lm_break, >> .lm_change = nfsd4_layout_lm_change, >> .lm_open_conflict = nfsd4_layout_lm_open_conflict, >> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, >> }; >> >> int >> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >> index 583c13b5aaf3..a57fa3318362 100644 >> --- a/fs/nfsd/nfs4state.c >> +++ b/fs/nfsd/nfs4state.c >> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, >> #endif >> #ifdef CONFIG_NFSD_SCSILAYOUT >> xa_init(&clp->cl_dev_fences); >> + mutex_init(&clp->cl_fence_mutex); >> #endif >> INIT_LIST_HEAD(&clp->async_copies); >> spin_lock_init(&clp->async_lock); >> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h >> index 713f55ef6554..57e54dfb406c 100644 >> --- a/fs/nfsd/state.h >> +++ b/fs/nfsd/state.h >> @@ -529,6 +529,7 @@ struct nfs4_client { >> time64_t cl_ra_time; >> #ifdef CONFIG_NFSD_SCSILAYOUT >> struct xarray cl_dev_fences; >> + struct mutex cl_fence_mutex; >> #endif >> }; >> >> @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { >> stateid_t ls_recall_sid; >> bool ls_recalled; >> struct mutex ls_mutex; >> + struct delayed_work ls_fence_work; >> + bool ls_fenced; >> + int ls_fence_retry_cnt; >> }; >> >> +#define LO_MAX_FENCE_RETRY 5 >> + >> static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) >> { >> return container_of(s, struct nfs4_layout_stateid, ls_stid); >> diff --git a/include/linux/filelock.h b/include/linux/filelock.h >> index 2f5e5588ee07..13b9c9f04589 100644 >> --- a/include/linux/filelock.h >> +++ b/include/linux/filelock.h >> @@ -50,6 +50,7 @@ struct lease_manager_operations { >> void (*lm_setup)(struct file_lease *, void **); >> bool (*lm_breaker_owns_lease)(struct file_lease *); >> int (*lm_open_conflict)(struct file *, int); >> + bool (*lm_breaker_timedout)(struct file_lease *fl); >> }; >> >> struct lock_manager { ^ 
permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-28 17:25 ` Dai Ngo @ 2026-01-28 18:01 ` Jeff Layton 2026-01-28 18:27 ` Dai Ngo 0 siblings, 1 reply; 12+ messages in thread From: Jeff Layton @ 2026-01-28 18:01 UTC (permalink / raw) To: Dai Ngo, chuck.lever, neil, okorniev, tom, hch, alex.aring, viro, brauner, jack Cc: linux-fsdevel, linux-nfs On Wed, 2026-01-28 at 09:25 -0800, Dai Ngo wrote: > On 1/27/26 9:54 AM, Jeff Layton wrote: > > On Mon, 2026-01-26 at 16:50 -0800, Dai Ngo wrote: > > > When a layout conflict triggers a recall, enforcing a timeout is > > > necessary to prevent excessive nfsd threads from being blocked in > > > __break_lease ensuring the server continues servicing incoming > > > requests efficiently. > > > > > > This patch introduces a new function to lease_manager_operations: > > > > > > lm_breaker_timedout: Invoked when a lease recall times out and is > > > about to be disposed of. This function enables the lease manager > > > to inform the caller whether the file_lease should remain on the > > > flc_list or be disposed of. > > > > > > For the NFSD lease manager, this function now handles layout recall > > > timeouts. If the layout type supports fencing and the client has not > > > been fenced, a fence operation is triggered to prevent the client > > > from accessing the block device. > > > > > > While the fencing operation is in progress, the conflicting file_lease > > > remains on the flc_list until fencing is complete. This guarantees > > > that no other clients can access the file, and the client with > > > exclusive access is properly blocked before disposal. 
> > > > > > Signed-off-by: Dai Ngo <dai.ngo@oracle.com> > > > --- > > > Documentation/filesystems/locking.rst | 2 + > > > fs/locks.c | 10 +++- > > > fs/nfsd/blocklayout.c | 38 ++++++------- > > > fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- > > > fs/nfsd/nfs4state.c | 1 + > > > fs/nfsd/state.h | 6 ++ > > > include/linux/filelock.h | 1 + > > > 7 files changed, 110 insertions(+), 27 deletions(-) > > > > > > v2: > > > . Update Subject line to include fencing operation. > > > . Allow conflicting lease to remain on flc_list until fencing > > > is complete. > > > . Use system worker to perform fencing operation asynchronously. > > > . Use nfs4_stid.sc_count to ensure layout stateid remains > > > valid before starting the fencing operation, nfs4_stid.sc_count > > > is released after fencing operation is complete. > > > . Rework nfsd4_scsi_fence_client to: > > > . wait until fencing to complete before exiting. > > > . wait until fencing in progress to complete before > > > checking the NFSD_MDS_PR_FENCED flag. > > > . Remove lm_need_to_retry from lease_manager_operations. > > > v3: > > > . correct locking requirement in locking.rst. > > > . add max retry count to fencing operation. > > > . add missing nfs4_put_stid in nfsd4_layout_fence_worker. > > > . remove special-casing of FL_LAYOUT in lease_modify. > > > . remove lease_want_dispose. > > > . move lm_breaker_timedout call to time_out_leases. > > > v4: > > > . only increment ls_fence_retry_cnt after successfully > > > schedule new work in nfsd4_layout_lm_breaker_timedout. 
> > > > > > diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst > > > index 04c7691e50e0..a339491f02e4 100644 > > > --- a/Documentation/filesystems/locking.rst > > > +++ b/Documentation/filesystems/locking.rst > > > @@ -403,6 +403,7 @@ prototypes:: > > > bool (*lm_breaker_owns_lease)(struct file_lock *); > > > bool (*lm_lock_expirable)(struct file_lock *); > > > void (*lm_expire_lock)(void); > > > + void (*lm_breaker_timedout)(struct file_lease *); > > > > > > locking rules: > > > > > > @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no > > > lm_lock_expirable yes no no > > > lm_expire_lock no no yes > > > lm_open_conflict yes no no > > > +lm_breaker_timedout yes no no > > > ====================== ============= ================= ========= > > > > > > buffer_head > > > diff --git a/fs/locks.c b/fs/locks.c > > > index 46f229f740c8..1b63aa704598 100644 > > > --- a/fs/locks.c > > > +++ b/fs/locks.c > > > @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) > > > { > > > struct file_lock_context *ctx = inode->i_flctx; > > > struct file_lease *fl, *tmp; > > > + bool remove = true; > > > > > > lockdep_assert_held(&ctx->flc_lock); > > > > > > @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) > > > trace_time_out_leases(inode, fl); > > > if (past_time(fl->fl_downgrade_time)) > > > lease_modify(fl, F_RDLCK, dispose); > > > - if (past_time(fl->fl_break_time)) > > > - lease_modify(fl, F_UNLCK, dispose); > > > + > > > + if (past_time(fl->fl_break_time)) { > > > + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) > > > + remove = fl->fl_lmops->lm_breaker_timedout(fl); > > > + if (remove) > > > + lease_modify(fl, F_UNLCK, dispose); > > I'd not bother with the return code to lm_breaker_timedout. > > > > Make it void return and have it call lease_modify itself before > > returning in the cases where you have it returning true. 
If the > > operation isn't defined then just do the lease_modify here like we > > always have. > > The first call to lm_breaker_timedout schedules the fence worker so > fencing is not done yet. If lm_breaker_timedout return to time_out_leases > without any return value then lease_modify is called to remove the lease. > > Am i missing something? > > -Dai > if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) fl->fl_lmops->lm_breaker_timedout(fl); else lease_modify(fl, F_UNLCK, dispose); ... the lm_breaker_timedout() function (or the workqueue job it spawns) is then responsible for calling lease_modify() at the appropriate time. In most cases you're going to need to queue a workqueue job to fence, and you don't want to clear out the lease until after it's done. > > > > > + } > > > } > > > } > > > > > > diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c > > > index 7ba9e2dd0875..69d3889df302 100644 > > > --- a/fs/nfsd/blocklayout.c > > > +++ b/fs/nfsd/blocklayout.c > > > @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, > > > return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); > > > } > > > > > > +/* > > > + * Perform the fence operation to prevent the client from accessing the > > > + * block device. If a fence operation is already in progress, wait for > > > + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the > > > + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, > > > + * update the layout stateid by setting the ls_fenced flag to indicate > > > + * that the client has been fenced. 
> > > + */ > > > static void > > > nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) > > > { > > > @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) > > > struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; > > > int status; > > > > > > - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) > > > + mutex_lock(&clp->cl_fence_mutex); > > > + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { > > > + ls->ls_fenced = true; > > > + mutex_unlock(&clp->cl_fence_mutex); > > > + nfs4_put_stid(&ls->ls_stid); > > > return; > > > + } > > > > > I don't understand what this new mutex is protecting, and this all > > seems overly complex. If feels kind of like you want nfsd to be driving > > the fencing retries, but I don't think we really do. Here's what I'd > > do. > > > > I'd just make ->fence_client a bool or int return, and have it indicate > > whether the client was successfully fenced or not. If it was > > successfully fenced, then have the caller call lease_modify() to remove > > the lease. If it wasn't successfully fenced, have the caller (the > > workqueue job) requeue itself if you want to retry. If the caller is > > ready to give up, then call lease_modify() on it and remove it (and > > probably throw a pr_warn()). > > > > > status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, > > > nfsd4_scsi_pr_key(clp), > > > PR_EXCLUSIVE_ACCESS_REG_ONLY, true); > > > - /* > > > - * Reset to allow retry only when the command could not have > > > - * reached the device. Negative status means a local error > > > - * (e.g., -ENOMEM) prevented the command from being sent. > > > - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and > > > - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures > > > - * before device delivery. > > > - * > > > - * For all other errors, the command may have reached the device > > > - * and the preempt may have succeeded. 
Avoid resetting, since > > > - * retrying a successful preempt returns PR_STS_IOERR or > > > - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite > > > - * retry loop. > > > - */ > > > - if (status < 0 || > > > - status == PR_STS_PATH_FAILED || > > > - status == PR_STS_PATH_FAST_FAILED || > > > - status == PR_STS_RETRY_PATH_FAILURE) > > > + if (status) > > > nfsd4_scsi_fence_clear(clp, bdev->bd_dev); > > > + else > > > + ls->ls_fenced = true; > > > + mutex_unlock(&clp->cl_fence_mutex); > > > + nfs4_put_stid(&ls->ls_stid); > > > > > > trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); > > > } > > > diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c > > > index ad7af8cfcf1f..1c498f3cd059 100644 > > > --- a/fs/nfsd/nfs4layouts.c > > > +++ b/fs/nfsd/nfs4layouts.c > > > @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) > > > return 0; > > > } > > > > > > +static void > > > +nfsd4_layout_fence_worker(struct work_struct *work) > > > +{ > > > + struct nfsd_file *nf; > > > + struct delayed_work *dwork = to_delayed_work(work); > > > + struct nfs4_layout_stateid *ls = container_of(dwork, > > > + struct nfs4_layout_stateid, ls_fence_work); > > > + u32 type; > > > + > > > + rcu_read_lock(); > > > + nf = nfsd_file_get(ls->ls_file); > > > + rcu_read_unlock(); > > > + if (!nf) { > > > + nfs4_put_stid(&ls->ls_stid); > > > + return; > > > + } > > > + > > > + type = ls->ls_layout_type; > > > + if (nfsd4_layout_ops[type]->fence_client) > > > + nfsd4_layout_ops[type]->fence_client(ls, nf); > > If you make fence_client an int/bool return, then you could just > > requeue this job to try it again. 
> > > > > + nfsd_file_put(nf); > > > +} > > > + > > > static struct nfs4_layout_stateid * > > > nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, > > > struct nfs4_stid *parent, u32 layout_type) > > > @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, > > > list_add(&ls->ls_perfile, &fp->fi_lo_states); > > > spin_unlock(&fp->fi_lock); > > > > > > + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); > > > + ls->ls_fenced = false; > > > + ls->ls_fence_retry_cnt = 0; > > > + > > > trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); > > > return ls; > > > } > > > @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) > > > rcu_read_unlock(); > > > if (fl) { > > > ops = nfsd4_layout_ops[ls->ls_layout_type]; > > > - if (ops->fence_client) > > > + if (ops->fence_client) { > > > + refcount_inc(&ls->ls_stid.sc_count); > > > ops->fence_client(ls, fl); > > > - else > > > + } else > > > nfsd4_cb_layout_fail(ls, fl); > > > nfsd_file_put(fl); > > > } > > > @@ -747,11 +775,9 @@ static bool > > > nfsd4_layout_lm_break(struct file_lease *fl) > > > { > > > /* > > > - * We don't want the locks code to timeout the lease for us; > > > - * we'll remove it ourself if a layout isn't returned > > > - * in time: > > > + * Enforce break lease timeout to prevent NFSD > > > + * thread from hanging in __break_lease. > > > */ > > > - fl->fl_break_time = 0; > > > nfsd4_recall_file_layout(fl->c.flc_owner); > > > return false; > > > } > > > @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg) > > > return 0; > > > } > > > > > > +/** > > > + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. > > > + * If the layout type supports a fence operation, schedule a worker to > > > + * fence the client from accessing the block device. 
> > > + * > > > + * @fl: file to check > > > + * > > > + * Return true if the file lease should be disposed of by the caller; > > > + * otherwise, return false. > > > + */ > > > +static bool > > > +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) > > > +{ > > > + struct nfs4_layout_stateid *ls = fl->c.flc_owner; > > > + bool ret; > > > + > > > + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) > > > + return true; > > > + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) > > > + return true; > > > + > > > + if (work_busy(&ls->ls_fence_work.work)) > > > + return false; > > > + /* Schedule work to do the fence operation */ > > > + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); > > > + if (!ret) { > > > + /* > > > + * If there is no pending work, mod_delayed_work queues > > > + * new task. While fencing is in progress, a reference > > > + * count is added to the layout stateid to ensure its > > > + * validity. This reference count is released once fencing > > > + * has been completed. > > > + */ > > > + refcount_inc(&ls->ls_stid.sc_count); > > > + ++ls->ls_fence_retry_cnt; > > > + return true; > > The cases where the fencing didn't work after too many retries, or the > > job couldn't be queued should probably get a pr_warn or something. The > > admin needs to know that data corruption is possible and that they > > might need to nuke the client manually. 
> > > > > + } > > > + return false; > > > +} > > > + > > > static const struct lease_manager_operations nfsd4_layouts_lm_ops = { > > > .lm_break = nfsd4_layout_lm_break, > > > .lm_change = nfsd4_layout_lm_change, > > > .lm_open_conflict = nfsd4_layout_lm_open_conflict, > > > + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, > > > }; > > > > > > int > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > > > index 583c13b5aaf3..a57fa3318362 100644 > > > --- a/fs/nfsd/nfs4state.c > > > +++ b/fs/nfsd/nfs4state.c > > > @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, > > > #endif > > > #ifdef CONFIG_NFSD_SCSILAYOUT > > > xa_init(&clp->cl_dev_fences); > > > + mutex_init(&clp->cl_fence_mutex); > > > #endif > > > INIT_LIST_HEAD(&clp->async_copies); > > > spin_lock_init(&clp->async_lock); > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > > > index 713f55ef6554..57e54dfb406c 100644 > > > --- a/fs/nfsd/state.h > > > +++ b/fs/nfsd/state.h > > > @@ -529,6 +529,7 @@ struct nfs4_client { > > > time64_t cl_ra_time; > > > #ifdef CONFIG_NFSD_SCSILAYOUT > > > struct xarray cl_dev_fences; > > > + struct mutex cl_fence_mutex; > > > #endif > > > }; > > > > > > @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { > > > stateid_t ls_recall_sid; > > > bool ls_recalled; > > > struct mutex ls_mutex; > > > + struct delayed_work ls_fence_work; > > > + bool ls_fenced; > > > + int ls_fence_retry_cnt; > > > }; > > > > > > +#define LO_MAX_FENCE_RETRY 5 > > > + > > > static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) > > > { > > > return container_of(s, struct nfs4_layout_stateid, ls_stid); > > > diff --git a/include/linux/filelock.h b/include/linux/filelock.h > > > index 2f5e5588ee07..13b9c9f04589 100644 > > > --- a/include/linux/filelock.h > > > +++ b/include/linux/filelock.h > > > @@ -50,6 +50,7 @@ struct lease_manager_operations { > > > void (*lm_setup)(struct file_lease *, void **); > > > bool 
(*lm_breaker_owns_lease)(struct file_lease *); > > > int (*lm_open_conflict)(struct file *, int); > > > + bool (*lm_breaker_timedout)(struct file_lease *fl); > > > }; > > > > > > struct lock_manager { -- Jeff Layton <jlayton@kernel.org> ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing 2026-01-28 18:01 ` Jeff Layton @ 2026-01-28 18:27 ` Dai Ngo 0 siblings, 0 replies; 12+ messages in thread From: Dai Ngo @ 2026-01-28 18:27 UTC (permalink / raw) To: Jeff Layton, chuck.lever, neil, okorniev, tom, hch, alex.aring, viro, brauner, jack Cc: linux-fsdevel, linux-nfs On 1/28/26 10:01 AM, Jeff Layton wrote: > On Wed, 2026-01-28 at 09:25 -0800, Dai Ngo wrote: >> On 1/27/26 9:54 AM, Jeff Layton wrote: >>> On Mon, 2026-01-26 at 16:50 -0800, Dai Ngo wrote: >>>> When a layout conflict triggers a recall, enforcing a timeout is >>>> necessary to prevent excessive nfsd threads from being blocked in >>>> __break_lease ensuring the server continues servicing incoming >>>> requests efficiently. >>>> >>>> This patch introduces a new function to lease_manager_operations: >>>> >>>> lm_breaker_timedout: Invoked when a lease recall times out and is >>>> about to be disposed of. This function enables the lease manager >>>> to inform the caller whether the file_lease should remain on the >>>> flc_list or be disposed of. >>>> >>>> For the NFSD lease manager, this function now handles layout recall >>>> timeouts. If the layout type supports fencing and the client has not >>>> been fenced, a fence operation is triggered to prevent the client >>>> from accessing the block device. >>>> >>>> While the fencing operation is in progress, the conflicting file_lease >>>> remains on the flc_list until fencing is complete. This guarantees >>>> that no other clients can access the file, and the client with >>>> exclusive access is properly blocked before disposal. 
>>>> >>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >>>> --- >>>> Documentation/filesystems/locking.rst | 2 + >>>> fs/locks.c | 10 +++- >>>> fs/nfsd/blocklayout.c | 38 ++++++------- >>>> fs/nfsd/nfs4layouts.c | 79 +++++++++++++++++++++++++-- >>>> fs/nfsd/nfs4state.c | 1 + >>>> fs/nfsd/state.h | 6 ++ >>>> include/linux/filelock.h | 1 + >>>> 7 files changed, 110 insertions(+), 27 deletions(-) >>>> >>>> v2: >>>> . Update Subject line to include fencing operation. >>>> . Allow conflicting lease to remain on flc_list until fencing >>>> is complete. >>>> . Use system worker to perform fencing operation asynchronously. >>>> . Use nfs4_stid.sc_count to ensure layout stateid remains >>>> valid before starting the fencing operation, nfs4_stid.sc_count >>>> is released after fencing operation is complete. >>>> . Rework nfsd4_scsi_fence_client to: >>>> . wait until fencing to complete before exiting. >>>> . wait until fencing in progress to complete before >>>> checking the NFSD_MDS_PR_FENCED flag. >>>> . Remove lm_need_to_retry from lease_manager_operations. >>>> v3: >>>> . correct locking requirement in locking.rst. >>>> . add max retry count to fencing operation. >>>> . add missing nfs4_put_stid in nfsd4_layout_fence_worker. >>>> . remove special-casing of FL_LAYOUT in lease_modify. >>>> . remove lease_want_dispose. >>>> . move lm_breaker_timedout call to time_out_leases. >>>> v4: >>>> . only increment ls_fence_retry_cnt after successfully >>>> schedule new work in nfsd4_layout_lm_breaker_timedout. 
>>>> >>>> diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst >>>> index 04c7691e50e0..a339491f02e4 100644 >>>> --- a/Documentation/filesystems/locking.rst >>>> +++ b/Documentation/filesystems/locking.rst >>>> @@ -403,6 +403,7 @@ prototypes:: >>>> bool (*lm_breaker_owns_lease)(struct file_lock *); >>>> bool (*lm_lock_expirable)(struct file_lock *); >>>> void (*lm_expire_lock)(void); >>>> + void (*lm_breaker_timedout)(struct file_lease *); >>>> >>>> locking rules: >>>> >>>> @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no >>>> lm_lock_expirable yes no no >>>> lm_expire_lock no no yes >>>> lm_open_conflict yes no no >>>> +lm_breaker_timedout yes no no >>>> ====================== ============= ================= ========= >>>> >>>> buffer_head >>>> diff --git a/fs/locks.c b/fs/locks.c >>>> index 46f229f740c8..1b63aa704598 100644 >>>> --- a/fs/locks.c >>>> +++ b/fs/locks.c >>>> @@ -1524,6 +1524,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) >>>> { >>>> struct file_lock_context *ctx = inode->i_flctx; >>>> struct file_lease *fl, *tmp; >>>> + bool remove = true; >>>> >>>> lockdep_assert_held(&ctx->flc_lock); >>>> >>>> @@ -1531,8 +1532,13 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) >>>> trace_time_out_leases(inode, fl); >>>> if (past_time(fl->fl_downgrade_time)) >>>> lease_modify(fl, F_RDLCK, dispose); >>>> - if (past_time(fl->fl_break_time)) >>>> - lease_modify(fl, F_UNLCK, dispose); >>>> + >>>> + if (past_time(fl->fl_break_time)) { >>>> + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) >>>> + remove = fl->fl_lmops->lm_breaker_timedout(fl); >>>> + if (remove) >>>> + lease_modify(fl, F_UNLCK, dispose); >>> I'd not bother with the return code to lm_breaker_timedout. >>> >>> Make it void return and have it call lease_modify itself before >>> returning in the cases where you have it returning true. 
If the >>> operation isn't defined then just do the lease_modify here like we >>> always have. >> The first call to lm_breaker_timedout schedules the fence worker so >> fencing is not done yet. If lm_breaker_timedout return to time_out_leases >> without any return value then lease_modify is called to remove the lease. >> >> Am i missing something? >> >> -Dai >> > if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) > fl->fl_lmops->lm_breaker_timedout(fl); > else > lease_modify(fl, F_UNLCK, dispose); > > ... the lm_breaker_timedout() function (or the workqueue job it spawns) > is then responsible for calling lease_modify() at the appropriate time. > > In most cases you're going to need to queue a workqueue job to fence, > and you don't want to clear out the lease until after it's done. Oh I see, thank you! -Dai > > >>>> + } >>>> } >>>> } >>>> >>>> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c >>>> index 7ba9e2dd0875..69d3889df302 100644 >>>> --- a/fs/nfsd/blocklayout.c >>>> +++ b/fs/nfsd/blocklayout.c >>>> @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, >>>> return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); >>>> } >>>> >>>> +/* >>>> + * Perform the fence operation to prevent the client from accessing the >>>> + * block device. If a fence operation is already in progress, wait for >>>> + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the >>>> + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, >>>> + * update the layout stateid by setting the ls_fenced flag to indicate >>>> + * that the client has been fenced. 
>>>> + */ >>>> static void >>>> nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) >>>> { >>>> @@ -450,31 +458,23 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) >>>> struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; >>>> int status; >>>> >>>> - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) >>>> + mutex_lock(&clp->cl_fence_mutex); >>>> + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { >>>> + ls->ls_fenced = true; >>>> + mutex_unlock(&clp->cl_fence_mutex); >>>> + nfs4_put_stid(&ls->ls_stid); >>>> return; >>>> + } >>>> >>> I don't understand what this new mutex is protecting, and this all >>> seems overly complex. If feels kind of like you want nfsd to be driving >>> the fencing retries, but I don't think we really do. Here's what I'd >>> do. >>> >>> I'd just make ->fence_client a bool or int return, and have it indicate >>> whether the client was successfully fenced or not. If it was >>> successfully fenced, then have the caller call lease_modify() to remove >>> the lease. If it wasn't successfully fenced, have the caller (the >>> workqueue job) requeue itself if you want to retry. If the caller is >>> ready to give up, then call lease_modify() on it and remove it (and >>> probably throw a pr_warn()). >>> >>>> status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, >>>> nfsd4_scsi_pr_key(clp), >>>> PR_EXCLUSIVE_ACCESS_REG_ONLY, true); >>>> - /* >>>> - * Reset to allow retry only when the command could not have >>>> - * reached the device. Negative status means a local error >>>> - * (e.g., -ENOMEM) prevented the command from being sent. >>>> - * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and >>>> - * PR_STS_RETRY_PATH_FAILURE indicate transport path failures >>>> - * before device delivery. >>>> - * >>>> - * For all other errors, the command may have reached the device >>>> - * and the preempt may have succeeded. 
Avoid resetting, since >>>> - * retrying a successful preempt returns PR_STS_IOERR or >>>> - * PR_STS_RESERVATION_CONFLICT, which would cause an infinite >>>> - * retry loop. >>>> - */ >>>> - if (status < 0 || >>>> - status == PR_STS_PATH_FAILED || >>>> - status == PR_STS_PATH_FAST_FAILED || >>>> - status == PR_STS_RETRY_PATH_FAILURE) >>>> + if (status) >>>> nfsd4_scsi_fence_clear(clp, bdev->bd_dev); >>>> + else >>>> + ls->ls_fenced = true; >>>> + mutex_unlock(&clp->cl_fence_mutex); >>>> + nfs4_put_stid(&ls->ls_stid); >>>> >>>> trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); >>>> } >>>> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c >>>> index ad7af8cfcf1f..1c498f3cd059 100644 >>>> --- a/fs/nfsd/nfs4layouts.c >>>> +++ b/fs/nfsd/nfs4layouts.c >>>> @@ -222,6 +222,29 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) >>>> return 0; >>>> } >>>> >>>> +static void >>>> +nfsd4_layout_fence_worker(struct work_struct *work) >>>> +{ >>>> + struct nfsd_file *nf; >>>> + struct delayed_work *dwork = to_delayed_work(work); >>>> + struct nfs4_layout_stateid *ls = container_of(dwork, >>>> + struct nfs4_layout_stateid, ls_fence_work); >>>> + u32 type; >>>> + >>>> + rcu_read_lock(); >>>> + nf = nfsd_file_get(ls->ls_file); >>>> + rcu_read_unlock(); >>>> + if (!nf) { >>>> + nfs4_put_stid(&ls->ls_stid); >>>> + return; >>>> + } >>>> + >>>> + type = ls->ls_layout_type; >>>> + if (nfsd4_layout_ops[type]->fence_client) >>>> + nfsd4_layout_ops[type]->fence_client(ls, nf); >>> If you make fence_client an int/bool return, then you could just >>> requeue this job to try it again. 
>>> >>>> + nfsd_file_put(nf); >>>> +} >>>> + >>>> static struct nfs4_layout_stateid * >>>> nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >>>> struct nfs4_stid *parent, u32 layout_type) >>>> @@ -271,6 +294,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, >>>> list_add(&ls->ls_perfile, &fp->fi_lo_states); >>>> spin_unlock(&fp->fi_lock); >>>> >>>> + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); >>>> + ls->ls_fenced = false; >>>> + ls->ls_fence_retry_cnt = 0; >>>> + >>>> trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); >>>> return ls; >>>> } >>>> @@ -708,9 +735,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) >>>> rcu_read_unlock(); >>>> if (fl) { >>>> ops = nfsd4_layout_ops[ls->ls_layout_type]; >>>> - if (ops->fence_client) >>>> + if (ops->fence_client) { >>>> + refcount_inc(&ls->ls_stid.sc_count); >>>> ops->fence_client(ls, fl); >>>> - else >>>> + } else >>>> nfsd4_cb_layout_fail(ls, fl); >>>> nfsd_file_put(fl); >>>> } >>>> @@ -747,11 +775,9 @@ static bool >>>> nfsd4_layout_lm_break(struct file_lease *fl) >>>> { >>>> /* >>>> - * We don't want the locks code to timeout the lease for us; >>>> - * we'll remove it ourself if a layout isn't returned >>>> - * in time: >>>> + * Enforce break lease timeout to prevent NFSD >>>> + * thread from hanging in __break_lease. >>>> */ >>>> - fl->fl_break_time = 0; >>>> nfsd4_recall_file_layout(fl->c.flc_owner); >>>> return false; >>>> } >>>> @@ -782,10 +808,51 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg) >>>> return 0; >>>> } >>>> >>>> +/** >>>> + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. >>>> + * If the layout type supports a fence operation, schedule a worker to >>>> + * fence the client from accessing the block device. >>>> + * >>>> + * @fl: file to check >>>> + * >>>> + * Return true if the file lease should be disposed of by the caller; >>>> + * otherwise, return false. 
>>>> + */ >>>> +static bool >>>> +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) >>>> +{ >>>> + struct nfs4_layout_stateid *ls = fl->c.flc_owner; >>>> + bool ret; >>>> + >>>> + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) >>>> + return true; >>>> + if (ls->ls_fenced || ls->ls_fence_retry_cnt >= LO_MAX_FENCE_RETRY) >>>> + return true; >>>> + >>>> + if (work_busy(&ls->ls_fence_work.work)) >>>> + return false; >>>> + /* Schedule work to do the fence operation */ >>>> + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); >>>> + if (!ret) { >>>> + /* >>>> + * If there is no pending work, mod_delayed_work queues >>>> + * new task. While fencing is in progress, a reference >>>> + * count is added to the layout stateid to ensure its >>>> + * validity. This reference count is released once fencing >>>> + * has been completed. >>>> + */ >>>> + refcount_inc(&ls->ls_stid.sc_count); >>>> + ++ls->ls_fence_retry_cnt; >>>> + return true; >>> The cases where the fencing didn't work after too many retries, or the >>> job couldn't be queued should probably get a pr_warn or something. The >>> admin needs to know that data corruption is possible and that they >>> might need to nuke the client manually. 
>>> >>>> + } >>>> + return false; >>>> +} >>>> + >>>> static const struct lease_manager_operations nfsd4_layouts_lm_ops = { >>>> .lm_break = nfsd4_layout_lm_break, >>>> .lm_change = nfsd4_layout_lm_change, >>>> .lm_open_conflict = nfsd4_layout_lm_open_conflict, >>>> + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, >>>> }; >>>> >>>> int >>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >>>> index 583c13b5aaf3..a57fa3318362 100644 >>>> --- a/fs/nfsd/nfs4state.c >>>> +++ b/fs/nfsd/nfs4state.c >>>> @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, >>>> #endif >>>> #ifdef CONFIG_NFSD_SCSILAYOUT >>>> xa_init(&clp->cl_dev_fences); >>>> + mutex_init(&clp->cl_fence_mutex); >>>> #endif >>>> INIT_LIST_HEAD(&clp->async_copies); >>>> spin_lock_init(&clp->async_lock); >>>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h >>>> index 713f55ef6554..57e54dfb406c 100644 >>>> --- a/fs/nfsd/state.h >>>> +++ b/fs/nfsd/state.h >>>> @@ -529,6 +529,7 @@ struct nfs4_client { >>>> time64_t cl_ra_time; >>>> #ifdef CONFIG_NFSD_SCSILAYOUT >>>> struct xarray cl_dev_fences; >>>> + struct mutex cl_fence_mutex; >>>> #endif >>>> }; >>>> >>>> @@ -738,8 +739,13 @@ struct nfs4_layout_stateid { >>>> stateid_t ls_recall_sid; >>>> bool ls_recalled; >>>> struct mutex ls_mutex; >>>> + struct delayed_work ls_fence_work; >>>> + bool ls_fenced; >>>> + int ls_fence_retry_cnt; >>>> }; >>>> >>>> +#define LO_MAX_FENCE_RETRY 5 >>>> + >>>> static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) >>>> { >>>> return container_of(s, struct nfs4_layout_stateid, ls_stid); >>>> diff --git a/include/linux/filelock.h b/include/linux/filelock.h >>>> index 2f5e5588ee07..13b9c9f04589 100644 >>>> --- a/include/linux/filelock.h >>>> +++ b/include/linux/filelock.h >>>> @@ -50,6 +50,7 @@ struct lease_manager_operations { >>>> void (*lm_setup)(struct file_lease *, void **); >>>> bool (*lm_breaker_owns_lease)(struct file_lease *); >>>> int 
(*lm_open_conflict)(struct file *, int); >>>> + bool (*lm_breaker_timedout)(struct file_lease *fl); >>>> }; >>>> >>>> struct lock_manager { ^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2026-01-28 18:28 UTC | newest] Thread overview: 12+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2026-01-27 0:50 [PATCH v4 1/1] NFSD: Enforce Timeout on Layout Recall and Integrate Lease Manager Fencing Dai Ngo 2026-01-27 17:11 ` Chuck Lever 2026-01-27 18:14 ` Jeff Layton 2026-01-27 20:38 ` Dai Ngo 2026-01-27 20:36 ` Dai Ngo 2026-01-27 21:53 ` Chuck Lever 2026-01-28 0:50 ` Dai Ngo 2026-01-27 17:54 ` Jeff Layton 2026-01-27 20:52 ` Dai Ngo 2026-01-28 17:25 ` Dai Ngo 2026-01-28 18:01 ` Jeff Layton 2026-01-28 18:27 ` Dai Ngo
This is a public inbox; see mirroring instructions for how to clone and mirror all data and code used by this inbox