* [PATCH] nfs: Implement delayed data server destruction with hold cache
From: Gaurav Gangalwar @ 2025-11-18 10:57 UTC
To: trondmy, anna, tom, chuck.lever; +Cc: linux-nfs, Gaurav Gangalwar
Introduce a hold cache mechanism for NFS pNFS data servers to avoid
unnecessary connection churn when data servers are temporarily idle.
Key changes:
1. Hold Cache Implementation:
- Add nfs4_data_server_hold_cache to namespace structure
- Move data servers to hold cache when refcount reaches zero
- Always update ds_last_access timestamp on every reference
2. Configurable Module Parameters:
- nfs4_pnfs_ds_grace_period: Grace period before destroying idle
data servers (default: 300 seconds)
- nfs4_pnfs_ds_cleanup_interval: Interval for periodic cleanup
work (default: 300 seconds)
3. Periodic Cleanup Work:
- Schedule delayed work on first DS usage (lazy initialization)
- Check hold cache and destroy data servers that exceed grace period
- Reschedule work automatically for continuous monitoring
4. Callback Mechanism:
- Use function pointer callback to avoid circular module dependencies
- nfsv4.ko registers cleanup callback during initialization
- nfs.ko calls callback during namespace cleanup (if registered)
5. Timestamp Tracking:
- Add ds_last_access field to nfs4_pnfs_ds structure
- Update timestamp on DS allocation, lookup, and reference
Benefits:
- Reduces connection setup/teardown overhead for intermittently used DSs
- Allows DS reuse if accessed again within grace period
- Configurable behavior via module parameters
Signed-off-by: Gaurav Gangalwar <gaurav.gangalwar@gmail.com>
---
fs/nfs/client.c | 9 +++
fs/nfs/netns.h | 4 ++
fs/nfs/pnfs.h | 8 +++
fs/nfs/pnfs_nfs.c | 174 ++++++++++++++++++++++++++++++++++++++++++++--
4 files changed, 188 insertions(+), 7 deletions(-)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 54699299d5b1..c487a388ea86 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1253,7 +1253,9 @@ void nfs_clients_init(struct net *net)
#endif
#if IS_ENABLED(CONFIG_NFS_V4_1)
INIT_LIST_HEAD(&nn->nfs4_data_server_cache);
+ INIT_LIST_HEAD(&nn->nfs4_data_server_hold_cache);
spin_lock_init(&nn->nfs4_data_server_lock);
+ /* Cleanup work will be initialized when pNFS is first used */
#endif
spin_lock_init(&nn->nfs_client_lock);
nn->boot_time = ktime_get_real();
@@ -1267,12 +1269,19 @@ void nfs_clients_exit(struct net *net)
{
struct nfs_net *nn = net_generic(net, nfs_net_id);
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+ /* Clean up DS caches if pnfs was used - call via callback to avoid module dependency */
+ if (nn->nfs4_data_server_cleanup_callback)
+ nn->nfs4_data_server_cleanup_callback(net);
+#endif
+
nfs_netns_sysfs_destroy(nn);
nfs_cleanup_cb_ident_idr(net);
WARN_ON_ONCE(!list_empty(&nn->nfs_client_list));
WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list));
#if IS_ENABLED(CONFIG_NFS_V4_1)
WARN_ON_ONCE(!list_empty(&nn->nfs4_data_server_cache));
+ WARN_ON_ONCE(!list_empty(&nn->nfs4_data_server_hold_cache));
#endif
}
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index 6ba3ea39e928..dc6142bf87e6 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -34,7 +34,11 @@ struct nfs_net {
#endif /* CONFIG_NFS_V4 */
#if IS_ENABLED(CONFIG_NFS_V4_1)
struct list_head nfs4_data_server_cache;
+ struct list_head nfs4_data_server_hold_cache;
spinlock_t nfs4_data_server_lock;
+ struct delayed_work nfs4_data_server_cleanup_work;
+ bool nfs4_data_server_cleanup_initialized;
+ void (*nfs4_data_server_cleanup_callback)(struct net *);
#endif /* CONFIG_NFS_V4_1 */
struct nfs_netns_client *nfs_client;
spinlock_t nfs_client_lock;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 91ff877185c8..7fd09100e130 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -65,6 +65,7 @@ struct nfs4_pnfs_ds {
refcount_t ds_count;
unsigned long ds_state;
#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
+ unsigned long ds_last_access; /* timestamp of last reference */
};
struct pnfs_layout_segment {
@@ -416,6 +417,13 @@ int pnfs_generic_commit_pagelist(struct inode *inode,
int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max);
void pnfs_generic_write_commit_done(struct rpc_task *task, void *data);
void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds);
+void nfs4_pnfs_ds_cleanup_work(struct work_struct *work);
+void destroy_ds(struct nfs4_pnfs_ds *ds);
+
+/* Module parameters for DS cache management */
+extern unsigned int nfs4_pnfs_ds_grace_period;
+extern unsigned int nfs4_pnfs_ds_cleanup_interval;
+
struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(const struct net *net,
struct list_head *dsaddrs,
gfp_t gfp_flags);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 9976cc16b689..83ef72017370 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -21,6 +21,17 @@
#define NFSDBG_FACILITY NFSDBG_PNFS
+/* Module parameters */
+unsigned int nfs4_pnfs_ds_grace_period = 300;
+module_param_named(nfs4_pnfs_ds_grace_period, nfs4_pnfs_ds_grace_period, uint, 0644);
+MODULE_PARM_DESC(nfs4_pnfs_ds_grace_period, "Grace period in seconds before destroying idle data servers (default 300)");
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_grace_period);
+
+unsigned int nfs4_pnfs_ds_cleanup_interval = 300;
+module_param_named(nfs4_pnfs_ds_cleanup_interval, nfs4_pnfs_ds_cleanup_interval, uint, 0644);
+MODULE_PARM_DESC(nfs4_pnfs_ds_cleanup_interval, "Interval in seconds for data server cleanup work (default 300)");
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_cleanup_interval);
+
void pnfs_generic_rw_release(void *data)
{
struct nfs_pgio_header *hdr = data;
@@ -604,15 +615,33 @@ _same_data_server_addrs_locked(const struct list_head *dsaddrs1,
/*
* Lookup DS by addresses. nfs4_ds_cache_lock is held
+ * Check both active cache and hold cache
*/
static struct nfs4_pnfs_ds *
-_data_server_lookup_locked(const struct nfs_net *nn, const struct list_head *dsaddrs)
+_data_server_lookup_locked(struct nfs_net *nn, const struct list_head *dsaddrs)
{
struct nfs4_pnfs_ds *ds;
- list_for_each_entry(ds, &nn->nfs4_data_server_cache, ds_node)
- if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
+ /* First check active cache */
+ list_for_each_entry(ds, &nn->nfs4_data_server_cache, ds_node) {
+ if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) {
+ refcount_inc(&ds->ds_count);
+ ds->ds_last_access = jiffies;
return ds;
+ }
+ }
+
+ /* Check hold cache - if found, move back to active cache */
+ list_for_each_entry(ds, &nn->nfs4_data_server_hold_cache, ds_node) {
+ if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) {
+ dprintk("NFS: DS %s found in hold cache, moving back to active cache\n",
+ ds->ds_remotestr);
+ list_move(&ds->ds_node, &nn->nfs4_data_server_cache);
+ ds->ds_last_access = jiffies;
+ return ds;
+ }
+ }
+
return NULL;
}
@@ -631,7 +660,7 @@ static void nfs4_pnfs_ds_addr_free(struct nfs4_pnfs_ds_addr *da)
kfree(da);
}
-static void destroy_ds(struct nfs4_pnfs_ds *ds)
+void destroy_ds(struct nfs4_pnfs_ds *ds)
{
struct nfs4_pnfs_ds_addr *da;
@@ -652,15 +681,143 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds)
kfree(ds->ds_remotestr);
kfree(ds);
}
+EXPORT_SYMBOL_GPL(destroy_ds);
+
+/* Forward declaration */
+static void nfs4_cleanup_ds_for_namespace(struct net *net);
+
+/*
+ * Initialize DS cleanup work for a namespace (called on first DS add)
+ */
+static void nfs4_init_ds_cleanup_work(struct nfs_net *nn)
+{
+ if (!nn->nfs4_data_server_cleanup_initialized) {
+ INIT_DELAYED_WORK(&nn->nfs4_data_server_cleanup_work,
+ nfs4_pnfs_ds_cleanup_work);
+ schedule_delayed_work(&nn->nfs4_data_server_cleanup_work,
+ nfs4_pnfs_ds_cleanup_interval * HZ);
+ /* Register callback so nfs.ko can call us during namespace cleanup */
+ nn->nfs4_data_server_cleanup_callback = nfs4_cleanup_ds_for_namespace;
+ nn->nfs4_data_server_cleanup_initialized = true;
+ dprintk("NFS: Initialized DS cleanup work for namespace (interval=%u seconds)\n",
+ nfs4_pnfs_ds_cleanup_interval);
+ }
+}
+
+/*
+ * Cleanup DS work and destroy all DS entries for a namespace
+ */
+static void nfs4_cleanup_ds_for_namespace(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct nfs4_pnfs_ds *ds, *tmp;
+
+ if (!nn->nfs4_data_server_cleanup_initialized)
+ return;
+
+ dprintk("NFS: Cleaning up DS for namespace\n");
+ cancel_delayed_work_sync(&nn->nfs4_data_server_cleanup_work);
+
+ /* Clean up data servers in both caches */
+ spin_lock(&nn->nfs4_data_server_lock);
+ list_for_each_entry_safe(ds, tmp, &nn->nfs4_data_server_cache, ds_node) {
+ list_del_init(&ds->ds_node);
+ spin_unlock(&nn->nfs4_data_server_lock);
+ destroy_ds(ds);
+ spin_lock(&nn->nfs4_data_server_lock);
+ }
+ list_for_each_entry_safe(ds, tmp, &nn->nfs4_data_server_hold_cache, ds_node) {
+ list_del_init(&ds->ds_node);
+ spin_unlock(&nn->nfs4_data_server_lock);
+ destroy_ds(ds);
+ spin_lock(&nn->nfs4_data_server_lock);
+ }
+ spin_unlock(&nn->nfs4_data_server_lock);
+
+ /* Unregister callback */
+ nn->nfs4_data_server_cleanup_callback = NULL;
+ nn->nfs4_data_server_cleanup_initialized = false;
+}
+
+/*
+ * Periodic cleanup task to check hold cache and destroy expired DS entries
+ */
+void nfs4_pnfs_ds_cleanup_work(struct work_struct *work)
+{
+ struct nfs_net *nn = container_of(work, struct nfs_net,
+ nfs4_data_server_cleanup_work.work);
+ struct nfs4_pnfs_ds *ds, *tmp;
+ LIST_HEAD(destroy_list);
+ unsigned long grace_period = nfs4_pnfs_ds_grace_period * HZ;
+ unsigned long now = jiffies;
+ int active_count = 0, hold_count = 0, expired_count = 0;
+
+ dprintk("NFS: DS cleanup work started for namespace (jiffies=%lu)\n", now);
+
+ spin_lock(&nn->nfs4_data_server_lock);
+
+ /* Count entries in active cache */
+ list_for_each_entry(ds, &nn->nfs4_data_server_cache, ds_node)
+ active_count++;
+
+ /* Process hold cache */
+ list_for_each_entry_safe(ds, tmp, &nn->nfs4_data_server_hold_cache, ds_node) {
+ unsigned long time_since_last_access = now - ds->ds_last_access;
+
+ hold_count++;
+ if (time_since_last_access >= grace_period) {
+ /* Grace period expired, move to destroy list */
+ dprintk("NFS: DS cleanup task destroying expired DS: %s (idle for %lu seconds)\n",
+ ds->ds_remotestr, time_since_last_access / HZ);
+ list_move(&ds->ds_node, &destroy_list);
+ expired_count++;
+ } else {
+ dprintk("NFS: DS %s in hold cache (idle for %lu seconds, %lu seconds remaining)\n",
+ ds->ds_remotestr, time_since_last_access / HZ,
+ (grace_period - time_since_last_access) / HZ);
+ }
+ }
+
+ spin_unlock(&nn->nfs4_data_server_lock);
+
+ dprintk("NFS: DS cleanup work: active_cache=%d, hold_cache=%d, expired=%d\n",
+ active_count, hold_count, expired_count);
+
+ /* Destroy DS entries outside of lock */
+ list_for_each_entry_safe(ds, tmp, &destroy_list, ds_node) {
+ list_del_init(&ds->ds_node);
+ destroy_ds(ds);
+ }
+
+ /* Reschedule cleanup task */
+ dprintk("NFS: DS cleanup work completed, rescheduling in %u seconds\n",
+ nfs4_pnfs_ds_cleanup_interval);
+ schedule_delayed_work(&nn->nfs4_data_server_cleanup_work,
+ nfs4_pnfs_ds_cleanup_interval * HZ);
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_cleanup_work);
void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
{
struct nfs_net *nn = net_generic(ds->ds_net, nfs_net_id);
if (refcount_dec_and_lock(&ds->ds_count, &nn->nfs4_data_server_lock)) {
- list_del_init(&ds->ds_node);
+ /* Update last access time */
+ ds->ds_last_access = jiffies;
+
+ dprintk("NFS: DS refcount reached 0 for %s, moving to hold cache\n",
+ ds->ds_remotestr);
+
+ /* Move to hold cache - cleanup work will check grace period */
+ refcount_set(&ds->ds_count, 1);
+ list_move(&ds->ds_node, &nn->nfs4_data_server_hold_cache);
+ spin_unlock(&nn->nfs4_data_server_lock);
+ dprintk("NFS: DS %s moved to hold cache\n", ds->ds_remotestr);
+ } else {
+ /* Update last access time even when not moving to hold cache */
+ spin_lock(&nn->nfs4_data_server_lock);
+ ds->ds_last_access = jiffies;
spin_unlock(&nn->nfs4_data_server_lock);
- destroy_ds(ds);
}
}
EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put);
@@ -753,13 +910,16 @@ nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_fla
} else {
kfree(remotestr);
kfree(ds);
- refcount_inc(&tmp_ds->ds_count);
dprintk("%s data server %s found, inc'ed ds_count to %d\n",
__func__, tmp_ds->ds_remotestr,
refcount_read(&tmp_ds->ds_count));
ds = tmp_ds;
}
spin_unlock(&nn->nfs4_data_server_lock);
+
+ /* Initialize cleanup work on first DS add */
+ if (ds)
+ nfs4_init_ds_cleanup_work(nn);
out:
return ds;
}
--
2.43.0
* Re: [PATCH] nfs: Implement delayed data server destruction with hold cache
From: Trond Myklebust @ 2025-11-18 14:43 UTC
To: Gaurav Gangalwar, anna, tom, chuck.lever; +Cc: linux-nfs
On Tue, 2025-11-18 at 05:57 -0500, Gaurav Gangalwar wrote:
> Introduce a hold cache mechanism for NFS pNFS data servers to avoid
> unnecessary connection churn when data servers are temporarily idle.
>
> Key changes:
>
> 1. Hold Cache Implementation:
> - Add nfs4_data_server_hold_cache to namespace structure
> - Move data servers to hold cache when refcount reaches zero
> - Always update ds_last_access timestamp on every reference
>
> 2. Configurable Module Parameters:
> - nfs4_pnfs_ds_grace_period: Grace period before destroying idle
> data servers (default: 300 seconds)
> - nfs4_pnfs_ds_cleanup_interval: Interval for periodic cleanup
> work (default: 300 seconds)
>
> 3. Periodic Cleanup Work:
> - Schedule delayed work on first DS usage (lazy initialization)
> - Check hold cache and destroy data servers that exceed grace
> period
> - Reschedule work automatically for continuous monitoring
>
> 4. Callback Mechanism:
> - Use function pointer callback to avoid circular module
> dependencies
> - nfsv4.ko registers cleanup callback during initialization
> - nfs.ko calls callback during namespace cleanup (if registered)
>
> 5. Timestamp Tracking:
> - Add ds_last_access field to nfs4_pnfs_ds structure
> - Update timestamp on DS allocation, lookup, and reference
>
> Benefits:
> - Reduces connection setup/teardown overhead for intermittently used
> DSs
> - Allows DS reuse if accessed again within grace period
> - Configurable behavior via module parameters
>
Please read RFC8881 Section 12.2.10
(https://datatracker.ietf.org/doc/html/rfc8881#device_ids)
Specifically, the following paragraph, which disallows what you are
proposing:
Device ID to device address mappings are not leased, and can be changed
at any time. (Note that while device ID to device address mappings are
likely to change after the metadata server restarts, the server is not
required to change the mappings.) A server has two choices for changing
mappings. It can recall all layouts referring to the device ID or it
can use a notification mechanism.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trondmy@kernel.org, trond.myklebust@hammerspace.com
* Re: [PATCH] nfs: Implement delayed data server destruction with hold cache
From: Trond Myklebust @ 2025-11-18 16:16 UTC
To: Gaurav Gangalwar, anna, tom, chuck.lever; +Cc: linux-nfs
On Tue, 2025-11-18 at 09:43 -0500, Trond Myklebust wrote:
> On Tue, 2025-11-18 at 05:57 -0500, Gaurav Gangalwar wrote:
> > Introduce a hold cache mechanism for NFS pNFS data servers to avoid
> > unnecessary connection churn when data servers are temporarily
> > idle.
> >
> > Key changes:
> >
> > 1. Hold Cache Implementation:
> > - Add nfs4_data_server_hold_cache to namespace structure
> > - Move data servers to hold cache when refcount reaches zero
> > - Always update ds_last_access timestamp on every reference
> >
> > 2. Configurable Module Parameters:
> > - nfs4_pnfs_ds_grace_period: Grace period before destroying idle
> > data servers (default: 300 seconds)
> > - nfs4_pnfs_ds_cleanup_interval: Interval for periodic cleanup
> > work (default: 300 seconds)
> >
> > 3. Periodic Cleanup Work:
> > - Schedule delayed work on first DS usage (lazy initialization)
> > - Check hold cache and destroy data servers that exceed grace
> > period
> > - Reschedule work automatically for continuous monitoring
> >
> > 4. Callback Mechanism:
> > - Use function pointer callback to avoid circular module
> > dependencies
> > - nfsv4.ko registers cleanup callback during initialization
> > - nfs.ko calls callback during namespace cleanup (if registered)
> >
> > 5. Timestamp Tracking:
> > - Add ds_last_access field to nfs4_pnfs_ds structure
> > - Update timestamp on DS allocation, lookup, and reference
> >
> > Benefits:
> > - Reduces connection setup/teardown overhead for intermittently
> > used
> > DSs
> > - Allows DS reuse if accessed again within grace period
> > - Configurable behavior via module parameters
> >
>
> Please read RFC8881 Section 12.2.10
> (https://datatracker.ietf.org/doc/html/rfc8881#device_ids)
>
> Specifically, the following paragraph, which disallows what you are
> proposing:
>
> Device ID to device address mappings are not leased, and can be
> changed
> at any time. (Note that while device ID to device address mappings
> are
> likely to change after the metadata server restarts, the server is
> not
> required to change the mappings.) A server has two choices for
> changing
> mappings. It can recall all layouts referring to the device ID or it
> can use a notification mechanism.
>
Note that you could circumvent the above restriction by adding a
revalidating step.
i.e. in order to figure out if the cached addresses and connections are
still valid and preferred, call GETDEVICEINFO after receiving the first
LAYOUTGET to re-reference the cached device id.
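Very roughly, something like the sketch below (completely untested; the
helpers pnfs_ds_addrs_refresh() and pnfs_ds_hold_cache_lookup() are made-up
names rather than existing client code, and error handling / freeing of the
refreshed address list is omitted):
static struct nfs4_pnfs_ds *
pnfs_ds_reuse_or_reconnect(struct nfs_server *server,
                           struct nfs4_deviceid_node *devid)
{
        struct nfs4_pnfs_ds *ds;
        LIST_HEAD(fresh_dsaddrs);
        /*
         * First LAYOUTGET referencing this deviceid since the DS went idle:
         * re-issue GETDEVICEINFO so the MDS gets the chance to hand out a
         * different address list (the mappings are not leased per
         * RFC 8881, Section 12.2.10).
         */
        if (pnfs_ds_addrs_refresh(server, devid, &fresh_dsaddrs) < 0)
                return NULL;
        /* Reuse the held connection only if the addresses still match. */
        ds = pnfs_ds_hold_cache_lookup(server->nfs_client->cl_net,
                                       &fresh_dsaddrs);
        if (ds != NULL)
                return ds;
        /* Otherwise fall back to setting up a new DS connection. */
        return nfs4_pnfs_ds_add(server->nfs_client->cl_net,
                                &fresh_dsaddrs, GFP_KERNEL);
}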
However given that we usually keep layouts around until the delegation
is returned (assuming the server handed us one), we should be caching
these connections for a minute or so already.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trondmy@kernel.org, trond.myklebust@hammerspace.com
* Re: [PATCH] nfs: Implement delayed data server destruction with hold cache
From: gaurav gangalwar @ 2025-11-19 16:34 UTC
To: Trond Myklebust; +Cc: anna, tom, chuck.lever, linux-nfs
Thanks Trond for review comments, reply inline.
On Tue, Nov 18, 2025 at 9:46 PM Trond Myklebust <trondmy@kernel.org> wrote:
>
> On Tue, 2025-11-18 at 09:43 -0500, Trond Myklebust wrote:
> > On Tue, 2025-11-18 at 05:57 -0500, Gaurav Gangalwar wrote:
> > > Introduce a hold cache mechanism for NFS pNFS data servers to avoid
> > > unnecessary connection churn when data servers are temporarily
> > > idle.
> > >
> > > Key changes:
> > >
> > > 1. Hold Cache Implementation:
> > > - Add nfs4_data_server_hold_cache to namespace structure
> > > - Move data servers to hold cache when refcount reaches zero
> > > - Always update ds_last_access timestamp on every reference
> > >
> > > 2. Configurable Module Parameters:
> > > - nfs4_pnfs_ds_grace_period: Grace period before destroying idle
> > > data servers (default: 300 seconds)
> > > - nfs4_pnfs_ds_cleanup_interval: Interval for periodic cleanup
> > > work (default: 300 seconds)
> > >
> > > 3. Periodic Cleanup Work:
> > > - Schedule delayed work on first DS usage (lazy initialization)
> > > - Check hold cache and destroy data servers that exceed grace
> > > period
> > > - Reschedule work automatically for continuous monitoring
> > >
> > > 4. Callback Mechanism:
> > > - Use function pointer callback to avoid circular module
> > > dependencies
> > > - nfsv4.ko registers cleanup callback during initialization
> > > - nfs.ko calls callback during namespace cleanup (if registered)
> > >
> > > 5. Timestamp Tracking:
> > > - Add ds_last_access field to nfs4_pnfs_ds structure
> > > - Update timestamp on DS allocation, lookup, and reference
> > >
> > > Benefits:
> > > - Reduces connection setup/teardown overhead for intermittently
> > > used
> > > DSs
> > > - Allows DS reuse if accessed again within grace period
> > > - Configurable behavior via module parameters
> > >
> >
> > Please read RFC8881 Section 12.2.10
> > (https://datatracker.ietf.org/doc/html/rfc8881#device_ids)
> >
> > Specifically, the following paragraph, which disallows what you are
> > proposing:
> >
> > Device ID to device address mappings are not leased, and can be
> > changed
> > at any time. (Note that while device ID to device address mappings
> > are
> > likely to change after the metadata server restarts, the server is
> > not
> > required to change the mappings.) A server has two choices for
> > changing
> > mappings. It can recall all layouts referring to the device ID or it
> > can use a notification mechanism.
> >
nfs4_data_server_cache is per network namespace and caches ds_addrs ->
nfs_client, so it should be independent of the device id.
I am trying to understand how a change in the device ID to device address
mapping can make a difference to nfs4_data_server_cache,
since the cache lookup is done using the ds address. As long as the
address and connections are valid it should be fine.
One scenario I can think of where the address is valid but the connection
is not is an IP address move, but in that case the connection should reset
and the NFS client should reconnect.
>
> Note that you could circumvent the above restriction by adding a
> revalidating step.
> i.e. in order to figure out if the cached addresses and connections are
> still valid and preferred, call GETDEVICEINFO after receiving the first
> LAYOUTGET to re-reference the cached device id.
I didn't get this; GETDEVICEINFO should already be happening after
LAYOUTGET, so if there is a change in the device info the client will get it.
>
> However given that we usually keep layouts around until the delegation
> is returned (assuming the server handed us one), we should be caching
> these connections for a minute or so already.
We have enabled only read delegations, so this is unlikely to help.
Regards,
Gaurav
>
> --
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trondmy@kernel.org, trond.myklebust@hammerspace.com
* Re: [PATCH] nfs: Implement delayed data server destruction with hold cache
From: Trond Myklebust @ 2025-11-19 18:24 UTC
To: gaurav gangalwar; +Cc: anna, tom, chuck.lever, linux-nfs
On Wed, 2025-11-19 at 22:04 +0530, gaurav gangalwar wrote:
> Thanks Trond for review comments, reply inline.
>
> On Tue, Nov 18, 2025 at 9:46 PM Trond Myklebust <trondmy@kernel.org>
> wrote:
> >
> > On Tue, 2025-11-18 at 09:43 -0500, Trond Myklebust wrote:
> > > On Tue, 2025-11-18 at 05:57 -0500, Gaurav Gangalwar wrote:
> > > > Introduce a hold cache mechanism for NFS pNFS data servers to
> > > > avoid
> > > > unnecessary connection churn when data servers are temporarily
> > > > idle.
> > > >
> > > > Key changes:
> > > >
> > > > 1. Hold Cache Implementation:
> > > > - Add nfs4_data_server_hold_cache to namespace structure
> > > > - Move data servers to hold cache when refcount reaches zero
> > > > - Always update ds_last_access timestamp on every reference
> > > >
> > > > 2. Configurable Module Parameters:
> > > > - nfs4_pnfs_ds_grace_period: Grace period before destroying
> > > > idle
> > > > data servers (default: 300 seconds)
> > > > - nfs4_pnfs_ds_cleanup_interval: Interval for periodic
> > > > cleanup
> > > > work (default: 300 seconds)
> > > >
> > > > 3. Periodic Cleanup Work:
> > > > - Schedule delayed work on first DS usage (lazy
> > > > initialization)
> > > > - Check hold cache and destroy data servers that exceed
> > > > grace
> > > > period
> > > > - Reschedule work automatically for continuous monitoring
> > > >
> > > > 4. Callback Mechanism:
> > > > - Use function pointer callback to avoid circular module
> > > > dependencies
> > > > - nfsv4.ko registers cleanup callback during initialization
> > > > - nfs.ko calls callback during namespace cleanup (if
> > > > registered)
> > > >
> > > > 5. Timestamp Tracking:
> > > > - Add ds_last_access field to nfs4_pnfs_ds structure
> > > > - Update timestamp on DS allocation, lookup, and reference
> > > >
> > > > Benefits:
> > > > - Reduces connection setup/teardown overhead for intermittently
> > > > used
> > > > DSs
> > > > - Allows DS reuse if accessed again within grace period
> > > > - Configurable behavior via module parameters
> > > >
> > >
> > > Please read RFC8881 Section 12.2.10
> > > (https://datatracker.ietf.org/doc/html/rfc8881#device_ids)
> > >
> > > Specifically, the following paragraph, which disallows what you
> > > are
> > > proposing:
> > >
> > > Device ID to device address mappings are not leased, and can be
> > > changed
> > > at any time. (Note that while device ID to device address
> > > mappings
> > > are
> > > likely to change after the metadata server restarts, the server
> > > is
> > > not
> > > required to change the mappings.) A server has two choices for
> > > changing
> > > mappings. It can recall all layouts referring to the device ID or
> > > it
> > > can use a notification mechanism.
> > >
> nfs4_data_server_cache is per network namespace and caches ds_addrs ->
> nfs_client, so it should be independent of the device id.
OK, but that dissociates the address cache from the deviceid cache, and
means that when we finally get round to implementing deviceid
notifications, then we'll have to manage 2 levels of caching. That's
not desirable either.
If you really need this extra caching of connections, then is there any
reason why you can't just implement it with deviceid notifications?
> I am trying to understand how a change in the device ID to device address
> mapping can make a difference to nfs4_data_server_cache,
> since the cache lookup is done using the ds address. As long as the
> address and connections are valid it should be fine.
> One scenario I can think of where the address is valid but the connection
> is not is an IP address move, but in that case the connection should reset
> and the NFS client should reconnect.
Are you asking under what circumstances a notification might want to be
sent?
The following come to mind: rebalancing client load across multiple IP
addresses, managing RDMA vs plain TCP connections, network
failover/failback to a different IP and/or subnet, or just letting the
client know about temporary outages of some addresses. In some cases,
it could even just be that the data server is being decommissioned, and
so the deviceids are being deleted permanently.
The point is that notifications allow you to do caching of connections
indefinitely if you want to.
One thing to note though, is that since hyperscalers have been known to
set up environments where the number of data servers reaches the 1000s,
you will at the very least want to limit the maximum size of the cache.
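For illustration only (nfs4_pnfs_ds_hold_max and pnfs_ds_hold_cache_trim()
below are invented names, and a real version would want the hold list kept
in least-recently-used order so the oldest entries are the ones evicted),
a cap on the hold cache could look roughly like:
static unsigned int nfs4_pnfs_ds_hold_max = 128;   /* hypothetical limit */
static void pnfs_ds_hold_cache_trim(struct nfs_net *nn)
{
        struct nfs4_pnfs_ds *ds, *tmp;
        LIST_HEAD(victims);
        unsigned int count = 0;
        spin_lock(&nn->nfs4_data_server_lock);
        list_for_each_entry_safe(ds, tmp,
                                 &nn->nfs4_data_server_hold_cache, ds_node) {
                /* Keep at most nfs4_pnfs_ds_hold_max idle entries. */
                if (++count > nfs4_pnfs_ds_hold_max)
                        list_move(&ds->ds_node, &victims);
        }
        spin_unlock(&nn->nfs4_data_server_lock);
        /* Destroy the evicted entries outside the lock, as the patch does. */
        list_for_each_entry_safe(ds, tmp, &victims, ds_node) {
                list_del_init(&ds->ds_node);
                destroy_ds(ds);
        }
}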
> >
> > Note that you could circumvent the above restriction by adding a
> > revalidating step.
> > i.e. in order to figure out if the cached addresses and connections
> > are
> > still valid and preferred, call GETDEVICEINFO after receiving the
> > first
> > LAYOUTGET to re-reference the cached device id.
> I didn't get this; GETDEVICEINFO should already be happening after
> LAYOUTGET, so if there is a change in the device info the client will get it.
> >
> > However given that we usually keep layouts around until the
> > delegation
> > is returned (assuming the server handed us one), we should be
> > caching
> > these connections for a minute or so already.
>
> We have enabled only read delegations, so this is unlikely to help.
Sure, but that's something you can fix on the server. The client
support is already fully implemented.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trondmy@kernel.org, trond.myklebust@hammerspace.com
* Re: [PATCH] nfs: Implement delayed data server destruction with hold cache
From: kernel test robot @ 2025-11-19 11:20 UTC
To: Gaurav Gangalwar, trondmy, anna, tom, chuck.lever
Cc: llvm, oe-kbuild-all, linux-nfs, Gaurav Gangalwar
Hi Gaurav,
kernel test robot noticed the following build warnings:
[auto build test WARNING on trondmy-nfs/linux-next]
[also build test WARNING on linus/master v6.18-rc6 next-20251119]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Gaurav-Gangalwar/nfs-Implement-delayed-data-server-destruction-with-hold-cache/20251118-190020
base: git://git.linux-nfs.org/projects/trondmy/linux-nfs.git linux-next
patch link: https://lore.kernel.org/r/20251118105752.52098-1-gaurav.gangalwar%40gmail.com
patch subject: [PATCH] nfs: Implement delayed data server destruction with hold cache
config: arm-lpc32xx_defconfig (https://download.01.org/0day-ci/archive/20251119/202511191852.nGdrhdUC-lkp@intel.com/config)
compiler: clang version 17.0.6 (https://github.com/llvm/llvm-project 6009708b4367171ccdbf4b5905cb6a803753fe18)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251119/202511191852.nGdrhdUC-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202511191852.nGdrhdUC-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> fs/nfs/pnfs_nfs.c:753:6: warning: variable 'active_count' set but not used [-Wunused-but-set-variable]
753 | int active_count = 0, hold_count = 0, expired_count = 0;
| ^
>> fs/nfs/pnfs_nfs.c:753:24: warning: variable 'hold_count' set but not used [-Wunused-but-set-variable]
753 | int active_count = 0, hold_count = 0, expired_count = 0;
| ^
>> fs/nfs/pnfs_nfs.c:753:40: warning: variable 'expired_count' set but not used [-Wunused-but-set-variable]
753 | int active_count = 0, hold_count = 0, expired_count = 0;
| ^
3 warnings generated.
vim +/active_count +753 fs/nfs/pnfs_nfs.c
741
742 /*
743 * Periodic cleanup task to check hold cache and destroy expired DS entries
744 */
745 void nfs4_pnfs_ds_cleanup_work(struct work_struct *work)
746 {
747 struct nfs_net *nn = container_of(work, struct nfs_net,
748 nfs4_data_server_cleanup_work.work);
749 struct nfs4_pnfs_ds *ds, *tmp;
750 LIST_HEAD(destroy_list);
751 unsigned long grace_period = nfs4_pnfs_ds_grace_period * HZ;
752 unsigned long now = jiffies;
> 753 int active_count = 0, hold_count = 0, expired_count = 0;
754
755 dprintk("NFS: DS cleanup work started for namespace (jiffies=%lu)\n", now);
756
757 spin_lock(&nn->nfs4_data_server_lock);
758
759 /* Count entries in active cache */
760 list_for_each_entry(ds, &nn->nfs4_data_server_cache, ds_node)
761 active_count++;
762
763 /* Process hold cache */
764 list_for_each_entry_safe(ds, tmp, &nn->nfs4_data_server_hold_cache, ds_node) {
765 unsigned long time_since_last_access = now - ds->ds_last_access;
766
767 hold_count++;
768 if (time_since_last_access >= grace_period) {
769 /* Grace period expired, move to destroy list */
770 dprintk("NFS: DS cleanup task destroying expired DS: %s (idle for %lu seconds)\n",
771 ds->ds_remotestr, time_since_last_access / HZ);
772 list_move(&ds->ds_node, &destroy_list);
773 expired_count++;
774 } else {
775 dprintk("NFS: DS %s in hold cache (idle for %lu seconds, %lu seconds remaining)\n",
776 ds->ds_remotestr, time_since_last_access / HZ,
777 (grace_period - time_since_last_access) / HZ);
778 }
779 }
780
781 spin_unlock(&nn->nfs4_data_server_lock);
782
783 dprintk("NFS: DS cleanup work: active_cache=%d, hold_cache=%d, expired=%d\n",
784 active_count, hold_count, expired_count);
785
786 /* Destroy DS entries outside of lock */
787 list_for_each_entry_safe(ds, tmp, &destroy_list, ds_node) {
788 list_del_init(&ds->ds_node);
789 destroy_ds(ds);
790 }
791
792 /* Reschedule cleanup task */
793 dprintk("NFS: DS cleanup work completed, rescheduling in %u seconds\n",
794 nfs4_pnfs_ds_cleanup_interval);
795 schedule_delayed_work(&nn->nfs4_data_server_cleanup_work,
796 nfs4_pnfs_ds_cleanup_interval * HZ);
797 }
798 EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_cleanup_work);
799
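Since the counters are only consumed by dprintk(), which compiles away on
this configuration, one possible (untested) way to silence the warnings,
other than dropping the counters entirely, would be:
-	int active_count = 0, hold_count = 0, expired_count = 0;
+	int __maybe_unused active_count = 0, hold_count = 0, expired_count = 0;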
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki