linux-nfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/3] pNFS generic device ID cache
@ 2010-04-16 15:52 andros
  2010-04-16 15:52 ` [PATCH 1/3] SQUASHME pnfs_submit: " andros
  0 siblings, 1 reply; 12+ messages in thread
From: andros @ 2010-04-16 15:52 UTC (permalink / raw)
  To: pnfs; +Cc: linux-nfs

This patch set implements a shared RCU device ID cache servicing multiple
mounts of a single layout type per meta data server (struct nfs_client).

0001-pnfs_submit-generic-device-ID-cache.patch
0002-pnfs_submit-fix-multiple-mount-set_pnfs_layoutdriver.patch
0003-pnfs-submit-file-layout-driver-generic-device-ID-cac.patch

These patches apply to the 2.6.34-rc3 pnfs branch below the following patch
which is the head of the revert patches - otherwise known as the first patch
not included in the pnfs file layout driver submission.

9a75b7356ca22b10903b507383646201cdcc9020

pnfs: filelayout: CB_NOTIFY_DEVICE support

    This reverts commit 933b55c15ed2b7fd89d8e0342bd5b2825726201a
    "SQUASHME pnfs_submit: remove filelayout CB_NOTIFY_DEVICE support"

Testing:

CONFIG_NFS_V4_1 set:

NFSv4.1/pNFS mounts:
Connectathon tests pass against GFS2/pNFS with a single AUTH_SYS mount, a double
AUTH_SYS mount, and an AUTH_SYS and AUTH_GSS/KRB5 mount (which creates
two superblocks under a struct nfs_client and both share the device id cache).

NFSv4.0 mount;
Connectathon tests pass

Did not test with multiple device ID's. I will create a mulitple device ID
test with the pynfs file layout server.

CONFIG_NFS_V4_1 not set:

NFSv4.0 mount: Connectathon tests pass.


-->Andy


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
  2010-04-16 15:52 [PATCH 0/3] pNFS generic device ID cache andros
@ 2010-04-16 15:52 ` andros
  2010-04-16 15:52   ` [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver andros
  2010-04-16 16:04   ` [PATCH 1/3] SQUASHME pnfs_submit: " William A. (Andy) Adamson
  0 siblings, 2 replies; 12+ messages in thread
From: andros @ 2010-04-16 15:52 UTC (permalink / raw)
  To: pnfs-Xh+NVF5n0LLYtjvyW6yDsg; +Cc: linux-nfs, Andy Adamson

From: Andy Adamson <andros@netapp.com>

A shared RCU device ID cache servicing multiple mounts of a single layout type
per meta data server (struct nfs_client).

Device IDs of type deviceid4 are required by all layout types, long lived and
read at each I/O.  They are added to the deviceid cache at first reference by
a layout via GETDEVICEINFO and (currently) are only removed at umount.

Reference count the device ID cache for each mounted file system
in the initialize_mountpoint layoutdriver_io_operation.

Dereference the device id cache on file system in the uninitialize_mountpoint
layoutdriver_io_operation called at umount

Signed-off-by: Andy Adamson <andros@netapp.com>
---
 fs/nfs/pnfs.c             |  119 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfs4_pnfs.h |   27 ++++++++++
 include/linux/nfs_fs_sb.h |    1 +
 3 files changed, 147 insertions(+), 0 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 91572aa..8492aef 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -45,6 +45,7 @@
 #include <linux/nfs4.h>
 #include <linux/pnfs_xdr.h>
 #include <linux/nfs4_pnfs.h>
+#include <linux/rculist.h>
 
 #include "internal.h"
 #include "nfs4_fs.h"
@@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops = {
 
 EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
 EXPORT_SYMBOL(pnfs_register_layoutdriver);
+
+
+/* Device ID cache. Supports one layout type per struct nfs_client */
+int
+nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
+			 void (*free_callback)(struct rcu_head *))
+{
+	struct nfs4_deviceid_cache *c;
+
+	c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
+	if (!c)
+		return -ENOMEM;
+	spin_lock(&clp->cl_lock);
+	if (clp->cl_devid_cache != NULL) {
+		kref_get(&clp->cl_devid_cache->dc_kref);
+		spin_unlock(&clp->cl_lock);
+		dprintk("%s [kref [%d]]\n", __func__,
+			atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
+		kfree(c);
+	} else {
+		spin_lock_init(&c->dc_lock);
+		INIT_LIST_HEAD(&c->dc_deviceids);
+		kref_init(&c->dc_kref);
+		c->dc_free_callback = free_callback;
+		c->dc_clp = clp;
+		clp->cl_devid_cache = c;
+		spin_unlock(&clp->cl_lock);
+		dprintk("%s [new]\n", __func__);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
+
+void
+nfs4_init_deviceid_node(struct nfs4_deviceid *d)
+{
+	INIT_LIST_HEAD(&d->de_node);
+	INIT_RCU_HEAD(&d->de_rcu);
+}
+EXPORT_SYMBOL(nfs4_init_deviceid_node);
+
+struct nfs4_deviceid *
+nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
+{
+	struct nfs4_deviceid *d;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
+		if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
+			rcu_read_unlock();
+			return d;
+		}
+	}
+	rcu_read_unlock();
+	return NULL;
+}
+EXPORT_SYMBOL(nfs4_find_deviceid);
+
+/*
+ * Add or kref_get a deviceid.
+ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
+ */
+void
+nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
+{
+	struct nfs4_deviceid *d;
+
+	spin_lock(&c->dc_lock);
+	list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
+		if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
+			spin_unlock(&c->dc_lock);
+			dprintk("%s [discard]\n", __func__);
+			c->dc_free_callback(&new->de_rcu);
+		}
+	}
+	list_add_rcu(&new->de_node, &c->dc_deviceids);
+	spin_unlock(&c->dc_lock);
+	dprintk("%s [new]\n", __func__);
+}
+EXPORT_SYMBOL(nfs4_add_deviceid);
+
+static int
+nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
+{
+	struct nfs4_deviceid *d;
+
+	spin_lock(&c->dc_lock);
+	list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
+		list_del_rcu(&d->de_node);
+		spin_unlock(&c->dc_lock);
+		synchronize_rcu();
+		c->dc_free_callback(&d->de_rcu);
+		return 1;
+	}
+	spin_unlock(&c->dc_lock);
+	return 0;
+}
+
+static void
+nfs4_free_deviceid_cache(struct kref *kref)
+{
+	struct nfs4_deviceid_cache *cache =
+		container_of(kref, struct nfs4_deviceid_cache, dc_kref);
+	int more = 1;
+
+	while (more)
+		more = nfs4_remove_deviceid(cache);
+	cache->dc_clp->cl_devid_cache = NULL;
+	kfree(cache);
+}
+
+void
+nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
+{
+	dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.refcount));
+	kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
+}
+EXPORT_SYMBOL(nfs4_put_deviceid_cache);
diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
index 1d521f4..2a88a06 100644
--- a/include/linux/nfs4_pnfs.h
+++ b/include/linux/nfs4_pnfs.h
@@ -281,6 +281,33 @@ struct pnfs_devicelist {
 	struct pnfs_deviceid	dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
 };
 
+/*
+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
+ */
+struct nfs4_deviceid_cache {
+	spinlock_t		dc_lock;
+	struct list_head	dc_deviceids;
+	struct kref		dc_kref;
+	void			(*dc_free_callback)(struct rcu_head *);
+	struct nfs_client	*dc_clp;
+};
+
+/* Device ID cache node */
+struct nfs4_deviceid {
+	struct list_head	de_node;
+	struct rcu_head		de_rcu;
+	struct pnfs_deviceid	de_id;
+};
+
+extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
+				void (*free_callback)(struct rcu_head *));
+extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
+extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
+extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
+				struct pnfs_deviceid *);
+extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
+				struct nfs4_deviceid *);
+
 /* pNFS client callback functions.
  * These operations allow the layout driver to access pNFS client
  * specific information or call pNFS client->server operations.
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 8522461..ef2e18e 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -87,6 +87,7 @@ struct nfs_client {
 	u32			cl_exchange_flags;
 	struct nfs4_session	*cl_session; 	/* sharred session */
 	struct list_head	cl_lo_inodes;	/* Inodes having layouts */
+	struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
 #endif /* CONFIG_NFS_V4_1 */
 
 #ifdef CONFIG_NFS_FSCACHE
-- 
1.6.6


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver
  2010-04-16 15:52 ` [PATCH 1/3] SQUASHME pnfs_submit: " andros
@ 2010-04-16 15:52   ` andros
  2010-04-16 15:52     ` [PATCH 3/3] SQUASHME pnfs-submit: file layout driver generic device ID cache andros
  2010-04-16 16:04   ` [PATCH 1/3] SQUASHME pnfs_submit: " William A. (Andy) Adamson
  1 sibling, 1 reply; 12+ messages in thread
From: andros @ 2010-04-16 15:52 UTC (permalink / raw)
  To: pnfs-Xh+NVF5n0LLYtjvyW6yDsg; +Cc: linux-nfs, Andy Adamson

From: Andy Adamson <andros@netapp.com>

The same struct nfs_server can enter set_pnfs_layoutdriver for mounts that
share a super block.  Don't initialize a pnfs mountpoint more than once.

Don't set the pnfs_curr_ld until the pnfs mountpoint initialization succeeds

Signed-off-by: Andy Adamson <andros@netapp.com>
---
 fs/nfs/pnfs.c |   15 ++++++++++-----
 1 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8492aef..1d903fd 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -215,20 +215,25 @@ set_pnfs_layoutdriver(struct super_block *sb, struct nfs_fh *fh, u32 id)
 	struct pnfs_mount_type *mt;
 	struct nfs_server *server = NFS_SB(sb);
 
+	if (server->pnfs_curr_ld)
+		return;
+
 	if (id > 0 && find_pnfs(id, &mod)) {
-		dprintk("%s: Setting pNFS module\n", __func__);
-		server->pnfs_curr_ld = mod->pnfs_ld_type;
-		mt = server->pnfs_curr_ld->ld_io_ops->initialize_mountpoint(
+		mt = mod->pnfs_ld_type->ld_io_ops->initialize_mountpoint(
 			sb, fh);
 		if (!mt) {
 			printk(KERN_ERR "%s: Error initializing mount point "
 			       "for layout driver %u. ", __func__, id);
 			goto out_err;
 		}
-		/* Layout driver succeeded in initializing mountpoint */
+		/*
+		 * Layout driver succeeded in initializing mountpoint
+		 * and has taken a reference on the nfs_client cl_devid_cache
+		 */
+		server->pnfs_curr_ld = mod->pnfs_ld_type;
 		server->pnfs_mountid = mt;
-		/* Set the rpc_ops */
 		server->nfs_client->rpc_ops = &pnfs_v4_clientops;
+		dprintk("%s: pNFS module for %u set\n", __func__, id);
 		return;
 	}
 
-- 
1.6.6


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 3/3] SQUASHME pnfs-submit: file layout driver generic device ID cache
  2010-04-16 15:52   ` [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver andros
@ 2010-04-16 15:52     ` andros
  0 siblings, 0 replies; 12+ messages in thread
From: andros @ 2010-04-16 15:52 UTC (permalink / raw)
  To: pnfs-Xh+NVF5n0LLYtjvyW6yDsg; +Cc: linux-nfs, Andy Adamson

From: Andy Adamson <andros@netapp.com>

Replace the per superblock deviceid cache with the generic deviceid cache.

Embed struct nfs4_deviceid into struct nfs4_file_layout_dsaddr, the file layout
specific deviceid structure.  Provide a free_deviceid_callback.

Signed-off-by: Andy Adamson <andros@netapp.com>
---
 fs/nfs/client.c            |    1 +
 fs/nfs/nfs4filelayout.c    |   31 +++-----
 fs/nfs/nfs4filelayout.h    |   12 +--
 fs/nfs/nfs4filelayoutdev.c |  190 ++++++++------------------------------------
 4 files changed, 51 insertions(+), 183 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e1d1646..82775b7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -38,6 +38,7 @@
 #include <net/ipv6.h>
 #include <linux/nfs_xdr.h>
 #include <linux/sunrpc/bc_xprt.h>
+#include <linux/nfs4_pnfs.h>
 
 #include <asm/system.h>
 
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 0530b59..c155586 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -76,17 +76,11 @@ filelayout_initialize_mountpoint(struct super_block *sb, struct nfs_fh *fh)
 {
 	struct filelayout_mount_type *fl_mt;
 	struct pnfs_mount_type *mt;
-	int status;
 
 	fl_mt = kmalloc(sizeof(struct filelayout_mount_type), GFP_KERNEL);
 	if (!fl_mt)
 		goto error_ret;
 
-	/* Initialize nfs4 file layout specific device list structure */
-	fl_mt->hlist = kmalloc(sizeof(struct nfs4_pnfs_dev_hlist), GFP_KERNEL);
-	if (!fl_mt->hlist)
-		goto cleanup_fl_mt;
-
 	mt = kmalloc(sizeof(struct pnfs_mount_type), GFP_KERNEL);
 	if (!mt)
 		goto cleanup_fl_mt;
@@ -94,11 +88,11 @@ filelayout_initialize_mountpoint(struct super_block *sb, struct nfs_fh *fh)
 	fl_mt->fl_sb = sb;
 	mt->mountid = (void *)fl_mt;
 
-	status = nfs4_pnfs_devlist_init(fl_mt->hlist);
-	if (status)
+	if (nfs4_alloc_init_deviceid_cache(NFS_SB(sb)->nfs_client,
+					   nfs4_fl_free_deviceid_callback))
 		goto cleanup_mt;
 
-	dprintk("%s: device list has been initialized successfully\n",
+	dprintk("%s: deviceid cache has been initialized successfully\n",
 		__func__);
 	return mt;
 
@@ -106,11 +100,10 @@ cleanup_mt: ;
 	kfree(mt);
 
 cleanup_fl_mt: ;
-	kfree(fl_mt->hlist);
 	kfree(fl_mt);
 
 error_ret: ;
-	printk(KERN_WARNING "%s: device list could not be initialized\n",
+	printk(KERN_WARNING "%s: deviceid cache could not be initialized\n",
 		__func__);
 
 	return NULL;
@@ -123,13 +116,14 @@ filelayout_uninitialize_mountpoint(struct pnfs_mount_type *mountid)
 {
 	struct filelayout_mount_type *fl_mt = NULL;
 
+	dprintk("--> %s\n", __func__);
 	if (mountid) {
-		fl_mt = (struct filelayout_mount_type *)mountid->mountid;
+		struct nfs4_deviceid_cache *cache;
 
-		if (fl_mt != NULL) {
-			nfs4_pnfs_devlist_destroy(fl_mt->hlist);
-			kfree(fl_mt);
-		}
+		fl_mt = (struct filelayout_mount_type *)mountid->mountid;
+		cache = NFS_SB(fl_mt->fl_sb)->nfs_client->cl_devid_cache;
+		nfs4_put_deviceid_cache(cache);
+		kfree(fl_mt);
 		kfree(mountid);
 	}
 	return 0;
@@ -381,8 +375,7 @@ filelayout_check_layout(struct pnfs_layout_type *lo,
 	struct nfs_server *nfss = NFS_SERVER(PNFS_INODE(lo));
 
 	dprintk("--> %s\n", __func__);
-	dsaddr = nfs4_pnfs_device_item_find(FILE_MT(PNFS_INODE(lo))->hlist,
-					     &fl->dev_id);
+	dsaddr = nfs4_pnfs_device_item_find(nfss->nfs_client, &fl->dev_id);
 	if (dsaddr == NULL) {
 		dsaddr = get_device_info(PNFS_INODE(lo), &fl->dev_id);
 		if (dsaddr == NULL) {
@@ -618,7 +611,7 @@ filelayout_commit(struct pnfs_layout_type *layoutid, int sync,
 	stripesz = filelayout_get_stripesize(layoutid);
 	dprintk("%s stripesize %Zd\n", __func__, stripesz);
 
-	dsaddr = nfs4_pnfs_device_item_find(FILE_MT(data->inode)->hlist,
+	dsaddr = nfs4_pnfs_device_item_find(NFS_SERVER(data->inode)->nfs_client,
 					     &nfslay->dev_id);
 	if (dsaddr == NULL) {
 		data->pdata.pnfs_error = -EIO;
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 12498a2..2cb05bd 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -43,8 +43,7 @@ struct nfs4_pnfs_ds {
 };
 
 struct nfs4_file_layout_dsaddr {
-	struct hlist_node	hash_node;   /* nfs4_pnfs_dev_hlist dev_list */
-	struct pnfs_deviceid	dev_id;
+	struct nfs4_deviceid	deviceid;
 	u32 			stripe_count;
 	u8			*stripe_indices;
 	u32			ds_num;
@@ -86,15 +85,13 @@ struct nfs4_filelayout {
 
 struct filelayout_mount_type {
 	struct super_block *fl_sb;
-	struct nfs4_pnfs_dev_hlist *hlist;
 };
 
 extern struct pnfs_client_operations *pnfs_callback_ops;
 
+extern void nfs4_fl_free_deviceid_callback(struct rcu_head *);
 extern void print_ds(struct nfs4_pnfs_ds *ds);
 char *deviceid_fmt(const struct pnfs_deviceid *dev_id);
-int  nfs4_pnfs_devlist_init(struct nfs4_pnfs_dev_hlist *hlist);
-void nfs4_pnfs_devlist_destroy(struct nfs4_pnfs_dev_hlist *hlist);
 int nfs4_pnfs_dserver_get(struct pnfs_layout_segment *lseg,
 			  loff_t offset,
 			  size_t count,
@@ -102,9 +99,8 @@ int nfs4_pnfs_dserver_get(struct pnfs_layout_segment *lseg,
 u32 filelayout_dserver_get_index(loff_t offset,
 				 struct nfs4_file_layout_dsaddr *di,
 				 struct nfs4_filelayout_segment *layout);
-struct nfs4_file_layout_dsaddr *
-nfs4_pnfs_device_item_find(struct nfs4_pnfs_dev_hlist *hlist,
-			   struct pnfs_deviceid *dev_id);
+extern struct nfs4_file_layout_dsaddr *
+nfs4_pnfs_device_item_find(struct nfs_client *, struct pnfs_deviceid *dev_id);
 struct nfs4_file_layout_dsaddr *
 get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id);
 
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 813ddbb..411ffcb 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -45,6 +45,7 @@
 
 #include <linux/utsname.h>
 #include <linux/vmalloc.h>
+#include <linux/nfs4_pnfs.h>
 #include <linux/pnfs_xdr.h>
 #include "nfs4filelayout.h"
 #include "internal.h"
@@ -98,42 +99,6 @@ deviceid_fmt(const struct pnfs_deviceid *dev_id)
 	return buf;
 }
 
-unsigned long
-_deviceid_hash(const struct pnfs_deviceid *dev_id)
-{
-	unsigned char *cptr = (unsigned char *)dev_id->data;
-	unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE;
-	u64 x = 0;
-
-	while (nbytes--) {
-		x *= 37;
-		x += *cptr++;
-	}
-	return x & NFS4_PNFS_DEV_HASH_MASK;
-}
-
-/* Assumes lock is held */
-static inline struct nfs4_file_layout_dsaddr *
-_device_lookup(struct nfs4_pnfs_dev_hlist *hlist,
-	       const struct pnfs_deviceid *dev_id)
-{
-	unsigned long      hash;
-	struct hlist_node *np;
-
-	dprintk("_device_lookup: dev_id=%s\n", deviceid_fmt(dev_id));
-
-	hash = _deviceid_hash(dev_id);
-
-	hlist_for_each(np, &hlist->dev_list[hash]) {
-		struct nfs4_file_layout_dsaddr *dsaddr;
-		dsaddr = hlist_entry(np, struct nfs4_file_layout_dsaddr,
-				  hash_node);
-		if (!memcmp(&dsaddr->dev_id, dev_id, NFS4_PNFS_DEVICEID4_SIZE))
-			return dsaddr;
-	}
-	return NULL;
-}
-
 /* nfs4_ds_cache_lock is held */
 static inline struct nfs4_pnfs_ds *
 _data_server_lookup(u32 ip_addr, u32 port)
@@ -152,22 +117,6 @@ _data_server_lookup(u32 ip_addr, u32 port)
 	return NULL;
 }
 
-
-/* Assumes lock is held */
-static inline void
-_device_add(struct nfs4_pnfs_dev_hlist *hlist,
-	    struct nfs4_file_layout_dsaddr *dsaddr)
-{
-	unsigned long      hash;
-
-	dprintk("_device_add: dev_id=%s ds_list:\n",
-		deviceid_fmt(&dsaddr->dev_id));
-	print_ds_list(dsaddr);
-
-	hash = _deviceid_hash(&dsaddr->dev_id);
-	hlist_add_head(&dsaddr->hash_node, &hlist->dev_list[hash]);
-}
-
 /* Create an rpc to the data server defined in 'dev_list' */
 static int
 nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
@@ -269,118 +218,47 @@ out_put:
 static void
 destroy_ds(struct nfs4_pnfs_ds *ds)
 {
+	dprintk("--> %s\n", __func__);
+	print_ds(ds);
+
 	if (ds->ds_clp)
 		nfs_put_client(ds->ds_clp);
 	kfree(ds);
 }
 
-/* Assumes lock is NOT held */
 static void
-nfs4_pnfs_device_destroy(struct nfs4_file_layout_dsaddr *dsaddr,
-			 struct nfs4_pnfs_dev_hlist *hlist)
+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 {
 	struct nfs4_pnfs_ds *ds;
-	LIST_HEAD(release);
 	int i;
 
-	if (!dsaddr)
-		return;
-
-	dprintk("%s: dev_id=%s\ndev_list:\n", __func__,
-		deviceid_fmt(&dsaddr->dev_id));
-	print_ds_list(dsaddr);
-
-	write_lock(&hlist->dev_lock);
-	hlist_del_init(&dsaddr->hash_node);
+	dprintk("%s: device id=%s\n", __func__,
+		deviceid_fmt(&dsaddr->deviceid.de_id));
 
 	for (i = 0; i < dsaddr->ds_num; i++) {
 		ds = dsaddr->ds_list[i];
 		if (ds != NULL) {
-			/* if we are last user - move to release list */
 			if (atomic_dec_and_lock(&ds->ds_count,
 						&nfs4_ds_cache_lock)) {
 				list_del_init(&ds->ds_node);
 				spin_unlock(&nfs4_ds_cache_lock);
-				list_add(&ds->ds_node, &release);
+				destroy_ds(ds);
 			}
 		}
 	}
-	write_unlock(&hlist->dev_lock);
-	while (!list_empty(&release)) {
-		ds = list_entry(release.next, struct nfs4_pnfs_ds, ds_node);
-		list_del(&ds->ds_node);
-		destroy_ds(ds);
-	}
+	kfree(dsaddr->stripe_indices);
 	kfree(dsaddr);
 }
 
-int
-nfs4_pnfs_devlist_init(struct nfs4_pnfs_dev_hlist *hlist)
-{
-	int i;
-
-	rwlock_init(&hlist->dev_lock);
-
-	for (i = 0; i < NFS4_PNFS_DEV_HASH_SIZE; i++) {
-		INIT_HLIST_HEAD(&hlist->dev_list[i]);
-	}
-
-	return 0;
-}
-
-/* De-alloc all devices for a mount point.  This is called in
- * nfs4_kill_super.
- */
 void
-nfs4_pnfs_devlist_destroy(struct nfs4_pnfs_dev_hlist *hlist)
-{
-	int i;
-
-	if (hlist == NULL)
-		return;
-
-	/* No lock held, as synchronization should occur at upper levels */
-	for (i = 0; i < NFS4_PNFS_DEV_HASH_SIZE; i++) {
-		struct hlist_node *np, *next;
-
-		hlist_for_each_safe(np, next, &hlist->dev_list[i]) {
-			struct nfs4_file_layout_dsaddr *dsaddr;
-			dsaddr = hlist_entry(np,
-					     struct nfs4_file_layout_dsaddr,
-					     hash_node);
-			/* nfs4_pnfs_device_destroy grabs hlist->dev_lock */
-			nfs4_pnfs_device_destroy(dsaddr, hlist);
-		}
-	}
-}
-
-/*
- * Add the device to the list of available devices for this mount point.
- * The * rpc client is created during first I/O.
- */
-static int
-nfs4_pnfs_device_add(struct filelayout_mount_type *mt,
-		     struct nfs4_file_layout_dsaddr *dsaddr)
+nfs4_fl_free_deviceid_callback(struct rcu_head *rcu)
 {
-	struct nfs4_file_layout_dsaddr *tmp_dsaddr;
-	struct nfs4_pnfs_dev_hlist *hlist = mt->hlist;
-
-	dprintk("nfs4_pnfs_device_add\n");
-
-	/* Write lock, do lookup again, and then add device */
-	write_lock(&hlist->dev_lock);
-	tmp_dsaddr = _device_lookup(hlist, &dsaddr->dev_id);
-	if (tmp_dsaddr == NULL)
-		_device_add(hlist, dsaddr);
-	write_unlock(&hlist->dev_lock);
-
-	/* Cleanup, if device was recently added */
-	if (tmp_dsaddr != NULL) {
-		dprintk(" device found, not adding (after creation)\n");
-		nfs4_pnfs_device_destroy(dsaddr, hlist);
-	}
+	struct nfs4_deviceid *device =
+		container_of(rcu, struct nfs4_deviceid, de_rcu);
+	struct nfs4_file_layout_dsaddr *dsaddr =
+		container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
 
-	return 0;
+	nfs4_fl_free_deviceid(dsaddr);
 }
 
 static void
@@ -514,7 +392,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
 	dsaddr->stripe_count = cnt;
 	dsaddr->ds_num = num;
 
-	memcpy(&dsaddr->dev_id, &pdev->dev_id, NFS4_PNFS_DEVICEID4_SIZE);
+	memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id,
+	       NFS4_PNFS_DEVICEID4_SIZE);
 
 	/* Go back an read stripe indices */
 	p = indicesp;
@@ -553,19 +432,20 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
 			}
 		}
 	}
+	nfs4_init_deviceid_node(&dsaddr->deviceid);
+
 	return dsaddr;
 
 out_err_free:
-	nfs4_pnfs_device_destroy(dsaddr, FILE_MT(ino)->hlist);
+	nfs4_fl_free_deviceid(dsaddr);
 out_err:
 	dprintk("%s ERROR: returning NULL\n", __func__);
 	return NULL;
 }
 
-/* Decode the opaque device specified in 'dev'
- * and add it to the list of available devices for this
- * mount point.
- * Must at some point be followed up with nfs4_pnfs_device_destroy
+/*
+ * Decode the opaque device specified in 'dev'
+ * and add it to the list of available devices
  */
 static struct nfs4_file_layout_dsaddr*
 decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
@@ -574,14 +454,13 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
 
 	dsaddr = decode_device(inode, dev);
 	if (!dsaddr) {
-		printk(KERN_WARNING "%s: Could not decode device\n",
+		printk(KERN_WARNING "%s: Could not decode or add device\n",
 			__func__);
-		nfs4_pnfs_device_destroy(dsaddr, FILE_MT(inode)->hlist);
 		return NULL;
 	}
 
-	if (nfs4_pnfs_device_add(FILE_MT(inode), dsaddr))
-		return NULL;
+	nfs4_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
+			 &dsaddr->deviceid);
 
 	return dsaddr;
 }
@@ -660,16 +539,15 @@ out_free:
 }
 
 struct nfs4_file_layout_dsaddr *
-nfs4_pnfs_device_item_find(struct nfs4_pnfs_dev_hlist *hlist,
-			   struct pnfs_deviceid *dev_id)
+nfs4_pnfs_device_item_find(struct nfs_client *clp, struct pnfs_deviceid *id)
 {
-	struct nfs4_file_layout_dsaddr *dsaddr;
-
-	read_lock(&hlist->dev_lock);
-	dsaddr = _device_lookup(hlist, dev_id);
-	read_unlock(&hlist->dev_lock);
+	struct nfs4_deviceid *d;
 
-	return dsaddr;
+	d = nfs4_find_deviceid(clp->cl_devid_cache, id);
+	dprintk("%s device id (%s) nfs4_deviceid %p\n", __func__,
+		deviceid_fmt(id), d);
+	return (d == NULL) ? NULL :
+		container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
 }
 
 /* Want res = ((offset / layout->stripe_unit) % dsaddr->stripe_count)
@@ -707,7 +585,7 @@ nfs4_pnfs_dserver_get(struct pnfs_layout_segment *lseg,
 	if (!layout)
 		return 1;
 
-	dsaddr = nfs4_pnfs_device_item_find(FILE_MT(inode)->hlist,
+	dsaddr = nfs4_pnfs_device_item_find(NFS_SERVER(inode)->nfs_client,
 					    &layout->dev_id);
 	if (dsaddr == NULL)
 		return 1;
-- 
1.6.6


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
  2010-04-16 15:52 ` [PATCH 1/3] SQUASHME pnfs_submit: " andros
  2010-04-16 15:52   ` [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver andros
@ 2010-04-16 16:04   ` William A. (Andy) Adamson
       [not found]     ` <u2n89c397151004160904m9e862360xcaf0e187640b0177-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 12+ messages in thread
From: William A. (Andy) Adamson @ 2010-04-16 16:04 UTC (permalink / raw)
  To: pnfs; +Cc: linux-nfs, Andy Adamson

On Fri, Apr 16, 2010 at 11:52 AM,  <andros@netapp.com> wrote:
> From: Andy Adamson <andros@netapp.com>
>
> A shared RCU device ID cache servicing multiple mounts of a single la=
yout type
> per meta data server (struct nfs_client).
>
> Device IDs of type deviceid4 are required by all layout types, long l=
ived and
> read at each I/O. =A0They are added to the deviceid cache at first re=
ference by
> a layout via GETDEVICEINFO and (currently) are only removed at umount=
=2E
>
> Reference count the device ID cache for each mounted file system
> in the initialize_mountpoint layoutdriver_io_operation.
>
> Dereference the device id cache on file system in the uninitialize_mo=
untpoint
> layoutdriver_io_operation called at umount
>
> Signed-off-by: Andy Adamson <andros@netapp.com>
> ---
> =A0fs/nfs/pnfs.c =A0 =A0 =A0 =A0 =A0 =A0 | =A0119 +++++++++++++++++++=
++++++++++++++++++++++++++
> =A0include/linux/nfs4_pnfs.h | =A0 27 ++++++++++
> =A0include/linux/nfs_fs_sb.h | =A0 =A01 +
> =A03 files changed, 147 insertions(+), 0 deletions(-)
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 91572aa..8492aef 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -45,6 +45,7 @@
> =A0#include <linux/nfs4.h>
> =A0#include <linux/pnfs_xdr.h>
> =A0#include <linux/nfs4_pnfs.h>
> +#include <linux/rculist.h>
>
> =A0#include "internal.h"
> =A0#include "nfs4_fs.h"
> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops =3D {
>
> =A0EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
> =A0EXPORT_SYMBOL(pnfs_register_layoutdriver);
> +
> +
> +/* Device ID cache. Supports one layout type per struct nfs_client *=
/
> +int
> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_callback=
)(struct rcu_head *))
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid_cache *c;
> +
> + =A0 =A0 =A0 c =3D kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_K=
ERNEL);
> + =A0 =A0 =A0 if (!c)
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return -ENOMEM;
> + =A0 =A0 =A0 spin_lock(&clp->cl_lock);
> + =A0 =A0 =A0 if (clp->cl_devid_cache !=3D NULL) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_get(&clp->cl_devid_cache->dc_kref)=
;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [kref [%d]]\n", __func__,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&clp->cl_de=
vid_cache->dc_kref.refcount));
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kfree(c);
> + =A0 =A0 =A0 } else {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_lock_init(&c->dc_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 INIT_LIST_HEAD(&c->dc_deviceids);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_init(&c->dc_kref);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback =3D free_callback;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_clp =3D clp;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D c;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 return 0;
> +}
> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
> +
> +void
> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
> +{
> + =A0 =A0 =A0 INIT_LIST_HEAD(&d->de_node);
> + =A0 =A0 =A0 INIT_RCU_HEAD(&d->de_rcu);
> +}
> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
> +
> +struct nfs4_deviceid *
> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_device=
id *id)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid *d;
> +
> + =A0 =A0 =A0 rcu_read_lock();
> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, id, NFS4_PNFS_DE=
VICEID4_SIZE)) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 rcu_read_unlock();
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 rcu_read_unlock();
> + =A0 =A0 =A0 return NULL;
> +}
> +EXPORT_SYMBOL(nfs4_find_deviceid);
> +
> +/*
> + * Add or kref_get a deviceid.
> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, =
discard new
> + */
> +void
> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_devicei=
d *new)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid *d;
> +
> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, &new->de_id, NFS=
4_PNFS_DEVICEID4_SIZE)) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock=
);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [discard]\n=
", __func__);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&ne=
w->de_rcu);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 list_add_rcu(&new->de_node, &c->dc_deviceids);
> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
> + =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
> +}
> +EXPORT_SYMBOL(nfs4_add_deviceid);
> +
> +static int
> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid *d;
> +
> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 list_del_rcu(&d->de_node);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 synchronize_rcu();
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&d->de_rcu);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return 1;
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
> + =A0 =A0 =A0 return 0;
> +}
> +
> +static void
> +nfs4_free_deviceid_cache(struct kref *kref)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cache =3D
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 container_of(kref, struct nfs4_deviceid=
_cache, dc_kref);
> + =A0 =A0 =A0 int more =3D 1;
> +
> + =A0 =A0 =A0 while (more)
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 more =3D nfs4_remove_deviceid(cache);
> + =A0 =A0 =A0 cache->dc_clp->cl_devid_cache =3D NULL;

Need to take the cl_lock around this assignment

spin_lock(&cache->dc_clp->cl_lock);
cache->dc_clp->cl_devid_cache =3D NULL
spin_unlock(&cache->dc_clp->cl_lock);


> + =A0 =A0 =A0 kfree(cache);
> +}
> +
> +void
> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
> +{
> + =A0 =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.=
refcount));
> + =A0 =A0 =A0 kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
> +}
> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
> index 1d521f4..2a88a06 100644
> --- a/include/linux/nfs4_pnfs.h
> +++ b/include/linux/nfs4_pnfs.h
> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
> =A0 =A0 =A0 =A0struct pnfs_deviceid =A0 =A0dev_id[NFS4_PNFS_GETDEVLIS=
T_MAXNUM];
> =A0};
>
> +/*
> + * Device ID RCU cache. A device ID is unique per client ID and layo=
ut type.
> + */
> +struct nfs4_deviceid_cache {
> + =A0 =A0 =A0 spinlock_t =A0 =A0 =A0 =A0 =A0 =A0 =A0dc_lock;
> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0dc_deviceids;
> + =A0 =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 dc_kref;
> + =A0 =A0 =A0 void =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(*dc_free_c=
allback)(struct rcu_head *);
> + =A0 =A0 =A0 struct nfs_client =A0 =A0 =A0 *dc_clp;
> +};
> +
> +/* Device ID cache node */
> +struct nfs4_deviceid {
> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0de_node;
> + =A0 =A0 =A0 struct rcu_head =A0 =A0 =A0 =A0 de_rcu;
> + =A0 =A0 =A0 struct pnfs_deviceid =A0 =A0de_id;
> +};
> +
> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 void (*=
free_callback)(struct rcu_head *));
> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid=
_cache *,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct =
pnfs_deviceid *);
> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct =
nfs4_deviceid *);
> +
> =A0/* pNFS client callback functions.
> =A0* These operations allow the layout driver to access pNFS client
> =A0* specific information or call pNFS client->server operations.
> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> index 8522461..ef2e18e 100644
> --- a/include/linux/nfs_fs_sb.h
> +++ b/include/linux/nfs_fs_sb.h
> @@ -87,6 +87,7 @@ struct nfs_client {
> =A0 =A0 =A0 =A0u32 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 cl_exchang=
e_flags;
> =A0 =A0 =A0 =A0struct nfs4_session =A0 =A0 *cl_session; =A0 =A0/* sha=
rred session */
> =A0 =A0 =A0 =A0struct list_head =A0 =A0 =A0 =A0cl_lo_inodes; =A0 /* I=
nodes having layouts */
> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS dev=
iceid cache */
> =A0#endif /* CONFIG_NFS_V4_1 */
>
> =A0#ifdef CONFIG_NFS_FSCACHE
> --
> 1.6.6
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" =
in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at =A0http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
       [not found]     ` <u2n89c397151004160904m9e862360xcaf0e187640b0177-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2010-04-21  5:59       ` Benny Halevy
  2010-04-21 15:22         ` William A. (Andy) Adamson
  0 siblings, 1 reply; 12+ messages in thread
From: Benny Halevy @ 2010-04-21  5:59 UTC (permalink / raw)
  To: William A. (Andy) Adamson; +Cc: pnfs, Andy Adamson, linux-nfs

On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
> On Fri, Apr 16, 2010 at 11:52 AM,  <andros@netapp.com> wrote:
>> From: Andy Adamson <andros@netapp.com>
>>
>> A shared RCU device ID cache servicing multiple mounts of a single layout type
>> per meta data server (struct nfs_client).
>>
>> Device IDs of type deviceid4 are required by all layout types, long lived and
>> read at each I/O.  They are added to the deviceid cache at first reference by
>> a layout via GETDEVICEINFO and (currently) are only removed at umount.
>>
>> Reference count the device ID cache for each mounted file system
>> in the initialize_mountpoint layoutdriver_io_operation.
>>
>> Dereference the device id cache on file system in the uninitialize_mountpoint
>> layoutdriver_io_operation called at umount
>>
>> Signed-off-by: Andy Adamson <andros@netapp.com>
>> ---
>>  fs/nfs/pnfs.c             |  119 +++++++++++++++++++++++++++++++++++++++++++++
>>  include/linux/nfs4_pnfs.h |   27 ++++++++++
>>  include/linux/nfs_fs_sb.h |    1 +
>>  3 files changed, 147 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 91572aa..8492aef 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -45,6 +45,7 @@
>>  #include <linux/nfs4.h>
>>  #include <linux/pnfs_xdr.h>
>>  #include <linux/nfs4_pnfs.h>
>> +#include <linux/rculist.h>
>>
>>  #include "internal.h"
>>  #include "nfs4_fs.h"
>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops = {
>>
>>  EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>  EXPORT_SYMBOL(pnfs_register_layoutdriver);
>> +
>> +
>> +/* Device ID cache. Supports one layout type per struct nfs_client */
>> +int
>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>> +                        void (*free_callback)(struct rcu_head *))
>> +{
>> +       struct nfs4_deviceid_cache *c;
>> +
>> +       c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
>> +       if (!c)
>> +               return -ENOMEM;
>> +       spin_lock(&clp->cl_lock);
>> +       if (clp->cl_devid_cache != NULL) {
>> +               kref_get(&clp->cl_devid_cache->dc_kref);
>> +               spin_unlock(&clp->cl_lock);
>> +               dprintk("%s [kref [%d]]\n", __func__,
>> +                       atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
>> +               kfree(c);
>> +       } else {
>> +               spin_lock_init(&c->dc_lock);
>> +               INIT_LIST_HEAD(&c->dc_deviceids);
>> +               kref_init(&c->dc_kref);
>> +               c->dc_free_callback = free_callback;
>> +               c->dc_clp = clp;
>> +               clp->cl_devid_cache = c;
>> +               spin_unlock(&clp->cl_lock);
>> +               dprintk("%s [new]\n", __func__);
>> +       }
>> +       return 0;
>> +}
>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>> +
>> +void
>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>> +{
>> +       INIT_LIST_HEAD(&d->de_node);
>> +       INIT_RCU_HEAD(&d->de_rcu);
>> +}
>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>> +
>> +struct nfs4_deviceid *
>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
>> +{
>> +       struct nfs4_deviceid *d;
>> +
>> +       rcu_read_lock();
>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>> +               if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
>> +                       rcu_read_unlock();
>> +                       return d;
>> +               }
>> +       }
>> +       rcu_read_unlock();

I hope this is worth the added complexity...

Out of curiosity, do you have a benchmark comparing the cost
of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?

>> +       return NULL;
>> +}
>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>> +
>> +/*
>> + * Add or kref_get a deviceid.
>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
>> + */
>> +void
>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
>> +{
>> +       struct nfs4_deviceid *d;
>> +
>> +       spin_lock(&c->dc_lock);
>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>> +               if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
>> +                       spin_unlock(&c->dc_lock);
>> +                       dprintk("%s [discard]\n", __func__);
>> +                       c->dc_free_callback(&new->de_rcu);
>> +               }
>> +       }
>> +       list_add_rcu(&new->de_node, &c->dc_deviceids);
>> +       spin_unlock(&c->dc_lock);
>> +       dprintk("%s [new]\n", __func__);
>> +}
>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>> +
>> +static int
>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>> +{
>> +       struct nfs4_deviceid *d;
>> +
>> +       spin_lock(&c->dc_lock);
>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>> +               list_del_rcu(&d->de_node);
>> +               spin_unlock(&c->dc_lock);
>> +               synchronize_rcu();
>> +               c->dc_free_callback(&d->de_rcu);
>> +               return 1;
>> +       }
>> +       spin_unlock(&c->dc_lock);
>> +       return 0;
>> +}
>> +
>> +static void
>> +nfs4_free_deviceid_cache(struct kref *kref)
>> +{
>> +       struct nfs4_deviceid_cache *cache =
>> +               container_of(kref, struct nfs4_deviceid_cache, dc_kref);
>> +       int more = 1;
>> +
>> +       while (more)
>> +               more = nfs4_remove_deviceid(cache);
>> +       cache->dc_clp->cl_devid_cache = NULL;
> 
> Need to take the cl_lock around this assignment
> 
> spin_lock(&cache->dc_clp->cl_lock);
> cache->dc_clp->cl_devid_cache = NULL
> spin_unlock(&cache->dc_clp->cl_lock);
> 
> 

That must happen atomically before kref_put.
It's illegal to have cl_devid_cache be referenced by cache->dc_clp
without a reference count backing it up.
Otherwise, if accessed concurrently to this piece of code
someone might call kref_get while the refcount is zero.

Normally, you'd first clear the referencing pointer to prevent any
new reference to it and only then kref_put it, e.g.:

spin_lock(&cache->dc_clp->cl_lock);
tmp = cache->dc_clp->cl_devid_cache;
cache->dc_clp->cl_devid_cache = NULL
spin_unlock(&cache->dc_clp->cl_lock);
kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);

Benny

>> +       kfree(cache);
>> +}
>> +
>> +void
>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>> +{
>> +       dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.refcount));
>> +       kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>> +}
>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>> index 1d521f4..2a88a06 100644
>> --- a/include/linux/nfs4_pnfs.h
>> +++ b/include/linux/nfs4_pnfs.h
>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>        struct pnfs_deviceid    dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
>>  };
>>
>> +/*
>> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
>> + */
>> +struct nfs4_deviceid_cache {
>> +       spinlock_t              dc_lock;
>> +       struct list_head        dc_deviceids;
>> +       struct kref             dc_kref;
>> +       void                    (*dc_free_callback)(struct rcu_head *);
>> +       struct nfs_client       *dc_clp;
>> +};
>> +
>> +/* Device ID cache node */
>> +struct nfs4_deviceid {
>> +       struct list_head        de_node;
>> +       struct rcu_head         de_rcu;
>> +       struct pnfs_deviceid    de_id;
>> +};
>> +
>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>> +                               void (*free_callback)(struct rcu_head *));
>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
>> +                               struct pnfs_deviceid *);
>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>> +                               struct nfs4_deviceid *);
>> +
>>  /* pNFS client callback functions.
>>  * These operations allow the layout driver to access pNFS client
>>  * specific information or call pNFS client->server operations.
>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>> index 8522461..ef2e18e 100644
>> --- a/include/linux/nfs_fs_sb.h
>> +++ b/include/linux/nfs_fs_sb.h
>> @@ -87,6 +87,7 @@ struct nfs_client {
>>        u32                     cl_exchange_flags;
>>        struct nfs4_session     *cl_session;    /* sharred session */
>>        struct list_head        cl_lo_inodes;   /* Inodes having layouts */
>> +       struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
>>  #endif /* CONFIG_NFS_V4_1 */
>>
>>  #ifdef CONFIG_NFS_FSCACHE
>> --
>> 1.6.6
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
> _______________________________________________
> pNFS mailing list
> pNFS@linux-nfs.org
> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
  2010-04-21  5:59       ` [pnfs] " Benny Halevy
@ 2010-04-21 15:22         ` William A. (Andy) Adamson
       [not found]           ` <l2l89c397151004210822j8b43009o3a9e78ceed901fd9-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: William A. (Andy) Adamson @ 2010-04-21 15:22 UTC (permalink / raw)
  To: Benny Halevy; +Cc: pnfs, linux-nfs

On Wed, Apr 21, 2010 at 1:59 AM, Benny Halevy <bhalevy@panasas.com> wro=
te:
> On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsada=
mson@gmail.com> wrote:
>> On Fri, Apr 16, 2010 at 11:52 AM, =A0<andros@netapp.com> wrote:
>>> From: Andy Adamson <andros@netapp.com>
>>>
>>> A shared RCU device ID cache servicing multiple mounts of a single =
layout type
>>> per meta data server (struct nfs_client).
>>>
>>> Device IDs of type deviceid4 are required by all layout types, long=
 lived and
>>> read at each I/O. =A0They are added to the deviceid cache at first =
reference by
>>> a layout via GETDEVICEINFO and (currently) are only removed at umou=
nt.
>>>
>>> Reference count the device ID cache for each mounted file system
>>> in the initialize_mountpoint layoutdriver_io_operation.
>>>
>>> Dereference the device id cache on file system in the uninitialize_=
mountpoint
>>> layoutdriver_io_operation called at umount
>>>
>>> Signed-off-by: Andy Adamson <andros@netapp.com>
>>> ---
>>> =A0fs/nfs/pnfs.c =A0 =A0 =A0 =A0 =A0 =A0 | =A0119 +++++++++++++++++=
++++++++++++++++++++++++++++
>>> =A0include/linux/nfs4_pnfs.h | =A0 27 ++++++++++
>>> =A0include/linux/nfs_fs_sb.h | =A0 =A01 +
>>> =A03 files changed, 147 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>> index 91572aa..8492aef 100644
>>> --- a/fs/nfs/pnfs.c
>>> +++ b/fs/nfs/pnfs.c
>>> @@ -45,6 +45,7 @@
>>> =A0#include <linux/nfs4.h>
>>> =A0#include <linux/pnfs_xdr.h>
>>> =A0#include <linux/nfs4_pnfs.h>
>>> +#include <linux/rculist.h>
>>>
>>> =A0#include "internal.h"
>>> =A0#include "nfs4_fs.h"
>>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops =3D =
{
>>>
>>> =A0EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>> =A0EXPORT_SYMBOL(pnfs_register_layoutdriver);
>>> +
>>> +
>>> +/* Device ID cache. Supports one layout type per struct nfs_client=
 */
>>> +int
>>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_callba=
ck)(struct rcu_head *))
>>> +{
>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *c;
>>> +
>>> + =A0 =A0 =A0 c =3D kzalloc(sizeof(struct nfs4_deviceid_cache), GFP=
_KERNEL);
>>> + =A0 =A0 =A0 if (!c)
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return -ENOMEM;
>>> + =A0 =A0 =A0 spin_lock(&clp->cl_lock);
>>> + =A0 =A0 =A0 if (clp->cl_devid_cache !=3D NULL) {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_get(&clp->cl_devid_cache->dc_kre=
f);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [kref [%d]]\n", __func__,
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&clp->cl_=
devid_cache->dc_kref.refcount));
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kfree(c);
>>> + =A0 =A0 =A0 } else {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_lock_init(&c->dc_lock);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 INIT_LIST_HEAD(&c->dc_deviceids);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_init(&c->dc_kref);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback =3D free_callback=
;
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_clp =3D clp;
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D c;
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
>>> + =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 return 0;
>>> +}
>>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>>> +
>>> +void
>>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>>> +{
>>> + =A0 =A0 =A0 INIT_LIST_HEAD(&d->de_node);
>>> + =A0 =A0 =A0 INIT_RCU_HEAD(&d->de_rcu);
>>> +}
>>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>>> +
>>> +struct nfs4_deviceid *
>>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_devi=
ceid *id)
>>> +{
>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>> +
>>> + =A0 =A0 =A0 rcu_read_lock();
>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node)=
 {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, id, NFS4_PNFS_=
DEVICEID4_SIZE)) {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 rcu_read_unlock();
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 rcu_read_unlock();
>
> I hope this is worth the added complexity...
>
> Out of curiosity, do you have a benchmark comparing the cost
> of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?

The deviceid cache is read at each I/O. If we use a spin_lock to
protect the deviceid cache, this would mean that all I/0 is serialized
behind the spin_lock even though the deviceid cache is changed
infrequently. The RCU allows readers to "run almost naked" and does
not serialize I/O behind reading the deviceid cache.

So I think it is worth it. I have not benchmarked the difference.


>
>>> + =A0 =A0 =A0 return NULL;
>>> +}
>>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>>> +
>>> +/*
>>> + * Add or kref_get a deviceid.
>>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found=
, discard new
>>> + */
>>> +void
>>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_devic=
eid *new)
>>> +{
>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>> +
>>> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node)=
 {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, &new->de_id, N=
=46S4_PNFS_DEVICEID4_SIZE)) {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lo=
ck);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [discard]=
\n", __func__);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&=
new->de_rcu);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 list_add_rcu(&new->de_node, &c->dc_deviceids);
>>> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>> + =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
>>> +}
>>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>>> +
>>> +static int
>>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>>> +{
>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>> +
>>> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node)=
 {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 list_del_rcu(&d->de_node);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 synchronize_rcu();
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&d->de_rcu);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return 1;
>>> + =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>> + =A0 =A0 =A0 return 0;
>>> +}
>>> +
>>> +static void
>>> +nfs4_free_deviceid_cache(struct kref *kref)
>>> +{
>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cache =3D
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 container_of(kref, struct nfs4_device=
id_cache, dc_kref);
>>> + =A0 =A0 =A0 int more =3D 1;
>>> +
>>> + =A0 =A0 =A0 while (more)
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 more =3D nfs4_remove_deviceid(cache);
>>> + =A0 =A0 =A0 cache->dc_clp->cl_devid_cache =3D NULL;
>>
>> Need to take the cl_lock around this assignment
>>
>> spin_lock(&cache->dc_clp->cl_lock);
>> cache->dc_clp->cl_devid_cache =3D NULL
>> spin_unlock(&cache->dc_clp->cl_lock);
>>
>>
>
> That must happen atomically before kref_put.
> It's illegal to have cl_devid_cache be referenced by cache->dc_clp
> without a reference count backing it up.
> Otherwise, if accessed concurrently to this piece of code
> someone might call kref_get while the refcount is zero.
>
> Normally, you'd first clear the referencing pointer to prevent any
> new reference to it and only then kref_put it, e.g.:
>
> spin_lock(&cache->dc_clp->cl_lock);
> tmp =3D cache->dc_clp->cl_devid_cache;
> cache->dc_clp->cl_devid_cache =3D NULL
> spin_unlock(&cache->dc_clp->cl_lock);
> kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);

Good point. Thanks for the review. I'll rethink and resend

-->Andy

>
> Benny
>
>>> + =A0 =A0 =A0 kfree(cache);
>>> +}
>>> +
>>> +void
>>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>>> +{
>>> + =A0 =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kre=
f.refcount));
>>> + =A0 =A0 =A0 kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>>> +}
>>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>>> index 1d521f4..2a88a06 100644
>>> --- a/include/linux/nfs4_pnfs.h
>>> +++ b/include/linux/nfs4_pnfs.h
>>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>> =A0 =A0 =A0 =A0struct pnfs_deviceid =A0 =A0dev_id[NFS4_PNFS_GETDEVL=
IST_MAXNUM];
>>> =A0};
>>>
>>> +/*
>>> + * Device ID RCU cache. A device ID is unique per client ID and la=
yout type.
>>> + */
>>> +struct nfs4_deviceid_cache {
>>> + =A0 =A0 =A0 spinlock_t =A0 =A0 =A0 =A0 =A0 =A0 =A0dc_lock;
>>> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0dc_deviceids;
>>> + =A0 =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 dc_kref;
>>> + =A0 =A0 =A0 void =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(*dc_free=
_callback)(struct rcu_head *);
>>> + =A0 =A0 =A0 struct nfs_client =A0 =A0 =A0 *dc_clp;
>>> +};
>>> +
>>> +/* Device ID cache node */
>>> +struct nfs4_deviceid {
>>> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0de_node;
>>> + =A0 =A0 =A0 struct rcu_head =A0 =A0 =A0 =A0 de_rcu;
>>> + =A0 =A0 =A0 struct pnfs_deviceid =A0 =A0de_id;
>>> +};
>>> +
>>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 void =
(*free_callback)(struct rcu_head *));
>>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
>>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_device=
id_cache *,
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struc=
t pnfs_deviceid *);
>>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struc=
t nfs4_deviceid *);
>>> +
>>> =A0/* pNFS client callback functions.
>>> =A0* These operations allow the layout driver to access pNFS client
>>> =A0* specific information or call pNFS client->server operations.
>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>>> index 8522461..ef2e18e 100644
>>> --- a/include/linux/nfs_fs_sb.h
>>> +++ b/include/linux/nfs_fs_sb.h
>>> @@ -87,6 +87,7 @@ struct nfs_client {
>>> =A0 =A0 =A0 =A0u32 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 cl_excha=
nge_flags;
>>> =A0 =A0 =A0 =A0struct nfs4_session =A0 =A0 *cl_session; =A0 =A0/* s=
harred session */
>>> =A0 =A0 =A0 =A0struct list_head =A0 =A0 =A0 =A0cl_lo_inodes; =A0 /*=
 Inodes having layouts */
>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS d=
eviceid cache */
>>> =A0#endif /* CONFIG_NFS_V4_1 */
>>>
>>> =A0#ifdef CONFIG_NFS_FSCACHE
>>> --
>>> 1.6.6
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-nfs=
" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at =A0http://vger.kernel.org/majordomo-info.htm=
l
>>>
>> _______________________________________________
>> pNFS mailing list
>> pNFS@linux-nfs.org
>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
       [not found]           ` <l2l89c397151004210822j8b43009o3a9e78ceed901fd9-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2010-04-22 11:20             ` Benny Halevy
  2010-04-22 15:47               ` William A. (Andy) Adamson
  0 siblings, 1 reply; 12+ messages in thread
From: Benny Halevy @ 2010-04-22 11:20 UTC (permalink / raw)
  To: William A. (Andy) Adamson; +Cc: pnfs, linux-nfs

On Apr. 21, 2010, 18:22 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
> On Wed, Apr 21, 2010 at 1:59 AM, Benny Halevy <bhalevy@panasas.com> wrote:
>> On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
>>> On Fri, Apr 16, 2010 at 11:52 AM,  <andros@netapp.com> wrote:
>>>> From: Andy Adamson <andros@netapp.com>
>>>>
>>>> A shared RCU device ID cache servicing multiple mounts of a single layout type
>>>> per meta data server (struct nfs_client).
>>>>
>>>> Device IDs of type deviceid4 are required by all layout types, long lived and
>>>> read at each I/O.  They are added to the deviceid cache at first reference by
>>>> a layout via GETDEVICEINFO and (currently) are only removed at umount.
>>>>
>>>> Reference count the device ID cache for each mounted file system
>>>> in the initialize_mountpoint layoutdriver_io_operation.
>>>>
>>>> Dereference the device id cache on file system in the uninitialize_mountpoint
>>>> layoutdriver_io_operation called at umount
>>>>
>>>> Signed-off-by: Andy Adamson <andros@netapp.com>
>>>> ---
>>>>  fs/nfs/pnfs.c             |  119 +++++++++++++++++++++++++++++++++++++++++++++
>>>>  include/linux/nfs4_pnfs.h |   27 ++++++++++
>>>>  include/linux/nfs_fs_sb.h |    1 +
>>>>  3 files changed, 147 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>>> index 91572aa..8492aef 100644
>>>> --- a/fs/nfs/pnfs.c
>>>> +++ b/fs/nfs/pnfs.c
>>>> @@ -45,6 +45,7 @@
>>>>  #include <linux/nfs4.h>
>>>>  #include <linux/pnfs_xdr.h>
>>>>  #include <linux/nfs4_pnfs.h>
>>>> +#include <linux/rculist.h>
>>>>
>>>>  #include "internal.h"
>>>>  #include "nfs4_fs.h"
>>>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops = {
>>>>
>>>>  EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>>>  EXPORT_SYMBOL(pnfs_register_layoutdriver);
>>>> +
>>>> +
>>>> +/* Device ID cache. Supports one layout type per struct nfs_client */
>>>> +int
>>>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>>>> +                        void (*free_callback)(struct rcu_head *))
>>>> +{
>>>> +       struct nfs4_deviceid_cache *c;
>>>> +
>>>> +       c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
>>>> +       if (!c)
>>>> +               return -ENOMEM;
>>>> +       spin_lock(&clp->cl_lock);
>>>> +       if (clp->cl_devid_cache != NULL) {
>>>> +               kref_get(&clp->cl_devid_cache->dc_kref);
>>>> +               spin_unlock(&clp->cl_lock);
>>>> +               dprintk("%s [kref [%d]]\n", __func__,
>>>> +                       atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
>>>> +               kfree(c);
>>>> +       } else {
>>>> +               spin_lock_init(&c->dc_lock);
>>>> +               INIT_LIST_HEAD(&c->dc_deviceids);
>>>> +               kref_init(&c->dc_kref);
>>>> +               c->dc_free_callback = free_callback;
>>>> +               c->dc_clp = clp;
>>>> +               clp->cl_devid_cache = c;
>>>> +               spin_unlock(&clp->cl_lock);
>>>> +               dprintk("%s [new]\n", __func__);
>>>> +       }
>>>> +       return 0;
>>>> +}
>>>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>>>> +
>>>> +void
>>>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>>>> +{
>>>> +       INIT_LIST_HEAD(&d->de_node);
>>>> +       INIT_RCU_HEAD(&d->de_rcu);
>>>> +}
>>>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>>>> +
>>>> +struct nfs4_deviceid *
>>>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
>>>> +{
>>>> +       struct nfs4_deviceid *d;
>>>> +
>>>> +       rcu_read_lock();
>>>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>> +               if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>> +                       rcu_read_unlock();
>>>> +                       return d;
>>>> +               }
>>>> +       }
>>>> +       rcu_read_unlock();
>>
>> I hope this is worth the added complexity...
>>
>> Out of curiosity, do you have a benchmark comparing the cost
>> of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?
> 
> The deviceid cache is read at each I/O. If we use a spin_lock to

Yeah, I see where this goes...
In the objects layout driver we get a reference on the device structure
at alloc_lseg time and keep a pointer to it throughout the lseg's life time.
This saves the deviceid lookup on every I/O.

Benny

> protect the deviceid cache, this would mean that all I/0 is serialized
> behind the spin_lock even though the deviceid cache is changed
> infrequently. The RCU allows readers to "run almost naked" and does
> not serialize I/O behind reading the deviceid cache.
> 
> So I think it is worth it. I have not benchmarked the difference.
> 
> 
>>
>>>> +       return NULL;
>>>> +}
>>>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>>>> +
>>>> +/*
>>>> + * Add or kref_get a deviceid.
>>>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
>>>> + */
>>>> +void
>>>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
>>>> +{
>>>> +       struct nfs4_deviceid *d;
>>>> +
>>>> +       spin_lock(&c->dc_lock);
>>>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>> +               if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>> +                       spin_unlock(&c->dc_lock);
>>>> +                       dprintk("%s [discard]\n", __func__);
>>>> +                       c->dc_free_callback(&new->de_rcu);
>>>> +               }
>>>> +       }
>>>> +       list_add_rcu(&new->de_node, &c->dc_deviceids);
>>>> +       spin_unlock(&c->dc_lock);
>>>> +       dprintk("%s [new]\n", __func__);
>>>> +}
>>>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>>>> +
>>>> +static int
>>>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>>>> +{
>>>> +       struct nfs4_deviceid *d;
>>>> +
>>>> +       spin_lock(&c->dc_lock);
>>>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>> +               list_del_rcu(&d->de_node);
>>>> +               spin_unlock(&c->dc_lock);
>>>> +               synchronize_rcu();
>>>> +               c->dc_free_callback(&d->de_rcu);
>>>> +               return 1;
>>>> +       }
>>>> +       spin_unlock(&c->dc_lock);
>>>> +       return 0;
>>>> +}
>>>> +
>>>> +static void
>>>> +nfs4_free_deviceid_cache(struct kref *kref)
>>>> +{
>>>> +       struct nfs4_deviceid_cache *cache =
>>>> +               container_of(kref, struct nfs4_deviceid_cache, dc_kref);
>>>> +       int more = 1;
>>>> +
>>>> +       while (more)
>>>> +               more = nfs4_remove_deviceid(cache);
>>>> +       cache->dc_clp->cl_devid_cache = NULL;
>>>
>>> Need to take the cl_lock around this assignment
>>>
>>> spin_lock(&cache->dc_clp->cl_lock);
>>> cache->dc_clp->cl_devid_cache = NULL
>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>
>>>
>>
>> That must happen atomically before kref_put.
>> It's illegal to have cl_devid_cache be referenced by cache->dc_clp
>> without a reference count backing it up.
>> Otherwise, if accessed concurrently to this piece of code
>> someone might call kref_get while the refcount is zero.
>>
>> Normally, you'd first clear the referencing pointer to prevent any
>> new reference to it and only then kref_put it, e.g.:
>>
>> spin_lock(&cache->dc_clp->cl_lock);
>> tmp = cache->dc_clp->cl_devid_cache;
>> cache->dc_clp->cl_devid_cache = NULL
>> spin_unlock(&cache->dc_clp->cl_lock);
>> kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
> 
> Good point. Thanks for the review. I'll rethink and resend
> 
> -->Andy
> 
>>
>> Benny
>>
>>>> +       kfree(cache);
>>>> +}
>>>> +
>>>> +void
>>>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>>>> +{
>>>> +       dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.refcount));
>>>> +       kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>>>> +}
>>>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>>>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>>>> index 1d521f4..2a88a06 100644
>>>> --- a/include/linux/nfs4_pnfs.h
>>>> +++ b/include/linux/nfs4_pnfs.h
>>>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>>>        struct pnfs_deviceid    dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
>>>>  };
>>>>
>>>> +/*
>>>> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
>>>> + */
>>>> +struct nfs4_deviceid_cache {
>>>> +       spinlock_t              dc_lock;
>>>> +       struct list_head        dc_deviceids;
>>>> +       struct kref             dc_kref;
>>>> +       void                    (*dc_free_callback)(struct rcu_head *);
>>>> +       struct nfs_client       *dc_clp;
>>>> +};
>>>> +
>>>> +/* Device ID cache node */
>>>> +struct nfs4_deviceid {
>>>> +       struct list_head        de_node;
>>>> +       struct rcu_head         de_rcu;
>>>> +       struct pnfs_deviceid    de_id;
>>>> +};
>>>> +
>>>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>>>> +                               void (*free_callback)(struct rcu_head *));
>>>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
>>>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>>>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
>>>> +                               struct pnfs_deviceid *);
>>>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>>>> +                               struct nfs4_deviceid *);
>>>> +
>>>>  /* pNFS client callback functions.
>>>>  * These operations allow the layout driver to access pNFS client
>>>>  * specific information or call pNFS client->server operations.
>>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>>>> index 8522461..ef2e18e 100644
>>>> --- a/include/linux/nfs_fs_sb.h
>>>> +++ b/include/linux/nfs_fs_sb.h
>>>> @@ -87,6 +87,7 @@ struct nfs_client {
>>>>        u32                     cl_exchange_flags;
>>>>        struct nfs4_session     *cl_session;    /* sharred session */
>>>>        struct list_head        cl_lo_inodes;   /* Inodes having layouts */
>>>> +       struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
>>>>  #endif /* CONFIG_NFS_V4_1 */
>>>>
>>>>  #ifdef CONFIG_NFS_FSCACHE
>>>> --
>>>> 1.6.6
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>
>>> _______________________________________________
>>> pNFS mailing list
>>> pNFS@linux-nfs.org
>>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
>>


-- 
Benny Halevy
Software Architect
Panasas, Inc.
bhalevy@panasas.com
Tel/Fax: +972-3-647-8340
Mobile: +972-54-802-8340

Panasas: The Leader in Parallel Storage
www.panasas.com

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
  2010-04-22 11:20             ` Benny Halevy
@ 2010-04-22 15:47               ` William A. (Andy) Adamson
       [not found]                 ` <v2h89c397151004220847v3a31c493s4089d0cd53cf3e19-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: William A. (Andy) Adamson @ 2010-04-22 15:47 UTC (permalink / raw)
  To: Benny Halevy; +Cc: pnfs, linux-nfs

On Thu, Apr 22, 2010 at 7:20 AM, Benny Halevy <bhalevy@panasas.com> wro=
te:
> On Apr. 21, 2010, 18:22 +0300, "William A. (Andy) Adamson" <androsada=
mson@gmail.com> wrote:
>> On Wed, Apr 21, 2010 at 1:59 AM, Benny Halevy <bhalevy@panasas.com> =
wrote:
>>> On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsa=
damson@gmail.com> wrote:
>>>> On Fri, Apr 16, 2010 at 11:52 AM, =A0<andros@netapp.com> wrote:
>>>>> From: Andy Adamson <andros@netapp.com>
>>>>>
>>>>> A shared RCU device ID cache servicing multiple mounts of a singl=
e layout type
>>>>> per meta data server (struct nfs_client).
>>>>>
>>>>> Device IDs of type deviceid4 are required by all layout types, lo=
ng lived and
>>>>> read at each I/O. =A0They are added to the deviceid cache at firs=
t reference by
>>>>> a layout via GETDEVICEINFO and (currently) are only removed at um=
ount.
>>>>>
>>>>> Reference count the device ID cache for each mounted file system
>>>>> in the initialize_mountpoint layoutdriver_io_operation.
>>>>>
>>>>> Dereference the device id cache on file system in the uninitializ=
e_mountpoint
>>>>> layoutdriver_io_operation called at umount
>>>>>
>>>>> Signed-off-by: Andy Adamson <andros@netapp.com>
>>>>> ---
>>>>> =A0fs/nfs/pnfs.c =A0 =A0 =A0 =A0 =A0 =A0 | =A0119 +++++++++++++++=
++++++++++++++++++++++++++++++
>>>>> =A0include/linux/nfs4_pnfs.h | =A0 27 ++++++++++
>>>>> =A0include/linux/nfs_fs_sb.h | =A0 =A01 +
>>>>> =A03 files changed, 147 insertions(+), 0 deletions(-)
>>>>>
>>>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>>>> index 91572aa..8492aef 100644
>>>>> --- a/fs/nfs/pnfs.c
>>>>> +++ b/fs/nfs/pnfs.c
>>>>> @@ -45,6 +45,7 @@
>>>>> =A0#include <linux/nfs4.h>
>>>>> =A0#include <linux/pnfs_xdr.h>
>>>>> =A0#include <linux/nfs4_pnfs.h>
>>>>> +#include <linux/rculist.h>
>>>>>
>>>>> =A0#include "internal.h"
>>>>> =A0#include "nfs4_fs.h"
>>>>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops =3D=
 {
>>>>>
>>>>> =A0EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>>>> =A0EXPORT_SYMBOL(pnfs_register_layoutdriver);
>>>>> +
>>>>> +
>>>>> +/* Device ID cache. Supports one layout type per struct nfs_clie=
nt */
>>>>> +int
>>>>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_call=
back)(struct rcu_head *))
>>>>> +{
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *c;
>>>>> +
>>>>> + =A0 =A0 =A0 c =3D kzalloc(sizeof(struct nfs4_deviceid_cache), G=
=46P_KERNEL);
>>>>> + =A0 =A0 =A0 if (!c)
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return -ENOMEM;
>>>>> + =A0 =A0 =A0 spin_lock(&clp->cl_lock);
>>>>> + =A0 =A0 =A0 if (clp->cl_devid_cache !=3D NULL) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_get(&clp->cl_devid_cache->dc_k=
ref);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [kref [%d]]\n", __func_=
_,
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&clp->c=
l_devid_cache->dc_kref.refcount));
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kfree(c);
>>>>> + =A0 =A0 =A0 } else {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_lock_init(&c->dc_lock);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 INIT_LIST_HEAD(&c->dc_deviceids);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_init(&c->dc_kref);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback =3D free_callba=
ck;
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_clp =3D clp;
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D c;
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
>>>>> + =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 return 0;
>>>>> +}
>>>>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>>>>> +
>>>>> +void
>>>>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>>>>> +{
>>>>> + =A0 =A0 =A0 INIT_LIST_HEAD(&d->de_node);
>>>>> + =A0 =A0 =A0 INIT_RCU_HEAD(&d->de_rcu);
>>>>> +}
>>>>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>>>>> +
>>>>> +struct nfs4_deviceid *
>>>>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_de=
viceid *id)
>>>>> +{
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>>>> +
>>>>> + =A0 =A0 =A0 rcu_read_lock();
>>>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_nod=
e) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, id, NFS4_PNF=
S_DEVICEID4_SIZE)) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 rcu_read_unlock();
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 rcu_read_unlock();
>>>
>>> I hope this is worth the added complexity...
>>>
>>> Out of curiosity, do you have a benchmark comparing the cost
>>> of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?
>>
>> The deviceid cache is read at each I/O. If we use a spin_lock to
>
> Yeah, I see where this goes...
> In the objects layout driver we get a reference on the device structu=
re
> at alloc_lseg time and keep a pointer to it throughout the lseg's lif=
e time.
> This saves the deviceid lookup on every I/O.

Perhaps that is a better way to go. How many deviceid's is 'normal'
for the object driver?

-->Andy


>
> Benny
>
>> protect the deviceid cache, this would mean that all I/0 is serializ=
ed
>> behind the spin_lock even though the deviceid cache is changed
>> infrequently. The RCU allows readers to "run almost naked" and does
>> not serialize I/O behind reading the deviceid cache.
>>
>> So I think it is worth it. I have not benchmarked the difference.
>>
>>
>>>
>>>>> + =A0 =A0 =A0 return NULL;
>>>>> +}
>>>>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>>>>> +
>>>>> +/*
>>>>> + * Add or kref_get a deviceid.
>>>>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is fou=
nd, discard new
>>>>> + */
>>>>> +void
>>>>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_dev=
iceid *new)
>>>>> +{
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>>>> +
>>>>> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
>>>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_nod=
e) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, &new->de_id,=
 NFS4_PNFS_DEVICEID4_SIZE)) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_=
lock);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [discar=
d]\n", __func__);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback=
(&new->de_rcu);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 list_add_rcu(&new->de_node, &c->dc_deviceids);
>>>>> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>>>> + =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
>>>>> +}
>>>>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>>>>> +
>>>>> +static int
>>>>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>>>>> +{
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>>>> +
>>>>> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
>>>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_nod=
e) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 list_del_rcu(&d->de_node);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 synchronize_rcu();
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&d->de_rcu);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return 1;
>>>>> + =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>>>> + =A0 =A0 =A0 return 0;
>>>>> +}
>>>>> +
>>>>> +static void
>>>>> +nfs4_free_deviceid_cache(struct kref *kref)
>>>>> +{
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cache =3D
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 container_of(kref, struct nfs4_devi=
ceid_cache, dc_kref);
>>>>> + =A0 =A0 =A0 int more =3D 1;
>>>>> +
>>>>> + =A0 =A0 =A0 while (more)
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 more =3D nfs4_remove_deviceid(cache=
);
>>>>> + =A0 =A0 =A0 cache->dc_clp->cl_devid_cache =3D NULL;
>>>>
>>>> Need to take the cl_lock around this assignment
>>>>
>>>> spin_lock(&cache->dc_clp->cl_lock);
>>>> cache->dc_clp->cl_devid_cache =3D NULL
>>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>>
>>>>
>>>
>>> That must happen atomically before kref_put.
>>> It's illegal to have cl_devid_cache be referenced by cache->dc_clp
>>> without a reference count backing it up.
>>> Otherwise, if accessed concurrently to this piece of code
>>> someone might call kref_get while the refcount is zero.
>>>
>>> Normally, you'd first clear the referencing pointer to prevent any
>>> new reference to it and only then kref_put it, e.g.:
>>>
>>> spin_lock(&cache->dc_clp->cl_lock);
>>> tmp =3D cache->dc_clp->cl_devid_cache;
>>> cache->dc_clp->cl_devid_cache =3D NULL
>>> spin_unlock(&cache->dc_clp->cl_lock);
>>> kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
>>
>> Good point. Thanks for the review. I'll rethink and resend
>>
>> -->Andy
>>
>>>
>>> Benny
>>>
>>>>> + =A0 =A0 =A0 kfree(cache);
>>>>> +}
>>>>> +
>>>>> +void
>>>>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>>>>> +{
>>>>> + =A0 =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_k=
ref.refcount));
>>>>> + =A0 =A0 =A0 kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>>>>> +}
>>>>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>>>>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.=
h
>>>>> index 1d521f4..2a88a06 100644
>>>>> --- a/include/linux/nfs4_pnfs.h
>>>>> +++ b/include/linux/nfs4_pnfs.h
>>>>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>>>> =A0 =A0 =A0 =A0struct pnfs_deviceid =A0 =A0dev_id[NFS4_PNFS_GETDE=
VLIST_MAXNUM];
>>>>> =A0};
>>>>>
>>>>> +/*
>>>>> + * Device ID RCU cache. A device ID is unique per client ID and =
layout type.
>>>>> + */
>>>>> +struct nfs4_deviceid_cache {
>>>>> + =A0 =A0 =A0 spinlock_t =A0 =A0 =A0 =A0 =A0 =A0 =A0dc_lock;
>>>>> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0dc_deviceids;
>>>>> + =A0 =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 dc_kref;
>>>>> + =A0 =A0 =A0 void =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(*dc_fr=
ee_callback)(struct rcu_head *);
>>>>> + =A0 =A0 =A0 struct nfs_client =A0 =A0 =A0 *dc_clp;
>>>>> +};
>>>>> +
>>>>> +/* Device ID cache node */
>>>>> +struct nfs4_deviceid {
>>>>> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0de_node;
>>>>> + =A0 =A0 =A0 struct rcu_head =A0 =A0 =A0 =A0 de_rcu;
>>>>> + =A0 =A0 =A0 struct pnfs_deviceid =A0 =A0de_id;
>>>>> +};
>>>>> +
>>>>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 voi=
d (*free_callback)(struct rcu_head *));
>>>>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *=
);
>>>>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>>>>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_devi=
ceid_cache *,
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 str=
uct pnfs_deviceid *);
>>>>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 str=
uct nfs4_deviceid *);
>>>>> +
>>>>> =A0/* pNFS client callback functions.
>>>>> =A0* These operations allow the layout driver to access pNFS clie=
nt
>>>>> =A0* specific information or call pNFS client->server operations.
>>>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.=
h
>>>>> index 8522461..ef2e18e 100644
>>>>> --- a/include/linux/nfs_fs_sb.h
>>>>> +++ b/include/linux/nfs_fs_sb.h
>>>>> @@ -87,6 +87,7 @@ struct nfs_client {
>>>>> =A0 =A0 =A0 =A0u32 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 cl_exc=
hange_flags;
>>>>> =A0 =A0 =A0 =A0struct nfs4_session =A0 =A0 *cl_session; =A0 =A0/*=
 sharred session */
>>>>> =A0 =A0 =A0 =A0struct list_head =A0 =A0 =A0 =A0cl_lo_inodes; =A0 =
/* Inodes having layouts */
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS=
 deviceid cache */
>>>>> =A0#endif /* CONFIG_NFS_V4_1 */
>>>>>
>>>>> =A0#ifdef CONFIG_NFS_FSCACHE
>>>>> --
>>>>> 1.6.6
>>>>>
>>>>> --
>>>>> To unsubscribe from this list: send the line "unsubscribe linux-n=
fs" in
>>>>> the body of a message to majordomo@vger.kernel.org
>>>>> More majordomo info at =A0http://vger.kernel.org/majordomo-info.h=
tml
>>>>>
>>>> _______________________________________________
>>>> pNFS mailing list
>>>> pNFS@linux-nfs.org
>>>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
>>>
>
>
> --
> Benny Halevy
> Software Architect
> Panasas, Inc.
> bhalevy@panasas.com
> Tel/Fax: +972-3-647-8340
> Mobile: +972-54-802-8340
>
> Panasas: The Leader in Parallel Storage
> www.panasas.com
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
       [not found]                 ` <v2h89c397151004220847v3a31c493s4089d0cd53cf3e19-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2010-04-22 15:51                   ` Benny Halevy
  0 siblings, 0 replies; 12+ messages in thread
From: Benny Halevy @ 2010-04-22 15:51 UTC (permalink / raw)
  To: William A. (Andy) Adamson; +Cc: pnfs, linux-nfs

On Apr. 22, 2010, 18:47 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
> On Thu, Apr 22, 2010 at 7:20 AM, Benny Halevy <bhalevy@panasas.com> wrote:
>> On Apr. 21, 2010, 18:22 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
>>> On Wed, Apr 21, 2010 at 1:59 AM, Benny Halevy <bhalevy@panasas.com> wrote:
>>>> On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
>>>>> On Fri, Apr 16, 2010 at 11:52 AM,  <andros@netapp.com> wrote:
>>>>>> From: Andy Adamson <andros@netapp.com>
>>>>>>
>>>>>> A shared RCU device ID cache servicing multiple mounts of a single layout type
>>>>>> per meta data server (struct nfs_client).
>>>>>>
>>>>>> Device IDs of type deviceid4 are required by all layout types, long lived and
>>>>>> read at each I/O.  They are added to the deviceid cache at first reference by
>>>>>> a layout via GETDEVICEINFO and (currently) are only removed at umount.
>>>>>>
>>>>>> Reference count the device ID cache for each mounted file system
>>>>>> in the initialize_mountpoint layoutdriver_io_operation.
>>>>>>
>>>>>> Dereference the device id cache on file system in the uninitialize_mountpoint
>>>>>> layoutdriver_io_operation called at umount
>>>>>>
>>>>>> Signed-off-by: Andy Adamson <andros@netapp.com>
>>>>>> ---
>>>>>>  fs/nfs/pnfs.c             |  119 +++++++++++++++++++++++++++++++++++++++++++++
>>>>>>  include/linux/nfs4_pnfs.h |   27 ++++++++++
>>>>>>  include/linux/nfs_fs_sb.h |    1 +
>>>>>>  3 files changed, 147 insertions(+), 0 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>>>>> index 91572aa..8492aef 100644
>>>>>> --- a/fs/nfs/pnfs.c
>>>>>> +++ b/fs/nfs/pnfs.c
>>>>>> @@ -45,6 +45,7 @@
>>>>>>  #include <linux/nfs4.h>
>>>>>>  #include <linux/pnfs_xdr.h>
>>>>>>  #include <linux/nfs4_pnfs.h>
>>>>>> +#include <linux/rculist.h>
>>>>>>
>>>>>>  #include "internal.h"
>>>>>>  #include "nfs4_fs.h"
>>>>>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops = {
>>>>>>
>>>>>>  EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>>>>>  EXPORT_SYMBOL(pnfs_register_layoutdriver);
>>>>>> +
>>>>>> +
>>>>>> +/* Device ID cache. Supports one layout type per struct nfs_client */
>>>>>> +int
>>>>>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>>>>>> +                        void (*free_callback)(struct rcu_head *))
>>>>>> +{
>>>>>> +       struct nfs4_deviceid_cache *c;
>>>>>> +
>>>>>> +       c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
>>>>>> +       if (!c)
>>>>>> +               return -ENOMEM;
>>>>>> +       spin_lock(&clp->cl_lock);
>>>>>> +       if (clp->cl_devid_cache != NULL) {
>>>>>> +               kref_get(&clp->cl_devid_cache->dc_kref);
>>>>>> +               spin_unlock(&clp->cl_lock);
>>>>>> +               dprintk("%s [kref [%d]]\n", __func__,
>>>>>> +                       atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
>>>>>> +               kfree(c);
>>>>>> +       } else {
>>>>>> +               spin_lock_init(&c->dc_lock);
>>>>>> +               INIT_LIST_HEAD(&c->dc_deviceids);
>>>>>> +               kref_init(&c->dc_kref);
>>>>>> +               c->dc_free_callback = free_callback;
>>>>>> +               c->dc_clp = clp;
>>>>>> +               clp->cl_devid_cache = c;
>>>>>> +               spin_unlock(&clp->cl_lock);
>>>>>> +               dprintk("%s [new]\n", __func__);
>>>>>> +       }
>>>>>> +       return 0;
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>>>>>> +
>>>>>> +void
>>>>>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>>>>>> +{
>>>>>> +       INIT_LIST_HEAD(&d->de_node);
>>>>>> +       INIT_RCU_HEAD(&d->de_rcu);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>>>>>> +
>>>>>> +struct nfs4_deviceid *
>>>>>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
>>>>>> +{
>>>>>> +       struct nfs4_deviceid *d;
>>>>>> +
>>>>>> +       rcu_read_lock();
>>>>>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>>>> +               if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>>>> +                       rcu_read_unlock();
>>>>>> +                       return d;
>>>>>> +               }
>>>>>> +       }
>>>>>> +       rcu_read_unlock();
>>>>
>>>> I hope this is worth the added complexity...
>>>>
>>>> Out of curiosity, do you have a benchmark comparing the cost
>>>> of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?
>>>
>>> The deviceid cache is read at each I/O. If we use a spin_lock to
>>
>> Yeah, I see where this goes...
>> In the objects layout driver we get a reference on the device structure
>> at alloc_lseg time and keep a pointer to it throughout the lseg's life time.
>> This saves the deviceid lookup on every I/O.
> 
> Perhaps that is a better way to go. How many deviceid's is 'normal'
> for the object driver?
> 

I'd say that order of 10's to a few 100's is normal.
1000 and up is a big installation.

Benny


> -->Andy
> 
> 
>>
>> Benny
>>
>>> protect the deviceid cache, this would mean that all I/0 is serialized
>>> behind the spin_lock even though the deviceid cache is changed
>>> infrequently. The RCU allows readers to "run almost naked" and does
>>> not serialize I/O behind reading the deviceid cache.
>>>
>>> So I think it is worth it. I have not benchmarked the difference.
>>>
>>>
>>>>
>>>>>> +       return NULL;
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>>>>>> +
>>>>>> +/*
>>>>>> + * Add or kref_get a deviceid.
>>>>>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
>>>>>> + */
>>>>>> +void
>>>>>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
>>>>>> +{
>>>>>> +       struct nfs4_deviceid *d;
>>>>>> +
>>>>>> +       spin_lock(&c->dc_lock);
>>>>>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>>>> +               if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>>>> +                       spin_unlock(&c->dc_lock);
>>>>>> +                       dprintk("%s [discard]\n", __func__);
>>>>>> +                       c->dc_free_callback(&new->de_rcu);
>>>>>> +               }
>>>>>> +       }
>>>>>> +       list_add_rcu(&new->de_node, &c->dc_deviceids);
>>>>>> +       spin_unlock(&c->dc_lock);
>>>>>> +       dprintk("%s [new]\n", __func__);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>>>>>> +
>>>>>> +static int
>>>>>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>>>>>> +{
>>>>>> +       struct nfs4_deviceid *d;
>>>>>> +
>>>>>> +       spin_lock(&c->dc_lock);
>>>>>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>>>> +               list_del_rcu(&d->de_node);
>>>>>> +               spin_unlock(&c->dc_lock);
>>>>>> +               synchronize_rcu();
>>>>>> +               c->dc_free_callback(&d->de_rcu);
>>>>>> +               return 1;
>>>>>> +       }
>>>>>> +       spin_unlock(&c->dc_lock);
>>>>>> +       return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static void
>>>>>> +nfs4_free_deviceid_cache(struct kref *kref)
>>>>>> +{
>>>>>> +       struct nfs4_deviceid_cache *cache =
>>>>>> +               container_of(kref, struct nfs4_deviceid_cache, dc_kref);
>>>>>> +       int more = 1;
>>>>>> +
>>>>>> +       while (more)
>>>>>> +               more = nfs4_remove_deviceid(cache);
>>>>>> +       cache->dc_clp->cl_devid_cache = NULL;
>>>>>
>>>>> Need to take the cl_lock around this assignment
>>>>>
>>>>> spin_lock(&cache->dc_clp->cl_lock);
>>>>> cache->dc_clp->cl_devid_cache = NULL
>>>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>>>
>>>>>
>>>>
>>>> That must happen atomically before kref_put.
>>>> It's illegal to have cl_devid_cache be referenced by cache->dc_clp
>>>> without a reference count backing it up.
>>>> Otherwise, if accessed concurrently to this piece of code
>>>> someone might call kref_get while the refcount is zero.
>>>>
>>>> Normally, you'd first clear the referencing pointer to prevent any
>>>> new reference to it and only then kref_put it, e.g.:
>>>>
>>>> spin_lock(&cache->dc_clp->cl_lock);
>>>> tmp = cache->dc_clp->cl_devid_cache;
>>>> cache->dc_clp->cl_devid_cache = NULL
>>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>> kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
>>>
>>> Good point. Thanks for the review. I'll rethink and resend
>>>
>>> -->Andy
>>>
>>>>
>>>> Benny
>>>>
>>>>>> +       kfree(cache);
>>>>>> +}
>>>>>> +
>>>>>> +void
>>>>>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>>>>>> +{
>>>>>> +       dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.refcount));
>>>>>> +       kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>>>>>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>>>>>> index 1d521f4..2a88a06 100644
>>>>>> --- a/include/linux/nfs4_pnfs.h
>>>>>> +++ b/include/linux/nfs4_pnfs.h
>>>>>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>>>>>        struct pnfs_deviceid    dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
>>>>>>  };
>>>>>>
>>>>>> +/*
>>>>>> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
>>>>>> + */
>>>>>> +struct nfs4_deviceid_cache {
>>>>>> +       spinlock_t              dc_lock;
>>>>>> +       struct list_head        dc_deviceids;
>>>>>> +       struct kref             dc_kref;
>>>>>> +       void                    (*dc_free_callback)(struct rcu_head *);
>>>>>> +       struct nfs_client       *dc_clp;
>>>>>> +};
>>>>>> +
>>>>>> +/* Device ID cache node */
>>>>>> +struct nfs4_deviceid {
>>>>>> +       struct list_head        de_node;
>>>>>> +       struct rcu_head         de_rcu;
>>>>>> +       struct pnfs_deviceid    de_id;
>>>>>> +};
>>>>>> +
>>>>>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>>>>>> +                               void (*free_callback)(struct rcu_head *));
>>>>>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
>>>>>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>>>>>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
>>>>>> +                               struct pnfs_deviceid *);
>>>>>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>>>>>> +                               struct nfs4_deviceid *);
>>>>>> +
>>>>>>  /* pNFS client callback functions.
>>>>>>  * These operations allow the layout driver to access pNFS client
>>>>>>  * specific information or call pNFS client->server operations.
>>>>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>>>>>> index 8522461..ef2e18e 100644
>>>>>> --- a/include/linux/nfs_fs_sb.h
>>>>>> +++ b/include/linux/nfs_fs_sb.h
>>>>>> @@ -87,6 +87,7 @@ struct nfs_client {
>>>>>>        u32                     cl_exchange_flags;
>>>>>>        struct nfs4_session     *cl_session;    /* sharred session */
>>>>>>        struct list_head        cl_lo_inodes;   /* Inodes having layouts */
>>>>>> +       struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
>>>>>>  #endif /* CONFIG_NFS_V4_1 */
>>>>>>
>>>>>>  #ifdef CONFIG_NFS_FSCACHE
>>>>>> --
>>>>>> 1.6.6
>>>>>>
>>>>>> --
>>>>>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>>>
>>>>> _______________________________________________
>>>>> pNFS mailing list
>>>>> pNFS@linux-nfs.org
>>>>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
>>>>
>>
>>
>> --
>> Benny Halevy
>> Software Architect
>> Panasas, Inc.
>> bhalevy@panasas.com
>> Tel/Fax: +972-3-647-8340
>> Mobile: +972-54-802-8340
>>
>> Panasas: The Leader in Parallel Storage
>> www.panasas.com
>>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
  2010-04-26 16:18 ` [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache andros
@ 2010-05-03 11:48   ` Benny Halevy
  2010-05-03 13:57     ` William A. (Andy) Adamson
  0 siblings, 1 reply; 12+ messages in thread
From: Benny Halevy @ 2010-05-03 11:48 UTC (permalink / raw)
  To: andros; +Cc: linux-nfs

On Apr. 26, 2010, 19:18 +0300, andros@netapp.com wrote:
> From: Andy Adamson <andros@netapp.com>
> 
> A shared RCU device ID cache servicing multiple mounts of a single layout type
> per meta data server (struct nfs_client).
> 
> Device IDs of type deviceid4 are required by all layout types, long lived and
> read at each I/O.  They are added to the deviceid cache at first reference by
> a layout via GETDEVICEINFO and (currently) are only removed at umount.
> 
> Reference count the device ID cache for each mounted file system
> in the initialize_mountpoint layoutdriver_io_operation.
> 
> Dereference the device id cache on file system in the uninitialize_mountpoint
> layoutdriver_io_operation called at umount
> 
> Each layoutsegment assigns a pointer and takes a reference to the
> nfs4_deviceid structure identified by the layout deviceid.
> This is so that there are no deviceid lookups for the normal I/O path.
> 
> Even thought required by all layouttypes, the deviceid is not exposed in the
> LAYOUTGET4res but is instead hidden in the opaque layouttype4.
> 
> Therefore, each layout type alloc_lseg calls nfs4_set_layout_deviceid,
> and free_lseg calls nfs4_unset_layout_deviceid.
> 
> While the file layout driver will not cache very many deviceid's, the object
> and block layout drivers could cache 100's for a large installation.
> Use an hlist.
> 
> Signed-off-by: Andy Adamson <andros@netapp.com>
> ---
>  fs/nfs/pnfs.c             |  167 +++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/nfs4_pnfs.h |   50 +++++++++++++
>  include/linux/nfs_fs_sb.h |    1 +
>  3 files changed, 218 insertions(+), 0 deletions(-)
> 
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 91572aa..bf906cc 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -45,6 +45,7 @@
>  #include <linux/nfs4.h>
>  #include <linux/pnfs_xdr.h>
>  #include <linux/nfs4_pnfs.h>
> +#include <linux/rculist.h>
>  
>  #include "internal.h"
>  #include "nfs4_fs.h"
> @@ -2296,3 +2297,169 @@ struct pnfs_client_operations pnfs_ops = {
>  
>  EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>  EXPORT_SYMBOL(pnfs_register_layoutdriver);
> +
> +
> +/* Device ID cache. Supports one layout type per struct nfs_client */
> +int
> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
> +			 void (*free_callback)(struct kref *))
> +{
> +	struct nfs4_deviceid_cache *c;
> +
> +	c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
> +	if (!c)
> +		return -ENOMEM;
> +	spin_lock(&clp->cl_lock);
> +	if (clp->cl_devid_cache != NULL) {
> +		kref_get(&clp->cl_devid_cache->dc_kref);
> +		spin_unlock(&clp->cl_lock);
> +		dprintk("%s [kref [%d]]\n", __func__,
> +			atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
> +		kfree(c);
> +	} else {
> +		int i;
> +
> +		spin_lock_init(&c->dc_lock);
> +		for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE ; i++)
> +			INIT_HLIST_HEAD(&c->dc_deviceids[i]);
> +		kref_init(&c->dc_kref);
> +		c->dc_free_callback = free_callback;
> +		clp->cl_devid_cache = c;
> +		spin_unlock(&clp->cl_lock);
> +		dprintk("%s [new]\n", __func__);
> +	}
> +	return 0;
> +}
> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
> +
> +void
> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
> +{
> +	INIT_HLIST_NODE(&d->de_node);
> +	kref_init(&d->de_kref);
> +}
> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
> +
> +/* Called from layoutdriver_io_operations->alloc_lseg */
> +void
> +nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4_deviceid *d)
> +{
> +	dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount));
> +	l->deviceid = d;
> +	kref_get(&d->de_kref);
> +}
> +EXPORT_SYMBOL(nfs4_set_layout_deviceid);
> +
> +/* Called from layoutdriver_io_operations->free_lseg */
> +void
> +nfs4_unset_layout_deviceid(struct pnfs_layout_segment *l,
> +			   struct nfs4_deviceid *d,
> +			   void (*free_callback)(struct kref *))
> +{
> +	dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount));
> +	l->deviceid = NULL;
> +	kref_put(&d->de_kref, free_callback);
> +}
> +EXPORT_SYMBOL(nfs4_unset_layout_deviceid);
> +
> +struct nfs4_deviceid *
> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
> +{
> +	struct nfs4_deviceid *d;
> +	struct hlist_node *n;
> +	long hash = nfs4_deviceid_hash(id);
> +
> +	dprintk("--> %s hash %ld\n", __func__, hash);
> +	rcu_read_lock();
> +	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
> +		if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
> +			rcu_read_unlock();
> +			return d;
> +		}
> +	}
> +	rcu_read_unlock();
> +	return NULL;
> +}
> +EXPORT_SYMBOL(nfs4_find_deviceid);
> +
> +/*
> + * Add or kref_get a deviceid.
> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
> + */
> +struct nfs4_deviceid *
> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
> +{
> +	struct nfs4_deviceid *d;
> +	struct hlist_node *n;
> +	long hash = nfs4_deviceid_hash(&new->de_id);
> +
> +	dprintk("--> %s hash %ld\n", __func__, hash);
> +	spin_lock(&c->dc_lock);
> +	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
> +		if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
> +			spin_unlock(&c->dc_lock);
> +			dprintk("%s [discard]\n", __func__);
> +			c->dc_free_callback(&new->de_kref);
> +			return d;
> +		}
> +	}
> +	hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
> +	spin_unlock(&c->dc_lock);
> +	dprintk("%s [new]\n", __func__);
> +	return new;
> +}
> +EXPORT_SYMBOL(nfs4_add_deviceid);
> +
> +static int
> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash)
> +{
> +	struct nfs4_deviceid *d;
> +	struct hlist_node *n;
> +
> +	dprintk("--> %s hash %ld\n", __func__, hash);
> +	spin_lock(&c->dc_lock);
> +	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
> +		hlist_del_rcu(&d->de_node);
> +		spin_unlock(&c->dc_lock);
> +		synchronize_rcu();
> +		dprintk("%s [%d]\n", __func__,
> +			atomic_read(&d->de_kref.refcount));
> +		kref_put(&d->de_kref, c->dc_free_callback);
> +		return 1;
> +	}
> +	spin_unlock(&c->dc_lock);
> +	return 0;
> +}
> +
> +static void
> +nfs4_free_deviceid_cache(struct kref *kref)
> +{
> +	struct nfs4_deviceid_cache *cache =
> +		container_of(kref, struct nfs4_deviceid_cache, dc_kref);
> +	int more;
> +	long i;
> +
> +	for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) {
> +		more = 1;
> +		while (more)
> +			more = nfs4_remove_deviceid(cache, i);

Andy, this can be simplified to

		while (nfs4_remove_deviceid(cache, i))
			;

If ok with you, I'll make this change upon merging.

Benny

> +	}
> +	kfree(cache);
> +}
> +
> +void
> +nfs4_put_deviceid_cache(struct nfs_client *clp)
> +{
> +	struct nfs4_deviceid_cache *tmp = clp->cl_devid_cache;
> +	int refcount;
> +
> +	dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
> +	spin_lock(&clp->cl_lock);
> +	refcount = atomic_read(&clp->cl_devid_cache->dc_kref.refcount);
> +	if (refcount == 1)
> +		clp->cl_devid_cache = NULL;
> +	spin_unlock(&clp->cl_lock);
> +	dprintk("%s [%d]\n", __func__, refcount);
> +	kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
> +}
> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
> index 3caac60..3b7aeb7 100644
> --- a/include/linux/nfs4_pnfs.h
> +++ b/include/linux/nfs4_pnfs.h
> @@ -106,6 +106,7 @@ struct pnfs_layout_segment {
>  	struct kref kref;
>  	bool valid;
>  	struct pnfs_layout_type *layout;
> +	struct nfs4_deviceid *deviceid;
>  	u8 ld_data[];			/* layout driver private data */
>  };
>  
> @@ -275,6 +276,55 @@ struct pnfs_devicelist {
>  	struct pnfs_deviceid	dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
>  };
>  
> +/*
> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
> + */
> +#define NFS4_DEVICE_ID_HASH_BITS	5
> +#define NFS4_DEVICE_ID_HASH_SIZE	(1 << NFS4_DEVICE_ID_HASH_BITS)
> +#define NFS4_DEVICE_ID_HASH_MASK	(NFS4_DEVICE_ID_HASH_SIZE - 1)
> +
> +static inline u32
> +nfs4_deviceid_hash(struct pnfs_deviceid *id)
> +{
> +	unsigned char *cptr = (unsigned char *)id->data;
> +	unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE;
> +	u32 x = 0;
> +
> +	while (nbytes--) {
> +		x *= 37;
> +		x += *cptr++;
> +	}
> +	return x & NFS4_DEVICE_ID_HASH_MASK;
> +}
> +
> +struct nfs4_deviceid_cache {
> +	spinlock_t		dc_lock;
> +	struct kref		dc_kref;
> +	void			(*dc_free_callback)(struct kref *);
> +	struct hlist_head	dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
> +};
> +
> +/* Device ID cache node */
> +struct nfs4_deviceid {
> +	struct hlist_node	de_node;
> +	struct pnfs_deviceid	de_id;
> +	struct kref		de_kref;
> +};
> +
> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
> +				void (*free_callback)(struct kref *));
> +extern void nfs4_put_deviceid_cache(struct nfs_client *);
> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
> +				struct pnfs_deviceid *);
> +extern struct nfs4_deviceid *nfs4_add_deviceid(struct nfs4_deviceid_cache *,
> +				struct nfs4_deviceid *);
> +extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *,
> +				struct nfs4_deviceid *);
> +extern void nfs4_unset_layout_deviceid(struct pnfs_layout_segment *,
> +				struct nfs4_deviceid *,
> +				void (*free_callback)(struct kref *));
> +
>  /* pNFS client callback functions.
>   * These operations allow the layout driver to access pNFS client
>   * specific information or call pNFS client->server operations.
> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> index 8522461..ef2e18e 100644
> --- a/include/linux/nfs_fs_sb.h
> +++ b/include/linux/nfs_fs_sb.h
> @@ -87,6 +87,7 @@ struct nfs_client {
>  	u32			cl_exchange_flags;
>  	struct nfs4_session	*cl_session; 	/* sharred session */
>  	struct list_head	cl_lo_inodes;	/* Inodes having layouts */
> +	struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
>  #endif /* CONFIG_NFS_V4_1 */
>  
>  #ifdef CONFIG_NFS_FSCACHE


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
  2010-05-03 11:48   ` [pnfs] " Benny Halevy
@ 2010-05-03 13:57     ` William A. (Andy) Adamson
  0 siblings, 0 replies; 12+ messages in thread
From: William A. (Andy) Adamson @ 2010-05-03 13:57 UTC (permalink / raw)
  To: Benny Halevy; +Cc: linux-nfs

On Mon, May 3, 2010 at 7:48 AM, Benny Halevy <bhalevy@panasas.com> wrot=
e:
> On Apr. 26, 2010, 19:18 +0300, andros@netapp.com wrote:
>> From: Andy Adamson <andros@netapp.com>
>>
>> A shared RCU device ID cache servicing multiple mounts of a single l=
ayout type
>> per meta data server (struct nfs_client).
>>
>> Device IDs of type deviceid4 are required by all layout types, long =
lived and
>> read at each I/O. =A0They are added to the deviceid cache at first r=
eference by
>> a layout via GETDEVICEINFO and (currently) are only removed at umoun=
t.
>>
>> Reference count the device ID cache for each mounted file system
>> in the initialize_mountpoint layoutdriver_io_operation.
>>
>> Dereference the device id cache on file system in the uninitialize_m=
ountpoint
>> layoutdriver_io_operation called at umount
>>
>> Each layoutsegment assigns a pointer and takes a reference to the
>> nfs4_deviceid structure identified by the layout deviceid.
>> This is so that there are no deviceid lookups for the normal I/O pat=
h.
>>
>> Even thought required by all layouttypes, the deviceid is not expose=
d in the
>> LAYOUTGET4res but is instead hidden in the opaque layouttype4.
>>
>> Therefore, each layout type alloc_lseg calls nfs4_set_layout_devicei=
d,
>> and free_lseg calls nfs4_unset_layout_deviceid.
>>
>> While the file layout driver will not cache very many deviceid's, th=
e object
>> and block layout drivers could cache 100's for a large installation.
>> Use an hlist.
>>
>> Signed-off-by: Andy Adamson <andros@netapp.com>
>> ---
>> =A0fs/nfs/pnfs.c =A0 =A0 =A0 =A0 =A0 =A0 | =A0167 ++++++++++++++++++=
+++++++++++++++++++++++++++
>> =A0include/linux/nfs4_pnfs.h | =A0 50 +++++++++++++
>> =A0include/linux/nfs_fs_sb.h | =A0 =A01 +
>> =A03 files changed, 218 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 91572aa..bf906cc 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -45,6 +45,7 @@
>> =A0#include <linux/nfs4.h>
>> =A0#include <linux/pnfs_xdr.h>
>> =A0#include <linux/nfs4_pnfs.h>
>> +#include <linux/rculist.h>
>>
>> =A0#include "internal.h"
>> =A0#include "nfs4_fs.h"
>> @@ -2296,3 +2297,169 @@ struct pnfs_client_operations pnfs_ops =3D {
>>
>> =A0EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>> =A0EXPORT_SYMBOL(pnfs_register_layoutdriver);
>> +
>> +
>> +/* Device ID cache. Supports one layout type per struct nfs_client =
*/
>> +int
>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_callback)(s=
truct kref *))
>> +{
>> + =A0 =A0 struct nfs4_deviceid_cache *c;
>> +
>> + =A0 =A0 c =3D kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERN=
EL);
>> + =A0 =A0 if (!c)
>> + =A0 =A0 =A0 =A0 =A0 =A0 return -ENOMEM;
>> + =A0 =A0 spin_lock(&clp->cl_lock);
>> + =A0 =A0 if (clp->cl_devid_cache !=3D NULL) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 kref_get(&clp->cl_devid_cache->dc_kref);
>> + =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>> + =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [kref [%d]]\n", __func__,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&clp->cl_devid=
_cache->dc_kref.refcount));
>> + =A0 =A0 =A0 =A0 =A0 =A0 kfree(c);
>> + =A0 =A0 } else {
>> + =A0 =A0 =A0 =A0 =A0 =A0 int i;
>> +
>> + =A0 =A0 =A0 =A0 =A0 =A0 spin_lock_init(&c->dc_lock);
>> + =A0 =A0 =A0 =A0 =A0 =A0 for (i =3D 0; i < NFS4_DEVICE_ID_HASH_SIZE=
 ; i++)
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 INIT_HLIST_HEAD(&c->dc_dev=
iceids[i]);
>> + =A0 =A0 =A0 =A0 =A0 =A0 kref_init(&c->dc_kref);
>> + =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback =3D free_callback;
>> + =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D c;
>> + =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>> + =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
>> + =A0 =A0 }
>> + =A0 =A0 return 0;
>> +}
>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>> +
>> +void
>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>> +{
>> + =A0 =A0 INIT_HLIST_NODE(&d->de_node);
>> + =A0 =A0 kref_init(&d->de_kref);
>> +}
>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>> +
>> +/* Called from layoutdriver_io_operations->alloc_lseg */
>> +void
>> +nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4=
_deviceid *d)
>> +{
>> + =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.ref=
count));
>> + =A0 =A0 l->deviceid =3D d;
>> + =A0 =A0 kref_get(&d->de_kref);
>> +}
>> +EXPORT_SYMBOL(nfs4_set_layout_deviceid);
>> +
>> +/* Called from layoutdriver_io_operations->free_lseg */
>> +void
>> +nfs4_unset_layout_deviceid(struct pnfs_layout_segment *l,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0struct nfs4_devicei=
d *d,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_callbac=
k)(struct kref *))
>> +{
>> + =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.ref=
count));
>> + =A0 =A0 l->deviceid =3D NULL;
>> + =A0 =A0 kref_put(&d->de_kref, free_callback);
>> +}
>> +EXPORT_SYMBOL(nfs4_unset_layout_deviceid);
>> +
>> +struct nfs4_deviceid *
>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_devic=
eid *id)
>> +{
>> + =A0 =A0 struct nfs4_deviceid *d;
>> + =A0 =A0 struct hlist_node *n;
>> + =A0 =A0 long hash =3D nfs4_deviceid_hash(id);
>> +
>> + =A0 =A0 dprintk("--> %s hash %ld\n", __func__, hash);
>> + =A0 =A0 rcu_read_lock();
>> + =A0 =A0 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_=
node) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVIC=
EID4_SIZE)) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 rcu_read_unlock();
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
>> + =A0 =A0 =A0 =A0 =A0 =A0 }
>> + =A0 =A0 }
>> + =A0 =A0 rcu_read_unlock();
>> + =A0 =A0 return NULL;
>> +}
>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>> +
>> +/*
>> + * Add or kref_get a deviceid.
>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found,=
 discard new
>> + */
>> +struct nfs4_deviceid *
>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_device=
id *new)
>> +{
>> + =A0 =A0 struct nfs4_deviceid *d;
>> + =A0 =A0 struct hlist_node *n;
>> + =A0 =A0 long hash =3D nfs4_deviceid_hash(&new->de_id);
>> +
>> + =A0 =A0 dprintk("--> %s hash %ld\n", __func__, hash);
>> + =A0 =A0 spin_lock(&c->dc_lock);
>> + =A0 =A0 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_=
node) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, &new->de_id, NFS4_P=
NFS_DEVICEID4_SIZE)) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [discard]\n", =
__func__);
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&new->=
de_kref);
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
>> + =A0 =A0 =A0 =A0 =A0 =A0 }
>> + =A0 =A0 }
>> + =A0 =A0 hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
>> + =A0 =A0 spin_unlock(&c->dc_lock);
>> + =A0 =A0 dprintk("%s [new]\n", __func__);
>> + =A0 =A0 return new;
>> +}
>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>> +
>> +static int
>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash)
>> +{
>> + =A0 =A0 struct nfs4_deviceid *d;
>> + =A0 =A0 struct hlist_node *n;
>> +
>> + =A0 =A0 dprintk("--> %s hash %ld\n", __func__, hash);
>> + =A0 =A0 spin_lock(&c->dc_lock);
>> + =A0 =A0 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_=
node) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 hlist_del_rcu(&d->de_node);
>> + =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>> + =A0 =A0 =A0 =A0 =A0 =A0 synchronize_rcu();
>> + =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [%d]\n", __func__,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&d->de_kref.re=
fcount));
>> + =A0 =A0 =A0 =A0 =A0 =A0 kref_put(&d->de_kref, c->dc_free_callback)=
;
>> + =A0 =A0 =A0 =A0 =A0 =A0 return 1;
>> + =A0 =A0 }
>> + =A0 =A0 spin_unlock(&c->dc_lock);
>> + =A0 =A0 return 0;
>> +}
>> +
>> +static void
>> +nfs4_free_deviceid_cache(struct kref *kref)
>> +{
>> + =A0 =A0 struct nfs4_deviceid_cache *cache =3D
>> + =A0 =A0 =A0 =A0 =A0 =A0 container_of(kref, struct nfs4_deviceid_ca=
che, dc_kref);
>> + =A0 =A0 int more;
>> + =A0 =A0 long i;
>> +
>> + =A0 =A0 for (i =3D 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 more =3D 1;
>> + =A0 =A0 =A0 =A0 =A0 =A0 while (more)
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 more =3D nfs4_remove_devic=
eid(cache, i);
>
> Andy, this can be simplified to
>
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0while (nfs4_remove_deviceid(cache, i))
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0;
>
> If ok with you, I'll make this change upon merging.

Yes - looks fine, thanks.

-->Andy

>
> Benny
>
>> + =A0 =A0 }
>> + =A0 =A0 kfree(cache);
>> +}
>> +
>> +void
>> +nfs4_put_deviceid_cache(struct nfs_client *clp)
>> +{
>> + =A0 =A0 struct nfs4_deviceid_cache *tmp =3D clp->cl_devid_cache;
>> + =A0 =A0 int refcount;
>> +
>> + =A0 =A0 dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_de=
vid_cache);
>> + =A0 =A0 spin_lock(&clp->cl_lock);
>> + =A0 =A0 refcount =3D atomic_read(&clp->cl_devid_cache->dc_kref.ref=
count);
>> + =A0 =A0 if (refcount =3D=3D 1)
>> + =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D NULL;
>> + =A0 =A0 spin_unlock(&clp->cl_lock);
>> + =A0 =A0 dprintk("%s [%d]\n", __func__, refcount);
>> + =A0 =A0 kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
>> +}
>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>> index 3caac60..3b7aeb7 100644
>> --- a/include/linux/nfs4_pnfs.h
>> +++ b/include/linux/nfs4_pnfs.h
>> @@ -106,6 +106,7 @@ struct pnfs_layout_segment {
>> =A0 =A0 =A0 struct kref kref;
>> =A0 =A0 =A0 bool valid;
>> =A0 =A0 =A0 struct pnfs_layout_type *layout;
>> + =A0 =A0 struct nfs4_deviceid *deviceid;
>> =A0 =A0 =A0 u8 ld_data[]; =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 /* lay=
out driver private data */
>> =A0};
>>
>> @@ -275,6 +276,55 @@ struct pnfs_devicelist {
>> =A0 =A0 =A0 struct pnfs_deviceid =A0 =A0dev_id[NFS4_PNFS_GETDEVLIST_=
MAXNUM];
>> =A0};
>>
>> +/*
>> + * Device ID RCU cache. A device ID is unique per client ID and lay=
out type.
>> + */
>> +#define NFS4_DEVICE_ID_HASH_BITS =A0 =A0 5
>> +#define NFS4_DEVICE_ID_HASH_SIZE =A0 =A0 (1 << NFS4_DEVICE_ID_HASH_=
BITS)
>> +#define NFS4_DEVICE_ID_HASH_MASK =A0 =A0 (NFS4_DEVICE_ID_HASH_SIZE =
- 1)
>> +
>> +static inline u32
>> +nfs4_deviceid_hash(struct pnfs_deviceid *id)
>> +{
>> + =A0 =A0 unsigned char *cptr =3D (unsigned char *)id->data;
>> + =A0 =A0 unsigned int nbytes =3D NFS4_PNFS_DEVICEID4_SIZE;
>> + =A0 =A0 u32 x =3D 0;
>> +
>> + =A0 =A0 while (nbytes--) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 x *=3D 37;
>> + =A0 =A0 =A0 =A0 =A0 =A0 x +=3D *cptr++;
>> + =A0 =A0 }
>> + =A0 =A0 return x & NFS4_DEVICE_ID_HASH_MASK;
>> +}
>> +
>> +struct nfs4_deviceid_cache {
>> + =A0 =A0 spinlock_t =A0 =A0 =A0 =A0 =A0 =A0 =A0dc_lock;
>> + =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 dc_kref;
>> + =A0 =A0 void =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(*dc_free_call=
back)(struct kref *);
>> + =A0 =A0 struct hlist_head =A0 =A0 =A0 dc_deviceids[NFS4_DEVICE_ID_=
HASH_SIZE];
>> +};
>> +
>> +/* Device ID cache node */
>> +struct nfs4_deviceid {
>> + =A0 =A0 struct hlist_node =A0 =A0 =A0 de_node;
>> + =A0 =A0 struct pnfs_deviceid =A0 =A0de_id;
>> + =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 de_kref;
>> +};
>> +
>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 void (*fre=
e_callback)(struct kref *));
>> +extern void nfs4_put_deviceid_cache(struct nfs_client *);
>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_devicei=
d_cache *,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct pnf=
s_deviceid *);
>> +extern struct nfs4_deviceid *nfs4_add_deviceid(struct nfs4_deviceid=
_cache *,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct nfs=
4_deviceid *);
>> +extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct nfs=
4_deviceid *);
>> +extern void nfs4_unset_layout_deviceid(struct pnfs_layout_segment *=
,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct nfs=
4_deviceid *,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 void (*fre=
e_callback)(struct kref *));
>> +
>> =A0/* pNFS client callback functions.
>> =A0 * These operations allow the layout driver to access pNFS client
>> =A0 * specific information or call pNFS client->server operations.
>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>> index 8522461..ef2e18e 100644
>> --- a/include/linux/nfs_fs_sb.h
>> +++ b/include/linux/nfs_fs_sb.h
>> @@ -87,6 +87,7 @@ struct nfs_client {
>> =A0 =A0 =A0 u32 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 cl_exchange_=
flags;
>> =A0 =A0 =A0 struct nfs4_session =A0 =A0 *cl_session; =A0 =A0/* sharr=
ed session */
>> =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0cl_lo_inodes; =A0 /* Ino=
des having layouts */
>> + =A0 =A0 struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS device=
id cache */
>> =A0#endif /* CONFIG_NFS_V4_1 */
>>
>> =A0#ifdef CONFIG_NFS_FSCACHE
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" =
in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at =A0http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2010-05-03 13:57 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-04-16 15:52 [PATCH 0/3] pNFS generic device ID cache andros
2010-04-16 15:52 ` [PATCH 1/3] SQUASHME pnfs_submit: " andros
2010-04-16 15:52   ` [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver andros
2010-04-16 15:52     ` [PATCH 3/3] SQUASHME pnfs-submit: file layout driver generic device ID cache andros
2010-04-16 16:04   ` [PATCH 1/3] SQUASHME pnfs_submit: " William A. (Andy) Adamson
     [not found]     ` <u2n89c397151004160904m9e862360xcaf0e187640b0177-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2010-04-21  5:59       ` [pnfs] " Benny Halevy
2010-04-21 15:22         ` William A. (Andy) Adamson
     [not found]           ` <l2l89c397151004210822j8b43009o3a9e78ceed901fd9-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2010-04-22 11:20             ` Benny Halevy
2010-04-22 15:47               ` William A. (Andy) Adamson
     [not found]                 ` <v2h89c397151004220847v3a31c493s4089d0cd53cf3e19-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2010-04-22 15:51                   ` Benny Halevy
  -- strict thread matches above, loose matches on Subject: below --
2010-04-26 16:18 [PATCH 0/3] pNFS generic device ID cache version 3 andros
2010-04-26 16:18 ` [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache andros
2010-05-03 11:48   ` [pnfs] " Benny Halevy
2010-05-03 13:57     ` William A. (Andy) Adamson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).