* [PATCH 0/3] pNFS generic device ID cache
@ 2010-04-16 15:52 andros
2010-04-16 15:52 ` [PATCH 1/3] SQUASHME pnfs_submit: " andros
0 siblings, 1 reply; 12+ messages in thread
From: andros @ 2010-04-16 15:52 UTC (permalink / raw)
To: pnfs; +Cc: linux-nfs
This patch set implements a shared RCU device ID cache servicing multiple
mounts of a single layout type per meta data server (struct nfs_client).
0001-pnfs_submit-generic-device-ID-cache.patch
0002-pnfs_submit-fix-multiple-mount-set_pnfs_layoutdriver.patch
0003-pnfs-submit-file-layout-driver-generic-device-ID-cac.patch
These patches apply to the 2.6.34-rc3 pnfs branch below the following patch
which is the head of the revert patches - otherwise known as the first patch
not included in the pnfs file layout driver submission.
9a75b7356ca22b10903b507383646201cdcc9020
pnfs: filelayout: CB_NOTIFY_DEVICE support
This reverts commit 933b55c15ed2b7fd89d8e0342bd5b2825726201a
"SQUASHME pnfs_submit: remove filelayout CB_NOTIFY_DEVICE support"
Testing:
CONFIG_NFS_V4_1 set:
NFSv4.1/pNFS mounts:
Connectathon tests pass against GFS2/pNFS with a single AUTH_SYS mount, a double
AUTH_SYS mount, and an AUTH_SYS and AUTH_GSS/KRB5 mount (which creates
two superblocks under a struct nfs_client and both share the device id cache).
NFSv4.0 mount;
Connectathon tests pass
Did not test with multiple device ID's. I will create a mulitple device ID
test with the pynfs file layout server.
CONFIG_NFS_V4_1 not set:
NFSv4.0 mount: Connectathon tests pass.
-->Andy
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
2010-04-16 15:52 [PATCH 0/3] pNFS generic device ID cache andros
@ 2010-04-16 15:52 ` andros
2010-04-16 15:52 ` [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver andros
2010-04-16 16:04 ` [PATCH 1/3] SQUASHME pnfs_submit: " William A. (Andy) Adamson
0 siblings, 2 replies; 12+ messages in thread
From: andros @ 2010-04-16 15:52 UTC (permalink / raw)
To: pnfs-Xh+NVF5n0LLYtjvyW6yDsg; +Cc: linux-nfs, Andy Adamson
From: Andy Adamson <andros@netapp.com>
A shared RCU device ID cache servicing multiple mounts of a single layout type
per meta data server (struct nfs_client).
Device IDs of type deviceid4 are required by all layout types, long lived and
read at each I/O. They are added to the deviceid cache at first reference by
a layout via GETDEVICEINFO and (currently) are only removed at umount.
Reference count the device ID cache for each mounted file system
in the initialize_mountpoint layoutdriver_io_operation.
Dereference the device id cache on file system in the uninitialize_mountpoint
layoutdriver_io_operation called at umount
Signed-off-by: Andy Adamson <andros@netapp.com>
---
fs/nfs/pnfs.c | 119 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/nfs4_pnfs.h | 27 ++++++++++
include/linux/nfs_fs_sb.h | 1 +
3 files changed, 147 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 91572aa..8492aef 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -45,6 +45,7 @@
#include <linux/nfs4.h>
#include <linux/pnfs_xdr.h>
#include <linux/nfs4_pnfs.h>
+#include <linux/rculist.h>
#include "internal.h"
#include "nfs4_fs.h"
@@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops = {
EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
EXPORT_SYMBOL(pnfs_register_layoutdriver);
+
+
+/* Device ID cache. Supports one layout type per struct nfs_client */
+int
+nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
+ void (*free_callback)(struct rcu_head *))
+{
+ struct nfs4_deviceid_cache *c;
+
+ c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
+ if (!c)
+ return -ENOMEM;
+ spin_lock(&clp->cl_lock);
+ if (clp->cl_devid_cache != NULL) {
+ kref_get(&clp->cl_devid_cache->dc_kref);
+ spin_unlock(&clp->cl_lock);
+ dprintk("%s [kref [%d]]\n", __func__,
+ atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
+ kfree(c);
+ } else {
+ spin_lock_init(&c->dc_lock);
+ INIT_LIST_HEAD(&c->dc_deviceids);
+ kref_init(&c->dc_kref);
+ c->dc_free_callback = free_callback;
+ c->dc_clp = clp;
+ clp->cl_devid_cache = c;
+ spin_unlock(&clp->cl_lock);
+ dprintk("%s [new]\n", __func__);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
+
+void
+nfs4_init_deviceid_node(struct nfs4_deviceid *d)
+{
+ INIT_LIST_HEAD(&d->de_node);
+ INIT_RCU_HEAD(&d->de_rcu);
+}
+EXPORT_SYMBOL(nfs4_init_deviceid_node);
+
+struct nfs4_deviceid *
+nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
+{
+ struct nfs4_deviceid *d;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
+ if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
+ rcu_read_unlock();
+ return d;
+ }
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+EXPORT_SYMBOL(nfs4_find_deviceid);
+
+/*
+ * Add or kref_get a deviceid.
+ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
+ */
+void
+nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
+{
+ struct nfs4_deviceid *d;
+
+ spin_lock(&c->dc_lock);
+ list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
+ if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
+ spin_unlock(&c->dc_lock);
+ dprintk("%s [discard]\n", __func__);
+ c->dc_free_callback(&new->de_rcu);
+ }
+ }
+ list_add_rcu(&new->de_node, &c->dc_deviceids);
+ spin_unlock(&c->dc_lock);
+ dprintk("%s [new]\n", __func__);
+}
+EXPORT_SYMBOL(nfs4_add_deviceid);
+
+static int
+nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
+{
+ struct nfs4_deviceid *d;
+
+ spin_lock(&c->dc_lock);
+ list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
+ list_del_rcu(&d->de_node);
+ spin_unlock(&c->dc_lock);
+ synchronize_rcu();
+ c->dc_free_callback(&d->de_rcu);
+ return 1;
+ }
+ spin_unlock(&c->dc_lock);
+ return 0;
+}
+
+static void
+nfs4_free_deviceid_cache(struct kref *kref)
+{
+ struct nfs4_deviceid_cache *cache =
+ container_of(kref, struct nfs4_deviceid_cache, dc_kref);
+ int more = 1;
+
+ while (more)
+ more = nfs4_remove_deviceid(cache);
+ cache->dc_clp->cl_devid_cache = NULL;
+ kfree(cache);
+}
+
+void
+nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
+{
+ dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.refcount));
+ kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
+}
+EXPORT_SYMBOL(nfs4_put_deviceid_cache);
diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
index 1d521f4..2a88a06 100644
--- a/include/linux/nfs4_pnfs.h
+++ b/include/linux/nfs4_pnfs.h
@@ -281,6 +281,33 @@ struct pnfs_devicelist {
struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
};
+/*
+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
+ */
+struct nfs4_deviceid_cache {
+ spinlock_t dc_lock;
+ struct list_head dc_deviceids;
+ struct kref dc_kref;
+ void (*dc_free_callback)(struct rcu_head *);
+ struct nfs_client *dc_clp;
+};
+
+/* Device ID cache node */
+struct nfs4_deviceid {
+ struct list_head de_node;
+ struct rcu_head de_rcu;
+ struct pnfs_deviceid de_id;
+};
+
+extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
+ void (*free_callback)(struct rcu_head *));
+extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
+extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
+extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
+ struct pnfs_deviceid *);
+extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
+ struct nfs4_deviceid *);
+
/* pNFS client callback functions.
* These operations allow the layout driver to access pNFS client
* specific information or call pNFS client->server operations.
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 8522461..ef2e18e 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -87,6 +87,7 @@ struct nfs_client {
u32 cl_exchange_flags;
struct nfs4_session *cl_session; /* sharred session */
struct list_head cl_lo_inodes; /* Inodes having layouts */
+ struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
#endif /* CONFIG_NFS_V4_1 */
#ifdef CONFIG_NFS_FSCACHE
--
1.6.6
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver
2010-04-16 15:52 ` [PATCH 1/3] SQUASHME pnfs_submit: " andros
@ 2010-04-16 15:52 ` andros
2010-04-16 15:52 ` [PATCH 3/3] SQUASHME pnfs-submit: file layout driver generic device ID cache andros
2010-04-16 16:04 ` [PATCH 1/3] SQUASHME pnfs_submit: " William A. (Andy) Adamson
1 sibling, 1 reply; 12+ messages in thread
From: andros @ 2010-04-16 15:52 UTC (permalink / raw)
To: pnfs-Xh+NVF5n0LLYtjvyW6yDsg; +Cc: linux-nfs, Andy Adamson
From: Andy Adamson <andros@netapp.com>
The same struct nfs_server can enter set_pnfs_layoutdriver for mounts that
share a super block. Don't initialize a pnfs mountpoint more than once.
Don't set the pnfs_curr_ld until the pnfs mountpoint initialization succeeds
Signed-off-by: Andy Adamson <andros@netapp.com>
---
fs/nfs/pnfs.c | 15 ++++++++++-----
1 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8492aef..1d903fd 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -215,20 +215,25 @@ set_pnfs_layoutdriver(struct super_block *sb, struct nfs_fh *fh, u32 id)
struct pnfs_mount_type *mt;
struct nfs_server *server = NFS_SB(sb);
+ if (server->pnfs_curr_ld)
+ return;
+
if (id > 0 && find_pnfs(id, &mod)) {
- dprintk("%s: Setting pNFS module\n", __func__);
- server->pnfs_curr_ld = mod->pnfs_ld_type;
- mt = server->pnfs_curr_ld->ld_io_ops->initialize_mountpoint(
+ mt = mod->pnfs_ld_type->ld_io_ops->initialize_mountpoint(
sb, fh);
if (!mt) {
printk(KERN_ERR "%s: Error initializing mount point "
"for layout driver %u. ", __func__, id);
goto out_err;
}
- /* Layout driver succeeded in initializing mountpoint */
+ /*
+ * Layout driver succeeded in initializing mountpoint
+ * and has taken a reference on the nfs_client cl_devid_cache
+ */
+ server->pnfs_curr_ld = mod->pnfs_ld_type;
server->pnfs_mountid = mt;
- /* Set the rpc_ops */
server->nfs_client->rpc_ops = &pnfs_v4_clientops;
+ dprintk("%s: pNFS module for %u set\n", __func__, id);
return;
}
--
1.6.6
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 3/3] SQUASHME pnfs-submit: file layout driver generic device ID cache
2010-04-16 15:52 ` [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver andros
@ 2010-04-16 15:52 ` andros
0 siblings, 0 replies; 12+ messages in thread
From: andros @ 2010-04-16 15:52 UTC (permalink / raw)
To: pnfs-Xh+NVF5n0LLYtjvyW6yDsg; +Cc: linux-nfs, Andy Adamson
From: Andy Adamson <andros@netapp.com>
Replace the per superblock deviceid cache with the generic deviceid cache.
Embed struct nfs4_deviceid into struct nfs4_file_layout_dsaddr, the file layout
specific deviceid structure. Provide a free_deviceid_callback.
Signed-off-by: Andy Adamson <andros@netapp.com>
---
fs/nfs/client.c | 1 +
fs/nfs/nfs4filelayout.c | 31 +++-----
fs/nfs/nfs4filelayout.h | 12 +--
fs/nfs/nfs4filelayoutdev.c | 190 ++++++++------------------------------------
4 files changed, 51 insertions(+), 183 deletions(-)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e1d1646..82775b7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -38,6 +38,7 @@
#include <net/ipv6.h>
#include <linux/nfs_xdr.h>
#include <linux/sunrpc/bc_xprt.h>
+#include <linux/nfs4_pnfs.h>
#include <asm/system.h>
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 0530b59..c155586 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -76,17 +76,11 @@ filelayout_initialize_mountpoint(struct super_block *sb, struct nfs_fh *fh)
{
struct filelayout_mount_type *fl_mt;
struct pnfs_mount_type *mt;
- int status;
fl_mt = kmalloc(sizeof(struct filelayout_mount_type), GFP_KERNEL);
if (!fl_mt)
goto error_ret;
- /* Initialize nfs4 file layout specific device list structure */
- fl_mt->hlist = kmalloc(sizeof(struct nfs4_pnfs_dev_hlist), GFP_KERNEL);
- if (!fl_mt->hlist)
- goto cleanup_fl_mt;
-
mt = kmalloc(sizeof(struct pnfs_mount_type), GFP_KERNEL);
if (!mt)
goto cleanup_fl_mt;
@@ -94,11 +88,11 @@ filelayout_initialize_mountpoint(struct super_block *sb, struct nfs_fh *fh)
fl_mt->fl_sb = sb;
mt->mountid = (void *)fl_mt;
- status = nfs4_pnfs_devlist_init(fl_mt->hlist);
- if (status)
+ if (nfs4_alloc_init_deviceid_cache(NFS_SB(sb)->nfs_client,
+ nfs4_fl_free_deviceid_callback))
goto cleanup_mt;
- dprintk("%s: device list has been initialized successfully\n",
+ dprintk("%s: deviceid cache has been initialized successfully\n",
__func__);
return mt;
@@ -106,11 +100,10 @@ cleanup_mt: ;
kfree(mt);
cleanup_fl_mt: ;
- kfree(fl_mt->hlist);
kfree(fl_mt);
error_ret: ;
- printk(KERN_WARNING "%s: device list could not be initialized\n",
+ printk(KERN_WARNING "%s: deviceid cache could not be initialized\n",
__func__);
return NULL;
@@ -123,13 +116,14 @@ filelayout_uninitialize_mountpoint(struct pnfs_mount_type *mountid)
{
struct filelayout_mount_type *fl_mt = NULL;
+ dprintk("--> %s\n", __func__);
if (mountid) {
- fl_mt = (struct filelayout_mount_type *)mountid->mountid;
+ struct nfs4_deviceid_cache *cache;
- if (fl_mt != NULL) {
- nfs4_pnfs_devlist_destroy(fl_mt->hlist);
- kfree(fl_mt);
- }
+ fl_mt = (struct filelayout_mount_type *)mountid->mountid;
+ cache = NFS_SB(fl_mt->fl_sb)->nfs_client->cl_devid_cache;
+ nfs4_put_deviceid_cache(cache);
+ kfree(fl_mt);
kfree(mountid);
}
return 0;
@@ -381,8 +375,7 @@ filelayout_check_layout(struct pnfs_layout_type *lo,
struct nfs_server *nfss = NFS_SERVER(PNFS_INODE(lo));
dprintk("--> %s\n", __func__);
- dsaddr = nfs4_pnfs_device_item_find(FILE_MT(PNFS_INODE(lo))->hlist,
- &fl->dev_id);
+ dsaddr = nfs4_pnfs_device_item_find(nfss->nfs_client, &fl->dev_id);
if (dsaddr == NULL) {
dsaddr = get_device_info(PNFS_INODE(lo), &fl->dev_id);
if (dsaddr == NULL) {
@@ -618,7 +611,7 @@ filelayout_commit(struct pnfs_layout_type *layoutid, int sync,
stripesz = filelayout_get_stripesize(layoutid);
dprintk("%s stripesize %Zd\n", __func__, stripesz);
- dsaddr = nfs4_pnfs_device_item_find(FILE_MT(data->inode)->hlist,
+ dsaddr = nfs4_pnfs_device_item_find(NFS_SERVER(data->inode)->nfs_client,
&nfslay->dev_id);
if (dsaddr == NULL) {
data->pdata.pnfs_error = -EIO;
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 12498a2..2cb05bd 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -43,8 +43,7 @@ struct nfs4_pnfs_ds {
};
struct nfs4_file_layout_dsaddr {
- struct hlist_node hash_node; /* nfs4_pnfs_dev_hlist dev_list */
- struct pnfs_deviceid dev_id;
+ struct nfs4_deviceid deviceid;
u32 stripe_count;
u8 *stripe_indices;
u32 ds_num;
@@ -86,15 +85,13 @@ struct nfs4_filelayout {
struct filelayout_mount_type {
struct super_block *fl_sb;
- struct nfs4_pnfs_dev_hlist *hlist;
};
extern struct pnfs_client_operations *pnfs_callback_ops;
+extern void nfs4_fl_free_deviceid_callback(struct rcu_head *);
extern void print_ds(struct nfs4_pnfs_ds *ds);
char *deviceid_fmt(const struct pnfs_deviceid *dev_id);
-int nfs4_pnfs_devlist_init(struct nfs4_pnfs_dev_hlist *hlist);
-void nfs4_pnfs_devlist_destroy(struct nfs4_pnfs_dev_hlist *hlist);
int nfs4_pnfs_dserver_get(struct pnfs_layout_segment *lseg,
loff_t offset,
size_t count,
@@ -102,9 +99,8 @@ int nfs4_pnfs_dserver_get(struct pnfs_layout_segment *lseg,
u32 filelayout_dserver_get_index(loff_t offset,
struct nfs4_file_layout_dsaddr *di,
struct nfs4_filelayout_segment *layout);
-struct nfs4_file_layout_dsaddr *
-nfs4_pnfs_device_item_find(struct nfs4_pnfs_dev_hlist *hlist,
- struct pnfs_deviceid *dev_id);
+extern struct nfs4_file_layout_dsaddr *
+nfs4_pnfs_device_item_find(struct nfs_client *, struct pnfs_deviceid *dev_id);
struct nfs4_file_layout_dsaddr *
get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id);
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 813ddbb..411ffcb 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -45,6 +45,7 @@
#include <linux/utsname.h>
#include <linux/vmalloc.h>
+#include <linux/nfs4_pnfs.h>
#include <linux/pnfs_xdr.h>
#include "nfs4filelayout.h"
#include "internal.h"
@@ -98,42 +99,6 @@ deviceid_fmt(const struct pnfs_deviceid *dev_id)
return buf;
}
-unsigned long
-_deviceid_hash(const struct pnfs_deviceid *dev_id)
-{
- unsigned char *cptr = (unsigned char *)dev_id->data;
- unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE;
- u64 x = 0;
-
- while (nbytes--) {
- x *= 37;
- x += *cptr++;
- }
- return x & NFS4_PNFS_DEV_HASH_MASK;
-}
-
-/* Assumes lock is held */
-static inline struct nfs4_file_layout_dsaddr *
-_device_lookup(struct nfs4_pnfs_dev_hlist *hlist,
- const struct pnfs_deviceid *dev_id)
-{
- unsigned long hash;
- struct hlist_node *np;
-
- dprintk("_device_lookup: dev_id=%s\n", deviceid_fmt(dev_id));
-
- hash = _deviceid_hash(dev_id);
-
- hlist_for_each(np, &hlist->dev_list[hash]) {
- struct nfs4_file_layout_dsaddr *dsaddr;
- dsaddr = hlist_entry(np, struct nfs4_file_layout_dsaddr,
- hash_node);
- if (!memcmp(&dsaddr->dev_id, dev_id, NFS4_PNFS_DEVICEID4_SIZE))
- return dsaddr;
- }
- return NULL;
-}
-
/* nfs4_ds_cache_lock is held */
static inline struct nfs4_pnfs_ds *
_data_server_lookup(u32 ip_addr, u32 port)
@@ -152,22 +117,6 @@ _data_server_lookup(u32 ip_addr, u32 port)
return NULL;
}
-
-/* Assumes lock is held */
-static inline void
-_device_add(struct nfs4_pnfs_dev_hlist *hlist,
- struct nfs4_file_layout_dsaddr *dsaddr)
-{
- unsigned long hash;
-
- dprintk("_device_add: dev_id=%s ds_list:\n",
- deviceid_fmt(&dsaddr->dev_id));
- print_ds_list(dsaddr);
-
- hash = _deviceid_hash(&dsaddr->dev_id);
- hlist_add_head(&dsaddr->hash_node, &hlist->dev_list[hash]);
-}
-
/* Create an rpc to the data server defined in 'dev_list' */
static int
nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
@@ -269,118 +218,47 @@ out_put:
static void
destroy_ds(struct nfs4_pnfs_ds *ds)
{
+ dprintk("--> %s\n", __func__);
+ print_ds(ds);
+
if (ds->ds_clp)
nfs_put_client(ds->ds_clp);
kfree(ds);
}
-/* Assumes lock is NOT held */
static void
-nfs4_pnfs_device_destroy(struct nfs4_file_layout_dsaddr *dsaddr,
- struct nfs4_pnfs_dev_hlist *hlist)
+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
struct nfs4_pnfs_ds *ds;
- LIST_HEAD(release);
int i;
- if (!dsaddr)
- return;
-
- dprintk("%s: dev_id=%s\ndev_list:\n", __func__,
- deviceid_fmt(&dsaddr->dev_id));
- print_ds_list(dsaddr);
-
- write_lock(&hlist->dev_lock);
- hlist_del_init(&dsaddr->hash_node);
+ dprintk("%s: device id=%s\n", __func__,
+ deviceid_fmt(&dsaddr->deviceid.de_id));
for (i = 0; i < dsaddr->ds_num; i++) {
ds = dsaddr->ds_list[i];
if (ds != NULL) {
- /* if we are last user - move to release list */
if (atomic_dec_and_lock(&ds->ds_count,
&nfs4_ds_cache_lock)) {
list_del_init(&ds->ds_node);
spin_unlock(&nfs4_ds_cache_lock);
- list_add(&ds->ds_node, &release);
+ destroy_ds(ds);
}
}
}
- write_unlock(&hlist->dev_lock);
- while (!list_empty(&release)) {
- ds = list_entry(release.next, struct nfs4_pnfs_ds, ds_node);
- list_del(&ds->ds_node);
- destroy_ds(ds);
- }
+ kfree(dsaddr->stripe_indices);
kfree(dsaddr);
}
-int
-nfs4_pnfs_devlist_init(struct nfs4_pnfs_dev_hlist *hlist)
-{
- int i;
-
- rwlock_init(&hlist->dev_lock);
-
- for (i = 0; i < NFS4_PNFS_DEV_HASH_SIZE; i++) {
- INIT_HLIST_HEAD(&hlist->dev_list[i]);
- }
-
- return 0;
-}
-
-/* De-alloc all devices for a mount point. This is called in
- * nfs4_kill_super.
- */
void
-nfs4_pnfs_devlist_destroy(struct nfs4_pnfs_dev_hlist *hlist)
-{
- int i;
-
- if (hlist == NULL)
- return;
-
- /* No lock held, as synchronization should occur at upper levels */
- for (i = 0; i < NFS4_PNFS_DEV_HASH_SIZE; i++) {
- struct hlist_node *np, *next;
-
- hlist_for_each_safe(np, next, &hlist->dev_list[i]) {
- struct nfs4_file_layout_dsaddr *dsaddr;
- dsaddr = hlist_entry(np,
- struct nfs4_file_layout_dsaddr,
- hash_node);
- /* nfs4_pnfs_device_destroy grabs hlist->dev_lock */
- nfs4_pnfs_device_destroy(dsaddr, hlist);
- }
- }
-}
-
-/*
- * Add the device to the list of available devices for this mount point.
- * The * rpc client is created during first I/O.
- */
-static int
-nfs4_pnfs_device_add(struct filelayout_mount_type *mt,
- struct nfs4_file_layout_dsaddr *dsaddr)
+nfs4_fl_free_deviceid_callback(struct rcu_head *rcu)
{
- struct nfs4_file_layout_dsaddr *tmp_dsaddr;
- struct nfs4_pnfs_dev_hlist *hlist = mt->hlist;
-
- dprintk("nfs4_pnfs_device_add\n");
-
- /* Write lock, do lookup again, and then add device */
- write_lock(&hlist->dev_lock);
- tmp_dsaddr = _device_lookup(hlist, &dsaddr->dev_id);
- if (tmp_dsaddr == NULL)
- _device_add(hlist, dsaddr);
- write_unlock(&hlist->dev_lock);
-
- /* Cleanup, if device was recently added */
- if (tmp_dsaddr != NULL) {
- dprintk(" device found, not adding (after creation)\n");
- nfs4_pnfs_device_destroy(dsaddr, hlist);
- }
+ struct nfs4_deviceid *device =
+ container_of(rcu, struct nfs4_deviceid, de_rcu);
+ struct nfs4_file_layout_dsaddr *dsaddr =
+ container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
- return 0;
+ nfs4_fl_free_deviceid(dsaddr);
}
static void
@@ -514,7 +392,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
dsaddr->stripe_count = cnt;
dsaddr->ds_num = num;
- memcpy(&dsaddr->dev_id, &pdev->dev_id, NFS4_PNFS_DEVICEID4_SIZE);
+ memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id,
+ NFS4_PNFS_DEVICEID4_SIZE);
/* Go back an read stripe indices */
p = indicesp;
@@ -553,19 +432,20 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
}
}
}
+ nfs4_init_deviceid_node(&dsaddr->deviceid);
+
return dsaddr;
out_err_free:
- nfs4_pnfs_device_destroy(dsaddr, FILE_MT(ino)->hlist);
+ nfs4_fl_free_deviceid(dsaddr);
out_err:
dprintk("%s ERROR: returning NULL\n", __func__);
return NULL;
}
-/* Decode the opaque device specified in 'dev'
- * and add it to the list of available devices for this
- * mount point.
- * Must at some point be followed up with nfs4_pnfs_device_destroy
+/*
+ * Decode the opaque device specified in 'dev'
+ * and add it to the list of available devices
*/
static struct nfs4_file_layout_dsaddr*
decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
@@ -574,14 +454,13 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
dsaddr = decode_device(inode, dev);
if (!dsaddr) {
- printk(KERN_WARNING "%s: Could not decode device\n",
+ printk(KERN_WARNING "%s: Could not decode or add device\n",
__func__);
- nfs4_pnfs_device_destroy(dsaddr, FILE_MT(inode)->hlist);
return NULL;
}
- if (nfs4_pnfs_device_add(FILE_MT(inode), dsaddr))
- return NULL;
+ nfs4_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
+ &dsaddr->deviceid);
return dsaddr;
}
@@ -660,16 +539,15 @@ out_free:
}
struct nfs4_file_layout_dsaddr *
-nfs4_pnfs_device_item_find(struct nfs4_pnfs_dev_hlist *hlist,
- struct pnfs_deviceid *dev_id)
+nfs4_pnfs_device_item_find(struct nfs_client *clp, struct pnfs_deviceid *id)
{
- struct nfs4_file_layout_dsaddr *dsaddr;
-
- read_lock(&hlist->dev_lock);
- dsaddr = _device_lookup(hlist, dev_id);
- read_unlock(&hlist->dev_lock);
+ struct nfs4_deviceid *d;
- return dsaddr;
+ d = nfs4_find_deviceid(clp->cl_devid_cache, id);
+ dprintk("%s device id (%s) nfs4_deviceid %p\n", __func__,
+ deviceid_fmt(id), d);
+ return (d == NULL) ? NULL :
+ container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
}
/* Want res = ((offset / layout->stripe_unit) % dsaddr->stripe_count)
@@ -707,7 +585,7 @@ nfs4_pnfs_dserver_get(struct pnfs_layout_segment *lseg,
if (!layout)
return 1;
- dsaddr = nfs4_pnfs_device_item_find(FILE_MT(inode)->hlist,
+ dsaddr = nfs4_pnfs_device_item_find(NFS_SERVER(inode)->nfs_client,
&layout->dev_id);
if (dsaddr == NULL)
return 1;
--
1.6.6
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
2010-04-16 15:52 ` [PATCH 1/3] SQUASHME pnfs_submit: " andros
2010-04-16 15:52 ` [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver andros
@ 2010-04-16 16:04 ` William A. (Andy) Adamson
[not found] ` <u2n89c397151004160904m9e862360xcaf0e187640b0177-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
1 sibling, 1 reply; 12+ messages in thread
From: William A. (Andy) Adamson @ 2010-04-16 16:04 UTC (permalink / raw)
To: pnfs; +Cc: linux-nfs, Andy Adamson
On Fri, Apr 16, 2010 at 11:52 AM, <andros@netapp.com> wrote:
> From: Andy Adamson <andros@netapp.com>
>
> A shared RCU device ID cache servicing multiple mounts of a single la=
yout type
> per meta data server (struct nfs_client).
>
> Device IDs of type deviceid4 are required by all layout types, long l=
ived and
> read at each I/O. =A0They are added to the deviceid cache at first re=
ference by
> a layout via GETDEVICEINFO and (currently) are only removed at umount=
=2E
>
> Reference count the device ID cache for each mounted file system
> in the initialize_mountpoint layoutdriver_io_operation.
>
> Dereference the device id cache on file system in the uninitialize_mo=
untpoint
> layoutdriver_io_operation called at umount
>
> Signed-off-by: Andy Adamson <andros@netapp.com>
> ---
> =A0fs/nfs/pnfs.c =A0 =A0 =A0 =A0 =A0 =A0 | =A0119 +++++++++++++++++++=
++++++++++++++++++++++++++
> =A0include/linux/nfs4_pnfs.h | =A0 27 ++++++++++
> =A0include/linux/nfs_fs_sb.h | =A0 =A01 +
> =A03 files changed, 147 insertions(+), 0 deletions(-)
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 91572aa..8492aef 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -45,6 +45,7 @@
> =A0#include <linux/nfs4.h>
> =A0#include <linux/pnfs_xdr.h>
> =A0#include <linux/nfs4_pnfs.h>
> +#include <linux/rculist.h>
>
> =A0#include "internal.h"
> =A0#include "nfs4_fs.h"
> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops =3D {
>
> =A0EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
> =A0EXPORT_SYMBOL(pnfs_register_layoutdriver);
> +
> +
> +/* Device ID cache. Supports one layout type per struct nfs_client *=
/
> +int
> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_callback=
)(struct rcu_head *))
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid_cache *c;
> +
> + =A0 =A0 =A0 c =3D kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_K=
ERNEL);
> + =A0 =A0 =A0 if (!c)
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return -ENOMEM;
> + =A0 =A0 =A0 spin_lock(&clp->cl_lock);
> + =A0 =A0 =A0 if (clp->cl_devid_cache !=3D NULL) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_get(&clp->cl_devid_cache->dc_kref)=
;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [kref [%d]]\n", __func__,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&clp->cl_de=
vid_cache->dc_kref.refcount));
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kfree(c);
> + =A0 =A0 =A0 } else {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_lock_init(&c->dc_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 INIT_LIST_HEAD(&c->dc_deviceids);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_init(&c->dc_kref);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback =3D free_callback;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_clp =3D clp;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D c;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 return 0;
> +}
> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
> +
> +void
> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
> +{
> + =A0 =A0 =A0 INIT_LIST_HEAD(&d->de_node);
> + =A0 =A0 =A0 INIT_RCU_HEAD(&d->de_rcu);
> +}
> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
> +
> +struct nfs4_deviceid *
> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_device=
id *id)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid *d;
> +
> + =A0 =A0 =A0 rcu_read_lock();
> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, id, NFS4_PNFS_DE=
VICEID4_SIZE)) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 rcu_read_unlock();
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 rcu_read_unlock();
> + =A0 =A0 =A0 return NULL;
> +}
> +EXPORT_SYMBOL(nfs4_find_deviceid);
> +
> +/*
> + * Add or kref_get a deviceid.
> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, =
discard new
> + */
> +void
> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_devicei=
d *new)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid *d;
> +
> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, &new->de_id, NFS=
4_PNFS_DEVICEID4_SIZE)) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock=
);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [discard]\n=
", __func__);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&ne=
w->de_rcu);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 list_add_rcu(&new->de_node, &c->dc_deviceids);
> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
> + =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
> +}
> +EXPORT_SYMBOL(nfs4_add_deviceid);
> +
> +static int
> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid *d;
> +
> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 list_del_rcu(&d->de_node);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 synchronize_rcu();
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&d->de_rcu);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return 1;
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
> + =A0 =A0 =A0 return 0;
> +}
> +
> +static void
> +nfs4_free_deviceid_cache(struct kref *kref)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cache =3D
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 container_of(kref, struct nfs4_deviceid=
_cache, dc_kref);
> + =A0 =A0 =A0 int more =3D 1;
> +
> + =A0 =A0 =A0 while (more)
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 more =3D nfs4_remove_deviceid(cache);
> + =A0 =A0 =A0 cache->dc_clp->cl_devid_cache =3D NULL;
Need to take the cl_lock around this assignment
spin_lock(&cache->dc_clp->cl_lock);
cache->dc_clp->cl_devid_cache =3D NULL
spin_unlock(&cache->dc_clp->cl_lock);
> + =A0 =A0 =A0 kfree(cache);
> +}
> +
> +void
> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
> +{
> + =A0 =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.=
refcount));
> + =A0 =A0 =A0 kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
> +}
> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
> index 1d521f4..2a88a06 100644
> --- a/include/linux/nfs4_pnfs.h
> +++ b/include/linux/nfs4_pnfs.h
> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
> =A0 =A0 =A0 =A0struct pnfs_deviceid =A0 =A0dev_id[NFS4_PNFS_GETDEVLIS=
T_MAXNUM];
> =A0};
>
> +/*
> + * Device ID RCU cache. A device ID is unique per client ID and layo=
ut type.
> + */
> +struct nfs4_deviceid_cache {
> + =A0 =A0 =A0 spinlock_t =A0 =A0 =A0 =A0 =A0 =A0 =A0dc_lock;
> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0dc_deviceids;
> + =A0 =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 dc_kref;
> + =A0 =A0 =A0 void =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(*dc_free_c=
allback)(struct rcu_head *);
> + =A0 =A0 =A0 struct nfs_client =A0 =A0 =A0 *dc_clp;
> +};
> +
> +/* Device ID cache node */
> +struct nfs4_deviceid {
> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0de_node;
> + =A0 =A0 =A0 struct rcu_head =A0 =A0 =A0 =A0 de_rcu;
> + =A0 =A0 =A0 struct pnfs_deviceid =A0 =A0de_id;
> +};
> +
> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 void (*=
free_callback)(struct rcu_head *));
> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid=
_cache *,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct =
pnfs_deviceid *);
> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct =
nfs4_deviceid *);
> +
> =A0/* pNFS client callback functions.
> =A0* These operations allow the layout driver to access pNFS client
> =A0* specific information or call pNFS client->server operations.
> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> index 8522461..ef2e18e 100644
> --- a/include/linux/nfs_fs_sb.h
> +++ b/include/linux/nfs_fs_sb.h
> @@ -87,6 +87,7 @@ struct nfs_client {
> =A0 =A0 =A0 =A0u32 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 cl_exchang=
e_flags;
> =A0 =A0 =A0 =A0struct nfs4_session =A0 =A0 *cl_session; =A0 =A0/* sha=
rred session */
> =A0 =A0 =A0 =A0struct list_head =A0 =A0 =A0 =A0cl_lo_inodes; =A0 /* I=
nodes having layouts */
> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS dev=
iceid cache */
> =A0#endif /* CONFIG_NFS_V4_1 */
>
> =A0#ifdef CONFIG_NFS_FSCACHE
> --
> 1.6.6
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" =
in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at =A0http://vger.kernel.org/majordomo-info.html
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
[not found] ` <u2n89c397151004160904m9e862360xcaf0e187640b0177-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2010-04-21 5:59 ` Benny Halevy
2010-04-21 15:22 ` William A. (Andy) Adamson
0 siblings, 1 reply; 12+ messages in thread
From: Benny Halevy @ 2010-04-21 5:59 UTC (permalink / raw)
To: William A. (Andy) Adamson; +Cc: pnfs, Andy Adamson, linux-nfs
On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
> On Fri, Apr 16, 2010 at 11:52 AM, <andros@netapp.com> wrote:
>> From: Andy Adamson <andros@netapp.com>
>>
>> A shared RCU device ID cache servicing multiple mounts of a single layout type
>> per meta data server (struct nfs_client).
>>
>> Device IDs of type deviceid4 are required by all layout types, long lived and
>> read at each I/O. They are added to the deviceid cache at first reference by
>> a layout via GETDEVICEINFO and (currently) are only removed at umount.
>>
>> Reference count the device ID cache for each mounted file system
>> in the initialize_mountpoint layoutdriver_io_operation.
>>
>> Dereference the device id cache on file system in the uninitialize_mountpoint
>> layoutdriver_io_operation called at umount
>>
>> Signed-off-by: Andy Adamson <andros@netapp.com>
>> ---
>> fs/nfs/pnfs.c | 119 +++++++++++++++++++++++++++++++++++++++++++++
>> include/linux/nfs4_pnfs.h | 27 ++++++++++
>> include/linux/nfs_fs_sb.h | 1 +
>> 3 files changed, 147 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 91572aa..8492aef 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -45,6 +45,7 @@
>> #include <linux/nfs4.h>
>> #include <linux/pnfs_xdr.h>
>> #include <linux/nfs4_pnfs.h>
>> +#include <linux/rculist.h>
>>
>> #include "internal.h"
>> #include "nfs4_fs.h"
>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops = {
>>
>> EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>> EXPORT_SYMBOL(pnfs_register_layoutdriver);
>> +
>> +
>> +/* Device ID cache. Supports one layout type per struct nfs_client */
>> +int
>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>> + void (*free_callback)(struct rcu_head *))
>> +{
>> + struct nfs4_deviceid_cache *c;
>> +
>> + c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
>> + if (!c)
>> + return -ENOMEM;
>> + spin_lock(&clp->cl_lock);
>> + if (clp->cl_devid_cache != NULL) {
>> + kref_get(&clp->cl_devid_cache->dc_kref);
>> + spin_unlock(&clp->cl_lock);
>> + dprintk("%s [kref [%d]]\n", __func__,
>> + atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
>> + kfree(c);
>> + } else {
>> + spin_lock_init(&c->dc_lock);
>> + INIT_LIST_HEAD(&c->dc_deviceids);
>> + kref_init(&c->dc_kref);
>> + c->dc_free_callback = free_callback;
>> + c->dc_clp = clp;
>> + clp->cl_devid_cache = c;
>> + spin_unlock(&clp->cl_lock);
>> + dprintk("%s [new]\n", __func__);
>> + }
>> + return 0;
>> +}
>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>> +
>> +void
>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>> +{
>> + INIT_LIST_HEAD(&d->de_node);
>> + INIT_RCU_HEAD(&d->de_rcu);
>> +}
>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>> +
>> +struct nfs4_deviceid *
>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
>> +{
>> + struct nfs4_deviceid *d;
>> +
>> + rcu_read_lock();
>> + list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>> + if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
>> + rcu_read_unlock();
>> + return d;
>> + }
>> + }
>> + rcu_read_unlock();
I hope this is worth the added complexity...
Out of curiosity, do you have a benchmark comparing the cost
of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?
>> + return NULL;
>> +}
>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>> +
>> +/*
>> + * Add or kref_get a deviceid.
>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
>> + */
>> +void
>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
>> +{
>> + struct nfs4_deviceid *d;
>> +
>> + spin_lock(&c->dc_lock);
>> + list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>> + if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
>> + spin_unlock(&c->dc_lock);
>> + dprintk("%s [discard]\n", __func__);
>> + c->dc_free_callback(&new->de_rcu);
>> + }
>> + }
>> + list_add_rcu(&new->de_node, &c->dc_deviceids);
>> + spin_unlock(&c->dc_lock);
>> + dprintk("%s [new]\n", __func__);
>> +}
>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>> +
>> +static int
>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>> +{
>> + struct nfs4_deviceid *d;
>> +
>> + spin_lock(&c->dc_lock);
>> + list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>> + list_del_rcu(&d->de_node);
>> + spin_unlock(&c->dc_lock);
>> + synchronize_rcu();
>> + c->dc_free_callback(&d->de_rcu);
>> + return 1;
>> + }
>> + spin_unlock(&c->dc_lock);
>> + return 0;
>> +}
>> +
>> +static void
>> +nfs4_free_deviceid_cache(struct kref *kref)
>> +{
>> + struct nfs4_deviceid_cache *cache =
>> + container_of(kref, struct nfs4_deviceid_cache, dc_kref);
>> + int more = 1;
>> +
>> + while (more)
>> + more = nfs4_remove_deviceid(cache);
>> + cache->dc_clp->cl_devid_cache = NULL;
>
> Need to take the cl_lock around this assignment
>
> spin_lock(&cache->dc_clp->cl_lock);
> cache->dc_clp->cl_devid_cache = NULL
> spin_unlock(&cache->dc_clp->cl_lock);
>
>
That must happen atomically before kref_put.
It's illegal to have cl_devid_cache be referenced by cache->dc_clp
without a reference count backing it up.
Otherwise, if accessed concurrently to this piece of code
someone might call kref_get while the refcount is zero.
Normally, you'd first clear the referencing pointer to prevent any
new reference to it and only then kref_put it, e.g.:
spin_lock(&cache->dc_clp->cl_lock);
tmp = cache->dc_clp->cl_devid_cache;
cache->dc_clp->cl_devid_cache = NULL
spin_unlock(&cache->dc_clp->cl_lock);
kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
Benny
>> + kfree(cache);
>> +}
>> +
>> +void
>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>> +{
>> + dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.refcount));
>> + kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>> +}
>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>> index 1d521f4..2a88a06 100644
>> --- a/include/linux/nfs4_pnfs.h
>> +++ b/include/linux/nfs4_pnfs.h
>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>> struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
>> };
>>
>> +/*
>> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
>> + */
>> +struct nfs4_deviceid_cache {
>> + spinlock_t dc_lock;
>> + struct list_head dc_deviceids;
>> + struct kref dc_kref;
>> + void (*dc_free_callback)(struct rcu_head *);
>> + struct nfs_client *dc_clp;
>> +};
>> +
>> +/* Device ID cache node */
>> +struct nfs4_deviceid {
>> + struct list_head de_node;
>> + struct rcu_head de_rcu;
>> + struct pnfs_deviceid de_id;
>> +};
>> +
>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>> + void (*free_callback)(struct rcu_head *));
>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
>> + struct pnfs_deviceid *);
>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>> + struct nfs4_deviceid *);
>> +
>> /* pNFS client callback functions.
>> * These operations allow the layout driver to access pNFS client
>> * specific information or call pNFS client->server operations.
>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>> index 8522461..ef2e18e 100644
>> --- a/include/linux/nfs_fs_sb.h
>> +++ b/include/linux/nfs_fs_sb.h
>> @@ -87,6 +87,7 @@ struct nfs_client {
>> u32 cl_exchange_flags;
>> struct nfs4_session *cl_session; /* sharred session */
>> struct list_head cl_lo_inodes; /* Inodes having layouts */
>> + struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
>> #endif /* CONFIG_NFS_V4_1 */
>>
>> #ifdef CONFIG_NFS_FSCACHE
>> --
>> 1.6.6
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
> _______________________________________________
> pNFS mailing list
> pNFS@linux-nfs.org
> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
2010-04-21 5:59 ` [pnfs] " Benny Halevy
@ 2010-04-21 15:22 ` William A. (Andy) Adamson
[not found] ` <l2l89c397151004210822j8b43009o3a9e78ceed901fd9-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
0 siblings, 1 reply; 12+ messages in thread
From: William A. (Andy) Adamson @ 2010-04-21 15:22 UTC (permalink / raw)
To: Benny Halevy; +Cc: pnfs, linux-nfs
On Wed, Apr 21, 2010 at 1:59 AM, Benny Halevy <bhalevy@panasas.com> wro=
te:
> On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsada=
mson@gmail.com> wrote:
>> On Fri, Apr 16, 2010 at 11:52 AM, =A0<andros@netapp.com> wrote:
>>> From: Andy Adamson <andros@netapp.com>
>>>
>>> A shared RCU device ID cache servicing multiple mounts of a single =
layout type
>>> per meta data server (struct nfs_client).
>>>
>>> Device IDs of type deviceid4 are required by all layout types, long=
lived and
>>> read at each I/O. =A0They are added to the deviceid cache at first =
reference by
>>> a layout via GETDEVICEINFO and (currently) are only removed at umou=
nt.
>>>
>>> Reference count the device ID cache for each mounted file system
>>> in the initialize_mountpoint layoutdriver_io_operation.
>>>
>>> Dereference the device id cache on file system in the uninitialize_=
mountpoint
>>> layoutdriver_io_operation called at umount
>>>
>>> Signed-off-by: Andy Adamson <andros@netapp.com>
>>> ---
>>> =A0fs/nfs/pnfs.c =A0 =A0 =A0 =A0 =A0 =A0 | =A0119 +++++++++++++++++=
++++++++++++++++++++++++++++
>>> =A0include/linux/nfs4_pnfs.h | =A0 27 ++++++++++
>>> =A0include/linux/nfs_fs_sb.h | =A0 =A01 +
>>> =A03 files changed, 147 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>> index 91572aa..8492aef 100644
>>> --- a/fs/nfs/pnfs.c
>>> +++ b/fs/nfs/pnfs.c
>>> @@ -45,6 +45,7 @@
>>> =A0#include <linux/nfs4.h>
>>> =A0#include <linux/pnfs_xdr.h>
>>> =A0#include <linux/nfs4_pnfs.h>
>>> +#include <linux/rculist.h>
>>>
>>> =A0#include "internal.h"
>>> =A0#include "nfs4_fs.h"
>>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops =3D =
{
>>>
>>> =A0EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>> =A0EXPORT_SYMBOL(pnfs_register_layoutdriver);
>>> +
>>> +
>>> +/* Device ID cache. Supports one layout type per struct nfs_client=
*/
>>> +int
>>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_callba=
ck)(struct rcu_head *))
>>> +{
>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *c;
>>> +
>>> + =A0 =A0 =A0 c =3D kzalloc(sizeof(struct nfs4_deviceid_cache), GFP=
_KERNEL);
>>> + =A0 =A0 =A0 if (!c)
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return -ENOMEM;
>>> + =A0 =A0 =A0 spin_lock(&clp->cl_lock);
>>> + =A0 =A0 =A0 if (clp->cl_devid_cache !=3D NULL) {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_get(&clp->cl_devid_cache->dc_kre=
f);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [kref [%d]]\n", __func__,
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&clp->cl_=
devid_cache->dc_kref.refcount));
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kfree(c);
>>> + =A0 =A0 =A0 } else {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_lock_init(&c->dc_lock);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 INIT_LIST_HEAD(&c->dc_deviceids);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_init(&c->dc_kref);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback =3D free_callback=
;
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_clp =3D clp;
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D c;
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
>>> + =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 return 0;
>>> +}
>>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>>> +
>>> +void
>>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>>> +{
>>> + =A0 =A0 =A0 INIT_LIST_HEAD(&d->de_node);
>>> + =A0 =A0 =A0 INIT_RCU_HEAD(&d->de_rcu);
>>> +}
>>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>>> +
>>> +struct nfs4_deviceid *
>>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_devi=
ceid *id)
>>> +{
>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>> +
>>> + =A0 =A0 =A0 rcu_read_lock();
>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node)=
{
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, id, NFS4_PNFS_=
DEVICEID4_SIZE)) {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 rcu_read_unlock();
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 rcu_read_unlock();
>
> I hope this is worth the added complexity...
>
> Out of curiosity, do you have a benchmark comparing the cost
> of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?
The deviceid cache is read at each I/O. If we use a spin_lock to
protect the deviceid cache, this would mean that all I/0 is serialized
behind the spin_lock even though the deviceid cache is changed
infrequently. The RCU allows readers to "run almost naked" and does
not serialize I/O behind reading the deviceid cache.
So I think it is worth it. I have not benchmarked the difference.
>
>>> + =A0 =A0 =A0 return NULL;
>>> +}
>>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>>> +
>>> +/*
>>> + * Add or kref_get a deviceid.
>>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found=
, discard new
>>> + */
>>> +void
>>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_devic=
eid *new)
>>> +{
>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>> +
>>> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node)=
{
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, &new->de_id, N=
=46S4_PNFS_DEVICEID4_SIZE)) {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lo=
ck);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [discard]=
\n", __func__);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&=
new->de_rcu);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 list_add_rcu(&new->de_node, &c->dc_deviceids);
>>> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>> + =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
>>> +}
>>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>>> +
>>> +static int
>>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>>> +{
>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>> +
>>> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node)=
{
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 list_del_rcu(&d->de_node);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 synchronize_rcu();
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&d->de_rcu);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return 1;
>>> + =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>> + =A0 =A0 =A0 return 0;
>>> +}
>>> +
>>> +static void
>>> +nfs4_free_deviceid_cache(struct kref *kref)
>>> +{
>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cache =3D
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 container_of(kref, struct nfs4_device=
id_cache, dc_kref);
>>> + =A0 =A0 =A0 int more =3D 1;
>>> +
>>> + =A0 =A0 =A0 while (more)
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 more =3D nfs4_remove_deviceid(cache);
>>> + =A0 =A0 =A0 cache->dc_clp->cl_devid_cache =3D NULL;
>>
>> Need to take the cl_lock around this assignment
>>
>> spin_lock(&cache->dc_clp->cl_lock);
>> cache->dc_clp->cl_devid_cache =3D NULL
>> spin_unlock(&cache->dc_clp->cl_lock);
>>
>>
>
> That must happen atomically before kref_put.
> It's illegal to have cl_devid_cache be referenced by cache->dc_clp
> without a reference count backing it up.
> Otherwise, if accessed concurrently to this piece of code
> someone might call kref_get while the refcount is zero.
>
> Normally, you'd first clear the referencing pointer to prevent any
> new reference to it and only then kref_put it, e.g.:
>
> spin_lock(&cache->dc_clp->cl_lock);
> tmp =3D cache->dc_clp->cl_devid_cache;
> cache->dc_clp->cl_devid_cache =3D NULL
> spin_unlock(&cache->dc_clp->cl_lock);
> kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
Good point. Thanks for the review. I'll rethink and resend
-->Andy
>
> Benny
>
>>> + =A0 =A0 =A0 kfree(cache);
>>> +}
>>> +
>>> +void
>>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>>> +{
>>> + =A0 =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kre=
f.refcount));
>>> + =A0 =A0 =A0 kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>>> +}
>>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>>> index 1d521f4..2a88a06 100644
>>> --- a/include/linux/nfs4_pnfs.h
>>> +++ b/include/linux/nfs4_pnfs.h
>>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>> =A0 =A0 =A0 =A0struct pnfs_deviceid =A0 =A0dev_id[NFS4_PNFS_GETDEVL=
IST_MAXNUM];
>>> =A0};
>>>
>>> +/*
>>> + * Device ID RCU cache. A device ID is unique per client ID and la=
yout type.
>>> + */
>>> +struct nfs4_deviceid_cache {
>>> + =A0 =A0 =A0 spinlock_t =A0 =A0 =A0 =A0 =A0 =A0 =A0dc_lock;
>>> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0dc_deviceids;
>>> + =A0 =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 dc_kref;
>>> + =A0 =A0 =A0 void =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(*dc_free=
_callback)(struct rcu_head *);
>>> + =A0 =A0 =A0 struct nfs_client =A0 =A0 =A0 *dc_clp;
>>> +};
>>> +
>>> +/* Device ID cache node */
>>> +struct nfs4_deviceid {
>>> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0de_node;
>>> + =A0 =A0 =A0 struct rcu_head =A0 =A0 =A0 =A0 de_rcu;
>>> + =A0 =A0 =A0 struct pnfs_deviceid =A0 =A0de_id;
>>> +};
>>> +
>>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 void =
(*free_callback)(struct rcu_head *));
>>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
>>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_device=
id_cache *,
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struc=
t pnfs_deviceid *);
>>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struc=
t nfs4_deviceid *);
>>> +
>>> =A0/* pNFS client callback functions.
>>> =A0* These operations allow the layout driver to access pNFS client
>>> =A0* specific information or call pNFS client->server operations.
>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>>> index 8522461..ef2e18e 100644
>>> --- a/include/linux/nfs_fs_sb.h
>>> +++ b/include/linux/nfs_fs_sb.h
>>> @@ -87,6 +87,7 @@ struct nfs_client {
>>> =A0 =A0 =A0 =A0u32 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 cl_excha=
nge_flags;
>>> =A0 =A0 =A0 =A0struct nfs4_session =A0 =A0 *cl_session; =A0 =A0/* s=
harred session */
>>> =A0 =A0 =A0 =A0struct list_head =A0 =A0 =A0 =A0cl_lo_inodes; =A0 /*=
Inodes having layouts */
>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS d=
eviceid cache */
>>> =A0#endif /* CONFIG_NFS_V4_1 */
>>>
>>> =A0#ifdef CONFIG_NFS_FSCACHE
>>> --
>>> 1.6.6
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-nfs=
" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at =A0http://vger.kernel.org/majordomo-info.htm=
l
>>>
>> _______________________________________________
>> pNFS mailing list
>> pNFS@linux-nfs.org
>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
[not found] ` <l2l89c397151004210822j8b43009o3a9e78ceed901fd9-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2010-04-22 11:20 ` Benny Halevy
2010-04-22 15:47 ` William A. (Andy) Adamson
0 siblings, 1 reply; 12+ messages in thread
From: Benny Halevy @ 2010-04-22 11:20 UTC (permalink / raw)
To: William A. (Andy) Adamson; +Cc: pnfs, linux-nfs
On Apr. 21, 2010, 18:22 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
> On Wed, Apr 21, 2010 at 1:59 AM, Benny Halevy <bhalevy@panasas.com> wrote:
>> On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
>>> On Fri, Apr 16, 2010 at 11:52 AM, <andros@netapp.com> wrote:
>>>> From: Andy Adamson <andros@netapp.com>
>>>>
>>>> A shared RCU device ID cache servicing multiple mounts of a single layout type
>>>> per meta data server (struct nfs_client).
>>>>
>>>> Device IDs of type deviceid4 are required by all layout types, long lived and
>>>> read at each I/O. They are added to the deviceid cache at first reference by
>>>> a layout via GETDEVICEINFO and (currently) are only removed at umount.
>>>>
>>>> Reference count the device ID cache for each mounted file system
>>>> in the initialize_mountpoint layoutdriver_io_operation.
>>>>
>>>> Dereference the device id cache on file system in the uninitialize_mountpoint
>>>> layoutdriver_io_operation called at umount
>>>>
>>>> Signed-off-by: Andy Adamson <andros@netapp.com>
>>>> ---
>>>> fs/nfs/pnfs.c | 119 +++++++++++++++++++++++++++++++++++++++++++++
>>>> include/linux/nfs4_pnfs.h | 27 ++++++++++
>>>> include/linux/nfs_fs_sb.h | 1 +
>>>> 3 files changed, 147 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>>> index 91572aa..8492aef 100644
>>>> --- a/fs/nfs/pnfs.c
>>>> +++ b/fs/nfs/pnfs.c
>>>> @@ -45,6 +45,7 @@
>>>> #include <linux/nfs4.h>
>>>> #include <linux/pnfs_xdr.h>
>>>> #include <linux/nfs4_pnfs.h>
>>>> +#include <linux/rculist.h>
>>>>
>>>> #include "internal.h"
>>>> #include "nfs4_fs.h"
>>>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops = {
>>>>
>>>> EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>>> EXPORT_SYMBOL(pnfs_register_layoutdriver);
>>>> +
>>>> +
>>>> +/* Device ID cache. Supports one layout type per struct nfs_client */
>>>> +int
>>>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>>>> + void (*free_callback)(struct rcu_head *))
>>>> +{
>>>> + struct nfs4_deviceid_cache *c;
>>>> +
>>>> + c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
>>>> + if (!c)
>>>> + return -ENOMEM;
>>>> + spin_lock(&clp->cl_lock);
>>>> + if (clp->cl_devid_cache != NULL) {
>>>> + kref_get(&clp->cl_devid_cache->dc_kref);
>>>> + spin_unlock(&clp->cl_lock);
>>>> + dprintk("%s [kref [%d]]\n", __func__,
>>>> + atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
>>>> + kfree(c);
>>>> + } else {
>>>> + spin_lock_init(&c->dc_lock);
>>>> + INIT_LIST_HEAD(&c->dc_deviceids);
>>>> + kref_init(&c->dc_kref);
>>>> + c->dc_free_callback = free_callback;
>>>> + c->dc_clp = clp;
>>>> + clp->cl_devid_cache = c;
>>>> + spin_unlock(&clp->cl_lock);
>>>> + dprintk("%s [new]\n", __func__);
>>>> + }
>>>> + return 0;
>>>> +}
>>>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>>>> +
>>>> +void
>>>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>>>> +{
>>>> + INIT_LIST_HEAD(&d->de_node);
>>>> + INIT_RCU_HEAD(&d->de_rcu);
>>>> +}
>>>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>>>> +
>>>> +struct nfs4_deviceid *
>>>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
>>>> +{
>>>> + struct nfs4_deviceid *d;
>>>> +
>>>> + rcu_read_lock();
>>>> + list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>> + if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>> + rcu_read_unlock();
>>>> + return d;
>>>> + }
>>>> + }
>>>> + rcu_read_unlock();
>>
>> I hope this is worth the added complexity...
>>
>> Out of curiosity, do you have a benchmark comparing the cost
>> of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?
>
> The deviceid cache is read at each I/O. If we use a spin_lock to
Yeah, I see where this goes...
In the objects layout driver we get a reference on the device structure
at alloc_lseg time and keep a pointer to it throughout the lseg's life time.
This saves the deviceid lookup on every I/O.
Benny
> protect the deviceid cache, this would mean that all I/0 is serialized
> behind the spin_lock even though the deviceid cache is changed
> infrequently. The RCU allows readers to "run almost naked" and does
> not serialize I/O behind reading the deviceid cache.
>
> So I think it is worth it. I have not benchmarked the difference.
>
>
>>
>>>> + return NULL;
>>>> +}
>>>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>>>> +
>>>> +/*
>>>> + * Add or kref_get a deviceid.
>>>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
>>>> + */
>>>> +void
>>>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
>>>> +{
>>>> + struct nfs4_deviceid *d;
>>>> +
>>>> + spin_lock(&c->dc_lock);
>>>> + list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>> + if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>> + spin_unlock(&c->dc_lock);
>>>> + dprintk("%s [discard]\n", __func__);
>>>> + c->dc_free_callback(&new->de_rcu);
>>>> + }
>>>> + }
>>>> + list_add_rcu(&new->de_node, &c->dc_deviceids);
>>>> + spin_unlock(&c->dc_lock);
>>>> + dprintk("%s [new]\n", __func__);
>>>> +}
>>>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>>>> +
>>>> +static int
>>>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>>>> +{
>>>> + struct nfs4_deviceid *d;
>>>> +
>>>> + spin_lock(&c->dc_lock);
>>>> + list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>> + list_del_rcu(&d->de_node);
>>>> + spin_unlock(&c->dc_lock);
>>>> + synchronize_rcu();
>>>> + c->dc_free_callback(&d->de_rcu);
>>>> + return 1;
>>>> + }
>>>> + spin_unlock(&c->dc_lock);
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static void
>>>> +nfs4_free_deviceid_cache(struct kref *kref)
>>>> +{
>>>> + struct nfs4_deviceid_cache *cache =
>>>> + container_of(kref, struct nfs4_deviceid_cache, dc_kref);
>>>> + int more = 1;
>>>> +
>>>> + while (more)
>>>> + more = nfs4_remove_deviceid(cache);
>>>> + cache->dc_clp->cl_devid_cache = NULL;
>>>
>>> Need to take the cl_lock around this assignment
>>>
>>> spin_lock(&cache->dc_clp->cl_lock);
>>> cache->dc_clp->cl_devid_cache = NULL
>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>
>>>
>>
>> That must happen atomically before kref_put.
>> It's illegal to have cl_devid_cache be referenced by cache->dc_clp
>> without a reference count backing it up.
>> Otherwise, if accessed concurrently to this piece of code
>> someone might call kref_get while the refcount is zero.
>>
>> Normally, you'd first clear the referencing pointer to prevent any
>> new reference to it and only then kref_put it, e.g.:
>>
>> spin_lock(&cache->dc_clp->cl_lock);
>> tmp = cache->dc_clp->cl_devid_cache;
>> cache->dc_clp->cl_devid_cache = NULL
>> spin_unlock(&cache->dc_clp->cl_lock);
>> kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
>
> Good point. Thanks for the review. I'll rethink and resend
>
> -->Andy
>
>>
>> Benny
>>
>>>> + kfree(cache);
>>>> +}
>>>> +
>>>> +void
>>>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>>>> +{
>>>> + dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.refcount));
>>>> + kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>>>> +}
>>>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>>>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>>>> index 1d521f4..2a88a06 100644
>>>> --- a/include/linux/nfs4_pnfs.h
>>>> +++ b/include/linux/nfs4_pnfs.h
>>>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>>> struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
>>>> };
>>>>
>>>> +/*
>>>> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
>>>> + */
>>>> +struct nfs4_deviceid_cache {
>>>> + spinlock_t dc_lock;
>>>> + struct list_head dc_deviceids;
>>>> + struct kref dc_kref;
>>>> + void (*dc_free_callback)(struct rcu_head *);
>>>> + struct nfs_client *dc_clp;
>>>> +};
>>>> +
>>>> +/* Device ID cache node */
>>>> +struct nfs4_deviceid {
>>>> + struct list_head de_node;
>>>> + struct rcu_head de_rcu;
>>>> + struct pnfs_deviceid de_id;
>>>> +};
>>>> +
>>>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>>>> + void (*free_callback)(struct rcu_head *));
>>>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
>>>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>>>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
>>>> + struct pnfs_deviceid *);
>>>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>>>> + struct nfs4_deviceid *);
>>>> +
>>>> /* pNFS client callback functions.
>>>> * These operations allow the layout driver to access pNFS client
>>>> * specific information or call pNFS client->server operations.
>>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>>>> index 8522461..ef2e18e 100644
>>>> --- a/include/linux/nfs_fs_sb.h
>>>> +++ b/include/linux/nfs_fs_sb.h
>>>> @@ -87,6 +87,7 @@ struct nfs_client {
>>>> u32 cl_exchange_flags;
>>>> struct nfs4_session *cl_session; /* sharred session */
>>>> struct list_head cl_lo_inodes; /* Inodes having layouts */
>>>> + struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
>>>> #endif /* CONFIG_NFS_V4_1 */
>>>>
>>>> #ifdef CONFIG_NFS_FSCACHE
>>>> --
>>>> 1.6.6
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>>
>>> _______________________________________________
>>> pNFS mailing list
>>> pNFS@linux-nfs.org
>>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
>>
--
Benny Halevy
Software Architect
Panasas, Inc.
bhalevy@panasas.com
Tel/Fax: +972-3-647-8340
Mobile: +972-54-802-8340
Panasas: The Leader in Parallel Storage
www.panasas.com
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
2010-04-22 11:20 ` Benny Halevy
@ 2010-04-22 15:47 ` William A. (Andy) Adamson
[not found] ` <v2h89c397151004220847v3a31c493s4089d0cd53cf3e19-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
0 siblings, 1 reply; 12+ messages in thread
From: William A. (Andy) Adamson @ 2010-04-22 15:47 UTC (permalink / raw)
To: Benny Halevy; +Cc: pnfs, linux-nfs
On Thu, Apr 22, 2010 at 7:20 AM, Benny Halevy <bhalevy@panasas.com> wro=
te:
> On Apr. 21, 2010, 18:22 +0300, "William A. (Andy) Adamson" <androsada=
mson@gmail.com> wrote:
>> On Wed, Apr 21, 2010 at 1:59 AM, Benny Halevy <bhalevy@panasas.com> =
wrote:
>>> On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsa=
damson@gmail.com> wrote:
>>>> On Fri, Apr 16, 2010 at 11:52 AM, =A0<andros@netapp.com> wrote:
>>>>> From: Andy Adamson <andros@netapp.com>
>>>>>
>>>>> A shared RCU device ID cache servicing multiple mounts of a singl=
e layout type
>>>>> per meta data server (struct nfs_client).
>>>>>
>>>>> Device IDs of type deviceid4 are required by all layout types, lo=
ng lived and
>>>>> read at each I/O. =A0They are added to the deviceid cache at firs=
t reference by
>>>>> a layout via GETDEVICEINFO and (currently) are only removed at um=
ount.
>>>>>
>>>>> Reference count the device ID cache for each mounted file system
>>>>> in the initialize_mountpoint layoutdriver_io_operation.
>>>>>
>>>>> Dereference the device id cache on file system in the uninitializ=
e_mountpoint
>>>>> layoutdriver_io_operation called at umount
>>>>>
>>>>> Signed-off-by: Andy Adamson <andros@netapp.com>
>>>>> ---
>>>>> =A0fs/nfs/pnfs.c =A0 =A0 =A0 =A0 =A0 =A0 | =A0119 +++++++++++++++=
++++++++++++++++++++++++++++++
>>>>> =A0include/linux/nfs4_pnfs.h | =A0 27 ++++++++++
>>>>> =A0include/linux/nfs_fs_sb.h | =A0 =A01 +
>>>>> =A03 files changed, 147 insertions(+), 0 deletions(-)
>>>>>
>>>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>>>> index 91572aa..8492aef 100644
>>>>> --- a/fs/nfs/pnfs.c
>>>>> +++ b/fs/nfs/pnfs.c
>>>>> @@ -45,6 +45,7 @@
>>>>> =A0#include <linux/nfs4.h>
>>>>> =A0#include <linux/pnfs_xdr.h>
>>>>> =A0#include <linux/nfs4_pnfs.h>
>>>>> +#include <linux/rculist.h>
>>>>>
>>>>> =A0#include "internal.h"
>>>>> =A0#include "nfs4_fs.h"
>>>>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops =3D=
{
>>>>>
>>>>> =A0EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>>>> =A0EXPORT_SYMBOL(pnfs_register_layoutdriver);
>>>>> +
>>>>> +
>>>>> +/* Device ID cache. Supports one layout type per struct nfs_clie=
nt */
>>>>> +int
>>>>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_call=
back)(struct rcu_head *))
>>>>> +{
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *c;
>>>>> +
>>>>> + =A0 =A0 =A0 c =3D kzalloc(sizeof(struct nfs4_deviceid_cache), G=
=46P_KERNEL);
>>>>> + =A0 =A0 =A0 if (!c)
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return -ENOMEM;
>>>>> + =A0 =A0 =A0 spin_lock(&clp->cl_lock);
>>>>> + =A0 =A0 =A0 if (clp->cl_devid_cache !=3D NULL) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_get(&clp->cl_devid_cache->dc_k=
ref);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [kref [%d]]\n", __func_=
_,
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&clp->c=
l_devid_cache->dc_kref.refcount));
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kfree(c);
>>>>> + =A0 =A0 =A0 } else {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_lock_init(&c->dc_lock);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 INIT_LIST_HEAD(&c->dc_deviceids);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_init(&c->dc_kref);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback =3D free_callba=
ck;
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_clp =3D clp;
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D c;
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
>>>>> + =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 return 0;
>>>>> +}
>>>>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>>>>> +
>>>>> +void
>>>>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>>>>> +{
>>>>> + =A0 =A0 =A0 INIT_LIST_HEAD(&d->de_node);
>>>>> + =A0 =A0 =A0 INIT_RCU_HEAD(&d->de_rcu);
>>>>> +}
>>>>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>>>>> +
>>>>> +struct nfs4_deviceid *
>>>>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_de=
viceid *id)
>>>>> +{
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>>>> +
>>>>> + =A0 =A0 =A0 rcu_read_lock();
>>>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_nod=
e) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, id, NFS4_PNF=
S_DEVICEID4_SIZE)) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 rcu_read_unlock();
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 rcu_read_unlock();
>>>
>>> I hope this is worth the added complexity...
>>>
>>> Out of curiosity, do you have a benchmark comparing the cost
>>> of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?
>>
>> The deviceid cache is read at each I/O. If we use a spin_lock to
>
> Yeah, I see where this goes...
> In the objects layout driver we get a reference on the device structu=
re
> at alloc_lseg time and keep a pointer to it throughout the lseg's lif=
e time.
> This saves the deviceid lookup on every I/O.
Perhaps that is a better way to go. How many deviceid's is 'normal'
for the object driver?
-->Andy
>
> Benny
>
>> protect the deviceid cache, this would mean that all I/0 is serializ=
ed
>> behind the spin_lock even though the deviceid cache is changed
>> infrequently. The RCU allows readers to "run almost naked" and does
>> not serialize I/O behind reading the deviceid cache.
>>
>> So I think it is worth it. I have not benchmarked the difference.
>>
>>
>>>
>>>>> + =A0 =A0 =A0 return NULL;
>>>>> +}
>>>>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>>>>> +
>>>>> +/*
>>>>> + * Add or kref_get a deviceid.
>>>>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is fou=
nd, discard new
>>>>> + */
>>>>> +void
>>>>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_dev=
iceid *new)
>>>>> +{
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>>>> +
>>>>> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
>>>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_nod=
e) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, &new->de_id,=
NFS4_PNFS_DEVICEID4_SIZE)) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_=
lock);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [discar=
d]\n", __func__);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback=
(&new->de_rcu);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 list_add_rcu(&new->de_node, &c->dc_deviceids);
>>>>> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>>>> + =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
>>>>> +}
>>>>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>>>>> +
>>>>> +static int
>>>>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>>>>> +{
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>>>> +
>>>>> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
>>>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_nod=
e) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 list_del_rcu(&d->de_node);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 synchronize_rcu();
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&d->de_rcu);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return 1;
>>>>> + =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>>>> + =A0 =A0 =A0 return 0;
>>>>> +}
>>>>> +
>>>>> +static void
>>>>> +nfs4_free_deviceid_cache(struct kref *kref)
>>>>> +{
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cache =3D
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 container_of(kref, struct nfs4_devi=
ceid_cache, dc_kref);
>>>>> + =A0 =A0 =A0 int more =3D 1;
>>>>> +
>>>>> + =A0 =A0 =A0 while (more)
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 more =3D nfs4_remove_deviceid(cache=
);
>>>>> + =A0 =A0 =A0 cache->dc_clp->cl_devid_cache =3D NULL;
>>>>
>>>> Need to take the cl_lock around this assignment
>>>>
>>>> spin_lock(&cache->dc_clp->cl_lock);
>>>> cache->dc_clp->cl_devid_cache =3D NULL
>>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>>
>>>>
>>>
>>> That must happen atomically before kref_put.
>>> It's illegal to have cl_devid_cache be referenced by cache->dc_clp
>>> without a reference count backing it up.
>>> Otherwise, if accessed concurrently to this piece of code
>>> someone might call kref_get while the refcount is zero.
>>>
>>> Normally, you'd first clear the referencing pointer to prevent any
>>> new reference to it and only then kref_put it, e.g.:
>>>
>>> spin_lock(&cache->dc_clp->cl_lock);
>>> tmp =3D cache->dc_clp->cl_devid_cache;
>>> cache->dc_clp->cl_devid_cache =3D NULL
>>> spin_unlock(&cache->dc_clp->cl_lock);
>>> kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
>>
>> Good point. Thanks for the review. I'll rethink and resend
>>
>> -->Andy
>>
>>>
>>> Benny
>>>
>>>>> + =A0 =A0 =A0 kfree(cache);
>>>>> +}
>>>>> +
>>>>> +void
>>>>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>>>>> +{
>>>>> + =A0 =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_k=
ref.refcount));
>>>>> + =A0 =A0 =A0 kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>>>>> +}
>>>>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>>>>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.=
h
>>>>> index 1d521f4..2a88a06 100644
>>>>> --- a/include/linux/nfs4_pnfs.h
>>>>> +++ b/include/linux/nfs4_pnfs.h
>>>>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>>>> =A0 =A0 =A0 =A0struct pnfs_deviceid =A0 =A0dev_id[NFS4_PNFS_GETDE=
VLIST_MAXNUM];
>>>>> =A0};
>>>>>
>>>>> +/*
>>>>> + * Device ID RCU cache. A device ID is unique per client ID and =
layout type.
>>>>> + */
>>>>> +struct nfs4_deviceid_cache {
>>>>> + =A0 =A0 =A0 spinlock_t =A0 =A0 =A0 =A0 =A0 =A0 =A0dc_lock;
>>>>> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0dc_deviceids;
>>>>> + =A0 =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 dc_kref;
>>>>> + =A0 =A0 =A0 void =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(*dc_fr=
ee_callback)(struct rcu_head *);
>>>>> + =A0 =A0 =A0 struct nfs_client =A0 =A0 =A0 *dc_clp;
>>>>> +};
>>>>> +
>>>>> +/* Device ID cache node */
>>>>> +struct nfs4_deviceid {
>>>>> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0de_node;
>>>>> + =A0 =A0 =A0 struct rcu_head =A0 =A0 =A0 =A0 de_rcu;
>>>>> + =A0 =A0 =A0 struct pnfs_deviceid =A0 =A0de_id;
>>>>> +};
>>>>> +
>>>>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 voi=
d (*free_callback)(struct rcu_head *));
>>>>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *=
);
>>>>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>>>>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_devi=
ceid_cache *,
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 str=
uct pnfs_deviceid *);
>>>>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 str=
uct nfs4_deviceid *);
>>>>> +
>>>>> =A0/* pNFS client callback functions.
>>>>> =A0* These operations allow the layout driver to access pNFS clie=
nt
>>>>> =A0* specific information or call pNFS client->server operations.
>>>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.=
h
>>>>> index 8522461..ef2e18e 100644
>>>>> --- a/include/linux/nfs_fs_sb.h
>>>>> +++ b/include/linux/nfs_fs_sb.h
>>>>> @@ -87,6 +87,7 @@ struct nfs_client {
>>>>> =A0 =A0 =A0 =A0u32 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 cl_exc=
hange_flags;
>>>>> =A0 =A0 =A0 =A0struct nfs4_session =A0 =A0 *cl_session; =A0 =A0/*=
sharred session */
>>>>> =A0 =A0 =A0 =A0struct list_head =A0 =A0 =A0 =A0cl_lo_inodes; =A0 =
/* Inodes having layouts */
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS=
deviceid cache */
>>>>> =A0#endif /* CONFIG_NFS_V4_1 */
>>>>>
>>>>> =A0#ifdef CONFIG_NFS_FSCACHE
>>>>> --
>>>>> 1.6.6
>>>>>
>>>>> --
>>>>> To unsubscribe from this list: send the line "unsubscribe linux-n=
fs" in
>>>>> the body of a message to majordomo@vger.kernel.org
>>>>> More majordomo info at =A0http://vger.kernel.org/majordomo-info.h=
tml
>>>>>
>>>> _______________________________________________
>>>> pNFS mailing list
>>>> pNFS@linux-nfs.org
>>>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
>>>
>
>
> --
> Benny Halevy
> Software Architect
> Panasas, Inc.
> bhalevy@panasas.com
> Tel/Fax: +972-3-647-8340
> Mobile: +972-54-802-8340
>
> Panasas: The Leader in Parallel Storage
> www.panasas.com
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
[not found] ` <v2h89c397151004220847v3a31c493s4089d0cd53cf3e19-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2010-04-22 15:51 ` Benny Halevy
0 siblings, 0 replies; 12+ messages in thread
From: Benny Halevy @ 2010-04-22 15:51 UTC (permalink / raw)
To: William A. (Andy) Adamson; +Cc: pnfs, linux-nfs
On Apr. 22, 2010, 18:47 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
> On Thu, Apr 22, 2010 at 7:20 AM, Benny Halevy <bhalevy@panasas.com> wrote:
>> On Apr. 21, 2010, 18:22 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
>>> On Wed, Apr 21, 2010 at 1:59 AM, Benny Halevy <bhalevy@panasas.com> wrote:
>>>> On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
>>>>> On Fri, Apr 16, 2010 at 11:52 AM, <andros@netapp.com> wrote:
>>>>>> From: Andy Adamson <andros@netapp.com>
>>>>>>
>>>>>> A shared RCU device ID cache servicing multiple mounts of a single layout type
>>>>>> per meta data server (struct nfs_client).
>>>>>>
>>>>>> Device IDs of type deviceid4 are required by all layout types, long lived and
>>>>>> read at each I/O. They are added to the deviceid cache at first reference by
>>>>>> a layout via GETDEVICEINFO and (currently) are only removed at umount.
>>>>>>
>>>>>> Reference count the device ID cache for each mounted file system
>>>>>> in the initialize_mountpoint layoutdriver_io_operation.
>>>>>>
>>>>>> Dereference the device id cache on file system in the uninitialize_mountpoint
>>>>>> layoutdriver_io_operation called at umount
>>>>>>
>>>>>> Signed-off-by: Andy Adamson <andros@netapp.com>
>>>>>> ---
>>>>>> fs/nfs/pnfs.c | 119 +++++++++++++++++++++++++++++++++++++++++++++
>>>>>> include/linux/nfs4_pnfs.h | 27 ++++++++++
>>>>>> include/linux/nfs_fs_sb.h | 1 +
>>>>>> 3 files changed, 147 insertions(+), 0 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>>>>> index 91572aa..8492aef 100644
>>>>>> --- a/fs/nfs/pnfs.c
>>>>>> +++ b/fs/nfs/pnfs.c
>>>>>> @@ -45,6 +45,7 @@
>>>>>> #include <linux/nfs4.h>
>>>>>> #include <linux/pnfs_xdr.h>
>>>>>> #include <linux/nfs4_pnfs.h>
>>>>>> +#include <linux/rculist.h>
>>>>>>
>>>>>> #include "internal.h"
>>>>>> #include "nfs4_fs.h"
>>>>>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops = {
>>>>>>
>>>>>> EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>>>>> EXPORT_SYMBOL(pnfs_register_layoutdriver);
>>>>>> +
>>>>>> +
>>>>>> +/* Device ID cache. Supports one layout type per struct nfs_client */
>>>>>> +int
>>>>>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>>>>>> + void (*free_callback)(struct rcu_head *))
>>>>>> +{
>>>>>> + struct nfs4_deviceid_cache *c;
>>>>>> +
>>>>>> + c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
>>>>>> + if (!c)
>>>>>> + return -ENOMEM;
>>>>>> + spin_lock(&clp->cl_lock);
>>>>>> + if (clp->cl_devid_cache != NULL) {
>>>>>> + kref_get(&clp->cl_devid_cache->dc_kref);
>>>>>> + spin_unlock(&clp->cl_lock);
>>>>>> + dprintk("%s [kref [%d]]\n", __func__,
>>>>>> + atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
>>>>>> + kfree(c);
>>>>>> + } else {
>>>>>> + spin_lock_init(&c->dc_lock);
>>>>>> + INIT_LIST_HEAD(&c->dc_deviceids);
>>>>>> + kref_init(&c->dc_kref);
>>>>>> + c->dc_free_callback = free_callback;
>>>>>> + c->dc_clp = clp;
>>>>>> + clp->cl_devid_cache = c;
>>>>>> + spin_unlock(&clp->cl_lock);
>>>>>> + dprintk("%s [new]\n", __func__);
>>>>>> + }
>>>>>> + return 0;
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>>>>>> +
>>>>>> +void
>>>>>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>>>>>> +{
>>>>>> + INIT_LIST_HEAD(&d->de_node);
>>>>>> + INIT_RCU_HEAD(&d->de_rcu);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>>>>>> +
>>>>>> +struct nfs4_deviceid *
>>>>>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
>>>>>> +{
>>>>>> + struct nfs4_deviceid *d;
>>>>>> +
>>>>>> + rcu_read_lock();
>>>>>> + list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>>>> + if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>>>> + rcu_read_unlock();
>>>>>> + return d;
>>>>>> + }
>>>>>> + }
>>>>>> + rcu_read_unlock();
>>>>
>>>> I hope this is worth the added complexity...
>>>>
>>>> Out of curiosity, do you have a benchmark comparing the cost
>>>> of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?
>>>
>>> The deviceid cache is read at each I/O. If we use a spin_lock to
>>
>> Yeah, I see where this goes...
>> In the objects layout driver we get a reference on the device structure
>> at alloc_lseg time and keep a pointer to it throughout the lseg's life time.
>> This saves the deviceid lookup on every I/O.
>
> Perhaps that is a better way to go. How many deviceid's is 'normal'
> for the object driver?
>
I'd say that order of 10's to a few 100's is normal.
1000 and up is a big installation.
Benny
> -->Andy
>
>
>>
>> Benny
>>
>>> protect the deviceid cache, this would mean that all I/0 is serialized
>>> behind the spin_lock even though the deviceid cache is changed
>>> infrequently. The RCU allows readers to "run almost naked" and does
>>> not serialize I/O behind reading the deviceid cache.
>>>
>>> So I think it is worth it. I have not benchmarked the difference.
>>>
>>>
>>>>
>>>>>> + return NULL;
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>>>>>> +
>>>>>> +/*
>>>>>> + * Add or kref_get a deviceid.
>>>>>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
>>>>>> + */
>>>>>> +void
>>>>>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
>>>>>> +{
>>>>>> + struct nfs4_deviceid *d;
>>>>>> +
>>>>>> + spin_lock(&c->dc_lock);
>>>>>> + list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>>>> + if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>>>> + spin_unlock(&c->dc_lock);
>>>>>> + dprintk("%s [discard]\n", __func__);
>>>>>> + c->dc_free_callback(&new->de_rcu);
>>>>>> + }
>>>>>> + }
>>>>>> + list_add_rcu(&new->de_node, &c->dc_deviceids);
>>>>>> + spin_unlock(&c->dc_lock);
>>>>>> + dprintk("%s [new]\n", __func__);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>>>>>> +
>>>>>> +static int
>>>>>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>>>>>> +{
>>>>>> + struct nfs4_deviceid *d;
>>>>>> +
>>>>>> + spin_lock(&c->dc_lock);
>>>>>> + list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>>>> + list_del_rcu(&d->de_node);
>>>>>> + spin_unlock(&c->dc_lock);
>>>>>> + synchronize_rcu();
>>>>>> + c->dc_free_callback(&d->de_rcu);
>>>>>> + return 1;
>>>>>> + }
>>>>>> + spin_unlock(&c->dc_lock);
>>>>>> + return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static void
>>>>>> +nfs4_free_deviceid_cache(struct kref *kref)
>>>>>> +{
>>>>>> + struct nfs4_deviceid_cache *cache =
>>>>>> + container_of(kref, struct nfs4_deviceid_cache, dc_kref);
>>>>>> + int more = 1;
>>>>>> +
>>>>>> + while (more)
>>>>>> + more = nfs4_remove_deviceid(cache);
>>>>>> + cache->dc_clp->cl_devid_cache = NULL;
>>>>>
>>>>> Need to take the cl_lock around this assignment
>>>>>
>>>>> spin_lock(&cache->dc_clp->cl_lock);
>>>>> cache->dc_clp->cl_devid_cache = NULL
>>>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>>>
>>>>>
>>>>
>>>> That must happen atomically before kref_put.
>>>> It's illegal to have cl_devid_cache be referenced by cache->dc_clp
>>>> without a reference count backing it up.
>>>> Otherwise, if accessed concurrently to this piece of code
>>>> someone might call kref_get while the refcount is zero.
>>>>
>>>> Normally, you'd first clear the referencing pointer to prevent any
>>>> new reference to it and only then kref_put it, e.g.:
>>>>
>>>> spin_lock(&cache->dc_clp->cl_lock);
>>>> tmp = cache->dc_clp->cl_devid_cache;
>>>> cache->dc_clp->cl_devid_cache = NULL
>>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>> kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
>>>
>>> Good point. Thanks for the review. I'll rethink and resend
>>>
>>> -->Andy
>>>
>>>>
>>>> Benny
>>>>
>>>>>> + kfree(cache);
>>>>>> +}
>>>>>> +
>>>>>> +void
>>>>>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>>>>>> +{
>>>>>> + dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.refcount));
>>>>>> + kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>>>>>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>>>>>> index 1d521f4..2a88a06 100644
>>>>>> --- a/include/linux/nfs4_pnfs.h
>>>>>> +++ b/include/linux/nfs4_pnfs.h
>>>>>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>>>>> struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
>>>>>> };
>>>>>>
>>>>>> +/*
>>>>>> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
>>>>>> + */
>>>>>> +struct nfs4_deviceid_cache {
>>>>>> + spinlock_t dc_lock;
>>>>>> + struct list_head dc_deviceids;
>>>>>> + struct kref dc_kref;
>>>>>> + void (*dc_free_callback)(struct rcu_head *);
>>>>>> + struct nfs_client *dc_clp;
>>>>>> +};
>>>>>> +
>>>>>> +/* Device ID cache node */
>>>>>> +struct nfs4_deviceid {
>>>>>> + struct list_head de_node;
>>>>>> + struct rcu_head de_rcu;
>>>>>> + struct pnfs_deviceid de_id;
>>>>>> +};
>>>>>> +
>>>>>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>>>>>> + void (*free_callback)(struct rcu_head *));
>>>>>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
>>>>>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>>>>>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
>>>>>> + struct pnfs_deviceid *);
>>>>>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>>>>>> + struct nfs4_deviceid *);
>>>>>> +
>>>>>> /* pNFS client callback functions.
>>>>>> * These operations allow the layout driver to access pNFS client
>>>>>> * specific information or call pNFS client->server operations.
>>>>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>>>>>> index 8522461..ef2e18e 100644
>>>>>> --- a/include/linux/nfs_fs_sb.h
>>>>>> +++ b/include/linux/nfs_fs_sb.h
>>>>>> @@ -87,6 +87,7 @@ struct nfs_client {
>>>>>> u32 cl_exchange_flags;
>>>>>> struct nfs4_session *cl_session; /* sharred session */
>>>>>> struct list_head cl_lo_inodes; /* Inodes having layouts */
>>>>>> + struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
>>>>>> #endif /* CONFIG_NFS_V4_1 */
>>>>>>
>>>>>> #ifdef CONFIG_NFS_FSCACHE
>>>>>> --
>>>>>> 1.6.6
>>>>>>
>>>>>> --
>>>>>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>>>>
>>>>> _______________________________________________
>>>>> pNFS mailing list
>>>>> pNFS@linux-nfs.org
>>>>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
>>>>
>>
>>
>> --
>> Benny Halevy
>> Software Architect
>> Panasas, Inc.
>> bhalevy@panasas.com
>> Tel/Fax: +972-3-647-8340
>> Mobile: +972-54-802-8340
>>
>> Panasas: The Leader in Parallel Storage
>> www.panasas.com
>>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
2010-04-26 16:18 ` [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache andros
@ 2010-05-03 11:48 ` Benny Halevy
2010-05-03 13:57 ` William A. (Andy) Adamson
0 siblings, 1 reply; 12+ messages in thread
From: Benny Halevy @ 2010-05-03 11:48 UTC (permalink / raw)
To: andros; +Cc: linux-nfs
On Apr. 26, 2010, 19:18 +0300, andros@netapp.com wrote:
> From: Andy Adamson <andros@netapp.com>
>
> A shared RCU device ID cache servicing multiple mounts of a single layout type
> per meta data server (struct nfs_client).
>
> Device IDs of type deviceid4 are required by all layout types, long lived and
> read at each I/O. They are added to the deviceid cache at first reference by
> a layout via GETDEVICEINFO and (currently) are only removed at umount.
>
> Reference count the device ID cache for each mounted file system
> in the initialize_mountpoint layoutdriver_io_operation.
>
> Dereference the device id cache on file system in the uninitialize_mountpoint
> layoutdriver_io_operation called at umount
>
> Each layoutsegment assigns a pointer and takes a reference to the
> nfs4_deviceid structure identified by the layout deviceid.
> This is so that there are no deviceid lookups for the normal I/O path.
>
> Even thought required by all layouttypes, the deviceid is not exposed in the
> LAYOUTGET4res but is instead hidden in the opaque layouttype4.
>
> Therefore, each layout type alloc_lseg calls nfs4_set_layout_deviceid,
> and free_lseg calls nfs4_unset_layout_deviceid.
>
> While the file layout driver will not cache very many deviceid's, the object
> and block layout drivers could cache 100's for a large installation.
> Use an hlist.
>
> Signed-off-by: Andy Adamson <andros@netapp.com>
> ---
> fs/nfs/pnfs.c | 167 +++++++++++++++++++++++++++++++++++++++++++++
> include/linux/nfs4_pnfs.h | 50 +++++++++++++
> include/linux/nfs_fs_sb.h | 1 +
> 3 files changed, 218 insertions(+), 0 deletions(-)
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 91572aa..bf906cc 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -45,6 +45,7 @@
> #include <linux/nfs4.h>
> #include <linux/pnfs_xdr.h>
> #include <linux/nfs4_pnfs.h>
> +#include <linux/rculist.h>
>
> #include "internal.h"
> #include "nfs4_fs.h"
> @@ -2296,3 +2297,169 @@ struct pnfs_client_operations pnfs_ops = {
>
> EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
> EXPORT_SYMBOL(pnfs_register_layoutdriver);
> +
> +
> +/* Device ID cache. Supports one layout type per struct nfs_client */
> +int
> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
> + void (*free_callback)(struct kref *))
> +{
> + struct nfs4_deviceid_cache *c;
> +
> + c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
> + if (!c)
> + return -ENOMEM;
> + spin_lock(&clp->cl_lock);
> + if (clp->cl_devid_cache != NULL) {
> + kref_get(&clp->cl_devid_cache->dc_kref);
> + spin_unlock(&clp->cl_lock);
> + dprintk("%s [kref [%d]]\n", __func__,
> + atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
> + kfree(c);
> + } else {
> + int i;
> +
> + spin_lock_init(&c->dc_lock);
> + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE ; i++)
> + INIT_HLIST_HEAD(&c->dc_deviceids[i]);
> + kref_init(&c->dc_kref);
> + c->dc_free_callback = free_callback;
> + clp->cl_devid_cache = c;
> + spin_unlock(&clp->cl_lock);
> + dprintk("%s [new]\n", __func__);
> + }
> + return 0;
> +}
> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
> +
> +void
> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
> +{
> + INIT_HLIST_NODE(&d->de_node);
> + kref_init(&d->de_kref);
> +}
> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
> +
> +/* Called from layoutdriver_io_operations->alloc_lseg */
> +void
> +nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4_deviceid *d)
> +{
> + dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount));
> + l->deviceid = d;
> + kref_get(&d->de_kref);
> +}
> +EXPORT_SYMBOL(nfs4_set_layout_deviceid);
> +
> +/* Called from layoutdriver_io_operations->free_lseg */
> +void
> +nfs4_unset_layout_deviceid(struct pnfs_layout_segment *l,
> + struct nfs4_deviceid *d,
> + void (*free_callback)(struct kref *))
> +{
> + dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount));
> + l->deviceid = NULL;
> + kref_put(&d->de_kref, free_callback);
> +}
> +EXPORT_SYMBOL(nfs4_unset_layout_deviceid);
> +
> +struct nfs4_deviceid *
> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
> +{
> + struct nfs4_deviceid *d;
> + struct hlist_node *n;
> + long hash = nfs4_deviceid_hash(id);
> +
> + dprintk("--> %s hash %ld\n", __func__, hash);
> + rcu_read_lock();
> + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
> + if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
> + rcu_read_unlock();
> + return d;
> + }
> + }
> + rcu_read_unlock();
> + return NULL;
> +}
> +EXPORT_SYMBOL(nfs4_find_deviceid);
> +
> +/*
> + * Add or kref_get a deviceid.
> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
> + */
> +struct nfs4_deviceid *
> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
> +{
> + struct nfs4_deviceid *d;
> + struct hlist_node *n;
> + long hash = nfs4_deviceid_hash(&new->de_id);
> +
> + dprintk("--> %s hash %ld\n", __func__, hash);
> + spin_lock(&c->dc_lock);
> + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
> + if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
> + spin_unlock(&c->dc_lock);
> + dprintk("%s [discard]\n", __func__);
> + c->dc_free_callback(&new->de_kref);
> + return d;
> + }
> + }
> + hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
> + spin_unlock(&c->dc_lock);
> + dprintk("%s [new]\n", __func__);
> + return new;
> +}
> +EXPORT_SYMBOL(nfs4_add_deviceid);
> +
> +static int
> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash)
> +{
> + struct nfs4_deviceid *d;
> + struct hlist_node *n;
> +
> + dprintk("--> %s hash %ld\n", __func__, hash);
> + spin_lock(&c->dc_lock);
> + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
> + hlist_del_rcu(&d->de_node);
> + spin_unlock(&c->dc_lock);
> + synchronize_rcu();
> + dprintk("%s [%d]\n", __func__,
> + atomic_read(&d->de_kref.refcount));
> + kref_put(&d->de_kref, c->dc_free_callback);
> + return 1;
> + }
> + spin_unlock(&c->dc_lock);
> + return 0;
> +}
> +
> +static void
> +nfs4_free_deviceid_cache(struct kref *kref)
> +{
> + struct nfs4_deviceid_cache *cache =
> + container_of(kref, struct nfs4_deviceid_cache, dc_kref);
> + int more;
> + long i;
> +
> + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) {
> + more = 1;
> + while (more)
> + more = nfs4_remove_deviceid(cache, i);
Andy, this can be simplified to
while (nfs4_remove_deviceid(cache, i))
;
If ok with you, I'll make this change upon merging.
Benny
> + }
> + kfree(cache);
> +}
> +
> +void
> +nfs4_put_deviceid_cache(struct nfs_client *clp)
> +{
> + struct nfs4_deviceid_cache *tmp = clp->cl_devid_cache;
> + int refcount;
> +
> + dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
> + spin_lock(&clp->cl_lock);
> + refcount = atomic_read(&clp->cl_devid_cache->dc_kref.refcount);
> + if (refcount == 1)
> + clp->cl_devid_cache = NULL;
> + spin_unlock(&clp->cl_lock);
> + dprintk("%s [%d]\n", __func__, refcount);
> + kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
> +}
> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
> index 3caac60..3b7aeb7 100644
> --- a/include/linux/nfs4_pnfs.h
> +++ b/include/linux/nfs4_pnfs.h
> @@ -106,6 +106,7 @@ struct pnfs_layout_segment {
> struct kref kref;
> bool valid;
> struct pnfs_layout_type *layout;
> + struct nfs4_deviceid *deviceid;
> u8 ld_data[]; /* layout driver private data */
> };
>
> @@ -275,6 +276,55 @@ struct pnfs_devicelist {
> struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
> };
>
> +/*
> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
> + */
> +#define NFS4_DEVICE_ID_HASH_BITS 5
> +#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
> +#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
> +
> +static inline u32
> +nfs4_deviceid_hash(struct pnfs_deviceid *id)
> +{
> + unsigned char *cptr = (unsigned char *)id->data;
> + unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE;
> + u32 x = 0;
> +
> + while (nbytes--) {
> + x *= 37;
> + x += *cptr++;
> + }
> + return x & NFS4_DEVICE_ID_HASH_MASK;
> +}
> +
> +struct nfs4_deviceid_cache {
> + spinlock_t dc_lock;
> + struct kref dc_kref;
> + void (*dc_free_callback)(struct kref *);
> + struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
> +};
> +
> +/* Device ID cache node */
> +struct nfs4_deviceid {
> + struct hlist_node de_node;
> + struct pnfs_deviceid de_id;
> + struct kref de_kref;
> +};
> +
> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
> + void (*free_callback)(struct kref *));
> +extern void nfs4_put_deviceid_cache(struct nfs_client *);
> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
> + struct pnfs_deviceid *);
> +extern struct nfs4_deviceid *nfs4_add_deviceid(struct nfs4_deviceid_cache *,
> + struct nfs4_deviceid *);
> +extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *,
> + struct nfs4_deviceid *);
> +extern void nfs4_unset_layout_deviceid(struct pnfs_layout_segment *,
> + struct nfs4_deviceid *,
> + void (*free_callback)(struct kref *));
> +
> /* pNFS client callback functions.
> * These operations allow the layout driver to access pNFS client
> * specific information or call pNFS client->server operations.
> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> index 8522461..ef2e18e 100644
> --- a/include/linux/nfs_fs_sb.h
> +++ b/include/linux/nfs_fs_sb.h
> @@ -87,6 +87,7 @@ struct nfs_client {
> u32 cl_exchange_flags;
> struct nfs4_session *cl_session; /* sharred session */
> struct list_head cl_lo_inodes; /* Inodes having layouts */
> + struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
> #endif /* CONFIG_NFS_V4_1 */
>
> #ifdef CONFIG_NFS_FSCACHE
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
2010-05-03 11:48 ` [pnfs] " Benny Halevy
@ 2010-05-03 13:57 ` William A. (Andy) Adamson
0 siblings, 0 replies; 12+ messages in thread
From: William A. (Andy) Adamson @ 2010-05-03 13:57 UTC (permalink / raw)
To: Benny Halevy; +Cc: linux-nfs
On Mon, May 3, 2010 at 7:48 AM, Benny Halevy <bhalevy@panasas.com> wrot=
e:
> On Apr. 26, 2010, 19:18 +0300, andros@netapp.com wrote:
>> From: Andy Adamson <andros@netapp.com>
>>
>> A shared RCU device ID cache servicing multiple mounts of a single l=
ayout type
>> per meta data server (struct nfs_client).
>>
>> Device IDs of type deviceid4 are required by all layout types, long =
lived and
>> read at each I/O. =A0They are added to the deviceid cache at first r=
eference by
>> a layout via GETDEVICEINFO and (currently) are only removed at umoun=
t.
>>
>> Reference count the device ID cache for each mounted file system
>> in the initialize_mountpoint layoutdriver_io_operation.
>>
>> Dereference the device id cache on file system in the uninitialize_m=
ountpoint
>> layoutdriver_io_operation called at umount
>>
>> Each layoutsegment assigns a pointer and takes a reference to the
>> nfs4_deviceid structure identified by the layout deviceid.
>> This is so that there are no deviceid lookups for the normal I/O pat=
h.
>>
>> Even thought required by all layouttypes, the deviceid is not expose=
d in the
>> LAYOUTGET4res but is instead hidden in the opaque layouttype4.
>>
>> Therefore, each layout type alloc_lseg calls nfs4_set_layout_devicei=
d,
>> and free_lseg calls nfs4_unset_layout_deviceid.
>>
>> While the file layout driver will not cache very many deviceid's, th=
e object
>> and block layout drivers could cache 100's for a large installation.
>> Use an hlist.
>>
>> Signed-off-by: Andy Adamson <andros@netapp.com>
>> ---
>> =A0fs/nfs/pnfs.c =A0 =A0 =A0 =A0 =A0 =A0 | =A0167 ++++++++++++++++++=
+++++++++++++++++++++++++++
>> =A0include/linux/nfs4_pnfs.h | =A0 50 +++++++++++++
>> =A0include/linux/nfs_fs_sb.h | =A0 =A01 +
>> =A03 files changed, 218 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 91572aa..bf906cc 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -45,6 +45,7 @@
>> =A0#include <linux/nfs4.h>
>> =A0#include <linux/pnfs_xdr.h>
>> =A0#include <linux/nfs4_pnfs.h>
>> +#include <linux/rculist.h>
>>
>> =A0#include "internal.h"
>> =A0#include "nfs4_fs.h"
>> @@ -2296,3 +2297,169 @@ struct pnfs_client_operations pnfs_ops =3D {
>>
>> =A0EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>> =A0EXPORT_SYMBOL(pnfs_register_layoutdriver);
>> +
>> +
>> +/* Device ID cache. Supports one layout type per struct nfs_client =
*/
>> +int
>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_callback)(s=
truct kref *))
>> +{
>> + =A0 =A0 struct nfs4_deviceid_cache *c;
>> +
>> + =A0 =A0 c =3D kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERN=
EL);
>> + =A0 =A0 if (!c)
>> + =A0 =A0 =A0 =A0 =A0 =A0 return -ENOMEM;
>> + =A0 =A0 spin_lock(&clp->cl_lock);
>> + =A0 =A0 if (clp->cl_devid_cache !=3D NULL) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 kref_get(&clp->cl_devid_cache->dc_kref);
>> + =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>> + =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [kref [%d]]\n", __func__,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&clp->cl_devid=
_cache->dc_kref.refcount));
>> + =A0 =A0 =A0 =A0 =A0 =A0 kfree(c);
>> + =A0 =A0 } else {
>> + =A0 =A0 =A0 =A0 =A0 =A0 int i;
>> +
>> + =A0 =A0 =A0 =A0 =A0 =A0 spin_lock_init(&c->dc_lock);
>> + =A0 =A0 =A0 =A0 =A0 =A0 for (i =3D 0; i < NFS4_DEVICE_ID_HASH_SIZE=
; i++)
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 INIT_HLIST_HEAD(&c->dc_dev=
iceids[i]);
>> + =A0 =A0 =A0 =A0 =A0 =A0 kref_init(&c->dc_kref);
>> + =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback =3D free_callback;
>> + =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D c;
>> + =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>> + =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
>> + =A0 =A0 }
>> + =A0 =A0 return 0;
>> +}
>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>> +
>> +void
>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>> +{
>> + =A0 =A0 INIT_HLIST_NODE(&d->de_node);
>> + =A0 =A0 kref_init(&d->de_kref);
>> +}
>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>> +
>> +/* Called from layoutdriver_io_operations->alloc_lseg */
>> +void
>> +nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4=
_deviceid *d)
>> +{
>> + =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.ref=
count));
>> + =A0 =A0 l->deviceid =3D d;
>> + =A0 =A0 kref_get(&d->de_kref);
>> +}
>> +EXPORT_SYMBOL(nfs4_set_layout_deviceid);
>> +
>> +/* Called from layoutdriver_io_operations->free_lseg */
>> +void
>> +nfs4_unset_layout_deviceid(struct pnfs_layout_segment *l,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0struct nfs4_devicei=
d *d,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_callbac=
k)(struct kref *))
>> +{
>> + =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.ref=
count));
>> + =A0 =A0 l->deviceid =3D NULL;
>> + =A0 =A0 kref_put(&d->de_kref, free_callback);
>> +}
>> +EXPORT_SYMBOL(nfs4_unset_layout_deviceid);
>> +
>> +struct nfs4_deviceid *
>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_devic=
eid *id)
>> +{
>> + =A0 =A0 struct nfs4_deviceid *d;
>> + =A0 =A0 struct hlist_node *n;
>> + =A0 =A0 long hash =3D nfs4_deviceid_hash(id);
>> +
>> + =A0 =A0 dprintk("--> %s hash %ld\n", __func__, hash);
>> + =A0 =A0 rcu_read_lock();
>> + =A0 =A0 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_=
node) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVIC=
EID4_SIZE)) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 rcu_read_unlock();
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
>> + =A0 =A0 =A0 =A0 =A0 =A0 }
>> + =A0 =A0 }
>> + =A0 =A0 rcu_read_unlock();
>> + =A0 =A0 return NULL;
>> +}
>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>> +
>> +/*
>> + * Add or kref_get a deviceid.
>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found,=
discard new
>> + */
>> +struct nfs4_deviceid *
>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_device=
id *new)
>> +{
>> + =A0 =A0 struct nfs4_deviceid *d;
>> + =A0 =A0 struct hlist_node *n;
>> + =A0 =A0 long hash =3D nfs4_deviceid_hash(&new->de_id);
>> +
>> + =A0 =A0 dprintk("--> %s hash %ld\n", __func__, hash);
>> + =A0 =A0 spin_lock(&c->dc_lock);
>> + =A0 =A0 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_=
node) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, &new->de_id, NFS4_P=
NFS_DEVICEID4_SIZE)) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [discard]\n", =
__func__);
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&new->=
de_kref);
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
>> + =A0 =A0 =A0 =A0 =A0 =A0 }
>> + =A0 =A0 }
>> + =A0 =A0 hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
>> + =A0 =A0 spin_unlock(&c->dc_lock);
>> + =A0 =A0 dprintk("%s [new]\n", __func__);
>> + =A0 =A0 return new;
>> +}
>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>> +
>> +static int
>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash)
>> +{
>> + =A0 =A0 struct nfs4_deviceid *d;
>> + =A0 =A0 struct hlist_node *n;
>> +
>> + =A0 =A0 dprintk("--> %s hash %ld\n", __func__, hash);
>> + =A0 =A0 spin_lock(&c->dc_lock);
>> + =A0 =A0 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_=
node) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 hlist_del_rcu(&d->de_node);
>> + =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>> + =A0 =A0 =A0 =A0 =A0 =A0 synchronize_rcu();
>> + =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [%d]\n", __func__,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&d->de_kref.re=
fcount));
>> + =A0 =A0 =A0 =A0 =A0 =A0 kref_put(&d->de_kref, c->dc_free_callback)=
;
>> + =A0 =A0 =A0 =A0 =A0 =A0 return 1;
>> + =A0 =A0 }
>> + =A0 =A0 spin_unlock(&c->dc_lock);
>> + =A0 =A0 return 0;
>> +}
>> +
>> +static void
>> +nfs4_free_deviceid_cache(struct kref *kref)
>> +{
>> + =A0 =A0 struct nfs4_deviceid_cache *cache =3D
>> + =A0 =A0 =A0 =A0 =A0 =A0 container_of(kref, struct nfs4_deviceid_ca=
che, dc_kref);
>> + =A0 =A0 int more;
>> + =A0 =A0 long i;
>> +
>> + =A0 =A0 for (i =3D 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 more =3D 1;
>> + =A0 =A0 =A0 =A0 =A0 =A0 while (more)
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 more =3D nfs4_remove_devic=
eid(cache, i);
>
> Andy, this can be simplified to
>
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0while (nfs4_remove_deviceid(cache, i))
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0;
>
> If ok with you, I'll make this change upon merging.
Yes - looks fine, thanks.
-->Andy
>
> Benny
>
>> + =A0 =A0 }
>> + =A0 =A0 kfree(cache);
>> +}
>> +
>> +void
>> +nfs4_put_deviceid_cache(struct nfs_client *clp)
>> +{
>> + =A0 =A0 struct nfs4_deviceid_cache *tmp =3D clp->cl_devid_cache;
>> + =A0 =A0 int refcount;
>> +
>> + =A0 =A0 dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_de=
vid_cache);
>> + =A0 =A0 spin_lock(&clp->cl_lock);
>> + =A0 =A0 refcount =3D atomic_read(&clp->cl_devid_cache->dc_kref.ref=
count);
>> + =A0 =A0 if (refcount =3D=3D 1)
>> + =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D NULL;
>> + =A0 =A0 spin_unlock(&clp->cl_lock);
>> + =A0 =A0 dprintk("%s [%d]\n", __func__, refcount);
>> + =A0 =A0 kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
>> +}
>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>> index 3caac60..3b7aeb7 100644
>> --- a/include/linux/nfs4_pnfs.h
>> +++ b/include/linux/nfs4_pnfs.h
>> @@ -106,6 +106,7 @@ struct pnfs_layout_segment {
>> =A0 =A0 =A0 struct kref kref;
>> =A0 =A0 =A0 bool valid;
>> =A0 =A0 =A0 struct pnfs_layout_type *layout;
>> + =A0 =A0 struct nfs4_deviceid *deviceid;
>> =A0 =A0 =A0 u8 ld_data[]; =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 /* lay=
out driver private data */
>> =A0};
>>
>> @@ -275,6 +276,55 @@ struct pnfs_devicelist {
>> =A0 =A0 =A0 struct pnfs_deviceid =A0 =A0dev_id[NFS4_PNFS_GETDEVLIST_=
MAXNUM];
>> =A0};
>>
>> +/*
>> + * Device ID RCU cache. A device ID is unique per client ID and lay=
out type.
>> + */
>> +#define NFS4_DEVICE_ID_HASH_BITS =A0 =A0 5
>> +#define NFS4_DEVICE_ID_HASH_SIZE =A0 =A0 (1 << NFS4_DEVICE_ID_HASH_=
BITS)
>> +#define NFS4_DEVICE_ID_HASH_MASK =A0 =A0 (NFS4_DEVICE_ID_HASH_SIZE =
- 1)
>> +
>> +static inline u32
>> +nfs4_deviceid_hash(struct pnfs_deviceid *id)
>> +{
>> + =A0 =A0 unsigned char *cptr =3D (unsigned char *)id->data;
>> + =A0 =A0 unsigned int nbytes =3D NFS4_PNFS_DEVICEID4_SIZE;
>> + =A0 =A0 u32 x =3D 0;
>> +
>> + =A0 =A0 while (nbytes--) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 x *=3D 37;
>> + =A0 =A0 =A0 =A0 =A0 =A0 x +=3D *cptr++;
>> + =A0 =A0 }
>> + =A0 =A0 return x & NFS4_DEVICE_ID_HASH_MASK;
>> +}
>> +
>> +struct nfs4_deviceid_cache {
>> + =A0 =A0 spinlock_t =A0 =A0 =A0 =A0 =A0 =A0 =A0dc_lock;
>> + =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 dc_kref;
>> + =A0 =A0 void =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(*dc_free_call=
back)(struct kref *);
>> + =A0 =A0 struct hlist_head =A0 =A0 =A0 dc_deviceids[NFS4_DEVICE_ID_=
HASH_SIZE];
>> +};
>> +
>> +/* Device ID cache node */
>> +struct nfs4_deviceid {
>> + =A0 =A0 struct hlist_node =A0 =A0 =A0 de_node;
>> + =A0 =A0 struct pnfs_deviceid =A0 =A0de_id;
>> + =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 de_kref;
>> +};
>> +
>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 void (*fre=
e_callback)(struct kref *));
>> +extern void nfs4_put_deviceid_cache(struct nfs_client *);
>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_devicei=
d_cache *,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct pnf=
s_deviceid *);
>> +extern struct nfs4_deviceid *nfs4_add_deviceid(struct nfs4_deviceid=
_cache *,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct nfs=
4_deviceid *);
>> +extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct nfs=
4_deviceid *);
>> +extern void nfs4_unset_layout_deviceid(struct pnfs_layout_segment *=
,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct nfs=
4_deviceid *,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 void (*fre=
e_callback)(struct kref *));
>> +
>> =A0/* pNFS client callback functions.
>> =A0 * These operations allow the layout driver to access pNFS client
>> =A0 * specific information or call pNFS client->server operations.
>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>> index 8522461..ef2e18e 100644
>> --- a/include/linux/nfs_fs_sb.h
>> +++ b/include/linux/nfs_fs_sb.h
>> @@ -87,6 +87,7 @@ struct nfs_client {
>> =A0 =A0 =A0 u32 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 cl_exchange_=
flags;
>> =A0 =A0 =A0 struct nfs4_session =A0 =A0 *cl_session; =A0 =A0/* sharr=
ed session */
>> =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0cl_lo_inodes; =A0 /* Ino=
des having layouts */
>> + =A0 =A0 struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS device=
id cache */
>> =A0#endif /* CONFIG_NFS_V4_1 */
>>
>> =A0#ifdef CONFIG_NFS_FSCACHE
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" =
in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at =A0http://vger.kernel.org/majordomo-info.html
>
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2010-05-03 13:57 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-04-16 15:52 [PATCH 0/3] pNFS generic device ID cache andros
2010-04-16 15:52 ` [PATCH 1/3] SQUASHME pnfs_submit: " andros
2010-04-16 15:52 ` [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver andros
2010-04-16 15:52 ` [PATCH 3/3] SQUASHME pnfs-submit: file layout driver generic device ID cache andros
2010-04-16 16:04 ` [PATCH 1/3] SQUASHME pnfs_submit: " William A. (Andy) Adamson
[not found] ` <u2n89c397151004160904m9e862360xcaf0e187640b0177-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2010-04-21 5:59 ` [pnfs] " Benny Halevy
2010-04-21 15:22 ` William A. (Andy) Adamson
[not found] ` <l2l89c397151004210822j8b43009o3a9e78ceed901fd9-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2010-04-22 11:20 ` Benny Halevy
2010-04-22 15:47 ` William A. (Andy) Adamson
[not found] ` <v2h89c397151004220847v3a31c493s4089d0cd53cf3e19-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2010-04-22 15:51 ` Benny Halevy
-- strict thread matches above, loose matches on Subject: below --
2010-04-26 16:18 [PATCH 0/3] pNFS generic device ID cache version 3 andros
2010-04-26 16:18 ` [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache andros
2010-05-03 11:48 ` [pnfs] " Benny Halevy
2010-05-03 13:57 ` William A. (Andy) Adamson
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).