From: Greg Banks <gnb@sgi.com>
To: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Linux NFS ML <linux-nfs@vger.kernel.org>
Subject: [patch 17/29] knfsd: make the reply cache SMP-friendly
Date: Wed, 01 Apr 2009 07:28:17 +1100 [thread overview]
Message-ID: <20090331202944.005574000@sgi.com> (raw)
In-Reply-To: 20090331202800.739621000@sgi.com
Make the reply cache scale better on large multiprocessor NFS servers,
by splitting the global lock into multiple locks, one in each hash
bucket. To avoid introducing a new global contention point, the
global LRU list of reply cache entries is split into multiple LRU
lists, one per hash bucket.
Signed-off-by: Greg Banks <gnb@sgi.com>
---
fs/nfsd/nfscache.c | 166 ++++++++++++++++++++++++++++++------------
1 file changed, 121 insertions(+), 45 deletions(-)
Index: bfields/fs/nfsd/nfscache.c
===================================================================
--- bfields.orig/fs/nfsd/nfscache.c
+++ bfields/fs/nfsd/nfscache.c
@@ -8,6 +8,9 @@
* it does things a bit differently.
*
* Copyright (C) 1995, 1996 Olaf Kirch <okir-pn4DOG8n3UYbFoVRYvo4fw@public.gmane.org>
+ *
+ * SMP lock splitting by Greg Banks <gnb@sgi.com>
+ * Copyright (c) 2005-2009 Silicon Graphics, Inc.
*/
#include <linux/kernel.h>
@@ -27,11 +30,43 @@
* Solaris2: 1024
* DEC Unix: 512-4096
*/
-#define CACHESIZE 1024
-#define HASHSIZE 64
+/* number of buckets used to manage LRU lists and cache locks (power of 2) */
+#ifdef CONFIG_SMP
+#define CACHE_NUM_BUCKETS 64
+#else
+#define CACHE_NUM_BUCKETS 1
+#endif
+/* number of entries in all LRU lists (power of 2) */
+#define CACHE_SIZE (1024)
+/* largest possible number of entries in LRU per bucket */
+#define CACHE_BUCKET_MAX_SIZE (CACHE_SIZE/CACHE_NUM_BUCKETS)
+/* log2 of largest desired hash chain length */
+#define MAX_CHAIN_ORDER 2
+/* initial and maximum size of the per-bucket hash table */
+#define HASHSIZE ((CACHE_SIZE>>MAX_CHAIN_ORDER)/CACHE_NUM_BUCKETS)
+
+/*
+ * locking for the reply cache:
+ * A cache entry is "single use" if c_state == RC_INPROG
+ * Otherwise, when accessing _prev or _next, the lock for the
+ * appropriate bucket must be held.
+ *
+ * On uniprocessor systems, we have a single global lock and LRU
+ * list. However on multiprocessor systems, to reduce contention
+ * on that spinlock under heavy nonidempotent load (think chmod -R)
+ * we use multiple buckets. The lower few bits of the hash index
+ * are used to index the buckets.
+ */
+struct svc_cache_bucket
+{
+ spinlock_t lock;
+ struct list_head lru;
+ unsigned int size;
+ struct hlist_head *hash;
+} ____cacheline_aligned_in_smp;
-static struct hlist_head * cache_hash;
-static struct list_head lru_head;
+static struct svc_cache_bucket cache_buckets[CACHE_NUM_BUCKETS];
+#define bucket_for_hash(hash) (&cache_buckets[(hash) & (CACHE_NUM_BUCKETS-1)])
static int cache_disabled = 1;
/*
@@ -49,39 +84,70 @@ static inline u32 request_hash(u32 xid,
h ^= (xid >> 24);
h ^= ((xid & 0xff0000) >> 8);
h ^= sin->sin_addr.s_addr;
- return h & (HASHSIZE-1);
+ return h;
}
static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
/*
- * locking for the reply cache:
- * A cache entry is "single use" if c_state == RC_INPROG
- * Otherwise, it when accessing _prev or _next, the lock must be held.
+ * Initialise the reply cache data structures.
+ * Called without the bucket lock held; takes it internally. Returns
+ * 0 on success, an error otherwise.
*/
-static DEFINE_SPINLOCK(cache_lock);
-
-int nfsd_reply_cache_init(void)
+static int nfsd_cache_bucket_init(struct svc_cache_bucket *b, unsigned int num)
{
- struct svc_cacherep *rp;
- int i;
+ struct svc_cacherep *rp;
+ unsigned int i;
+ LIST_HEAD(lru);
- INIT_LIST_HEAD(&lru_head);
- i = CACHESIZE;
+ /* allocate new entries without the lock, keep them on their own list */
+ i = num;
while (i) {
rp = kmalloc(sizeof(*rp), GFP_KERNEL);
if (!rp)
goto out_nomem;
- list_add(&rp->c_lru, &lru_head);
+ list_add(&rp->c_lru, &lru);
rp->c_state = RC_UNUSED;
rp->c_type = RC_NOCACHE;
INIT_HLIST_NODE(&rp->c_hash);
i--;
}
- cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
- if (!cache_hash)
- goto out_nomem;
+ /* add the new entries */
+ spin_lock(&b->lock);
+
+ b->size = num;
+ list_splice(&lru, &b->lru);
+
+ spin_unlock(&b->lock);
+ return 0;
+
+out_nomem:
+ /* free any entries we've allocated but not spliced in */
+ while (!list_empty(&lru)) {
+ rp = list_entry(lru.next, struct svc_cacherep, c_lru);
+ list_del(&rp->c_lru);
+ kfree (rp);
+ }
+ return -ENOMEM;
+}
+
+int nfsd_reply_cache_init(void)
+{
+ unsigned int bucket;
+
+ for (bucket = 0 ; bucket < CACHE_NUM_BUCKETS ; bucket++)
+ {
+ struct svc_cache_bucket *b = &cache_buckets[bucket];
+
+ INIT_LIST_HEAD(&b->lru);
+ spin_lock_init(&b->lock);
+ if (nfsd_cache_bucket_init(b, CACHE_SIZE/CACHE_NUM_BUCKETS))
+ goto out_nomem;
+ b->hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
+ if (!b->hash)
+ goto out_nomem;
+ }
cache_disabled = 0;
return 0;
@@ -94,28 +160,32 @@ out_nomem:
void nfsd_reply_cache_shutdown(void)
{
struct svc_cacherep *rp;
+ unsigned int bucket;
- while (!list_empty(&lru_head)) {
- rp = list_entry(lru_head.next, struct svc_cacherep, c_lru);
- if (rp->c_state == RC_DONE && rp->c_type == RC_REPLBUFF)
- kfree(rp->c_replvec.iov_base);
- list_del(&rp->c_lru);
- kfree(rp);
+ for (bucket = 0 ; bucket < CACHE_NUM_BUCKETS ; bucket++)
+ {
+ struct svc_cache_bucket *b = &cache_buckets[bucket];
+
+ while (!list_empty(&b->lru)) {
+ rp = list_entry(b->lru.next, struct svc_cacherep, c_lru);
+ if (rp->c_state == RC_DONE && rp->c_type == RC_REPLBUFF)
+ kfree(rp->c_replvec.iov_base);
+ list_del(&rp->c_lru);
+ kfree(rp);
+ }
+ kfree (b->hash);
+ b->hash = NULL;
}
cache_disabled = 1;
-
- kfree (cache_hash);
- cache_hash = NULL;
}
/*
* Move cache entry to end of LRU list
*/
-static void
-lru_put_end(struct svc_cacherep *rp)
+static inline void lru_put_end(struct svc_cache_bucket *b, struct svc_cacherep *rp)
{
- list_move_tail(&rp->c_lru, &lru_head);
+ list_move_tail(&rp->c_lru, &b->lru);
}
/*
@@ -134,8 +204,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
vers = rqstp->rq_vers,
proc = rqstp->rq_proc,
h;
+ struct svc_cache_bucket *b;
unsigned long age;
- int rtn;
+ int rtn;
rqstp->rq_cacherep = NULL;
if (cache_disabled || type == RC_NOCACHE) {
@@ -143,11 +214,13 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
return RC_DOIT;
}
h = request_hash(xid, svc_addr_in(rqstp));
+ b = bucket_for_hash(h);
+ h = (h / CACHE_NUM_BUCKETS) & (HASHSIZE-1);
- spin_lock(&cache_lock);
+ spin_lock(&b->lock);
rtn = RC_DOIT;
- rh = &cache_hash[h];
+ rh = &b->hash[h];
hlist_for_each_entry(rp, hn, rh, c_hash) {
if (rp->c_state != RC_UNUSED &&
xid == rp->c_xid && proc == rp->c_proc &&
@@ -163,10 +236,10 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
/* This loop shouldn't take more than a few iterations normally */
{
int safe = 0;
- list_for_each_entry(rp, &lru_head, c_lru) {
+ list_for_each_entry(rp, &b->lru, c_lru) {
if (rp->c_state != RC_INPROG)
break;
- if (safe++ > CACHESIZE) {
+ if (safe++ > b->size) {
printk("nfsd: loop in repcache LRU list\n");
cache_disabled = 1;
goto out;
@@ -175,7 +248,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
}
/* All entries on the LRU are in-progress. This should not happen */
- if (&rp->c_lru == &lru_head) {
+ if (&rp->c_lru == &b->lru) {
static int complaints;
printk(KERN_WARNING "nfsd: all repcache entries locked!\n");
@@ -197,7 +270,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
/* Move the cache entry from one hash list to another */
hlist_del_init(&rp->c_hash);
- hlist_add_head(&rp->c_hash, cache_hash + h);
+ hlist_add_head(&rp->c_hash, b->hash + h);
/* release any buffer */
if (rp->c_type == RC_REPLBUFF) {
@@ -206,14 +279,14 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
}
rp->c_type = RC_NOCACHE;
out:
- spin_unlock(&cache_lock);
+ spin_unlock(&b->lock);
return rtn;
found_entry:
/* We found a matching entry which is either in progress or done. */
age = jiffies - rp->c_timestamp;
rp->c_timestamp = jiffies;
- lru_put_end(rp);
+ lru_put_end(b, rp);
rtn = RC_DROPIT;
/* Request being processed or excessive rexmits */
@@ -269,10 +342,13 @@ nfsd_cache_update(struct svc_rqst *rqstp
struct svc_cacherep *rp;
struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
int len;
+ struct svc_cache_bucket *b;
if (!(rp = rqstp->rq_cacherep) || cache_disabled)
return;
+ b = bucket_for_hash(request_hash(rp->c_xid, &rp->c_addr));
+
len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
len >>= 2;
@@ -292,22 +368,22 @@ nfsd_cache_update(struct svc_rqst *rqstp
cachv = &rp->c_replvec;
cachv->iov_base = kmalloc(len << 2, GFP_KERNEL);
if (!cachv->iov_base) {
- spin_lock(&cache_lock);
+ spin_lock(&b->lock);
rp->c_state = RC_UNUSED;
- spin_unlock(&cache_lock);
+ spin_unlock(&b->lock);
return;
}
cachv->iov_len = len << 2;
memcpy(cachv->iov_base, statp, len << 2);
break;
}
- spin_lock(&cache_lock);
- lru_put_end(rp);
+ spin_lock(&b->lock);
+ lru_put_end(b, rp);
rp->c_secure = rqstp->rq_secure;
rp->c_type = cachetype;
rp->c_state = RC_DONE;
rp->c_timestamp = jiffies;
- spin_unlock(&cache_lock);
+ spin_unlock(&b->lock);
return;
}
--
Greg
next prev parent reply other threads:[~2009-03-31 21:02 UTC|newest]
Thread overview: 63+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
2009-03-31 20:28 ` [patch 01/29] knfsd: Add infrastructure for measuring RPC service times Greg Banks
2009-04-25 2:13 ` J. Bruce Fields
2009-04-25 2:14 ` J. Bruce Fields
2009-04-25 2:52 ` Greg Banks
2009-03-31 20:28 ` [patch 02/29] knfsd: Add stats table infrastructure Greg Banks
2009-04-25 3:56 ` J. Bruce Fields
2009-04-26 4:12 ` Greg Banks
2009-03-31 20:28 ` [patch 03/29] knfsd: add userspace controls for stats tables Greg Banks
2009-04-25 21:57 ` J. Bruce Fields
2009-04-25 22:03 ` J. Bruce Fields
2009-04-27 16:06 ` Chuck Lever
2009-04-27 23:22 ` J. Bruce Fields
2009-04-28 15:37 ` Chuck Lever
2009-04-28 15:57 ` J. Bruce Fields
2009-04-28 16:03 ` Chuck Lever
2009-04-28 16:26 ` J. Bruce Fields
2009-04-29 1:45 ` Greg Banks
[not found] ` <ac442c870904271827w6041a67ew82fe36a843beeac3@mail.gmail.com>
[not found] ` <ac442c870904271827w6041a67ew82fe36a843beeac3-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2009-04-28 1:31 ` Greg Banks
2009-04-26 4:14 ` Greg Banks
2009-03-31 20:28 ` [patch 04/29] knfsd: Add stats updating API Greg Banks
2009-03-31 20:28 ` [patch 05/29] knfsd: Infrastructure for providing stats to userspace Greg Banks
2009-04-01 0:28 ` J. Bruce Fields
2009-04-01 3:43 ` Greg Banks
2009-03-31 20:28 ` [patch 06/29] knfsd: Gather per-export stats Greg Banks
2009-03-31 20:28 ` [patch 07/29] knfsd: Prefetch the per-export stats entry Greg Banks
2009-03-31 20:28 ` [patch 08/29] knfsd: Gather per-client stats Greg Banks
2009-03-31 20:28 ` [patch 09/29] knfsd: Cache per-client stats entry on TCP transports Greg Banks
2009-03-31 20:28 ` [patch 10/29] knfsd: Update per-client & per-export stats from NFSv3 Greg Banks
2009-03-31 20:28 ` [patch 11/29] knfsd: Update per-client & per-export stats from NFSv2 Greg Banks
2009-03-31 20:28 ` [patch 12/29] knfsd: Update per-client & per-export stats from NFSv4 Greg Banks
2009-03-31 20:28 ` [patch 13/29] knfsd: reply cache cleanups Greg Banks
2009-05-12 19:54 ` J. Bruce Fields
2009-03-31 20:28 ` [patch 14/29] knfsd: better hashing in the reply cache Greg Banks
2009-05-08 22:01 ` J. Bruce Fields
2009-03-31 20:28 ` [patch 15/29] knfsd: fix reply cache memory corruption Greg Banks
2009-05-12 19:55 ` J. Bruce Fields
2009-03-31 20:28 ` [patch 16/29] knfsd: use client IPv4 address in reply cache hash Greg Banks
2009-05-11 21:48 ` J. Bruce Fields
2009-03-31 20:28 ` Greg Banks [this message]
2009-03-31 20:28 ` [patch 18/29] knfsd: dynamically expand the reply cache Greg Banks
2009-05-26 18:57 ` J. Bruce Fields
2009-05-26 19:04 ` J. Bruce Fields
2009-05-26 21:24 ` Rob Gardner
2009-05-26 21:52 ` J. Bruce Fields
2009-05-27 0:28 ` Greg Banks
2009-03-31 20:28 ` [patch 19/29] knfsd: faster probing in " Greg Banks
2009-03-31 20:28 ` [patch 20/29] knfsd: add extended reply cache stats Greg Banks
2009-03-31 20:28 ` [patch 21/29] knfsd: remove unreported filehandle stats counters Greg Banks
2009-05-12 20:00 ` J. Bruce Fields
2009-03-31 20:28 ` [patch 22/29] knfsd: make svc_authenticate() scale Greg Banks
2009-05-12 21:24 ` J. Bruce Fields
2009-03-31 20:28 ` [patch 23/29] knfsd: introduce SVC_INC_STAT Greg Banks
2009-03-31 20:28 ` [patch 24/29] knfsd: remove the program field from struct svc_stat Greg Banks
2009-03-31 20:28 ` [patch 25/29] knfsd: allocate svc_serv.sv_stats dynamically Greg Banks
2009-03-31 20:28 ` [patch 26/29] knfsd: make svc_serv.sv_stats per-CPU Greg Banks
2009-03-31 20:28 ` [patch 27/29] knfsd: move hot procedure count field out of svc_procedure Greg Banks
2009-03-31 20:28 ` [patch 28/29] knfsd: introduce NFSD_INC_STAT() Greg Banks
2009-03-31 20:28 ` [patch 29/29] knfsd: make nfsdstats per-CPU Greg Banks
2009-04-01 0:23 ` [patch 00/29] SGI enhancedNFS patches J. Bruce Fields
2009-04-01 3:32 ` Greg Banks
[not found] ` <ac442c870903312032t34630c6dvdbb644cb510f8079-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2009-04-01 6:34 ` Jeff Garzik
2009-04-01 6:41 ` Greg Banks
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20090331202944.005574000@sgi.com \
--to=gnb@sgi.com \
--cc=bfields@fieldses.org \
--cc=linux-nfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.