From: Trond Myklebust <trond.myklebust@fys.uio.no>
To: Greg Banks <gnb@melbourne.sgi.com>
Cc: Neil Brown <neilb@suse.de>,
Linux NFS Mailing List <nfs@lists.sourceforge.net>
Subject: Re: [PATCH 010 of 11] knfsd: make pools numa aware
Date: Tue, 25 Jul 2006 08:43:13 -0400 [thread overview]
Message-ID: <1153831393.5660.13.camel@localhost> (raw)
In-Reply-To: <1153804618.21040.25.camel@hole.melbourne.sgi.com>
On Tue, 2006-07-25 at 15:16 +1000, Greg Banks wrote:
> knfsd: Actually implement multiple pools. On NUMA machines, allocate
> a svc_pool per NUMA node; on SMP a svc_pool per CPU; otherwise a single
> global pool. Enqueue sockets on the svc_pool corresponding to the CPU
> on which the socket bh is run (i.e. the NIC interrupt CPU). Threads
> have their cpu mask set to limit them to the CPUs in the svc_pool that
> owns them.
>
> This is the patch that allows an Altix to scale NFS traffic linearly
> beyond 4 CPUs and 4 NICs.
>
> Signed-off-by: Greg Banks <gnb@melbourne.sgi.com>
> ---
>
> include/linux/sunrpc/svc.h | 62 +++++++++++
> net/sunrpc/svc.c | 184 +++++++++++++++++++++++++++++++++-
> net/sunrpc/svcsock.c | 7 +
> 3 files changed, 251 insertions(+), 2 deletions(-)
>
> Index: linus-git/net/sunrpc/svc.c
> ===================================================================
> --- linus-git.orig/net/sunrpc/svc.c 2006-07-24 22:16:36.157203063 +1000
> +++ linus-git/net/sunrpc/svc.c 2006-07-24 22:54:13.557820093 +1000
> @@ -4,6 +4,10 @@
> * High-level RPC service routines
> *
> * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
> + *
> + * Multiple threads pools and NUMAisation
> + * Copyright (c) 2006 Silicon Graphics, Inc.
> + * by Greg Banks <gnb@melbourne.sgi.com>
> */
>
> #include <linux/linkage.h>
> @@ -24,6 +28,161 @@
> #define RPCDBG_FACILITY RPCDBG_SVCDSP
> #define RPC_PARANOIA 1
>
> +
> +#if SVC_HAVE_MULTIPLE_POOLS
> +
> +struct svc_pool_map svc_pool_map = { .mode = -1, .init = 0 };
> +
> +/*
> + * Build the global map of cpus to pools and vice versa.
> + */
> +static unsigned int
> +svc_pool_map_init(void)
> +{
> + struct svc_pool_map *m = &svc_pool_map;
> + unsigned int node;
> + unsigned int cpu;
> + unsigned int pidx = 0;
> + unsigned int maxpools;
> +
> + if (m->init)
> + return m->npools;
> + m->init = 1;
> +
> + if (m->mode < 0) {
> + /*
> + * Detect best pool mapping mode heuristically.
> + */
> + m->mode = 0; /* default: one global pool */
> +#ifdef CONFIG_NUMA
^^^^^^^^^^^^^^^^^^ Growl...
Perhaps a helper function to hide the ifdef.
> + if (num_online_nodes() > 1) {
> + /*
> + * Actually have multiple NUMA nodes,
> + * so split pools on NUMA node boundaries
> + */
> + m->mode = 2;
> + } else {
> + node = any_online_node(node_online_map);
> + if (nr_cpus_node(node) > 2) {
> + /*
> + * Apparently we're running with CONFIG_NUMA
> + * on non-NUMA hardware, e.g. with a generic
> + * x86_64 kernel on Xeons. In this case we
> + * want to divide the pools on cpu boundaries.
> + */
> + m->mode = 1;
> + }
> + }
> +#else
> + if (num_online_cpus() > 1) {
> + /*
> + * Plain SMP with multiple CPUs online.
> + */
> + m->mode = 1;
> + }
> +#endif
> + }
> +
> + switch (m->mode) {
> + case 0:
> +fallback:
> + m->mode = 0;
> + m->npools = 1;
> + printk("nfsd: initialising 1 global pool\n");
^^^^ ho hum....
Please keep sunrpc and nfsd separate. Also, this should probably be a
dprintk() in order to avoid spamming the syslogs.
> + break;
> +
> + case 1:
> + maxpools = num_possible_cpus();
> + m->cpu_to_pool = kcalloc(maxpools, sizeof(unsigned int),
> + GFP_KERNEL);
> + if (!m->cpu_to_pool)
> + goto fallback;
> + m->pool_to_cpu = kcalloc(maxpools, sizeof(unsigned int),
> + GFP_KERNEL);
> + if (!m->pool_to_cpu) {
> + kfree(m->cpu_to_pool);
> + goto fallback;
> + }
> + for_each_online_cpu(cpu) {
> + BUG_ON(pidx > maxpools);
> + m->cpu_to_pool[cpu] = pidx;
> + m->pool_to_cpu[pidx] = cpu;
> + pidx++;
> + }
> + /* cpus brought online later all get mapped to pool0, sorry */
> + m->npools = pidx;
> +
> + printk("nfsd: initialising %u pools, one per cpu\n", m->npools);
^^^^
> + break;
> +
> +#ifdef CONFIG_NUMA
^^^^^^^^^^^^^^^^^^^ See above
> + case 2:
> + maxpools = num_possible_nodes();
> + m->node_to_pool = kcalloc(maxpools, sizeof(unsigned int),
> + GFP_KERNEL);
> + if (!m->node_to_pool)
> + goto fallback;
> + m->pool_to_node = kcalloc(maxpools, sizeof(unsigned int),
> + GFP_KERNEL);
> + if (!m->pool_to_node) {
> + kfree(m->node_to_pool);
> + goto fallback;
> + }
> + for_each_node_with_cpus(node) {
> + /* some architectures (e.g. SN2) have cpuless nodes */
> + BUG_ON(pidx > maxpools);
> + m->node_to_pool[node] = pidx;
> + m->pool_to_node[pidx] = node;
> + pidx++;
> + }
> + /* nodes brought online later all get mapped to pool0, sorry */
> + m->npools = pidx;
> +
> + printk("nfsd: initialising %u pools, one per numa node\n", m->npools);
^^^^
> + break;
> +#endif /* CONFIG_NUMA */
> + }
> +
> + return m->npools;
> +}
> +
> +/*
> + * Set the current thread's cpus_allowed mask so that it
> + * will only run on cpus in the given pool.
> + *
> + * Returns 1 and fills in oldmask iff a cpumask was applied.
> + */
> +static int
> +svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
> +{
> + struct svc_pool_map *m = &svc_pool_map;
> + unsigned int node;
> + unsigned int cpu;
> +
> + BUG_ON(!m->init);
> +
> + switch (m->mode)
> + {
> + default:
> + case 0:
> + return 0;
> + case 1:
> + cpu = m->pool_to_cpu[pidx];
> + *oldmask = current->cpus_allowed;
> + set_cpus_allowed(current, cpumask_of_cpu(cpu));
> + return 1;
> +#ifdef CONFIG_NUMA
^^^^^^^^^^^^^^^^^ See above
> + case 2:
> + node = m->pool_to_node[pidx];
> + *oldmask = current->cpus_allowed;
> + set_cpus_allowed(current, node_to_cpumask(node));
> + return 1;
> +#endif /* CONFIG_NUMA */
> + }
> +}
> +
> +#endif /* SVC_HAVE_MULTIPLE_POOLS */
> +
> /*
> * Create an RPC service
> */
> @@ -101,8 +260,13 @@ svc_create_pooled(struct svc_program *pr
> svc_thread_fn func, int sig, struct module *mod)
> {
> struct svc_serv *serv;
> + unsigned int npools = 1;
>
> - serv = __svc_create(prog, bufsize, /*npools*/1);
> +#if SVC_HAVE_MULTIPLE_POOLS
No...
#ifndef SVC_HAVE_MULTIPLE_POOLS
static inline svc_pool_map_init(void)
{
return 0;
}
#else
.....
#endif
> + npools = svc_pool_map_init();
> +#endif
> +
> + serv = __svc_create(prog, bufsize, npools);
>
> if (serv != NULL) {
> serv->sv_function = func;
> @@ -202,12 +366,18 @@ svc_release_buffer(struct svc_rqst *rqst
>
> /*
> * Create a thread in the given pool. Caller must hold BKL.
> + * On a NUMA or SMP machine, with a multi-pool serv, the thread
> + * will be restricted to run on the cpus belonging to the pool.
> */
> static int
> __svc_create_thread(svc_thread_fn func, struct svc_serv *serv, struct svc_pool *pool)
> {
> struct svc_rqst *rqstp;
> int error = -ENOMEM;
> +#if SVC_HAVE_MULTIPLE_POOLS
> + int have_oldmask = 0;
> + cpumask_t oldmask;
> +#endif
>
> rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
> if (!rqstp)
> @@ -227,7 +397,19 @@ __svc_create_thread(svc_thread_fn func,
> spin_unlock_bh(&pool->sp_lock);
> rqstp->rq_server = serv;
> rqstp->rq_pool = pool;
> +
> +#if SVC_HAVE_MULTIPLE_POOLS
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
See above. Setting have_oldmask to zero in the case where
SVC_HAVE_MULTIPLE_POOLS should work fine, and will be optimised away by
the compiler.
> + if (serv->sv_nrpools > 1)
> + have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
> +#endif
> +
> error = kernel_thread((int (*)(void *)) func, rqstp, 0);
> +
> +#if SVC_HAVE_MULTIPLE_POOLS
> + if (have_oldmask)
> + set_cpus_allowed(current, oldmask);
> +#endif
> +
> if (error < 0)
> goto out_thread;
> svc_sock_update_bufs(serv);
> Index: linus-git/net/sunrpc/svcsock.c
> ===================================================================
> --- linus-git.orig/net/sunrpc/svcsock.c 2006-07-24 20:44:46.911435470 +1000
> +++ linus-git/net/sunrpc/svcsock.c 2006-07-24 22:45:23.263878219 +1000
> @@ -150,8 +150,9 @@ static void
> svc_sock_enqueue(struct svc_sock *svsk)
> {
> struct svc_serv *serv = svsk->sk_server;
> - struct svc_pool *pool = &serv->sv_pools[0];
> + struct svc_pool *pool;
> struct svc_rqst *rqstp;
> + int cpu;
>
> if (!(svsk->sk_flags &
> ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
> @@ -159,6 +160,10 @@ svc_sock_enqueue(struct svc_sock *svsk)
> if (test_bit(SK_DEAD, &svsk->sk_flags))
> return;
>
> + cpu = get_cpu();
> + pool = svc_pool_for_cpu(svsk->sk_server, cpu);
> + put_cpu();
> +
> spin_lock_bh(&pool->sp_lock);
>
> if (!list_empty(&pool->sp_threads) &&
> Index: linus-git/include/linux/sunrpc/svc.h
> ===================================================================
> --- linus-git.orig/include/linux/sunrpc/svc.h 2006-07-24 22:16:36.041218126 +1000
> +++ linus-git/include/linux/sunrpc/svc.h 2006-07-24 22:45:23.347867112 +1000
> @@ -41,6 +41,39 @@ struct svc_pool {
> struct list_head sp_all_threads; /* all server threads */
> } ____cacheline_aligned_in_smp;
>
> +#if defined(CONFIG_NUMA) || defined(CONFIG_SMP)
> +#define SVC_HAVE_MULTIPLE_POOLS 1
> +#else
> +#define SVC_HAVE_MULTIPLE_POOLS 0
> +#endif
> +
> +#if SVC_HAVE_MULTIPLE_POOLS
^^^^^^^^^^^^ Any reason why you've done this? A definition shouldn't be
that worrying to us...
> +/*
> + * Global structure for mapping cpus to pools and vice versa.
> + * Setup once during sunrpc initialisation.
> + */
> +struct svc_pool_map {
> + /*
> + * Mode for mapping cpus to pools.
> + *
> + * -1 = automatic, choose one of the other modes at boot
> + * 0 = no mapping, just a single global pool (legacy & UP mode)
> + * 1 = one pool per cpu
> + * 2 = one pool per numa node
> + */
> + int mode;
> + int init;
> + unsigned int npools;
> + unsigned int *pool_to_cpu;
> + unsigned int *cpu_to_pool;
> +#ifdef CONFIG_NUMA
> + unsigned int *node_to_pool;
> + unsigned int *pool_to_node;
> +#endif /* CONFIG_NUMA */
> +};
> +#endif /* SVC_HAVE_MULTIPLE_POOLS */
> +
> +
> /*
> * RPC service.
> *
> @@ -360,5 +393,34 @@ int svc_process(struct svc_serv *, s
> int svc_register(struct svc_serv *, int, unsigned short);
> void svc_wake_up(struct svc_serv *);
> void svc_reserve(struct svc_rqst *rqstp, int space);
> +extern struct svc_pool_map svc_pool_map;
> +
> +
> +static inline struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv, int cpu)
> +{
> +#if SVC_HAVE_MULTIPLE_POOLS
> + struct svc_pool_map *m = &svc_pool_map;
> + unsigned int pidx;
> +
> + switch (m->mode) {
> + default:
> + case 0:
> + pidx = 0;
> + break;
> + case 1:
> + pidx = m->cpu_to_pool[cpu];
> + break;
> +#ifdef CONFIG_NUMA
> + case 2:
> + pidx = m->node_to_pool[cpu_to_node(cpu)];
> + break;
> +#endif /* CONFIG_NUMA */
> + }
> + return &serv->sv_pools[pidx % serv->sv_nrpools];
> +#else
> + return &serv->sv_pools[0];
> +#endif
> +}
> +
>
> #endif /* SUNRPC_SVC_H */
>
Cheers,
Trond
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys -- and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
NFS maillist - NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs
next prev parent reply other threads:[~2006-07-25 12:43 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-07-25 5:16 [PATCH 010 of 11] knfsd: make pools numa aware Greg Banks
2006-07-25 12:43 ` Trond Myklebust [this message]
2006-07-26 2:20 ` Greg Banks
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1153831393.5660.13.camel@localhost \
--to=trond.myklebust@fys.uio.no \
--cc=gnb@melbourne.sgi.com \
--cc=neilb@suse.de \
--cc=nfs@lists.sourceforge.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox