Linux NFS development
 help / color / mirror / Atom feed
* [PATCH 010 of 11] knfsd: make pools numa aware
@ 2006-07-25  5:16 Greg Banks
  2006-07-25 12:43 ` Trond Myklebust
  0 siblings, 1 reply; 3+ messages in thread
From: Greg Banks @ 2006-07-25  5:16 UTC (permalink / raw)
  To: Neil Brown; +Cc: Linux NFS Mailing List

knfsd: Actually implement multiple pools.  On NUMA machines, allocate
a svc_pool per NUMA node; on SMP a svc_pool per CPU; otherwise a single
global pool.  Enqueue sockets on the svc_pool corresponding to the CPU
on which the socket bh is run (i.e. the NIC interrupt CPU).  Threads
have their cpu mask set to limit them to the CPUs in the svc_pool that
owns them.

This is the patch that allows an Altix to scale NFS traffic linearly
beyond 4 CPUs and 4 NICs.

Signed-off-by: Greg Banks <gnb@melbourne.sgi.com>
---

 include/linux/sunrpc/svc.h |   62 +++++++++++
 net/sunrpc/svc.c           |  184 +++++++++++++++++++++++++++++++++-
 net/sunrpc/svcsock.c       |    7 +
 3 files changed, 251 insertions(+), 2 deletions(-)

Index: linus-git/net/sunrpc/svc.c
===================================================================
--- linus-git.orig/net/sunrpc/svc.c	2006-07-24 22:16:36.157203063 +1000
+++ linus-git/net/sunrpc/svc.c	2006-07-24 22:54:13.557820093 +1000
@@ -4,6 +4,10 @@
  * High-level RPC service routines
  *
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ *
+ * Multiple thread pools and NUMAisation
+ * Copyright (c) 2006 Silicon Graphics, Inc.
+ * by Greg Banks <gnb@melbourne.sgi.com>
  */
 
 #include <linux/linkage.h>
@@ -24,6 +28,161 @@
 #define RPCDBG_FACILITY	RPCDBG_SVCDSP
 #define RPC_PARANOIA 1
 
+
+#if SVC_HAVE_MULTIPLE_POOLS
+
+struct svc_pool_map svc_pool_map = { .mode = -1, .init = 0 }; /* -1 = auto */
+
+/*
+ * Build the global map of cpus to pools and vice versa.
+ *
+ * The first call picks a mapping mode (unless svc_pool_map.mode was
+ * preset) and allocates the lookup tables, falling back to a single
+ * global pool if an allocation fails.  Later calls change nothing.
+ * Returns the number of pools in the map.
+ */
+static unsigned int
+svc_pool_map_init(void)
+{
+	struct svc_pool_map *m = &svc_pool_map;
+	unsigned int node;
+	unsigned int cpu;
+	unsigned int pidx = 0;
+	unsigned int maxpools;
+
+	if (m->init)
+		return m->npools;
+	m->init = 1;
+
+	if (m->mode < 0) {
+		/* Detect best pool mapping mode heuristically. */
+		m->mode = 0;	/* default: one global pool */
+#ifdef CONFIG_NUMA
+		if (num_online_nodes() > 1) {
+			/* Actually have multiple NUMA nodes, so split
+			 * pools on NUMA node boundaries */
+			m->mode = 2;
+		} else {
+			node = any_online_node(node_online_map);
+			if (nr_cpus_node(node) > 2) {
+				/*
+				 * Apparently we're running with CONFIG_NUMA
+				 * on non-NUMA hardware, e.g. with a generic
+				 * x86_64 kernel on Xeons.  In this case we
+				 * want to divide the pools on cpu boundaries.
+				 */
+				m->mode = 1;
+			}
+		}
+#else
+		if (num_online_cpus() > 1) {
+			/* Plain SMP with multiple CPUs online. */
+			m->mode = 1;
+		}
+#endif
+	}
+
+	switch (m->mode) {
+	default:	/* unknown mode: treat like the global pool */
+	case 0:
+fallback:
+		m->mode = 0;
+		m->npools = 1;
+		printk("nfsd: initialising 1 global pool\n");
+		break;
+
+	case 1:
+		/* cpu_to_pool[] is indexed by cpu id; ids may be sparse */
+		maxpools = NR_CPUS;
+		m->cpu_to_pool = kcalloc(maxpools, sizeof(unsigned int),
+					       GFP_KERNEL);
+		if (!m->cpu_to_pool)
+			goto fallback;
+		m->pool_to_cpu = kcalloc(maxpools, sizeof(unsigned int),
+					       GFP_KERNEL);
+		if (!m->pool_to_cpu) {
+			kfree(m->cpu_to_pool);
+			goto fallback;
+		}
+		for_each_online_cpu(cpu) {
+			BUG_ON(pidx >= maxpools);
+			m->cpu_to_pool[cpu] = pidx;
+			m->pool_to_cpu[pidx] = cpu;
+			pidx++;
+		}
+		/* cpus brought online later all get mapped to pool0, sorry */
+		m->npools = pidx;
+		printk("nfsd: initialising %u pools, one per cpu\n", m->npools);
+		break;
+
+#ifdef CONFIG_NUMA
+	case 2:
+		/* node_to_pool[] is indexed by node id; ids may be sparse */
+		maxpools = MAX_NUMNODES;
+		m->node_to_pool = kcalloc(maxpools, sizeof(unsigned int),
+					       GFP_KERNEL);
+		if (!m->node_to_pool)
+			goto fallback;
+		m->pool_to_node = kcalloc(maxpools, sizeof(unsigned int),
+					       GFP_KERNEL);
+		if (!m->pool_to_node) {
+			kfree(m->node_to_pool);
+			goto fallback;
+		}
+		for_each_node_with_cpus(node) {
+			/* some architectures (e.g. SN2) have cpuless nodes */
+			BUG_ON(pidx >= maxpools);
+			m->node_to_pool[node] = pidx;
+			m->pool_to_node[pidx] = node;
+			pidx++;
+		}
+		/* nodes brought online later all get mapped to pool0, sorry */
+		m->npools = pidx;
+		printk("nfsd: initialising %u pools, one per numa node\n", m->npools);
+		break;
+#endif /* CONFIG_NUMA */
+	}
+
+	return m->npools;
+}
+
+/*
+ * Confine the calling thread to the cpus that make up pool @pidx
+ * by replacing current's cpus_allowed mask.
+ *
+ * Returns 1 with the previous mask saved in *oldmask when a mask was
+ * applied, or 0 with *oldmask untouched when the mode has no mapping.
+ */
+static int
+svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
+{
+	struct svc_pool_map *map = &svc_pool_map;
+
+	BUG_ON(!map->init);
+	if (map->mode == 1) {
+		/* one pool per cpu: pin to the cpu recorded for the pool */
+		unsigned int cpu = map->pool_to_cpu[pidx];
+
+		*oldmask = current->cpus_allowed;
+		set_cpus_allowed(current, cpumask_of_cpu(cpu));
+		return 1;
+	}
+#ifdef CONFIG_NUMA
+	if (map->mode == 2) {
+		/* one pool per node: apply the cpumask of the pool's node */
+		unsigned int node = map->pool_to_node[pidx];
+
+		*oldmask = current->cpus_allowed;
+		set_cpus_allowed(current, node_to_cpumask(node));
+		return 1;
+	}
+#endif /* CONFIG_NUMA */
+	/* mode 0, or a mode not handled above: leave the mask alone */
+	return 0;
+}
+
+#endif /* SVC_HAVE_MULTIPLE_POOLS */
+
 /*
  * Create an RPC service
  */
@@ -101,8 +260,13 @@ svc_create_pooled(struct svc_program *pr
 		  svc_thread_fn func, int sig, struct module *mod)
 {
 	struct svc_serv *serv;
+	unsigned int npools = 1;
 
-	serv = __svc_create(prog, bufsize, /*npools*/1);
+#if SVC_HAVE_MULTIPLE_POOLS
+	npools = svc_pool_map_init();
+#endif
+
+	serv = __svc_create(prog, bufsize, npools);
 
 	if (serv != NULL) {
 		serv->sv_function = func;
@@ -202,12 +366,18 @@ svc_release_buffer(struct svc_rqst *rqst
 
 /*
  * Create a thread in the given pool.  Caller must hold BKL.
+ * On a NUMA or SMP machine, with a multi-pool serv, the thread
+ * will be restricted to run on the cpus belonging to the pool.
  */
 static int
 __svc_create_thread(svc_thread_fn func, struct svc_serv *serv, struct svc_pool *pool)
 {
 	struct svc_rqst	*rqstp;
 	int		error = -ENOMEM;
+#if SVC_HAVE_MULTIPLE_POOLS
+	int		have_oldmask = 0;
+	cpumask_t	oldmask;
+#endif
 
 	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
 	if (!rqstp)
@@ -227,7 +397,19 @@ __svc_create_thread(svc_thread_fn func, 
 	spin_unlock_bh(&pool->sp_lock);
 	rqstp->rq_server = serv;
 	rqstp->rq_pool = pool;
+
+#if SVC_HAVE_MULTIPLE_POOLS
+	if (serv->sv_nrpools > 1)
+		have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
+#endif
+
 	error = kernel_thread((int (*)(void *)) func, rqstp, 0);
+
+#if SVC_HAVE_MULTIPLE_POOLS
+	if (have_oldmask)
+		set_cpus_allowed(current, oldmask);
+#endif
+
 	if (error < 0)
 		goto out_thread;
 	svc_sock_update_bufs(serv);
Index: linus-git/net/sunrpc/svcsock.c
===================================================================
--- linus-git.orig/net/sunrpc/svcsock.c	2006-07-24 20:44:46.911435470 +1000
+++ linus-git/net/sunrpc/svcsock.c	2006-07-24 22:45:23.263878219 +1000
@@ -150,8 +150,9 @@ static void
 svc_sock_enqueue(struct svc_sock *svsk)
 {
 	struct svc_serv	*serv = svsk->sk_server;
-	struct svc_pool *pool = &serv->sv_pools[0];
+	struct svc_pool *pool;
 	struct svc_rqst	*rqstp;
+	int cpu;
 
 	if (!(svsk->sk_flags &
 	      ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
@@ -159,6 +160,10 @@ svc_sock_enqueue(struct svc_sock *svsk)
 	if (test_bit(SK_DEAD, &svsk->sk_flags))
 		return;
 
+	cpu = get_cpu();
+	pool = svc_pool_for_cpu(svsk->sk_server, cpu);
+	put_cpu();
+
 	spin_lock_bh(&pool->sp_lock);
 
 	if (!list_empty(&pool->sp_threads) &&
Index: linus-git/include/linux/sunrpc/svc.h
===================================================================
--- linus-git.orig/include/linux/sunrpc/svc.h	2006-07-24 22:16:36.041218126 +1000
+++ linus-git/include/linux/sunrpc/svc.h	2006-07-24 22:45:23.347867112 +1000
@@ -41,6 +41,39 @@ struct svc_pool {
 	struct list_head	sp_all_threads;	/* all server threads */
 } ____cacheline_aligned_in_smp;
 
+#if defined(CONFIG_NUMA) || defined(CONFIG_SMP)
+#define SVC_HAVE_MULTIPLE_POOLS	1
+#else
+#define SVC_HAVE_MULTIPLE_POOLS	0
+#endif
+
+#if SVC_HAVE_MULTIPLE_POOLS
+/*
+ * Global structure for mapping cpus to pools and vice versa.
+ * Set up once, by the first call to svc_pool_map_init().
+ */
+struct svc_pool_map {
+	/*
+	 * Mode for mapping cpus to pools.
+	 *
+	 * -1 = automatic, choose one of the other modes at init time
+	 * 0 = no mapping, just a single global pool (legacy & UP mode)
+	 * 1 = one pool per cpu
+	 * 2 = one pool per numa node
+	 */
+	int mode;
+	int init;			/* set by svc_pool_map_init() */
+	unsigned int npools;		/* number of pools in the map */
+	unsigned int *pool_to_cpu;	/* mode 1: pool index -> cpu id */
+	unsigned int *cpu_to_pool;	/* mode 1: cpu id -> pool index */
+#ifdef CONFIG_NUMA
+	unsigned int *node_to_pool;	/* mode 2: node id -> pool index */
+	unsigned int *pool_to_node;	/* mode 2: pool index -> node id */
+#endif /* CONFIG_NUMA */
+};
+#endif /* SVC_HAVE_MULTIPLE_POOLS */
+
+
 /*
  * RPC service.
  *
@@ -360,5 +393,34 @@ int		   svc_process(struct svc_serv *, s
 int		   svc_register(struct svc_serv *, int, unsigned short);
 void		   svc_wake_up(struct svc_serv *);
 void		   svc_reserve(struct svc_rqst *rqstp, int space);
+extern struct svc_pool_map svc_pool_map;
+
+
+/*
+ * Choose the pool of @serv that handles work arriving on @cpu.
+ * The mapped index is taken modulo serv->sv_nrpools.
+ */
+static inline struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv, int cpu)
+{
+#if SVC_HAVE_MULTIPLE_POOLS
+	const int mode = svc_pool_map.mode;
+	unsigned int idx;
+
+	if (mode == 1)
+		idx = svc_pool_map.cpu_to_pool[cpu];
+#ifdef CONFIG_NUMA
+	else if (mode == 2)
+		idx = svc_pool_map.node_to_pool[cpu_to_node(cpu)];
+#endif /* CONFIG_NUMA */
+	else
+		idx = 0;	/* single global pool, or unknown mode */
+
+	return &serv->sv_pools[idx % serv->sv_nrpools];
+#else
+	/* no pool map in this configuration */
+	return &serv->sv_pools[0];
+#endif
+}
+
 
 #endif /* SUNRPC_SVC_H */

-- 
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.



-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys -- and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
NFS maillist  -  NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2006-07-26  2:25 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-07-25  5:16 [PATCH 010 of 11] knfsd: make pools numa aware Greg Banks
2006-07-25 12:43 ` Trond Myklebust
2006-07-26  2:20   ` Greg Banks

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox