Linux NFS development
 help / color / mirror / Atom feed
* [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
@ 2008-05-18  2:35 Jeff Layton
  2008-05-18  2:35 ` [PATCH 1/3] [RFC] knfsd: convert knfsd to kthread API Jeff Layton
  2008-05-19  6:07 ` [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown Neil Brown
  0 siblings, 2 replies; 26+ messages in thread
From: Jeff Layton @ 2008-05-18  2:35 UTC (permalink / raw)
  To: linux-nfs, nfsv4

knfsd is the last NFS-related kernel thread that does not use the
kthread API. This patchset represents a first pass at converting it. It
seems to work, but changes the shutdown interface. knfsd currently
allows signals to tell it when to come down.

My main question is...how tied to this shutdown method are we? We can
also take down nfsd by having people run:

     # rpc.nfsd 0

...which basically does:

     # echo 0 > /proc/fs/nfsd/threads

...so we don't think we *have* to use signals here. Is signaling
something we can reasonably eliminate? In addition to making the code a
bit simpler and cleaner, I think it will also eliminate this race:

    http://lkml.org/lkml/2007/8/2/462

If this isn't feasible, then I can add the signaling back in, but am not
sure whether we can eliminate the race without adding more locking.

If we can do this, we may need to provide an alternate way to specify
that we want to take down all nfsd's but not flush the export table.
Currently that's done with a SIGHUP, but the value of this facility is
not clear to me since the kernel can just do another upcall.

Comments and suggestions appreciated...

Signed-off-by: Jeff Layton <jlayton@redhat.com>


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 1/3] [RFC] knfsd: convert knfsd to kthread API
  2008-05-18  2:35 [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown Jeff Layton
@ 2008-05-18  2:35 ` Jeff Layton
  2008-05-18  2:35   ` [PATCH 2/3] [RFC] sunrpc: remove unneeded fields from svc_serv struct Jeff Layton
  2008-05-19  6:07 ` [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown Neil Brown
  1 sibling, 1 reply; 26+ messages in thread
From: Jeff Layton @ 2008-05-18  2:35 UTC (permalink / raw)
  To: linux-nfs, nfsv4

This patch is rather large, but I couldn't figure out a way to break it
up that would remain bisectable. It does several things:

- change svc_thread_fn typedef to better match what kthread_create expects
- change svc_pool_map_set_cpumask to be more kthread friendly. Make it
  take a task arg and and get rid of the "oldmask"
- have svc_set_num_threads call set up the thread and call kthread_run
  directly, and to call kthread_stop to take down threads.
- change nfsd() to ignore signals, and to loop forever until kthread_stop
  is called on it

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/nfsd/nfssvc.c           |   57 ++++++++----------------
 include/linux/sunrpc/svc.h |    2 +-
 net/sunrpc/svc.c           |  101 +++++++++++++++-----------------------------
 3 files changed, 54 insertions(+), 106 deletions(-)

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 941041f..b5fdb9b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -21,6 +21,7 @@
 #include <linux/smp_lock.h>
 #include <linux/freezer.h>
 #include <linux/fs_struct.h>
+#include <linux/kthread.h>
 
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/stats.h>
@@ -51,7 +52,7 @@
 #define	SIG_NOCLEAN	SIGHUP
 
 extern struct svc_program	nfsd_program;
-static void			nfsd(struct svc_rqst *rqstp);
+static int			nfsd(void *vrqstp);
 struct timeval			nfssvc_boot;
        struct svc_serv 		*nfsd_serv;
 static atomic_t			nfsd_busy;
@@ -391,18 +392,17 @@ update_thread_usage(int busy_threads)
 /*
  * This is the NFS server kernel thread
  */
-static void
-nfsd(struct svc_rqst *rqstp)
+static int
+nfsd(void *vrqstp)
 {
+	struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
 	struct fs_struct *fsp;
-	int		err;
-	sigset_t shutdown_mask, allowed_mask;
+	int err, preverr = 0;
 
 	/* Lock module and set up kernel thread */
 	lock_kernel();
-	daemonize("nfsd");
 
-	/* After daemonize() this kernel thread shares current->fs
+	/* At this point, the thread shares current->fs
 	 * with the init process. We need to create files with a
 	 * umask of 0 instead of init's umask. */
 	fsp = copy_fs_struct(current->fs);
@@ -414,13 +414,8 @@ nfsd(struct svc_rqst *rqstp)
 	current->fs = fsp;
 	current->fs->umask = 0;
 
-	siginitsetinv(&shutdown_mask, SHUTDOWN_SIGS);
-	siginitsetinv(&allowed_mask, ALLOWED_SIGS);
-
 	nfsdstats.th_cnt++;
 
-	rqstp->rq_task = current;
-
 	unlock_kernel();
 
 	/*
@@ -434,27 +429,29 @@ nfsd(struct svc_rqst *rqstp)
 	/*
 	 * The main request loop
 	 */
-	for (;;) {
-		/* Block all but the shutdown signals */
-		sigprocmask(SIG_SETMASK, &shutdown_mask, NULL);
-
+	while (!kthread_should_stop()) {
 		/*
 		 * Find a socket with data available and call its
 		 * recvfrom routine.
 		 */
 		while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN)
 			;
-		if (err < 0)
-			break;
+		if (err < 0) {
+			if (err != preverr) {
+				printk(KERN_WARNING "%s: unexpected error "
+					"from svc_recv (%d)\n", __func__, -err);
+				preverr = err;
+			}
+			schedule_timeout_uninterruptible(HZ);
+			continue;
+		}
+
 		update_thread_usage(atomic_read(&nfsd_busy));
 		atomic_inc(&nfsd_busy);
 
 		/* Lock the export hash tables for reading. */
 		exp_readlock();
 
-		/* Process request with signals blocked.  */
-		sigprocmask(SIG_SETMASK, &allowed_mask, NULL);
-
 		svc_process(rqstp);
 
 		/* Unlock export hash tables */
@@ -463,31 +460,15 @@ nfsd(struct svc_rqst *rqstp)
 		atomic_dec(&nfsd_busy);
 	}
 
-	if (err != -EINTR) {
-		printk(KERN_WARNING "nfsd: terminating on error %d\n", -err);
-	} else {
-		unsigned int	signo;
-
-		for (signo = 1; signo <= _NSIG; signo++)
-			if (sigismember(&current->pending.signal, signo) &&
-			    !sigismember(&current->blocked, signo))
-				break;
-		killsig = signo;
-	}
-	/* Clear signals before calling svc_exit_thread() */
-	flush_signals(current);
-
 	lock_kernel();
-
 	nfsdstats.th_cnt --;
 
 out:
 	/* Release the thread */
 	svc_exit_thread(rqstp);
 
-	/* Release module */
 	unlock_kernel();
-	module_put_and_exit(0);
+	return 0;
 }
 
 static __be32 map_new_errors(u32 vers, __be32 nfserr)
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 4b54c5f..011d6d8 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -22,7 +22,7 @@
 /*
  * This is the RPC server thread function prototype
  */
-typedef void		(*svc_thread_fn)(struct svc_rqst *);
+typedef int		(*svc_thread_fn)(void *);
 
 /*
  *
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 01c7e31..18a9e33 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -18,6 +18,7 @@
 #include <linux/mm.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/kthread.h>
 
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/xdr.h>
@@ -291,15 +292,14 @@ svc_pool_map_put(void)
 
 
 /*
- * Set the current thread's cpus_allowed mask so that it
+ * Set the given thread's cpus_allowed mask so that it
  * will only run on cpus in the given pool.
- *
- * Returns 1 and fills in oldmask iff a cpumask was applied.
  */
-static inline int
-svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
+static inline void
+svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx)
 {
 	struct svc_pool_map *m = &svc_pool_map;
+	unsigned int node = m->pool_to[pidx];
 
 	/*
 	 * The caller checks for sv_nrpools > 1, which
@@ -307,26 +307,17 @@ svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
 	 */
 	BUG_ON(m->count == 0);
 
-	switch (m->mode)
-	{
-	default:
-		return 0;
+	switch (m->mode) {
 	case SVC_POOL_PERCPU:
 	{
-		unsigned int cpu = m->pool_to[pidx];
-
-		*oldmask = current->cpus_allowed;
-		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-		return 1;
+		set_cpus_allowed_ptr(task, &cpumask_of_cpu(node));
+		break;
 	}
 	case SVC_POOL_PERNODE:
 	{
-		unsigned int node = m->pool_to[pidx];
 		node_to_cpumask_ptr(nodecpumask, node);
-
-		*oldmask = current->cpus_allowed;
-		set_cpus_allowed_ptr(current, nodecpumask);
-		return 1;
+		set_cpus_allowed_ptr(task, nodecpumask);
+		break;
 	}
 	}
 }
@@ -578,46 +569,6 @@ out_enomem:
 EXPORT_SYMBOL(svc_prepare_thread);
 
 /*
- * Create a thread in the given pool.  Caller must hold BKL.
- * On a NUMA or SMP machine, with a multi-pool serv, the thread
- * will be restricted to run on the cpus belonging to the pool.
- */
-static int
-__svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
-		    struct svc_pool *pool)
-{
-	struct svc_rqst	*rqstp;
-	int		error = -ENOMEM;
-	int		have_oldmask = 0;
-	cpumask_t	uninitialized_var(oldmask);
-
-	rqstp = svc_prepare_thread(serv, pool);
-	if (IS_ERR(rqstp)) {
-		error = PTR_ERR(rqstp);
-		goto out;
-	}
-
-	if (serv->sv_nrpools > 1)
-		have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
-
-	error = kernel_thread((int (*)(void *)) func, rqstp, 0);
-
-	if (have_oldmask)
-		set_cpus_allowed(current, oldmask);
-
-	if (error < 0)
-		goto out_thread;
-	svc_sock_update_bufs(serv);
-	error = 0;
-out:
-	return error;
-
-out_thread:
-	svc_exit_thread(rqstp);
-	goto out;
-}
-
-/*
  * Choose a pool in which to create a new thread, for svc_set_num_threads
  */
 static inline struct svc_pool *
@@ -686,7 +637,9 @@ found_pool:
 int
 svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 {
-	struct task_struct *victim;
+	struct svc_rqst	*rqstp;
+	struct task_struct *task;
+	struct svc_pool *chosen_pool;
 	int error = 0;
 	unsigned int state = serv->sv_nrthreads-1;
 
@@ -702,18 +655,32 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 	/* create new threads */
 	while (nrservs > 0) {
 		nrservs--;
-		__module_get(serv->sv_module);
-		error = __svc_create_thread(serv->sv_function, serv,
-					    choose_pool(serv, pool, &state));
-		if (error < 0) {
-			module_put(serv->sv_module);
+		chosen_pool = choose_pool(serv, pool, &state);
+
+		rqstp = svc_prepare_thread(serv, chosen_pool);
+		if (IS_ERR(rqstp)) {
+			error = PTR_ERR(rqstp);
 			break;
 		}
+
+		task = kthread_create(serv->sv_function, rqstp, serv->sv_name);
+		if (IS_ERR(task)) {
+			error = PTR_ERR(task);
+			svc_exit_thread(rqstp);
+			break;
+		}
+
+		rqstp->rq_task = task;
+		if (serv->sv_nrpools > 1)
+			svc_pool_map_set_cpumask(task, chosen_pool->sp_id);
+
+		svc_sock_update_bufs(serv);
+		wake_up_process(task);
 	}
 	/* destroy old threads */
 	while (nrservs < 0 &&
-	       (victim = choose_victim(serv, pool, &state)) != NULL) {
-		send_sig(serv->sv_kill_signal, victim, 1);
+	       (task = choose_victim(serv, pool, &state)) != NULL) {
+		kthread_stop(task);
 		nrservs++;
 	}
 
-- 
1.5.3.6


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 2/3] [RFC] sunrpc: remove unneeded fields from svc_serv struct
  2008-05-18  2:35 ` [PATCH 1/3] [RFC] knfsd: convert knfsd to kthread API Jeff Layton
@ 2008-05-18  2:35   ` Jeff Layton
  2008-05-18  2:35     ` [PATCH 3/3] [RFC] knfsd: remove signal defines and extraneous variables Jeff Layton
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff Layton @ 2008-05-18  2:35 UTC (permalink / raw)
  To: linux-nfs, nfsv4

Remove 2 fields that are no longer needed in the svc_serv struct,
sv_module and sv_kill_signal. sv_kill_signal doesn't appear to
currently be used for anything, and sv_module is no longer
needed since svc_set_num_threads doesn't bother with module reference
counts anymore. Also, remove the args from svc_create_pooled from
which these fields were populated.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/nfsd/nfssvc.c           |    6 ++----
 include/linux/sunrpc/svc.h |   10 ++--------
 net/sunrpc/svc.c           |    7 ++-----
 3 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index b5fdb9b..d4d4be9 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -218,10 +218,8 @@ int nfsd_create_serv(void)
 	}
 
 	atomic_set(&nfsd_busy, 0);
-	nfsd_serv = svc_create_pooled(&nfsd_program,
-				      nfsd_max_blksize,
-				      nfsd_last_thread,
-				      nfsd, SIG_NOCLEAN, THIS_MODULE);
+	nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
+				      nfsd_last_thread, nfsd);
 	if (nfsd_serv == NULL)
 		err = -ENOMEM;
 	unlock_kernel();
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 011d6d8..cd304c4 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -72,15 +72,10 @@ struct svc_serv {
 	unsigned int		sv_nrpools;	/* number of thread pools */
 	struct svc_pool *	sv_pools;	/* array of thread pools */
 
+	/* Callback to use when last thread exits */
 	void			(*sv_shutdown)(struct svc_serv *serv);
-						/* Callback to use when last thread
-						 * exits.
-						 */
 
-	struct module *		sv_module;	/* optional module to count when
-						 * adding threads */
 	svc_thread_fn		sv_function;	/* main function for threads */
-	int			sv_kill_signal;	/* signal to kill threads */
 };
 
 /*
@@ -388,8 +383,7 @@ struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
 					struct svc_pool *pool);
 void		   svc_exit_thread(struct svc_rqst *);
 struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
-			void (*shutdown)(struct svc_serv*),
-			svc_thread_fn, int sig, struct module *);
+			void (*shutdown)(struct svc_serv*), svc_thread_fn);
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 void		   svc_destroy(struct svc_serv *);
 int		   svc_process(struct svc_rqst *);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 18a9e33..da90ce0 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -434,18 +434,15 @@ EXPORT_SYMBOL(svc_create);
 struct svc_serv *
 svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
 		void (*shutdown)(struct svc_serv *serv),
-		  svc_thread_fn func, int sig, struct module *mod)
+		  svc_thread_fn func)
 {
 	struct svc_serv *serv;
 	unsigned int npools = svc_pool_map_get();
 
 	serv = __svc_create(prog, bufsize, npools, shutdown);
 
-	if (serv != NULL) {
+	if (serv != NULL)
 		serv->sv_function = func;
-		serv->sv_kill_signal = sig;
-		serv->sv_module = mod;
-	}
 
 	return serv;
 }
-- 
1.5.3.6


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 3/3] [RFC] knfsd: remove signal defines and extraneous variables
  2008-05-18  2:35   ` [PATCH 2/3] [RFC] sunrpc: remove unneeded fields from svc_serv struct Jeff Layton
@ 2008-05-18  2:35     ` Jeff Layton
  0 siblings, 0 replies; 26+ messages in thread
From: Jeff Layton @ 2008-05-18  2:35 UTC (permalink / raw)
  To: linux-nfs, nfsv4

There are some defines for signals delivered to knfsd which are no
longer used with the new kthread-based server.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/nfsd/nfssvc.c |   21 ++-------------------
 1 files changed, 2 insertions(+), 19 deletions(-)

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index d4d4be9..fb70660 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -37,20 +37,6 @@
 
 #define NFSDDBG_FACILITY	NFSDDBG_SVC
 
-/* these signals will be delivered to an nfsd thread 
- * when handling a request
- */
-#define ALLOWED_SIGS	(sigmask(SIGKILL))
-/* these signals will be delivered to an nfsd thread
- * when not handling a request. i.e. when waiting
- */
-#define SHUTDOWN_SIGS	(sigmask(SIGKILL) | sigmask(SIGHUP) | sigmask(SIGINT) | sigmask(SIGQUIT))
-/* if the last thread dies with SIGHUP, then the exports table is
- * left unchanged ( like 2.4-{0-9} ).  Any other signal will clear
- * the exports table (like 2.2).
- */
-#define	SIG_NOCLEAN	SIGHUP
-
 extern struct svc_program	nfsd_program;
 static int			nfsd(void *vrqstp);
 struct timeval			nfssvc_boot;
@@ -152,7 +138,6 @@ int nfsd_nrthreads(void)
 		return nfsd_serv->sv_nrthreads;
 }
 
-static int killsig;	/* signal that was used to kill last nfsd */
 static void nfsd_last_thread(struct svc_serv *serv)
 {
 	/* When last nfsd thread exits we need to do some clean-up */
@@ -164,10 +149,8 @@ static void nfsd_last_thread(struct svc_serv *serv)
 	nfs4_state_shutdown();
 
 	printk(KERN_WARNING "nfsd: last server has exited\n");
-	if (killsig != SIG_NOCLEAN) {
-		printk(KERN_WARNING "nfsd: unexporting all filesystems\n");
-		nfsd_export_flush();
-	}
+	printk(KERN_WARNING "nfsd: unexporting all filesystems\n");
+	nfsd_export_flush();
 }
 
 void nfsd_reset_versions(void)
-- 
1.5.3.6


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
  2008-05-18  2:35 [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown Jeff Layton
  2008-05-18  2:35 ` [PATCH 1/3] [RFC] knfsd: convert knfsd to kthread API Jeff Layton
@ 2008-05-19  6:07 ` Neil Brown
       [not found]   ` <18481.6416.571430.593722-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
  1 sibling, 1 reply; 26+ messages in thread
From: Neil Brown @ 2008-05-19  6:07 UTC (permalink / raw)
  To: Jeff Layton; +Cc: linux-nfs, nfsv4

On Saturday May 17, jlayton@redhat.com wrote:
> knfsd is the last NFS-related kernel thread that does not use the
> kthread API. This patchset represents a first pass at converting it. It
> seems to work, but changes the shutdown interface. knfsd currently
> allows signals to tell it when to come down.
> 
> My main question is...how tied to this shutdown method are we? We can
> also take down nfsd by having people run:
> 
>      # rpc.nfsd 0
> 
> ...which basically does:
> 
>      # echo 0 > /proc/fs/nfsd/threads
> 
> ...so we don't think we *have* to use signals here. Is signaling
> something we can reasonably eliminate? 

Good question.  I suspect lots of distros still use "killall" or some
equivalent to stop nfsd - so at a minimum, some education would be
required.

Most kernel threads are there to provide a service for some other
entity, such as a filesystem or a device etc.  So it doesn't make
sense to kill them except when the device/filesystem/whatever goes
away.

With nfsd, the thread *is* the central entity.  So killing it does
make sense.
This doesn't argue strongly in favour of a killable nfsd, but does
explain why it might be different from all other kernel threads.

>                                         In addition to making the code a
> bit simpler and cleaner, I think it will also eliminate this race:
> 
>     http://lkml.org/lkml/2007/8/2/462

I never followed up on this did I...
The core problem seems to be the principle of
  "The last one out turns off the lights"
but once you've turned off the lights, you can't see if someone else
snuck back in so you aren't the last one.  You really have to have
only one door and stand in the doorway while switching off the
lights....

If we replace the BKL usage with a simple global semaphore, that
problem might just go away.  We should only need to protect
svc_destroy, svc_exit_thread, and svc_set_num_threads from each other.

It's long past time to discard the BLK here anyway.

> 
> If this isn't feasible, then I can add the signaling back in, but am not
> sure whether we can eliminate the race without adding more locking.

I think I prefer keeping the signalling and fixing the locking.

> 
> If we can do this, we may need to provide an alternate way to specify
> that we want to take down all nfsd's but not flush the export table.
> Currently that's done with a SIGHUP, but the value of this facility is
> not clear to me since the kernel can just do another upcall.

I added this functionality well before the kernel could do upcalls to
refill the export table.

I wanted to be able to change the number of active threads without a
complete shutdown and restart, so I make the /proc/fs/nfs/threads
file accessed by "nfsd NN".

Then I wanted
   nfsd 0
   nfsd 8
to just change the number of threads, not flush the exports table as
well (because at the time, flushing the exports table was more
serious).
So I arranged that "nfsd 0" just reduced the number of threads to 0
and changed no other (externally visible) state.

As you say, it isn't really needed now.   I wouldn't have a problem
with removing the SIGHUP feature, but I think keeping SIGKILL and
SIGINT with their old meaning is important.

> 
> Comments and suggestions appreciated...
> 

Thanks for the efforts!

NeilBrown

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
       [not found]   ` <18481.6416.571430.593722-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
@ 2008-05-19 21:01     ` Jeff Layton
  2008-05-19 22:00     ` Greg Banks
  1 sibling, 0 replies; 26+ messages in thread
From: Jeff Layton @ 2008-05-19 21:01 UTC (permalink / raw)
  To: Neil Brown; +Cc: linux-nfs, nfsv4

On Mon, 19 May 2008 16:07:12 +1000
Neil Brown <neilb@suse.de> wrote:

> On Saturday May 17, jlayton@redhat.com wrote:
> > knfsd is the last NFS-related kernel thread that does not use the
> > kthread API. This patchset represents a first pass at converting it. It
> > seems to work, but changes the shutdown interface. knfsd currently
> > allows signals to tell it when to come down.
> > 
> > My main question is...how tied to this shutdown method are we? We can
> > also take down nfsd by having people run:
> > 
> >      # rpc.nfsd 0
> > 
> > ...which basically does:
> > 
> >      # echo 0 > /proc/fs/nfsd/threads
> > 
> > ...so we don't think we *have* to use signals here. Is signaling
> > something we can reasonably eliminate? 
> 
> Good question.  I suspect lots of distros still use "killall" or some
> equivalent to stop nfsd - so at a minimum, some education would be
> required.
> 
> Most kernel threads are there to provide a service for some other
> entity, such as a filesystem or a device etc.  So it doesn't make
> sense to kill them except when the device/filesystem/whatever goes
> away.
> 
> With nfsd, the thread *is* the central entity.  So killing it does
> make sense.
> This doesn't argue strongly in favour of a killable nfsd, but does
> explain why it might be different from all other kernel threads.
> 
> >                                         In addition to making the code a
> > bit simpler and cleaner, I think it will also eliminate this race:
> > 
> >     http://lkml.org/lkml/2007/8/2/462
> 
> I never followed up on this did I...
> The core problem seems to be the principle of
>   "The last one out turns off the lights"
> but once you've turned off the lights, you can't see if someone else
> snuck back in so you aren't the last one.  You really have to have
> only one door and stand in the doorway while switching off the
> lights....
> 
> If we replace the BKL usage with a simple global semaphore, that
> problem might just go away.  We should only need to protect
> svc_destroy, svc_exit_thread, and svc_set_num_threads from each other.
> 
> It's long past time to discard the BLK here anyway.
> 
> > 
> > If this isn't feasible, then I can add the signaling back in, but am not
> > sure whether we can eliminate the race without adding more locking.
> 
> I think I prefer keeping the signalling and fixing the locking.
> 
> > 
> > If we can do this, we may need to provide an alternate way to specify
> > that we want to take down all nfsd's but not flush the export table.
> > Currently that's done with a SIGHUP, but the value of this facility is
> > not clear to me since the kernel can just do another upcall.
> 
> I added this functionality well before the kernel could do upcalls to
> refill the export table.
> 
> I wanted to be able to change the number of active threads without a
> complete shutdown and restart, so I make the /proc/fs/nfs/threads
> file accessed by "nfsd NN".
> 
> Then I wanted
>    nfsd 0
>    nfsd 8
> to just change the number of threads, not flush the exports table as
> well (because at the time, flushing the exports table was more
> serious).
> So I arranged that "nfsd 0" just reduced the number of threads to 0
> and changed no other (externally visible) state.
> 
> As you say, it isn't really needed now.   I wouldn't have a problem
> with removing the SIGHUP feature, but I think keeping SIGKILL and
> SIGINT with their old meaning is important.
> 

Ok, I'll plan to take a second crack at this when I get some time and
put the signaling back in. I think removing the SIGHUP feature will
make the code a little simpler, so I'll plan to take it out. I'll also
have a look at putting better locking around the svc_* functions you
mention above...

Thanks for the review,
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
       [not found]   ` <18481.6416.571430.593722-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
  2008-05-19 21:01     ` Jeff Layton
@ 2008-05-19 22:00     ` Greg Banks
  2008-05-19 23:52       ` Neil Brown
  2008-05-20 20:36       ` Greg Banks
  1 sibling, 2 replies; 26+ messages in thread
From: Greg Banks @ 2008-05-19 22:00 UTC (permalink / raw)
  To: Neil Brown; +Cc: Jeff Layton, linux-nfs, nfsv4

Neil Brown wrote:
> On Saturday May 17, jlayton@redhat.com wrote:
>   
>
> With nfsd, the thread *is* the central entity.  So killing it does
> make sense.
> This doesn't argue strongly in favour of a killable nfsd, but does
> explain why it might be different from all other kernel threads.
>   
There's also the unique history of nfsd, that it was historically a
userspace daemon, so that "killall nfsd" was once the only method of
shutting down nfsds.  Arguably we need to preserve that semantic to
avoid breaking distros.

Unfortunately, it's a horrible and racy semantic.  At least on SLES10,
the /etc/init.d/ script that performs graceful shutdowns of the NFS
server will do "killall nfsd" and then, without checking to see whether
all the nfsds have actually died, proceeds to run the script that shuts
down the portmapper.  If the nfsd are a long time dying, e.g. due to
some slow filesystem work or if you just have a very large number of
nfsds, the last nfsd will try to unregister program 100003 with the the
portmapper after the portmapper is down.  This will leave a single nfsd
alive, failing and retrying for a total of hundreds of seconds.  During
that time, you can't restart the NFS service.

So while "killall nfsd" needs to continue to work, a additional smarter
mechanism would be great also.

>
> If we replace the BKL usage with a simple global semaphore, that
> problem might just go away.  We should only need to protect
> svc_destroy, svc_exit_thread, and svc_set_num_threads from each other.
>   
I think we also need to protect those against svc_create_pooled(). I
tried this a few weeks back in an attempt to deal with a customer's
problem, but gave up in disgust when the obvious changes resulted in the
nfsds self-deadlocking during startup.  The svc_serv was being created
as a side effect of the first write to a /proc/fs/nfsd/ file, and the
locking was very very confused.  I think it would be great if you gave
that code some care and attention.
> It's long past time to discard the BLK here anyway.
>   
Very true!
>   
>> If this isn't feasible, then I can add the signaling back in, but am not
>> sure whether we can eliminate the race without adding more locking.
>>     
>
> I think I prefer keeping the signalling and fixing the locking.
>   
I agree.
>   
>> Comments and suggestions appreciated...
>>
>>     
>
>   
Thanks for doing this :-)  I'll send out the patch I mentioned when we
chatted at Cthon, maybe you can consider the issue it fixes.

-- 
Greg Banks, P.Engineer, SGI Australian Software Group.
The cake is *not* a lie.
I don't speak for SGI.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
  2008-05-19 22:00     ` Greg Banks
@ 2008-05-19 23:52       ` Neil Brown
       [not found]         ` <18482.4782.858347.981553-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
  2008-05-20  2:24         ` Jeff Layton
  2008-05-20 20:36       ` Greg Banks
  1 sibling, 2 replies; 26+ messages in thread
From: Neil Brown @ 2008-05-19 23:52 UTC (permalink / raw)
  To: Greg Banks; +Cc: linux-nfs, nfsv4, Jeff Layton

On Monday May 19, gnb@melbourne.sgi.com wrote:
> I think we also need to protect those against svc_create_pooled(). I
> tried this a few weeks back in an attempt to deal with a customer's
> problem, but gave up in disgust when the obvious changes resulted in the
> nfsds self-deadlocking during startup.  The svc_serv was being created
> as a side effect of the first write to a /proc/fs/nfsd/ file, and the
> locking was very very confused.  I think it would be great if you gave
> that code some care and attention.

What was confused???

Here is a quick-and-dirty conversion to mutexs.  It starts and stops
threads without any deadlocking or lockdep warnings.  I haven't tried
to synthesise any races so I cannot promise it is perfect, but it
feels close at least.

You are correct of course, svc_create_pooled is one of the bits that
needs protection.

NeilBrown


Replace lock_kernel with a mutex for nfsd thread startup/shutdown locking.


Signed-off-by: Neil Brown <neilb@suse.de>

### Diffstat output
 ./fs/nfsd/nfsctl.c |   39 +++++++++++++++++++++++++--------------
 ./fs/nfsd/nfssvc.c |   28 ++++++++++++++++------------
 2 files changed, 41 insertions(+), 26 deletions(-)

diff .prev/fs/nfsd/nfsctl.c ./fs/nfsd/nfsctl.c
--- .prev/fs/nfsd/nfsctl.c	2008-05-20 09:49:52.000000000 +1000
+++ ./fs/nfsd/nfsctl.c	2008-05-20 09:50:25.000000000 +1000
@@ -441,6 +441,8 @@ static ssize_t write_threads(struct file
 	return strlen(buf);
 }
 
+extern struct mutex nfsd_mutex;
+
 static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
 {
 	/* if size > 0, look for an array of number of threads per node
@@ -450,22 +452,26 @@ static ssize_t write_pool_threads(struct
 	int i;
 	int rv;
 	int len;
-    	int npools = nfsd_nrpools();
+	int npools;
 	int *nthreads;
 
+	mutex_lock(&nfsd_mutex);
+	npools = nfsd_nrpools();
 	if (npools == 0) {
 		/*
 		 * NFS is shut down.  The admin can start it by
 		 * writing to the threads file but NOT the pool_threads
 		 * file, sorry.  Report zero threads.
 		 */
+		mutex_unlock(&nfsd_mutex);
 		strcpy(buf, "0\n");
 		return strlen(buf);
 	}
 
 	nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL);
+	rv = -ENOMEM;
 	if (nthreads == NULL)
-		return -ENOMEM;
+		goto out_free;
 
 	if (size > 0) {
 		for (i = 0; i < npools; i++) {
@@ -496,10 +502,12 @@ static ssize_t write_pool_threads(struct
 		mesg += len;
 	}
 
+	mutex_unlock(&nfsd_mutex);
 	return (mesg-buf);
 
 out_free:
 	kfree(nthreads);
+	mutex_unlock(&nfsd_mutex);
 	return rv;
 }
 
@@ -566,14 +574,13 @@ static ssize_t write_versions(struct fil
 	return len;
 }
 
-static ssize_t write_ports(struct file *file, char *buf, size_t size)
+static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 {
 	if (size == 0) {
 		int len = 0;
-		lock_kernel();
+
 		if (nfsd_serv)
 			len = svc_xprt_names(nfsd_serv, buf, 0);
-		unlock_kernel();
 		return len;
 	}
 	/* Either a single 'fd' number is written, in which
@@ -603,9 +610,7 @@ static ssize_t write_ports(struct file *
 			/* Decrease the count, but don't shutdown the
 			 * the service
 			 */
-			lock_kernel();
 			nfsd_serv->sv_nrthreads--;
-			unlock_kernel();
 		}
 		return err < 0 ? err : 0;
 	}
@@ -614,10 +619,8 @@ static ssize_t write_ports(struct file *
 		int len = 0;
 		if (!toclose)
 			return -ENOMEM;
-		lock_kernel();
 		if (nfsd_serv)
 			len = svc_sock_names(buf, nfsd_serv, toclose);
-		unlock_kernel();
 		if (len >= 0)
 			lockd_down();
 		kfree(toclose);
@@ -655,7 +658,6 @@ static ssize_t write_ports(struct file *
 		if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
 			if (port == 0)
 				return -EINVAL;
-			lock_kernel();
 			if (nfsd_serv) {
 				xprt = svc_find_xprt(nfsd_serv, transport,
 						     AF_UNSPEC, port);
@@ -666,13 +668,22 @@ static ssize_t write_ports(struct file *
 				} else
 					err = -ENOTCONN;
 			}
-			unlock_kernel();
 			return err < 0 ? err : 0;
 		}
 	}
 	return -EINVAL;
 }
 
+static ssize_t write_ports(struct file *file, char *buf, size_t size)
+{
+	ssize_t rv;
+	mutex_lock(&nfsd_mutex);
+	rv = __write_ports(file, buf, size);
+	mutex_unlock(&nfsd_mutex);
+	return rv;
+}
+
+
 int nfsd_max_blksize;
 
 static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
@@ -691,13 +702,13 @@ static ssize_t write_maxblksize(struct f
 		if (bsize > NFSSVC_MAXBLKSIZE)
 			bsize = NFSSVC_MAXBLKSIZE;
 		bsize &= ~(1024-1);
-		lock_kernel();
+		mutex_lock(&nfsd_mutex);
 		if (nfsd_serv && nfsd_serv->sv_nrthreads) {
-			unlock_kernel();
+			mutex_unlock(&nfsd_mutex);
 			return -EBUSY;
 		}
 		nfsd_max_blksize = bsize;
-		unlock_kernel();
+		mutex_unlock(&nfsd_mutex);
 	}
 	return sprintf(buf, "%d\n", nfsd_max_blksize);
 }

diff .prev/fs/nfsd/nfssvc.c ./fs/nfsd/nfssvc.c
--- .prev/fs/nfsd/nfssvc.c	2008-05-20 09:49:52.000000000 +1000
+++ ./fs/nfsd/nfssvc.c	2008-05-20 09:47:59.000000000 +1000
@@ -190,13 +190,15 @@ void nfsd_reset_versions(void)
 	}
 }
 
+DEFINE_MUTEX(nfsd_mutex);
+
 int nfsd_create_serv(void)
 {
 	int err = 0;
-	lock_kernel();
+
+	WARN_ON(!mutex_is_locked(&nfsd_mutex));
 	if (nfsd_serv) {
 		svc_get(nfsd_serv);
-		unlock_kernel();
 		return 0;
 	}
 	if (nfsd_max_blksize == 0) {
@@ -223,7 +225,7 @@ int nfsd_create_serv(void)
 				      nfsd, SIG_NOCLEAN, THIS_MODULE);
 	if (nfsd_serv == NULL)
 		err = -ENOMEM;
-	unlock_kernel();
+
 	do_gettimeofday(&nfssvc_boot);		/* record boot time */
 	return err;
 }
@@ -282,6 +284,8 @@ int nfsd_set_nrthreads(int n, int *nthre
 	int tot = 0;
 	int err = 0;
 
+	WARN_ON(!mutex_is_locked(&nfsd_mutex));
+
 	if (nfsd_serv == NULL || n <= 0)
 		return 0;
 
@@ -316,7 +320,6 @@ int nfsd_set_nrthreads(int n, int *nthre
 		nthreads[0] = 1;
 
 	/* apply the new numbers */
-	lock_kernel();
 	svc_get(nfsd_serv);
 	for (i = 0; i < n; i++) {
 		err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i],
@@ -325,7 +328,6 @@ int nfsd_set_nrthreads(int n, int *nthre
 			break;
 	}
 	svc_destroy(nfsd_serv);
-	unlock_kernel();
 
 	return err;
 }
@@ -334,8 +336,8 @@ int
 nfsd_svc(unsigned short port, int nrservs)
 {
 	int	error;
-	
-	lock_kernel();
+
+	mutex_lock(&nfsd_mutex);
 	dprintk("nfsd: creating service\n");
 	error = -EINVAL;
 	if (nrservs <= 0)
@@ -363,7 +365,7 @@ nfsd_svc(unsigned short port, int nrserv
  failure:
 	svc_destroy(nfsd_serv);		/* Release server */
  out:
-	unlock_kernel();
+	mutex_unlock(&nfsd_mutex);
 	return error;
 }
 
@@ -399,7 +401,7 @@ nfsd(struct svc_rqst *rqstp)
 	sigset_t shutdown_mask, allowed_mask;
 
 	/* Lock module and set up kernel thread */
-	lock_kernel();
+	mutex_lock(&nfsd_mutex);
 	daemonize("nfsd");
 
 	/* After daemonize() this kernel thread shares current->fs
@@ -417,11 +419,13 @@ nfsd(struct svc_rqst *rqstp)
 	siginitsetinv(&shutdown_mask, SHUTDOWN_SIGS);
 	siginitsetinv(&allowed_mask, ALLOWED_SIGS);
 
+
 	nfsdstats.th_cnt++;
 
 	rqstp->rq_task = current;
 
-	unlock_kernel();
+	mutex_unlock(&nfsd_mutex);
+
 
 	/*
 	 * We want less throttling in balance_dirty_pages() so that nfs to
@@ -477,7 +481,7 @@ nfsd(struct svc_rqst *rqstp)
 	/* Clear signals before calling svc_exit_thread() */
 	flush_signals(current);
 
-	lock_kernel();
+	mutex_lock(&nfsd_mutex);
 
 	nfsdstats.th_cnt --;
 
@@ -486,7 +490,7 @@ out:
 	svc_exit_thread(rqstp);
 
 	/* Release module */
-	unlock_kernel();
+	mutex_unlock(&nfsd_mutex);
 	module_put_and_exit(0);
 }

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
       [not found]         ` <18482.4782.858347.981553-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
@ 2008-05-20  2:04           ` Greg Banks
  0 siblings, 0 replies; 26+ messages in thread
From: Greg Banks @ 2008-05-20  2:04 UTC (permalink / raw)
  To: Neil Brown; +Cc: Jeff Layton, linux-nfs, nfsv4

Neil Brown wrote:

>  
> +static ssize_t write_ports(struct file *file, char *buf, size_t size)
> +{
> +	ssize_t rv;
> +	mutex_lock(&nfsd_mutex);
> +	rv = __write_ports(file, buf, size);
> +	mutex_unlock(&nfsd_mutex);
> +	return rv;
> +}
>   
Oooh, much better.  I was foolishly trying to fix all the lock_kernels()
in the original write_ports().

There are some comments in net/sunrpc/svc.c referring to callers needing
to own the BKL, which will need updating also. Otherwise, looks good :-)
Perhaps the trouble I was having was in the older codebase only, or
because I didn't pull the locking out far enough.

-- 
Greg Banks, P.Engineer, SGI Australian Software Group.
The cake is *not* a lie.
I don't speak for SGI.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
  2008-05-19 23:52       ` Neil Brown
       [not found]         ` <18482.4782.858347.981553-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
@ 2008-05-20  2:24         ` Jeff Layton
       [not found]           ` <20080519222457.6f24daa5-PC62bkCOHzGdMjc06nkz3ljfA9RmPOcC@public.gmane.org>
  2008-05-20  3:13           ` Neil Brown
  1 sibling, 2 replies; 26+ messages in thread
From: Jeff Layton @ 2008-05-20  2:24 UTC (permalink / raw)
  To: Neil Brown; +Cc: linux-nfs, nfsv4, Greg Banks

On Tue, 20 May 2008 09:52:14 +1000
Neil Brown <neilb@suse.de> wrote:

> On Monday May 19, gnb@melbourne.sgi.com wrote:
> > I think we also need to protect those against svc_create_pooled(). I
> > tried this a few weeks back in an attempt to deal with a customer's
> > problem, but gave up in disgust when the obvious changes resulted in the
> > nfsds self-deadlocking during startup.  The svc_serv was being created
> > as a side effect of the first write to a /proc/fs/nfsd/ file, and the
> > locking was very very confused.  I think it would be great if you gave
> > that code some care and attention.
> 
> What was confused???
> 

> Here is a quick-and-dirty conversion to mutexs.  It starts and stops
> threads without any deadlocking or lockdep warnings.  I haven't tried
> to synthesise any races so I cannot promise it is perfect, but it
> feels close at least.
> 
> You are correct of course, svc_create_pooled is one of the bits that
> needs protection.
> 
> NeilBrown
> 
> 

I find all of the locking in rpc/nlm/nfs/nfsd *very* confusing. It's
always been explained to me that it's best to not use locking to
protect a section of code, but rather data structures. This makes it
easier to get the locking right when the code changes.

This is the problem we have now with the BKL. So much of
rpc/nlm/nfs/nfsd runs under it that it's nearly impossible to tell what
it's intended to actually protect. If we're going to start a push
toward BKL removal, my humble request is that we try to be as explicit
as possible about what locks protect what data structures.

So that's my question about this patch -- exactly what data is this new
mutex intended to protect?


> Replace lock_kernel with a mutex for nfsd thread startup/shutdown locking.
> 
> 
> Signed-off-by: Neil Brown <neilb@suse.de>
> 
> ### Diffstat output
>  ./fs/nfsd/nfsctl.c |   39 +++++++++++++++++++++++++--------------
>  ./fs/nfsd/nfssvc.c |   28 ++++++++++++++++------------
>  2 files changed, 41 insertions(+), 26 deletions(-)
> 
> diff .prev/fs/nfsd/nfsctl.c ./fs/nfsd/nfsctl.c
> --- .prev/fs/nfsd/nfsctl.c	2008-05-20 09:49:52.000000000 +1000
> +++ ./fs/nfsd/nfsctl.c	2008-05-20 09:50:25.000000000 +1000
> @@ -441,6 +441,8 @@ static ssize_t write_threads(struct file
>  	return strlen(buf);
>  }
>  
> +extern struct mutex nfsd_mutex;
> +
>  static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
>  {
>  	/* if size > 0, look for an array of number of threads per node
> @@ -450,22 +452,26 @@ static ssize_t write_pool_threads(struct
>  	int i;
>  	int rv;
>  	int len;
> -    	int npools = nfsd_nrpools();
> +	int npools;
>  	int *nthreads;
>  
> +	mutex_lock(&nfsd_mutex);
> +	npools = nfsd_nrpools();
>  	if (npools == 0) {
>  		/*
>  		 * NFS is shut down.  The admin can start it by
>  		 * writing to the threads file but NOT the pool_threads
>  		 * file, sorry.  Report zero threads.
>  		 */
> +		mutex_unlock(&nfsd_mutex);
>  		strcpy(buf, "0\n");
>  		return strlen(buf);
>  	}
>  
>  	nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL);
> +	rv = -ENOMEM;
>  	if (nthreads == NULL)
> -		return -ENOMEM;
> +		goto out_free;
>  
>  	if (size > 0) {
>  		for (i = 0; i < npools; i++) {
> @@ -496,10 +502,12 @@ static ssize_t write_pool_threads(struct
>  		mesg += len;
>  	}
>  
> +	mutex_unlock(&nfsd_mutex);
>  	return (mesg-buf);
>  
>  out_free:
>  	kfree(nthreads);
> +	mutex_unlock(&nfsd_mutex);
>  	return rv;
>  }
>  
> @@ -566,14 +574,13 @@ static ssize_t write_versions(struct fil
>  	return len;
>  }
>  
> -static ssize_t write_ports(struct file *file, char *buf, size_t size)
> +static ssize_t __write_ports(struct file *file, char *buf, size_t size)
>  {
>  	if (size == 0) {
>  		int len = 0;
> -		lock_kernel();
> +
>  		if (nfsd_serv)
>  			len = svc_xprt_names(nfsd_serv, buf, 0);
> -		unlock_kernel();
>  		return len;
>  	}
>  	/* Either a single 'fd' number is written, in which
> @@ -603,9 +610,7 @@ static ssize_t write_ports(struct file *
>  			/* Decrease the count, but don't shutdown the
>  			 * the service
>  			 */
> -			lock_kernel();
>  			nfsd_serv->sv_nrthreads--;
> -			unlock_kernel();
>  		}
>  		return err < 0 ? err : 0;
>  	}
> @@ -614,10 +619,8 @@ static ssize_t write_ports(struct file *
>  		int len = 0;
>  		if (!toclose)
>  			return -ENOMEM;
> -		lock_kernel();
>  		if (nfsd_serv)
>  			len = svc_sock_names(buf, nfsd_serv, toclose);
> -		unlock_kernel();
>  		if (len >= 0)
>  			lockd_down();
>  		kfree(toclose);
> @@ -655,7 +658,6 @@ static ssize_t write_ports(struct file *
>  		if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
>  			if (port == 0)
>  				return -EINVAL;
> -			lock_kernel();
>  			if (nfsd_serv) {
>  				xprt = svc_find_xprt(nfsd_serv, transport,
>  						     AF_UNSPEC, port);
> @@ -666,13 +668,22 @@ static ssize_t write_ports(struct file *
>  				} else
>  					err = -ENOTCONN;
>  			}
> -			unlock_kernel();
>  			return err < 0 ? err : 0;
>  		}
>  	}
>  	return -EINVAL;
>  }
>  
> +static ssize_t write_ports(struct file *file, char *buf, size_t size)
> +{
> +	ssize_t rv;
> +	mutex_lock(&nfsd_mutex);
> +	rv = __write_ports(file, buf, size);
> +	mutex_unlock(&nfsd_mutex);
> +	return rv;
> +}
> +
> +
>  int nfsd_max_blksize;
>  
>  static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
> @@ -691,13 +702,13 @@ static ssize_t write_maxblksize(struct f
>  		if (bsize > NFSSVC_MAXBLKSIZE)
>  			bsize = NFSSVC_MAXBLKSIZE;
>  		bsize &= ~(1024-1);
> -		lock_kernel();
> +		mutex_lock(&nfsd_mutex);
>  		if (nfsd_serv && nfsd_serv->sv_nrthreads) {
> -			unlock_kernel();
> +			mutex_unlock(&nfsd_mutex);
>  			return -EBUSY;
>  		}
>  		nfsd_max_blksize = bsize;
> -		unlock_kernel();
> +		mutex_unlock(&nfsd_mutex);
>  	}
>  	return sprintf(buf, "%d\n", nfsd_max_blksize);
>  }
> 
> diff .prev/fs/nfsd/nfssvc.c ./fs/nfsd/nfssvc.c
> --- .prev/fs/nfsd/nfssvc.c	2008-05-20 09:49:52.000000000 +1000
> +++ ./fs/nfsd/nfssvc.c	2008-05-20 09:47:59.000000000 +1000
> @@ -190,13 +190,15 @@ void nfsd_reset_versions(void)
>  	}
>  }
>  
> +DEFINE_MUTEX(nfsd_mutex);
> +
>  int nfsd_create_serv(void)
>  {
>  	int err = 0;
> -	lock_kernel();
> +
> +	WARN_ON(!mutex_is_locked(&nfsd_mutex));
>  	if (nfsd_serv) {
>  		svc_get(nfsd_serv);
> -		unlock_kernel();
>  		return 0;
>  	}
>  	if (nfsd_max_blksize == 0) {
> @@ -223,7 +225,7 @@ int nfsd_create_serv(void)
>  				      nfsd, SIG_NOCLEAN, THIS_MODULE);
>  	if (nfsd_serv == NULL)
>  		err = -ENOMEM;
> -	unlock_kernel();
> +
>  	do_gettimeofday(&nfssvc_boot);		/* record boot time */
>  	return err;
>  }
> @@ -282,6 +284,8 @@ int nfsd_set_nrthreads(int n, int *nthre
>  	int tot = 0;
>  	int err = 0;
>  
> +	WARN_ON(!mutex_is_locked(&nfsd_mutex));
> +
>  	if (nfsd_serv == NULL || n <= 0)
>  		return 0;
>  
> @@ -316,7 +320,6 @@ int nfsd_set_nrthreads(int n, int *nthre
>  		nthreads[0] = 1;
>  
>  	/* apply the new numbers */
> -	lock_kernel();
>  	svc_get(nfsd_serv);
>  	for (i = 0; i < n; i++) {
>  		err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i],
> @@ -325,7 +328,6 @@ int nfsd_set_nrthreads(int n, int *nthre
>  			break;
>  	}
>  	svc_destroy(nfsd_serv);
> -	unlock_kernel();
>  
>  	return err;
>  }
> @@ -334,8 +336,8 @@ int
>  nfsd_svc(unsigned short port, int nrservs)
>  {
>  	int	error;
> -	
> -	lock_kernel();
> +
> +	mutex_lock(&nfsd_mutex);
>  	dprintk("nfsd: creating service\n");
>  	error = -EINVAL;
>  	if (nrservs <= 0)
> @@ -363,7 +365,7 @@ nfsd_svc(unsigned short port, int nrserv
>   failure:
>  	svc_destroy(nfsd_serv);		/* Release server */
>   out:
> -	unlock_kernel();
> +	mutex_unlock(&nfsd_mutex);
>  	return error;
>  }
>  
> @@ -399,7 +401,7 @@ nfsd(struct svc_rqst *rqstp)
>  	sigset_t shutdown_mask, allowed_mask;
>  
>  	/* Lock module and set up kernel thread */
> -	lock_kernel();
> +	mutex_lock(&nfsd_mutex);
>  	daemonize("nfsd");
>  
>  	/* After daemonize() this kernel thread shares current->fs
> @@ -417,11 +419,13 @@ nfsd(struct svc_rqst *rqstp)
>  	siginitsetinv(&shutdown_mask, SHUTDOWN_SIGS);
>  	siginitsetinv(&allowed_mask, ALLOWED_SIGS);
>  
> +
>  	nfsdstats.th_cnt++;
>  
>  	rqstp->rq_task = current;
>  
> -	unlock_kernel();
> +	mutex_unlock(&nfsd_mutex);
> +
>  
>  	/*
>  	 * We want less throttling in balance_dirty_pages() so that nfs to
> @@ -477,7 +481,7 @@ nfsd(struct svc_rqst *rqstp)
>  	/* Clear signals before calling svc_exit_thread() */
>  	flush_signals(current);
>  
> -	lock_kernel();
> +	mutex_lock(&nfsd_mutex);
>  
>  	nfsdstats.th_cnt --;
>  
> @@ -486,7 +490,7 @@ out:
>  	svc_exit_thread(rqstp);
>  
>  	/* Release module */
> -	unlock_kernel();
> +	mutex_unlock(&nfsd_mutex);
>  	module_put_and_exit(0);
>  }
>  
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
       [not found]           ` <20080519222457.6f24daa5-PC62bkCOHzGdMjc06nkz3ljfA9RmPOcC@public.gmane.org>
@ 2008-05-20  2:34             ` Greg Banks
  2008-05-20 11:05               ` Jeff Layton
       [not found]             ` <483238B3.4010702-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
  1 sibling, 1 reply; 26+ messages in thread
From: Greg Banks @ 2008-05-20  2:34 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Neil Brown, linux-nfs, nfsv4

Jeff Layton wrote:
>
> This is the problem we have now with the BKL. So much of
> rpc/nlm/nfs/nfsd runs under it that it's nearly impossible to tell what
> it's intended to actually protect. 
Practically none of nfsd runs under BKL, except the startup and shutdown
sequences and some of the write handlers for /proc/fs/nfsd/.
> If we're going to start a push
> toward BKL removal, my humble request is that we try to be as explicit
> as possible about what locks protect what data structures.
>   
See comments in net/sunrpc/svcsock.c and net/sunrpc/svc.c.  Here the BKL
is protecting the global nfsd_serv pointer.

-- 
Greg Banks, P.Engineer, SGI Australian Software Group.
The cake is *not* a lie.
I don't speak for SGI.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
  2008-05-20  2:24         ` Jeff Layton
       [not found]           ` <20080519222457.6f24daa5-PC62bkCOHzGdMjc06nkz3ljfA9RmPOcC@public.gmane.org>
@ 2008-05-20  3:13           ` Neil Brown
  2008-05-20 11:13             ` Jeff Layton
       [not found]             ` <18482.16837.381955.636390-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
  1 sibling, 2 replies; 26+ messages in thread
From: Neil Brown @ 2008-05-20  3:13 UTC (permalink / raw)
  To: Jeff Layton; +Cc: linux-nfs, nfsv4, Greg Banks

On Monday May 19, jlayton@redhat.com wrote:
> 
> I find all of the locking in rpc/nlm/nfs/nfsd *very* confusing. It's
> always been explained to me that it's best to not use locking to
> protect a section of code, but rather data structures. This makes it
> easier to get the locking right when the code changes.

Absolutely.

> 
> This is the problem we have now with the BKL. So much of
> rpc/nlm/nfs/nfsd runs under it that it's nearly impossible to tell what
> it's intended to actually protect. If we're going to start a push
> toward BKL removal, my humble request is that we try to be as explicit
> as possible about what locks protect what data structures.
> 
> So that's my question about this patch -- exactly what data is this new
> mutex intended to protect?
> 

Fair question.
The new nfsd_mutex primarily protects "nfsd_serv", both the pointer
itself and various members of the structure that it sometimes points
to.
In particular, ->sv_nrthreads but also to some extent sv_temp_socks
and sv_permsocks.
Also nfsdstats.th_cnt.

If (out side the lock) nfsd_serv is non-NULL, then it must point to
a properly initialised 'struct svc_serv' with ->sv_nrthreads > 0, and
that number of nfsd threads must exist and each must listed in
->sp_all_threads for exactly on ->sv_pools[].

The particular thing that the lock is protecting is transitions
between 0 and non-0.  On these transitions, the svc_serv struct needs
to be either created and initialised, or freed up.

Having said all that, I think I see a race.
When a new rqst is created at put on sp_all_threads, ->rq_task it not
set and doesn't get set until the thread runs and gets the mutex.  So
there is a brief hole when ->rq_task isn't set.  I don't know if that
can cause a problem, but it feels wrong.
When you switch to kthreads, you use the fact that kthread_create
returns a task_struct, and assign that to ->rq_task in
__svc_create_thread instead of nfsd, which will close that hole.

If you look in nfsctl.c, you will probably be able to find plenty of
places where nfsd_serv is dereferenced without any locking.  These are
all wrong and need fixing.

NeilBrown

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
  2008-05-20  2:34             ` Greg Banks
@ 2008-05-20 11:05               ` Jeff Layton
  0 siblings, 0 replies; 26+ messages in thread
From: Jeff Layton @ 2008-05-20 11:05 UTC (permalink / raw)
  To: Greg Banks; +Cc: linux-nfs, nfsv4

On Mon, 19 May 2008 19:34:27 -0700
Greg Banks <gnb@melbourne.sgi.com> wrote:

> Jeff Layton wrote:
> >
> > This is the problem we have now with the BKL. So much of
> > rpc/nlm/nfs/nfsd runs under it that it's nearly impossible to tell what
> > it's intended to actually protect. 
> Practically none of nfsd runs under BKL, except the startup and shutdown
> sequences and some of the write handlers for /proc/fs/nfsd/.

You're quite correct. I was thinking mainly of lockd (which runs
entirely under the BKL) and large swaths of NFS client and RPC code
which also do. It looks like nfsd doesn't do as much under the BKL (and
that is definitely a good thing!).

> > If we're going to start a push
> > toward BKL removal, my humble request is that we try to be as explicit
> > as possible about what locks protect what data structures.
> >   
> See comments in net/sunrpc/svcsock.c and net/sunrpc/svc.c.  Here the BKL
> is protecting the global nfsd_serv pointer.
> 

Ok. Good to know. When I do the kthread conversion I'll make sure
that's protected, though with Neil's mutex patch I shouldn't need to
change much of the locking, if any.

-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
  2008-05-20  3:13           ` Neil Brown
@ 2008-05-20 11:13             ` Jeff Layton
       [not found]             ` <18482.16837.381955.636390-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
  1 sibling, 0 replies; 26+ messages in thread
From: Jeff Layton @ 2008-05-20 11:13 UTC (permalink / raw)
  To: Neil Brown; +Cc: linux-nfs, nfsv4, Greg Banks

On Tue, 20 May 2008 13:13:09 +1000
Neil Brown <neilb@suse.de> wrote:

> On Monday May 19, jlayton@redhat.com wrote:
> > 
> > I find all of the locking in rpc/nlm/nfs/nfsd *very* confusing. It's
> > always been explained to me that it's best to not use locking to
> > protect a section of code, but rather data structures. This makes it
> > easier to get the locking right when the code changes.
> 
> Absolutely.
> 
> > 
> > This is the problem we have now with the BKL. So much of
> > rpc/nlm/nfs/nfsd runs under it that it's nearly impossible to tell what
> > it's intended to actually protect. If we're going to start a push
> > toward BKL removal, my humble request is that we try to be as explicit
> > as possible about what locks protect what data structures.
> > 
> > So that's my question about this patch -- exactly what data is this new
> > mutex intended to protect?
> > 
> 
> Fair question.
> The new nfsd_mutex primarily protects "nfsd_serv", both the pointer
> itself and various members of the structure that it sometimes points
> to.
> In particular, ->sv_nrthreads but also to some extent sv_temp_socks
> and sv_permsocks.
> Also nfsdstats.th_cnt.
> 
> If (out side the lock) nfsd_serv is non-NULL, then it must point to
> a properly initialised 'struct svc_serv' with ->sv_nrthreads > 0, and
> that number of nfsd threads must exist and each must listed in
> ->sp_all_threads for exactly on ->sv_pools[].
> 
> The particular thing that the lock is protecting is transitions
> between 0 and non-0.  On these transitions, the svc_serv struct needs
> to be either created and initialised, or freed up.
> 

Thanks. That makes sense. It might not hurt to sprinkle in some
comments to this effect to help others who come along later. Also,
cleaning up the comments referencing the BKL as Greg suggests would
also be good...

> Having said all that, I think I see a race.
> When a new rqst is created at put on sp_all_threads, ->rq_task it not
> set and doesn't get set until the thread runs and gets the mutex.  So
> there is a brief hole when ->rq_task isn't set.  I don't know if that
> can cause a problem, but it feels wrong.

Yes, seems like that's depending on scheduler behavior so it could be
racy if that changes.

> When you switch to kthreads, you use the fact that kthread_create
> returns a task_struct, and assign that to ->rq_task in
> __svc_create_thread instead of nfsd, which will close that hole.
> 

My original patch used that to set the rq_task, so I'll plan to do that
for the respin too. 

> If you look in nfsctl.c, you will probably be able to find plenty of
> places where nfsd_serv is dereferenced without any locking.  These are
> all wrong and need fixing.
> 

I'll keep an eye out for them. Since you're working on this locking
patch, I'll plan to resubmit the kthread conversion after you've got
it complete so I can base it on top of it.

Thanks,
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
       [not found]             ` <483238B3.4010702-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
@ 2008-05-20 13:33               ` Talpey, Thomas
  0 siblings, 0 replies; 26+ messages in thread
From: Talpey, Thomas @ 2008-05-20 13:33 UTC (permalink / raw)
  To: Greg Banks; +Cc: Jeff Layton, linux-nfs, nfsv4

At 10:34 PM 5/19/2008, Greg Banks wrote:
>Practically none of nfsd runs under BKL, except the startup and shutdown
>sequences and some of the write handlers for /proc/fs/nfsd/.

Locking and NFSv4 delegations.

In my opinion, getting rid of the BKL in NFS comes down to getting
locking right. It's incredibly intertwined, and it's not NFS's fault.

Tom.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
       [not found]             ` <18482.16837.381955.636390-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
@ 2008-05-20 20:26               ` Greg Banks
  0 siblings, 0 replies; 26+ messages in thread
From: Greg Banks @ 2008-05-20 20:26 UTC (permalink / raw)
  To: Neil Brown; +Cc: Jeff Layton, linux-nfs, nfsv4

Neil Brown wrote:
>
> Fair question.
> The new nfsd_mutex primarily protects "nfsd_serv", both the pointer
> itself and various members of the structure that it sometimes points
> to.
> In particular, ->sv_nrthreads but also to some extent sv_temp_socks
> and sv_permsocks.
>   
Those two fields should be guarded by svc_serv->sv_lock only.  In fact
IIRC the only field of svc_serv guarded by the global mutex is
sv_nrthreads in it's role as pseudo-refcount.

> Having said all that, I think I see a race.
> When a new rqst is created at put on sp_all_threads, ->rq_task it not
> set and doesn't get set until the thread runs and gets the mutex.  So
> there is a brief hole when ->rq_task isn't set.  I don't know if that
> can cause a problem, but it feels wrong.
> When you switch to kthreads, you use the fact that kthread_create
> returns a task_struct, and assign that to ->rq_task in
> __svc_create_thread instead of nfsd, which will close that hole.
>   
Yes, that aspect of Jeff's patch is a definite improvement.
> If you look in nfsctl.c, you will probably be able to find plenty of
> places where nfsd_serv is dereferenced without any locking.  These are
> all wrong and need fixing.
>   
Agreed!

-- 
Greg Banks, P.Engineer, SGI Australian Software Group.
The cake is *not* a lie.
I don't speak for SGI.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
  2008-05-19 22:00     ` Greg Banks
  2008-05-19 23:52       ` Neil Brown
@ 2008-05-20 20:36       ` Greg Banks
       [not found]         ` <4833364A.4010803-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
  1 sibling, 1 reply; 26+ messages in thread
From: Greg Banks @ 2008-05-20 20:36 UTC (permalink / raw)
  To: Jeff Layton; +Cc: linux-nfs, nfsv4

[-- Attachment #1: Type: text/plain, Size: 606 bytes --]

Greg Banks wrote:
> [...]
> I'll send out the patch I mentioned when we
> chatted at Cthon, maybe you can consider the issue it fixes.
>
>   
Here's the patch.  You probably don't want to do it the same way, but
the intent is to delay the allocation of the svc_rqst structure until
the nfsd is running on the correct cpu so that the allocation comes from
the correct local memory on NUMA systems.  Absent the patch, most of the
nfsds spend a lot of time updating remote memory allocated on node0.

-- 
Greg Banks, P.Engineer, SGI Australian Software Group.
The cake is *not* a lie.
I don't speak for SGI.


[-- Attachment #2: knfsd-allocate-svc-rqst-and-pages-on-correct-nodes-4 --]
[-- Type: text/plain, Size: 4068 bytes --]

When allocating the per-thread NFS server data structures, ensure
the allocation proceeds on the NUMA node where the thread will then
spend all of its life.  This reduces the amount of cross-node traffic
needed to run NFS at high traffic rates on large NUMA machines.

Signed-off-by: Greg Banks <gnb@melbourne.sgi.com>
---

 net/sunrpc/svc.c |   94 ++++++++++++++++++++++++++++++++------------
 1 file changed, 70 insertions(+), 24 deletions(-)

Index: linux-2.6.16/net/sunrpc/svc.c
===================================================================
--- linux-2.6.16.orig/net/sunrpc/svc.c
+++ linux-2.6.16/net/sunrpc/svc.c
@@ -538,30 +538,43 @@ svc_release_buffer(struct svc_rqst *rqst
 	rqstp->rq_argused = 0;
 }
 
+struct svc_create_thread_rec
+{
+	svc_thread_fn func;
+	struct svc_serv *serv;
+	struct svc_pool *pool;
+	struct completion comp;
+};
+
 /*
- * Create a thread in the given pool.  Caller must hold BKL.
- * On a NUMA or SMP machine, with a multi-pool serv, the thread
- * will be restricted to run on the cpus belonging to the pool.
+ * Trampoline function called as the thread function for new nfsd threads.
+ * Performs all the per-thread initialisation in the thread's own context,
+ * so that the thread's memory policy and cpus_allowed mask govern the
+ * allocation of the per-thread structures and the buffer pages used
+ * to do network and disk IO.  For large NUMA machines with a lot of
+ * network and disk bandwidth, getting this page placement right can
+ * double system throughput.
  */
-static int
-__svc_create_thread(svc_thread_fn func, struct svc_serv *serv, struct svc_pool *pool)
+static int __svc_create_thread_tramp(void *data)
 {
+	struct svc_create_thread_rec *rec = data;
+	struct svc_serv *serv = rec->serv;
+	struct svc_pool *pool = rec->pool;
+	int error = -ENOMEM;
 	struct svc_rqst	*rqstp;
-	int		error = -ENOMEM;
-	int		have_oldmask = 0;
-	cpumask_t	oldmask;
 
-	rqstp = kmalloc(sizeof(*rqstp), GFP_KERNEL);
+	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
 	if (!rqstp)
 		goto out;
 
-	memset(rqstp, 0, sizeof(*rqstp));
 	init_waitqueue_head(&rqstp->rq_wait);
 
-	if (!(rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL))
-	 || !(rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL))
-	 || !svc_init_buffer(rqstp, serv->sv_bufsz))
-		goto out_thread;
+	if (!(rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL)))
+		goto out_argp;
+	if (!(rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL)))
+		goto out_resp;
+	if (!svc_init_buffer(rqstp, serv->sv_bufsz))
+		goto out_buffer;
 
 	serv->sv_nrthreads++;
 	pool->sp_nrthreads++;
@@ -571,24 +584,57 @@ __svc_create_thread(svc_thread_fn func, 
 	rqstp->rq_server = serv;
 	rqstp->rq_pool = pool;
 
+	svc_sock_update_bufs(serv);
+
+	complete(&rec->comp);
+
+	rec->func(rqstp);
+	return 0;
+
+out_buffer:
+	svc_release_buffer(rqstp);
+	kfree(rqstp->rq_resp);
+out_resp:
+	kfree(rqstp->rq_argp);
+out_argp:
+	kfree(rqstp);
+out:
+	complete(&rec->comp);
+	return error;
+}
+
+/*
+ * Create a thread in the given pool.  Caller must hold BKL.
+ * On a NUMA or SMP machine, with a multi-pool serv, the thread
+ * will be restricted to run on the cpus belonging to the pool.
+ */
+static int
+__svc_create_thread(svc_thread_fn func, struct svc_serv *serv, struct svc_pool *pool)
+{
+	int		error = -ENOMEM;
+	int		have_oldmask = 0;
+	cpumask_t	oldmask;
+	struct svc_create_thread_rec rec = {
+				.func = func,
+				.serv = serv,
+				.pool = pool
+			};
+	init_completion(&rec.comp);
+
 	if (serv->sv_nrpools > 1)
 		have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
 
-	error = kernel_thread((int (*)(void *)) func, rqstp, 0);
+	error = kernel_thread(__svc_create_thread_tramp, &rec, 0);
+	if (error > 0)
+		error = 0;
 
 	if (have_oldmask)
 		set_cpus_allowed(current, oldmask);
 
-	if (error < 0)
-		goto out_thread;
-	svc_sock_update_bufs(serv);
-	error = 0;
-out:
-	return error;
+	/* wait for the child thread to finish initialising */
+	wait_for_completion(&rec.comp);
 
-out_thread:
-	svc_exit_thread(rqstp);
-	goto out;
+	return error;
 }
 
 /*

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
NFSv4 mailing list
NFSv4@linux-nfs.org
http://linux-nfs.org/cgi-bin/mailman/listinfo/nfsv4

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
       [not found]         ` <4833364A.4010803-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
@ 2008-05-21  1:48           ` Jeff Layton
       [not found]             ` <20080520214823.576ad7a7-RtJpwOs3+0O+kQycOl6kW4xkIHaj4LzF@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff Layton @ 2008-05-21  1:48 UTC (permalink / raw)
  To: Greg Banks; +Cc: Neil Brown, linux-nfs, nfsv4

On Tue, 20 May 2008 13:36:26 -0700
Greg Banks <gnb-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org> wrote:

> Greg Banks wrote:
> > [...]
> > I'll send out the patch I mentioned when we
> > chatted at Cthon, maybe you can consider the issue it fixes.
> >
> >   
> Here's the patch.  You probably don't want to do it the same way, but
> the intent is to delay the allocation of the svc_rqst structure until
> the nfsd is running on the correct cpu so that the allocation comes from
> the correct local memory on NUMA systems.  Absent the patch, most of the
> nfsds spend a lot of time updating remote memory allocated on node0.
> 

Thanks Greg. I'll make sure I incorporate a similar fix in the next
iteration of the patch. Rather than delaying the allocation like this,
I wonder if we can just figure out the node from the cpumask and then
do a kmalloc_node()?

-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
       [not found]             ` <20080520214823.576ad7a7-RtJpwOs3+0O+kQycOl6kW4xkIHaj4LzF@public.gmane.org>
@ 2008-05-21  3:29               ` Greg Banks
       [not found]                 ` <48339730.3060206-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Greg Banks @ 2008-05-21  3:29 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Neil Brown, linux-nfs, nfsv4

Jeff Layton wrote:
>   
> [...] Rather than delaying the allocation like this,
> I wonder if we can just figure out the node from the cpumask and then
> do a kmalloc_node()?
>
>   
Sure.

-- 
Greg Banks, P.Engineer, SGI Australian Software Group.
The cake is *not* a lie.
I don't speak for SGI.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
       [not found]                 ` <48339730.3060206-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
@ 2008-05-30 16:25                   ` Jeff Layton
       [not found]                     ` <20080530122517.4f18c48e-RtJpwOs3+0O+kQycOl6kW4xkIHaj4LzF@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff Layton @ 2008-05-30 16:25 UTC (permalink / raw)
  To: Greg Banks; +Cc: Neil Brown, linux-nfs, nfsv4

On Tue, 20 May 2008 20:29:52 -0700
Greg Banks <gnb-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org> wrote:

> Jeff Layton wrote:
> >   
> > [...] Rather than delaying the allocation like this,
> > I wonder if we can just figure out the node from the cpumask and then
> > do a kmalloc_node()?
> >
> >   
> Sure.
> 

I started to do this patch as part of the kthread conversion, but it's
actually pretty independent. We can probably treat it separately.
This hasn't been tested on an actual NUMA machine yet, but any thoughts
on the following patch?

-----------------[snip]-------------------

>From 46432a021fe1931f4a44587124ac9442e83c4731 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 30 May 2008 11:57:18 -0400
Subject: [PATCH] sunrpc: have pooled services make NUMA-friendly allocations

Currently, svc_prepare_thread allocates memory using plain kmalloc()
and alloc_page() calls, even for threads that are destined to run on
different CPUs or NUMA nodes than the current one. Add a function to
translate a poolid into a NUMA node, and have svc_prepare_thread and
svc_init_buffer allocate memory on those nodes instead.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 net/sunrpc/svc.c |   46 ++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 01c7e31..3985fbc 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -332,6 +332,32 @@ svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
 }
 
 /*
+ * for a given poolid, return the NUMA memory node. This allows us to
+ * allocate memory close to the CPU's where the task will be running
+ */
+static inline unsigned int
+svc_pool_to_node(unsigned int pidx)
+{
+	struct svc_pool_map *m = &svc_pool_map;
+	unsigned int poolnode = m->pool_to[pidx];
+
+	/*
+	 * The caller checks for sv_nrpools > 1, which
+	 * implies that we've been initialized.
+	 */
+	BUG_ON(m->count == 0);
+
+	switch (m->mode) {
+	case SVC_POOL_PERNODE:
+		return poolnode;
+	case SVC_POOL_PERCPU:
+		return cpu_to_node(poolnode);
+	}
+
+	return numa_node_id();
+}
+
+/*
  * Use the mapping mode to choose a pool for a given CPU.
  * Used when enqueueing an incoming RPC.  Always returns
  * a non-NULL pool pointer.
@@ -507,7 +533,7 @@ EXPORT_SYMBOL(svc_destroy);
  * We allocate pages and place them in rq_argpages.
  */
 static int
-svc_init_buffer(struct svc_rqst *rqstp, unsigned int size)
+svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, unsigned int node)
 {
 	unsigned int pages, arghi;
 
@@ -517,7 +543,7 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size)
 	arghi = 0;
 	BUG_ON(pages > RPCSVC_MAXPAGES);
 	while (pages) {
-		struct page *p = alloc_page(GFP_KERNEL);
+		struct page *p = alloc_pages_node(node, GFP_KERNEL, 0);
 		if (!p)
 			break;
 		rqstp->rq_pages[arghi++] = p;
@@ -543,8 +569,14 @@ struct svc_rqst *
 svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool)
 {
 	struct svc_rqst	*rqstp;
+	unsigned int	node;
+
+	if (serv->sv_nrpools > 1)
+		node = svc_pool_to_node(pool->sp_id);
+	else
+		node = numa_node_id();
 
-	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
+	rqstp = kmalloc_node(sizeof(*rqstp), GFP_KERNEL | __GFP_ZERO, node);
 	if (!rqstp)
 		goto out_enomem;
 
@@ -558,15 +590,17 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool)
 	rqstp->rq_server = serv;
 	rqstp->rq_pool = pool;
 
-	rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
+	rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize,
+					GFP_KERNEL | __GFP_ZERO, node);
 	if (!rqstp->rq_argp)
 		goto out_thread;
 
-	rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
+	rqstp->rq_resp = kmalloc_node(serv->sv_xdrsize,
+					GFP_KERNEL | __GFP_ZERO, node);
 	if (!rqstp->rq_resp)
 		goto out_thread;
 
-	if (!svc_init_buffer(rqstp, serv->sv_max_mesg))
+	if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node))
 		goto out_thread;
 
 	return rqstp;
-- 
1.5.3.6


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
       [not found]                     ` <20080530122517.4f18c48e-RtJpwOs3+0O+kQycOl6kW4xkIHaj4LzF@public.gmane.org>
@ 2008-05-30 18:46                       ` J. Bruce Fields
  2008-05-30 20:59                         ` Jeff Layton
  2008-06-02  5:51                       ` Greg Banks
  1 sibling, 1 reply; 26+ messages in thread
From: J. Bruce Fields @ 2008-05-30 18:46 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Greg Banks, linux-nfs, nfsv4

On Fri, May 30, 2008 at 12:25:17PM -0400, Jeff Layton wrote:
> On Tue, 20 May 2008 20:29:52 -0700
> Greg Banks <gnb-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org> wrote:
> 
> > Jeff Layton wrote:
> > >   
> > > [...] Rather than delaying the allocation like this,
> > > I wonder if we can just figure out the node from the cpumask and then
> > > do a kmalloc_node()?
> > >
> > >   
> > Sure.
> > 
> 
> I started to do this patch as part of the kthread conversion, but it's
> actually pretty independent. We can probably treat it separately.
> This hasn't been tested on an actual NUMA machine yet, but any thoughts
> on the following patch?
> 
> -----------------[snip]-------------------
> 
> >From 46432a021fe1931f4a44587124ac9442e83c4731 Mon Sep 17 00:00:00 2001
> From: Jeff Layton <jlayton@redhat.com>
> Date: Fri, 30 May 2008 11:57:18 -0400
> Subject: [PATCH] sunrpc: have pooled services make NUMA-friendly allocations
> 
> Currently, svc_prepare_thread allocates memory using plain kmalloc()
> and alloc_page() calls, even for threads that are destined to run on
> different CPUs or NUMA nodes than the current one. Add a function to
> translate a poolid into a NUMA node, and have svc_prepare_thread and
> svc_init_buffer allocate memory on those nodes instead.
> 
> Signed-off-by: Jeff Layton <jlayton@redhat.com>
> ---
>  net/sunrpc/svc.c |   46 ++++++++++++++++++++++++++++++++++++++++------
>  1 files changed, 40 insertions(+), 6 deletions(-)
> 
> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
> index 01c7e31..3985fbc 100644
> --- a/net/sunrpc/svc.c
> +++ b/net/sunrpc/svc.c
> @@ -332,6 +332,32 @@ svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
>  }
>  
>  /*
> + * for a given poolid, return the NUMA memory node. This allows us to
> + * allocate memory close to the CPU's where the task will be running
> + */
> +static inline unsigned int
> +svc_pool_to_node(unsigned int pidx)
> +{
> +	struct svc_pool_map *m = &svc_pool_map;
> +	unsigned int poolnode = m->pool_to[pidx];
> +
> +	/*
> +	 * The caller checks for sv_nrpools > 1, which
> +	 * implies that we've been initialized.
> +	 */
> +	BUG_ON(m->count == 0);
> +
> +	switch (m->mode) {
> +	case SVC_POOL_PERNODE:
> +		return poolnode;
> +	case SVC_POOL_PERCPU:
> +		return cpu_to_node(poolnode);
> +	}
> +
> +	return numa_node_id();
> +}
> +
> +/*
>   * Use the mapping mode to choose a pool for a given CPU.
>   * Used when enqueueing an incoming RPC.  Always returns
>   * a non-NULL pool pointer.
> @@ -507,7 +533,7 @@ EXPORT_SYMBOL(svc_destroy);
>   * We allocate pages and place them in rq_argpages.
>   */
>  static int
> -svc_init_buffer(struct svc_rqst *rqstp, unsigned int size)
> +svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, unsigned int node)
>  {
>  	unsigned int pages, arghi;
>  
> @@ -517,7 +543,7 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size)
>  	arghi = 0;
>  	BUG_ON(pages > RPCSVC_MAXPAGES);
>  	while (pages) {
> -		struct page *p = alloc_page(GFP_KERNEL);
> +		struct page *p = alloc_pages_node(node, GFP_KERNEL, 0);
>  		if (!p)
>  			break;
>  		rqstp->rq_pages[arghi++] = p;
> @@ -543,8 +569,14 @@ struct svc_rqst *
>  svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool)
>  {
>  	struct svc_rqst	*rqstp;
> +	unsigned int	node;
> +
> +	if (serv->sv_nrpools > 1)
> +		node = svc_pool_to_node(pool->sp_id);
> +	else
> +		node = numa_node_id();
>  
> -	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
> +	rqstp = kmalloc_node(sizeof(*rqstp), GFP_KERNEL | __GFP_ZERO, node);
>  	if (!rqstp)
>  		goto out_enomem;
>  
> @@ -558,15 +590,17 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool)
>  	rqstp->rq_server = serv;
>  	rqstp->rq_pool = pool;
>  
> -	rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
> +	rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize,
> +					GFP_KERNEL | __GFP_ZERO, node);
>  	if (!rqstp->rq_argp)
>  		goto out_thread;
>  
> -	rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
> +	rqstp->rq_resp = kmalloc_node(serv->sv_xdrsize,
> +					GFP_KERNEL | __GFP_ZERO, node);

Why the __GFP_ZERO's on these last two?

--b.

>  	if (!rqstp->rq_resp)
>  		goto out_thread;
>  
> -	if (!svc_init_buffer(rqstp, serv->sv_max_mesg))
> +	if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node))
>  		goto out_thread;
>  
>  	return rqstp;
> -- 
> 1.5.3.6
> 
> _______________________________________________
> NFSv4 mailing list
> NFSv4@linux-nfs.org
> http://linux-nfs.org/cgi-bin/mailman/listinfo/nfsv4

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
  2008-05-30 18:46                       ` J. Bruce Fields
@ 2008-05-30 20:59                         ` Jeff Layton
  0 siblings, 0 replies; 26+ messages in thread
From: Jeff Layton @ 2008-05-30 20:59 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: linux-nfs, nfsv4, Greg Banks

On Fri, 30 May 2008 14:46:53 -0400
"J. Bruce Fields" <bfields@fieldses.org> wrote:

> On Fri, May 30, 2008 at 12:25:17PM -0400, Jeff Layton wrote:
> > On Tue, 20 May 2008 20:29:52 -0700
> > Greg Banks <gnb@melbourne.sgi.com> wrote:
> > 
> > > Jeff Layton wrote:
> > > >   
> > > > [...] Rather than delaying the allocation like this,
> > > > I wonder if we can just figure out the node from the cpumask and then
> > > > do a kmalloc_node()?
> > > >
> > > >   
> > > Sure.
> > > 
> > 
> > I started to do this patch as part of the kthread conversion, but it's
> > actually pretty independent. We can probably treat it separately.
> > This hasn't been tested on an actual NUMA machine yet, but any thoughts
> > on the following patch?
> > 
> > -----------------[snip]-------------------
> > 
> > >From 46432a021fe1931f4a44587124ac9442e83c4731 Mon Sep 17 00:00:00 2001
> > From: Jeff Layton <jlayton@redhat.com>
> > Date: Fri, 30 May 2008 11:57:18 -0400
> > Subject: [PATCH] sunrpc: have pooled services make NUMA-friendly allocations
> > 
> > Currently, svc_prepare_thread allocates memory using plain kmalloc()
> > and alloc_page() calls, even for threads that are destined to run on
> > different CPUs or NUMA nodes than the current one. Add a function to
> > translate a poolid into a NUMA node, and have svc_prepare_thread and
> > svc_init_buffer allocate memory on those nodes instead.
> > 
> > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > ---
> >  net/sunrpc/svc.c |   46 ++++++++++++++++++++++++++++++++++++++++------
> >  1 files changed, 40 insertions(+), 6 deletions(-)
> > 
> > diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
> > index 01c7e31..3985fbc 100644
> > --- a/net/sunrpc/svc.c
> > +++ b/net/sunrpc/svc.c
> > @@ -332,6 +332,32 @@ svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
> >  }
> >  
> >  /*
> > + * for a given poolid, return the NUMA memory node. This allows us to
> > + * allocate memory close to the CPU's where the task will be running
> > + */
> > +static inline unsigned int
> > +svc_pool_to_node(unsigned int pidx)
> > +{
> > +	struct svc_pool_map *m = &svc_pool_map;
> > +	unsigned int poolnode = m->pool_to[pidx];
> > +
> > +	/*
> > +	 * The caller checks for sv_nrpools > 1, which
> > +	 * implies that we've been initialized.
> > +	 */
> > +	BUG_ON(m->count == 0);
> > +
> > +	switch (m->mode) {
> > +	case SVC_POOL_PERNODE:
> > +		return poolnode;
> > +	case SVC_POOL_PERCPU:
> > +		return cpu_to_node(poolnode);
> > +	}
> > +
> > +	return numa_node_id();
> > +}
> > +
> > +/*
> >   * Use the mapping mode to choose a pool for a given CPU.
> >   * Used when enqueueing an incoming RPC.  Always returns
> >   * a non-NULL pool pointer.
> > @@ -507,7 +533,7 @@ EXPORT_SYMBOL(svc_destroy);
> >   * We allocate pages and place them in rq_argpages.
> >   */
> >  static int
> > -svc_init_buffer(struct svc_rqst *rqstp, unsigned int size)
> > +svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, unsigned int node)
> >  {
> >  	unsigned int pages, arghi;
> >  
> > @@ -517,7 +543,7 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size)
> >  	arghi = 0;
> >  	BUG_ON(pages > RPCSVC_MAXPAGES);
> >  	while (pages) {
> > -		struct page *p = alloc_page(GFP_KERNEL);
> > +		struct page *p = alloc_pages_node(node, GFP_KERNEL, 0);
> >  		if (!p)
> >  			break;
> >  		rqstp->rq_pages[arghi++] = p;
> > @@ -543,8 +569,14 @@ struct svc_rqst *
> >  svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool)
> >  {
> >  	struct svc_rqst	*rqstp;
> > +	unsigned int	node;
> > +
> > +	if (serv->sv_nrpools > 1)
> > +		node = svc_pool_to_node(pool->sp_id);
> > +	else
> > +		node = numa_node_id();
> >  
> > -	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
> > +	rqstp = kmalloc_node(sizeof(*rqstp), GFP_KERNEL | __GFP_ZERO, node);
> >  	if (!rqstp)
> >  		goto out_enomem;
> >  
> > @@ -558,15 +590,17 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool)
> >  	rqstp->rq_server = serv;
> >  	rqstp->rq_pool = pool;
> >  
> > -	rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
> > +	rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize,
> > +					GFP_KERNEL | __GFP_ZERO, node);
> >  	if (!rqstp->rq_argp)
> >  		goto out_thread;
> >  
> > -	rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
> > +	rqstp->rq_resp = kmalloc_node(serv->sv_xdrsize,
> > +					GFP_KERNEL | __GFP_ZERO, node);
> 
> Why the __GFP_ZERO's on these last two?
> 
> --b.
> 

Uhhh...because I worked on this just before going to bed? Good catch.
I think those flags can be removed...

> >  	if (!rqstp->rq_resp)
> >  		goto out_thread;
> >  
> > -	if (!svc_init_buffer(rqstp, serv->sv_max_mesg))
> > +	if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node))
> >  		goto out_thread;
> >  
> >  	return rqstp;
> > -- 
> > 1.5.3.6
> > 
> > _______________________________________________
> > NFSv4 mailing list
> > NFSv4@linux-nfs.org
> > http://linux-nfs.org/cgi-bin/mailman/listinfo/nfsv4


-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
       [not found]                     ` <20080530122517.4f18c48e-RtJpwOs3+0O+kQycOl6kW4xkIHaj4LzF@public.gmane.org>
  2008-05-30 18:46                       ` J. Bruce Fields
@ 2008-06-02  5:51                       ` Greg Banks
       [not found]                         ` <48438A76.6000400-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
  1 sibling, 1 reply; 26+ messages in thread
From: Greg Banks @ 2008-06-02  5:51 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Neil Brown, linux-nfs, nfsv4

Jeff Layton wrote:
>
> I started to do this patch as part of the kthread conversion, but it's
> actually pretty independent. We can probably treat it separately.
> This hasn't been tested on an actual NUMA machine yet, but any thoughts
> on the following patch?
> [...]
>  
> -	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
> +	rqstp = kmalloc_node(sizeof(*rqstp), GFP_KERNEL | __GFP_ZERO, node);
>  	if (!rqstp)
>  		goto out_enomem;
>   
I'm not an expert, but from looking at the slab code I don't think
__GFP_ZERO works the way you're expecting on the kmalloc() family, only
on the alloc_pages() family.  In particular it's not used when the
object is allocated on the slab fastpath.   kzalloc() seems to be a
kmalloc() followed by a memset().  Perhaps you'd be better off either
adding a kzalloc_node() or doing kmalloc_node() plus memset().

Apart from that and Bruce's comment the patch looks like it would work.

-- 
Greg Banks, P.Engineer, SGI Australian Software Group.
The cake is *not* a lie.
I don't speak for SGI.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
       [not found]                         ` <48438A76.6000400-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
@ 2008-06-02 10:41                           ` Jeff Layton
       [not found]                             ` <20080602064132.10c69c88-RtJpwOs3+0O+kQycOl6kW4xkIHaj4LzF@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff Layton @ 2008-06-02 10:41 UTC (permalink / raw)
  To: Greg Banks; +Cc: Neil Brown, linux-nfs, nfsv4

On Mon, 02 Jun 2008 00:51:50 -0500
Greg Banks <gnb-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org> wrote:

> Jeff Layton wrote:
> >
> > I started to do this patch as part of the kthread conversion, but it's
> > actually pretty independent. We can probably treat it separately.
> > This hasn't been tested on an actual NUMA machine yet, but any thoughts
> > on the following patch?
> > [...]
> >  
> > -	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
> > +	rqstp = kmalloc_node(sizeof(*rqstp), GFP_KERNEL | __GFP_ZERO, node);
> >  	if (!rqstp)
> >  		goto out_enomem;
> >   
> I'm not an expert, but from looking at the slab code I don't think
> __GFP_ZERO works the way you're expecting on the kmalloc() family, only
> on the alloc_pages() family.  In particular it's not used when the
> object is allocated on the slab fastpath. kzalloc() seems to be a
> kmalloc() followed by a memset().  Perhaps you'd be better off either
> adding a kzalloc_node() or doing kmalloc_node() plus memset().
> 

I'm certainly no VM expert either, but where are you seeing the
kmalloc + memset? The only definition I see is the inlined function
in slab.h which is:

static inline void *kzalloc(size_t size, gfp_t flags)
{
        return kmalloc(size, flags | __GFP_ZERO);
}

Still though, you're probably right. A kzalloc_node() might be handy for
other uses. I'll consider it once I unravel what kzalloc actually
does...

-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
       [not found]                             ` <20080602064132.10c69c88-RtJpwOs3+0O+kQycOl6kW4xkIHaj4LzF@public.gmane.org>
@ 2008-06-03  3:27                               ` Greg Banks
       [not found]                                 ` <4844BA3C.3010605-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Greg Banks @ 2008-06-03  3:27 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Neil Brown, linux-nfs, nfsv4

Jeff Layton wrote:
> On Mon, 02 Jun 2008 00:51:50 -0500
> Greg Banks <gnb-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org> wrote:
>   
>
> I'm certainly no VM expert either, but where are you seeing the
> kmalloc + memset? The only definition I see is the inlined function
> in slab.h which is:
>
> static inline void *kzalloc(size_t size, gfp_t flags)
>
>   
Sorry, I was looking in the wrong tree (sles10 not tot).  Nevermind :-)

-- 
Greg Banks, P.Engineer, SGI Australian Software Group.
The cake is *not* a lie.
I don't speak for SGI.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown
       [not found]                                 ` <4844BA3C.3010605-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
@ 2008-06-03 10:51                                   ` Jeff Layton
  0 siblings, 0 replies; 26+ messages in thread
From: Jeff Layton @ 2008-06-03 10:51 UTC (permalink / raw)
  To: Greg Banks; +Cc: Neil Brown, linux-nfs, nfsv4

On Tue, 03 Jun 2008 13:27:56 +1000
Greg Banks <gnb-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org> wrote:

> Jeff Layton wrote:
> > On Mon, 02 Jun 2008 00:51:50 -0500
> > Greg Banks <gnb-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org> wrote:
> >   
> >
> > I'm certainly no VM expert either, but where are you seeing the
> > kmalloc + memset? The only definition I see is the inlined function
> > in slab.h which is:
> >
> > static inline void *kzalloc(size_t size, gfp_t flags)
> >
> >   
> Sorry, I was looking in the wrong tree (sles10 not tot).  Nevermind :-)
> 

No worries... :-)

I went back and walked through the slab and slub code and it looks like
simply setting __GFP_ZERO on the allocation will do the right thing.
I'll plan to push the patchset today sometime.

Thanks,
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2008-06-03 10:51 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-05-18  2:35 [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown Jeff Layton
2008-05-18  2:35 ` [PATCH 1/3] [RFC] knfsd: convert knfsd to kthread API Jeff Layton
2008-05-18  2:35   ` [PATCH 2/3] [RFC] sunrpc: remove unneeded fields from svc_serv struct Jeff Layton
2008-05-18  2:35     ` [PATCH 3/3] [RFC] knfsd: remove signal defines and extraneous variables Jeff Layton
2008-05-19  6:07 ` [PATCH 0/3] [RFC] knfsd: convert to kthread API and remove signaling for shutdown Neil Brown
     [not found]   ` <18481.6416.571430.593722-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
2008-05-19 21:01     ` Jeff Layton
2008-05-19 22:00     ` Greg Banks
2008-05-19 23:52       ` Neil Brown
     [not found]         ` <18482.4782.858347.981553-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
2008-05-20  2:04           ` Greg Banks
2008-05-20  2:24         ` Jeff Layton
     [not found]           ` <20080519222457.6f24daa5-PC62bkCOHzGdMjc06nkz3ljfA9RmPOcC@public.gmane.org>
2008-05-20  2:34             ` Greg Banks
2008-05-20 11:05               ` Jeff Layton
     [not found]             ` <483238B3.4010702-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
2008-05-20 13:33               ` Talpey, Thomas
2008-05-20  3:13           ` Neil Brown
2008-05-20 11:13             ` Jeff Layton
     [not found]             ` <18482.16837.381955.636390-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
2008-05-20 20:26               ` Greg Banks
2008-05-20 20:36       ` Greg Banks
     [not found]         ` <4833364A.4010803-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
2008-05-21  1:48           ` Jeff Layton
     [not found]             ` <20080520214823.576ad7a7-RtJpwOs3+0O+kQycOl6kW4xkIHaj4LzF@public.gmane.org>
2008-05-21  3:29               ` Greg Banks
     [not found]                 ` <48339730.3060206-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
2008-05-30 16:25                   ` Jeff Layton
     [not found]                     ` <20080530122517.4f18c48e-RtJpwOs3+0O+kQycOl6kW4xkIHaj4LzF@public.gmane.org>
2008-05-30 18:46                       ` J. Bruce Fields
2008-05-30 20:59                         ` Jeff Layton
2008-06-02  5:51                       ` Greg Banks
     [not found]                         ` <48438A76.6000400-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
2008-06-02 10:41                           ` Jeff Layton
     [not found]                             ` <20080602064132.10c69c88-RtJpwOs3+0O+kQycOl6kW4xkIHaj4LzF@public.gmane.org>
2008-06-03  3:27                               ` Greg Banks
     [not found]                                 ` <4844BA3C.3010605-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
2008-06-03 10:51                                   ` Jeff Layton

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox