All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mateusz Guzik <mguzik@redhat.com>
To: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>,
	Andrew Morton <akpm@linux-foundation.org>,
	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
	Yann Droneaud <ydroneaud@opteya.com>,
	Konstantin Khlebnikov <khlebnikov@yandex-team.ru>,
	linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install
Date: Tue, 21 Apr 2015 22:06:25 +0200	[thread overview]
Message-ID: <20150421200624.GA16097@mguzik> (raw)
In-Reply-To: <1429639543.7346.329.camel@edumazet-glaptop2.roam.corp.google.com>

On Tue, Apr 21, 2015 at 11:05:43AM -0700, Eric Dumazet wrote:
> On Mon, 2015-04-20 at 13:49 -0700, Eric Dumazet wrote:
> > On Mon, 2015-04-20 at 10:15 -0700, Eric Dumazet wrote:
> > > On Mon, 2015-04-20 at 17:10 +0200, Mateusz Guzik wrote:
> > > 
> > > > Sorry for spam but I came up with another hack. :)
> > > > 
> > > > The idea is that we can have a variable which would signify the that
> > > > given thread is playing with fd table in fd_install (kind of a lock
> > > > embedded into task_struct). We would also have a flag in files struct
> > > > indicating that a thread would like to resize it.
> > > > 
> > > > expand_fdtable would set the flag and iterate over all threads waiting
> > > > for all of them to have the var set to 0.
> > > 
> > > The opposite : you have to block them in some way and add a rcu_sched()
> > > or something.
> > 

What I described would block them, although it was a crappy approach
(iterating threads vs cpus). I was wondering if RCU could be abused for
this feature and apparently it can.

> > Here is the patch I cooked here but not yet tested.
> 
> In following version :
> 
> 1) I replaced the yield() hack by a proper wait queue.
> 
> 2) I do not invoke synchronize_sched() for mono threaded programs.
> 
> 3) I avoid multiple threads doing a resize and then only one wins the
> deal.
> 

One could argue this last bit could be committed separately (a different
logical change).

As I read up about synchronize_sched and rcu_read_lock_sched, the code
should be correct.

Also see nits below.

> (copying/clearing big amount of memory only to discover another guy did
> the same is a big waste of resources)
> 
> 
> This seems to run properly on my hosts.
> 
> Your comments/tests are most welcomed, thanks !
> 
>  fs/file.c               |   41 +++++++++++++++++++++++++++++++++-----
>  include/linux/fdtable.h |    3 ++
>  2 files changed, 39 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/file.c b/fs/file.c
> index 93c5f89c248b..e0e113a56444 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -147,6 +147,9 @@ static int expand_fdtable(struct files_struct *files, int nr)
>  
>  	spin_unlock(&files->file_lock);
>  	new_fdt = alloc_fdtable(nr);
> +	/* make sure no __fd_install() are still updating fdt */
> +	if (atomic_read(&files->count) > 1)
> +		synchronize_sched();
>  	spin_lock(&files->file_lock);
>  	if (!new_fdt)
>  		return -ENOMEM;
> @@ -170,9 +173,12 @@ static int expand_fdtable(struct files_struct *files, int nr)
>  		if (cur_fdt != &files->fdtab)
>  			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
>  	} else {
> +		WARN_ON_ONCE(1);
>  		/* Somebody else expanded, so undo our attempt */
>  		__free_fdtable(new_fdt);

The reader may be left confused why there is a warning while the comment
does not indicate anything is wrong.

>  	}
> +	/* coupled with smp_rmb() in __fd_install() */
> +	smp_wmb();
>  	return 1;
>  }
>  
> @@ -187,19 +193,33 @@ static int expand_fdtable(struct files_struct *files, int nr)
>  static int expand_files(struct files_struct *files, int nr)
>  {
>  	struct fdtable *fdt;
> +	int expanded = 0;
>  
> +begin:
>  	fdt = files_fdtable(files);
>  
>  	/* Do we need to expand? */
>  	if (nr < fdt->max_fds)
> -		return 0;
> +		return expanded;
>  
>  	/* Can we expand? */
>  	if (nr >= sysctl_nr_open)
>  		return -EMFILE;
>  
> +	while (unlikely(files->resize_in_progress)) {
> +		spin_unlock(&files->file_lock);
> +		expanded = 1;
> +		wait_event(files->resize_wait, !files->resize_in_progress);
> +		spin_lock(&files->file_lock);
> +		goto begin;
> +	}

This does not loop anymore, so s/while/if/ ?

> +
>  	/* All good, so we try */
> -	return expand_fdtable(files, nr);
> +	files->resize_in_progress = true;
> +	expanded = expand_fdtable(files, nr);
> +	files->resize_in_progress = false;
> +	wake_up_all(&files->resize_wait);
> +	return expanded;
>  }
>  
>  static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
> @@ -256,6 +276,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
>  	atomic_set(&newf->count, 1);
>  
>  	spin_lock_init(&newf->file_lock);
> +	newf->resize_in_progress = 0;
> +	init_waitqueue_head(&newf->resize_wait);
>  	newf->next_fd = 0;
>  	new_fdt = &newf->fdtab;
>  	new_fdt->max_fds = NR_OPEN_DEFAULT;
> @@ -553,11 +575,20 @@ void __fd_install(struct files_struct *files, unsigned int fd,
>  		struct file *file)
>  {
>  	struct fdtable *fdt;
> -	spin_lock(&files->file_lock);
> -	fdt = files_fdtable(files);
> +
> +	rcu_read_lock_sched();
> +
> +	while (unlikely(files->resize_in_progress)) {
> +		rcu_read_unlock_sched();
> +		wait_event(files->resize_wait, !files->resize_in_progress);
> +		rcu_read_lock_sched();
> +	}
> +	/* coupled with smp_wmb() in expand_fdtable() */
> +	smp_rmb();
> +	fdt = READ_ONCE(files->fdt);
>  	BUG_ON(fdt->fd[fd] != NULL);
>  	rcu_assign_pointer(fdt->fd[fd], file);
> -	spin_unlock(&files->file_lock);
> +	rcu_read_unlock_sched();
>  }
>  
>  void fd_install(unsigned int fd, struct file *file)
> diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
> index 230f87bdf5ad..fbb88740634a 100644
> --- a/include/linux/fdtable.h
> +++ b/include/linux/fdtable.h
> @@ -47,6 +47,9 @@ struct files_struct {
>     * read mostly part
>     */
>  	atomic_t count;
> +	bool resize_in_progress;
> +	wait_queue_head_t resize_wait;
> +
>  	struct fdtable __rcu *fdt;
>  	struct fdtable fdtab;
>    /*
> 
> 
> 
> 

-- 
Mateusz Guzik

  reply	other threads:[~2015-04-21 20:06 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-04-16 12:16 [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install Mateusz Guzik
2015-04-16 17:47 ` Eric Dumazet
2015-04-16 18:09 ` Al Viro
2015-04-16 20:42   ` Eric Dumazet
2015-04-16 20:55     ` Eric Dumazet
2015-04-16 22:00       ` Mateusz Guzik
2015-04-16 22:52         ` Eric Dumazet
2015-04-16 22:35   ` Mateusz Guzik
2015-04-17 21:46 ` Eric Dumazet
2015-04-17 22:16   ` Mateusz Guzik
2015-04-17 23:02     ` Al Viro
2015-04-18 19:41       ` Eric Dumazet
2015-04-20 13:41         ` Mateusz Guzik
2015-04-20 16:46           ` Eric Dumazet
2015-04-20 16:48             ` Eric Dumazet
2015-04-20 13:06       ` Mateusz Guzik
2015-04-20 13:43         ` Mateusz Guzik
2015-04-20 15:10           ` Mateusz Guzik
2015-04-20 17:15             ` Eric Dumazet
2015-04-20 20:49               ` Eric Dumazet
2015-04-21 18:05                 ` Eric Dumazet
2015-04-21 20:06                   ` Mateusz Guzik [this message]
2015-04-21 20:12                     ` Mateusz Guzik
2015-04-21 21:06                       ` Eric Dumazet
2015-04-22  4:59                         ` [PATCH] fs/file.c: don't acquire files->file_lock in fd_install() Eric Dumazet
2015-04-27 19:05                           ` Mateusz Guzik
2015-04-28 16:20                             ` Eric Dumazet
2015-04-29  4:25                           ` [PATCH v2] " Eric Dumazet
2015-06-22  2:32                             ` Al Viro
2015-06-22  2:32                               ` Al Viro
2015-06-23  5:31                               ` Eric Dumazet
2015-06-23  5:31                                 ` Eric Dumazet
2015-06-30 13:54                             ` [PATCH v3] " Eric Dumazet
2015-04-22 13:31                         ` [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install Mateusz Guzik
2015-04-22 13:55                           ` Eric Dumazet
2015-04-21 20:57                     ` Eric Dumazet

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150421200624.GA16097@mguzik \
    --to=mguzik@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=eric.dumazet@gmail.com \
    --cc=khlebnikov@yandex-team.ru \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=viro@ZenIV.linux.org.uk \
    --cc=ydroneaud@opteya.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.