Re: [PATCH 1/6] mmput: use notifier chain to call subsystem exit handler.

* Re: [PATCH 1/6] mmput: use notifier chain to call subsystem exit handler.
       [not found] ` <1403920822-14488-2-git-send-email-j.glisse@gmail.com>
@ 2014-06-30 14:41   ` Gabbay, Oded
       [not found]     ` <019CCE693E457142B37B791721487FD91806B836-0nO7ALo/ziwxlywnonMhLEEOCMrvLtNR@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Gabbay, Oded @ 2014-06-30 14:41 UTC (permalink / raw)
  To: Jérôme Glisse
  Cc: Deucher, Alexander, Lewycky, Andrew, Cornwall, Jay,
	akpm@linux-foundation.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org, mgorman@suse.de, hpa@zytor.com,
	peterz@infraread.org, aarcange@redhat.com, riel@redhat.com,
	jweiner@redhat.com, torvalds@linux-foundation.org, Mark Hairgrove,
	Jatin Kumar, Subhash Gutti, Lucien Dunning, Cameron Buschardt,
	Arvind Gopalakrishnan, John Hubbard, Sherry

On Fri, 2014-06-27 at 22:00 -0400, Jérôme Glisse wrote:
> From: Jérôme Glisse <jglisse@redhat.com>
> 
> Several subsystem require a callback when a mm struct is being destroy
> so that they can cleanup there respective per mm struct. Instead of
> having each subsystem add its callback to mmput use a notifier chain
> to call each of the subsystem.
> 
> This will allow new subsystem to register callback even if they are
> module. There should be no contention on the rw semaphore protecting
> the call chain and the impact on the code path should be low and
> burried in the noise.
> 
> Note that this patch also move the call to cleanup functions after
> exit_mmap so that new call back can assume that mmu_notifier_release
> have already been call. This does not impact existing cleanup functions
> as they do not rely on anything that exit_mmap is freeing. Also moved
> khugepaged_exit to exit_mmap so that ordering is preserved for that
> function.
> 
> Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
> ---
>  fs/aio.c                | 29 ++++++++++++++++++++++-------
>  include/linux/aio.h     |  2 --
>  include/linux/ksm.h     | 11 -----------
>  include/linux/sched.h   |  5 +++++
>  include/linux/uprobes.h |  1 -
>  kernel/events/uprobes.c | 19 ++++++++++++++++---
>  kernel/fork.c           | 22 ++++++++++++++++++----
>  mm/ksm.c                | 26 +++++++++++++++++++++-----
>  mm/mmap.c               |  3 +++
>  9 files changed, 85 insertions(+), 33 deletions(-)
> 
> diff --git a/fs/aio.c b/fs/aio.c
> index c1d8c48..1d06e92 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -40,6 +40,7 @@
>  #include <linux/ramfs.h>
>  #include <linux/percpu-refcount.h>
>  #include <linux/mount.h>
> +#include <linux/notifier.h>
>  
>  #include <asm/kmap_types.h>
>  #include <asm/uaccess.h>
> @@ -774,20 +775,22 @@ ssize_t wait_on_sync_kiocb(struct kiocb *req)
>  EXPORT_SYMBOL(wait_on_sync_kiocb);
>  
>  /*
> - * exit_aio: called when the last user of mm goes away.  At this point, there is
> + * aio_exit: called when the last user of mm goes away.  At this point, there is
>   * no way for any new requests to be submited or any of the io_* syscalls to be
>   * called on the context.
>   *
>   * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
>   * them.
>   */
> -void exit_aio(struct mm_struct *mm)
> +static int aio_exit(struct notifier_block *nb,
> +		    unsigned long action, void *data)
>  {
> +	struct mm_struct *mm = data;
>  	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
>  	int i;
>  
>  	if (!table)
> -		return;
> +		return 0;
>  
>  	for (i = 0; i < table->nr; ++i) {
>  		struct kioctx *ctx = table->table[i];
> @@ -796,10 +799,10 @@ void exit_aio(struct mm_struct *mm)
>  			continue;
>  		/*
>  		 * We don't need to bother with munmap() here - exit_mmap(mm)
> -		 * is coming and it'll unmap everything. And we simply can't,
> -		 * this is not necessarily our ->mm.
> -		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
> -		 * that it needs to unmap the area, just set it to 0.
> +		 * have already been call and everything is unmap by now. But
> +		 * to be safe set ->mmap_size to 0 since aio_free_ring() uses
> +		 * non-zero ->mmap_size as indicator that it needs to unmap the
> +		 * area.
>  		 */
>  		ctx->mmap_size = 0;
>  		kill_ioctx(mm, ctx, NULL);
> @@ -807,6 +810,7 @@ void exit_aio(struct mm_struct *mm)
>  
>  	RCU_INIT_POINTER(mm->ioctx_table, NULL);
>  	kfree(table);
> +	return 0;
>  }
>  
>  static void put_reqs_available(struct kioctx *ctx, unsigned nr)
> @@ -1629,3 +1633,14 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
>  	}
>  	return ret;
>  }
> +
> +static struct notifier_block aio_mmput_nb = {
> +	.notifier_call		= aio_exit,
> +	.priority		= 1,
> +};
> +
> +static int __init aio_init(void)
> +{
> +	return mmput_register_notifier(&aio_mmput_nb);
> +}
> +subsys_initcall(aio_init);
> diff --git a/include/linux/aio.h b/include/linux/aio.h
> index d9c92da..6308fac 100644
> --- a/include/linux/aio.h
> +++ b/include/linux/aio.h
> @@ -73,7 +73,6 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
>  extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb);
>  extern void aio_complete(struct kiocb *iocb, long res, long res2);
>  struct mm_struct;
> -extern void exit_aio(struct mm_struct *mm);
>  extern long do_io_submit(aio_context_t ctx_id, long nr,
>  			 struct iocb __user *__user *iocbpp, bool compat);
>  void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
> @@ -81,7 +80,6 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
>  static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
>  static inline void aio_complete(struct kiocb *iocb, long res, long res2) { }
>  struct mm_struct;
> -static inline void exit_aio(struct mm_struct *mm) { }
>  static inline long do_io_submit(aio_context_t ctx_id, long nr,
>  				struct iocb __user * __user *iocbpp,
>  				bool compat) { return 0; }
> diff --git a/include/linux/ksm.h b/include/linux/ksm.h
> index 3be6bb1..84c184f 100644
> --- a/include/linux/ksm.h
> +++ b/include/linux/ksm.h
> @@ -20,7 +20,6 @@ struct mem_cgroup;
>  int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
>  		unsigned long end, int advice, unsigned long *vm_flags);
>  int __ksm_enter(struct mm_struct *mm);
> -void __ksm_exit(struct mm_struct *mm);
>  
>  static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
>  {
> @@ -29,12 +28,6 @@ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
>  	return 0;
>  }
>  
> -static inline void ksm_exit(struct mm_struct *mm)
> -{
> -	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
> -		__ksm_exit(mm);
> -}
> -
>  /*
>   * A KSM page is one of those write-protected "shared pages" or "merged pages"
>   * which KSM maps into multiple mms, wherever identical anonymous page content
> @@ -83,10 +76,6 @@ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
>  	return 0;
>  }
>  
> -static inline void ksm_exit(struct mm_struct *mm)
> -{
> -}
> -
>  static inline int PageKsm(struct page *page)
>  {
>  	return 0;
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 322d4fc..428b3cf 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2384,6 +2384,11 @@ static inline void mmdrop(struct mm_struct * mm)
>  		__mmdrop(mm);
>  }
>  
> +/* mmput call list of notifier and subsystem/module can register
> + * new one through this call.
> + */
> +extern int mmput_register_notifier(struct notifier_block *nb);
> +extern int mmput_unregister_notifier(struct notifier_block *nb);
>  /* mmput gets rid of the mappings and all user-space */
>  extern void mmput(struct mm_struct *);
>  /* Grab a reference to a task's mm, if it is not already going away */
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index 4f844c6..44e7267 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -120,7 +120,6 @@ extern int uprobe_pre_sstep_notifier(struct pt_regs *regs);
>  extern void uprobe_notify_resume(struct pt_regs *regs);
>  extern bool uprobe_deny_signal(void);
>  extern bool arch_uprobe_skip_sstep(struct arch_uprobe *aup, struct pt_regs *regs);
> -extern void uprobe_clear_state(struct mm_struct *mm);
>  extern int  arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr);
>  extern int  arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);
>  extern int  arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 46b7c31..32b04dc 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -37,6 +37,7 @@
>  #include <linux/percpu-rwsem.h>
>  #include <linux/task_work.h>
>  #include <linux/shmem_fs.h>
> +#include <linux/notifier.h>
>  
>  #include <linux/uprobes.h>
>  
> @@ -1220,16 +1221,19 @@ static struct xol_area *get_xol_area(void)
>  /*
>   * uprobe_clear_state - Free the area allocated for slots.
>   */
> -void uprobe_clear_state(struct mm_struct *mm)
> +static int uprobe_clear_state(struct notifier_block *nb,
> +			      unsigned long action, void *data)
>  {
> +	struct mm_struct *mm = data;
>  	struct xol_area *area = mm->uprobes_state.xol_area;
>  
>  	if (!area)
> -		return;
> +		return 0;
>  
>  	put_page(area->page);
>  	kfree(area->bitmap);
>  	kfree(area);
> +	return 0;
>  }
>  
>  void uprobe_start_dup_mmap(void)
> @@ -1979,9 +1983,14 @@ static struct notifier_block uprobe_exception_nb = {
>  	.priority		= INT_MAX-1,	/* notified after kprobes, kgdb */
>  };
>  
> +static struct notifier_block uprobe_mmput_nb = {
> +	.notifier_call		= uprobe_clear_state,
> +	.priority		= 0,
> +};
> +
>  static int __init init_uprobes(void)
>  {
> -	int i;
> +	int i, err;
>  
>  	for (i = 0; i < UPROBES_HASH_SZ; i++)
>  		mutex_init(&uprobes_mmap_mutex[i]);
> @@ -1989,6 +1998,10 @@ static int __init init_uprobes(void)
>  	if (percpu_init_rwsem(&dup_mmap_sem))
>  		return -ENOMEM;
>  
> +	err = mmput_register_notifier(&uprobe_mmput_nb);
> +	if (err)
> +		return err;
> +
>  	return register_die_notifier(&uprobe_exception_nb);
>  }
>  __initcall(init_uprobes);
> diff --git a/kernel/fork.c b/kernel/fork.c
> index dd8864f..b448509 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -87,6 +87,8 @@
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/task.h>
>  
> +static BLOCKING_NOTIFIER_HEAD(mmput_notifier);
> +
>  /*
>   * Protected counters by write_lock_irq(&tasklist_lock)
>   */
> @@ -623,6 +625,21 @@ void __mmdrop(struct mm_struct *mm)
>  EXPORT_SYMBOL_GPL(__mmdrop);
>  
>  /*
> + * Register a notifier that will be call by mmput
> + */
> +int mmput_register_notifier(struct notifier_block *nb)
> +{
> +	return blocking_notifier_chain_register(&mmput_notifier, nb);
> +}
> +EXPORT_SYMBOL_GPL(mmput_register_notifier);
> +
> +int mmput_unregister_notifier(struct notifier_block *nb)
> +{
> +	return blocking_notifier_chain_unregister(&mmput_notifier, nb);
> +}
> +EXPORT_SYMBOL_GPL(mmput_unregister_notifier);
> +
> +/*
>   * Decrement the use count and release all resources for an mm.
>   */
>  void mmput(struct mm_struct *mm)
> @@ -630,11 +647,8 @@ void mmput(struct mm_struct *mm)
>  	might_sleep();
>  
>  	if (atomic_dec_and_test(&mm->mm_users)) {
> -		uprobe_clear_state(mm);
> -		exit_aio(mm);
> -		ksm_exit(mm);
> -		khugepaged_exit(mm); /* must run before exit_mmap */
>  		exit_mmap(mm);
> +		blocking_notifier_call_chain(&mmput_notifier, 0, mm);
>  		set_mm_exe_file(mm, NULL);
>  		if (!list_empty(&mm->mmlist)) {
>  			spin_lock(&mmlist_lock);
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 346ddc9..cb1e976 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -37,6 +37,7 @@
>  #include <linux/freezer.h>
>  #include <linux/oom.h>
>  #include <linux/numa.h>
> +#include <linux/notifier.h>
>  
>  #include <asm/tlbflush.h>
>  #include "internal.h"
> @@ -1586,7 +1587,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>  		ksm_scan.mm_slot = slot;
>  		spin_unlock(&ksm_mmlist_lock);
>  		/*
> -		 * Although we tested list_empty() above, a racing __ksm_exit
> +		 * Although we tested list_empty() above, a racing ksm_exit
>  		 * of the last mm on the list may have removed it since then.
>  		 */
>  		if (slot == &ksm_mm_head)
> @@ -1658,9 +1659,9 @@ next_mm:
>  		/*
>  		 * We've completed a full scan of all vmas, holding mmap_sem
>  		 * throughout, and found no VM_MERGEABLE: so do the same as
> -		 * __ksm_exit does to remove this mm from all our lists now.
> -		 * This applies either when cleaning up after __ksm_exit
> -		 * (but beware: we can reach here even before __ksm_exit),
> +		 * ksm_exit does to remove this mm from all our lists now.
> +		 * This applies either when cleaning up after ksm_exit
> +		 * (but beware: we can reach here even before ksm_exit),
>  		 * or when all VM_MERGEABLE areas have been unmapped (and
>  		 * mmap_sem then protects against race with MADV_MERGEABLE).
>  		 */
> @@ -1821,11 +1822,16 @@ int __ksm_enter(struct mm_struct *mm)
>  	return 0;
>  }
>  
> -void __ksm_exit(struct mm_struct *mm)
> +static int ksm_exit(struct notifier_block *nb,
> +		    unsigned long action, void *data)
>  {
> +	struct mm_struct *mm = data;
>  	struct mm_slot *mm_slot;
>  	int easy_to_free = 0;
>  
> +	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
> +		return 0;
> +
>  	/*
>  	 * This process is exiting: if it's straightforward (as is the
>  	 * case when ksmd was never running), free mm_slot immediately.
> @@ -1857,6 +1863,7 @@ void __ksm_exit(struct mm_struct *mm)
>  		down_write(&mm->mmap_sem);
>  		up_write(&mm->mmap_sem);
>  	}
> +	return 0;
>  }
>  
>  struct page *ksm_might_need_to_copy(struct page *page,
> @@ -2305,11 +2312,20 @@ static struct attribute_group ksm_attr_group = {
>  };
>  #endif /* CONFIG_SYSFS */
>  
> +static struct notifier_block ksm_mmput_nb = {
> +	.notifier_call		= ksm_exit,
> +	.priority		= 2,
> +};
> +
>  static int __init ksm_init(void)
>  {
>  	struct task_struct *ksm_thread;
>  	int err;
>  
> +	err = mmput_register_notifier(&ksm_mmput_nb);
> +	if (err)
> +		return err;
> +
>  	err = ksm_slab_init();
>  	if (err)
>  		goto out;
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 61aec93..b684a21 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -2775,6 +2775,9 @@ void exit_mmap(struct mm_struct *mm)
>  	struct vm_area_struct *vma;
>  	unsigned long nr_accounted = 0;
>  
> +	/* Important to call this first. */
> +	khugepaged_exit(mm);
> +
>  	/* mm's last user has gone, and its about to be pulled down */
>  	mmu_notifier_release(mm);
>  

Hi Jerome, I reviewed the patch, integrated and tested it in AMD's HSA
driver (KFD). It works as expected and I didn't find any problems with
it.

I did face some problems regarding the amd IOMMU v2 driver, which
changed its behavior (see commit "iommu/amd: Implement
mmu_notifier_release call-back") to use mmu_notifier_release and did
some "bad things" inside that
notifier (primarily, but not only, deleting the object which held the
mmu_notifier object itself, which you mustn't do because of the
locking). 

I'm thinking of changing that driver's behavior to use this new
mechanism instead of using mmu_notifier_release. Does that seem
acceptable ? Another solution will be to add a new mmu_notifier call,
but we already ruled that out ;)

So,
Reviewed-by: Oded Gabbay <oded.gabbay@amd.com>

	Oded

^ permalink raw reply	[flat|nested] 26+ messages in thread