All of lore.kernel.org
 help / color / mirror / Atom feed
diff for duplicates of <20170725182619.GQ29716@redhat.com>

diff --git a/a/1.txt b/N1/1.txt
index d32c464..e739523 100644
--- a/a/1.txt
+++ b/N1/1.txt
@@ -44,3 +44,139 @@ So the simplest is to use a similar trick to what ksm_exit uses, this
 is untested just to show the idea it may require further adjustment as
 the bit isn't used only for the test_and_set_bit locking, but I didn't
 see much issues with other set_bit/test_bit.
+
+>From f414244480fdc1f771b3148feb3fac77ec813e4c Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Tue, 25 Jul 2017 20:02:27 +0200
+Subject: [PATCH 1/1] mm: oom: let oom_reap_task and exit_mmap to run
+ concurrently
+
+This is purely required because exit_aio() may block and exit_mmap() may
+never start, if the oom_reap_task cannot start running on a mm with
+mm_users == 0.
+
+At the same time if the OOM reaper doesn't wait at all for the memory
+of the current OOM candidate to be freed by exit_mmap->unmap_vmas, it
+would generate a spurious OOM kill.
+
+If it wasn't because of the exit_aio it would be enough to change the
+oom_reap_task in the case it finds mm_users == 0, to wait for a
+timeout or to wait for __mmput to set MMF_OOM_SKIP itself, but it's
+not just exit_mmap the problem here so the concurrency of exit_mmap
+and oom_reap_task is apparently warranted.
+
+It's a non standard runtime, exit_mmap runs without mmap_sem, and
+oom_reap_task runs with the mmap_sem for reading as usual (kind of
+MADV_DONTNEED).
+
+The race between the two is solved with pair of test_and_set_bit and a
+dummy down_write/up_write cycle on the same lines of the ksm_exit
+method.
+
+If the OOM reaper sets the bit first, exit_mmap will wait it to finish
+in down_write (before taking down mm structures that would make the
+oom_reap_task fail with use after free).
+
+If exit_mmap comes first, oom reaper will skip the mm as it's already
+all freed.
+
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+---
+ kernel/fork.c |  1 -
+ mm/mmap.c     |  5 +++++
+ mm/oom_kill.c | 15 ++++++++++++---
+ 3 files changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/kernel/fork.c b/kernel/fork.c
+index 9ec98b0c4675..ed412d85a596 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -910,7 +910,6 @@ static inline void __mmput(struct mm_struct *mm)
+ 	}
+ 	if (mm->binfmt)
+ 		module_put(mm->binfmt->module);
+-	set_bit(MMF_OOM_SKIP, &mm->flags);
+ 	mmdrop(mm);
+ }
+ 
+diff --git a/mm/mmap.c b/mm/mmap.c
+index f19efcf75418..615133762b99 100644
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -2993,6 +2993,11 @@ void exit_mmap(struct mm_struct *mm)
+ 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
+ 	unmap_vmas(&tlb, vma, 0, -1);
+ 
++	if (test_and_set_bit(MMF_OOM_SKIP, &mm->flags)) {
++		/* wait the OOM reaper to stop working on this mm */
++		down_write(&mm->mmap_sem);
++		up_write(&mm->mmap_sem);
++	}
+ 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
+ 	tlb_finish_mmu(&tlb, 0, -1);
+ 
+diff --git a/mm/oom_kill.c b/mm/oom_kill.c
+index 9e8b4f030c1c..2a7000995784 100644
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -471,6 +471,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+ 	struct mmu_gather tlb;
+ 	struct vm_area_struct *vma;
+ 	bool ret = true;
++	bool mmgot = true;
+ 
+ 	/*
+ 	 * We have to make sure to not race with the victim exit path
+@@ -500,9 +501,16 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+ 	 * and delayed __mmput doesn't matter that much
+ 	 */
+ 	if (!mmget_not_zero(mm)) {
+-		up_read(&mm->mmap_sem);
+ 		trace_skip_task_reaping(tsk->pid);
+-		goto unlock_oom;
++		/*
++		 * MMF_OOM_SKIP is set by exit_mmap when the OOM
++		 * reaper can't work on the mm anymore.
++		 */
++		if (test_and_set_bit(MMF_OOM_SKIP, &mm->flags)) {
++			up_read(&mm->mmap_sem);
++			goto unlock_oom;
++		}
++		mmgot = false;
+ 	}
+ 
+ 	trace_start_task_reaping(tsk->pid);
+@@ -547,7 +555,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+ 	 * different context because we shouldn't risk we get stuck there and
+ 	 * put the oom_reaper out of the way.
+ 	 */
+-	mmput_async(mm);
++	if (mmgot)
++		mmput_async(mm);
+ 	trace_finish_task_reaping(tsk->pid);
+ unlock_oom:
+ 	mutex_unlock(&oom_lock);
+
+This will perform the same as then the set_bit in __mmput can be
+removed and test_and_set_bit doesn't cost more (at least on x86, on
+other archs it requires an smp_mb too, but it's marginal difference),
+still an atomic op that is.
+
+> > 4) how is it safe to overwrite a VM_FAULT_RETRY that returns without
+> >    mmap_sem and then the arch code will release the mmap_sem despite
+> >    it was already released by handle_mm_fault? Anonymous memory faults
+> >    aren't common to return VM_FAULT_RETRY but an userfault
+> >    can. Shouldn't there be a block that prevents overwriting if
+> >    VM_FAULT_RETRY is set below? (not only VM_FAULT_ERROR)
+> > 
+> > 	if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
+> > 				&& test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
+> > 		ret = VM_FAULT_SIGBUS;
+> 
+> I am not sure I understand what you mean and how this is related to the
+> patch?
+
+It's not related to the patch but it involves the OOM reaper as it
+only happens when MMF_UNSTABLE is set which is set only by the OOM
+reaper. I was simply reading the OOM reaper code and following up what
+MMF_UNSTABLE does and it ringed a bell.
diff --git a/a/content_digest b/N1/content_digest
index 9984d68..f927232 100644
--- a/a/content_digest
+++ b/N1/content_digest
@@ -59,6 +59,142 @@
  "So the simplest is to use a similar trick to what ksm_exit uses, this\n"
  "is untested just to show the idea it may require further adjustment as\n"
  "the bit isn't used only for the test_and_set_bit locking, but I didn't\n"
- see much issues with other set_bit/test_bit.
+ "see much issues with other set_bit/test_bit.\n"
+ "\n"
+ ">From f414244480fdc1f771b3148feb3fac77ec813e4c Mon Sep 17 00:00:00 2001\n"
+ "From: Andrea Arcangeli <aarcange@redhat.com>\n"
+ "Date: Tue, 25 Jul 2017 20:02:27 +0200\n"
+ "Subject: [PATCH 1/1] mm: oom: let oom_reap_task and exit_mmap to run\n"
+ " concurrently\n"
+ "\n"
+ "This is purely required because exit_aio() may block and exit_mmap() may\n"
+ "never start, if the oom_reap_task cannot start running on a mm with\n"
+ "mm_users == 0.\n"
+ "\n"
+ "At the same time if the OOM reaper doesn't wait at all for the memory\n"
+ "of the current OOM candidate to be freed by exit_mmap->unmap_vmas, it\n"
+ "would generate a spurious OOM kill.\n"
+ "\n"
+ "If it wasn't because of the exit_aio it would be enough to change the\n"
+ "oom_reap_task in the case it finds mm_users == 0, to wait for a\n"
+ "timeout or to wait for __mmput to set MMF_OOM_SKIP itself, but it's\n"
+ "not just exit_mmap the problem here so the concurrency of exit_mmap\n"
+ "and oom_reap_task is apparently warranted.\n"
+ "\n"
+ "It's a non standard runtime, exit_mmap runs without mmap_sem, and\n"
+ "oom_reap_task runs with the mmap_sem for reading as usual (kind of\n"
+ "MADV_DONTNEED).\n"
+ "\n"
+ "The race between the two is solved with pair of test_and_set_bit and a\n"
+ "dummy down_write/up_write cycle on the same lines of the ksm_exit\n"
+ "method.\n"
+ "\n"
+ "If the OOM reaper sets the bit first, exit_mmap will wait it to finish\n"
+ "in down_write (before taking down mm structures that would make the\n"
+ "oom_reap_task fail with use after free).\n"
+ "\n"
+ "If exit_mmap comes first, oom reaper will skip the mm as it's already\n"
+ "all freed.\n"
+ "\n"
+ "Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>\n"
+ "---\n"
+ " kernel/fork.c |  1 -\n"
+ " mm/mmap.c     |  5 +++++\n"
+ " mm/oom_kill.c | 15 ++++++++++++---\n"
+ " 3 files changed, 17 insertions(+), 4 deletions(-)\n"
+ "\n"
+ "diff --git a/kernel/fork.c b/kernel/fork.c\n"
+ "index 9ec98b0c4675..ed412d85a596 100644\n"
+ "--- a/kernel/fork.c\n"
+ "+++ b/kernel/fork.c\n"
+ "@@ -910,7 +910,6 @@ static inline void __mmput(struct mm_struct *mm)\n"
+ " \t}\n"
+ " \tif (mm->binfmt)\n"
+ " \t\tmodule_put(mm->binfmt->module);\n"
+ "-\tset_bit(MMF_OOM_SKIP, &mm->flags);\n"
+ " \tmmdrop(mm);\n"
+ " }\n"
+ " \n"
+ "diff --git a/mm/mmap.c b/mm/mmap.c\n"
+ "index f19efcf75418..615133762b99 100644\n"
+ "--- a/mm/mmap.c\n"
+ "+++ b/mm/mmap.c\n"
+ "@@ -2993,6 +2993,11 @@ void exit_mmap(struct mm_struct *mm)\n"
+ " \t/* Use -1 here to ensure all VMAs in the mm are unmapped */\n"
+ " \tunmap_vmas(&tlb, vma, 0, -1);\n"
+ " \n"
+ "+\tif (test_and_set_bit(MMF_OOM_SKIP, &mm->flags)) {\n"
+ "+\t\t/* wait the OOM reaper to stop working on this mm */\n"
+ "+\t\tdown_write(&mm->mmap_sem);\n"
+ "+\t\tup_write(&mm->mmap_sem);\n"
+ "+\t}\n"
+ " \tfree_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);\n"
+ " \ttlb_finish_mmu(&tlb, 0, -1);\n"
+ " \n"
+ "diff --git a/mm/oom_kill.c b/mm/oom_kill.c\n"
+ "index 9e8b4f030c1c..2a7000995784 100644\n"
+ "--- a/mm/oom_kill.c\n"
+ "+++ b/mm/oom_kill.c\n"
+ "@@ -471,6 +471,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)\n"
+ " \tstruct mmu_gather tlb;\n"
+ " \tstruct vm_area_struct *vma;\n"
+ " \tbool ret = true;\n"
+ "+\tbool mmgot = true;\n"
+ " \n"
+ " \t/*\n"
+ " \t * We have to make sure to not race with the victim exit path\n"
+ "@@ -500,9 +501,16 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)\n"
+ " \t * and delayed __mmput doesn't matter that much\n"
+ " \t */\n"
+ " \tif (!mmget_not_zero(mm)) {\n"
+ "-\t\tup_read(&mm->mmap_sem);\n"
+ " \t\ttrace_skip_task_reaping(tsk->pid);\n"
+ "-\t\tgoto unlock_oom;\n"
+ "+\t\t/*\n"
+ "+\t\t * MMF_OOM_SKIP is set by exit_mmap when the OOM\n"
+ "+\t\t * reaper can't work on the mm anymore.\n"
+ "+\t\t */\n"
+ "+\t\tif (test_and_set_bit(MMF_OOM_SKIP, &mm->flags)) {\n"
+ "+\t\t\tup_read(&mm->mmap_sem);\n"
+ "+\t\t\tgoto unlock_oom;\n"
+ "+\t\t}\n"
+ "+\t\tmmgot = false;\n"
+ " \t}\n"
+ " \n"
+ " \ttrace_start_task_reaping(tsk->pid);\n"
+ "@@ -547,7 +555,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)\n"
+ " \t * different context because we shouldn't risk we get stuck there and\n"
+ " \t * put the oom_reaper out of the way.\n"
+ " \t */\n"
+ "-\tmmput_async(mm);\n"
+ "+\tif (mmgot)\n"
+ "+\t\tmmput_async(mm);\n"
+ " \ttrace_finish_task_reaping(tsk->pid);\n"
+ " unlock_oom:\n"
+ " \tmutex_unlock(&oom_lock);\n"
+ "\n"
+ "This will perform the same as then the set_bit in __mmput can be\n"
+ "removed and test_and_set_bit doesn't cost more (at least on x86, on\n"
+ "other archs it requires an smp_mb too, but it's marginal difference),\n"
+ "still an atomic op that is.\n"
+ "\n"
+ "> > 4) how is it safe to overwrite a VM_FAULT_RETRY that returns without\n"
+ "> >    mmap_sem and then the arch code will release the mmap_sem despite\n"
+ "> >    it was already released by handle_mm_fault? Anonymous memory faults\n"
+ "> >    aren't common to return VM_FAULT_RETRY but an userfault\n"
+ "> >    can. Shouldn't there be a block that prevents overwriting if\n"
+ "> >    VM_FAULT_RETRY is set below? (not only VM_FAULT_ERROR)\n"
+ "> > \n"
+ "> > \tif (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)\n"
+ "> > \t\t\t\t&& test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))\n"
+ "> > \t\tret = VM_FAULT_SIGBUS;\n"
+ "> \n"
+ "> I am not sure I understand what you mean and how this is related to the\n"
+ "> patch?\n"
+ "\n"
+ "It's not related to the patch but it involves the OOM reaper as it\n"
+ "only happens when MMF_UNSTABLE is set which is set only by the OOM\n"
+ "reaper. I was simply reading the OOM reaper code and following up what\n"
+ MMF_UNSTABLE does and it ringed a bell.
 
-05b36d9aa0c5a91684537dcf36a26a2838d75a477c7985b67f0bd8666400d51c
+3e05aadccf964bcd13b3beb3c95b24b213b026b160910a690549233ce7bbff17

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.