diff for duplicates of <20170725182619.GQ29716@redhat.com> diff --git a/a/1.txt b/N1/1.txt index d32c464..e739523 100644 --- a/a/1.txt +++ b/N1/1.txt @@ -44,3 +44,139 @@ So the simplest is to use a similar trick to what ksm_exit uses, this is untested just to show the idea it may require further adjustment as the bit isn't used only for the test_and_set_bit locking, but I didn't see much issues with other set_bit/test_bit. + +>From f414244480fdc1f771b3148feb3fac77ec813e4c Mon Sep 17 00:00:00 2001 +From: Andrea Arcangeli <aarcange@redhat.com> +Date: Tue, 25 Jul 2017 20:02:27 +0200 +Subject: [PATCH 1/1] mm: oom: let oom_reap_task and exit_mmap to run + concurrently + +This is purely required because exit_aio() may block and exit_mmap() may +never start, if the oom_reap_task cannot start running on a mm with +mm_users == 0. + +At the same time if the OOM reaper doesn't wait at all for the memory +of the current OOM candidate to be freed by exit_mmap->unmap_vmas, it +would generate a spurious OOM kill. + +If it wasn't because of the exit_aio it would be enough to change the +oom_reap_task in the case it finds mm_users == 0, to wait for a +timeout or to wait for __mmput to set MMF_OOM_SKIP itself, but it's +not just exit_mmap the problem here so the concurrency of exit_mmap +and oom_reap_task is apparently warranted. + +It's a non standard runtime, exit_mmap runs without mmap_sem, and +oom_reap_task runs with the mmap_sem for reading as usual (kind of +MADV_DONTNEED). + +The race between the two is solved with pair of test_and_set_bit and a +dummy down_write/up_write cycle on the same lines of the ksm_exit +method. + +If the OOM reaper sets the bit first, exit_mmap will wait it to finish +in down_write (before taking down mm structures that would make the +oom_reap_task fail with use after free). + +If exit_mmap comes first, oom reaper will skip the mm as it's already +all freed. + +Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> +--- + kernel/fork.c | 1 - + mm/mmap.c | 5 +++++ + mm/oom_kill.c | 15 ++++++++++++--- + 3 files changed, 17 insertions(+), 4 deletions(-) + +diff --git a/kernel/fork.c b/kernel/fork.c +index 9ec98b0c4675..ed412d85a596 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -910,7 +910,6 @@ static inline void __mmput(struct mm_struct *mm) + } + if (mm->binfmt) + module_put(mm->binfmt->module); +- set_bit(MMF_OOM_SKIP, &mm->flags); + mmdrop(mm); + } + +diff --git a/mm/mmap.c b/mm/mmap.c +index f19efcf75418..615133762b99 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -2993,6 +2993,11 @@ void exit_mmap(struct mm_struct *mm) + /* Use -1 here to ensure all VMAs in the mm are unmapped */ + unmap_vmas(&tlb, vma, 0, -1); + ++ if (test_and_set_bit(MMF_OOM_SKIP, &mm->flags)) { ++ /* wait the OOM reaper to stop working on this mm */ ++ down_write(&mm->mmap_sem); ++ up_write(&mm->mmap_sem); ++ } + free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); + tlb_finish_mmu(&tlb, 0, -1); + +diff --git a/mm/oom_kill.c b/mm/oom_kill.c +index 9e8b4f030c1c..2a7000995784 100644 +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -471,6 +471,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) + struct mmu_gather tlb; + struct vm_area_struct *vma; + bool ret = true; ++ bool mmgot = true; + + /* + * We have to make sure to not race with the victim exit path +@@ -500,9 +501,16 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) + * and delayed __mmput doesn't matter that much + */ + if (!mmget_not_zero(mm)) { +- up_read(&mm->mmap_sem); + trace_skip_task_reaping(tsk->pid); +- goto unlock_oom; ++ /* ++ * MMF_OOM_SKIP is set by exit_mmap when the OOM ++ * reaper can't work on the mm anymore. ++ */ ++ if (test_and_set_bit(MMF_OOM_SKIP, &mm->flags)) { ++ up_read(&mm->mmap_sem); ++ goto unlock_oom; ++ } ++ mmgot = false; + } + + trace_start_task_reaping(tsk->pid); +@@ -547,7 +555,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) + * different context because we shouldn't risk we get stuck there and + * put the oom_reaper out of the way. + */ +- mmput_async(mm); ++ if (mmgot) ++ mmput_async(mm); + trace_finish_task_reaping(tsk->pid); + unlock_oom: + mutex_unlock(&oom_lock); + +This will perform the same as then the set_bit in __mmput can be +removed and test_and_set_bit doesn't cost more (at least on x86, on +other archs it requires an smp_mb too, but it's marginal difference), +still an atomic op that is. + +> > 4) how is it safe to overwrite a VM_FAULT_RETRY that returns without +> > mmap_sem and then the arch code will release the mmap_sem despite +> > it was already released by handle_mm_fault? Anonymous memory faults +> > aren't common to return VM_FAULT_RETRY but an userfault +> > can. Shouldn't there be a block that prevents overwriting if +> > VM_FAULT_RETRY is set below? (not only VM_FAULT_ERROR) +> > +> > if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR) +> > && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) +> > ret = VM_FAULT_SIGBUS; +> +> I am not sure I understand what you mean and how this is related to the +> patch? + +It's not related to the patch but it involves the OOM reaper as it +only happens when MMF_UNSTABLE is set which is set only by the OOM +reaper. I was simply reading the OOM reaper code and following up what +MMF_UNSTABLE does and it ringed a bell. diff --git a/a/content_digest b/N1/content_digest index 9984d68..f927232 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -59,6 +59,142 @@ "So the simplest is to use a similar trick to what ksm_exit uses, this\n" "is untested just to show the idea it may require further adjustment as\n" "the bit isn't used only for the test_and_set_bit locking, but I didn't\n" - see much issues with other set_bit/test_bit. + "see much issues with other set_bit/test_bit.\n" + "\n" + ">From f414244480fdc1f771b3148feb3fac77ec813e4c Mon Sep 17 00:00:00 2001\n" + "From: Andrea Arcangeli <aarcange@redhat.com>\n" + "Date: Tue, 25 Jul 2017 20:02:27 +0200\n" + "Subject: [PATCH 1/1] mm: oom: let oom_reap_task and exit_mmap to run\n" + " concurrently\n" + "\n" + "This is purely required because exit_aio() may block and exit_mmap() may\n" + "never start, if the oom_reap_task cannot start running on a mm with\n" + "mm_users == 0.\n" + "\n" + "At the same time if the OOM reaper doesn't wait at all for the memory\n" + "of the current OOM candidate to be freed by exit_mmap->unmap_vmas, it\n" + "would generate a spurious OOM kill.\n" + "\n" + "If it wasn't because of the exit_aio it would be enough to change the\n" + "oom_reap_task in the case it finds mm_users == 0, to wait for a\n" + "timeout or to wait for __mmput to set MMF_OOM_SKIP itself, but it's\n" + "not just exit_mmap the problem here so the concurrency of exit_mmap\n" + "and oom_reap_task is apparently warranted.\n" + "\n" + "It's a non standard runtime, exit_mmap runs without mmap_sem, and\n" + "oom_reap_task runs with the mmap_sem for reading as usual (kind of\n" + "MADV_DONTNEED).\n" + "\n" + "The race between the two is solved with pair of test_and_set_bit and a\n" + "dummy down_write/up_write cycle on the same lines of the ksm_exit\n" + "method.\n" + "\n" + "If the OOM reaper sets the bit first, exit_mmap will wait it to finish\n" + "in down_write (before taking down mm structures that would make the\n" + "oom_reap_task fail with use after free).\n" + "\n" + "If exit_mmap comes first, oom reaper will skip the mm as it's already\n" + "all freed.\n" + "\n" + "Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>\n" + "---\n" + " kernel/fork.c | 1 -\n" + " mm/mmap.c | 5 +++++\n" + " mm/oom_kill.c | 15 ++++++++++++---\n" + " 3 files changed, 17 insertions(+), 4 deletions(-)\n" + "\n" + "diff --git a/kernel/fork.c b/kernel/fork.c\n" + "index 9ec98b0c4675..ed412d85a596 100644\n" + "--- a/kernel/fork.c\n" + "+++ b/kernel/fork.c\n" + "@@ -910,7 +910,6 @@ static inline void __mmput(struct mm_struct *mm)\n" + " \t}\n" + " \tif (mm->binfmt)\n" + " \t\tmodule_put(mm->binfmt->module);\n" + "-\tset_bit(MMF_OOM_SKIP, &mm->flags);\n" + " \tmmdrop(mm);\n" + " }\n" + " \n" + "diff --git a/mm/mmap.c b/mm/mmap.c\n" + "index f19efcf75418..615133762b99 100644\n" + "--- a/mm/mmap.c\n" + "+++ b/mm/mmap.c\n" + "@@ -2993,6 +2993,11 @@ void exit_mmap(struct mm_struct *mm)\n" + " \t/* Use -1 here to ensure all VMAs in the mm are unmapped */\n" + " \tunmap_vmas(&tlb, vma, 0, -1);\n" + " \n" + "+\tif (test_and_set_bit(MMF_OOM_SKIP, &mm->flags)) {\n" + "+\t\t/* wait the OOM reaper to stop working on this mm */\n" + "+\t\tdown_write(&mm->mmap_sem);\n" + "+\t\tup_write(&mm->mmap_sem);\n" + "+\t}\n" + " \tfree_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);\n" + " \ttlb_finish_mmu(&tlb, 0, -1);\n" + " \n" + "diff --git a/mm/oom_kill.c b/mm/oom_kill.c\n" + "index 9e8b4f030c1c..2a7000995784 100644\n" + "--- a/mm/oom_kill.c\n" + "+++ b/mm/oom_kill.c\n" + "@@ -471,6 +471,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)\n" + " \tstruct mmu_gather tlb;\n" + " \tstruct vm_area_struct *vma;\n" + " \tbool ret = true;\n" + "+\tbool mmgot = true;\n" + " \n" + " \t/*\n" + " \t * We have to make sure to not race with the victim exit path\n" + "@@ -500,9 +501,16 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)\n" + " \t * and delayed __mmput doesn't matter that much\n" + " \t */\n" + " \tif (!mmget_not_zero(mm)) {\n" + "-\t\tup_read(&mm->mmap_sem);\n" + " \t\ttrace_skip_task_reaping(tsk->pid);\n" + "-\t\tgoto unlock_oom;\n" + "+\t\t/*\n" + "+\t\t * MMF_OOM_SKIP is set by exit_mmap when the OOM\n" + "+\t\t * reaper can't work on the mm anymore.\n" + "+\t\t */\n" + "+\t\tif (test_and_set_bit(MMF_OOM_SKIP, &mm->flags)) {\n" + "+\t\t\tup_read(&mm->mmap_sem);\n" + "+\t\t\tgoto unlock_oom;\n" + "+\t\t}\n" + "+\t\tmmgot = false;\n" + " \t}\n" + " \n" + " \ttrace_start_task_reaping(tsk->pid);\n" + "@@ -547,7 +555,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)\n" + " \t * different context because we shouldn't risk we get stuck there and\n" + " \t * put the oom_reaper out of the way.\n" + " \t */\n" + "-\tmmput_async(mm);\n" + "+\tif (mmgot)\n" + "+\t\tmmput_async(mm);\n" + " \ttrace_finish_task_reaping(tsk->pid);\n" + " unlock_oom:\n" + " \tmutex_unlock(&oom_lock);\n" + "\n" + "This will perform the same as then the set_bit in __mmput can be\n" + "removed and test_and_set_bit doesn't cost more (at least on x86, on\n" + "other archs it requires an smp_mb too, but it's marginal difference),\n" + "still an atomic op that is.\n" + "\n" + "> > 4) how is it safe to overwrite a VM_FAULT_RETRY that returns without\n" + "> > mmap_sem and then the arch code will release the mmap_sem despite\n" + "> > it was already released by handle_mm_fault? Anonymous memory faults\n" + "> > aren't common to return VM_FAULT_RETRY but an userfault\n" + "> > can. Shouldn't there be a block that prevents overwriting if\n" + "> > VM_FAULT_RETRY is set below? (not only VM_FAULT_ERROR)\n" + "> > \n" + "> > \tif (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)\n" + "> > \t\t\t\t&& test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))\n" + "> > \t\tret = VM_FAULT_SIGBUS;\n" + "> \n" + "> I am not sure I understand what you mean and how this is related to the\n" + "> patch?\n" + "\n" + "It's not related to the patch but it involves the OOM reaper as it\n" + "only happens when MMF_UNSTABLE is set which is set only by the OOM\n" + "reaper. I was simply reading the OOM reaper code and following up what\n" + MMF_UNSTABLE does and it ringed a bell. -05b36d9aa0c5a91684537dcf36a26a2838d75a477c7985b67f0bd8666400d51c +3e05aadccf964bcd13b3beb3c95b24b213b026b160910a690549233ce7bbff17
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.