diff for duplicates of <20170726162912.GA29716@redhat.com> diff --git a/a/1.txt b/N1/1.txt index 2963298..3b74bb8 100644 --- a/a/1.txt +++ b/N1/1.txt @@ -42,3 +42,125 @@ Isn't this enough? If this is enough it avoids other modification to the exit_mmap runtime that looks unnecessary: mm->mmap = NULL replaced by MMF_OOM_SKIP that has to be set anyway by __mmput later and one unnecessary branch to call the up_write. + +>From 3d9001490ee1a71f39c7bfaf19e96821f9d3ff16 Mon Sep 17 00:00:00 2001 +From: Andrea Arcangeli <aarcange@redhat.com> +Date: Tue, 25 Jul 2017 20:02:27 +0200 +Subject: [PATCH 1/1] mm: oom: let oom_reap_task and exit_mmap to run + concurrently + +This is purely required because exit_aio() may block and exit_mmap() may +never start, if the oom_reap_task cannot start running on a mm with +mm_users == 0. + +At the same time if the OOM reaper doesn't wait at all for the memory +of the current OOM candidate to be freed by exit_mmap->unmap_vmas, it +would generate a spurious OOM kill. + +If it wasn't because of the exit_aio or similar blocking functions in +the last mmput, it would be enough to change the oom_reap_task() in +the case it finds mm_users == 0, to wait for a timeout or to wait for +__mmput to set MMF_OOM_SKIP itself, but it's not just exit_mmap the +problem here so the concurrency of exit_mmap and oom_reap_task is +apparently warranted. + +It's a non standard runtime, exit_mmap() runs without mmap_sem, and +oom_reap_task runs with the mmap_sem for reading as usual (kind of +MADV_DONTNEED). + +The race between the two is solved with a combination of +tsk_is_oom_victim() (serialized by task_lock) and MMF_OOM_SKIP +(serialized by a dummy down_write/up_write cycle on the same lines of +the ksm_exit method). + +If the oom_reap_task() may be running concurrently during exit_mmap, +exit_mmap will wait it to finish in down_write (before taking down mm +structures that would make the oom_reap_task fail with use after +free). + +If exit_mmap comes first, oom_reap_task() will skip the mm if +MMF_OOM_SKIP is already set and in turn all memory is already freed +and furthermore the mm data structures may already have been taken +down by free_pgtables. + +Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> +--- + kernel/fork.c | 1 - + mm/mmap.c | 17 +++++++++++++++++ + mm/oom_kill.c | 15 +++++---------- + 3 files changed, 22 insertions(+), 11 deletions(-) + +diff --git a/kernel/fork.c b/kernel/fork.c +index 9ec98b0c4675..ed412d85a596 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -910,7 +910,6 @@ static inline void __mmput(struct mm_struct *mm) + } + if (mm->binfmt) + module_put(mm->binfmt->module); +- set_bit(MMF_OOM_SKIP, &mm->flags); + mmdrop(mm); + } + +diff --git a/mm/mmap.c b/mm/mmap.c +index f19efcf75418..bdab595ce25c 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -2993,6 +2993,23 @@ void exit_mmap(struct mm_struct *mm) + /* Use -1 here to ensure all VMAs in the mm are unmapped */ + unmap_vmas(&tlb, vma, 0, -1); + ++ set_bit(MMF_OOM_SKIP, &mm->flags); ++ if (tsk_is_oom_victim(current)) { ++ /* ++ * Wait for oom_reap_task() to stop working on this ++ * mm. Because MMF_OOM_SKIP is already set before ++ * calling down_read(), oom_reap_task() will not run ++ * on this "mm" post up_write(). ++ * ++ * tsk_is_oom_victim() cannot be set from under us ++ * either because current->mm is already set to NULL ++ * under task_lock before calling mmput and oom_mm is ++ * set not NULL by the OOM killer only if current->mm ++ * is found not NULL while holding the task_lock. ++ */ ++ down_write(&mm->mmap_sem); ++ up_write(&mm->mmap_sem); ++ } + free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); + tlb_finish_mmu(&tlb, 0, -1); + +diff --git a/mm/oom_kill.c b/mm/oom_kill.c +index 9e8b4f030c1c..242a1f0d579b 100644 +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) + } + + /* +- * increase mm_users only after we know we will reap something so +- * that the mmput_async is called only when we have reaped something +- * and delayed __mmput doesn't matter that much ++ * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't ++ * work on the mm anymore. The check for MMF_OOM_SKIP must run ++ * under mmap_sem for reading because it serializes against the ++ * down_write();up_write() cycle in exit_mmap(). + */ +- if (!mmget_not_zero(mm)) { ++ if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + up_read(&mm->mmap_sem); + trace_skip_task_reaping(tsk->pid); + goto unlock_oom; +@@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) + K(get_mm_counter(mm, MM_SHMEMPAGES))); + up_read(&mm->mmap_sem); + +- /* +- * Drop our reference but make sure the mmput slow path is called from a +- * different context because we shouldn't risk we get stuck there and +- * put the oom_reaper out of the way. +- */ +- mmput_async(mm); + trace_finish_task_reaping(tsk->pid); + unlock_oom: + mutex_unlock(&oom_lock); diff --git a/a/content_digest b/N1/content_digest index 5c5f04e..afc15de 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -65,6 +65,128 @@ "Isn't this enough? If this is enough it avoids other modification to\n" "the exit_mmap runtime that looks unnecessary: mm->mmap = NULL replaced\n" "by MMF_OOM_SKIP that has to be set anyway by __mmput later and one\n" - unnecessary branch to call the up_write. + "unnecessary branch to call the up_write.\n" + "\n" + ">From 3d9001490ee1a71f39c7bfaf19e96821f9d3ff16 Mon Sep 17 00:00:00 2001\n" + "From: Andrea Arcangeli <aarcange@redhat.com>\n" + "Date: Tue, 25 Jul 2017 20:02:27 +0200\n" + "Subject: [PATCH 1/1] mm: oom: let oom_reap_task and exit_mmap to run\n" + " concurrently\n" + "\n" + "This is purely required because exit_aio() may block and exit_mmap() may\n" + "never start, if the oom_reap_task cannot start running on a mm with\n" + "mm_users == 0.\n" + "\n" + "At the same time if the OOM reaper doesn't wait at all for the memory\n" + "of the current OOM candidate to be freed by exit_mmap->unmap_vmas, it\n" + "would generate a spurious OOM kill.\n" + "\n" + "If it wasn't because of the exit_aio or similar blocking functions in\n" + "the last mmput, it would be enough to change the oom_reap_task() in\n" + "the case it finds mm_users == 0, to wait for a timeout or to wait for\n" + "__mmput to set MMF_OOM_SKIP itself, but it's not just exit_mmap the\n" + "problem here so the concurrency of exit_mmap and oom_reap_task is\n" + "apparently warranted.\n" + "\n" + "It's a non standard runtime, exit_mmap() runs without mmap_sem, and\n" + "oom_reap_task runs with the mmap_sem for reading as usual (kind of\n" + "MADV_DONTNEED).\n" + "\n" + "The race between the two is solved with a combination of\n" + "tsk_is_oom_victim() (serialized by task_lock) and MMF_OOM_SKIP\n" + "(serialized by a dummy down_write/up_write cycle on the same lines of\n" + "the ksm_exit method).\n" + "\n" + "If the oom_reap_task() may be running concurrently during exit_mmap,\n" + "exit_mmap will wait it to finish in down_write (before taking down mm\n" + "structures that would make the oom_reap_task fail with use after\n" + "free).\n" + "\n" + "If exit_mmap comes first, oom_reap_task() will skip the mm if\n" + "MMF_OOM_SKIP is already set and in turn all memory is already freed\n" + "and furthermore the mm data structures may already have been taken\n" + "down by free_pgtables.\n" + "\n" + "Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>\n" + "---\n" + " kernel/fork.c | 1 -\n" + " mm/mmap.c | 17 +++++++++++++++++\n" + " mm/oom_kill.c | 15 +++++----------\n" + " 3 files changed, 22 insertions(+), 11 deletions(-)\n" + "\n" + "diff --git a/kernel/fork.c b/kernel/fork.c\n" + "index 9ec98b0c4675..ed412d85a596 100644\n" + "--- a/kernel/fork.c\n" + "+++ b/kernel/fork.c\n" + "@@ -910,7 +910,6 @@ static inline void __mmput(struct mm_struct *mm)\n" + " \t}\n" + " \tif (mm->binfmt)\n" + " \t\tmodule_put(mm->binfmt->module);\n" + "-\tset_bit(MMF_OOM_SKIP, &mm->flags);\n" + " \tmmdrop(mm);\n" + " }\n" + " \n" + "diff --git a/mm/mmap.c b/mm/mmap.c\n" + "index f19efcf75418..bdab595ce25c 100644\n" + "--- a/mm/mmap.c\n" + "+++ b/mm/mmap.c\n" + "@@ -2993,6 +2993,23 @@ void exit_mmap(struct mm_struct *mm)\n" + " \t/* Use -1 here to ensure all VMAs in the mm are unmapped */\n" + " \tunmap_vmas(&tlb, vma, 0, -1);\n" + " \n" + "+\tset_bit(MMF_OOM_SKIP, &mm->flags);\n" + "+\tif (tsk_is_oom_victim(current)) {\n" + "+\t\t/*\n" + "+\t\t * Wait for oom_reap_task() to stop working on this\n" + "+\t\t * mm. Because MMF_OOM_SKIP is already set before\n" + "+\t\t * calling down_read(), oom_reap_task() will not run\n" + "+\t\t * on this \"mm\" post up_write().\n" + "+\t\t *\n" + "+\t\t * tsk_is_oom_victim() cannot be set from under us\n" + "+\t\t * either because current->mm is already set to NULL\n" + "+\t\t * under task_lock before calling mmput and oom_mm is\n" + "+\t\t * set not NULL by the OOM killer only if current->mm\n" + "+\t\t * is found not NULL while holding the task_lock.\n" + "+\t\t */\n" + "+\t\tdown_write(&mm->mmap_sem);\n" + "+\t\tup_write(&mm->mmap_sem);\n" + "+\t}\n" + " \tfree_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);\n" + " \ttlb_finish_mmu(&tlb, 0, -1);\n" + " \n" + "diff --git a/mm/oom_kill.c b/mm/oom_kill.c\n" + "index 9e8b4f030c1c..242a1f0d579b 100644\n" + "--- a/mm/oom_kill.c\n" + "+++ b/mm/oom_kill.c\n" + "@@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)\n" + " \t}\n" + " \n" + " \t/*\n" + "-\t * increase mm_users only after we know we will reap something so\n" + "-\t * that the mmput_async is called only when we have reaped something\n" + "-\t * and delayed __mmput doesn't matter that much\n" + "+\t * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't\n" + "+\t * work on the mm anymore. The check for MMF_OOM_SKIP must run\n" + "+\t * under mmap_sem for reading because it serializes against the\n" + "+\t * down_write();up_write() cycle in exit_mmap().\n" + " \t */\n" + "-\tif (!mmget_not_zero(mm)) {\n" + "+\tif (test_bit(MMF_OOM_SKIP, &mm->flags)) {\n" + " \t\tup_read(&mm->mmap_sem);\n" + " \t\ttrace_skip_task_reaping(tsk->pid);\n" + " \t\tgoto unlock_oom;\n" + "@@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)\n" + " \t\t\tK(get_mm_counter(mm, MM_SHMEMPAGES)));\n" + " \tup_read(&mm->mmap_sem);\n" + " \n" + "-\t/*\n" + "-\t * Drop our reference but make sure the mmput slow path is called from a\n" + "-\t * different context because we shouldn't risk we get stuck there and\n" + "-\t * put the oom_reaper out of the way.\n" + "-\t */\n" + "-\tmmput_async(mm);\n" + " \ttrace_finish_task_reaping(tsk->pid);\n" + " unlock_oom:\n" + " \tmutex_unlock(&oom_lock);" -4e487a4d569a70d82d8131afe5b43ef9ea9744e323c6a0a2fa65cfbe6ba72621 +ba860146270d1c8d280358e0adc69b249e891f917bc5bea4638bb698c12b3b97
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.