All of lore.kernel.org
 help / color / mirror / Atom feed
diff for duplicates of <20170726162912.GA29716@redhat.com>

diff --git a/a/1.txt b/N1/1.txt
index 2963298..3b74bb8 100644
--- a/a/1.txt
+++ b/N1/1.txt
@@ -42,3 +42,125 @@ Isn't this enough? If this is enough it avoids other modification to
 the exit_mmap runtime that looks unnecessary: mm->mmap = NULL replaced
 by MMF_OOM_SKIP that has to be set anyway by __mmput later and one
 unnecessary branch to call the up_write.
+
+>From 3d9001490ee1a71f39c7bfaf19e96821f9d3ff16 Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Tue, 25 Jul 2017 20:02:27 +0200
+Subject: [PATCH 1/1] mm: oom: let oom_reap_task and exit_mmap to run
+ concurrently
+
+This is purely required because exit_aio() may block and exit_mmap() may
+never start, if the oom_reap_task cannot start running on a mm with
+mm_users == 0.
+
+At the same time if the OOM reaper doesn't wait at all for the memory
+of the current OOM candidate to be freed by exit_mmap->unmap_vmas, it
+would generate a spurious OOM kill.
+
+If it wasn't because of the exit_aio or similar blocking functions in
+the last mmput, it would be enough to change the oom_reap_task() in
+the case it finds mm_users == 0, to wait for a timeout or to wait for
+__mmput to set MMF_OOM_SKIP itself, but it's not just exit_mmap the
+problem here so the concurrency of exit_mmap and oom_reap_task is
+apparently warranted.
+
+It's a non standard runtime, exit_mmap() runs without mmap_sem, and
+oom_reap_task runs with the mmap_sem for reading as usual (kind of
+MADV_DONTNEED).
+
+The race between the two is solved with a combination of
+tsk_is_oom_victim() (serialized by task_lock) and MMF_OOM_SKIP
+(serialized by a dummy down_write/up_write cycle on the same lines of
+the ksm_exit method).
+
+If the oom_reap_task() may be running concurrently during exit_mmap,
+exit_mmap will wait it to finish in down_write (before taking down mm
+structures that would make the oom_reap_task fail with use after
+free).
+
+If exit_mmap comes first, oom_reap_task() will skip the mm if
+MMF_OOM_SKIP is already set and in turn all memory is already freed
+and furthermore the mm data structures may already have been taken
+down by free_pgtables.
+
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+---
+ kernel/fork.c |  1 -
+ mm/mmap.c     | 17 +++++++++++++++++
+ mm/oom_kill.c | 15 +++++----------
+ 3 files changed, 22 insertions(+), 11 deletions(-)
+
+diff --git a/kernel/fork.c b/kernel/fork.c
+index 9ec98b0c4675..ed412d85a596 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -910,7 +910,6 @@ static inline void __mmput(struct mm_struct *mm)
+ 	}
+ 	if (mm->binfmt)
+ 		module_put(mm->binfmt->module);
+-	set_bit(MMF_OOM_SKIP, &mm->flags);
+ 	mmdrop(mm);
+ }
+ 
+diff --git a/mm/mmap.c b/mm/mmap.c
+index f19efcf75418..bdab595ce25c 100644
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -2993,6 +2993,23 @@ void exit_mmap(struct mm_struct *mm)
+ 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
+ 	unmap_vmas(&tlb, vma, 0, -1);
+ 
++	set_bit(MMF_OOM_SKIP, &mm->flags);
++	if (tsk_is_oom_victim(current)) {
++		/*
++		 * Wait for oom_reap_task() to stop working on this
++		 * mm. Because MMF_OOM_SKIP is already set before
++		 * calling down_read(), oom_reap_task() will not run
++		 * on this "mm" post up_write().
++		 *
++		 * tsk_is_oom_victim() cannot be set from under us
++		 * either because current->mm is already set to NULL
++		 * under task_lock before calling mmput and oom_mm is
++		 * set not NULL by the OOM killer only if current->mm
++		 * is found not NULL while holding the task_lock.
++		 */
++		down_write(&mm->mmap_sem);
++		up_write(&mm->mmap_sem);
++	}
+ 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
+ 	tlb_finish_mmu(&tlb, 0, -1);
+ 
+diff --git a/mm/oom_kill.c b/mm/oom_kill.c
+index 9e8b4f030c1c..242a1f0d579b 100644
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+ 	}
+ 
+ 	/*
+-	 * increase mm_users only after we know we will reap something so
+-	 * that the mmput_async is called only when we have reaped something
+-	 * and delayed __mmput doesn't matter that much
++	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
++	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
++	 * under mmap_sem for reading because it serializes against the
++	 * down_write();up_write() cycle in exit_mmap().
+ 	 */
+-	if (!mmget_not_zero(mm)) {
++	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ 		up_read(&mm->mmap_sem);
+ 		trace_skip_task_reaping(tsk->pid);
+ 		goto unlock_oom;
+@@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+ 			K(get_mm_counter(mm, MM_SHMEMPAGES)));
+ 	up_read(&mm->mmap_sem);
+ 
+-	/*
+-	 * Drop our reference but make sure the mmput slow path is called from a
+-	 * different context because we shouldn't risk we get stuck there and
+-	 * put the oom_reaper out of the way.
+-	 */
+-	mmput_async(mm);
+ 	trace_finish_task_reaping(tsk->pid);
+ unlock_oom:
+ 	mutex_unlock(&oom_lock);
diff --git a/a/content_digest b/N1/content_digest
index 5c5f04e..afc15de 100644
--- a/a/content_digest
+++ b/N1/content_digest
@@ -65,6 +65,128 @@
  "Isn't this enough? If this is enough it avoids other modification to\n"
  "the exit_mmap runtime that looks unnecessary: mm->mmap = NULL replaced\n"
  "by MMF_OOM_SKIP that has to be set anyway by __mmput later and one\n"
- unnecessary branch to call the up_write.
+ "unnecessary branch to call the up_write.\n"
+ "\n"
+ ">From 3d9001490ee1a71f39c7bfaf19e96821f9d3ff16 Mon Sep 17 00:00:00 2001\n"
+ "From: Andrea Arcangeli <aarcange@redhat.com>\n"
+ "Date: Tue, 25 Jul 2017 20:02:27 +0200\n"
+ "Subject: [PATCH 1/1] mm: oom: let oom_reap_task and exit_mmap to run\n"
+ " concurrently\n"
+ "\n"
+ "This is purely required because exit_aio() may block and exit_mmap() may\n"
+ "never start, if the oom_reap_task cannot start running on a mm with\n"
+ "mm_users == 0.\n"
+ "\n"
+ "At the same time if the OOM reaper doesn't wait at all for the memory\n"
+ "of the current OOM candidate to be freed by exit_mmap->unmap_vmas, it\n"
+ "would generate a spurious OOM kill.\n"
+ "\n"
+ "If it wasn't because of the exit_aio or similar blocking functions in\n"
+ "the last mmput, it would be enough to change the oom_reap_task() in\n"
+ "the case it finds mm_users == 0, to wait for a timeout or to wait for\n"
+ "__mmput to set MMF_OOM_SKIP itself, but it's not just exit_mmap the\n"
+ "problem here so the concurrency of exit_mmap and oom_reap_task is\n"
+ "apparently warranted.\n"
+ "\n"
+ "It's a non standard runtime, exit_mmap() runs without mmap_sem, and\n"
+ "oom_reap_task runs with the mmap_sem for reading as usual (kind of\n"
+ "MADV_DONTNEED).\n"
+ "\n"
+ "The race between the two is solved with a combination of\n"
+ "tsk_is_oom_victim() (serialized by task_lock) and MMF_OOM_SKIP\n"
+ "(serialized by a dummy down_write/up_write cycle on the same lines of\n"
+ "the ksm_exit method).\n"
+ "\n"
+ "If the oom_reap_task() may be running concurrently during exit_mmap,\n"
+ "exit_mmap will wait it to finish in down_write (before taking down mm\n"
+ "structures that would make the oom_reap_task fail with use after\n"
+ "free).\n"
+ "\n"
+ "If exit_mmap comes first, oom_reap_task() will skip the mm if\n"
+ "MMF_OOM_SKIP is already set and in turn all memory is already freed\n"
+ "and furthermore the mm data structures may already have been taken\n"
+ "down by free_pgtables.\n"
+ "\n"
+ "Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>\n"
+ "---\n"
+ " kernel/fork.c |  1 -\n"
+ " mm/mmap.c     | 17 +++++++++++++++++\n"
+ " mm/oom_kill.c | 15 +++++----------\n"
+ " 3 files changed, 22 insertions(+), 11 deletions(-)\n"
+ "\n"
+ "diff --git a/kernel/fork.c b/kernel/fork.c\n"
+ "index 9ec98b0c4675..ed412d85a596 100644\n"
+ "--- a/kernel/fork.c\n"
+ "+++ b/kernel/fork.c\n"
+ "@@ -910,7 +910,6 @@ static inline void __mmput(struct mm_struct *mm)\n"
+ " \t}\n"
+ " \tif (mm->binfmt)\n"
+ " \t\tmodule_put(mm->binfmt->module);\n"
+ "-\tset_bit(MMF_OOM_SKIP, &mm->flags);\n"
+ " \tmmdrop(mm);\n"
+ " }\n"
+ " \n"
+ "diff --git a/mm/mmap.c b/mm/mmap.c\n"
+ "index f19efcf75418..bdab595ce25c 100644\n"
+ "--- a/mm/mmap.c\n"
+ "+++ b/mm/mmap.c\n"
+ "@@ -2993,6 +2993,23 @@ void exit_mmap(struct mm_struct *mm)\n"
+ " \t/* Use -1 here to ensure all VMAs in the mm are unmapped */\n"
+ " \tunmap_vmas(&tlb, vma, 0, -1);\n"
+ " \n"
+ "+\tset_bit(MMF_OOM_SKIP, &mm->flags);\n"
+ "+\tif (tsk_is_oom_victim(current)) {\n"
+ "+\t\t/*\n"
+ "+\t\t * Wait for oom_reap_task() to stop working on this\n"
+ "+\t\t * mm. Because MMF_OOM_SKIP is already set before\n"
+ "+\t\t * calling down_read(), oom_reap_task() will not run\n"
+ "+\t\t * on this \"mm\" post up_write().\n"
+ "+\t\t *\n"
+ "+\t\t * tsk_is_oom_victim() cannot be set from under us\n"
+ "+\t\t * either because current->mm is already set to NULL\n"
+ "+\t\t * under task_lock before calling mmput and oom_mm is\n"
+ "+\t\t * set not NULL by the OOM killer only if current->mm\n"
+ "+\t\t * is found not NULL while holding the task_lock.\n"
+ "+\t\t */\n"
+ "+\t\tdown_write(&mm->mmap_sem);\n"
+ "+\t\tup_write(&mm->mmap_sem);\n"
+ "+\t}\n"
+ " \tfree_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);\n"
+ " \ttlb_finish_mmu(&tlb, 0, -1);\n"
+ " \n"
+ "diff --git a/mm/oom_kill.c b/mm/oom_kill.c\n"
+ "index 9e8b4f030c1c..242a1f0d579b 100644\n"
+ "--- a/mm/oom_kill.c\n"
+ "+++ b/mm/oom_kill.c\n"
+ "@@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)\n"
+ " \t}\n"
+ " \n"
+ " \t/*\n"
+ "-\t * increase mm_users only after we know we will reap something so\n"
+ "-\t * that the mmput_async is called only when we have reaped something\n"
+ "-\t * and delayed __mmput doesn't matter that much\n"
+ "+\t * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't\n"
+ "+\t * work on the mm anymore. The check for MMF_OOM_SKIP must run\n"
+ "+\t * under mmap_sem for reading because it serializes against the\n"
+ "+\t * down_write();up_write() cycle in exit_mmap().\n"
+ " \t */\n"
+ "-\tif (!mmget_not_zero(mm)) {\n"
+ "+\tif (test_bit(MMF_OOM_SKIP, &mm->flags)) {\n"
+ " \t\tup_read(&mm->mmap_sem);\n"
+ " \t\ttrace_skip_task_reaping(tsk->pid);\n"
+ " \t\tgoto unlock_oom;\n"
+ "@@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)\n"
+ " \t\t\tK(get_mm_counter(mm, MM_SHMEMPAGES)));\n"
+ " \tup_read(&mm->mmap_sem);\n"
+ " \n"
+ "-\t/*\n"
+ "-\t * Drop our reference but make sure the mmput slow path is called from a\n"
+ "-\t * different context because we shouldn't risk we get stuck there and\n"
+ "-\t * put the oom_reaper out of the way.\n"
+ "-\t */\n"
+ "-\tmmput_async(mm);\n"
+ " \ttrace_finish_task_reaping(tsk->pid);\n"
+ " unlock_oom:\n"
+ " \tmutex_unlock(&oom_lock);"
 
-4e487a4d569a70d82d8131afe5b43ef9ea9744e323c6a0a2fa65cfbe6ba72621
+ba860146270d1c8d280358e0adc69b249e891f917bc5bea4638bb698c12b3b97

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.