From: Nadav Amit <namit@vmware.com>
To: linux-mm@kvack.org
Cc: nadav.amit@gmail.com, linux-kernel@vger.kernel.org,
akpm@linux-foundation.org, Andy Lutomirski <luto@kernel.org>,
Nadav Amit <namit@vmware.com>
Subject: [PATCH v6 1/7] mm: migrate: prevent racy access to tlb_flush_pending
Date: Tue, 1 Aug 2017 17:08:12 -0700 [thread overview]
Message-ID: <20170802000818.4760-2-namit@vmware.com> (raw)
In-Reply-To: <20170802000818.4760-1-namit@vmware.com>
From: Nadav Amit <nadav.amit@gmail.com>
Setting and clearing mm->tlb_flush_pending can be performed by multiple
threads, since mmap_sem may only be acquired for read in
task_numa_work(). If this happens, tlb_flush_pending might be cleared
while one of the threads still changes PTEs and batches TLB flushes.
This can lead to the same race between migration and
change_protection_range() that led to the introduction of
tlb_flush_pending. The result of this race was data corruption, which
means that this patch also addresses a theoretically possible data
corruption.
An actual data corruption was not observed, yet the race was
was confirmed by adding assertion to check tlb_flush_pending is not set
by two threads, adding artificial latency in change_protection_range()
and using sysctl to reduce kernel.numa_balancing_scan_delay_ms.
Fixes: 20841405940e ("mm: fix TLB flush race between migration, and
change_protection_range")
Cc: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Nadav Amit <namit@vmware.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Minchan Kim <minchan@kernel.org>
---
include/linux/mm_types.h | 31 ++++++++++++++++++++++---------
kernel/fork.c | 2 +-
mm/debug.c | 2 +-
mm/mprotect.c | 4 ++--
4 files changed, 26 insertions(+), 13 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 45cdb27791a3..f5263dd0f1bc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -493,7 +493,7 @@ struct mm_struct {
* can move process memory needs to flush the TLB when moving a
* PROT_NONE or PROT_NUMA mapped page.
*/
- bool tlb_flush_pending;
+ atomic_t tlb_flush_pending;
#endif
struct uprobes_state uprobes_state;
#ifdef CONFIG_HUGETLB_PAGE
@@ -528,33 +528,46 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
barrier();
- return mm->tlb_flush_pending;
+ return atomic_read(&mm->tlb_flush_pending) > 0;
}
-static inline void set_tlb_flush_pending(struct mm_struct *mm)
+
+static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
- mm->tlb_flush_pending = true;
+ atomic_set(&mm->tlb_flush_pending, 0);
+}
+
+static inline void inc_tlb_flush_pending(struct mm_struct *mm)
+{
+ atomic_inc(&mm->tlb_flush_pending);
/*
- * Guarantee that the tlb_flush_pending store does not leak into the
+ * Guarantee that the tlb_flush_pending increase does not leak into the
* critical section updating the page tables
*/
smp_mb__before_spinlock();
}
+
/* Clearing is done after a TLB flush, which also provides a barrier. */
-static inline void clear_tlb_flush_pending(struct mm_struct *mm)
+static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
barrier();
- mm->tlb_flush_pending = false;
+ atomic_dec(&mm->tlb_flush_pending);
}
#else
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
return false;
}
-static inline void set_tlb_flush_pending(struct mm_struct *mm)
+
+static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
}
-static inline void clear_tlb_flush_pending(struct mm_struct *mm)
+
+static inline void inc_tlb_flush_pending(struct mm_struct *mm)
+{
+}
+
+static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
}
#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index e53770d2bf95..840e7a7132e1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -809,7 +809,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_aio(mm);
mm_init_owner(mm, p);
mmu_notifier_mm_init(mm);
- clear_tlb_flush_pending(mm);
+ init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif
diff --git a/mm/debug.c b/mm/debug.c
index db1cd26d8752..d70103bb4731 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -159,7 +159,7 @@ void dump_mm(const struct mm_struct *mm)
mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
#endif
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
- mm->tlb_flush_pending,
+ atomic_read(&mm->tlb_flush_pending),
#endif
mm->def_flags, &mm->def_flags
);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8edd0d576254..0c413774c1e3 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -245,7 +245,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- set_tlb_flush_pending(mm);
+ inc_tlb_flush_pending(mm);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
@@ -257,7 +257,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
/* Only flush the TLB if we actually modified any entries: */
if (pages)
flush_tlb_range(vma, start, end);
- clear_tlb_flush_pending(mm);
+ dec_tlb_flush_pending(mm);
return pages;
}
--
2.11.0
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
WARNING: multiple messages have this Message-ID (diff)
From: Nadav Amit <namit@vmware.com>
To: <linux-mm@kvack.org>
Cc: <nadav.amit@gmail.com>, <linux-kernel@vger.kernel.org>,
<akpm@linux-foundation.org>, Andy Lutomirski <luto@kernel.org>,
Nadav Amit <namit@vmware.com>
Subject: [PATCH v6 1/7] mm: migrate: prevent racy access to tlb_flush_pending
Date: Tue, 1 Aug 2017 17:08:12 -0700 [thread overview]
Message-ID: <20170802000818.4760-2-namit@vmware.com> (raw)
In-Reply-To: <20170802000818.4760-1-namit@vmware.com>
From: Nadav Amit <nadav.amit@gmail.com>
Setting and clearing mm->tlb_flush_pending can be performed by multiple
threads, since mmap_sem may only be acquired for read in
task_numa_work(). If this happens, tlb_flush_pending might be cleared
while one of the threads still changes PTEs and batches TLB flushes.
This can lead to the same race between migration and
change_protection_range() that led to the introduction of
tlb_flush_pending. The result of this race was data corruption, which
means that this patch also addresses a theoretically possible data
corruption.
An actual data corruption was not observed, yet the race was
was confirmed by adding assertion to check tlb_flush_pending is not set
by two threads, adding artificial latency in change_protection_range()
and using sysctl to reduce kernel.numa_balancing_scan_delay_ms.
Fixes: 20841405940e ("mm: fix TLB flush race between migration, and
change_protection_range")
Cc: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Nadav Amit <namit@vmware.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Minchan Kim <minchan@kernel.org>
---
include/linux/mm_types.h | 31 ++++++++++++++++++++++---------
kernel/fork.c | 2 +-
mm/debug.c | 2 +-
mm/mprotect.c | 4 ++--
4 files changed, 26 insertions(+), 13 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 45cdb27791a3..f5263dd0f1bc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -493,7 +493,7 @@ struct mm_struct {
* can move process memory needs to flush the TLB when moving a
* PROT_NONE or PROT_NUMA mapped page.
*/
- bool tlb_flush_pending;
+ atomic_t tlb_flush_pending;
#endif
struct uprobes_state uprobes_state;
#ifdef CONFIG_HUGETLB_PAGE
@@ -528,33 +528,46 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
barrier();
- return mm->tlb_flush_pending;
+ return atomic_read(&mm->tlb_flush_pending) > 0;
}
-static inline void set_tlb_flush_pending(struct mm_struct *mm)
+
+static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
- mm->tlb_flush_pending = true;
+ atomic_set(&mm->tlb_flush_pending, 0);
+}
+
+static inline void inc_tlb_flush_pending(struct mm_struct *mm)
+{
+ atomic_inc(&mm->tlb_flush_pending);
/*
- * Guarantee that the tlb_flush_pending store does not leak into the
+ * Guarantee that the tlb_flush_pending increase does not leak into the
* critical section updating the page tables
*/
smp_mb__before_spinlock();
}
+
/* Clearing is done after a TLB flush, which also provides a barrier. */
-static inline void clear_tlb_flush_pending(struct mm_struct *mm)
+static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
barrier();
- mm->tlb_flush_pending = false;
+ atomic_dec(&mm->tlb_flush_pending);
}
#else
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
return false;
}
-static inline void set_tlb_flush_pending(struct mm_struct *mm)
+
+static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
}
-static inline void clear_tlb_flush_pending(struct mm_struct *mm)
+
+static inline void inc_tlb_flush_pending(struct mm_struct *mm)
+{
+}
+
+static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
}
#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index e53770d2bf95..840e7a7132e1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -809,7 +809,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_aio(mm);
mm_init_owner(mm, p);
mmu_notifier_mm_init(mm);
- clear_tlb_flush_pending(mm);
+ init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif
diff --git a/mm/debug.c b/mm/debug.c
index db1cd26d8752..d70103bb4731 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -159,7 +159,7 @@ void dump_mm(const struct mm_struct *mm)
mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
#endif
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
- mm->tlb_flush_pending,
+ atomic_read(&mm->tlb_flush_pending),
#endif
mm->def_flags, &mm->def_flags
);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8edd0d576254..0c413774c1e3 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -245,7 +245,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- set_tlb_flush_pending(mm);
+ inc_tlb_flush_pending(mm);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
@@ -257,7 +257,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
/* Only flush the TLB if we actually modified any entries: */
if (pages)
flush_tlb_range(vma, start, end);
- clear_tlb_flush_pending(mm);
+ dec_tlb_flush_pending(mm);
return pages;
}
--
2.11.0
next prev parent reply other threads:[~2017-08-02 7:18 UTC|newest]
Thread overview: 85+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-08-02 0:08 [PATCH v6 0/7] fixes of TLB batching races Nadav Amit
2017-08-02 0:08 ` Nadav Amit
2017-08-02 0:08 ` Nadav Amit [this message]
2017-08-02 0:08 ` [PATCH v6 1/7] mm: migrate: prevent racy access to tlb_flush_pending Nadav Amit
2017-08-02 0:08 ` [PATCH v6 2/7] mm: migrate: fix barriers around tlb_flush_pending Nadav Amit
2017-08-02 0:08 ` Nadav Amit
2017-08-02 0:08 ` [PATCH v6 3/7] Revert "mm: numa: defer TLB flush for THP migration as long as possible" Nadav Amit
2017-08-02 0:08 ` Nadav Amit
2017-08-11 10:50 ` Peter Zijlstra
2017-08-11 10:50 ` Peter Zijlstra
2017-08-02 0:08 ` [PATCH v6 4/7] mm: refactoring TLB gathering API Nadav Amit
2017-08-02 0:08 ` Nadav Amit
2017-08-02 0:08 ` Nadav Amit
2017-08-11 9:23 ` Peter Zijlstra
2017-08-11 9:23 ` Peter Zijlstra
2017-08-11 17:12 ` Nadav Amit
2017-08-11 17:12 ` Nadav Amit
2017-08-14 0:49 ` Minchan Kim
2017-08-14 0:49 ` Minchan Kim
2017-08-02 0:08 ` [PATCH v6 5/7] mm: make tlb_flush_pending global Nadav Amit
2017-08-02 0:08 ` Nadav Amit
2017-08-02 14:28 ` kbuild test robot
2017-08-02 14:28 ` kbuild test robot
2017-08-02 23:23 ` Minchan Kim
2017-08-02 23:23 ` Minchan Kim
2017-08-02 23:27 ` Andrew Morton
2017-08-02 23:27 ` Andrew Morton
2017-08-02 23:34 ` Minchan Kim
2017-08-02 23:34 ` Minchan Kim
2017-08-03 16:40 ` kbuild test robot
2017-08-03 16:40 ` kbuild test robot
2017-08-02 0:08 ` [PATCH v6 6/7] mm: fix MADV_[FREE|DONTNEED] TLB flush miss problem Nadav Amit
2017-08-02 0:08 ` Nadav Amit
2017-08-02 0:08 ` Nadav Amit
2017-08-08 1:19 ` [lkp-robot] [mm] 7674270022: will-it-scale.per_process_ops -19.3% regression kernel test robot
2017-08-08 1:19 ` kernel test robot
2017-08-08 1:19 ` kernel test robot
2017-08-08 1:19 ` kernel test robot
2017-08-08 2:28 ` Minchan Kim
2017-08-08 2:28 ` Minchan Kim
2017-08-08 2:28 ` Minchan Kim
2017-08-08 4:23 ` Nadav Amit
2017-08-08 4:23 ` Nadav Amit
2017-08-08 4:23 ` Nadav Amit
2017-08-08 5:51 ` Nadav Amit
2017-08-08 5:51 ` Nadav Amit
2017-08-08 5:51 ` Nadav Amit
2017-08-08 8:08 ` Minchan Kim
2017-08-08 8:08 ` Minchan Kim
2017-08-08 8:08 ` Minchan Kim
2017-08-08 8:08 ` Minchan Kim
2017-08-08 8:08 ` Minchan Kim
2017-08-08 8:16 ` Nadav Amit
2017-08-08 8:16 ` Nadav Amit
2017-08-09 1:25 ` Ye Xiaolong
2017-08-09 1:25 ` Ye Xiaolong
2017-08-09 1:25 ` Ye Xiaolong
2017-08-09 1:25 ` Ye Xiaolong
2017-08-09 2:59 ` Ye Xiaolong
2017-08-09 2:59 ` Ye Xiaolong
2017-08-09 2:59 ` Ye Xiaolong
2017-08-09 2:59 ` Ye Xiaolong
2017-08-09 2:59 ` Ye Xiaolong
2017-08-10 4:13 ` Minchan Kim
2017-08-10 4:13 ` Minchan Kim
2017-08-10 4:13 ` Minchan Kim
2017-08-10 4:14 ` Nadav Amit
2017-08-10 4:14 ` Nadav Amit
2017-08-10 4:14 ` Nadav Amit
2017-08-10 4:20 ` Minchan Kim
2017-08-10 4:20 ` Minchan Kim
2017-08-10 4:20 ` Minchan Kim
2017-08-11 13:30 ` [PATCH v6 6/7] mm: fix MADV_[FREE|DONTNEED] TLB flush miss problem Peter Zijlstra
2017-08-11 13:30 ` Peter Zijlstra
2017-08-13 6:14 ` Nadav Amit
2017-08-13 12:08 ` Peter Zijlstra
2017-08-13 12:08 ` Peter Zijlstra
2017-08-13 12:08 ` Peter Zijlstra
2017-08-14 1:26 ` Minchan Kim
2017-08-14 1:26 ` Minchan Kim
2017-08-14 1:26 ` Minchan Kim
2017-08-02 0:08 ` [PATCH v6 7/7] mm: fix KSM data corruption Nadav Amit
2017-08-02 0:08 ` Nadav Amit
2017-08-02 23:26 ` [PATCH v6 0/7] fixes of TLB batching races Minchan Kim
2017-08-02 23:26 ` Minchan Kim
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20170802000818.4760-2-namit@vmware.com \
--to=namit@vmware.com \
--cc=akpm@linux-foundation.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=luto@kernel.org \
--cc=nadav.amit@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.