* [PATCH 2/3] mmu_notifier: Call mmu_notifier_invalidate_range() from VMM
[not found] ` <1414516440-910-1-git-send-email-joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
2014-10-28 17:13 ` [PATCH 1/3] mmu_notifier: Add mmu_notifier_invalidate_range() Joerg Roedel
@ 2014-10-28 17:13 ` Joerg Roedel
2014-10-28 17:14 ` [PATCH 3/3] mmu_notifier: Add the call-back for mmu_notifier_invalidate_range() Joerg Roedel
2 siblings, 0 replies; 4+ messages in thread
From: Joerg Roedel @ 2014-10-28 17:13 UTC (permalink / raw)
To: Andrew Morton, Andrea Arcangeli, Peter Zijlstra, Rik van Riel,
Hugh Dickins, Mel Gorman, Johannes Weiner
Cc: Jay.Cornwall-5C7GfCeVMHo, John.Bridgman-5C7GfCeVMHo,
jroedel-l3A5Bk7waGM, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
linux-mm-Bw31MaZKKs3YtjvyW6yDsg, Jerome Glisse,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Jesse Barnes,
David Woodhouse, ben.sander-5C7GfCeVMHo
From: Joerg Roedel <jroedel@suse.de>
Add calls to the new mmu_notifier_invalidate_range()
function to all places in the VMM that need it.
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
include/linux/mmu_notifier.h | 41 +++++++++++++++++++++++++++++++++++++++++
kernel/events/uprobes.c | 2 +-
mm/fremap.c | 2 +-
mm/huge_memory.c | 9 +++++----
mm/hugetlb.c | 7 ++++++-
mm/ksm.c | 4 ++--
mm/memory.c | 3 ++-
mm/migrate.c | 3 ++-
mm/rmap.c | 2 +-
9 files changed, 61 insertions(+), 12 deletions(-)
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 1790790..966da2b 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -284,6 +284,44 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
__young; \
})
+#define ptep_clear_flush_notify(__vma, __address, __ptep) \
+({ \
+ unsigned long ___addr = __address & PAGE_MASK; \
+ struct mm_struct *___mm = (__vma)->vm_mm; \
+ pte_t ___pte; \
+ \
+ ___pte = ptep_clear_flush(__vma, __address, __ptep); \
+ mmu_notifier_invalidate_range(___mm, ___addr, \
+ ___addr + PAGE_SIZE); \
+ \
+ ___pte; \
+})
+
+#define pmdp_clear_flush_notify(__vma, __haddr, __pmd) \
+({ \
+ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
+ struct mm_struct *___mm = (__vma)->vm_mm; \
+ pmd_t ___pmd; \
+ \
+ ___pmd = pmdp_clear_flush(__vma, __haddr, __pmd); \
+ mmu_notifier_invalidate_range(___mm, ___haddr, \
+ ___haddr + HPAGE_PMD_SIZE); \
+ \
+ ___pmd; \
+})
+
+#define pmdp_get_and_clear_notify(__mm, __haddr, __pmd) \
+({ \
+ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
+ pmd_t ___pmd; \
+ \
+ ___pmd = pmdp_get_and_clear(__mm, __haddr, __pmd); \
+ mmu_notifier_invalidate_range(__mm, ___haddr, \
+ ___haddr + HPAGE_PMD_SIZE); \
+ \
+ ___pmd; \
+})
+
/*
* set_pte_at_notify() sets the pte _after_ running the notifier.
* This is safe to start by updating the secondary MMUs, because the primary MMU
@@ -362,6 +400,9 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
#define ptep_clear_flush_young_notify ptep_clear_flush_young
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
+#define ptep_clear_flush_notify ptep_clear_flush
+#define pmdp_clear_flush_notify pmdp_clear_flush
+#define pmdp_get_and_clear_notify pmdp_get_and_clear
#define set_pte_at_notify set_pte_at
#endif /* CONFIG_MMU_NOTIFIER */
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 1d0af8a..bc143cf 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
}
flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush(vma, addr, ptep);
+ ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
page_remove_rmap(page);
diff --git a/mm/fremap.c b/mm/fremap.c
index 72b8fa3..9129013 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
if (pte_present(pte)) {
flush_cache_page(vma, addr, pte_pfn(pte));
- pte = ptep_clear_flush(vma, addr, ptep);
+ pte = ptep_clear_flush_notify(vma, addr, ptep);
page = vm_normal_page(vma, addr, pte);
if (page) {
if (pte_dirty(pte))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 74c78aa..ef320af 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1036,7 +1036,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
- pmdp_clear_flush(vma, haddr, pmd);
+ pmdp_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
@@ -1179,7 +1179,7 @@ alloc:
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_clear_flush(vma, haddr, pmd);
+ pmdp_clear_flush_notify(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
mem_cgroup_commit_charge(new_page, memcg, false);
lru_cache_add_active_or_unevictable(new_page, vma);
@@ -1512,7 +1512,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
pmd_t entry;
ret = 1;
if (!prot_numa) {
- entry = pmdp_get_and_clear(mm, addr, pmd);
+ entry = pmdp_get_and_clear_notify(mm, addr, pmd);
if (pmd_numa(entry))
entry = pmd_mknonnuma(entry);
entry = pmd_modify(entry, newprot);
@@ -1644,6 +1644,7 @@ static int __split_huge_page_splitting(struct page *page,
* serialize against split_huge_page*.
*/
pmdp_splitting_flush(vma, address, pmd);
+
ret = 1;
spin_unlock(ptl);
}
@@ -2833,7 +2834,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmd_t _pmd;
int i;
- pmdp_clear_flush(vma, haddr, pmd);
+ pmdp_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9fd7227..2e6add0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2598,8 +2598,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
}
set_huge_pte_at(dst, addr, dst_pte, entry);
} else {
- if (cow)
+ if (cow) {
huge_ptep_set_wrprotect(src, addr, src_pte);
+ mmu_notifier_invalidate_range(src, mmun_start,
+ mmun_end);
+ }
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
@@ -2899,6 +2902,7 @@ retry_avoidcopy:
/* Break COW */
huge_ptep_clear_flush(vma, address, ptep);
+ mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page);
@@ -3374,6 +3378,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* and that page table be reused and filled with junk.
*/
flush_tlb_range(vma, start, end);
+ mmu_notifier_invalidate_range(mm, start, end);
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
mmu_notifier_invalidate_range_end(mm, start, end);
diff --git a/mm/ksm.c b/mm/ksm.c
index 6b2e337..d247efa 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -892,7 +892,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
* this assure us that no O_DIRECT can happen after the check
* or in the middle of the check.
*/
- entry = ptep_clear_flush(vma, addr, ptep);
+ entry = ptep_clear_flush_notify(vma, addr, ptep);
/*
* Check that no O_DIRECT or similar I/O is in progress on the
* page
@@ -960,7 +960,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
page_add_anon_rmap(kpage, vma, addr);
flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush(vma, addr, ptep);
+ ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
page_remove_rmap(page);
diff --git a/mm/memory.c b/mm/memory.c
index 1cc6bfb..c287d4c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -238,6 +238,7 @@ static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
tlb->need_flush = 0;
tlb_flush(tlb);
+ mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb_table_flush(tlb);
#endif
@@ -2233,7 +2234,7 @@ gotten:
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
- ptep_clear_flush(vma, address, page_table);
+ ptep_clear_flush_notify(vma, address, page_table);
page_add_new_anon_rmap(new_page, vma, address);
mem_cgroup_commit_charge(new_page, memcg, false);
lru_cache_add_active_or_unevictable(new_page, vma);
diff --git a/mm/migrate.c b/mm/migrate.c
index 0143995..41945cb 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1854,7 +1854,7 @@ fail_putback:
*/
flush_cache_range(vma, mmun_start, mmun_end);
page_add_anon_rmap(new_page, vma, mmun_start);
- pmdp_clear_flush(vma, mmun_start, pmd);
+ pmdp_clear_flush_notify(vma, mmun_start, pmd);
set_pmd_at(mm, mmun_start, pmd, entry);
flush_tlb_range(vma, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
@@ -1862,6 +1862,7 @@ fail_putback:
if (page_count(page) != 2) {
set_pmd_at(mm, mmun_start, pmd, orig_entry);
flush_tlb_range(vma, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
page_remove_rmap(new_page);
goto fail_putback;
diff --git a/mm/rmap.c b/mm/rmap.c
index 116a505..fdb8055 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1364,7 +1364,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush(vma, address, pte);
+ pteval = ptep_clear_flush_notify(vma, address, pte);
/* If nonlinear, store the file page offset in the pte. */
if (page->index != linear_page_index(vma, address)) {
--
1.8.4.5
_______________________________________________
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 3/3] mmu_notifier: Add the call-back for mmu_notifier_invalidate_range()
[not found] ` <1414516440-910-1-git-send-email-joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
2014-10-28 17:13 ` [PATCH 1/3] mmu_notifier: Add mmu_notifier_invalidate_range() Joerg Roedel
2014-10-28 17:13 ` [PATCH 2/3] mmu_notifier: Call mmu_notifier_invalidate_range() from VMM Joerg Roedel
@ 2014-10-28 17:14 ` Joerg Roedel
2 siblings, 0 replies; 4+ messages in thread
From: Joerg Roedel @ 2014-10-28 17:14 UTC (permalink / raw)
To: Andrew Morton, Andrea Arcangeli, Peter Zijlstra, Rik van Riel,
Hugh Dickins, Mel Gorman, Johannes Weiner
Cc: Jay.Cornwall-5C7GfCeVMHo, John.Bridgman-5C7GfCeVMHo,
jroedel-l3A5Bk7waGM, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
linux-mm-Bw31MaZKKs3YtjvyW6yDsg, Jerome Glisse,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Jesse Barnes,
David Woodhouse, ben.sander-5C7GfCeVMHo
From: Joerg Roedel <jroedel@suse.de>
Now that the mmu_notifier_invalidate_range() calls are in
place, add the call-back to allow subsystems to register
against it.
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
include/linux/mmu_notifier.h | 37 ++++++++++++++++++++++++++++++++-----
mm/mmu_notifier.c | 25 +++++++++++++++++++++++++
2 files changed, 57 insertions(+), 5 deletions(-)
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 966da2b..94d19f6 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -98,11 +98,11 @@ struct mmu_notifier_ops {
/*
* invalidate_range_start() and invalidate_range_end() must be
* paired and are called only when the mmap_sem and/or the
- * locks protecting the reverse maps are held. The subsystem
- * must guarantee that no additional references are taken to
- * the pages in the range established between the call to
- * invalidate_range_start() and the matching call to
- * invalidate_range_end().
+ * locks protecting the reverse maps are held. If the subsystem
+ * can't guarantee that no additional references are taken to
+ * the pages in the range, it has to implement the
+ * invalidate_range() notifier to remove any references taken
+ * after invalidate_range_start().
*
* Invalidation of multiple concurrent ranges may be
* optionally permitted by the driver. Either way the
@@ -144,6 +144,29 @@ struct mmu_notifier_ops {
void (*invalidate_range_end)(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start, unsigned long end);
+
+ /*
+ * invalidate_range() is either called between
+ * invalidate_range_start() and invalidate_range_end() when the
+ * VM has to free pages that were unmapped, but before the
+ * pages are actually freed, or outside of _start()/_end() when
+ * a (remote) TLB flush is necessary.
+ *
+ * If invalidate_range() is used to manage a non-CPU TLB with
+ * shared page-tables, it is not necessary to implement the
+ * invalidate_range_start()/end() notifiers, as
+ * invalidate_range() already catches the points in time when an
+ * external TLB range needs to be flushed.
+ *
+ * The invalidate_range() function is called under the ptl
+ * spin-lock and not allowed to sleep.
+ *
+ * Note that this function might be called with just a sub-range
+ * of what was passed to invalidate_range_start()/end(), if
+ * called between those functions.
+ */
+ void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
+ unsigned long start, unsigned long end);
};
/*
@@ -190,6 +213,8 @@ extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
unsigned long start, unsigned long end);
extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
unsigned long start, unsigned long end);
+extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
+ unsigned long start, unsigned long end);
static inline void mmu_notifier_release(struct mm_struct *mm)
{
@@ -245,6 +270,8 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
+ if (mm_has_notifiers(mm))
+ __mmu_notifier_invalidate_range(mm, start, end);
}
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 2c8da98..3b9b3d0 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -193,6 +193,16 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ /*
+ * Call invalidate_range here too to avoid the need for the
+ * subsystem of having to register an invalidate_range_end
+ * call-back when there is invalidate_range already. Usually a
+ * subsystem registers either invalidate_range_start()/end() or
+ * invalidate_range(), so this will be no additional overhead
+ * (besides the pointer check).
+ */
+ if (mn->ops->invalidate_range)
+ mn->ops->invalidate_range(mn, mm, start, end);
if (mn->ops->invalidate_range_end)
mn->ops->invalidate_range_end(mn, mm, start, end);
}
@@ -200,6 +210,21 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
+void __mmu_notifier_invalidate_range(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct mmu_notifier *mn;
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->invalidate_range)
+ mn->ops->invalidate_range(mn, mm, start, end);
+ }
+ srcu_read_unlock(&srcu, id);
+}
+EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
+
static int do_mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm,
int take_mmap_sem)
--
1.8.4.5
_______________________________________________
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu
^ permalink raw reply related [flat|nested] 4+ messages in thread