From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>, Hugh Dickins <hughd@google.com>,
Dave Jones <davej@redhat.com>, Al Viro <viro@zeniv.linux.org.uk>,
Linux Kernel <linux-kernel@vger.kernel.org>,
Rik van Riel <riel@redhat.com>, Ingo Molnar <mingo@redhat.com>,
Michel Lespinasse <walken@google.com>,
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
Sasha Levin <sasha.levin@oracle.com>,
Benjamin Herrenschmidt <benh@kernel.crashing.org>
Subject: Re: pipe/page fault oddness.
Date: Wed, 08 Oct 2014 16:07:38 +0530 [thread overview]
Message-ID: <87lhoq3kzx.fsf@linux.vnet.ibm.com> (raw)
In-Reply-To: <CA+55aFyt8euKZv8O-DMY7LE+467a_RQ38YOgF0Xh73+TseM=kQ@mail.gmail.com>
Linus Torvalds <torvalds@linux-foundation.org> writes:
> On Mon, Oct 6, 2014 at 3:18 PM, Aneesh Kumar K.V
> <aneesh.kumar@linux.vnet.ibm.com> wrote:
>>
>> Are we still looking at these options? I could look at implementing the
>> first option, which will also enable us to free up one pte bit.
>
> We definitely are. If you can test my patch (with the small follow-up
> fix), and do the necessary changes for ppc64, that would be good.
>
> I looked quickly at the ppc64 side, and it didn't look too painful.
> Using pte_protnone() instead of pte_numa() should remove more lines
> than it adds there too.
>
This is a quick hack that gets it running. With perf bench numa mem, the NUMA counters in /proc/vmstat look like this:
-bash-4.2# grep numa /proc/vmstat
numa_hit 3310633
numa_miss 0
numa_foreign 0
numa_interleave 6451
numa_local 3162369
numa_other 148264
numa_pte_updates 27708982
numa_huge_pte_updates 76987
numa_hint_faults 268439275
numa_hint_faults_local 5359216
numa_pages_migrated 3349573
-bash-4.2#
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index d98c1ecc3266..2a9bbe4d2364 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -41,7 +41,7 @@ static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PA
static inline int pte_present(pte_t pte)
{
- return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA);
+ return pte_val(pte) & _PAGE_PRESENT;
}
#define pte_present_nonuma pte_present_nonuma
@@ -50,78 +50,20 @@ static inline int pte_present_nonuma(pte_t pte)
return pte_val(pte) & (_PAGE_PRESENT);
}
-#define pte_numa pte_numa
-static inline int pte_numa(pte_t pte)
+#define pte_protnone pte_protnone
+static inline int pte_protnone(pte_t pte)
{
return (pte_val(pte) &
- (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+ (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT;
}
-#define pte_mknonnuma pte_mknonnuma
-static inline pte_t pte_mknonnuma(pte_t pte)
+#define pmd_protnone pmd_protnone
+static inline int pmd_protnone(pmd_t pmd)
{
- pte_val(pte) &= ~_PAGE_NUMA;
- pte_val(pte) |= _PAGE_PRESENT | _PAGE_ACCESSED;
- return pte;
-}
-
-#define pte_mknuma pte_mknuma
-static inline pte_t pte_mknuma(pte_t pte)
-{
- /*
- * We should not set _PAGE_NUMA on non present ptes. Also clear the
- * present bit so that hash_page will return 1 and we collect this
- * as numa fault.
- */
- if (pte_present(pte)) {
- pte_val(pte) |= _PAGE_NUMA;
- pte_val(pte) &= ~_PAGE_PRESENT;
- } else
- VM_BUG_ON(1);
- return pte;
+ return pte_protnone(pmd_pte(pmd));
}
-#define ptep_set_numa ptep_set_numa
-static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
- if ((pte_val(*ptep) & _PAGE_PRESENT) == 0)
- VM_BUG_ON(1);
-
- pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0);
- return;
-}
-
-#define pmd_numa pmd_numa
-static inline int pmd_numa(pmd_t pmd)
-{
- return pte_numa(pmd_pte(pmd));
-}
-
-#define pmdp_set_numa pmdp_set_numa
-static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp)
-{
- if ((pmd_val(*pmdp) & _PAGE_PRESENT) == 0)
- VM_BUG_ON(1);
-
- pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA);
- return;
-}
-
-#define pmd_mknonnuma pmd_mknonnuma
-static inline pmd_t pmd_mknonnuma(pmd_t pmd)
-{
- return pte_pmd(pte_mknonnuma(pmd_pte(pmd)));
-}
-
-#define pmd_mknuma pmd_mknuma
-static inline pmd_t pmd_mknuma(pmd_t pmd)
-{
- return pte_pmd(pte_mknuma(pmd_pte(pmd)));
-}
-
-# else
+#else
static inline int pte_present(pte_t pte)
{
diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h
index 2505d8eab15c..68d0a5d01dc3 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/pte-hash64.h
@@ -30,7 +30,7 @@
/*
* Used for tracking numa faults
*/
-#define _PAGE_NUMA 0x00000010 /* Gather numa placement stats */
+//#define _PAGE_NUMA 0x00000010 /* Gather numa placement stats */
/* No separate kernel read-only */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 084ad54c73cd..27004431b576 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -235,7 +235,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
pte_size = psize;
pte = lookup_linux_pte_and_update(pgdir, hva, writing,
&pte_size);
- if (pte_present(pte) && !pte_numa(pte)) {
+ /*
+ * Skip the ptes marked for numa fault tracking in
+ * host page table.
+ */
+ if (pte_present(pte) && !pte_protnone(pte)) {
if (writing && !pte_write(pte))
/* make the actual HPTE be read-only */
ptel = hpte_make_readonly(ptel);
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 51ab9e7e6c39..b77ecac7e61f 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -393,8 +393,6 @@ good_area:
* processors use the same I/D cache coherency mechanism
* as embedded.
*/
- if (error_code & DSISR_PROTFAULT)
- goto bad_area;
#endif /* CONFIG_PPC_STD_MMU */
/*
@@ -418,9 +416,6 @@ good_area:
flags |= FAULT_FLAG_WRITE;
/* a read */
} else {
- /* protection fault */
- if (error_code & 0x08000000)
- goto bad_area;
if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
goto bad_area;
}
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index d8746684f606..89f8568bf0b5 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -39,7 +39,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
/*
* Similar to the PMD case, NUMA hinting must take slow path
*/
- if (pte_numa(pte))
+ if (pte_protnone(pte))
return 0;
if ((pte_val(pte) & mask) != result)
@@ -85,7 +85,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
* slowpath for accounting purposes and so that they
* can be serialised against THP migration.
*/
- if (pmd_numa(pmd))
+ if (pmd_protnone(pmd))
return 0;
if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
next prev parent reply other threads:[~2014-10-08 10:37 UTC|newest]
Thread overview: 43+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-09-30 3:33 pipe/page fault oddness Dave Jones
2014-09-30 4:27 ` Linus Torvalds
2014-09-30 4:33 ` Dave Jones
[not found] ` <CA+55aFwxdOBKHwwp7Zq1k19mHCyHYmYqigCVt59AtB-P7Zva1w@mail.gmail.com>
2014-09-30 15:52 ` Linus Torvalds
2014-09-30 16:03 ` Rik van Riel
2014-09-30 16:07 ` Dave Jones
2014-09-30 16:26 ` Linus Torvalds
2014-09-30 16:05 ` Dave Jones
2014-09-30 16:10 ` Linus Torvalds
2014-09-30 16:22 ` Dave Jones
2014-09-30 16:40 ` Dave Jones
2014-09-30 16:46 ` Linus Torvalds
2014-09-30 18:20 ` Dave Jones
2014-09-30 18:58 ` Linus Torvalds
2014-10-01 8:19 ` Hugh Dickins
2014-10-01 16:01 ` Linus Torvalds
2014-10-01 16:18 ` Linus Torvalds
2014-10-01 17:29 ` Rik van Riel
2014-10-02 8:28 ` Peter Zijlstra
2014-10-01 20:20 ` Linus Torvalds
2014-10-01 21:09 ` Rik van Riel
2014-10-01 22:08 ` Sasha Levin
2014-10-01 22:28 ` Chuck Ebbert
2014-10-02 3:32 ` Sasha Levin
2014-10-02 8:03 ` Chuck Ebbert
2014-10-02 14:49 ` Sasha Levin
2014-10-01 22:42 ` Linus Torvalds
2014-10-02 14:25 ` Kirill A. Shutemov
2014-10-02 16:01 ` Linus Torvalds
2014-10-02 16:35 ` Kirill A. Shutemov
2014-10-02 15:04 ` Sasha Levin
2014-10-02 16:10 ` Linus Torvalds
2014-10-03 5:00 ` Sasha Levin
2014-10-03 15:43 ` Linus Torvalds
2014-10-03 15:58 ` Dave Jones
2014-10-03 16:02 ` Sasha Levin
2014-10-02 12:45 ` Mel Gorman
2014-10-06 19:18 ` Aneesh Kumar K.V
2014-10-07 12:45 ` Linus Torvalds
2014-10-08 10:37 ` Aneesh Kumar K.V [this message]
2014-10-02 8:47 ` Hugh Dickins
2014-10-02 15:57 ` Linus Torvalds
2014-09-30 4:35 ` Al Viro
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=87lhoq3kzx.fsf@linux.vnet.ibm.com \
--to=aneesh.kumar@linux.vnet.ibm.com \
--cc=benh@kernel.crashing.org \
--cc=davej@redhat.com \
--cc=hughd@google.com \
--cc=kirill.shutemov@linux.intel.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mgorman@suse.de \
--cc=mingo@redhat.com \
--cc=riel@redhat.com \
--cc=sasha.levin@oracle.com \
--cc=torvalds@linux-foundation.org \
--cc=viro@zeniv.linux.org.uk \
--cc=walken@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.