* [PATCH] powerpc/mm: Lockless get_user_pages_fast()
@ 2008-07-30 3:37 Benjamin Herrenschmidt
2008-07-30 4:20 ` Benjamin Herrenschmidt
2008-07-30 12:33 ` Kumar Gala
0 siblings, 2 replies; 11+ messages in thread
From: Benjamin Herrenschmidt @ 2008-07-30 3:37 UTC (permalink / raw)
To: linuxppc-dev list; +Cc: Nick Piggin
From: Nick Piggin <npiggin@suse.de>
Implement lockless get_user_pages_fast for powerpc. Page table existence
is guaranteed with RCU, and speculative page references are used to take a
reference to the pages without having a prior existence guarantee on them.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
I'm going to merge this, sending it to the list for reference, it was
in -mm for some time, minus some changes/fixes I did to solve conflicts
with the new multiple huge page sizes.
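
[Editorial note: for readers unfamiliar with the "speculative page
references" the changelog describes, the core idea is to bump a page's
refcount only if it is not already zero, then re-check that the page
table entry still points at the same page. A minimal userspace sketch
of the idea follows; the names are illustrative only and are not the
kernel's implementation:]

	#include <stdatomic.h>
	#include <stdbool.h>

	/* Illustrative stand-in for the page refcount (struct page's _count). */
	struct page_ref {
		atomic_int count;
	};

	/*
	 * Take a reference on a page found via a lockless lookup.  The page
	 * may be freed and reused at any moment, so the increment only
	 * "sticks" if the count was observed non-zero (the
	 * atomic_add_unless() idiom).
	 */
	static bool get_ref_unless_zero(struct page_ref *p)
	{
		int old = atomic_load(&p->count);

		while (old != 0) {
			/* CAS retries with the updated 'old' on failure. */
			if (atomic_compare_exchange_weak(&p->count, &old, old + 1))
				return true;	/* reference taken */
		}
		return false;	/* page was being freed; caller must fall back */
	}

[After the reference is taken, the caller re-checks the PTE (the
"pte != *ptep" tests in the patch below) and drops the reference if it
lost the race.]
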
Index: linux-work/arch/powerpc/Kconfig
===================================================================
--- linux-work.orig/arch/powerpc/Kconfig 2008-07-30 13:17:06.000000000 +1000
+++ linux-work/arch/powerpc/Kconfig 2008-07-30 13:27:40.000000000 +1000
@@ -42,6 +42,9 @@ config GENERIC_HARDIRQS
bool
default y
+config HAVE_GET_USER_PAGES_FAST
+ def_bool PPC64
+
config HAVE_SETUP_PER_CPU_AREA
def_bool PPC64
Index: linux-work/arch/powerpc/mm/Makefile
===================================================================
--- linux-work.orig/arch/powerpc/mm/Makefile 2008-07-30 13:17:06.000000000 +1000
+++ linux-work/arch/powerpc/mm/Makefile 2008-07-30 13:27:40.000000000 +1000
@@ -6,7 +6,7 @@ ifeq ($(CONFIG_PPC64),y)
EXTRA_CFLAGS += -mno-minimal-toc
endif
-obj-y := fault.o mem.o \
+obj-y := fault.o mem.o gup.o \
init_$(CONFIG_WORD_SIZE).o \
pgtable_$(CONFIG_WORD_SIZE).o \
mmu_context_$(CONFIG_WORD_SIZE).o
Index: linux-work/arch/powerpc/mm/gup.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-work/arch/powerpc/mm/gup.c 2008-07-30 13:28:03.000000000 +1000
@@ -0,0 +1,262 @@
+/*
+ * Lockless get_user_pages_fast for powerpc
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/vmstat.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask, result;
+ pte_t *ptep;
+
+ result = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ result |= _PAGE_RW;
+ mask = result | _PAGE_SPECIAL;
+
+ ptep = pte_offset_kernel(&pmd, addr);
+ do {
+ pte_t pte = *ptep;
+ struct page *page;
+
+ if ((pte_val(pte) & mask) != result)
+ return 0;
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ page = pte_page(pte);
+ if (!page_cache_get_speculative(page))
+ return 0;
+ if (unlikely(pte != *ptep)) {
+ put_page(page);
+ return 0;
+ }
+ pages[*nr] = page;
+ (*nr)++;
+
+ } while (ptep++, addr += PAGE_SIZE, addr != end);
+
+ return 1;
+}
+
+static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
+ unsigned long *addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long mask;
+ unsigned long pte_end;
+ struct page *head, *page;
+ pte_t pte;
+ int refs;
+
+ pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
+ if (pte_end < end)
+ end = pte_end;
+
+ pte = *ptep;
+ mask = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ mask |= _PAGE_RW;
+ if ((pte_val(pte) & mask) != mask)
+ return 0;
+ /* hugepages are never "special" */
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+ refs = 0;
+ head = pte_page(pte);
+ page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON(compound_head(page) != head);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (*addr += PAGE_SIZE, *addr != end);
+
+ if (!page_cache_add_speculative(head, refs)) {
+ *nr -= refs;
+ return 0;
+ }
+ if (unlikely(pte != *ptep)) {
+ /* Raced with a PTE update: drop the refs we took and bail */
+ *nr -= refs;
+ while (refs--)
+ put_page(head);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pmd_t *pmdp;
+
+ pmdp = pmd_offset(&pud, addr);
+ do {
+ pmd_t pmd = *pmdp;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(pmd))
+ return 0;
+ if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+ return 0;
+ } while (pmdp++, addr = next, addr != end);
+
+ return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pud_t *pudp;
+
+ pudp = pud_offset(&pgd, addr);
+ do {
+ pud_t pud = *pudp;
+
+ next = pud_addr_end(addr, end);
+ if (pud_none(pud))
+ return 0;
+ if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+ return 0;
+ } while (pudp++, addr = next, addr != end);
+
+ return 1;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr, len, end;
+ unsigned long next;
+ pgd_t *pgdp;
+ int psize, nr = 0;
+ unsigned int shift;
+
+ start &= PAGE_MASK;
+ addr = start;
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+
+ if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+ start, len)))
+ goto slow_irqon;
+
+ /* Cross a slice boundary? */
+ /* XXX could be improved by iterating slices instead */
+ if (addr < SLICE_LOW_TOP) {
+ if (end > SLICE_LOW_TOP)
+ goto slow_irqon;
+
+ if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
+ GET_LOW_SLICE_INDEX(end - 1)))
+ goto slow_irqon;
+ } else {
+ if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
+ GET_HIGH_SLICE_INDEX(end - 1)))
+ goto slow_irqon;
+ }
+
+ /*
+ * XXX: batch / limit 'nr', to avoid large irq off latency
+ * needs some instrumenting to determine the common sizes used by
+ * important workloads (eg. DB2), and whether limiting the batch size
+ * will decrease performance.
+ *
+ * It seems like we're in the clear for the moment. Direct-IO is
+ * the main guy that batches up lots of get_user_pages, and even
+ * they are limited to 64-at-a-time which is not so many.
+ */
+ /*
+ * This doesn't prevent pagetable teardown, but does prevent
+ * the pagetables from being freed on powerpc.
+ *
+ * So long as we atomically load page table pointers versus teardown,
+ * we can follow the address down to the page and take a ref on it.
+ */
+ local_irq_disable();
+
+ psize = get_slice_psize(mm, addr);
+ shift = mmu_psize_defs[psize].shift;
+
+ if (unlikely(mmu_huge_psizes[psize])) {
+ pte_t *ptep;
+ unsigned long a = addr;
+ unsigned long sz = ((1UL) << shift);
+ struct hstate *hstate = size_to_hstate(sz);
+
+ BUG_ON(!hstate);
+ /*
+ * XXX: could be optimized to avoid hstate
+ * lookup entirely (just use shift)
+ */
+
+ do {
+ VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
+ ptep = huge_pte_offset(mm, a);
+ if (!gup_huge_pte(ptep, hstate, &a, end, write, pages,
+ &nr))
+ goto slow;
+ } while (a != end);
+ } else {
+ pgdp = pgd_offset(mm, addr);
+ do {
+ pgd_t pgd = *pgdp;
+
+ VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
+
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ goto slow;
+ if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ goto slow;
+ } while (pgdp++, addr = next, addr != end);
+ }
+ local_irq_enable();
+
+ VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+ return nr;
+
+ {
+ int ret;
+
+slow:
+ local_irq_enable();
+slow_irqon:
+ /* Try to get the remaining pages with get_user_pages */
+ start += nr << PAGE_SHIFT;
+ pages += nr;
+
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages(current, mm, start,
+ (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+ up_read(&mm->mmap_sem);
+
+ /* Have to be a bit careful with return values */
+ if (nr > 0) {
+ if (ret < 0)
+ ret = nr;
+ else
+ ret += nr;
+ }
+
+ return ret;
+ }
+}
Index: linux-work/include/asm-powerpc/pgtable-ppc64.h
===================================================================
--- linux-work.orig/include/asm-powerpc/pgtable-ppc64.h 2008-07-30 13:17:06.000000000 +1000
+++ linux-work/include/asm-powerpc/pgtable-ppc64.h 2008-07-30 13:27:40.000000000 +1000
@@ -461,6 +461,8 @@ void pgtable_cache_init(void);
return pt;
}
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long address);
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
Index: linux-work/include/linux/pagemap.h
===================================================================
--- linux-work.orig/include/linux/pagemap.h 2008-07-30 13:17:06.000000000 +1000
+++ linux-work/include/linux/pagemap.h 2008-07-30 13:27:40.000000000 +1000
@@ -142,6 +142,29 @@ static inline int page_cache_get_specula
return 1;
}
+/*
+ * Same as above, but add instead of inc (could just be merged)
+ */
+static inline int page_cache_add_speculative(struct page *page, int count)
+{
+ VM_BUG_ON(in_interrupt());
+
+#if !defined(CONFIG_SMP) && defined(CONFIG_CLASSIC_RCU)
+# ifdef CONFIG_PREEMPT
+ VM_BUG_ON(!in_atomic());
+# endif
+ VM_BUG_ON(page_count(page) == 0);
+ atomic_add(count, &page->_count);
+
+#else
+ if (unlikely(!atomic_add_unless(&page->_count, count, 0)))
+ return 0;
+#endif
+ VM_BUG_ON(PageCompound(page) && page != compound_head(page));
+
+ return 1;
+}
+
static inline int page_freeze_refs(struct page *page, int count)
{
return likely(atomic_cmpxchg(&page->_count, count, 0) == count);
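
[Editorial usage note, not part of the patch: a caller that wants to pin
a user buffer would use the new API roughly as in this hypothetical
sketch. pin_user_buffer() and its error handling are illustrative;
only the get_user_pages_fast() signature comes from the patch above,
and uaddr is assumed page aligned:]

	/* Hypothetical caller, kernel context. */
	static int pin_user_buffer(unsigned long uaddr, unsigned long len,
			int write, struct page **pages)
	{
		int nr_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		int got;

		/* May return fewer pages than requested, or a -ve errno. */
		got = get_user_pages_fast(uaddr, nr_pages, write, pages);
		if (got < 0)
			return got;		/* nothing was pinned */
		if (got < nr_pages) {
			/* Partial pin: release what we got, report failure. */
			while (got--)
				put_page(pages[got]);
			return -EFAULT;
		}
		return nr_pages;
	}
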
* Re: [PATCH] powerpc/mm: Lockless get_user_pages_fast()
2008-07-30 3:37 [PATCH] powerpc/mm: Lockless get_user_pages_fast() Benjamin Herrenschmidt
@ 2008-07-30 4:20 ` Benjamin Herrenschmidt
2008-07-30 5:06 ` Michael Ellerman
2008-07-30 12:33 ` Kumar Gala
1 sibling, 1 reply; 11+ messages in thread
From: Benjamin Herrenschmidt @ 2008-07-30 4:20 UTC (permalink / raw)
To: linuxppc-dev list; +Cc: Nick Piggin

From: Nick Piggin <npiggin@suse.de>

Implement lockless get_user_pages_fast for powerpc. Page table existence
is guaranteed with RCU, and speculative page references are used to take a
reference to the pages without having a prior existence guarantee on them.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
v2. Fix makefile to only build gup.o on 64 bits and fix a bug with
huge pages where we would oops (null dereference) if huge_pte_offset()
returns NULL (ie, not populated yet).

v1. I'm going to merge this, sending it to the list for reference, it was
in -mm, minus some changes/fixes I did to solve conflicts with the new
multiple huge page sizes.

Index: linux-work/arch/powerpc/Kconfig
===================================================================
--- linux-work.orig/arch/powerpc/Kconfig 2008-07-30 13:17:06.000000000 +1000
+++ linux-work/arch/powerpc/Kconfig 2008-07-30 13:27:40.000000000 +1000
@@ -42,6 +42,9 @@ config GENERIC_HARDIRQS
bool
default y

+config HAVE_GET_USER_PAGES_FAST
+ def_bool PPC64
+
config HAVE_SETUP_PER_CPU_AREA
def_bool PPC64

Index: linux-work/arch/powerpc/mm/Makefile
===================================================================
--- linux-work.orig/arch/powerpc/mm/Makefile 2008-07-30 13:17:06.000000000 +1000
+++ linux-work/arch/powerpc/mm/Makefile 2008-07-30 13:42:42.000000000 +1000
@@ -12,7 +12,8 @@ obj-y := fault.o mem.o \
mmu_context_$(CONFIG_WORD_SIZE).o
hash-$(CONFIG_PPC_NATIVE) := hash_native_64.o
obj-$(CONFIG_PPC64) += hash_utils_64.o \
- slb_low.o slb.o stab.o mmap.o $(hash-y)
+ slb_low.o slb.o stab.o \
+ gup.o mmap.o $(hash-y)
obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o
obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \
tlb_$(CONFIG_WORD_SIZE).o

Index: linux-work/arch/powerpc/mm/gup.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-work/arch/powerpc/mm/gup.c 2008-07-30 14:20:00.000000000 +1000
@@ -0,0 +1,271 @@
+/*
+ * Lockless get_user_pages_fast for powerpc
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#undef DEBUG
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/vmstat.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask, result;
+ pte_t *ptep;
+
+ result = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ result |= _PAGE_RW;
+ mask = result | _PAGE_SPECIAL;
+
+ ptep = pte_offset_kernel(&pmd, addr);
+ do {
+ pte_t pte = *ptep;
+ struct page *page;
+
+ if ((pte_val(pte) & mask) != result)
+ return 0;
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ page = pte_page(pte);
+ if (!page_cache_get_speculative(page))
+ return 0;
+ if (unlikely(pte != *ptep)) {
+ put_page(page);
+ return 0;
+ }
+ pages[*nr] = page;
+ (*nr)++;
+
+ } while (ptep++, addr += PAGE_SIZE, addr != end);
+
+ return 1;
+}
+
+static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
+ unsigned long *addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long mask;
+ unsigned long pte_end;
+ struct page *head, *page;
+ pte_t pte;
+ int refs;
+
+ pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
+ if (pte_end < end)
+ end = pte_end;
+
+ pte = *ptep;
+ mask = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ mask |= _PAGE_RW;
+ if ((pte_val(pte) & mask) != mask)
+ return 0;
+ /* hugepages are never "special" */
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+ refs = 0;
+ head = pte_page(pte);
+ page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON(compound_head(page) != head);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (*addr += PAGE_SIZE, *addr != end);
+
+ if (!page_cache_add_speculative(head, refs)) {
+ *nr -= refs;
+ return 0;
+ }
+ if (unlikely(pte != *ptep)) {
+ /* Raced with a PTE update: drop the refs we took and bail */
+ *nr -= refs;
+ while (refs--)
+ put_page(head);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pmd_t *pmdp;
+
+ pmdp = pmd_offset(&pud, addr);
+ do {
+ pmd_t pmd = *pmdp;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(pmd))
+ return 0;
+ if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+ return 0;
+ } while (pmdp++, addr = next, addr != end);
+
+ return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pud_t *pudp;
+
+ pudp = pud_offset(&pgd, addr);
+ do {
+ pud_t pud = *pudp;
+
+ next = pud_addr_end(addr, end);
+ if (pud_none(pud))
+ return 0;
+ if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+ return 0;
+ } while (pudp++, addr = next, addr != end);
+
+ return 1;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr, len, end;
+ unsigned long next;
+ pgd_t *pgdp;
+ int psize, nr = 0;
+ unsigned int shift;
+
+ pr_debug("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
+
+ start &= PAGE_MASK;
+ addr = start;
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+
+ if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+ start, len)))
+ goto slow_irqon;
+
+ pr_debug(" aligned: %lx .. %lx\n", start, end);
+
+ /* Cross a slice boundary? */
+ /* XXX could be improved by iterating slices instead */
+ if (addr < SLICE_LOW_TOP) {
+ if (end > SLICE_LOW_TOP)
+ goto slow_irqon;
+
+ if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
+ GET_LOW_SLICE_INDEX(end - 1)))
+ goto slow_irqon;
+ } else {
+ if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
+ GET_HIGH_SLICE_INDEX(end - 1)))
+ goto slow_irqon;
+ }
+
+ /*
+ * XXX: batch / limit 'nr', to avoid large irq off latency
+ * needs some instrumenting to determine the common sizes used by
+ * important workloads (eg. DB2), and whether limiting the batch size
+ * will decrease performance.
+ *
+ * It seems like we're in the clear for the moment. Direct-IO is
+ * the main guy that batches up lots of get_user_pages, and even
+ * they are limited to 64-at-a-time which is not so many.
+ */
+ /*
+ * This doesn't prevent pagetable teardown, but does prevent
+ * the pagetables from being freed on powerpc.
+ *
+ * So long as we atomically load page table pointers versus teardown,
+ * we can follow the address down to the page and take a ref on it.
+ */
+ local_irq_disable();
+
+ psize = get_slice_psize(mm, addr);
+ shift = mmu_psize_defs[psize].shift;
+
+ if (unlikely(mmu_huge_psizes[psize])) {
+ pte_t *ptep;
+ unsigned long a = addr;
+ unsigned long sz = ((1UL) << shift);
+ struct hstate *hstate = size_to_hstate(sz);
+
+ BUG_ON(!hstate);
+ /*
+ * XXX: could be optimized to avoid hstate
+ * lookup entirely (just use shift)
+ */
+
+ do {
+ VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
+ ptep = huge_pte_offset(mm, a);
+ pr_debug(" %016lx: huge ptep %p\n", a, ptep);
+ if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
+ &nr))
+ goto slow;
+ } while (a != end);
+ } else {
+ pgdp = pgd_offset(mm, addr);
+ do {
+ pgd_t pgd = *pgdp;
+
+ VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
+ pr_debug(" %016lx: normal pgd %p\n", addr, (void *)pgd);
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ goto slow;
+ if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ goto slow;
+ } while (pgdp++, addr = next, addr != end);
+ }
+ local_irq_enable();
+
+ VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+ return nr;
+
+ {
+ int ret;
+
+slow:
+ local_irq_enable();
+slow_irqon:
+ pr_debug(" slow path ! nr = %d\n", nr);
+
+ /* Try to get the remaining pages with get_user_pages */
+ start += nr << PAGE_SHIFT;
+ pages += nr;
+
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages(current, mm, start,
+ (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+ up_read(&mm->mmap_sem);
+
+ /* Have to be a bit careful with return values */
+ if (nr > 0) {
+ if (ret < 0)
+ ret = nr;
+ else
+ ret += nr;
+ }
+
+ return ret;
+ }
+}

Index: linux-work/include/asm-powerpc/pgtable-ppc64.h
===================================================================
--- linux-work.orig/include/asm-powerpc/pgtable-ppc64.h 2008-07-30 13:17:06.000000000 +1000
+++ linux-work/include/asm-powerpc/pgtable-ppc64.h 2008-07-30 13:27:40.000000000 +1000
@@ -461,6 +461,8 @@ void pgtable_cache_init(void);
return pt;
}

+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long address);
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */

Index: linux-work/include/linux/pagemap.h
===================================================================
--- linux-work.orig/include/linux/pagemap.h 2008-07-30 13:17:06.000000000 +1000
+++ linux-work/include/linux/pagemap.h 2008-07-30 13:27:40.000000000 +1000
@@ -142,6 +142,29 @@ static inline int page_cache_get_specula
return 1;
}

+/*
+ * Same as above, but add instead of inc (could just be merged)
+ */
+static inline int page_cache_add_speculative(struct page *page, int count)
+{
+ VM_BUG_ON(in_interrupt());
+
+#if !defined(CONFIG_SMP) && defined(CONFIG_CLASSIC_RCU)
+# ifdef CONFIG_PREEMPT
+ VM_BUG_ON(!in_atomic());
+# endif
+ VM_BUG_ON(page_count(page) == 0);
+ atomic_add(count, &page->_count);
+
+#else
+ if (unlikely(!atomic_add_unless(&page->_count, count, 0)))
+ return 0;
+#endif
+ VM_BUG_ON(PageCompound(page) && page != compound_head(page));
+
+ return 1;
+}
+
static inline int page_freeze_refs(struct page *page, int count)
{
return likely(atomic_cmpxchg(&page->_count, count, 0) == count);
* Re: [PATCH] powerpc/mm: Lockless get_user_pages_fast()
2008-07-30 4:20 ` Benjamin Herrenschmidt
@ 2008-07-30 5:06 ` Michael Ellerman
2008-07-30 5:08 ` Benjamin Herrenschmidt
0 siblings, 1 reply; 11+ messages in thread
From: Michael Ellerman @ 2008-07-30 5:06 UTC (permalink / raw)
To: benh; +Cc: Nick Piggin, linuxppc-dev list

On Wed, 2008-07-30 at 14:20 +1000, Benjamin Herrenschmidt wrote:
> From: Nick Piggin <npiggin@suse.de>
>
> Implement lockless get_user_pages_fast for powerpc. Page table existence
> is guaranteed with RCU, and speculative page references are used to take a
> reference to the pages without having a prior existence guarantee on them.

> Index: linux-work/arch/powerpc/mm/gup.c
> ===================================================================
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ linux-work/arch/powerpc/mm/gup.c 2008-07-30 14:20:00.000000000 +1000
> @@ -0,0 +1,271 @@
> +/*
> + * Lockless get_user_pages_fast for powerpc
> + *
> + * Copyright (C) 2008 Nick Piggin
> + * Copyright (C) 2008 Novell Inc.
> + */
> +#undef DEBUG
> +
> +#include <linux/sched.h>
> +#include <linux/mm.h>
> +#include <linux/hugetlb.h>
> +#include <linux/vmstat.h>
> +#include <linux/pagemap.h>
> +#include <linux/rwsem.h>
> +#include <asm/pgtable.h>
> +
> +/*
> + * The performance critical leaf functions are made noinline otherwise gcc
> + * inlines everything into a single function which results in too much
> + * register pressure.
> + */

This strikes me as something that is liable to change for compiler
version n+1, or n with -fsomething - and might leave us shooting
ourselves in the foot, just a thought.

cheers

--
Michael Ellerman
OzLabs, IBM Australia Development Lab

wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)

We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person
* Re: [PATCH] powerpc/mm: Lockless get_user_pages_fast()
2008-07-30 5:06 ` Michael Ellerman
@ 2008-07-30 5:08 ` Benjamin Herrenschmidt
2008-07-30 7:26 ` Nick Piggin
0 siblings, 1 reply; 11+ messages in thread
From: Benjamin Herrenschmidt @ 2008-07-30 5:08 UTC (permalink / raw)
To: michael; +Cc: Nick Piggin, linuxppc-dev list

On Wed, 2008-07-30 at 15:06 +1000, Michael Ellerman wrote:
> > +
> > +/*
> > + * The performance critical leaf functions are made noinline otherwise gcc
> > + * inlines everything into a single function which results in too much
> > + * register pressure.
> > + */
>
> This strikes me as something that is liable to change for compiler
> version n+1, or n with -fsomething - and might leave us shooting
> ourselves in the foot, just a thought.

Not that much I'd say... In fact, I wouldn't be too worried on powerpc,
I wonder if that comment is stale from the x86 variant :-)

Nick ?

Cheers,
Ben.
* Re: [PATCH] powerpc/mm: Lockless get_user_pages_fast()
2008-07-30 5:08 ` Benjamin Herrenschmidt
@ 2008-07-30 7:26 ` Nick Piggin
0 siblings, 0 replies; 11+ messages in thread
From: Nick Piggin @ 2008-07-30 7:26 UTC (permalink / raw)
To: Benjamin Herrenschmidt; +Cc: linuxppc-dev list

On Wed, Jul 30, 2008 at 03:08:40PM +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2008-07-30 at 15:06 +1000, Michael Ellerman wrote:
> > > +
> > > +/*
> > > + * The performance critical leaf functions are made noinline otherwise gcc
> > > + * inlines everything into a single function which results in too much
> > > + * register pressure.
> > > + */
> >
> > This strikes me as something that is liable to change for compiler
> > version n+1, or n with -fsomething - and might leave us shooting
> > ourselves in the foot, just a thought.
>
> Not that much I'd say... In fact, I wouldn't be too worried on powerpc,
> I wonder if that comment is stale from the x86 variant :-) Nick ?

Right... gcc is really poor at over-pressuring registers when inlining,
and when I checked I don't think it even allocated registers to the
inner-most variables in cases such as this. I thought I checked powerpc
and found some spilling there too, but it was quite a long time ago (and
yes it was brought over from x86). Should double check.
* Re: [PATCH] powerpc/mm: Lockless get_user_pages_fast()
2008-07-30 3:37 [PATCH] powerpc/mm: Lockless get_user_pages_fast() Benjamin Herrenschmidt
2008-07-30 4:20 ` Benjamin Herrenschmidt
@ 2008-07-30 12:33 ` Kumar Gala
2008-07-30 13:17 ` Nick Piggin
2008-07-30 22:26 ` Benjamin Herrenschmidt
1 sibling, 2 replies; 11+ messages in thread
From: Kumar Gala @ 2008-07-30 12:33 UTC (permalink / raw)
To: benh; +Cc: Nick Piggin, linuxppc-dev list

On Jul 29, 2008, at 10:37 PM, Benjamin Herrenschmidt wrote:

> From: Nick Piggin <npiggin@suse.de>
>
> Implement lockless get_user_pages_fast for powerpc. Page table
> existence
> is guaranteed with RCU, and speculative page references are used to
> take a
> reference to the pages without having a prior existence guarantee on
> them.
>
> Signed-off-by: Nick Piggin <npiggin@suse.de>
> Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
> Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Hugh Dickins <hugh@veritas.com>
> Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---
>
> I'm going to merge this, sending it to the list for reference, it was
> in -mm for some time, minus some changes/fixes I did to solve
> conflicts
> with the new multiple huge page sizes.
>
> Index: linux-work/arch/powerpc/Kconfig
> ===================================================================
> --- linux-work.orig/arch/powerpc/Kconfig 2008-07-30
> 13:17:06.000000000 +1000
> +++ linux-work/arch/powerpc/Kconfig 2008-07-30 13:27:40.000000000
> +1000
> @@ -42,6 +42,9 @@ config GENERIC_HARDIRQS
> bool
> default y
>
> +config HAVE_GET_USER_PAGES_FAST
> + def_bool PPC64
> +
> config HAVE_SETUP_PER_CPU_AREA
> def_bool PPC64

what's ppc64 specific here?

- k
* Re: [PATCH] powerpc/mm: Lockless get_user_pages_fast()
2008-07-30 12:33 ` Kumar Gala
@ 2008-07-30 13:17 ` Nick Piggin
2008-07-30 13:39 ` Kumar Gala
0 siblings, 1 reply; 11+ messages in thread
From: Nick Piggin @ 2008-07-30 13:17 UTC (permalink / raw)
To: Kumar Gala; +Cc: linuxppc-dev list

On Wed, Jul 30, 2008 at 07:33:26AM -0500, Kumar Gala wrote:
>
> On Jul 29, 2008, at 10:37 PM, Benjamin Herrenschmidt wrote:
>
> > From: Nick Piggin <npiggin@suse.de>
> >
> > Implement lockless get_user_pages_fast for powerpc. Page table
> > existence
> > is guaranteed with RCU, and speculative page references are used to
> > take a
> > reference to the pages without having a prior existence guarantee on
> > them.
> >
> > Signed-off-by: Nick Piggin <npiggin@suse.de>
> > Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
> > Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> > Cc: Paul Mackerras <paulus@samba.org>
> > Cc: Hugh Dickins <hugh@veritas.com>
> > Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
> > Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> > ---
> >
> > I'm going to merge this, sending it to the list for reference, it was
> > in -mm for some time, minus some changes/fixes I did to solve
> > conflicts
> > with the new multiple huge page sizes.
> >
> > Index: linux-work/arch/powerpc/Kconfig
> > ===================================================================
> > --- linux-work.orig/arch/powerpc/Kconfig 2008-07-30
> > 13:17:06.000000000 +1000
> > +++ linux-work/arch/powerpc/Kconfig 2008-07-30 13:27:40.000000000
> > +1000
> > @@ -42,6 +42,9 @@ config GENERIC_HARDIRQS
> > bool
> > default y
> >
> > +config HAVE_GET_USER_PAGES_FAST
> > + def_bool PPC64
> > +
> > config HAVE_SETUP_PER_CPU_AREA
> > def_bool PPC64
>
> what's ppc64 specific here?

I didn't look how 32-bit powerpc does its TLB shootdown and page table
walking, so I don't know if it will work...
* Re: [PATCH] powerpc/mm: Lockless get_user_pages_fast()
2008-07-30 13:17 ` Nick Piggin
@ 2008-07-30 13:39 ` Kumar Gala
0 siblings, 0 replies; 11+ messages in thread
From: Kumar Gala @ 2008-07-30 13:39 UTC (permalink / raw)
To: Nick Piggin; +Cc: linuxppc-dev list

On Jul 30, 2008, at 8:17 AM, Nick Piggin wrote:

> On Wed, Jul 30, 2008 at 07:33:26AM -0500, Kumar Gala wrote:
>>
>> On Jul 29, 2008, at 10:37 PM, Benjamin Herrenschmidt wrote:
>>
>>> From: Nick Piggin <npiggin@suse.de>
>>>
>>> Implement lockless get_user_pages_fast for powerpc. Page table
>>> existence
>>> is guaranteed with RCU, and speculative page references are used to
>>> take a
>>> reference to the pages without having a prior existence guarantee on
>>> them.
>>>
>>> Signed-off-by: Nick Piggin <npiggin@suse.de>
>>> Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
>>> Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>>> Cc: Paul Mackerras <paulus@samba.org>
>>> Cc: Hugh Dickins <hugh@veritas.com>
>>> Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
>>> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
>>> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
>>> ---
>>>
>>> I'm going to merge this, sending it to the list for reference, it
>>> was
>>> in -mm for some time, minus some changes/fixes I did to solve
>>> conflicts
>>> with the new multiple huge page sizes.
>>>
>>> Index: linux-work/arch/powerpc/Kconfig
>>> ===================================================================
>>> --- linux-work.orig/arch/powerpc/Kconfig 2008-07-30
>>> 13:17:06.000000000 +1000
>>> +++ linux-work/arch/powerpc/Kconfig 2008-07-30 13:27:40.000000000
>>> +1000
>>> @@ -42,6 +42,9 @@ config GENERIC_HARDIRQS
>>> bool
>>> default y
>>>
>>> +config HAVE_GET_USER_PAGES_FAST
>>> + def_bool PPC64
>>> +
>>> config HAVE_SETUP_PER_CPU_AREA
>>> def_bool PPC64
>>
>> what's ppc64 specific here?
>
> I didn't look how 32-bit powerpc does its TLB shootdown and page table
> walking, so I don't know if it will work...

I haven't glanced at your code but we have two cases. Either SW
managed TLBs w/no HW walk or a full HW walk that should be similar to
ppc64 (just no SLBs).

- k
* Re: [PATCH] powerpc/mm: Lockless get_user_pages_fast()
2008-07-30 12:33 ` Kumar Gala
2008-07-30 13:17 ` Nick Piggin
@ 2008-07-30 22:26 ` Benjamin Herrenschmidt
2008-07-30 22:35 ` Kumar Gala
1 sibling, 1 reply; 11+ messages in thread
From: Benjamin Herrenschmidt @ 2008-07-30 22:26 UTC (permalink / raw)
To: Kumar Gala; +Cc: Nick Piggin, linuxppc-dev list

> > Index: linux-work/arch/powerpc/Kconfig
> > ===================================================================
> > --- linux-work.orig/arch/powerpc/Kconfig 2008-07-30
> > 13:17:06.000000000 +1000
> > +++ linux-work/arch/powerpc/Kconfig 2008-07-30 13:27:40.000000000
> > +1000
> > @@ -42,6 +42,9 @@ config GENERIC_HARDIRQS
> > bool
> > default y
> >
> > +config HAVE_GET_USER_PAGES_FAST
> > + def_bool PPC64
> > +
> > config HAVE_SETUP_PER_CPU_AREA
> > def_bool PPC64
>
> what's ppc64 specific here?

Mostly _PAGE_SPECIAL (which I do plan to add to embedded which is why
I've been trying hard to free a PTE bit :-) and the way we synchronize
with the freeing of page tables (ppc64 uses RCU, ppc32 doesn't, we'd
have to find something to keep fast gup in sync).

Ben.
* Re: [PATCH] powerpc/mm: Lockless get_user_pages_fast()
2008-07-30 22:26 ` Benjamin Herrenschmidt
@ 2008-07-30 22:35 ` Kumar Gala
2008-07-30 23:15 ` Kumar Gala
0 siblings, 1 reply; 11+ messages in thread
From: Kumar Gala @ 2008-07-30 22:35 UTC (permalink / raw)
To: benh; +Cc: Nick Piggin, linuxppc-dev list

On Jul 30, 2008, at 5:26 PM, Benjamin Herrenschmidt wrote:

>
>>> Index: linux-work/arch/powerpc/Kconfig
>>> ===================================================================
>>> --- linux-work.orig/arch/powerpc/Kconfig 2008-07-30
>>> 13:17:06.000000000 +1000
>>> +++ linux-work/arch/powerpc/Kconfig 2008-07-30 13:27:40.000000000
>>> +1000
>>> @@ -42,6 +42,9 @@ config GENERIC_HARDIRQS
>>> bool
>>> default y
>>>
>>> +config HAVE_GET_USER_PAGES_FAST
>>> + def_bool PPC64
>>> +
>>> config HAVE_SETUP_PER_CPU_AREA
>>> def_bool PPC64
>>
>> what's ppc64 specific here?
>
> Mostly _PAGE_SPECIAL (which I do plan to add to embedded which is why
> I've been trying hard to free a PTE bit :-) and the way we synchronize
> with the freeing of page tables (ppc64 uses RCU, ppc32 doesn't, we'd
> have to find something to keep fast gup in sync).

I think we have some code that uses RCU for page freeing. I'll dig it
up and post it.

- k
* Re: [PATCH] powerpc/mm: Lockless get_user_pages_fast()
2008-07-30 22:35 ` Kumar Gala
@ 2008-07-30 23:15 ` Kumar Gala
0 siblings, 0 replies; 11+ messages in thread
From: Kumar Gala @ 2008-07-30 23:15 UTC (permalink / raw)
To: benh; +Cc: Nick Piggin, linuxppc-dev list

Here's the code... I haven't looked at this in any detail and I didn't
write it.

- k

diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index c758407..c502909 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -26,7 +26,13 @@
 #include <linux/vmalloc.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
+#include <linux/sched.h>
+#ifdef CONFIG_SMP
+#include <linux/rcupdate.h>
+#endif
+
+#include <asm/tlb.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/fixmap.h>
@@ -48,7 +54,7 @@ EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */
 extern char etext[], _stext[];
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_FSL_BOOKE)
 extern void hash_page_sync(void);
 #endif
@@ -79,6 +85,84 @@ extern unsigned long p_mapped_by_tlbcam(unsigned long pa);
 #define PGDIR_ORDER 0
 #endif
+#ifdef CONFIG_SMP
+struct pte_freelist_batch
+{
+ struct rcu_head rcu;
+ unsigned int index;
+ struct page * tables[0];
+ struct mm_struct *mm;
+};
+
+#define PTE_FREELIST_SIZE \
+ ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
+ / sizeof(struct page *))
+
+DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+
+static void pte_free_smp_sync(void *arg)
+{
+ /* Do nothing, just ensure we sync with all CPUs */
+}
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+static void pgtable_free_now(struct mm_struct *mm, struct page *pte)
+{
+ smp_call_function(pte_free_smp_sync, NULL, 0, 1);
+
+ pte_free(mm, pte);
+}
+
+static void pte_free_rcu_callback(struct rcu_head *head)
+{
+ struct pte_freelist_batch *batch =
+ container_of(head, struct pte_freelist_batch, rcu);
+ unsigned int i;
+
+ for (i = 0; i < batch->index; i++)
+ pte_free(batch->mm, batch->tables[i]);
+
+ free_page((unsigned long)batch);
+}
+
+static void pte_free_submit(struct pte_freelist_batch *batch)
+{
+ INIT_RCU_HEAD(&batch->rcu);
+ call_rcu(&batch->rcu, pte_free_rcu_callback);
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+ /* This is safe since tlb_gather_mmu has disabled preemption */
+ cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
+ struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+
+ if (atomic_read(&tlb->mm->mm_users) < 2 ||
+ cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
+ pte_free(tlb->mm, pte);
+ return;
+ }
+
+ if (*batchp == NULL) {
+ *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
+ if (*batchp == NULL) {
+ pgtable_free_now(tlb->mm, pte);
+ return;
+ }
+ (*batchp)->index = 0;
+ }
+ (*batchp)->tables[(*batchp)->index++] = pte;
+ if ((*batchp)->index == PTE_FREELIST_SIZE) {
+ (*batchp)->mm = tlb->mm;
+ pte_free_submit(*batchp);
+ *batchp = NULL;
+ }
+}
+
+#endif /* CONFIG_SMP */
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 pgd_t *ret;
@@ -127,7 +211,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
 void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_FSL_BOOKE)
 hash_page_sync();
 #endif
 free_page((unsigned long)pte);
@@ -135,7 +219,7 @@ void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 void pte_free(struct mm_struct *mm, pgtable_t ptepage)
 {
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_FSL_BOOKE)
 hash_page_sync();
 #endif
 pgtable_page_dtor(ptepage);

diff --git a/include/asm-powerpc/pgalloc-32.h b/include/asm-powerpc/pgalloc-32.h
index 58c0714..1cb9245 100644
--- a/include/asm-powerpc/pgalloc-32.h
+++ b/include/asm-powerpc/pgalloc-32.h
@@ -36,7 +36,14 @@ extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
 extern void pte_free_kernel(struct mm_struct *mm, pte_t *pte);
 extern void pte_free(struct mm_struct *mm, pgtable_t pte);
+#ifdef CONFIG_SMP
+extern void pgtable_free_tlb(struct mmu_gather *tlb, struct page *pte);
+
+#define __pte_free_tlb(tlb, pte) pgtable_free_tlb(tlb, pte)
+
+#else
 #define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, (pte))
+#endif /* CONFIG_SMP */
 #define check_pgt_cache() do { } while (0)
end of thread, other threads:[~2008-07-30 23:15 UTC | newest]

Thread overview: 11+ messages
2008-07-30  3:37 [PATCH] powerpc/mm: Lockless get_user_pages_fast() Benjamin Herrenschmidt
2008-07-30  4:20 ` Benjamin Herrenschmidt
2008-07-30  5:06   ` Michael Ellerman
2008-07-30  5:08     ` Benjamin Herrenschmidt
2008-07-30  7:26       ` Nick Piggin
2008-07-30 12:33 ` Kumar Gala
2008-07-30 13:17   ` Nick Piggin
2008-07-30 13:39     ` Kumar Gala
2008-07-30 22:26   ` Benjamin Herrenschmidt
2008-07-30 22:35     ` Kumar Gala
2008-07-30 23:15       ` Kumar Gala