All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: Andrea Arcangeli <aarcange@redhat.com>,
	Avi Kivity <avi@redhat.com>, Thomas Gleixner <tglx@linutronix.de>,
	Rik van Riel <riel@redhat.com>, Ingo Molnar <mingo@elte.hu>,
	akpm@linux-fou
Cc: linux-kernel@vger.kernel.org, linux-arch@vger.kernel.org,
	Benjamin Herrenschmidt <benh@kernel.crashing.org>,
	David Miller <davem@davemloft.net>,
	Hugh Dickins <hugh.dickins@tiscali.co.uk>,
	Mel Gorman <mel@csn.ul.ie>, Nick Piggin <npiggin@suse.de>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Subject: [PATCH 23/28] sparc64: Implement get_user_pages_fast().
Date: Mon, 07 Jun 2010 13:07:17 +0200	[thread overview]
Message-ID: <20100607111408.500962980@chello.nl> (raw)
In-Reply-To: 20100607110654.606530953@chello.nl

[-- Attachment #1: davem-sparc64-Implement_get_user_pages_fast.patch --]
[-- Type: text/plain, Size: 5484 bytes --]

From: David S. Miller <davem@davemloft.net>

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100408.180015.30977694.davem@davemloft.net>
---
 arch/sparc/mm/Makefile |    2 
 arch/sparc/mm/gup.c    |  181 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 182 insertions(+), 1 deletion(-)
 create mode 100644 arch/sparc/mm/gup.c

Index: linux-2.6/arch/sparc/mm/Makefile
===================================================================
--- linux-2.6.orig/arch/sparc/mm/Makefile
+++ linux-2.6/arch/sparc/mm/Makefile
@@ -4,7 +4,7 @@
 asflags-y := -ansi
 ccflags-y := -Werror
 
-obj-$(CONFIG_SPARC64)   += ultra.o tlb.o tsb.o
+obj-$(CONFIG_SPARC64)   += ultra.o tlb.o tsb.o gup.o
 obj-y                   += fault_$(BITS).o
 obj-y                   += init_$(BITS).o
 obj-$(CONFIG_SPARC32)   += loadmmu.o
Index: linux-2.6/arch/sparc/mm/gup.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/sparc/mm/gup.c
@@ -0,0 +1,181 @@
+/*
+ * Lockless get_user_pages_fast for sparc, cribbed from powerpc
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask, result;
+	pte_t *ptep;
+
+	if (tlb_type == hypervisor) {
+		result = _PAGE_PRESENT_4V|_PAGE_P_4V;
+		if (write)
+			result |= _PAGE_WRITE_4V;
+	} else {
+		result = _PAGE_PRESENT_4U|_PAGE_P_4U;
+		if (write)
+			result |= _PAGE_WRITE_4U;
+	}
+	mask = result | _PAGE_SPECIAL;
+
+	ptep = pte_offset_kernel(&pmd, addr);
+	do {
+		struct page *page, *head;
+		pte_t pte = *ptep;
+
+		if ((pte_val(pte) & mask) != result)
+			return 0;
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+		/* The hugepage case is simplified on sparc64 because
+		 * we encode the sub-page pfn offsets into the
+		 * hugepage PTEs.  We could optimize this in the future
+		 * use page_cache_add_speculative() for the hugepage case.
+		 */
+		page = pte_page(pte);
+		head = compound_head(page);
+		if (!page_cache_get_speculative(head))
+			return 0;
+		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+			put_page(head);
+			return 0;
+		}
+
+		pages[*nr] = page;
+		(*nr)++;
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = pmd_offset(&pud, addr);
+	do {
+		pmd_t pmd = *pmdp;
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+			return 0;
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = pud_offset(&pgd, addr);
+	do {
+		pud_t pud = *pudp;
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+
+	/*
+	 * XXX: batch / limit 'nr', to avoid large irq off latency
+	 * needs some instrumenting to determine the common sizes used by
+	 * important workloads (eg. DB2), and whether limiting the batch size
+	 * will decrease performance.
+	 *
+	 * It seems like we're in the clear for the moment. Direct-IO is
+	 * the main guy that batches up lots of get_user_pages, and even
+	 * they are limited to 64-at-a-time which is not so many.
+	 */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables from being freed on sparc.
+	 *
+	 * So long as we atomically load page table pointers versus teardown,
+	 * we can follow the address down to the the page and take a ref on it.
+	 */
+	local_irq_disable();
+
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+
+	local_irq_enable();
+
+	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+	{
+		int ret;
+
+slow:
+		local_irq_enable();
+
+		/* Try to get the remaining pages with get_user_pages */
+		start += nr << PAGE_SHIFT;
+		pages += nr;
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		/* Have to be a bit careful with return values */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+
+		return ret;
+	}
+}

WARNING: multiple messages have this Message-ID (diff)
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: Andrea Arcangeli <aarcange@redhat.com>,
	Avi Kivity <avi@redhat.com>, Thomas Gleixner <tglx@linutronix.de>,
	Rik van Riel <riel@redhat.com>, Ingo Molnar <mingo@elte.hu>,
	akpm@linux-foundation.org,
	Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org, linux-arch@vger.kernel.org,
	Benjamin Herrenschmidt <benh@kernel.crashing.org>,
	David Miller <davem@davemloft.net>,
	Hugh Dickins <hugh.dickins@tiscali.co.uk>,
	Mel Gorman <mel@csn.ul.ie>, Nick Piggin <npiggin@suse.de>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Subject: [PATCH 23/28] sparc64: Implement get_user_pages_fast().
Date: Mon, 07 Jun 2010 13:07:17 +0200	[thread overview]
Message-ID: <20100607111408.500962980@chello.nl> (raw)
Message-ID: <20100607110717.yY3kjRjCD9t2dvNd2aMTc45SQ9DUG3To8Luew8s0uYs@z> (raw)
In-Reply-To: 20100607110654.606530953@chello.nl

[-- Attachment #1: davem-sparc64-Implement_get_user_pages_fast.patch --]
[-- Type: text/plain, Size: 5486 bytes --]

From: David S. Miller <davem@davemloft.net>

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100408.180015.30977694.davem@davemloft.net>
---
 arch/sparc/mm/Makefile |    2 
 arch/sparc/mm/gup.c    |  181 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 182 insertions(+), 1 deletion(-)
 create mode 100644 arch/sparc/mm/gup.c

Index: linux-2.6/arch/sparc/mm/Makefile
===================================================================
--- linux-2.6.orig/arch/sparc/mm/Makefile
+++ linux-2.6/arch/sparc/mm/Makefile
@@ -4,7 +4,7 @@
 asflags-y := -ansi
 ccflags-y := -Werror
 
-obj-$(CONFIG_SPARC64)   += ultra.o tlb.o tsb.o
+obj-$(CONFIG_SPARC64)   += ultra.o tlb.o tsb.o gup.o
 obj-y                   += fault_$(BITS).o
 obj-y                   += init_$(BITS).o
 obj-$(CONFIG_SPARC32)   += loadmmu.o
Index: linux-2.6/arch/sparc/mm/gup.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/sparc/mm/gup.c
@@ -0,0 +1,181 @@
+/*
+ * Lockless get_user_pages_fast for sparc, cribbed from powerpc
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask, result;
+	pte_t *ptep;
+
+	if (tlb_type == hypervisor) {
+		result = _PAGE_PRESENT_4V|_PAGE_P_4V;
+		if (write)
+			result |= _PAGE_WRITE_4V;
+	} else {
+		result = _PAGE_PRESENT_4U|_PAGE_P_4U;
+		if (write)
+			result |= _PAGE_WRITE_4U;
+	}
+	mask = result | _PAGE_SPECIAL;
+
+	ptep = pte_offset_kernel(&pmd, addr);
+	do {
+		struct page *page, *head;
+		pte_t pte = *ptep;
+
+		if ((pte_val(pte) & mask) != result)
+			return 0;
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+		/* The hugepage case is simplified on sparc64 because
+		 * we encode the sub-page pfn offsets into the
+		 * hugepage PTEs.  We could optimize this in the future
+		 * use page_cache_add_speculative() for the hugepage case.
+		 */
+		page = pte_page(pte);
+		head = compound_head(page);
+		if (!page_cache_get_speculative(head))
+			return 0;
+		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+			put_page(head);
+			return 0;
+		}
+
+		pages[*nr] = page;
+		(*nr)++;
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = pmd_offset(&pud, addr);
+	do {
+		pmd_t pmd = *pmdp;
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+			return 0;
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = pud_offset(&pgd, addr);
+	do {
+		pud_t pud = *pudp;
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+
+	/*
+	 * XXX: batch / limit 'nr', to avoid large irq off latency
+	 * needs some instrumenting to determine the common sizes used by
+	 * important workloads (eg. DB2), and whether limiting the batch size
+	 * will decrease performance.
+	 *
+	 * It seems like we're in the clear for the moment. Direct-IO is
+	 * the main guy that batches up lots of get_user_pages, and even
+	 * they are limited to 64-at-a-time which is not so many.
+	 */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables from being freed on sparc.
+	 *
+	 * So long as we atomically load page table pointers versus teardown,
+	 * we can follow the address down to the the page and take a ref on it.
+	 */
+	local_irq_disable();
+
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+
+	local_irq_enable();
+
+	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+	{
+		int ret;
+
+slow:
+		local_irq_enable();
+
+		/* Try to get the remaining pages with get_user_pages */
+		start += nr << PAGE_SHIFT;
+		pages += nr;
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		/* Have to be a bit careful with return values */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+
+		return ret;
+	}
+}



  parent reply	other threads:[~2010-06-07 11:07 UTC|newest]

Thread overview: 77+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-06-07 11:06 [PATCH 00/28] mm: preemptibility -v3 Peter Zijlstra
2010-06-07 11:06 ` Peter Zijlstra
2010-06-07 11:06 ` [PATCH 01/28] powerpc: Use call_rcu_sched() for pagetables Peter Zijlstra
2010-06-07 11:06   ` Peter Zijlstra
2010-06-07 11:06 ` [PATCH 02/28] mm: Improve page_lock_anon_vma() comment Peter Zijlstra
2010-06-07 11:06   ` Peter Zijlstra
2010-06-09 10:50   ` Mel Gorman
2010-06-07 11:06 ` [PATCH 03/28] rename anon_vma_lock to vma_lock_anon_vma Peter Zijlstra
2010-06-07 11:06   ` Peter Zijlstra
2010-06-07 15:01   ` Peter Zijlstra
2010-06-09 10:59   ` Mel Gorman
2010-06-07 11:06 ` [PATCH 04/28] change direct call of spin_lock(anon_vma->lock) to inline function Peter Zijlstra
2010-06-07 11:06   ` Peter Zijlstra
2010-06-07 11:06 ` [PATCH 05/28] track the root (oldest) anon_vma Peter Zijlstra
2010-06-07 11:06   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 06/28] always lock " Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 07/28] extend KSM refcounts to the anon_vma root Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-09 10:55   ` Mel Gorman
2010-06-07 11:07 ` [PATCH 08/28] mm: Rename drop_anon_vma to put_anon_vma Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 09/28] mm: Move anon_vma ref out from under CONFIG_KSM Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-09 11:05   ` Mel Gorman
2010-06-07 11:07 ` [PATCH 10/28] mm: Make use of the anon_vma ref count Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-09 13:22   ` Mel Gorman
2010-06-09 13:32     ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 11/28] mm: Preemptible mmu_gather Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 12/28] powerpc: " Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 13/28] sparc: " Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 14/28] s390: preemptible mmu_gather Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 15/28] arm: Preemptible mmu_gather Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 16/28] sh: " Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:38   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 17/28] um: " Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 18/28] ia64: " Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 19/28] mm, powerpc: Move the RCU page-table freeing into generic code Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 20/28] sparc64: Kill page table quicklists Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 21/28] sparc64: Use RCU page table freeing Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 22/28] sparc64: Add support for _PAGE_SPECIAL Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` Peter Zijlstra [this message]
2010-06-07 11:07   ` [PATCH 23/28] sparc64: Implement get_user_pages_fast() Peter Zijlstra
2010-06-07 11:07 ` [PATCH 24/28] lockdep, mutex: Provide mutex_lock_nest_lock Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 25/28] mutex: Provide mutex_is_contended Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 26/28] mm: Convert i_mmap_lock and anon_vma->lock to mutexes Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 27/28] mm: Extended batches for generic mmu_gather Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 11:07 ` [PATCH 28/28] mm: Optimize page_lock_anon_vma() fast-path Peter Zijlstra
2010-06-07 11:07   ` Peter Zijlstra
2010-06-07 13:57 ` [PATCH 00/28] mm: preemptibility -v3 Sam Ravnborg
2010-06-07 15:02   ` Peter Zijlstra
2010-06-07 16:36 ` Andi Kleen
2010-06-07 16:39   ` Peter Zijlstra
2010-06-10  1:45 ` Zhang, Yanmin
2010-06-10  6:52   ` Peter Zijlstra
2010-06-10  6:59     ` Zhang, Yanmin
2010-06-21 10:21       ` Peter Zijlstra
2010-06-24  9:55         ` Peter Zijlstra
2010-06-29  7:40           ` Zhang, Yanmin
2010-06-29  7:48             ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100607111408.500962980@chello.nl \
    --to=a.p.zijlstra@chello.nl \
    --cc=aarcange@redhat.com \
    --cc=akpm@linux-fou \
    --cc=avi@redhat.com \
    --cc=benh@kernel.crashing.org \
    --cc=davem@davemloft.net \
    --cc=hugh.dickins@tiscali.co.uk \
    --cc=linux-arch@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mel@csn.ul.ie \
    --cc=mingo@elte.hu \
    --cc=npiggin@suse.de \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=riel@redhat.com \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.