public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: clameter@sgi.com
From: Christoph Lameter <clameter@sgi.com>
To: ak@suse.de
Cc: akpm@linux-foundation.org
Cc: travis@sgi.com
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: linux-kernel@vger.kernel.org
Subject: [rfc 08/45] cpu alloc: x86 support
Date: Mon, 19 Nov 2007 17:11:40 -0800	[thread overview]
Message-ID: <20071120011333.619991903@sgi.com> (raw)
In-Reply-To: 20071120011132.143632442@sgi.com

[-- Attachment #1: cpu_alloc_x86_support --]
[-- Type: text/plain, Size: 7527 bytes --]

64 bit:

Set up a cpu area that allows the use of up 16MB for each processor.

Cpu memory use can grow a bit. F.e. if we assume that a pageset
occupies 64 bytes of memory and we have 3 zones in each of 1024 nodes
then we need 3 * 1k * 16k = 50 million pagesets or 3096 pagesets per
processor. This results in a total of 3.2 GB of page structs.
Each cpu needs around 200k of cpu storage for the page allocator alone.
So its a worth it to use a 2M huge mapping here.

For the UP and SMP case map the area using 4k ptes. Typical use of per cpu
data is around 16k for UP and SMP configurations. It goes up to 45k when the
per cpu area is managed by cpu_alloc (see special x86_64 patchset).
Allocating in 2M segments would be overkill.

For NUMA map the area using 2M PMDs. A large NUMA system may use
lots of cpu data for the page allocator data alone. We typically
have large amounts of memory around on those size. Using a 2M page size
reduces TLB pressure for that case.

Some numbers for envisioned maximum configurations of NUMA systems:

4k cpu configurations with 1k nodes:

	4096 * 16MB = 64TB of virtual space.

Maximum theoretical configuration 16384 processors 1k nodes:

	16384 * 16MB = 256TB of virtual space.

Both fit within the established limits established.

32 bit:

Setup a 256 kB area for the cpu areas below the FIXADDR area.

The use of the cpu alloc area is pretty minimal on i386. An 8p system
with no extras uses only ~8kb. So 256kb should be plenty. A configuration
that supports up to 8 processors takes up 2MB of the scarce
virtual address space

Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
 arch/x86/Kconfig                 |   13 +++++++++++++
 arch/x86/kernel/vmlinux_32.lds.S |    1 +
 arch/x86/kernel/vmlinux_64.lds.S |    3 +++
 arch/x86/mm/init_32.c            |    3 +++
 arch/x86/mm/init_64.c            |   38 ++++++++++++++++++++++++++++++++++++++
 include/asm-x86/pgtable_32.h     |    7 +++++--
 include/asm-x86/pgtable_64.h     |    1 +
 7 files changed, 64 insertions(+), 2 deletions(-)

Index: linux-2.6/arch/x86/mm/init_64.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_64.c	2007-11-19 15:45:07.602390533 -0800
+++ linux-2.6/arch/x86/mm/init_64.c	2007-11-19 15:55:53.165640248 -0800
@@ -781,3 +781,41 @@ int __meminit vmemmap_populate(struct pa
 	return 0;
 }
 #endif
+
+#ifdef CONFIG_NUMA
+int __meminit cpu_area_populate(void *start, unsigned long size,
+						gfp_t flags, int node)
+{
+	unsigned long addr = (unsigned long)start;
+	unsigned long end = addr + size;
+	unsigned long next;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	for (; addr < end; addr = next) {
+		next = pmd_addr_end(addr, end);
+
+		pgd = cpu_area_pgd_populate(addr, flags, node);
+		if (!pgd)
+			return -ENOMEM;
+		pud = cpu_area_pud_populate(pgd, addr, flags, node);
+		if (!pud)
+			return -ENOMEM;
+
+		pmd = pmd_offset(pud, addr);
+		if (pmd_none(*pmd)) {
+			pte_t entry;
+			void *p = cpu_area_alloc_block(PMD_SIZE, flags, node);
+			if (!p)
+				return -ENOMEM;
+
+			entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+			mk_pte_huge(entry);
+			set_pmd(pmd, __pmd(pte_val(entry)));
+		}
+	}
+
+	return 0;
+}
+#endif
Index: linux-2.6/include/asm-x86/pgtable_64.h
===================================================================
--- linux-2.6.orig/include/asm-x86/pgtable_64.h	2007-11-19 15:45:07.638390147 -0800
+++ linux-2.6/include/asm-x86/pgtable_64.h	2007-11-19 15:55:53.165640248 -0800
@@ -138,6 +138,7 @@ static inline pte_t ptep_get_and_clear_f
 #define VMALLOC_START    _AC(0xffffc20000000000, UL)
 #define VMALLOC_END      _AC(0xffffe1ffffffffff, UL)
 #define VMEMMAP_START	 _AC(0xffffe20000000000, UL)
+#define CPU_AREA_BASE	 _AC(0xffffffff84000000, UL)
 #define MODULES_VADDR    _AC(0xffffffff88000000, UL)
 #define MODULES_END      _AC(0xfffffffffff00000, UL)
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
Index: linux-2.6/arch/x86/Kconfig
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig	2007-11-19 15:54:10.509139813 -0800
+++ linux-2.6/arch/x86/Kconfig	2007-11-19 15:55:53.165640248 -0800
@@ -159,6 +159,19 @@ config X86_TRAMPOLINE
 
 config KTIME_SCALAR
 	def_bool X86_32
+
+config CPU_AREA_VIRTUAL
+	bool
+	default y
+
+config CPU_AREA_ORDER
+	int
+	default "6"
+
+config CPU_AREA_ALLOC_ORDER
+	int
+	default "0"
+
 source "init/Kconfig"
 
 menu "Processor type and features"
Index: linux-2.6/arch/x86/mm/init_32.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_32.c	2007-11-19 15:45:07.610390367 -0800
+++ linux-2.6/arch/x86/mm/init_32.c	2007-11-19 15:55:53.165640248 -0800
@@ -674,6 +674,7 @@ void __init mem_init(void)
 #if 1 /* double-sanity-check paranoia */
 	printk("virtual kernel memory layout:\n"
 	       "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+	       "    cpu area: 0x%08lx - 0x%08lx   (%4ld kb)\n"
 #ifdef CONFIG_HIGHMEM
 	       "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #endif
@@ -684,6 +685,8 @@ void __init mem_init(void)
 	       "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
 	       FIXADDR_START, FIXADDR_TOP,
 	       (FIXADDR_TOP - FIXADDR_START) >> 10,
+	       CPU_AREA_BASE, FIXADDR_START,
+	       (FIXADDR_START - CPU_AREA_BASE) >> 10,
 
 #ifdef CONFIG_HIGHMEM
 	       PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
Index: linux-2.6/include/asm-x86/pgtable_32.h
===================================================================
--- linux-2.6.orig/include/asm-x86/pgtable_32.h	2007-11-19 15:45:07.646390299 -0800
+++ linux-2.6/include/asm-x86/pgtable_32.h	2007-11-19 15:55:53.165640248 -0800
@@ -79,11 +79,14 @@ void paging_init(void);
 #define VMALLOC_START	(((unsigned long) high_memory + \
 			2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
 #ifdef CONFIG_HIGHMEM
-# define VMALLOC_END	(PKMAP_BASE-2*PAGE_SIZE)
+# define CPU_AREA_BASE	(PKMAP_BASE - NR_CPUS * \
+				(1 << (CONFIG_CPU_AREA_ORDER + PAGE_SHIFT)))
 #else
-# define VMALLOC_END	(FIXADDR_START-2*PAGE_SIZE)
+# define CPU_AREA_BASE	(FIXADDR_START - NR_CPUS * \
+				(1 << (CONFIG_CPU_AREA_ORDER + PAGE_SHIFT)))
 #endif
 
+#define VMALLOC_END	(CPU_AREA_BASE - 2 * PAGE_SIZE)
 /*
  * _PAGE_PSE set in the page directory entry just means that
  * the page directory entry points directly to a 4MB-aligned block of
Index: linux-2.6/arch/x86/kernel/vmlinux_32.lds.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/vmlinux_32.lds.S	2007-11-19 15:45:07.622390531 -0800
+++ linux-2.6/arch/x86/kernel/vmlinux_32.lds.S	2007-11-19 15:55:53.165640248 -0800
@@ -26,6 +26,7 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386"
 OUTPUT_ARCH(i386)
 ENTRY(phys_startup_32)
 jiffies = jiffies_64;
+cpu_area = CPU_AREA_BASE;
 
 PHDRS {
 	text PT_LOAD FLAGS(5);	/* R_E */
Index: linux-2.6/arch/x86/kernel/vmlinux_64.lds.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/vmlinux_64.lds.S	2007-11-19 15:45:07.630390395 -0800
+++ linux-2.6/arch/x86/kernel/vmlinux_64.lds.S	2007-11-19 15:55:53.165640248 -0800
@@ -6,6 +6,7 @@
 
 #include <asm-generic/vmlinux.lds.h>
 #include <asm/page.h>
+#include <asm/pgtable_64.h>
 
 #undef i386	/* in case the preprocessor is a 32bit one */
 
@@ -14,6 +15,8 @@ OUTPUT_ARCH(i386:x86-64)
 ENTRY(phys_startup_64)
 jiffies_64 = jiffies;
 _proxy_pda = 1;
+cpu_area = CPU_AREA_BASE;
+
 PHDRS {
 	text PT_LOAD FLAGS(5);	/* R_E */
 	data PT_LOAD FLAGS(7);	/* RWE */

-- 

  parent reply	other threads:[~2007-11-20  1:15 UTC|newest]

Thread overview: 120+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-11-20  1:11 [rfc 00/45] [RFC] CPU ops and a rework of per cpu data handling on x86_64 clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 01/45] ACPI: Avoid references to impossible processors clameter, Christoph Lameter
2007-11-20 12:47   ` Mathieu Desnoyers
2007-11-20 20:16     ` Christoph Lameter
2007-11-20 15:29   ` Andi Kleen
2007-11-20 20:18     ` Christoph Lameter
2007-11-20  1:11 ` [rfc 02/45] cpu alloc: Simple version of the allocator (static allocations) clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 03/45] Generic CPU operations: Core piece clameter, Christoph Lameter
2007-11-20  3:17   ` Mathieu Desnoyers
2007-11-20  3:30     ` Christoph Lameter
2007-11-20  4:07       ` Mathieu Desnoyers
2007-11-20 20:36         ` Christoph Lameter
2007-11-20  1:11 ` [rfc 04/45] cpu alloc: Use in SLUB clameter, Christoph Lameter
2007-11-20 12:42   ` Mathieu Desnoyers
2007-11-20 20:44     ` Christoph Lameter
2007-11-20 21:23       ` Mathieu Desnoyers
2007-11-20 21:36         ` Christoph Lameter
2007-11-20 21:43           ` Mathieu Desnoyers
2007-11-20  1:11 ` [rfc 05/45] cpu alloc: Remove SLUB fields clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 06/45] cpu alloc: page allocator conversion clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 07/45] cpu_alloc: Implement dynamically extendable cpu areas clameter, Christoph Lameter
2007-11-20  1:11 ` clameter, Christoph Lameter [this message]
2007-11-20  1:35   ` [rfc 08/45] cpu alloc: x86 support H. Peter Anvin
2007-11-20  2:02     ` Christoph Lameter
2007-11-20  2:18       ` H. Peter Anvin
2007-11-20  3:37       ` Nick Piggin
2007-11-20  3:59       ` Nick Piggin
2007-11-20 12:05         ` Andi Kleen
2007-11-20  3:16   ` Andi Kleen
2007-11-20  3:50     ` Christoph Lameter
2007-11-20 12:01       ` Andi Kleen
2007-11-20 20:35         ` Christoph Lameter
2007-11-20 20:59           ` Andi Kleen
2007-11-20 21:33             ` Christoph Lameter
2007-11-21  0:10               ` Christoph Lameter
2007-11-21  1:16                 ` Christoph Lameter
2007-11-21  1:36                   ` Andi Kleen
2007-11-21  2:08                     ` Christoph Lameter
2007-11-21 13:08                       ` Andi Kleen
2007-11-21 19:01                         ` Christoph Lameter
2007-11-20 20:43         ` H. Peter Anvin
2007-11-20 20:51           ` Andi Kleen
2007-11-20 20:58             ` Christoph Lameter
2007-11-20 21:06               ` H. Peter Anvin
2007-11-20 21:34                 ` Christoph Lameter
2007-11-20 21:01             ` H. Peter Anvin
2007-11-27  4:12         ` John Richard Moser
2007-11-20  1:11 ` [rfc 09/45] cpu alloc: IA64 support clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 10/45] cpu_alloc: Sparc64 support clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 11/45] cpu alloc: percpu_counter conversion clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 12/45] cpu alloc: crash_notes conversion clameter, Christoph Lameter
2007-11-20 13:03   ` Mathieu Desnoyers
2007-11-20 20:50     ` Christoph Lameter
2007-11-20  1:11 ` [rfc 13/45] cpu alloc: workqueue conversion clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 14/45] cpu alloc: ACPI cstate handling conversion clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 15/45] cpu alloc: genhd statistics conversion clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 16/45] cpu alloc: blktrace conversion clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 17/45] cpu alloc: SRCU clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 18/45] cpu alloc: XFS counters clameter, Christoph Lameter
2007-11-20  8:12   ` Christoph Hellwig
2007-11-20 20:38     ` Christoph Lameter
2007-11-21  4:47       ` David Chinner
2007-11-21  4:50         ` Christoph Lameter
2007-11-20  1:11 ` [rfc 19/45] cpu alloc: NFS statistics clameter, Christoph Lameter
2007-11-20 13:02   ` Mathieu Desnoyers
2007-11-20 20:49     ` Christoph Lameter
2007-11-20 20:56       ` Trond Myklebust
2007-11-20 21:28         ` Mathieu Desnoyers
2007-11-20 21:48           ` Trond Myklebust
2007-11-20 21:50             ` Mathieu Desnoyers
2007-11-20 22:46               ` Trond Myklebust
2007-11-21  0:53                 ` Mathieu Desnoyers
2007-11-20 21:26       ` Mathieu Desnoyers
2007-11-20  1:11 ` [rfc 20/45] cpu alloc: neigbour statistics clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 21/45] cpu alloc: tcp statistics clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 22/45] cpu alloc: convert scatches clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 23/45] cpu alloc: dmaengine conversion clameter, Christoph Lameter
2007-11-20 12:50   ` Mathieu Desnoyers
2007-11-20 20:46     ` Christoph Lameter
2007-11-20  1:11 ` [rfc 24/45] cpu alloc: convert loopback statistics clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 25/45] cpu alloc: veth conversion clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 26/45] cpu alloc: Chelsio statistics conversion clameter, Christoph Lameter
2007-11-20  1:11 ` [rfc 27/45] cpu alloc: convert mib handling to cpu alloc clameter, Christoph Lameter
2007-11-20  1:12 ` [rfc 28/45] cpu_alloc: convert network sockets clameter, Christoph Lameter
2007-11-20  1:12 ` [rfc 29/45] cpu alloc: Use for infiniband clameter, Christoph Lameter
2007-11-20  1:12 ` [rfc 30/45] cpu alloc: Use in the crypto subsystem clameter, Christoph Lameter
2007-11-20  1:12 ` [rfc 31/45] cpu alloc: Remove the allocpercpu functionality clameter, Christoph Lameter
2007-11-20  1:12 ` [rfc 32/45] Module handling: Use CPU_xx ops to dynamically allocate counters clameter, Christoph Lameter
2007-11-20  1:12 ` [rfc 33/45] x86_64: Use CPU ops for nmi alert counter clameter, Christoph Lameter
2007-11-20  1:12 ` [rfc 34/45] x86_64: Fold percpu area into the cpu area clameter, Christoph Lameter
2007-11-20  1:12 ` [rfc 35/45] X86_64: Declare pda as per cpu data thereby moving it " clameter, Christoph Lameter
2007-11-20  1:12 ` [rfc 36/45] X86_64: Place pda first in " clameter, Christoph Lameter
2007-11-20  1:12 ` [rfc 37/45] x86_64: Support for fast per cpu operations clameter, Christoph Lameter
2007-11-20  2:00   ` H. Peter Anvin
2007-11-20  2:03     ` Christoph Lameter
2007-11-20  2:15       ` H. Peter Anvin
2007-11-20  2:17     ` David Miller
2007-11-20  2:19       ` H. Peter Anvin
2007-11-20  3:23         ` Andi Kleen
2007-11-20  2:45     ` Paul Mackerras
2007-11-20  1:12 ` [rfc 38/45] x86_64: Remove obsolete per_cpu offset calculations clameter, Christoph Lameter
2007-11-20  1:12 ` [rfc 39/45] x86_64: Remove the data_offset field from the pda clameter, Christoph Lameter
2007-11-20  1:12 ` [rfc 40/45] x86_64: Provide per_cpu_var definition clameter, Christoph Lameter
2007-11-20  1:12 ` [rfc 41/45] VM statistics: Use CPU ops clameter, Christoph Lameter
2007-11-20  1:12 ` [rfc 43/45] x86_64: Add a CPU_OR to support or_pda() clameter, Christoph Lameter
2007-11-20  1:12 ` [rfc 44/45] Remove local_t support clameter, Christoph Lameter
2007-11-20 12:59   ` Mathieu Desnoyers
2007-11-20 20:48     ` Christoph Lameter
2007-11-20  1:12 ` [rfc 45/45] Modules: Hack to handle symbols that have a zero value clameter, Christoph Lameter
2007-11-20  2:20   ` Mathieu Desnoyers
2007-11-20  2:49     ` Christoph Lameter
2007-11-20  3:29       ` Mathieu Desnoyers
2007-11-20  1:18 ` [rfc 00/45] [RFC] CPU ops and a rework of per cpu data handling on x86_64 Christoph Lameter
2007-11-20  1:51 ` David Miller
2007-11-20  1:59   ` Christoph Lameter
2007-11-20  2:10     ` David Miller
2007-11-20  2:12       ` Christoph Lameter
2007-11-20  3:25   ` Andi Kleen
2007-11-20  3:33     ` Christoph Lameter
2007-11-20  4:04     ` David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20071120011333.619991903@sgi.com \
    --to=clameter@sgi.com \
    --cc=ak@suse.de \
    --cc=akpm@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox