From: Tejun Heo <tj@kernel.org>
To: linux-arch@vger.kernel.org, linux-kernel@vger.kernel.org,
mingo@redhat.com, benh@kernel.crashing.org, davem@davemloft.net,
dhowells@redhat.com, npiggin@suse.de, JBeulich@novell.com,
cl@linux-foundation.org, rusty@rustcorp.com.au, hpa@zytor.com,
tglx@linutronix.de, akpm@linux-foundation.org, x86@kernel.org,
andi@firstfloor.org
Cc: Tejun Heo <tj@kernel.org>, Ingo Molnar <mingo@elte.hu>
Subject: [PATCH 17/20] x86,percpu: use embedding for 64bit NUMA and page for 32bit NUMA
Date: Tue, 21 Jul 2009 19:26:16 +0900 [thread overview]
Message-ID: <1248171979-29166-18-git-send-email-tj@kernel.org> (raw)
In-Reply-To: <1248171979-29166-1-git-send-email-tj@kernel.org>
The embedding percpu first chunk allocator can now handle very sparse
unit mappings. Use the embedding allocator instead of lpage for 64bit
NUMA. This removes the extra TLB pressure and the need to do complex
and fragile dancing when changing page attributes.
For 32bit, using a very sparse unit mapping isn't a good idea because
the vmalloc space is very constrained. 32bit NUMA machines aren't
exactly the focus of optimization, and it isn't very clear whether
lpage performs better than page. Use the page first chunk allocator
for 32bit NUMAs.
As this leaves the setup_pcpu_*() functions pretty much empty, fold
them into setup_per_cpu_areas().
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <andi@firstfloor.org>
---
arch/x86/Kconfig | 4 -
arch/x86/kernel/setup_percpu.c | 155 +++++++--------------------------------
2 files changed, 28 insertions(+), 131 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5c0a3b4..96d2044 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -155,10 +155,6 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK
config NEED_PER_CPU_PAGE_FIRST_CHUNK
def_bool y
-config NEED_PER_CPU_LPAGE_FIRST_CHUNK
- def_bool y
- depends on NEED_MULTIPLE_NODES
-
config HAVE_CPUMASK_OF_CPU_MAP
def_bool X86_64_SMP
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 5b03d7e..4aff7a5 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
#define PERCPU_FIRST_CHUNK_RESERVE 0
#endif
+#ifdef CONFIG_X86_32
/**
* pcpu_need_numa - determine percpu allocation needs to consider NUMA
*
@@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void)
#endif
return false;
}
+#endif
/**
* pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
@@ -136,128 +138,23 @@ static void __init pcpu_fc_free(void *ptr, size_t size)
free_bootmem(__pa(ptr), size);
}
-/*
- * Large page remapping allocator
- */
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-static void __init pcpul_map(void *ptr, size_t size, void *addr)
-{
- pmd_t *pmd, pmd_v;
-
- pmd = populate_extra_pmd((unsigned long)addr);
- pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE);
- set_pmd(pmd, pmd_v);
-}
-
-static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to)
+static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
if (early_cpu_to_node(from) == early_cpu_to_node(to))
return LOCAL_DISTANCE;
else
return REMOTE_DISTANCE;
-}
-
-static int __init setup_pcpu_lpage(bool chosen)
-{
- size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
- size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
- struct pcpu_alloc_info *ai;
- int rc;
-
- /* on non-NUMA, embedding is better */
- if (!chosen && !pcpu_need_numa())
- return -EINVAL;
-
- /* need PSE */
- if (!cpu_has_pse) {
- pr_warning("PERCPU: lpage allocator requires PSE\n");
- return -EINVAL;
- }
-
- /* allocate and build unit_map */
- ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
- PMD_SIZE, pcpu_lpage_cpu_distance);
- if (IS_ERR(ai)) {
- pr_warning("PERCPU: failed to build unit_map (%ld)\n",
- PTR_ERR(ai));
- return PTR_ERR(ai);
- }
-
- /* do the parameters look okay? */
- if (!chosen) {
- size_t vm_size = VMALLOC_END - VMALLOC_START;
- size_t tot_size = 0;
- int group;
-
- for (group = 0; group < ai->nr_groups; group++)
- tot_size += ai->unit_size * ai->groups[group].nr_units;
-
- /* don't consume more than 20% of vmalloc area */
- if (tot_size > vm_size / 5) {
- pr_info("PERCPU: too large chunk size %zuMB for "
- "large page remap\n", tot_size >> 20);
- rc = -EINVAL;
- goto out_free;
- }
- }
-
- rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
-out_free:
- pcpu_free_alloc_info(ai);
- return rc;
-}
#else
-static int __init setup_pcpu_lpage(bool chosen)
-{
- return -EINVAL;
-}
+ return LOCAL_DISTANCE;
#endif
-
-/*
- * Embedding allocator
- *
- * The first chunk is sized to just contain the static area plus
- * module and dynamic reserves and embedded into linear physical
- * mapping so that it can use PMD mapping without additional TLB
- * pressure.
- */
-static int __init setup_pcpu_embed(bool chosen)
-{
- size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
-
- /*
- * If large page isn't supported, there's no benefit in doing
- * this. Also, embedding allocation doesn't play well with
- * NUMA.
- */
- if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
- return -EINVAL;
-
- return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
- reserve - PERCPU_FIRST_CHUNK_RESERVE,
- PAGE_SIZE, NULL, pcpu_fc_alloc,
- pcpu_fc_free);
}
-/*
- * Page allocator
- *
- * Boring fallback 4k page allocator. This allocator puts more
- * pressure on PTE TLBs but other than that behaves nicely on both UMA
- * and NUMA.
- */
static void __init pcpup_populate_pte(unsigned long addr)
{
populate_extra_pte(addr);
}
-static int __init setup_pcpu_page(void)
-{
- return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
- pcpu_fc_alloc, pcpu_fc_free,
- pcpup_populate_pte);
-}
-
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
@@ -281,30 +178,34 @@ void __init setup_per_cpu_areas(void)
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
/*
- * Allocate percpu area. If PSE is supported, try to make use
- * of large page mappings. Please read comments on top of
- * each allocator for details.
+ * Allocate percpu area. Embedding allocator is our favorite;
+ * however, on NUMA configurations, it can result in very
+ * sparse unit mapping and vmalloc area isn't spacious enough
+ * on 32bit. Use page in that case.
*/
+#ifdef CONFIG_X86_32
+ if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
+ pcpu_chosen_fc = PCPU_FC_PAGE;
+#endif
rc = -EINVAL;
- if (pcpu_chosen_fc != PCPU_FC_AUTO) {
- if (pcpu_chosen_fc != PCPU_FC_PAGE) {
- if (pcpu_chosen_fc == PCPU_FC_LPAGE)
- rc = setup_pcpu_lpage(true);
- else
- rc = setup_pcpu_embed(true);
-
- if (rc < 0)
- pr_warning("PERCPU: %s allocator failed (%d), "
- "falling back to page\n",
- pcpu_fc_names[pcpu_chosen_fc], rc);
- }
- } else {
- rc = setup_pcpu_lpage(false);
+ if (pcpu_chosen_fc != PCPU_FC_PAGE) {
+ const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
+ const size_t dyn_size = PERCPU_MODULE_RESERVE +
+ PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+
+ rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+ dyn_size, atom_size,
+ pcpu_cpu_distance,
+ pcpu_fc_alloc, pcpu_fc_free);
if (rc < 0)
- rc = setup_pcpu_embed(false);
+ pr_warning("PERCPU: %s allocator failed (%d), "
+ "falling back to page\n",
+ pcpu_fc_names[pcpu_chosen_fc], rc);
}
if (rc < 0)
- rc = setup_pcpu_page();
+ rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+ pcpu_fc_alloc, pcpu_fc_free,
+ pcpup_populate_pte);
if (rc < 0)
panic("cannot initialize percpu area (err=%d)", rc);
--
1.6.0.2
next prev parent reply other threads:[~2009-07-21 10:30 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-07-21 10:25 [PATCHSET percpu#for-next] implement and use sparse embedding first chunk allocator Tejun Heo
2009-07-21 10:26 ` [PATCH 01/20] percpu: fix pcpu_reclaim() locking Tejun Heo
2009-07-21 21:41 ` Christoph Lameter
2009-07-21 10:26 ` [PATCH 02/20] percpu: improve boot messages Tejun Heo
2009-07-21 21:43 ` Christoph Lameter
2009-07-21 10:26 ` [PATCH 03/20] percpu: rename 4k first chunk allocator to page Tejun Heo
2009-07-21 21:47 ` Christoph Lameter
2009-07-22 4:38 ` Tejun Heo
2009-07-21 10:26 ` [PATCH 04/20] percpu: build first chunk allocators selectively Tejun Heo
2009-07-21 10:26 ` [PATCH 05/20] percpu: generalize first chunk allocator selection Tejun Heo
2009-07-21 10:26 ` [PATCH 06/20] percpu: drop @static_size from first chunk allocators Tejun Heo
2009-07-21 10:26 ` [PATCH 07/20] percpu: make @dyn_size mandatory for pcpu_setup_first_chunk() Tejun Heo
2009-07-21 10:26 ` [PATCH 08/20] percpu: add @align to pcpu_fc_alloc_fn_t Tejun Heo
2009-07-21 10:26 ` [PATCH 09/20] percpu: move pcpu_lpage_build_unit_map() and pcpul_lpage_dump_cfg() upward Tejun Heo
2009-07-21 10:26 ` [PATCH 10/20] percpu: introduce pcpu_alloc_info and pcpu_group_info Tejun Heo
2009-07-21 10:26 ` [PATCH 11/20] percpu: add pcpu_unit_offsets[] Tejun Heo
2009-07-21 10:26 ` [PATCH 12/20] percpu: add chunk->base_addr Tejun Heo
2009-07-21 10:26 ` [PATCH 13/20] vmalloc: separate out insert_vmalloc_vm() Tejun Heo
2009-07-21 10:26 ` [PATCH 14/20] vmalloc: implement pcpu_get_vm_areas() Tejun Heo
2009-08-14 6:07 ` Tejun Heo
2009-07-21 10:26 ` [PATCH 15/20] percpu: use group information to allocate vmap areas sparsely Tejun Heo
2009-07-21 10:26 ` [PATCH 16/20] percpu: update embedding first chunk allocator to handle sparse units Tejun Heo
2009-07-21 10:26 ` Tejun Heo [this message]
2009-07-21 10:26 ` [PATCH 18/20] percpu: kill lpage first chunk allocator Tejun Heo
2009-07-21 10:26 ` [PATCH 19/20] sparc64: use embedding percpu " Tejun Heo
2009-07-22 3:54 ` David Miller
2009-07-21 10:26 ` [PATCH 20/20] powerpc64: convert to dynamic percpu allocator Tejun Heo
2009-07-21 12:22 ` [RFC PATCH] percpu: kill legacy " Tejun Heo
2009-07-22 4:30 ` Rusty Russell
2009-08-14 6:09 ` [PATCHSET percpu#for-next] implement and use sparse embedding first chunk allocator Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1248171979-29166-18-git-send-email-tj@kernel.org \
--to=tj@kernel.org \
--cc=JBeulich@novell.com \
--cc=akpm@linux-foundation.org \
--cc=andi@firstfloor.org \
--cc=benh@kernel.crashing.org \
--cc=cl@linux-foundation.org \
--cc=davem@davemloft.net \
--cc=dhowells@redhat.com \
--cc=hpa@zytor.com \
--cc=linux-arch@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=mingo@redhat.com \
--cc=npiggin@suse.de \
--cc=rusty@rustcorp.com.au \
--cc=tglx@linutronix.de \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox