All of lore.kernel.org
 help / color / mirror / Atom feed
From: Yinghai Lu <yinghai@kernel.org>
To: Thomas Gleixner <tglx@linutronix.de>, Ingo Molnar <mingo@elte.hu>,
	"H. Peter Anvin" <hpa@zytor.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Thomas Renninger <trenn@suse.de>,
	Tang Chen <tangchen@cn.fujitsu.com>
Cc: linux-kernel@vger.kernel.org, Yinghai Lu <yinghai@kernel.org>,
	Tejun Heo <tj@kernel.org>, Pekka Enberg <penberg@kernel.org>,
	Jacob Shin <jacob.shin@amd.com>,
	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Subject: [PATCH 14/14] x86, mm: Put pagetable on local node ram
Date: Thu,  7 Mar 2013 20:58:40 -0800	[thread overview]
Message-ID: <1362718720-27048-15-git-send-email-yinghai@kernel.org> (raw)
In-Reply-To: <1362718720-27048-1-git-send-email-yinghai@kernel.org>

If a node with RAM is hotpluggable, the memory used for its page tables
and vmemmap should be allocated from that node's own RAM.

This patch is a refreshed version of
| commit 1411e0ec3123ae4c4ead6bfc9fe3ee5a3ae5c327
| Date:   Mon Dec 27 16:48:17 2010 -0800
|
|    x86-64, numa: Put pgtable to local node memory
That commit was later reverted.

We now have a reason to reintroduce it: it is needed to make memory
hotplug work.

Move the call to init_mem_mapping() into early_initmem_init() and call it
separately for each node, after the NUMA info has been parsed there.

The first node is treated as the low range.
alloc_low_pages() needs to be reworked to allocate page tables in the
following order:
	BRK, local node, low range

We still call load_cr3() only once; otherwise we would break 64-bit Xen
again.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Jacob Shin <jacob.shin@amd.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/pgtable.h |    2 +-
 arch/x86/kernel/setup.c        |    1 -
 arch/x86/mm/init.c             |   83 ++++++++++++++++++++++------------------
 arch/x86/mm/init_32.c          |    8 ++++
 arch/x86/mm/init_64.c          |    9 +++++
 arch/x86/mm/numa.c             |   56 +++++++++++++++++++++++++++
 6 files changed, 119 insertions(+), 40 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1e67223..868687c 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -621,7 +621,7 @@ static inline int pgd_none(pgd_t pgd)
 #ifndef __ASSEMBLY__
 
 extern int direct_gbpages;
-void init_mem_mapping(void);
+void init_mem_mapping(unsigned long begin, unsigned long end);
 void early_alloc_pgt_buf(void);
 
 /* local pte updates need not use xchg for locking */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 29a6b94..37d993f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1103,7 +1103,6 @@ void __init setup_arch(char **cmdline_p)
 	acpi_boot_table_init();
 	early_acpi_boot_init();
 	early_initmem_init();
-	init_mem_mapping();
 	memblock.current_limit = get_max_mapped();
 	early_trap_pf_init();
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index abcc241..2838bb5 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -24,7 +24,10 @@ static unsigned long __initdata pgt_buf_start;
 static unsigned long __initdata pgt_buf_end;
 static unsigned long __initdata pgt_buf_top;
 
-static unsigned long min_pfn_mapped;
+static unsigned long low_min_pfn_mapped;
+static unsigned long low_max_pfn_mapped;
+static unsigned long local_min_pfn_mapped;
+static unsigned long local_max_pfn_mapped;
 
 static bool __initdata can_use_brk_pgt = true;
 
@@ -52,10 +55,17 @@ __ref void *alloc_low_pages(unsigned int num)
 
 	if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
 		unsigned long ret;
-		if (min_pfn_mapped >= max_pfn_mapped)
-			panic("alloc_low_page: ran out of memory");
-		ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
-					max_pfn_mapped << PAGE_SHIFT,
+		if (local_min_pfn_mapped >= local_max_pfn_mapped) {
+			if (low_min_pfn_mapped >= low_max_pfn_mapped)
+				panic("alloc_low_page: ran out of memory");
+			ret = memblock_find_in_range(
+					low_min_pfn_mapped << PAGE_SHIFT,
+					low_max_pfn_mapped << PAGE_SHIFT,
+					PAGE_SIZE * num , PAGE_SIZE);
+		} else
+			ret = memblock_find_in_range(
+					local_min_pfn_mapped << PAGE_SHIFT,
+					local_max_pfn_mapped << PAGE_SHIFT,
 					PAGE_SIZE * num , PAGE_SIZE);
 		if (!ret)
 			panic("alloc_low_page: can not alloc memory");
@@ -387,67 +397,64 @@ static unsigned long __init init_range_memory_mapping(
 
 /* (PUD_SHIFT-PMD_SHIFT)/2 */
 #define STEP_SIZE_SHIFT 5
-void __init init_mem_mapping(void)
+void __init init_mem_mapping(unsigned long begin, unsigned long end)
 {
-	unsigned long end, real_end, start, last_start;
+	unsigned long real_end, start, last_start;
 	unsigned long step_size;
 	unsigned long addr;
 	unsigned long mapped_ram_size = 0;
 	unsigned long new_mapped_ram_size;
+	bool is_low = false;
+
+	if (!begin) {
+		probe_page_size_mask();
+		/* the ISA range is always mapped regardless of memory holes */
+		init_memory_mapping(0, ISA_END_ADDRESS);
+		begin = ISA_END_ADDRESS;
+		is_low = true;
+	}
 
-	probe_page_size_mask();
-
-#ifdef CONFIG_X86_64
-	end = max_pfn << PAGE_SHIFT;
-#else
-	end = max_low_pfn << PAGE_SHIFT;
-#endif
-
-	/* the ISA range is always mapped regardless of memory holes */
-	init_memory_mapping(0, ISA_END_ADDRESS);
+	if (begin >= end)
+		return;
 
 	/* xen has big range in reserved near end of ram, skip it at first.*/
-	addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, PMD_SIZE);
+	addr = memblock_find_in_range(begin, end, PMD_SIZE, PMD_SIZE);
 	real_end = addr + PMD_SIZE;
 
 	/* step_size need to be small so pgt_buf from BRK could cover it */
 	step_size = PMD_SIZE;
-	max_pfn_mapped = 0; /* will get exact value next */
-	min_pfn_mapped = real_end >> PAGE_SHIFT;
+	local_max_pfn_mapped = begin >> PAGE_SHIFT;
+	local_min_pfn_mapped = real_end >> PAGE_SHIFT;
 	last_start = start = real_end;
-	while (last_start > ISA_END_ADDRESS) {
+	while (last_start > begin) {
 		if (last_start > step_size) {
 			start = round_down(last_start - 1, step_size);
-			if (start < ISA_END_ADDRESS)
-				start = ISA_END_ADDRESS;
+			if (start < begin)
+				start = begin;
 		} else
-			start = ISA_END_ADDRESS;
+			start = begin;
 		new_mapped_ram_size = init_range_memory_mapping(start,
 							last_start);
+		if ((last_start >> PAGE_SHIFT) > local_max_pfn_mapped)
+			local_max_pfn_mapped = last_start >> PAGE_SHIFT;
+		local_min_pfn_mapped = start >> PAGE_SHIFT;
 		last_start = start;
-		min_pfn_mapped = last_start >> PAGE_SHIFT;
 		/* only increase step_size after big range get mapped */
 		if (new_mapped_ram_size > mapped_ram_size)
 			step_size <<= STEP_SIZE_SHIFT;
 		mapped_ram_size += new_mapped_ram_size;
 	}
 
-	if (real_end < end)
+	if (real_end < end) {
 		init_range_memory_mapping(real_end, end);
-
-#ifdef CONFIG_X86_64
-	if (max_pfn > max_low_pfn) {
-		/* can we preseve max_low_pfn ?*/
-		max_low_pfn = max_pfn;
+		if ((end >> PAGE_SHIFT) > local_max_pfn_mapped)
+			local_max_pfn_mapped = end >> PAGE_SHIFT;
 	}
-#else
-	early_ioremap_page_table_range_init();
-#endif
 
-	load_cr3(swapper_pg_dir);
-	__flush_tlb_all();
-
-	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
+	if (is_low) {
+		low_min_pfn_mapped = local_min_pfn_mapped;
+		low_max_pfn_mapped = local_max_pfn_mapped;
+	}
 }
 
 /*
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 3801962..37e5768 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -662,6 +662,14 @@ void __init find_low_pfn_range(void)
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 void __init early_initmem_init(void)
 {
+	init_mem_mapping(0, max_low_pfn<<PAGE_SHIFT);
+
+	early_ioremap_page_table_range_init();
+
+	load_cr3(swapper_pg_dir);
+	__flush_tlb_all();
+
+	early_memtest(0, max_pfn_mapped<<PAGE_SHIFT);
 }
 void __init initmem_init(void)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 218a4e5..a15db8a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -642,6 +642,15 @@ kernel_physical_mapping_init(unsigned long start,
 #ifndef CONFIG_NUMA
 void __init early_initmem_init(void)
 {
+	init_mem_mapping(0, max_pfn<<PAGE_SHIFT);
+
+	if (max_pfn > max_low_pfn)
+		max_low_pfn = max_pfn;
+
+	load_cr3(swapper_pg_dir);
+	__flush_tlb_all();
+
+	early_memtest(0, max_pfn_mapped<<PAGE_SHIFT);
 }
 void __init initmem_init(void)
 {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 643b39a..0aeb980 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -17,8 +17,10 @@
 #include <asm/dma.h>
 #include <asm/acpi.h>
 #include <asm/amd_nb.h>
+#include <asm/tlbflush.h>
 
 #include "numa_internal.h"
+#include "mm_internal.h"
 
 int __initdata numa_off;
 nodemask_t numa_nodes_parsed __initdata;
@@ -673,9 +675,63 @@ static void __init early_x86_numa_init(void)
 	numa_init(dummy_numa_init);
 }
 
+#ifdef CONFIG_X86_64
+static void __init early_x86_numa_init_mapping(void)
+{
+	unsigned long last_start = 0, last_end = 0;
+	struct numa_meminfo *mi = &numa_meminfo;
+	unsigned long start, end;
+	int last_nid = -1;
+	int i, nid;
+
+	for (i = 0; i < mi->nr_blks; i++) {
+		nid   = mi->blk[i].nid;
+		start = mi->blk[i].start;
+		end   = mi->blk[i].end;
+
+		if (last_nid == nid) {
+			last_end = end;
+			continue;
+		}
+
+		/* other nid now */
+		if (last_nid >= 0) {
+			printk(KERN_DEBUG "Node %d: [mem %#016lx-%#016lx]\n",
+					last_nid, last_start, last_end - 1);
+			init_mem_mapping(last_start, last_end);
+		}
+
+		/* for next nid */
+		last_nid   = nid;
+		last_start = start;
+		last_end   = end;
+	}
+	/* last one */
+	printk(KERN_DEBUG "Node %d: [mem %#016lx-%#016lx]\n",
+			last_nid, last_start, last_end - 1);
+	init_mem_mapping(last_start, last_end);
+
+	if (max_pfn > max_low_pfn)
+		max_low_pfn = max_pfn;
+}
+#else
+static void __init early_x86_numa_init_mapping(void)
+{
+	init_mem_mapping(0, max_low_pfn<<PAGE_SHIFT);
+	early_ioremap_page_table_range_init();
+}
+#endif
+
 void __init early_initmem_init(void)
 {
 	early_x86_numa_init();
+
+	early_x86_numa_init_mapping();
+
+	load_cr3(swapper_pg_dir);
+	__flush_tlb_all();
+
+	early_memtest(0, max_pfn_mapped<<PAGE_SHIFT);
 }
 
 void __init x86_numa_init(void)
-- 
1.7.10.4


  parent reply	other threads:[~2013-03-08  4:59 UTC|newest]

Thread overview: 57+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-03-08  4:58 [PATCH 00/14] x86, ACPI, numa: Parse numa info early Yinghai Lu
2013-03-08  4:58 ` [PATCH 01/14] x86, ACPI, mm: Kill max_low_pfn_mapped Yinghai Lu
2013-03-08  4:58   ` Yinghai Lu
2013-03-08  5:10   ` Tejun Heo
2013-03-08  5:10     ` Tejun Heo
2013-03-08  5:22     ` Yinghai Lu
2013-03-08  5:25       ` Tejun Heo
2013-03-08  5:27         ` Yinghai Lu
2013-03-08  5:28           ` Tejun Heo
2013-03-08  6:09             ` H. Peter Anvin
2013-03-11 22:50               ` Daniel Vetter
2013-03-11 23:09                 ` Chris Wilson
2013-03-12  1:51                 ` H. Peter Anvin
2013-03-08  4:58 ` [PATCH 02/14] x86, ACPI: Split find/copy from acpi_initrd_override Yinghai Lu
2013-03-08  5:33   ` Tejun Heo
2013-03-08  6:47     ` Yinghai Lu
2013-03-08  4:58 ` [PATCH 03/14] x86, ACPI: store override acpi tables phys addr Yinghai Lu
2013-03-08  5:36   ` Tejun Heo
2013-03-08  6:49     ` Yinghai Lu
2013-03-08  7:08       ` Tejun Heo
2013-03-08  4:58 ` [PATCH 04/14] x86, ACPI: make acpi override finding work with 32bit flat mode Yinghai Lu
2013-03-08  5:50   ` Tejun Heo
2013-03-08  6:57     ` Yinghai Lu
2013-03-08  7:06       ` Tejun Heo
2013-03-08  7:25         ` Yinghai Lu
2013-03-08  7:28           ` Tejun Heo
2013-03-08  7:16       ` Andrew Morton
2013-03-08 21:25       ` Thomas Gleixner
2013-03-08  4:58 ` [PATCH 05/14] x86, ACPI: Find acpi tables in initrd early at head_32.S/head64.c Yinghai Lu
2013-03-08  5:57   ` Tejun Heo
2013-03-08  7:02     ` Yinghai Lu
2013-03-08  7:07       ` Tejun Heo
2013-03-08  4:58 ` [PATCH 06/14] x86, mm, numa: Move successful path handling code later Yinghai Lu
2013-03-08  6:04   ` Tejun Heo
2013-03-08  7:03     ` Yinghai Lu
2013-03-08  4:58 ` [PATCH 07/14] x86, mm, numa: call numa_meminfo_cover_memory() early Yinghai Lu
2013-03-08  4:58 ` [PATCH 08/14] x86, mm, numa: use numa_meminfo to check node_map_pfn alignment Yinghai Lu
2013-03-08  6:26   ` Tejun Heo
2013-03-08  7:05     ` Yinghai Lu
2013-03-08  4:58 ` [PATCH 09/14] x86, mm, numa: set memblock nid later Yinghai Lu
2013-03-08  6:28   ` Tejun Heo
2013-03-08  7:11     ` Yinghai Lu
2013-03-08  4:58 ` [PATCH 10/14] x86, mm, numa: Move emulation handling down Yinghai Lu
2013-03-08  6:42   ` Tejun Heo
2013-03-08  7:13     ` Yinghai Lu
2013-03-08  4:58 ` [PATCH 11/14] x86, acpi, numa: split SLIT handling out Yinghai Lu
2013-03-08  6:46   ` Tejun Heo
2013-03-08  7:18     ` Yinghai Lu
2013-03-08  7:19       ` Tejun Heo
2013-03-08  7:33         ` Yinghai Lu
2013-03-08  4:58 ` [PATCH 12/14] x86, mm, numa: Add early_initmem_init() stub Yinghai Lu
2013-03-08  4:58 ` [PATCH 13/14] x86, mm: Parse numa info early Yinghai Lu
2013-03-08  4:58 ` Yinghai Lu [this message]
2013-03-08  7:01   ` [PATCH 14/14] x86, mm: Put pagetable on local node ram Tejun Heo
2013-03-08  7:44     ` Yinghai Lu
2013-03-08  8:20   ` Tang Chen
2013-03-08 17:25     ` Yinghai Lu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1362718720-27048-15-git-send-email-yinghai@kernel.org \
    --to=yinghai@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=hpa@zytor.com \
    --cc=jacob.shin@amd.com \
    --cc=konrad.wilk@oracle.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=penberg@kernel.org \
    --cc=tangchen@cn.fujitsu.com \
    --cc=tglx@linutronix.de \
    --cc=tj@kernel.org \
    --cc=trenn@suse.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.