LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH 7/8] mm: use vm_unmapped_area() on powerpc architecture
From: Michel Lespinasse @ 2013-01-09 11:23 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Rik van Riel, Tony Luck, linux-ia64, linux-parisc,
	James E.J. Bottomley, linux-kernel, David Howells, linux-mm,
	linux-alpha, Matt Turner, linuxppc-dev, Andrew Morton
In-Reply-To: <1357702376.4838.32.camel@pasglop>

On Wed, Jan 09, 2013 at 02:32:56PM +1100, Benjamin Herrenschmidt wrote:
> Ok. I think at least you can move that construct:
> 
> +               if (addr < SLICE_LOW_TOP) {
> +                       slice = GET_LOW_SLICE_INDEX(addr);
> +                       addr = (slice + 1) << SLICE_LOW_SHIFT;
> +                       if (!(available.low_slices & (1u << slice)))
> +                               continue;
> +               } else {
> +                       slice = GET_HIGH_SLICE_INDEX(addr);
> +                       addr = (slice + 1) << SLICE_HIGH_SHIFT;
> +                       if (!(available.high_slices & (1u << slice)))
> +                               continue;
> +               }
> 
> Into some kind of helper. It will probably compile to the same thing but
> at least it's more readable and it will avoid a fuckup in the future if
> somebody changes the algorithm and forgets to update one of the
> copies :-)

All right, does the following look more palatable then ?
(didn't re-test it, though)

Signed-off-by: Michel Lespinasse <walken@google.com>

---
 arch/powerpc/mm/slice.c |  123 ++++++++++++++++++++++++++++++-----------------
 1 files changed, 78 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 999a74f25ebe..3e99c149271a 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -237,36 +237,69 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
 #endif
 }
 
+/*
+ * Compute which slice addr is part of;
+ * set *boundary_addr to the start or end boundary of that slice
+ * (depending on 'end' parameter);
+ * return boolean indicating if the slice is marked as available in the
+ * 'available' slice_mark.
+ */
+static bool slice_scan_available(unsigned long addr,
+				 struct slice_mask available,
+				 int end,
+				 unsigned long *boundary_addr)
+{
+	unsigned long slice;
+	if (addr < SLICE_LOW_TOP) {
+		slice = GET_LOW_SLICE_INDEX(addr);
+		*boundary_addr = (slice + end) << SLICE_LOW_SHIFT;
+		return !!(available.low_slices & (1u << slice));
+	} else {
+		slice = GET_HIGH_SLICE_INDEX(addr);
+		*boundary_addr = (slice + end) ?
+			((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
+		return !!(available.high_slices & (1u << slice));
+	}
+}
+
 static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 					      unsigned long len,
 					      struct slice_mask available,
 					      int psize)
 {
-	struct vm_area_struct *vma;
-	unsigned long addr;
-	struct slice_mask mask;
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+	unsigned long addr, found, next_end;
+	struct vm_unmapped_area_info info;
 
-	addr = TASK_UNMAPPED_BASE;
-
-	for (;;) {
-		addr = _ALIGN_UP(addr, 1ul << pshift);
-		if ((TASK_SIZE - len) < addr)
-			break;
-		vma = find_vma(mm, addr);
-		BUG_ON(vma && (addr >= vma->vm_end));
+	info.flags = 0;
+	info.length = len;
+	info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+	info.align_offset = 0;
 
-		mask = slice_range_to_mask(addr, len);
-		if (!slice_check_fit(mask, available)) {
-			if (addr < SLICE_LOW_TOP)
-				addr = _ALIGN_UP(addr + 1,  1ul << SLICE_LOW_SHIFT);
-			else
-				addr = _ALIGN_UP(addr + 1,  1ul << SLICE_HIGH_SHIFT);
+	addr = TASK_UNMAPPED_BASE;
+	while (addr < TASK_SIZE) {
+		info.low_limit = addr;
+		if (!slice_scan_available(addr, available, 1, &addr))
 			continue;
+
+ next_slice:
+		/*
+		 * At this point [info.low_limit; addr) covers
+		 * available slices only and ends at a slice boundary.
+		 * Check if we need to reduce the range, or if we can
+		 * extend it to cover the next available slice.
+		 */
+		if (addr >= TASK_SIZE)
+			addr = TASK_SIZE;
+		else if (slice_scan_available(addr, available, 1, &next_end)) {
+			addr = next_end;
+			goto next_slice;
 		}
-		if (!vma || addr + len <= vma->vm_start)
-			return addr;
-		addr = vma->vm_end;
+		info.high_limit = addr;
+
+		found = vm_unmapped_area(&info);
+		if (!(found & ~PAGE_MASK))
+			return found;
 	}
 
 	return -ENOMEM;
@@ -277,39 +310,39 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
 					     struct slice_mask available,
 					     int psize)
 {
-	struct vm_area_struct *vma;
-	unsigned long addr;
-	struct slice_mask mask;
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+	unsigned long addr, found, prev;
+	struct vm_unmapped_area_info info;
 
-	addr = mm->mmap_base;
-	while (addr > len) {
-		/* Go down by chunk size */
-		addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
+	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+	info.length = len;
+	info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+	info.align_offset = 0;
 
-		/* Check for hit with different page size */
-		mask = slice_range_to_mask(addr, len);
-		if (!slice_check_fit(mask, available)) {
-			if (addr < SLICE_LOW_TOP)
-				addr = _ALIGN_DOWN(addr, 1ul << SLICE_LOW_SHIFT);
-			else if (addr < (1ul << SLICE_HIGH_SHIFT))
-				addr = SLICE_LOW_TOP;
-			else
-				addr = _ALIGN_DOWN(addr, 1ul << SLICE_HIGH_SHIFT);
+	addr = mm->mmap_base;
+	while (addr > PAGE_SIZE) {
+		info.high_limit = addr;
+		if (!slice_scan_available(addr - 1, available, 0, &addr))
 			continue;
-		}
 
+ prev_slice:
 		/*
-		 * Lookup failure means no vma is above this address,
-		 * else if new region fits below vma->vm_start,
-		 * return with success:
+		 * At this point [addr; info.high_limit) covers
+		 * available slices only and starts at a slice boundary.
+		 * Check if we need to reduce the range, or if we can
+		 * extend it to cover the previous available slice.
 		 */
-		vma = find_vma(mm, addr);
-		if (!vma || (addr + len) <= vma->vm_start)
-			return addr;
+		if (addr < PAGE_SIZE)
+			addr = PAGE_SIZE;
+		else if (slice_scan_available(addr - 1, available, 0, &prev)) {
+			addr = prev;
+			goto prev_slice;
+		}
+		info.low_limit = addr;
 
-		/* try just below the current vma->vm_start */
-		addr = vma->vm_start;
+		found = vm_unmapped_area(&info);
+		if (!(found & ~PAGE_MASK))
+			return found;
 	}
 
 	/*

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

^ permalink raw reply related

* [PATCH v6 15/15] memory-hotplug: Do not allocate pdgat if it was not freed when offline.
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

Since there is no way to guarentee the address of pgdat/zone is not
on stack of any kernel threads or used by other kernel objects
without reference counting or other symchronizing method, we cannot
reset node_data and free pgdat when offlining a node. Just reset pgdat
to 0 and reuse the memory when the node is online again.

The problem is suggested by Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
The idea is from Wen Congyang <wency@cn.fujitsu.com>

NOTE: If we don't reset pgdat to 0, the WARN_ON in free_area_init_node()
      will be triggered.

Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Reviewed-by: Wen Congyang <wency@cn.fujitsu.com>
---
 mm/memory_hotplug.c |   20 ++++++++++++--------
 1 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8b67752..8aa2b56 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1015,11 +1015,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 	unsigned long zholes_size[MAX_NR_ZONES] = {0};
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 
-	pgdat = arch_alloc_nodedata(nid);
-	if (!pgdat)
-		return NULL;
+	pgdat = NODE_DATA(nid);
+	if (!pgdat) {
+		pgdat = arch_alloc_nodedata(nid);
+		if (!pgdat)
+			return NULL;
 
-	arch_refresh_nodedata(nid, pgdat);
+		arch_refresh_nodedata(nid, pgdat);
+	}
 
 	/* we can use NODE_DATA(nid) from here */
 
@@ -1072,7 +1075,7 @@ out:
 int __ref add_memory(int nid, u64 start, u64 size)
 {
 	pg_data_t *pgdat = NULL;
-	int new_pgdat = 0;
+	int new_pgdat = 0, new_node = 0;
 	struct resource *res;
 	int ret;
 
@@ -1083,12 +1086,13 @@ int __ref add_memory(int nid, u64 start, u64 size)
 	if (!res)
 		goto out;
 
-	if (!node_online(nid)) {
+	new_pgdat = NODE_DATA(nid) ? 0 : 1;
+	new_node = node_online(nid) ? 0 : 1;
+	if (new_node) {
 		pgdat = hotadd_new_pgdat(nid, start);
 		ret = -ENOMEM;
 		if (!pgdat)
 			goto error;
-		new_pgdat = 1;
 	}
 
 	/* call arch's memory hotadd */
@@ -1100,7 +1104,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
 	/* we online node here. we can't roll back from here. */
 	node_set_online(nid);
 
-	if (new_pgdat) {
+	if (new_node) {
 		ret = register_one_node(nid);
 		/*
 		 * If sysfs file of new node can't create, cpu on the node
-- 
1.7.1

^ permalink raw reply related

* [PATCH v6 11/15] memory-hotplug: Integrated __remove_section() of CONFIG_SPARSEMEM_VMEMMAP.
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

Currently __remove_section for SPARSEMEM_VMEMMAP does nothing. But even if
we use SPARSEMEM_VMEMMAP, we can unregister the memory_section.

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
---
 mm/memory_hotplug.c |   11 -----------
 1 files changed, 0 insertions(+), 11 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 674e791..b20c4c7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -430,16 +430,6 @@ static int __meminit __add_section(int nid, struct zone *zone,
 	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
 }
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static int __remove_section(struct zone *zone, struct mem_section *ms)
-{
-	/*
-	 * XXX: Freeing memmap with vmemmap is not implement yet.
-	 *      This should be removed later.
-	 */
-	return -EBUSY;
-}
-#else
 static int __remove_section(struct zone *zone, struct mem_section *ms)
 {
 	int ret = -EINVAL;
@@ -454,7 +444,6 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
 	sparse_remove_one_section(zone, ms);
 	return 0;
 }
-#endif
 
 /*
  * Reasonably generic function for adding memory.  It is
-- 
1.7.1

^ permalink raw reply related

* [PATCH v6 14/15] memory-hotplug: free node_data when a node is offlined
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

From: Wen Congyang <wency@cn.fujitsu.com>

We call hotadd_new_pgdat() to allocate memory to store node_data. So we
should free it when removing a node.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Reviewed-by: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 mm/memory_hotplug.c |   30 +++++++++++++++++++++++++++---
 1 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a8703f7..8b67752 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1699,9 +1699,12 @@ static int check_cpu_on_node(void *data)
 /* offline the node if all memory sections of this node are removed */
 static void try_offline_node(int nid)
 {
-	unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn;
-	unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_spanned_pages;
+	pg_data_t *pgdat = NODE_DATA(nid);
+	unsigned long start_pfn = pgdat->node_start_pfn;
+	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
 	unsigned long pfn;
+	struct page *pgdat_page = virt_to_page(pgdat);
+	int i;
 
 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
 		unsigned long section_nr = pfn_to_section_nr(pfn);
@@ -1719,7 +1722,7 @@ static void try_offline_node(int nid)
 		return;
 	}
 
-	if (stop_machine(check_cpu_on_node, NODE_DATA(nid), NULL))
+	if (stop_machine(check_cpu_on_node, pgdat, NULL))
 		return;
 
 	/*
@@ -1728,6 +1731,27 @@ static void try_offline_node(int nid)
 	 */
 	node_set_offline(nid);
 	unregister_one_node(nid);
+
+	if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
+		/* node data is allocated from boot memory */
+		return;
+
+	/* free waittable in each zone */
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		if (zone->wait_table)
+			vfree(zone->wait_table);
+	}
+
+	/*
+	 * Since there is no way to guarentee the address of pgdat/zone is not
+	 * on stack of any kernel threads or used by other kernel objects
+	 * without reference counting or other symchronizing method, do not
+	 * reset node_data and free pgdat here. Just reset it to 0 and reuse
+	 * the memory when the node is online again.
+	 */
+	memset(pgdat, 0, sizeof(*pgdat));
 }
 
 int __ref remove_memory(int nid, u64 start, u64 size)
-- 
1.7.1

^ permalink raw reply related

* [PATCH v6 12/15] memory-hotplug: memory_hotplug: clear zone when removing the memory
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

When a memory is added, we update zone's and pgdat's start_pfn and
spanned_pages in the function __add_zone(). So we should revert them
when the memory is removed.

The patch adds a new function __remove_zone() to do this.

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
 mm/memory_hotplug.c |  207 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 207 insertions(+), 0 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b20c4c7..da20c14 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -430,8 +430,211 @@ static int __meminit __add_section(int nid, struct zone *zone,
 	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
 }
 
+/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
+static int find_smallest_section_pfn(int nid, struct zone *zone,
+				     unsigned long start_pfn,
+				     unsigned long end_pfn)
+{
+	struct mem_section *ms;
+
+	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
+		ms = __pfn_to_section(start_pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		if (unlikely(pfn_to_nid(start_pfn) != nid))
+			continue;
+
+		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+			continue;
+
+		return start_pfn;
+	}
+
+	return 0;
+}
+
+/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
+static int find_biggest_section_pfn(int nid, struct zone *zone,
+				    unsigned long start_pfn,
+				    unsigned long end_pfn)
+{
+	struct mem_section *ms;
+	unsigned long pfn;
+
+	/* pfn is the end pfn of a memory section. */
+	pfn = end_pfn - 1;
+	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
+		ms = __pfn_to_section(pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		if (unlikely(pfn_to_nid(pfn) != nid))
+			continue;
+
+		if (zone && zone != page_zone(pfn_to_page(pfn)))
+			continue;
+
+		return pfn;
+	}
+
+	return 0;
+}
+
+static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
+			     unsigned long end_pfn)
+{
+	unsigned long zone_start_pfn =  zone->zone_start_pfn;
+	unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	unsigned long pfn;
+	struct mem_section *ms;
+	int nid = zone_to_nid(zone);
+
+	zone_span_writelock(zone);
+	if (zone_start_pfn == start_pfn) {
+		/*
+		 * If the section is smallest section in the zone, it need
+		 * shrink zone->zone_start_pfn and zone->zone_spanned_pages.
+		 * In this case, we find second smallest valid mem_section
+		 * for shrinking zone.
+		 */
+		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
+						zone_end_pfn);
+		if (pfn) {
+			zone->zone_start_pfn = pfn;
+			zone->spanned_pages = zone_end_pfn - pfn;
+		}
+	} else if (zone_end_pfn == end_pfn) {
+		/*
+		 * If the section is biggest section in the zone, it need
+		 * shrink zone->spanned_pages.
+		 * In this case, we find second biggest valid mem_section for
+		 * shrinking zone.
+		 */
+		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+					       start_pfn);
+		if (pfn)
+			zone->spanned_pages = pfn - zone_start_pfn + 1;
+	}
+
+	/*
+	 * The section is not biggest or smallest mem_section in the zone, it
+	 * only creates a hole in the zone. So in this case, we need not
+	 * change the zone. But perhaps, the zone has only hole data. Thus
+	 * it check the zone has only hole or not.
+	 */
+	pfn = zone_start_pfn;
+	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
+		ms = __pfn_to_section(pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		if (page_zone(pfn_to_page(pfn)) != zone)
+			continue;
+
+		 /* If the section is current section, it continues the loop */
+		if (start_pfn == pfn)
+			continue;
+
+		/* If we find valid section, we have nothing to do */
+		zone_span_writeunlock(zone);
+		return;
+	}
+
+	/* The zone has no valid section */
+	zone->zone_start_pfn = 0;
+	zone->spanned_pages = 0;
+	zone_span_writeunlock(zone);
+}
+
+static void shrink_pgdat_span(struct pglist_data *pgdat,
+			      unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long pgdat_start_pfn =  pgdat->node_start_pfn;
+	unsigned long pgdat_end_pfn =
+		pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	unsigned long pfn;
+	struct mem_section *ms;
+	int nid = pgdat->node_id;
+
+	if (pgdat_start_pfn == start_pfn) {
+		/*
+		 * If the section is smallest section in the pgdat, it need
+		 * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
+		 * In this case, we find second smallest valid mem_section
+		 * for shrinking zone.
+		 */
+		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
+						pgdat_end_pfn);
+		if (pfn) {
+			pgdat->node_start_pfn = pfn;
+			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
+		}
+	} else if (pgdat_end_pfn == end_pfn) {
+		/*
+		 * If the section is biggest section in the pgdat, it need
+		 * shrink pgdat->node_spanned_pages.
+		 * In this case, we find second biggest valid mem_section for
+		 * shrinking zone.
+		 */
+		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
+					       start_pfn);
+		if (pfn)
+			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
+	}
+
+	/*
+	 * If the section is not biggest or smallest mem_section in the pgdat,
+	 * it only creates a hole in the pgdat. So in this case, we need not
+	 * change the pgdat.
+	 * But perhaps, the pgdat has only hole data. Thus it check the pgdat
+	 * has only hole or not.
+	 */
+	pfn = pgdat_start_pfn;
+	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
+		ms = __pfn_to_section(pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		if (pfn_to_nid(pfn) != nid)
+			continue;
+
+		 /* If the section is current section, it continues the loop */
+		if (start_pfn == pfn)
+			continue;
+
+		/* If we find valid section, we have nothing to do */
+		return;
+	}
+
+	/* The pgdat has no valid section */
+	pgdat->node_start_pfn = 0;
+	pgdat->node_spanned_pages = 0;
+}
+
+static void __remove_zone(struct zone *zone, unsigned long start_pfn)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	int nr_pages = PAGES_PER_SECTION;
+	int zone_type;
+	unsigned long flags;
+
+	zone_type = zone - pgdat->node_zones;
+
+	pgdat_resize_lock(zone->zone_pgdat, &flags);
+	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
+	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
+	pgdat_resize_unlock(zone->zone_pgdat, &flags);
+}
+
 static int __remove_section(struct zone *zone, struct mem_section *ms)
 {
+	unsigned long start_pfn;
+	int scn_nr;
 	int ret = -EINVAL;
 
 	if (!valid_section(ms))
@@ -441,6 +644,10 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
 	if (ret)
 		return ret;
 
+	scn_nr = __section_nr(ms);
+	start_pfn = section_nr_to_pfn(scn_nr);
+	__remove_zone(zone, start_pfn);
+
 	sparse_remove_one_section(zone, ms);
 	return 0;
 }
-- 
1.7.1

^ permalink raw reply related

* [PATCH v6 10/15] memory-hotplug: remove memmap of sparse-vmemmap
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

This patch introduces a new API vmemmap_free() to free and remove
vmemmap pagetables. Since pagetable implements are different, each
architecture has to provide its own version of vmemmap_free(), just
like vmemmap_populate().

Note:  vmemmap_free() are not implemented for ia64, ppc, s390, and sparc.

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
---
 arch/arm64/mm/mmu.c       |    3 +++
 arch/ia64/mm/discontig.c  |    4 ++++
 arch/powerpc/mm/init_64.c |    4 ++++
 arch/s390/mm/vmem.c       |    4 ++++
 arch/sparc/mm/init_64.c   |    4 ++++
 arch/x86/mm/init_64.c     |    8 ++++++++
 include/linux/mm.h        |    1 +
 mm/sparse.c               |    3 ++-
 8 files changed, 30 insertions(+), 1 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index a6885d8..9834886 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -392,4 +392,7 @@ int __meminit vmemmap_populate(struct page *start_page,
 	return 0;
 }
 #endif	/* CONFIG_ARM64_64K_PAGES */
+void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+}
 #endif	/* CONFIG_SPARSEMEM_VMEMMAP */
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 33943db..882a0fd 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -823,6 +823,10 @@ int __meminit vmemmap_populate(struct page *start_page,
 	return vmemmap_populate_basepages(start_page, size, node);
 }
 
+void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
 				  struct page *start_page, unsigned long size)
 {
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 6466440..2969591 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -298,6 +298,10 @@ int __meminit vmemmap_populate(struct page *start_page,
 	return 0;
 }
 
+void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
 				  struct page *start_page, unsigned long size)
 {
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 2c14bc2..81e6ba3 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -272,6 +272,10 @@ out:
 	return ret;
 }
 
+void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
 				  struct page *start_page, unsigned long size)
 {
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 1f30db3..5afe21a 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2232,6 +2232,10 @@ void __meminit vmemmap_populate_print_last(void)
 	}
 }
 
+void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
 				  struct page *start_page, unsigned long size)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d950f9b..e829113 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1309,6 +1309,14 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
 	return 0;
 }
 
+void __ref vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+	unsigned long start = (unsigned long)memmap;
+	unsigned long end = (unsigned long)(memmap + nr_pages);
+
+	remove_pagetable(start, end, false);
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
 				  struct page *start_page, unsigned long size)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1eca498..31d5e5d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1709,6 +1709,7 @@ int vmemmap_populate_basepages(struct page *start_page,
 						unsigned long pages, int node);
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);
+void vmemmap_free(struct page *memmap, unsigned long nr_pages);
 void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
 				  unsigned long size);
 
diff --git a/mm/sparse.c b/mm/sparse.c
index 05ca73a..cff9796 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -615,10 +615,11 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
 }
 static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
 {
-	return; /* XXX: Not implemented yet */
+	vmemmap_free(memmap, nr_pages);
 }
 static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
 {
+	vmemmap_free(memmap, nr_pages);
 }
 #else
 static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
-- 
1.7.1

^ permalink raw reply related

* [PATCH v6 13/15] memory-hotplug: remove sysfs file of node
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

This patch introduces a new function try_offline_node() to
remove sysfs file of node when all memory sections of this
node are removed. If some memory sections of this node are
not removed, this function does nothing.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
---
 drivers/acpi/acpi_memhotplug.c |    8 ++++-
 include/linux/memory_hotplug.h |    2 +-
 mm/memory_hotplug.c            |   58 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index eb30e5a..9c53cc6 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -295,9 +295,11 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 
 static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
 {
-	int result = 0;
+	int result = 0, nid;
 	struct acpi_memory_info *info, *n;
 
+	nid = acpi_get_node(mem_device->device->handle);
+
 	list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
 		if (info->failed)
 			/* The kernel does not use this memory block */
@@ -310,7 +312,9 @@ static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
 			 */
 			return -EBUSY;
 
-		result = remove_memory(info->start_addr, info->length);
+		if (nid < 0)
+			nid = memory_add_physaddr_to_nid(info->start_addr);
+		result = remove_memory(nid, info->start_addr, info->length);
 		if (result)
 			return result;
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 2441f36..f60e728 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -242,7 +242,7 @@ extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern int offline_memory_block(struct memory_block *mem);
 extern bool is_memblock_offlined(struct memory_block *mem);
-extern int remove_memory(u64 start, u64 size);
+extern int remove_memory(int nid, u64 start, u64 size);
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
 								int nr_pages);
 extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index da20c14..a8703f7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -29,6 +29,7 @@
 #include <linux/suspend.h>
 #include <linux/mm_inline.h>
 #include <linux/firmware-map.h>
+#include <linux/stop_machine.h>
 
 #include <asm/tlbflush.h>
 
@@ -1678,7 +1679,58 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
 	return ret;
 }
 
-int __ref remove_memory(u64 start, u64 size)
+static int check_cpu_on_node(void *data)
+{
+	struct pglist_data *pgdat = data;
+	int cpu;
+
+	for_each_present_cpu(cpu) {
+		if (cpu_to_node(cpu) == pgdat->node_id)
+			/*
+			 * the cpu on this node isn't removed, and we can't
+			 * offline this node.
+			 */
+			return -EBUSY;
+	}
+
+	return 0;
+}
+
+/* offline the node if all memory sections of this node are removed */
+static void try_offline_node(int nid)
+{
+	unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn;
+	unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_spanned_pages;
+	unsigned long pfn;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		unsigned long section_nr = pfn_to_section_nr(pfn);
+
+		if (!present_section_nr(section_nr))
+			continue;
+
+		if (pfn_to_nid(pfn) != nid)
+			continue;
+
+		/*
+		 * some memory sections of this node are not removed, and we
+		 * can't offline node now.
+		 */
+		return;
+	}
+
+	if (stop_machine(check_cpu_on_node, NODE_DATA(nid), NULL))
+		return;
+
+	/*
+	 * all memory/cpu of this node are removed, we can offline this
+	 * node now.
+	 */
+	node_set_offline(nid);
+	unregister_one_node(nid);
+}
+
+int __ref remove_memory(int nid, u64 start, u64 size)
 {
 	unsigned long start_pfn, end_pfn;
 	int ret = 0;
@@ -1733,6 +1785,8 @@ repeat:
 
 	arch_remove_memory(start, size);
 
+	try_offline_node(nid);
+
 	unlock_memory_hotplug();
 
 	return 0;
@@ -1742,7 +1796,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
 	return -EINVAL;
 }
-int remove_memory(u64 start, u64 size)
+int remove_memory(int nid, u64 start, u64 size)
 {
 	return -EINVAL;
 }
-- 
1.7.1

^ permalink raw reply related

* [PATCH v6 09/15] memory-hotplug: remove page table of x86_64 architecture
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

This patch searches a page table about the removed memory, and clear
page table for x86_64 architecture.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
---
 arch/x86/mm/init_64.c |   10 ++++++++++
 1 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index fe01116..d950f9b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -981,6 +981,15 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
 	flush_tlb_all();
 }
 
+void __meminit
+kernel_physical_mapping_remove(unsigned long start, unsigned long end)
+{
+	start = (unsigned long)__va(start);
+	end = (unsigned long)__va(end);
+
+	remove_pagetable(start, end, true);
+}
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 int __ref arch_remove_memory(u64 start, u64 size)
 {
@@ -990,6 +999,7 @@ int __ref arch_remove_memory(u64 start, u64 size)
 	int ret;
 
 	zone = page_zone(pfn_to_page(start_pfn));
+	kernel_physical_mapping_remove(start, start + size);
 	ret = __remove_pages(zone, start_pfn, nr_pages);
 	WARN_ON_ONCE(ret);
 
-- 
1.7.1

^ permalink raw reply related

* [PATCH v6 02/15] memory-hotplug: check whether all memory blocks are offlined or not when removing memory
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

We remove the memory like this:
1. lock memory hotplug
2. offline a memory block
3. unlock memory hotplug
4. repeat 1-3 to offline all memory blocks
5. lock memory hotplug
6. remove memory(TODO)
7. unlock memory hotplug

All memory blocks must be offlined before removing memory. But we don't hold
the lock in the whole operation. So we should check whether all memory blocks
are offlined before step6. Otherwise, kernel maybe panicked.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 drivers/base/memory.c          |    6 +++++
 include/linux/memory_hotplug.h |    1 +
 mm/memory_hotplug.c            |   48 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 0 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 987604d..8300a18 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -693,6 +693,12 @@ int offline_memory_block(struct memory_block *mem)
 	return ret;
 }
 
+/* return true if the memory block is offlined, otherwise, return false */
+bool is_memblock_offlined(struct memory_block *mem)
+{
+	return mem->state == MEM_OFFLINE;
+}
+
 /*
  * Initialize the sysfs support for memory devices...
  */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 4a45c4e..8dd0950 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -247,6 +247,7 @@ extern int add_memory(int nid, u64 start, u64 size);
 extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern int offline_memory_block(struct memory_block *mem);
+extern bool is_memblock_offlined(struct memory_block *mem);
 extern int remove_memory(u64 start, u64 size);
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
 								int nr_pages);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 62e04c9..5808045 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1430,6 +1430,54 @@ repeat:
 		goto repeat;
 	}
 
+	lock_memory_hotplug();
+
+	/*
+	 * we have offlined all memory blocks like this:
+	 *   1. lock memory hotplug
+	 *   2. offline a memory block
+	 *   3. unlock memory hotplug
+	 *
+	 * repeat step1-3 to offline the memory block. All memory blocks
+	 * must be offlined before removing memory. But we don't hold the
+	 * lock in the whole operation. So we should check whether all
+	 * memory blocks are offlined.
+	 */
+
+	mem = NULL;
+	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		section_nr = pfn_to_section_nr(pfn);
+		if (!present_section_nr(section_nr))
+			continue;
+
+		section = __nr_to_section(section_nr);
+		/* same memblock? */
+		if (mem)
+			if ((section_nr >= mem->start_section_nr) &&
+			    (section_nr <= mem->end_section_nr))
+				continue;
+
+		mem = find_memory_block_hinted(section, mem);
+		if (!mem)
+			continue;
+
+		ret = is_memblock_offlined(mem);
+		if (!ret) {
+			pr_warn("removing memory fails, because memory "
+				"[%#010llx-%#010llx] is onlined\n",
+				PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
+				PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1)) - 1);
+
+			kobject_put(&mem->dev.kobj);
+			unlock_memory_hotplug();
+			return ret;
+		}
+	}
+
+	if (mem)
+		kobject_put(&mem->dev.kobj);
+	unlock_memory_hotplug();
+
 	return 0;
 }
 #else
-- 
1.7.1

^ permalink raw reply related

* [PATCH v6 06/15] memory-hotplug: implement register_page_bootmem_info_section of sparse-vmemmap
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

For removing memmap region of sparse-vmemmap which is allocated bootmem,
memmap region of sparse-vmemmap needs to be registered by get_page_bootmem().
So the patch searches pages of virtual mapping and registers the pages by
get_page_bootmem().

Note: register_page_bootmem_memmap() is not implemented for ia64, ppc, s390,
and sparc.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Reviewed-by: Wu Jianguo <wujianguo@huawei.com>
---
 arch/ia64/mm/discontig.c       |    6 ++++
 arch/powerpc/mm/init_64.c      |    6 ++++
 arch/s390/mm/vmem.c            |    6 ++++
 arch/sparc/mm/init_64.c        |    6 ++++
 arch/x86/mm/init_64.c          |   58 ++++++++++++++++++++++++++++++++++++++++
 include/linux/memory_hotplug.h |   11 +------
 include/linux/mm.h             |    3 +-
 mm/memory_hotplug.c            |   33 ++++++++++++++++++++---
 8 files changed, 115 insertions(+), 14 deletions(-)

diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index c641333..33943db 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -822,4 +822,10 @@ int __meminit vmemmap_populate(struct page *start_page,
 {
 	return vmemmap_populate_basepages(start_page, size, node);
 }
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+				  struct page *start_page, unsigned long size)
+{
+	/* TODO */
+}
 #endif
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 95a4529..6466440 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -297,5 +297,11 @@ int __meminit vmemmap_populate(struct page *start_page,
 
 	return 0;
 }
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+				  struct page *start_page, unsigned long size)
+{
+	/* TODO */
+}
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 6ed1426..2c14bc2 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -272,6 +272,12 @@ out:
 	return ret;
 }
 
+void register_page_bootmem_memmap(unsigned long section_nr,
+				  struct page *start_page, unsigned long size)
+{
+	/* TODO */
+}
+
 /*
  * Add memory segment to the segment list if it doesn't overlap with
  * an already present segment.
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index c3b7242..1f30db3 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2231,6 +2231,12 @@ void __meminit vmemmap_populate_print_last(void)
 		node_start = 0;
 	}
 }
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+				  struct page *start_page, unsigned long size)
+{
+	/* TODO */
+}
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 static void prot_init_common(unsigned long page_none,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f78509c..9ac1723 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1000,6 +1000,64 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
 	return 0;
 }
 
+void register_page_bootmem_memmap(unsigned long section_nr,
+				  struct page *start_page, unsigned long size)
+{
+	unsigned long addr = (unsigned long)start_page;
+	unsigned long end = (unsigned long)(start_page + size);
+	unsigned long next;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	unsigned int nr_pages;
+	struct page *page;
+
+	for (; addr < end; addr = next) {
+		pte_t *pte = NULL;
+
+		pgd = pgd_offset_k(addr);
+		if (pgd_none(*pgd)) {
+			next = (addr + PAGE_SIZE) & PAGE_MASK;
+			continue;
+		}
+		get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
+
+		pud = pud_offset(pgd, addr);
+		if (pud_none(*pud)) {
+			next = (addr + PAGE_SIZE) & PAGE_MASK;
+			continue;
+		}
+		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
+
+		if (!cpu_has_pse) {
+			next = (addr + PAGE_SIZE) & PAGE_MASK;
+			pmd = pmd_offset(pud, addr);
+			if (pmd_none(*pmd))
+				continue;
+			get_page_bootmem(section_nr, pmd_page(*pmd),
+					 MIX_SECTION_INFO);
+
+			pte = pte_offset_kernel(pmd, addr);
+			if (pte_none(*pte))
+				continue;
+			get_page_bootmem(section_nr, pte_page(*pte),
+					 SECTION_INFO);
+		} else {
+			next = pmd_addr_end(addr, end);
+
+			pmd = pmd_offset(pud, addr);
+			if (pmd_none(*pmd))
+				continue;
+
+			nr_pages = 1 << (get_order(PMD_SIZE));
+			page = pmd_page(*pmd);
+			while (nr_pages--)
+				get_page_bootmem(section_nr, page++,
+						 SECTION_INFO);
+		}
+	}
+}
+
 void __meminit vmemmap_populate_print_last(void)
 {
 	if (p_start) {
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 31a563b..2441f36 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -174,17 +174,10 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
 #endif /* CONFIG_NUMA */
 #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
-{
-}
-static inline void put_page_bootmem(struct page *page)
-{
-}
-#else
 extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
 extern void put_page_bootmem(struct page *page);
-#endif
+extern void get_page_bootmem(unsigned long ingo, struct page *page,
+			     unsigned long type);
 
 /*
  * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6320407..1eca498 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1709,7 +1709,8 @@ int vmemmap_populate_basepages(struct page *start_page,
 						unsigned long pages, int node);
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);
-
+void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
+				  unsigned long size);
 
 enum mf_flags {
 	MF_COUNT_INCREASED = 1 << 0,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f6724c2..0682d2a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -91,9 +91,8 @@ static void release_memory_resource(struct resource *res)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void get_page_bootmem(unsigned long info,  struct page *page,
-			     unsigned long type)
+void get_page_bootmem(unsigned long info,  struct page *page,
+		      unsigned long type)
 {
 	page->lru.next = (struct list_head *) type;
 	SetPagePrivate(page);
@@ -128,6 +127,7 @@ void __ref put_page_bootmem(struct page *page)
 
 }
 
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
 	unsigned long *usemap, mapsize, section_nr, i;
@@ -161,6 +161,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
 
 }
+#else
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+	unsigned long *usemap, mapsize, section_nr, i;
+	struct mem_section *ms;
+	struct page *page, *memmap;
+
+	if (!pfn_valid(start_pfn))
+		return;
+
+	section_nr = pfn_to_section_nr(start_pfn);
+	ms = __nr_to_section(section_nr);
+
+	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+
+	usemap = __nr_to_section(section_nr)->pageblock_flags;
+	page = virt_to_page(usemap);
+
+	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+
+	for (i = 0; i < mapsize; i++, page++)
+		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+}
+#endif
 
 void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
@@ -203,7 +229,6 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 			register_page_bootmem_info_section(pfn);
 	}
 }
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
 static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
 			   unsigned long end_pfn)
-- 
1.7.1

^ permalink raw reply related

* [PATCH v6 08/15] memory-hotplug: Common APIs to support page tables hot-remove
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

From: Wen Congyang <wency@cn.fujitsu.com>

When memory is removed, the corresponding pagetables should alse be removed.
This patch introduces some common APIs to support vmemmap pagetable and x86_64
architecture pagetable removing.

All pages of virtual mapping in removed memory cannot be freedi if some pages
used as PGD/PUD includes not only removed memory but also other memory. So the
patch uses the following way to check whether page can be freed or not.

 1. When removing memory, the page structs of the revmoved memory are filled
    with 0FD.
 2. All page structs are filled with 0xFD on PT/PMD, PT/PMD can be cleared.
    In this case, the page used as PT/PMD can be freed.

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
---
 arch/x86/include/asm/pgtable_types.h |    1 +
 arch/x86/mm/init_64.c                |  299 ++++++++++++++++++++++++++++++++++
 arch/x86/mm/pageattr.c               |   47 +++---
 include/linux/bootmem.h              |    1 +
 4 files changed, 326 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 3c32db8..4b6fd2a 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -352,6 +352,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
  * as a pte too.
  */
 extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
 
 #endif	/* !__ASSEMBLY__ */
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9ac1723..fe01116 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -682,6 +682,305 @@ int arch_add_memory(int nid, u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);
 
+#define PAGE_INUSE 0xFD
+
+static void __meminit free_pagetable(struct page *page, int order)
+{
+	struct zone *zone;
+	bool bootmem = false;
+	unsigned long magic;
+	unsigned int nr_pages = 1 << order;
+
+	/* bootmem page has reserved flag */
+	if (PageReserved(page)) {
+		__ClearPageReserved(page);
+		bootmem = true;
+
+		magic = (unsigned long)page->lru.next;
+		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+			while (nr_pages--)
+				put_page_bootmem(page++);
+		} else
+			__free_pages_bootmem(page, order);
+	} else
+		free_pages((unsigned long)page_address(page), order);
+
+	/*
+	 * SECTION_INFO pages and MIX_SECTION_INFO pages
+	 * are all allocated by bootmem.
+	 */
+	if (bootmem) {
+		zone = page_zone(page);
+		zone_span_writelock(zone);
+		zone->present_pages += nr_pages;
+		zone_span_writeunlock(zone);
+		totalram_pages += nr_pages;
+	}
+}
+
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (pte_val(*pte))
+			return;
+	}
+
+	/* free a pte talbe */
+	free_pagetable(pmd_page(*pmd), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pmd_clear(pmd);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (pmd_val(*pmd))
+			return;
+	}
+
+	/* free a pmd talbe */
+	free_pagetable(pud_page(*pud), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pud_clear(pud);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
+/* Return true if pgd is changed, otherwise return false. */
+static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (pud_val(*pud))
+			return false;
+	}
+
+	/* free a pud table */
+	free_pagetable(pgd_page(*pgd), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pgd_clear(pgd);
+	spin_unlock(&init_mm.page_table_lock);
+
+	return true;
+}
+
+static void __meminit
+remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
+		 bool direct)
+{
+	unsigned long next, pages = 0;
+	pte_t *pte;
+	void *page_addr;
+	phys_addr_t phys_addr;
+
+	pte = pte_start + pte_index(addr);
+	for (; addr < end; addr = next, pte++) {
+		next = (addr + PAGE_SIZE) & PAGE_MASK;
+		if (next > end)
+			next = end;
+
+		if (!pte_present(*pte))
+			continue;
+
+		/*
+		 * We mapped [0,1G) memory as identity mapping when
+		 * initializing, in arch/x86/kernel/head_64.S. These
+		 * pagetables cannot be removed.
+		 */
+		phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
+		if (phys_addr < (phys_addr_t)0x40000000)
+			return;
+
+		if (IS_ALIGNED(addr, PAGE_SIZE) &&
+		    IS_ALIGNED(next, PAGE_SIZE)) {
+			if (!direct) {
+				free_pagetable(pte_page(*pte), 0);
+				pages++;
+			}
+
+			spin_lock(&init_mm.page_table_lock);
+			pte_clear(&init_mm, addr, pte);
+			spin_unlock(&init_mm.page_table_lock);
+		} else {
+			/*
+			 * If we are not removing the whole page, it means
+			 * other ptes in this page are being used and we canot
+			 * remove them. So fill the unused ptes with 0xFD, and
+			 * remove the page when it is wholly filled with 0xFD.
+			 */
+			memset((void *)addr, PAGE_INUSE, next - addr);
+			page_addr = page_address(pte_page(*pte));
+
+			if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
+				free_pagetable(pte_page(*pte), 0);
+				pages++;
+
+				spin_lock(&init_mm.page_table_lock);
+				pte_clear(&init_mm, addr, pte);
+				spin_unlock(&init_mm.page_table_lock);
+			}
+		}
+	}
+
+	/* Call free_pte_table() in remove_pmd_table(). */
+	flush_tlb_all();
+	if (direct)
+		update_page_count(PG_LEVEL_4K, -pages);
+}
+
+static void __meminit
+remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
+		 bool direct)
+{
+	unsigned long pte_phys, next, pages = 0;
+	pte_t *pte_base;
+	pmd_t *pmd;
+
+	pmd = pmd_start + pmd_index(addr);
+	for (; addr < end; addr = next, pmd++) {
+		next = pmd_addr_end(addr, end);
+
+		if (!pmd_present(*pmd))
+			continue;
+
+		if (pmd_large(*pmd)) {
+			if (IS_ALIGNED(addr, PMD_SIZE) &&
+			    IS_ALIGNED(next, PMD_SIZE)) {
+				if (!direct) {
+					free_pagetable(pmd_page(*pmd),
+						       get_order(PMD_SIZE));
+					pages++;
+				}
+
+				spin_lock(&init_mm.page_table_lock);
+				pmd_clear(pmd);
+				spin_unlock(&init_mm.page_table_lock);
+				continue;
+			}
+
+			/*
+			 * We use 2M page, but we need to remove part of them,
+			 * so split 2M page to 4K page.
+			 */
+			pte_base = (pte_t *)alloc_low_page(&pte_phys);
+			BUG_ON(!pte_base);
+			__split_large_page((pte_t *)pmd, addr,
+					   (pte_t *)pte_base);
+
+			spin_lock(&init_mm.page_table_lock);
+			pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+			spin_unlock(&init_mm.page_table_lock);
+
+			flush_tlb_all();
+		}
+
+		pte_base = (pte_t *)map_low_page((pte_t *)pmd_page_vaddr(*pmd));
+		remove_pte_table(pte_base, addr, next, direct);
+		free_pte_table(pte_base, pmd);
+		unmap_low_page(pte_base);
+	}
+
+	/* Call free_pmd_table() in remove_pud_table(). */
+	if (direct)
+		update_page_count(PG_LEVEL_2M, -pages);
+}
+
+static void __meminit
+remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
+		 bool direct)
+{
+	unsigned long pmd_phys, next, pages = 0;
+	pmd_t *pmd_base;
+	pud_t *pud;
+
+	pud = pud_start + pud_index(addr);
+	for (; addr < end; addr = next, pud++) {
+		next = pud_addr_end(addr, end);
+
+		if (!pud_present(*pud))
+			continue;
+
+		if (pud_large(*pud)) {
+			if (IS_ALIGNED(addr, PUD_SIZE) &&
+			    IS_ALIGNED(next, PUD_SIZE)) {
+				if (!direct) {
+					free_pagetable(pud_page(*pud),
+						       get_order(PUD_SIZE));
+					pages++;
+				}
+
+				spin_lock(&init_mm.page_table_lock);
+				pud_clear(pud);
+				spin_unlock(&init_mm.page_table_lock);
+				continue;
+			}
+
+			/*
+			 * We use 1G page, but we need to remove part of them,
+			 * so split 1G page to 2M page.
+			 */
+			pmd_base = (pmd_t *)alloc_low_page(&pmd_phys);
+			BUG_ON(!pmd_base);
+			__split_large_page((pte_t *)pud, addr,
+					   (pte_t *)pmd_base);
+
+			spin_lock(&init_mm.page_table_lock);
+			pud_populate(&init_mm, pud, __va(pmd_phys));
+			spin_unlock(&init_mm.page_table_lock);
+
+			flush_tlb_all();
+		}
+
+		pmd_base = (pmd_t *)map_low_page((pmd_t *)pud_page_vaddr(*pud));
+		remove_pmd_table(pmd_base, addr, next, direct);
+		free_pmd_table(pmd_base, pud);
+		unmap_low_page(pmd_base);
+	}
+
+	if (direct)
+		update_page_count(PG_LEVEL_1G, -pages);
+}
+
+/* start and end are both virtual address. */
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+	unsigned long next;
+	pgd_t *pgd;
+	pud_t *pud;
+	bool pgd_changed = false;
+
+	for (; start < end; start = next) {
+		pgd = pgd_offset_k(start);
+		if (!pgd_present(*pgd))
+			continue;
+
+		next = pgd_addr_end(start, end);
+
+		pud = (pud_t *)map_low_page((pud_t *)pgd_page_vaddr(*pgd));
+		remove_pud_table(pud, start, next, direct);
+		if (free_pud_table(pud, pgd))
+			pgd_changed = true;
+		unmap_low_page(pud);
+	}
+
+	if (pgd_changed)
+		sync_global_pgds(start, end - 1);
+
+	flush_tlb_all();
+}
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 int __ref arch_remove_memory(u64 start, u64 size)
 {
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a718e0d..7dcb6f9 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -501,21 +501,13 @@ out_unlock:
 	return do_split;
 }
 
-static int split_large_page(pte_t *kpte, unsigned long address)
+int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
 {
 	unsigned long pfn, pfninc = 1;
 	unsigned int i, level;
-	pte_t *pbase, *tmp;
+	pte_t *tmp;
 	pgprot_t ref_prot;
-	struct page *base;
-
-	if (!debug_pagealloc)
-		spin_unlock(&cpa_lock);
-	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
-	if (!debug_pagealloc)
-		spin_lock(&cpa_lock);
-	if (!base)
-		return -ENOMEM;
+	struct page *base = virt_to_page(pbase);
 
 	spin_lock(&pgd_lock);
 	/*
@@ -523,10 +515,11 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	 * up for us already:
 	 */
 	tmp = lookup_address(address, &level);
-	if (tmp != kpte)
-		goto out_unlock;
+	if (tmp != kpte) {
+		spin_unlock(&pgd_lock);
+		return 1;
+	}
 
-	pbase = (pte_t *)page_address(base);
 	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
 	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
 	/*
@@ -579,17 +572,27 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	 * going on.
 	 */
 	__flush_tlb_all();
+	spin_unlock(&pgd_lock);
 
-	base = NULL;
+	return 0;
+}
 
-out_unlock:
-	/*
-	 * If we dropped out via the lookup_address check under
-	 * pgd_lock then stick the page back into the pool:
-	 */
-	if (base)
+static int split_large_page(pte_t *kpte, unsigned long address)
+{
+	pte_t *pbase;
+	struct page *base;
+
+	if (!debug_pagealloc)
+		spin_unlock(&cpa_lock);
+	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+	if (!debug_pagealloc)
+		spin_lock(&cpa_lock);
+	if (!base)
+		return -ENOMEM;
+
+	pbase = (pte_t *)page_address(base);
+	if (__split_large_page(kpte, address, pbase))
 		__free_page(base);
-	spin_unlock(&pgd_lock);
 
 	return 0;
 }
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 3f778c2..190ff06 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
 			      unsigned long size);
 extern void free_bootmem(unsigned long physaddr, unsigned long size);
 extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
+extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
  * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
-- 
1.7.1

^ permalink raw reply related

* [PATCH v6 05/15] memory-hotplug: introduce new function arch_remove_memory() for removing page table depends on architecture
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

From: Wen Congyang <wency@cn.fujitsu.com>

For removing memory, we need to remove page table. But it depends
on architecture. So the patch introduce arch_remove_memory() for
removing page table. Now it only calls __remove_pages().

Note: __remove_pages() for some archtecuture is not implemented
      (I don't know how to implement it for s390).

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 arch/ia64/mm/init.c            |   18 ++++++++++++++++++
 arch/powerpc/mm/mem.c          |   12 ++++++++++++
 arch/s390/mm/init.c            |   12 ++++++++++++
 arch/sh/mm/init.c              |   17 +++++++++++++++++
 arch/tile/mm/init.c            |    8 ++++++++
 arch/x86/mm/init_32.c          |   12 ++++++++++++
 arch/x86/mm/init_64.c          |   15 +++++++++++++++
 include/linux/memory_hotplug.h |    1 +
 mm/memory_hotplug.c            |    2 ++
 9 files changed, 97 insertions(+), 0 deletions(-)

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index b755ea9..20bc967 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -688,6 +688,24 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
 	return ret;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone;
+	int ret;
+
+	zone = page_zone(pfn_to_page(start_pfn));
+	ret = __remove_pages(zone, start_pfn, nr_pages);
+	if (ret)
+		pr_warn("%s: Problem encountered in __remove_pages() as"
+			" ret=%d\n", __func__,  ret);
+
+	return ret;
+}
+#endif
 #endif
 
 /*
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 0dba506..09c6451 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -133,6 +133,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
 	return __add_pages(nid, zone, start_pfn, nr_pages);
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone;
+
+	zone = page_zone(pfn_to_page(start_pfn));
+	return __remove_pages(zone, start_pfn, nr_pages);
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 /*
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index ae672f4..49ce6bb 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -228,4 +228,16 @@ int arch_add_memory(int nid, u64 start, u64 size)
 		vmem_remove_mapping(start, size);
 	return rc;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+	/*
+	 * There is no hardware or firmware interface which could trigger a
+	 * hot memory remove on s390. So there is nothing that needs to be
+	 * implemented.
+	 */
+	return -EBUSY;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 82cc576..1057940 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -558,4 +558,21 @@ int memory_add_physaddr_to_nid(u64 addr)
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone;
+	int ret;
+
+	zone = page_zone(pfn_to_page(start_pfn));
+	ret = __remove_pages(zone, start_pfn, nr_pages);
+	if (unlikely(ret))
+		pr_warn("%s: Failed, __remove_pages() == %d\n", __func__,
+			ret);
+
+	return ret;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index ef29d6c..2749515 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -935,6 +935,14 @@ int remove_memory(u64 start, u64 size)
 {
 	return -EINVAL;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+	/* TODO */
+	return -EBUSY;
+}
+#endif
 #endif
 
 struct kmem_cache *pgd_cache;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 745d66b..3166e78 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -836,6 +836,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
 	return __add_pages(nid, zone, start_pfn, nr_pages);
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone;
+
+	zone = page_zone(pfn_to_page(start_pfn));
+	return __remove_pages(zone, start_pfn, nr_pages);
+}
+#endif
 #endif
 
 /*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index e779e0b..f78509c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -682,6 +682,21 @@ int arch_add_memory(int nid, u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int __ref arch_remove_memory(u64 start, u64 size)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone;
+	int ret;
+
+	zone = page_zone(pfn_to_page(start_pfn));
+	ret = __remove_pages(zone, start_pfn, nr_pages);
+	WARN_ON_ONCE(ret);
+
+	return ret;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 static struct kcore_list kcore_vsyscall;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 8dd0950..31a563b 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -96,6 +96,7 @@ extern void __online_page_free(struct page *page);
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 extern bool is_pageblock_removable_nolock(struct page *page);
+extern int arch_remove_memory(u64 start, u64 size);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 /* reasonably generic interface to expand the physical pages in a zone  */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9fd5904..f6724c2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1514,6 +1514,8 @@ repeat:
 	/* remove memmap entry */
 	firmware_map_remove(start, start + size, "System RAM");
 
+	arch_remove_memory(start, size);
+
 	unlock_memory_hotplug();
 
 	return 0;
-- 
1.7.1

^ permalink raw reply related

* [PATCH v6 00/15] memory-hotplug: hot-remove physical memory
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev

Here is the physical memory hot-remove patch-set based on 3.8rc-2.

This patch-set aims to implement physical memory hot-removing.

The patches can free/remove the following things:

  - /sys/firmware/memmap/X/{end, start, type} : [PATCH 4/15]
  - memmap of sparse-vmemmap                  : [PATCH 6,7,8,10/15]
  - page table of removed memory              : [RFC PATCH 7,8,10/15]
  - node and related sysfs files              : [RFC PATCH 13-15/15]

Existing problem:
If CONFIG_MEMCG is selected, we will allocate memory to store page cgroup
when we online pages.

For example: there is a memory device on node 1. The address range
is [1G, 1.5G). You will find 4 new directories memory8, memory9, memory10,
and memory11 under the directory /sys/devices/system/memory/.

If CONFIG_MEMCG is selected, when we online memory8, the memory stored page
cgroup is not provided by this memory device. But when we online memory9, the
memory stored page cgroup may be provided by memory8. So we can't offline
memory8 now. We should offline the memory in the reversed order.

When the memory device is hotremoved, we will auto offline memory provided
by this memory device. But we don't know which memory is onlined first, so
offlining memory may fail.

In patch1, we provide a solution which is not good enough:
Iterate twice to offline the memory.
1st iterate: offline every non primary memory block.
2nd iterate: offline primary (i.e. first added) memory block.

And a new idea from Wen Congyang <wency@cn.fujitsu.com> is:
allocate the memory from the memory block they are describing.

But we are not sure if it is OK to do so because there is not existing API
to do so, and we need to move page_cgroup memory allocation from MEM_GOING_ONLINE
to MEM_ONLINE. And also, it may interfere the hugepage.

How to test this patchset?
1. apply this patchset and build the kernel. MEMORY_HOTPLUG, MEMORY_HOTREMOVE,
   ACPI_HOTPLUG_MEMORY must be selected.
2. load the module acpi_memhotplug
3. hotplug the memory device(it depends on your hardware)
   You will see the memory device under the directory /sys/bus/acpi/devices/.
   Its name is PNP0C80:XX.
4. online/offline pages provided by this memory device
   You can write online/offline to /sys/devices/system/memory/memoryX/state to
   online/offline pages provided by this memory device
5. hotremove the memory device
   You can hotremove the memory device by the hardware, or writing 1 to
   /sys/bus/acpi/devices/PNP0C80:XX/eject.

Note: if the memory provided by the memory device is used by the kernel, it
can't be offlined. It is not a bug.

Changelogs from v5 to v6:
 Patch3: Add some more comments to explain memory hot-remove.
 Patch4: Remove bootmem member in struct firmware_map_entry.
 Patch6: Repeatedly register bootmem pages when using hugepage.
 Patch8: Repeatedly free bootmem pages when using hugepage.
 Patch14: Don't free pgdat when offlining a node, just reset it to 0.
 Patch15: New patch, pgdat is not freed in patch14, so don't allocate a new
          one when online a node.

Changelogs from v4 to v5:
 Patch7: new patch, move pgdat_resize_lock into sparse_remove_one_section() to
         avoid disabling irq because we need flush tlb when free pagetables.
 Patch8: new patch, pick up some common APIs that are used to free direct mapping
         and vmemmap pagetables.
 Patch9: free direct mapping pagetables on x86_64 arch.
 Patch10: free vmemmap pagetables.
 Patch11: since freeing memmap with vmemmap has been implemented, the config
          macro CONFIG_SPARSEMEM_VMEMMAP when defining __remove_section() is
          no longer needed.
 Patch13: no need to modify acpi_memory_disable_device() since it was removed,
          and add nid parameter when calling remove_memory().

Changelogs from v3 to v4:
 Patch7: remove unused codes.
 Patch8: fix nr_pages that is passed to free_map_bootmem()

Changelogs from v2 to v3:
 Patch9: call sync_global_pgds() if pgd is changed
 Patch10: fix a problem int the patch

Changelogs from v1 to v2:
 Patch1: new patch, offline memory twice. 1st iterate: offline every non primary
         memory block. 2nd iterate: offline primary (i.e. first added) memory
         block.

 Patch3: new patch, no logical change, just remove reduntant codes.

 Patch9: merge the patch from wujianguo into this patch. flush tlb on all cpu
         after the pagetable is changed.

 Patch12: new patch, free node_data when a node is offlined.

Tang Chen (6):
  memory-hotplug: move pgdat_resize_lock into
    sparse_remove_one_section()
  memory-hotplug: remove page table of x86_64 architecture
  memory-hotplug: remove memmap of sparse-vmemmap
  memory-hotplug: Integrated __remove_section() of
    CONFIG_SPARSEMEM_VMEMMAP.
  memory-hotplug: remove sysfs file of node
  memory-hotplug: Do not allocate pdgat if it was not freed when
    offline.

Wen Congyang (5):
  memory-hotplug: try to offline the memory twice to avoid dependence
  memory-hotplug: remove redundant codes
  memory-hotplug: introduce new function arch_remove_memory() for
    removing page table depends on architecture
  memory-hotplug: Common APIs to support page tables hot-remove
  memory-hotplug: free node_data when a node is offlined

Yasuaki Ishimatsu (4):
  memory-hotplug: check whether all memory blocks are offlined or not
    when removing memory
  memory-hotplug: remove /sys/firmware/memmap/X sysfs
  memory-hotplug: implement register_page_bootmem_info_section of
    sparse-vmemmap
  memory-hotplug: memory_hotplug: clear zone when removing the memory

 arch/arm64/mm/mmu.c                  |    3 +
 arch/ia64/mm/discontig.c             |   10 +
 arch/ia64/mm/init.c                  |   18 ++
 arch/powerpc/mm/init_64.c            |   10 +
 arch/powerpc/mm/mem.c                |   12 +
 arch/s390/mm/init.c                  |   12 +
 arch/s390/mm/vmem.c                  |   10 +
 arch/sh/mm/init.c                    |   17 ++
 arch/sparc/mm/init_64.c              |   10 +
 arch/tile/mm/init.c                  |    8 +
 arch/x86/include/asm/pgtable_types.h |    1 +
 arch/x86/mm/init_32.c                |   12 +
 arch/x86/mm/init_64.c                |  390 +++++++++++++++++++++++++++++
 arch/x86/mm/pageattr.c               |   47 ++--
 drivers/acpi/acpi_memhotplug.c       |    8 +-
 drivers/base/memory.c                |    6 +
 drivers/firmware/memmap.c            |   96 +++++++-
 include/linux/bootmem.h              |    1 +
 include/linux/firmware-map.h         |    6 +
 include/linux/memory_hotplug.h       |   15 +-
 include/linux/mm.h                   |    4 +-
 mm/memory_hotplug.c                  |  459 +++++++++++++++++++++++++++++++---
 mm/sparse.c                          |    8 +-
 23 files changed, 1094 insertions(+), 69 deletions(-)

^ permalink raw reply

* [PATCH v6 03/15] memory-hotplug: remove redundant codes
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

From: Wen Congyang <wency@cn.fujitsu.com>

offlining memory blocks and checking whether memory blocks are offlined
are very similar. This patch introduces a new function to remove
redundant codes.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Reviewed-by: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 mm/memory_hotplug.c |  129 ++++++++++++++++++++++++++++++++------------------
 1 files changed, 82 insertions(+), 47 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 5808045..69d62eb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1381,20 +1381,26 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
 }
 
-int remove_memory(u64 start, u64 size)
+/**
+ * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
+ * @start_pfn: start pfn of the memory range
+ * @end_pfn: end pft of the memory range
+ * @arg: argument passed to func
+ * @func: callback for each memory section walked
+ *
+ * This function walks through all present mem sections in range
+ * [start_pfn, end_pfn) and call func on each mem section.
+ *
+ * Returns the return value of func.
+ */
+static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+		void *arg, int (*func)(struct memory_block *, void *))
 {
 	struct memory_block *mem = NULL;
 	struct mem_section *section;
-	unsigned long start_pfn, end_pfn;
 	unsigned long pfn, section_nr;
 	int ret;
-	int return_on_error = 0;
-	int retry = 0;
-
-	start_pfn = PFN_DOWN(start);
-	end_pfn = start_pfn + PFN_DOWN(size);
 
-repeat:
 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
 		section_nr = pfn_to_section_nr(pfn);
 		if (!present_section_nr(section_nr))
@@ -1411,22 +1417,76 @@ repeat:
 		if (!mem)
 			continue;
 
-		ret = offline_memory_block(mem);
+		ret = func(mem, arg);
 		if (ret) {
-			if (return_on_error) {
-				kobject_put(&mem->dev.kobj);
-				return ret;
-			} else {
-				retry = 1;
-			}
+			kobject_put(&mem->dev.kobj);
+			return ret;
 		}
 	}
 
 	if (mem)
 		kobject_put(&mem->dev.kobj);
 
-	if (retry) {
-		return_on_error = 1;
+	return 0;
+}
+
+/**
+ * offline_memory_block_cb - callback function for offlining memory block
+ * @mem: the memory block to be offlined
+ * @arg: buffer to hold error msg
+ *
+ * Always return 0, and put the error msg in arg if any.
+ */
+static int offline_memory_block_cb(struct memory_block *mem, void *arg)
+{
+	int *ret = arg;
+	int error = offline_memory_block(mem);
+
+	if (error != 0 && *ret == 0)
+		*ret = error;
+
+	return 0;
+}
+
+static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
+{
+	int ret = !is_memblock_offlined(mem);
+
+	if (unlikely(ret))
+		pr_warn("removing memory fails, because memory "
+			"[%#010llx-%#010llx] is onlined\n",
+			PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
+			PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1);
+
+	return ret;
+}
+
+int remove_memory(u64 start, u64 size)
+{
+	unsigned long start_pfn, end_pfn;
+	int ret = 0;
+	int retry = 1;
+
+	start_pfn = PFN_DOWN(start);
+	end_pfn = start_pfn + PFN_DOWN(size);
+
+	/*
+	 * When CONFIG_MEMCG is on, one memory block may be used by other
+	 * blocks to store page cgroup when onlining pages. But we don't know
+	 * in what order pages are onlined. So we iterate twice to offline
+	 * memory:
+	 * 1st iterate: offline every non primary memory block.
+	 * 2nd iterate: offline primary (i.e. first added) memory block.
+	 */
+repeat:
+	walk_memory_range(start_pfn, end_pfn, &ret,
+			  offline_memory_block_cb);
+	if (ret) {
+		if (!retry)
+			return ret;
+
+		retry = 0;
+		ret = 0;
 		goto repeat;
 	}
 
@@ -1444,38 +1504,13 @@ repeat:
 	 * memory blocks are offlined.
 	 */
 
-	mem = NULL;
-	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
-		section_nr = pfn_to_section_nr(pfn);
-		if (!present_section_nr(section_nr))
-			continue;
-
-		section = __nr_to_section(section_nr);
-		/* same memblock? */
-		if (mem)
-			if ((section_nr >= mem->start_section_nr) &&
-			    (section_nr <= mem->end_section_nr))
-				continue;
-
-		mem = find_memory_block_hinted(section, mem);
-		if (!mem)
-			continue;
-
-		ret = is_memblock_offlined(mem);
-		if (!ret) {
-			pr_warn("removing memory fails, because memory "
-				"[%#010llx-%#010llx] is onlined\n",
-				PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
-				PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1)) - 1);
-
-			kobject_put(&mem->dev.kobj);
-			unlock_memory_hotplug();
-			return ret;
-		}
+	ret = walk_memory_range(start_pfn, end_pfn, NULL,
+				is_memblock_offlined_cb);
+	if (ret) {
+		unlock_memory_hotplug();
+		return ret;
 	}
 
-	if (mem)
-		kobject_put(&mem->dev.kobj);
 	unlock_memory_hotplug();
 
 	return 0;
-- 
1.7.1

^ permalink raw reply related

* [PATCH v6 01/15] memory-hotplug: try to offline the memory twice to avoid dependence
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

From: Wen Congyang <wency@cn.fujitsu.com>

memory can't be offlined when CONFIG_MEMCG is selected.
For example: there is a memory device on node 1. The address range
is [1G, 1.5G). You will find 4 new directories memory8, memory9, memory10,
and memory11 under the directory /sys/devices/system/memory/.

If CONFIG_MEMCG is selected, we will allocate memory to store page cgroup
when we online pages. When we online memory8, the memory stored page cgroup
is not provided by this memory device. But when we online memory9, the memory
stored page cgroup may be provided by memory8. So we can't offline memory8
now. We should offline the memory in the reversed order.

When the memory device is hotremoved, we will auto offline memory provided
by this memory device. But we don't know which memory is onlined first, so
offlining memory may fail. In such case, iterate twice to offline the memory.
1st iterate: offline every non primary memory block.
2nd iterate: offline primary (i.e. first added) memory block.

This idea is suggested by KOSAKI Motohiro.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
 mm/memory_hotplug.c |   16 ++++++++++++++--
 1 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d04ed87..62e04c9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1388,10 +1388,13 @@ int remove_memory(u64 start, u64 size)
 	unsigned long start_pfn, end_pfn;
 	unsigned long pfn, section_nr;
 	int ret;
+	int return_on_error = 0;
+	int retry = 0;

 	start_pfn = PFN_DOWN(start);
 	end_pfn = start_pfn + PFN_DOWN(size);

+repeat:
 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
 		section_nr = pfn_to_section_nr(pfn);
 		if (!present_section_nr(section_nr))
@@ -1410,14 +1413,23 @@ int remove_memory(u64 start, u64 size)

 		ret = offline_memory_block(mem);
 		if (ret) {
-			kobject_put(&mem->dev.kobj);
-			return ret;
+			if (return_on_error) {
+				kobject_put(&mem->dev.kobj);
+				return ret;
+			} else {
+				retry = 1;
+			}
 		}
 	}

 	if (mem)
 		kobject_put(&mem->dev.kobj);

+	if (retry) {
+		return_on_error = 1;
+		goto repeat;
+	}
+
 	return 0;
 }
 #else
-- 
1.7.1

^ permalink raw reply related

* [PATCH v6 04/15] memory-hotplug: remove /sys/firmware/memmap/X sysfs
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

When (hot)adding memory into system, /sys/firmware/memmap/X/{end, start, type}
sysfs files are created. But there is no code to remove these files. The patch
implements the function to remove them.

Note: The code does not free firmware_map_entry which is allocated by bootmem.
      So the patch makes memory leak. But I think the memory leak size is
      very samll. And it does not affect the system.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Reviewed-by: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 drivers/firmware/memmap.c    |   96 +++++++++++++++++++++++++++++++++++++++++-
 include/linux/firmware-map.h |    6 +++
 mm/memory_hotplug.c          |    5 ++-
 3 files changed, 104 insertions(+), 3 deletions(-)

diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index 90723e6..4211da5 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -21,6 +21,7 @@
 #include <linux/types.h>
 #include <linux/bootmem.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 
 /*
  * Data types ------------------------------------------------------------------
@@ -79,7 +80,26 @@ static const struct sysfs_ops memmap_attr_ops = {
 	.show = memmap_attr_show,
 };
 
+
+static inline struct firmware_map_entry *
+to_memmap_entry(struct kobject *kobj)
+{
+	return container_of(kobj, struct firmware_map_entry, kobj);
+}
+
+static void release_firmware_map_entry(struct kobject *kobj)
+{
+	struct firmware_map_entry *entry = to_memmap_entry(kobj);
+
+	if (PageReserved(virt_to_page(entry)))
+		/* There is no way to free memory allocated from bootmem */
+		return;
+
+	kfree(entry);
+}
+
 static struct kobj_type memmap_ktype = {
+	.release	= release_firmware_map_entry,
 	.sysfs_ops	= &memmap_attr_ops,
 	.default_attrs	= def_attrs,
 };
@@ -94,6 +114,7 @@ static struct kobj_type memmap_ktype = {
  * in firmware initialisation code in one single thread of execution.
  */
 static LIST_HEAD(map_entries);
+static DEFINE_SPINLOCK(map_entries_lock);
 
 /**
  * firmware_map_add_entry() - Does the real work to add a firmware memmap entry.
@@ -118,11 +139,25 @@ static int firmware_map_add_entry(u64 start, u64 end,
 	INIT_LIST_HEAD(&entry->list);
 	kobject_init(&entry->kobj, &memmap_ktype);
 
+	spin_lock(&map_entries_lock);
 	list_add_tail(&entry->list, &map_entries);
+	spin_unlock(&map_entries_lock);
 
 	return 0;
 }
 
+/**
+ * firmware_map_remove_entry() - Does the real work to remove a firmware
+ * memmap entry.
+ * @entry: removed entry.
+ **/
+static inline void firmware_map_remove_entry(struct firmware_map_entry *entry)
+{
+	spin_lock(&map_entries_lock);
+	list_del(&entry->list);
+	spin_unlock(&map_entries_lock);
+}
+
 /*
  * Add memmap entry on sysfs
  */
@@ -144,6 +179,35 @@ static int add_sysfs_fw_map_entry(struct firmware_map_entry *entry)
 	return 0;
 }
 
+/*
+ * Remove memmap entry on sysfs
+ */
+static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry)
+{
+	kobject_put(&entry->kobj);
+}
+
+/*
+ * Search memmap entry
+ */
+
+static struct firmware_map_entry * __meminit
+firmware_map_find_entry(u64 start, u64 end, const char *type)
+{
+	struct firmware_map_entry *entry;
+
+	spin_lock(&map_entries_lock);
+	list_for_each_entry(entry, &map_entries, list)
+		if ((entry->start == start) && (entry->end == end) &&
+		    (!strcmp(entry->type, type))) {
+			spin_unlock(&map_entries_lock);
+			return entry;
+		}
+
+	spin_unlock(&map_entries_lock);
+	return NULL;
+}
+
 /**
  * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do
  * memory hotplug.
@@ -196,6 +260,32 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type)
 	return firmware_map_add_entry(start, end, type, entry);
 }
 
+/**
+ * firmware_map_remove() - remove a firmware mapping entry
+ * @start: Start of the memory range.
+ * @end:   End of the memory range.
+ * @type:  Type of the memory range.
+ *
+ * removes a firmware mapping entry.
+ *
+ * Returns 0 on success, or -EINVAL if no entry.
+ **/
+int __meminit firmware_map_remove(u64 start, u64 end, const char *type)
+{
+	struct firmware_map_entry *entry;
+
+	entry = firmware_map_find_entry(start, end - 1, type);
+	if (!entry)
+		return -EINVAL;
+
+	firmware_map_remove_entry(entry);
+
+	/* remove the memmap entry */
+	remove_sysfs_fw_map_entry(entry);
+
+	return 0;
+}
+
 /*
  * Sysfs functions -------------------------------------------------------------
  */
@@ -217,8 +307,10 @@ static ssize_t type_show(struct firmware_map_entry *entry, char *buf)
 	return snprintf(buf, PAGE_SIZE, "%s\n", entry->type);
 }
 
-#define to_memmap_attr(_attr) container_of(_attr, struct memmap_attribute, attr)
-#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj)
+static inline struct memmap_attribute *to_memmap_attr(struct attribute *attr)
+{
+	return container_of(attr, struct memmap_attribute, attr);
+}
 
 static ssize_t memmap_attr_show(struct kobject *kobj,
 				struct attribute *attr, char *buf)
diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h
index 43fe52f..71d4fa7 100644
--- a/include/linux/firmware-map.h
+++ b/include/linux/firmware-map.h
@@ -25,6 +25,7 @@
 
 int firmware_map_add_early(u64 start, u64 end, const char *type);
 int firmware_map_add_hotplug(u64 start, u64 end, const char *type);
+int firmware_map_remove(u64 start, u64 end, const char *type);
 
 #else /* CONFIG_FIRMWARE_MEMMAP */
 
@@ -38,6 +39,11 @@ static inline int firmware_map_add_hotplug(u64 start, u64 end, const char *type)
 	return 0;
 }
 
+static inline int firmware_map_remove(u64 start, u64 end, const char *type)
+{
+	return 0;
+}
+
 #endif /* CONFIG_FIRMWARE_MEMMAP */
 
 #endif /* _LINUX_FIRMWARE_MAP_H */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 69d62eb..9fd5904 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1461,7 +1461,7 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
 	return ret;
 }
 
-int remove_memory(u64 start, u64 size)
+int __ref remove_memory(u64 start, u64 size)
 {
 	unsigned long start_pfn, end_pfn;
 	int ret = 0;
@@ -1511,6 +1511,9 @@ repeat:
 		return ret;
 	}
 
+	/* remove memmap entry */
+	firmware_map_remove(start, start + size, "System RAM");
+
 	unlock_memory_hotplug();
 
 	return 0;
-- 
1.7.1

^ permalink raw reply related

* [PATCH v6 07/15] memory-hotplug: move pgdat_resize_lock into sparse_remove_one_section()
From: Tang Chen @ 2013-01-09  9:32 UTC (permalink / raw)
  To: akpm, rientjes, len.brown, benh, paulus, cl, minchan.kim,
	kosaki.motohiro, isimatu.yasuaki, wujianguo, wency, tangchen, hpa,
	linfeng, laijs, mgorman, yinghai, glommer
  Cc: linux-s390, linux-ia64, linux-acpi, linux-sh, x86, linux-kernel,
	cmetcalf, linux-mm, sparclinux, linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

In __remove_section(), we locked pgdat_resize_lock when calling
sparse_remove_one_section(). This lock will disable irq. But we don't need
to lock the whole function. If we do some work to free pagetables in
free_section_usemap(), we need to call flush_tlb_all(), which need
irq enabled. Otherwise the WARN_ON_ONCE() in smp_call_function_many()
will be triggered.

If we lock the whole sparse_remove_one_section(), then we come to this call trace:

[  454.796248] ------------[ cut here ]------------
[  454.851408] WARNING: at kernel/smp.c:461 smp_call_function_many+0xbd/0x260()
[  454.935620] Hardware name: PRIMEQUEST 1800E
......
[  455.652201] Call Trace:
[  455.681391]  [<ffffffff8106e73f>] warn_slowpath_common+0x7f/0xc0
[  455.753151]  [<ffffffff810560a0>] ? leave_mm+0x50/0x50
[  455.814527]  [<ffffffff8106e79a>] warn_slowpath_null+0x1a/0x20
[  455.884208]  [<ffffffff810e7a9d>] smp_call_function_many+0xbd/0x260
[  455.959082]  [<ffffffff810e7ecb>] smp_call_function+0x3b/0x50
[  456.027722]  [<ffffffff810560a0>] ? leave_mm+0x50/0x50
[  456.089098]  [<ffffffff810e7f4b>] on_each_cpu+0x3b/0xc0
[  456.151512]  [<ffffffff81055f0c>] flush_tlb_all+0x1c/0x20
[  456.216004]  [<ffffffff8104f8de>] remove_pagetable+0x14e/0x1d0
[  456.285683]  [<ffffffff8104f978>] vmemmap_free+0x18/0x20
[  456.349139]  [<ffffffff811b8797>] sparse_remove_one_section+0xf7/0x100
[  456.427126]  [<ffffffff811c5fc2>] __remove_section+0xa2/0xb0
[  456.494726]  [<ffffffff811c6070>] __remove_pages+0xa0/0xd0
[  456.560258]  [<ffffffff81669c7b>] arch_remove_memory+0x6b/0xc0
[  456.629937]  [<ffffffff8166ad28>] remove_memory+0xb8/0xf0
[  456.694431]  [<ffffffff813e686f>] acpi_memory_device_remove+0x53/0x96
[  456.771379]  [<ffffffff813b33c4>] acpi_device_remove+0x90/0xb2
[  456.841059]  [<ffffffff8144b02c>] __device_release_driver+0x7c/0xf0
[  456.915928]  [<ffffffff8144b1af>] device_release_driver+0x2f/0x50
[  456.988719]  [<ffffffff813b4476>] acpi_bus_remove+0x32/0x6d
[  457.055285]  [<ffffffff813b4542>] acpi_bus_trim+0x91/0x102
[  457.120814]  [<ffffffff813b463b>] acpi_bus_hot_remove_device+0x88/0x16b
[  457.199840]  [<ffffffff813afda7>] acpi_os_execute_deferred+0x27/0x34
[  457.275756]  [<ffffffff81091ece>] process_one_work+0x20e/0x5c0
[  457.345434]  [<ffffffff81091e5f>] ? process_one_work+0x19f/0x5c0
[  457.417190]  [<ffffffff813afd80>] ? acpi_os_wait_events_complete+0x23/0x23
[  457.499332]  [<ffffffff81093f6e>] worker_thread+0x12e/0x370
[  457.565896]  [<ffffffff81093e40>] ? manage_workers+0x180/0x180
[  457.635574]  [<ffffffff8109a09e>] kthread+0xee/0x100
[  457.694871]  [<ffffffff810dfaf9>] ? __lock_release+0x129/0x190
[  457.764552]  [<ffffffff81099fb0>] ? __init_kthread_worker+0x70/0x70
[  457.839427]  [<ffffffff81690aac>] ret_from_fork+0x7c/0xb0
[  457.903914]  [<ffffffff81099fb0>] ? __init_kthread_worker+0x70/0x70
[  457.978784] ---[ end trace 25e85300f542aa01 ]---

Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 mm/memory_hotplug.c |    4 ----
 mm/sparse.c         |    5 ++++-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0682d2a..674e791 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -442,8 +442,6 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
 #else
 static int __remove_section(struct zone *zone, struct mem_section *ms)
 {
-	unsigned long flags;
-	struct pglist_data *pgdat = zone->zone_pgdat;
 	int ret = -EINVAL;
 
 	if (!valid_section(ms))
@@ -453,9 +451,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
 	if (ret)
 		return ret;
 
-	pgdat_resize_lock(pgdat, &flags);
 	sparse_remove_one_section(zone, ms);
-	pgdat_resize_unlock(pgdat, &flags);
 	return 0;
 }
 #endif
diff --git a/mm/sparse.c b/mm/sparse.c
index aadbb2a..05ca73a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -796,8 +796,10 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
 {
 	struct page *memmap = NULL;
-	unsigned long *usemap = NULL;
+	unsigned long *usemap = NULL, flags;
+	struct pglist_data *pgdat = zone->zone_pgdat;
 
+	pgdat_resize_lock(pgdat, &flags);
 	if (ms->section_mem_map) {
 		usemap = ms->pageblock_flags;
 		memmap = sparse_decode_mem_map(ms->section_mem_map,
@@ -805,6 +807,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
 		ms->section_mem_map = 0;
 		ms->pageblock_flags = NULL;
 	}
+	pgdat_resize_unlock(pgdat, &flags);
 
 	clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
 	free_section_usemap(memmap, usemap);
-- 
1.7.1

^ permalink raw reply related

* RE: [PATCH] Added device tree binding for TDM and TDM phy
From: Singh Sandeep-B37400 @ 2013-01-09  7:10 UTC (permalink / raw)
  To: devicetree-discuss@lists.ozlabs.org, linuxppc-dev@ozlabs.org
  Cc: Aggrwal Poonam-B10812
In-Reply-To: <1357133120-2271-1-git-send-email-Sandeep@freescale.com>

A gentle reminder.
Any comments are appreciated.

Regards,
Sandeep

> -----Original Message-----
> From: Singh Sandeep-B37400
> Sent: Wednesday, January 02, 2013 6:55 PM
> To: devicetree-discuss@lists.ozlabs.org; linuxppc-dev@ozlabs.org
> Cc: Singh Sandeep-B37400; Aggrwal Poonam-B10812
> Subject: [PATCH] Added device tree binding for TDM and TDM phy
>=20
> This controller is available on many Freescale SOCs like MPC8315, P1020,
> P1010 and P1022
>=20
> Signed-off-by: Sandeep Singh <Sandeep@freescale.com>
> Signed-off-by: Poonam Aggrwal <poonam.aggrwal@freescale.com>
> ---
>  .../devicetree/bindings/powerpc/fsl/fsl-tdm.txt    |   63
> ++++++++++++++++++++
>  .../devicetree/bindings/powerpc/fsl/tdm-phy.txt    |   38 ++++++++++++
>  2 files changed, 101 insertions(+), 0 deletions(-)  create mode 100644
> Documentation/devicetree/bindings/powerpc/fsl/fsl-tdm.txt
>  create mode 100644 Documentation/devicetree/bindings/powerpc/fsl/tdm-
> phy.txt
>=20
> diff --git a/Documentation/devicetree/bindings/powerpc/fsl/fsl-tdm.txt
> b/Documentation/devicetree/bindings/powerpc/fsl/fsl-tdm.txt
> new file mode 100644
> index 0000000..ceb2ef1
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/powerpc/fsl/fsl-tdm.txt
> @@ -0,0 +1,63 @@
> +TDM Device Tree Binding
> +
> +NOTE: The bindings described in this document are preliminary and
> +subject to change.
> +
> +TDM (Time Division Multiplexing)
> +
> +Description:
> +
> +The TDM is full duplex serial port designed to allow various devices
> +including digital signal processors (DSPs) to communicate with a
> +variety of serial devices including industry standard framers, codecs,
> other DSPs and microprocessors.
> +
> +The below properties describe the device tree bindings for Freescale
> +TDM controller. This TDM controller is available on various Freescale
> +Processors like MPC8315, P1020, P1022 and P1010.
> +
> +Required properties:
> +
> +- compatible
> +    Value type: <string>
> +    Definition: Should contain "fsl,tdm1.0".
> +
> +- reg
> +    Definition: A standard property. The first reg specifier describes
> the TDM
> +    registers, and the second describes the TDM DMAC registers.
> +
> +- tdm_tx_clk
> +    Value type: <u32 or u64>
> +    Definition: This specifies the value of transmit clock. It should
> not
> +    exceed 50Mhz.
> +
> +- tdm_rx_clk
> +    Value type: <u32 or u64>
> +    Definition: This specifies the value of receive clock. Its value
> could be
> +    zero, in which case tdm will operate in shared mode. Its value
> should not
> +    exceed 50Mhz.
> +
> +- interrupts
> +    Definition: Two interrupt specifiers. The first is TDM error, and
> the
> +    second is TDM DMAC.
> +
> +- phy-handle
> +    Value type: <phandle>
> +    Definition: Phandle of the line controller node or framer node eg.
> SLIC,
> +    E1/T1 etc. (Refer
> +Documentation/devicetree/bindings/powerpc/fsl/tdm-phy.txt)
> +
> +- fsl,max-time-slots
> +    Value type: <u32>
> +    Definition: Maximum number of 8-bit time slots in one TDM frame.
> This is
> +    the maximum number which TDM hardware supports.
> +
> +Example:
> +
> +	tdm@16000 {
> +		compatible =3D "fsl,tdm1.0";
> +		reg =3D <0x16000 0x200 0x2c000 0x2000>;
> +		tdm_tx_clk =3D <2048000>;
> +		tdm_rx_clk =3D <0>;
> +		interrupts =3D <16 8 62 8>;
> +		phy-handle =3D <&tdm-phy>;
> +		fsl,max-time-slots =3D <128>;
> +	};
> diff --git a/Documentation/devicetree/bindings/powerpc/fsl/tdm-phy.txt
> b/Documentation/devicetree/bindings/powerpc/fsl/tdm-phy.txt
> new file mode 100644
> index 0000000..2563934
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/powerpc/fsl/tdm-phy.txt
> @@ -0,0 +1,38 @@
> +TDM PHY Device Tree Binding
> +
> +NOTE: The bindings described in this document are preliminary and
> +subject to change.
> +
> +Description:
> +TDM PHY is the terminal interface of TDM subsystem. It is typically a
> +line control device like E1/T1 framer or SLIC. A TDM device can have
> +multiple TDM PHYs.
> +
> +Required properties:
> +
> +- compatible
> +    Value type: <string>
> +    Definition: Should contain generic compatibility like "tdm-phy-slic"
> or
> +    "tdm-phy-e1" or "tdm-phy-t1".
> +
> +- max-num-ports
> +    Value type: <u32>
> +    Definition: Defines the maximum number of ports supported by the
> SLIC
> +    device. Only required if the device is SLIC. For E1/T1 devices the
> number
> +    of ports are predefined i.e. (24 in case of T1 and 32 in case of
> E1).
> +
> +Apart from the above, there may be other properties required because of
> +the bus/interface this device is connected on. It could be SPI/local
> bus, etc.
> +
> +Example:
> +
> +	tdm-phy@0 {
> +		compatible =3D "zarlink,le88266","tdm-phy-slic";
> +		reg =3D <0>;
> +		max-num-ports =3D <4>;
> +		spi-max-frequency =3D <8000000>;
> +	};
> +
> +In the above example properties "reg" and "spi-max-frequency" are SPI
> +specific as the SLIC device is connected on SPI interface. These
> +properties might vary depending on the specific interface the device is
> using.
> --
> 1.7.6.GIT

^ permalink raw reply

* [PATCH] powerpc/ptrace: make #defines for all request numbers hex
From: Michael Neuling @ 2013-01-09  4:45 UTC (permalink / raw)
  To: benh; +Cc: Linux PPC dev

We have a mix of decimal and hex here, so lets make them consistently
hex.  Also, strace will print them in hex if it can't decode them, so
having them in hex here makes it easier to match up.

No functional change.

Signed-off-by: Michael Neuling <mikey@neuling.org>

diff --git a/arch/powerpc/include/uapi/asm/ptrace.h b/arch/powerpc/include/uapi/asm/ptrace.h
index ee67a2b..7e1584a 100644
--- a/arch/powerpc/include/uapi/asm/ptrace.h
+++ b/arch/powerpc/include/uapi/asm/ptrace.h
@@ -146,34 +146,34 @@ struct pt_regs {
  * structures.  This also simplifies the implementation of a bi-arch
  * (combined (32- and 64-bit) gdb.
  */
-#define PTRACE_GETVRREGS	18
-#define PTRACE_SETVRREGS	19
+#define PTRACE_GETVRREGS	0x12
+#define PTRACE_SETVRREGS	0x13
 
 /* Get/set all the upper 32-bits of the SPE registers, accumulator, and
  * spefscr, in one go */
-#define PTRACE_GETEVRREGS	20
-#define PTRACE_SETEVRREGS	21
+#define PTRACE_GETEVRREGS	0x14
+#define PTRACE_SETEVRREGS	0x15
 
 /* Get the first 32 128bit VSX registers */
-#define PTRACE_GETVSRREGS	27
-#define PTRACE_SETVSRREGS	28
+#define PTRACE_GETVSRREGS	0x1b
+#define PTRACE_SETVSRREGS	0x1c
 
 /*
  * Get or set a debug register. The first 16 are DABR registers and the
  * second 16 are IABR registers.
  */
-#define PTRACE_GET_DEBUGREG	25
-#define PTRACE_SET_DEBUGREG	26
+#define PTRACE_GET_DEBUGREG	0x19
+#define PTRACE_SET_DEBUGREG	0x1a
 
 /* (new) PTRACE requests using the same numbers as x86 and the same
  * argument ordering. Additionally, they support more registers too
  */
-#define PTRACE_GETREGS            12
-#define PTRACE_SETREGS            13
-#define PTRACE_GETFPREGS          14
-#define PTRACE_SETFPREGS          15
-#define PTRACE_GETREGS64	  22
-#define PTRACE_SETREGS64	  23
+#define PTRACE_GETREGS            0xc
+#define PTRACE_SETREGS            0xd
+#define PTRACE_GETFPREGS          0xe
+#define PTRACE_SETFPREGS          0xf
+#define PTRACE_GETREGS64	  0x16
+#define PTRACE_SETREGS64	  0x17
 
 /* Calls to trace a 64bit program from a 32bit program */
 #define PPC_PTRACE_PEEKTEXT_3264 0x95

^ permalink raw reply related

* Re: [PATCH 7/8] mm: use vm_unmapped_area() on powerpc architecture
From: Benjamin Herrenschmidt @ 2013-01-09  3:32 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: Rik van Riel, Tony Luck, linux-ia64, linux-parisc,
	James E.J. Bottomley, linux-kernel, David Howells, linux-mm,
	linux-alpha, Matt Turner, linuxppc-dev, Andrew Morton
In-Reply-To: <CANN689EJV_7Q7J4j1ttDxZuqbwD53PAuCHb5DhiE-AVbmNSR7Q@mail.gmail.com>

On Tue, 2013-01-08 at 18:38 -0800, Michel Lespinasse wrote:
> 
> Well no fair, the previous patch (for powerpc as well) has 22
> insertions and 93 deletions :)
> 
> The benefit is that the new code has lower algorithmic complexity, it
> replaces a per-vma loop with O(N) complexity with an outer loop that
> finds contiguous slice blocks and passes them to vm_unmapped_area()
> which is only O(log N) complexity. So the new code will be faster for
> workloads which use lots of vmas.
> 
> That said, I do agree that the code that looks for contiguous
> available slices looks kinda ugly - just not sure how to make it look
> nicer though.

Ok. I think at least you can move that construct:

+               if (addr < SLICE_LOW_TOP) {
+                       slice = GET_LOW_SLICE_INDEX(addr);
+                       addr = (slice + 1) << SLICE_LOW_SHIFT;
+                       if (!(available.low_slices & (1u << slice)))
+                               continue;
+               } else {
+                       slice = GET_HIGH_SLICE_INDEX(addr);
+                       addr = (slice + 1) << SLICE_HIGH_SHIFT;
+                       if (!(available.high_slices & (1u << slice)))
+                               continue;
+               }

Into some kind of helper. It will probably compile to the same thing but
at least it's more readable and it will avoid a fuckup in the future if
somebody changes the algorithm and forgets to update one of the
copies :-)

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH 5/5] kfifo: log based kfifo API
From: Yuanhan Liu @ 2013-01-09  2:42 UTC (permalink / raw)
  To: Dmitry Torokhov
  Cc: linux-iio, linux-mmc, platform-driver-x86, linux-mm, linux-sctp,
	linux-mtd, devel, linux-scsi, libertas-dev, linux-rdma,
	Stefani Seibold, linux-serial, linux-pci, open-iscsi, linux-media,
	linux-input, linux-omap, netdev, linux-usb, linux-wireless,
	linux-kernel, dccp, Andrew Morton, linuxppc-dev
In-Reply-To: <20130108181645.GA7972@core.coreip.homeip.net>

On Tue, Jan 08, 2013 at 10:16:46AM -0800, Dmitry Torokhov wrote:
> Hi Yuanhan,
> 
> On Tue, Jan 08, 2013 at 10:57:53PM +0800, Yuanhan Liu wrote:
> > The current kfifo API take the kfifo size as input, while it rounds
> >  _down_ the size to power of 2 at __kfifo_alloc. This may introduce
> > potential issue.
> > 
> > Take the code at drivers/hid/hid-logitech-dj.c as example:
> > 
> > 	if (kfifo_alloc(&djrcv_dev->notif_fifo,
> >                        DJ_MAX_NUMBER_NOTIFICATIONS * sizeof(struct dj_report),
> >                        GFP_KERNEL)) {
> > 
> > Where, DJ_MAX_NUMBER_NOTIFICATIONS is 8, and sizeo of(struct dj_report)
> > is 15.
> > 
> > Which means it wants to allocate a kfifo buffer which can store 8
> > dj_report entries at once. The expected kfifo buffer size would be
> > 8 * 15 = 120 then. While, in the end, __kfifo_alloc will turn the
> > size to rounddown_power_of_2(120) =  64, and then allocate a buf
> > with 64 bytes, which I don't think this is the original author want.
> > 
> > With the new log API, we can do like following:
> > 
> > 	int kfifo_size_order = order_base_2(DJ_MAX_NUMBER_NOTIFICATIONS *
> > 					    sizeof(struct dj_report));
> > 
> > 	if (kfifo_alloc(&djrcv_dev->notif_fifo, kfifo_size_order, GFP_KERNEL)) {
> > 
> > This make sure we will allocate enough kfifo buffer for holding
> > DJ_MAX_NUMBER_NOTIFICATIONS dj_report entries.
> 
> Why don't you simply change __kfifo_alloc to round the allocation up
> instead of down?

Hi Dmitry,

Yes, it would be neat and that was my first reaction as well. I then
sent out a patch, but it was NACKed by Stefani(the original kfifo
author). Here is the link:

    https://lkml.org/lkml/2012/10/26/144

Then Stefani proposed to change the API to take log of size as input to
root fix this kind of issues. And here it is.

Thanks.

	--yliu

^ permalink raw reply

* Re: [PATCH 7/8] mm: use vm_unmapped_area() on powerpc architecture
From: Michel Lespinasse @ 2013-01-09  2:38 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Rik van Riel, Tony Luck, linux-ia64, linux-parisc,
	James E.J. Bottomley, linux-kernel, David Howells, linux-mm,
	linux-alpha, Matt Turner, linuxppc-dev, Andrew Morton
In-Reply-To: <1357697739.4838.30.camel@pasglop>

On Tue, Jan 8, 2013 at 6:15 PM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Tue, 2013-01-08 at 17:28 -0800, Michel Lespinasse wrote:
>> Update the powerpc slice_get_unmapped_area function to make use of
>> vm_unmapped_area() instead of implementing a brute force search.
>>
>> Signed-off-by: Michel Lespinasse <walken@google.com>
>>
>> ---
>>  arch/powerpc/mm/slice.c |  128 +++++++++++++++++++++++++++++-----------------
>>  1 files changed, 81 insertions(+), 47 deletions(-)
>
> That doesn't look good ... the resulting code is longer than the
> original, which makes me wonder how it is an improvement...

Well no fair, the previous patch (for powerpc as well) has 22
insertions and 93 deletions :)

The benefit is that the new code has lower algorithmic complexity, it
replaces a per-vma loop with O(N) complexity with an outer loop that
finds contiguous slice blocks and passes them to vm_unmapped_area()
which is only O(log N) complexity. So the new code will be faster for
workloads which use lots of vmas.

That said, I do agree that the code that looks for contiguous
available slices looks kinda ugly - just not sure how to make it look
nicer though.

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

^ permalink raw reply

* Re: [PATCH 7/8] mm: use vm_unmapped_area() on powerpc architecture
From: Benjamin Herrenschmidt @ 2013-01-09  2:15 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: Rik van Riel, Tony Luck, linux-ia64, linux-parisc,
	James E.J. Bottomley, linux-kernel, David Howells, linux-mm,
	linux-alpha, Matt Turner, linuxppc-dev, Andrew Morton
In-Reply-To: <1357694895-520-8-git-send-email-walken@google.com>

On Tue, 2013-01-08 at 17:28 -0800, Michel Lespinasse wrote:
> Update the powerpc slice_get_unmapped_area function to make use of
> vm_unmapped_area() instead of implementing a brute force search.
> 
> Signed-off-by: Michel Lespinasse <walken@google.com>
> 
> ---
>  arch/powerpc/mm/slice.c |  128 +++++++++++++++++++++++++++++-----------------
>  1 files changed, 81 insertions(+), 47 deletions(-)

That doesn't look good ... the resulting code is longer than the
original, which makes me wonder how it is an improvement...

Now it could just be a matter of how the code is factored, I see
quite a bit of duplication of the whole slice mask test...

Cheers,
Ben.

> diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
> index 999a74f25ebe..048346b7eed5 100644
> --- a/arch/powerpc/mm/slice.c
> +++ b/arch/powerpc/mm/slice.c
> @@ -242,31 +242,51 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
>  					      struct slice_mask available,
>  					      int psize)
>  {
> -	struct vm_area_struct *vma;
> -	unsigned long addr;
> -	struct slice_mask mask;
>  	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
> +	unsigned long addr, found, slice;
> +	struct vm_unmapped_area_info info;
>  
> -	addr = TASK_UNMAPPED_BASE;
> +	info.flags = 0;
> +	info.length = len;
> +	info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
> +	info.align_offset = 0;
>  
> -	for (;;) {
> -		addr = _ALIGN_UP(addr, 1ul << pshift);
> -		if ((TASK_SIZE - len) < addr)
> -			break;
> -		vma = find_vma(mm, addr);
> -		BUG_ON(vma && (addr >= vma->vm_end));
> +	addr = TASK_UNMAPPED_BASE;
> +	while (addr < TASK_SIZE) {
> +		info.low_limit = addr;
> +		if (addr < SLICE_LOW_TOP) {
> +			slice = GET_LOW_SLICE_INDEX(addr);
> +			addr = (slice + 1) << SLICE_LOW_SHIFT;
> +			if (!(available.low_slices & (1u << slice)))
> +				continue;
> +		} else {
> +			slice = GET_HIGH_SLICE_INDEX(addr);
> +			addr = (slice + 1) << SLICE_HIGH_SHIFT;
> +			if (!(available.high_slices & (1u << slice)))
> +				continue;
> +		}
>  
> -		mask = slice_range_to_mask(addr, len);
> -		if (!slice_check_fit(mask, available)) {
> -			if (addr < SLICE_LOW_TOP)
> -				addr = _ALIGN_UP(addr + 1,  1ul << SLICE_LOW_SHIFT);
> -			else
> -				addr = _ALIGN_UP(addr + 1,  1ul << SLICE_HIGH_SHIFT);
> -			continue;
> + next_slice:
> +		if (addr >= TASK_SIZE)
> +			addr = TASK_SIZE;
> +		else if (addr < SLICE_LOW_TOP) {
> +			slice = GET_LOW_SLICE_INDEX(addr);
> +			if (available.low_slices & (1u << slice)) {
> +				addr = (slice + 1) << SLICE_LOW_SHIFT;
> +				goto next_slice;
> +			}
> +		} else {
> +			slice = GET_HIGH_SLICE_INDEX(addr);
> +			if (available.high_slices & (1u << slice)) {
> +				addr = (slice + 1) << SLICE_HIGH_SHIFT;
> +				goto next_slice;
> +			}
>  		}
> -		if (!vma || addr + len <= vma->vm_start)
> -			return addr;
> -		addr = vma->vm_end;
> +		info.high_limit = addr;
> +
> +		found = vm_unmapped_area(&info);
> +		if (!(found & ~PAGE_MASK))
> +			return found;
>  	}
>  
>  	return -ENOMEM;
> @@ -277,39 +297,53 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
>  					     struct slice_mask available,
>  					     int psize)
>  {
> -	struct vm_area_struct *vma;
> -	unsigned long addr;
> -	struct slice_mask mask;
>  	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
> +	unsigned long addr, found, slice;
> +	struct vm_unmapped_area_info info;
>  
> -	addr = mm->mmap_base;
> -	while (addr > len) {
> -		/* Go down by chunk size */
> -		addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
> +	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
> +	info.length = len;
> +	info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
> +	info.align_offset = 0;
>  
> -		/* Check for hit with different page size */
> -		mask = slice_range_to_mask(addr, len);
> -		if (!slice_check_fit(mask, available)) {
> -			if (addr < SLICE_LOW_TOP)
> -				addr = _ALIGN_DOWN(addr, 1ul << SLICE_LOW_SHIFT);
> -			else if (addr < (1ul << SLICE_HIGH_SHIFT))
> -				addr = SLICE_LOW_TOP;
> -			else
> -				addr = _ALIGN_DOWN(addr, 1ul << SLICE_HIGH_SHIFT);
> -			continue;
> +	addr = mm->mmap_base;
> +	while (addr > PAGE_SIZE) {
> +		info.high_limit = addr;
> +                if (addr < SLICE_LOW_TOP) {
> +			slice = GET_LOW_SLICE_INDEX(addr - 1);
> +			addr = slice << SLICE_LOW_SHIFT;
> +			if (!(available.low_slices & (1u << slice)))
> +				continue;
> +		} else {
> +			slice = GET_HIGH_SLICE_INDEX(addr - 1);
> +			addr = slice ? (slice << SLICE_HIGH_SHIFT) :
> +								SLICE_LOW_TOP;
> +			if (!(available.high_slices & (1u << slice)))
> +				continue;
>  		}
>  
> -		/*
> -		 * Lookup failure means no vma is above this address,
> -		 * else if new region fits below vma->vm_start,
> -		 * return with success:
> -		 */
> -		vma = find_vma(mm, addr);
> -		if (!vma || (addr + len) <= vma->vm_start)
> -			return addr;
> + next_slice:
> +		if (addr < PAGE_SIZE)
> +			addr = PAGE_SIZE;
> +		else if (addr < SLICE_LOW_TOP) {
> +			slice = GET_LOW_SLICE_INDEX(addr - 1);
> +			if (available.low_slices & (1u << slice)) {
> +				addr = slice << SLICE_LOW_SHIFT;
> +				goto next_slice;
> +			}
> +		} else {
> +			slice = GET_HIGH_SLICE_INDEX(addr - 1);
> +			if (available.high_slices & (1u << slice)) {
> +				addr = slice ? (slice << SLICE_HIGH_SHIFT) :
> +								SLICE_LOW_TOP;
> +				goto next_slice;
> +			}
> +		}
> +		info.low_limit = addr;
>  
> -		/* try just below the current vma->vm_start */
> -		addr = vma->vm_start;
> +		found = vm_unmapped_area(&info);
> +		if (!(found & ~PAGE_MASK))
> +			return found;
>  	}
>  
>  	/*

^ permalink raw reply

* Re: [PATCH 0/8] vm_unmapped_area: finish the mission
From: Michel Lespinasse @ 2013-01-09  1:32 UTC (permalink / raw)
  To: Rik van Riel, Benjamin Herrenschmidt, James E.J. Bottomley,
	Matt Turner, David Howells, Tony Luck
  Cc: linux-ia64, linux-parisc, linux-kernel, linux-mm, linux-alpha,
	Andrew Morton, linuxppc-dev
In-Reply-To: <1357694895-520-1-git-send-email-walken@google.com>

Whoops, I was supposed to find a more appropriate subject line before
sending this :]

On Tue, Jan 8, 2013 at 5:28 PM, Michel Lespinasse <walken@google.com> wrote:
> These patches, which apply on top of v3.8-rc kernels, are to complete the
> VMA gap finding code I introduced (following Rik's initial proposal) in
> v3.8-rc1.

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

^ permalink raw reply

* [PATCH 8/8] mm: remove free_area_cache
From: Michel Lespinasse @ 2013-01-09  1:28 UTC (permalink / raw)
  To: Rik van Riel, Benjamin Herrenschmidt, James E.J. Bottomley,
	Matt Turner, David Howells, Tony Luck
  Cc: linux-ia64, linux-parisc, linux-kernel, linux-mm, linux-alpha,
	Andrew Morton, linuxppc-dev
In-Reply-To: <1357694895-520-1-git-send-email-walken@google.com>

Since all architectures have been converted to use vm_unmapped_area(),
there is no remaining use for the free_area_cache.

Signed-off-by: Michel Lespinasse <walken@google.com>

---
 arch/arm/mm/mmap.c               |    2 --
 arch/arm64/mm/mmap.c             |    2 --
 arch/mips/mm/mmap.c              |    2 --
 arch/powerpc/mm/mmap_64.c        |    2 --
 arch/s390/mm/mmap.c              |    4 ----
 arch/sparc/kernel/sys_sparc_64.c |    2 --
 arch/tile/mm/mmap.c              |    2 --
 arch/x86/ia32/ia32_aout.c        |    2 --
 arch/x86/mm/mmap.c               |    2 --
 fs/binfmt_aout.c                 |    2 --
 fs/binfmt_elf.c                  |    2 --
 include/linux/mm_types.h         |    3 ---
 include/linux/sched.h            |    2 --
 kernel/fork.c                    |    4 ----
 mm/mmap.c                        |   28 ----------------------------
 mm/nommu.c                       |    4 ----
 mm/util.c                        |    1 -
 17 files changed, 0 insertions(+), 66 deletions(-)

diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 10062ceadd1c..0c6356255fe3 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -181,11 +181,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	if (mmap_is_legacy()) {
 		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
 		mm->get_unmapped_area = arch_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base(random_factor);
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
 
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 7c7be7855638..8ed6cb1a900f 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -90,11 +90,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	if (mmap_is_legacy()) {
 		mm->mmap_base = TASK_UNMAPPED_BASE;
 		mm->get_unmapped_area = arch_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
 EXPORT_SYMBOL_GPL(arch_pick_mmap_layout);
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index d9be7540a6be..f4e63c29d044 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -158,11 +158,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	if (mmap_is_legacy()) {
 		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
 		mm->get_unmapped_area = arch_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base(random_factor);
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
 
diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap_64.c
index 67a42ed0d2fc..cb8bdbe4972f 100644
--- a/arch/powerpc/mm/mmap_64.c
+++ b/arch/powerpc/mm/mmap_64.c
@@ -92,10 +92,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	if (mmap_is_legacy()) {
 		mm->mmap_base = TASK_UNMAPPED_BASE;
 		mm->get_unmapped_area = arch_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index c59a5efa58b1..f2a462625c9e 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -91,11 +91,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	if (mmap_is_legacy()) {
 		mm->mmap_base = TASK_UNMAPPED_BASE;
 		mm->get_unmapped_area = arch_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
 
@@ -173,11 +171,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	if (mmap_is_legacy()) {
 		mm->mmap_base = TASK_UNMAPPED_BASE;
 		mm->get_unmapped_area = s390_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = s390_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
 
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 708bc29d36a8..f3c169f9d3a1 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -290,7 +290,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	    sysctl_legacy_va_layout) {
 		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
 		mm->get_unmapped_area = arch_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
 	} else {
 		/* We know it's 32-bit */
 		unsigned long task_size = STACK_TOP32;
@@ -302,7 +301,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 
 		mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor);
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
 
diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c
index f96f4cec602a..d67d91ebf63e 100644
--- a/arch/tile/mm/mmap.c
+++ b/arch/tile/mm/mmap.c
@@ -66,10 +66,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) {
 		mm->mmap_base = TASK_UNMAPPED_BASE;
 		mm->get_unmapped_area = arch_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base(mm);
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index a703af19c281..3b3558577642 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -309,8 +309,6 @@ static int load_aout_binary(struct linux_binprm *bprm)
 		(current->mm->start_data = N_DATADDR(ex));
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
-	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
-	current->mm->cached_hole_size = 0;
 
 	retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
 	if (retval < 0) {
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 845df6835f9f..62c29a5bfe26 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -115,10 +115,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	if (mmap_is_legacy()) {
 		mm->mmap_base = mmap_legacy_base();
 		mm->get_unmapped_area = arch_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 6043567b95c2..692e75ca6415 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -256,8 +256,6 @@ static int load_aout_binary(struct linux_binprm * bprm)
 		(current->mm->start_data = N_DATADDR(ex));
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
-	current->mm->free_area_cache = current->mm->mmap_base;
-	current->mm->cached_hole_size = 0;
 
 	retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
 	if (retval < 0) {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 0c42cdbabecf..e2087dea9c1e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -730,8 +730,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
 
 	/* Do this so that we can load the interpreter, if need be.  We will
 	   change some of these later */
-	current->mm->free_area_cache = current->mm->mmap_base;
-	current->mm->cached_hole_size = 0;
 	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
 				 executable_stack);
 	if (retval < 0) {
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f8f5162a3571..e50eb047ea8a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -329,12 +329,9 @@ struct mm_struct {
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
-	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
 #endif
 	unsigned long mmap_base;		/* base of mmap area */
 	unsigned long task_size;		/* size of task vm space */
-	unsigned long cached_hole_size; 	/* if non-zero, the largest hole below free_area_cache */
-	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
 	unsigned long highest_vm_end;		/* highest vma end address */
 	pgd_t * pgd;
 	atomic_t mm_users;			/* How many users with user space? */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 206bb089c06b..fa7e0a60ebe9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -366,8 +366,6 @@ extern unsigned long
 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 			  unsigned long len, unsigned long pgoff,
 			  unsigned long flags);
-extern void arch_unmap_area(struct mm_struct *, unsigned long);
-extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 #else
 static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index a31b823b3c2d..bdf61755ef4a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -364,8 +364,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
-	mm->free_area_cache = oldmm->mmap_base;
-	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
 	cpumask_clear(mm_cpumask(mm));
 	mm->mm_rb = RB_ROOT;
@@ -539,8 +537,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	mm->nr_ptes = 0;
 	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
 	spin_lock_init(&mm->page_table_lock);
-	mm->free_area_cache = TASK_UNMAPPED_BASE;
-	mm->cached_hole_size = ~0UL;
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index f54b235f29a9..532f447879d4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1800,15 +1800,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 }
 #endif	
 
-void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
-{
-	/*
-	 * Is this a new hole at the lowest possible address?
-	 */
-	if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
-		mm->free_area_cache = addr;
-}
-
 /*
  * This mmap-allocator allocates new areas top-down from below the
  * stack's low limit (the base):
@@ -1865,19 +1856,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 }
 #endif
 
-void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
-{
-	/*
-	 * Is this a new hole at the highest possible address?
-	 */
-	if (addr > mm->free_area_cache)
-		mm->free_area_cache = addr;
-
-	/* dont allow allocations above current base */
-	if (mm->free_area_cache > mm->mmap_base)
-		mm->free_area_cache = mm->mmap_base;
-}
-
 unsigned long
 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 		unsigned long pgoff, unsigned long flags)
@@ -2276,7 +2254,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct vm_area_struct **insertion_point;
 	struct vm_area_struct *tail_vma = NULL;
-	unsigned long addr;
 
 	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 	vma->vm_prev = NULL;
@@ -2293,11 +2270,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 	} else
 		mm->highest_vm_end = prev ? prev->vm_end : 0;
 	tail_vma->vm_next = NULL;
-	if (mm->unmap_area == arch_unmap_area)
-		addr = prev ? prev->vm_end : mm->mmap_base;
-	else
-		addr = vma ?  vma->vm_start : mm->mmap_base;
-	mm->unmap_area(mm, addr);
 	mm->mmap_cache = NULL;		/* Kill the cache. */
 }
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 79c3cac87afa..b5535ff2f9d1 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1852,10 +1852,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
 	return -ENOMEM;
 }
 
-void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
-{
-}
-
 void unmap_mapping_range(struct address_space *mapping,
 			 loff_t const holebegin, loff_t const holelen,
 			 int even_cows)
diff --git a/mm/util.c b/mm/util.c
index c55e26b17d93..4c19aa6a1b43 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -293,7 +293,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 {
 	mm->mmap_base = TASK_UNMAPPED_BASE;
 	mm->get_unmapped_area = arch_get_unmapped_area;
-	mm->unmap_area = arch_unmap_area;
 }
 #endif
 
-- 
1.7.7.3

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox