public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: Dave Hansen <haveblue@us.ibm.com>
Cc: Andrew Morton <akpm@osdl.org>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	lhms <lhms-devel@lists.sourceforge.net>
Subject: Re: [Lhms-devel] Re: 2.6.14-mm2
Date: Thu, 17 Nov 2005 18:16:17 +0900	[thread overview]
Message-ID: <437C4A61.908@jp.fujitsu.com> (raw)
In-Reply-To: <1132158704.19290.3.camel@localhost>

Dave Hansen wrote:
> Hmmm.  I _think_ you're just trying to do some things at runtime that I
> didn't intend.  In the patch I pointed to in the last mail, look at what
> I did in hot_add_zone_init().  It does some of what
> free_area_init_core() does, but only the most minimal bits.  Basically:
> 
>        zone_wait_table_init(zone, size_pages);
>        init_currently_empty_zone(zone, phys_start_pfn, size_pages);
>        zone_pcp_init(zone);
> 
> Your way may also be valid, but I broke out init_currently_empty_zone()
> for a reason, and I think this was it.  I don't think we want to be
> calling free_area_init_core() itself at runtime.
> 
Thank you, Dave.

My final patch is below. but I attach this just for sharing not for upstream.

Without some strange emulation of memory hot-add, this patch is not needed now.

I use this with custom-dsdt (custom acpi information created by hand) and
other emulation code.

This is a session log :)
--
> [kamezawa@aworks ~]$ cat /proc/meminfo
> MemTotal:       501772 kB
> MemFree:        415576 kB
> HighTotal:           0 kB
> HighFree:            0 kB
There is no Highmem.
> [root@aworks kamezawa]# cat /sys/devices/system/memory/memory9/state
> offline
> [root@aworks kamezawa]# echo online > /sys/devices/system/memory/memory9/state
> [root@aworks kamezawa]# cat /sys/devices/system/memory/memory9/state
> online
> [root@aworks kamezawa]# cat /proc/meminfo
> MemTotal:       567308 kB
> MemFree:        479968 kB
> HighTotal:       65536 kB
> HighFree:        65408 kB
Highmem is available
> [root@aworks kamezawa]# cat /proc/meminfo
> MemTotal:       567308 kB
> MemFree:        475440 kB
> HighTotal:       65536 kB
> HighFree:        61128 kB
Highmem is used.

Thanks,
-- Kame
--
Linux-2.6.14-mm2's memory-hot-add cannot deal with
adding new zone case.

My personal x86 environment, which has only 700M bytes memory,
has no HIGHMEM at boot time. So I cannot play with memory-hot-add
with it.

This patch enables memory-hot-add test on tiny machine.
With this patch, I boot the kernel with mem= or memlimit= option
and can add extra 200M bytes pages.

Just for emulation people with poor machine ;)

Note:
This patch is cut out from Dave Hansen's -mhp tree.

-- kame <kamezawa.hiroyu@jp.fujitsu.com>

Index: linux-2.6.14-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.14-mm2.orig/mm/page_alloc.c
+++ linux-2.6.14-mm2/mm/page_alloc.c
@@ -36,6 +36,7 @@
  #include <linux/memory_hotplug.h>
  #include <linux/nodemask.h>
  #include <linux/vmalloc.h>
+#include <linux/stop_machine.h>

  #include <asm/tlbflush.h>
  #include "internal.h"
@@ -1479,7 +1480,9 @@ static int __init build_zonelists_node(p
  		BUG();
  	case ZONE_HIGHMEM:
  		zone = pgdat->node_zones + ZONE_HIGHMEM;
-		if (zone->present_pages) {
+		/* When hot-add, present page is 0 at this point.
+                   so check spanned_pages instead of present_pages */
+		if (zone->spanned_pages) {
  #ifndef CONFIG_HIGHMEM
  			BUG();
  #endif
@@ -1952,21 +1955,26 @@ void __init setup_per_cpu_pageset()
  #endif

  static __devinit
-void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+void zone_wait_table_init(struct zone *zone,
+			  unsigned long zone_size_pages, int hotadd)
  {
  	int i;
  	struct pglist_data *pgdat = zone->zone_pgdat;
-
+	int allocsize;
  	/*
  	 * The per-page waitqueue mechanism uses hashed waitqueues
  	 * per zone.
  	 */
+	if (hotadd && (zone_size_pages == PAGES_PER_SECTION))
+		zone_size_pages = PAGES_PER_SECTION << 2;
  	zone->wait_table_size = wait_table_size(zone_size_pages);
  	zone->wait_table_bits =	wait_table_bits(zone->wait_table_size);
-	zone->wait_table = (wait_queue_head_t *)
-		alloc_bootmem_node(pgdat, zone->wait_table_size
-					* sizeof(wait_queue_head_t));
-
+	allocsize = zone->wait_table_size * sizeof(wait_queue_head_t);
+	if (hotadd)
+		zone->wait_table = kmalloc(allocsize, GFP_KERNEL);
+	else
+		zone->wait_table = (wait_queue_head_t *)
+			alloc_bootmem_node(pgdat, allocsize);
  	for(i = 0; i < zone->wait_table_size; ++i)
  		init_waitqueue_head(zone->wait_table + i);
  }
@@ -1994,7 +2002,6 @@ static __devinit void init_currently_emp
  {
  	struct pglist_data *pgdat = zone->zone_pgdat;

-	zone_wait_table_init(zone, size);
  	pgdat->nr_zones = zone_idx(zone) + 1;

  	zone->zone_mem_map = pfn_to_page(zone_start_pfn);
@@ -2003,6 +2010,7 @@ static __devinit void init_currently_emp
  	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);

  	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+	zone->spanned_pages = size;
  }

  /*
@@ -2022,7 +2030,7 @@ static void __init free_area_init_core(s
  	pgdat->nr_zones = 0;
  	init_waitqueue_head(&pgdat->kswapd_wait);
  	pgdat->kswapd_max_order = 0;
-	
+
  	for (j = 0; j < MAX_NR_ZONES; j++) {
  		struct zone *zone = pgdat->node_zones + j;
  		unsigned long size, realsize;
@@ -2054,10 +2062,12 @@ static void __init free_area_init_core(s
  		zone->nr_active = 0;
  		zone->nr_inactive = 0;
  		atomic_set(&zone->reclaim_in_progress, 0);
+
  		if (!size)
  			continue;

  		zonetable_add(zone, nid, j, zone_start_pfn, size);
+		zone_wait_table_init(zone, size, 0);
  		init_currently_empty_zone(zone, zone_start_pfn, size);
  		zone_start_pfn += size;
  	}
@@ -2669,3 +2679,49 @@ void *__init alloc_large_system_hash(con

  	return table;
  }
+
+static inline int zone_previously_initialized(struct zone *zone)
+{
+	if (zone->wait_table_size)
+		return 1;
+
+	return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int __build_zonelists(void *__pgdat)
+{
+	pg_data_t *pgdat = __pgdat;
+	build_zonelists(pgdat);
+	return 0;
+}
+DEFINE_SPINLOCK(zone_init_lock);
+int hot_add_zone_init(struct zone *zone, unsigned long phys_start_pfn, unsigned long size_pages)
+{
+	int ret = 0;
+	unsigned long flags;
+	spin_lock_irqsave(&zone_init_lock,flags);
+	if (zone_previously_initialized(zone)) {
+		ret = -EEXIST;
+		goto out;
+	}
+
+	zone_wait_table_init(zone, size_pages, 1);
+	printk("hot add zone init %lx %lx.....\n",phys_start_pfn, size_pages);
+	init_currently_empty_zone(zone, phys_start_pfn, size_pages);
+	zone_pcp_init(zone);
+
+	/*
+	 * This is an awfully blunt way to do this.  But, the
+	 * zonelists are accessed many times over large areas
+	 * of performance-critical code in the allocator.
+	 * That makes it very hard to get a conventional lock
+	 * to work.  Think of this as a rw lock with a huge
+	 * write cost.
+	 */
+	stop_machine_run(__build_zonelists, zone->zone_pgdat, NR_CPUS);
+out:
+	spin_unlock_irqrestore(&zone_init_lock, flags);
+	return ret;
+}
+#endif
Index: linux-2.6.14-mm2/mm/memory_hotplug.c
===================================================================
--- linux-2.6.14-mm2.orig/mm/memory_hotplug.c
+++ linux-2.6.14-mm2/mm/memory_hotplug.c
@@ -48,6 +48,8 @@ static int __add_section(struct zone *zo

  	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

+	hot_add_zone_init(zone, phys_start_pfn, PAGES_PER_SECTION);
+
  	if (ret < 0)
  		return ret;




      parent reply	other threads:[~2005-11-17  9:16 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-11-11  4:35 2.6.14-mm2 Andrew Morton
2005-11-11  5:23 ` 2.6.14-mm2: loop detected in depmod Brice Goglin
2005-11-11  7:52   ` Antonino A. Daplas
2005-11-12  3:41     ` Brice Goglin
2005-11-11  5:49 ` 2.6.14-mm2 Reuben Farrelly
2005-11-11  6:07   ` 2.6.14-mm2 Andrew Morton
2005-11-11  8:28     ` 2.6.14-mm2 Reuben Farrelly
2005-11-11  8:55       ` 2.6.14-mm2 Andrew Morton
2005-11-11 13:28         ` 2.6.14-mm2 Reuben Farrelly
2005-11-11  8:54 ` 2.6.14-mm2 J.A. Magallon
2005-11-11 12:33 ` 2.6.14-mm2 Felipe Alfaro Solana
2005-11-11 16:40 ` [-mm patch] fs/ocfs2/file.c: make ocfs2_extend_allocation() static Adrian Bunk
2005-11-11 17:09 ` [-mm PATCH] slob: add kmem_set_shrinker Yoichi Yuasa
2005-11-11 19:14 ` 2.6.14-mm2 Badari Pulavarty
2005-11-11 19:21   ` 2.6.14-mm2 Andrew Morton
2005-11-11 19:32     ` 2.6.14-mm2 Badari Pulavarty
2005-11-11 22:32 ` 2.6.14-mm2 Michal Piotrowski
2005-11-11 22:37   ` 2.6.14-mm2 Michal Piotrowski
2005-11-11 23:01   ` 2.6.14-mm2 Andrew Morton
2005-11-11 23:28     ` [PATCH] nvidiafb: Fix bug in nvidiafb_pan_display Antonino A. Daplas
2005-11-11 23:38       ` Benjamin Herrenschmidt
2005-11-11 23:55         ` Antonino A. Daplas
2005-11-11 23:54       ` Michal Piotrowski
2005-11-11 23:30     ` 2.6.14-mm2 Michal Piotrowski
2005-11-12  0:45       ` 2.6.14-mm2 Antonino A. Daplas
2005-11-16  9:04 ` 2.6.14-mm2 KAMEZAWA Hiroyuki
2005-11-16 12:56 ` 2.6.14-mm2 KAMEZAWA Hiroyuki
2005-11-16 13:17   ` [Lhms-devel] 2.6.14-mm2 Dave Hansen
2005-11-16 16:02     ` Kamezawa Hiroyuki
2005-11-16 16:31       ` Dave Hansen
2005-11-16 16:39         ` Kamezawa Hiroyuki
2005-11-17  0:10         ` KAMEZAWA Hiroyuki
2005-11-17  9:16         ` KAMEZAWA Hiroyuki [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=437C4A61.908@jp.fujitsu.com \
    --to=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=akpm@osdl.org \
    --cc=haveblue@us.ibm.com \
    --cc=lhms-devel@lists.sourceforge.net \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox