All of lore.kernel.org
 help / color / mirror / Atom feed
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: Dave Hansen <haveblue@us.ibm.com>
Cc: Andrew Morton <akpm@osdl.org>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	lhms <lhms-devel@lists.sourceforge.net>
Subject: Re: [Lhms-devel] Re: 2.6.14-mm2
Date: Thu, 17 Nov 2005 18:16:17 +0900	[thread overview]
Message-ID: <437C4A61.908@jp.fujitsu.com> (raw)
In-Reply-To: <1132158704.19290.3.camel@localhost>

Dave Hansen wrote:
> Hmmm.  I _think_ you're just trying to do some things at runtime that I
> didn't intend.  In the patch I pointed to in the last mail, look at what
> I did in hot_add_zone_init().  It does some of what
> free_area_init_core() does, but only the most minimal bits.  Basically:
> 
>        zone_wait_table_init(zone, size_pages);
>        init_currently_empty_zone(zone, phys_start_pfn, size_pages);
>        zone_pcp_init(zone);
> 
> Your way may also be valid, but I broke out init_currently_empty_zone()
> for a reason, and I think this was it.  I don't think we want to be
> calling free_area_init_core() itself at runtime.
> 
Thank you, Dave.

My final patch is below, but I am attaching it just for sharing, not for upstream.

Unless you are doing some unusual emulation of memory hot-add, this patch is not needed for now.

I use this with custom-dsdt (custom acpi information created by hand) and
other emulation code.

This is a session log :)
--
> [kamezawa@aworks ~]$ cat /proc/meminfo
> MemTotal:       501772 kB
> MemFree:        415576 kB
> HighTotal:           0 kB
> HighFree:            0 kB
There is no Highmem.
> [root@aworks kamezawa]# cat /sys/devices/system/memory/memory9/state
> offline
> [root@aworks kamezawa]# echo online > /sys/devices/system/memory/memory9/state
> [root@aworks kamezawa]# cat /sys/devices/system/memory/memory9/state
> online
> [root@aworks kamezawa]# cat /proc/meminfo
> MemTotal:       567308 kB
> MemFree:        479968 kB
> HighTotal:       65536 kB
> HighFree:        65408 kB
Highmem is available
> [root@aworks kamezawa]# cat /proc/meminfo
> MemTotal:       567308 kB
> MemFree:        475440 kB
> HighTotal:       65536 kB
> HighFree:        61128 kB
Highmem is used.

Thanks,
-- Kame
--
Linux-2.6.14-mm2's memory-hot-add cannot deal with
the case of adding a new zone.

My personal x86 environment, which has only 700 MB of memory,
has no HIGHMEM at boot time. So I cannot play with memory-hot-add
on it.

This patch enables memory-hot-add testing on a tiny machine.
With this patch, I boot the kernel with the mem= or memlimit= option
and can then add an extra 200 MB of pages.

Just for people emulating hot-add on a low-end machine ;)

Note:
This patch is cut out from Dave Hansen's -mhp tree.

-- kame <kamezawa.hiroyu@jp.fujitsu.com>

Index: linux-2.6.14-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.14-mm2.orig/mm/page_alloc.c
+++ linux-2.6.14-mm2/mm/page_alloc.c
@@ -36,6 +36,7 @@
  #include <linux/memory_hotplug.h>
  #include <linux/nodemask.h>
  #include <linux/vmalloc.h>
+#include <linux/stop_machine.h>

  #include <asm/tlbflush.h>
  #include "internal.h"
@@ -1479,7 +1480,9 @@ static int __init build_zonelists_node(p
  		BUG();
  	case ZONE_HIGHMEM:
  		zone = pgdat->node_zones + ZONE_HIGHMEM;
-		if (zone->present_pages) {
+		/* When hot-add, present page is 0 at this point.
+                   so check spanned_pages instead of present_pages */
+		if (zone->spanned_pages) {
  #ifndef CONFIG_HIGHMEM
  			BUG();
  #endif
@@ -1952,21 +1955,26 @@ void __init setup_per_cpu_pageset()
  #endif

  static __devinit
-void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+void zone_wait_table_init(struct zone *zone,
+			  unsigned long zone_size_pages, int hotadd)
  {
  	int i;
  	struct pglist_data *pgdat = zone->zone_pgdat;
-
+	int allocsize;
  	/*
  	 * The per-page waitqueue mechanism uses hashed waitqueues
  	 * per zone.
  	 */
+	if (hotadd && (zone_size_pages == PAGES_PER_SECTION))
+		zone_size_pages = PAGES_PER_SECTION << 2;
  	zone->wait_table_size = wait_table_size(zone_size_pages);
  	zone->wait_table_bits =	wait_table_bits(zone->wait_table_size);
-	zone->wait_table = (wait_queue_head_t *)
-		alloc_bootmem_node(pgdat, zone->wait_table_size
-					* sizeof(wait_queue_head_t));
-
+	allocsize = zone->wait_table_size * sizeof(wait_queue_head_t);
+	if (hotadd)
+		zone->wait_table = kmalloc(allocsize, GFP_KERNEL);
+	else
+		zone->wait_table = (wait_queue_head_t *)
+			alloc_bootmem_node(pgdat, allocsize);
  	for(i = 0; i < zone->wait_table_size; ++i)
  		init_waitqueue_head(zone->wait_table + i);
  }
@@ -1994,7 +2002,6 @@ static __devinit void init_currently_emp
  {
  	struct pglist_data *pgdat = zone->zone_pgdat;

-	zone_wait_table_init(zone, size);
  	pgdat->nr_zones = zone_idx(zone) + 1;

  	zone->zone_mem_map = pfn_to_page(zone_start_pfn);
@@ -2003,6 +2010,7 @@ static __devinit void init_currently_emp
  	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);

  	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+	zone->spanned_pages = size;
  }

  /*
@@ -2022,7 +2030,7 @@ static void __init free_area_init_core(s
  	pgdat->nr_zones = 0;
  	init_waitqueue_head(&pgdat->kswapd_wait);
  	pgdat->kswapd_max_order = 0;
-	
+
  	for (j = 0; j < MAX_NR_ZONES; j++) {
  		struct zone *zone = pgdat->node_zones + j;
  		unsigned long size, realsize;
@@ -2054,10 +2062,12 @@ static void __init free_area_init_core(s
  		zone->nr_active = 0;
  		zone->nr_inactive = 0;
  		atomic_set(&zone->reclaim_in_progress, 0);
+
  		if (!size)
  			continue;

  		zonetable_add(zone, nid, j, zone_start_pfn, size);
+		zone_wait_table_init(zone, size, 0);
  		init_currently_empty_zone(zone, zone_start_pfn, size);
  		zone_start_pfn += size;
  	}
@@ -2669,3 +2679,49 @@ void *__init alloc_large_system_hash(con

  	return table;
  }
+
+static inline int zone_previously_initialized(struct zone *zone)
+{
+	if (zone->wait_table_size)
+		return 1;
+
+	return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int __build_zonelists(void *__pgdat)
+{
+	pg_data_t *pgdat = __pgdat;
+	build_zonelists(pgdat);
+	return 0;
+}
+DEFINE_SPINLOCK(zone_init_lock);
+int hot_add_zone_init(struct zone *zone, unsigned long phys_start_pfn, unsigned long size_pages)
+{
+	int ret = 0;
+	unsigned long flags;
+	spin_lock_irqsave(&zone_init_lock,flags);
+	if (zone_previously_initialized(zone)) {
+		ret = -EEXIST;
+		goto out;
+	}
+
+	zone_wait_table_init(zone, size_pages, 1);
+	printk("hot add zone init %lx %lx.....\n",phys_start_pfn, size_pages);
+	init_currently_empty_zone(zone, phys_start_pfn, size_pages);
+	zone_pcp_init(zone);
+
+	/*
+	 * This is an awfully blunt way to do this.  But, the
+	 * zonelists are accessed many times over large areas
+	 * of performance-critical code in the allocator.
+	 * That makes it very hard to get a conventional lock
+	 * to work.  Think of this as a rw lock with a huge
+	 * write cost.
+	 */
+	stop_machine_run(__build_zonelists, zone->zone_pgdat, NR_CPUS);
+out:
+	spin_unlock_irqrestore(&zone_init_lock, flags);
+	return ret;
+}
+#endif
Index: linux-2.6.14-mm2/mm/memory_hotplug.c
===================================================================
--- linux-2.6.14-mm2.orig/mm/memory_hotplug.c
+++ linux-2.6.14-mm2/mm/memory_hotplug.c
@@ -48,6 +48,8 @@ static int __add_section(struct zone *zo

  	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

+	hot_add_zone_init(zone, phys_start_pfn, PAGES_PER_SECTION);
+
  	if (ret < 0)
  		return ret;




      parent reply	other threads:[~2005-11-17  9:16 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-11-11  4:35 2.6.14-mm2 Andrew Morton
2005-11-11  5:23 ` 2.6.14-mm2: loop detected in depmod Brice Goglin
2005-11-11  7:52   ` Antonino A. Daplas
2005-11-12  3:41     ` Brice Goglin
2005-11-11  5:49 ` 2.6.14-mm2 Reuben Farrelly
2005-11-11  6:07   ` 2.6.14-mm2 Andrew Morton
2005-11-11  8:28     ` 2.6.14-mm2 Reuben Farrelly
2005-11-11  8:55       ` 2.6.14-mm2 Andrew Morton
2005-11-11 13:28         ` 2.6.14-mm2 Reuben Farrelly
2005-11-11  8:54 ` 2.6.14-mm2 J.A. Magallon
2005-11-11 12:33 ` 2.6.14-mm2 Felipe Alfaro Solana
2005-11-11 16:40 ` [-mm patch] fs/ocfs2/file.c: make ocfs2_extend_allocation() static Adrian Bunk
2005-11-11 17:09 ` [-mm PATCH] slob: add kmem_set_shrinker Yoichi Yuasa
2005-11-11 19:14 ` 2.6.14-mm2 Badari Pulavarty
2005-11-11 19:21   ` 2.6.14-mm2 Andrew Morton
2005-11-11 19:32     ` 2.6.14-mm2 Badari Pulavarty
2005-11-11 22:32 ` 2.6.14-mm2 Michal Piotrowski
2005-11-11 22:37   ` 2.6.14-mm2 Michal Piotrowski
2005-11-11 23:01   ` 2.6.14-mm2 Andrew Morton
2005-11-11 23:28     ` [PATCH] nvidiafb: Fix bug in nvidiafb_pan_display Antonino A. Daplas
2005-11-11 23:38       ` Benjamin Herrenschmidt
2005-11-11 23:55         ` Antonino A. Daplas
2005-11-11 23:54       ` Michal Piotrowski
2005-11-11 23:30     ` 2.6.14-mm2 Michal Piotrowski
2005-11-12  0:45       ` 2.6.14-mm2 Antonino A. Daplas
2005-11-16  9:04 ` 2.6.14-mm2 KAMEZAWA Hiroyuki
2005-11-16 12:56 ` 2.6.14-mm2 KAMEZAWA Hiroyuki
2005-11-16 13:17   ` [Lhms-devel] 2.6.14-mm2 Dave Hansen
2005-11-16 16:02     ` Kamezawa Hiroyuki
2005-11-16 16:31       ` Dave Hansen
2005-11-16 16:39         ` Kamezawa Hiroyuki
2005-11-17  0:10         ` KAMEZAWA Hiroyuki
2005-11-17  9:16         ` KAMEZAWA Hiroyuki [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=437C4A61.908@jp.fujitsu.com \
    --to=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=akpm@osdl.org \
    --cc=haveblue@us.ibm.com \
    --cc=lhms-devel@lists.sourceforge.net \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.