* Re: [PATCH] convert powermac ide blink to new led infrastructure
From: Johannes Berg @ 2006-05-01 21:19 UTC (permalink / raw)
To: Benjamin Herrenschmidt; +Cc: linuxppc-dev list
In-Reply-To: <1146476158.30710.51.camel@localhost.localdomain>
On Mon, 2006-05-01 at 19:35 +1000, Benjamin Herrenschmidt wrote:
> No need to schedule work, it should work fine to re-queue
Untested because I haven't rebooted. Does this look good to you?
I don't see how I could get away without locking, especially with
handling the outstanding requests (though losing one there might not be
too bad).
johannes
Index: wireless-dev/drivers/ide/Kconfig
===================================================================
--- wireless-dev.orig/drivers/ide/Kconfig 2006-04-30 22:17:49.201535187 +0200
+++ wireless-dev/drivers/ide/Kconfig 2006-04-30 22:17:51.911535187 +0200
@@ -773,13 +773,6 @@ config BLK_DEV_IDEDMA_PMAC
to transfer data to and from memory. Saying Y is safe and improves
performance.
-config BLK_DEV_IDE_PMAC_BLINK
- bool "Blink laptop LED on drive activity"
- depends on BLK_DEV_IDE_PMAC && ADB_PMU
- help
- This option enables the use of the sleep LED as a hard drive
- activity LED.
-
config BLK_DEV_IDE_SWARM
tristate "IDE for Sibyte evaluation boards"
depends on SIBYTE_SB1xxx_SOC
Index: wireless-dev/drivers/ide/ppc/pmac.c
===================================================================
--- wireless-dev.orig/drivers/ide/ppc/pmac.c 2006-04-30 22:17:49.221535187 +0200
+++ wireless-dev/drivers/ide/ppc/pmac.c 2006-04-30 22:17:51.911535187 +0200
@@ -421,107 +421,6 @@ static void pmac_ide_kauai_selectproc(id
#endif /* CONFIG_BLK_DEV_IDEDMA_PMAC */
/*
- * Below is the code for blinking the laptop LED along with hard
- * disk activity.
- */
-
-#ifdef CONFIG_BLK_DEV_IDE_PMAC_BLINK
-
-/* Set to 50ms minimum led-on time (also used to limit frequency
- * of requests sent to the PMU
- */
-#define PMU_HD_BLINK_TIME (HZ/50)
-
-static struct adb_request pmu_blink_on, pmu_blink_off;
-static spinlock_t pmu_blink_lock;
-static unsigned long pmu_blink_stoptime;
-static int pmu_blink_ledstate;
-static struct timer_list pmu_blink_timer;
-static int pmu_ide_blink_enabled;
-
-
-static void
-pmu_hd_blink_timeout(unsigned long data)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&pmu_blink_lock, flags);
-
- /* We may have been triggered again in a racy way, check
- * that we really want to switch it off
- */
- if (time_after(pmu_blink_stoptime, jiffies))
- goto done;
-
- /* Previous req. not complete, try 100ms more */
- if (pmu_blink_off.complete == 0)
- mod_timer(&pmu_blink_timer, jiffies + PMU_HD_BLINK_TIME);
- else if (pmu_blink_ledstate) {
- pmu_request(&pmu_blink_off, NULL, 4, 0xee, 4, 0, 0);
- pmu_blink_ledstate = 0;
- }
-done:
- spin_unlock_irqrestore(&pmu_blink_lock, flags);
-}
-
-static void
-pmu_hd_kick_blink(void *data, int rw)
-{
- unsigned long flags;
-
- pmu_blink_stoptime = jiffies + PMU_HD_BLINK_TIME;
- wmb();
- mod_timer(&pmu_blink_timer, pmu_blink_stoptime);
- /* Fast path when LED is already ON */
- if (pmu_blink_ledstate == 1)
- return;
- spin_lock_irqsave(&pmu_blink_lock, flags);
- if (pmu_blink_on.complete && !pmu_blink_ledstate) {
- pmu_request(&pmu_blink_on, NULL, 4, 0xee, 4, 0, 1);
- pmu_blink_ledstate = 1;
- }
- spin_unlock_irqrestore(&pmu_blink_lock, flags);
-}
-
-static int
-pmu_hd_blink_init(void)
-{
- struct device_node *dt;
- const char *model;
-
- /* Currently, I only enable this feature on KeyLargo based laptops,
- * older laptops may support it (at least heathrow/paddington) but
- * I don't feel like loading those venerable old machines with so
- * much additional interrupt & PMU activity...
- */
- if (pmu_get_model() != PMU_KEYLARGO_BASED)
- return 0;
-
- dt = of_find_node_by_path("/");
- if (dt == NULL)
- return 0;
- model = (const char *)get_property(dt, "model", NULL);
- if (model == NULL)
- return 0;
- if (strncmp(model, "PowerBook", strlen("PowerBook")) != 0 &&
- strncmp(model, "iBook", strlen("iBook")) != 0) {
- of_node_put(dt);
- return 0;
- }
- of_node_put(dt);
-
- pmu_blink_on.complete = 1;
- pmu_blink_off.complete = 1;
- spin_lock_init(&pmu_blink_lock);
- init_timer(&pmu_blink_timer);
- pmu_blink_timer.function = pmu_hd_blink_timeout;
-
- return 1;
-}
-
-#endif /* CONFIG_BLK_DEV_IDE_PMAC_BLINK */
-
-/*
* N.B. this can't be an initfunc, because the media-bay task can
* call ide_[un]register at any time.
*/
@@ -1190,23 +1089,6 @@ pmac_ide_do_suspend(ide_hwif_t *hwif)
pmif->timings[0] = 0;
pmif->timings[1] = 0;
-#ifdef CONFIG_BLK_DEV_IDE_PMAC_BLINK
- /* Note: This code will be called for every hwif, thus we'll
- * try several time to stop the LED blinker timer, but that
- * should be harmless
- */
- if (pmu_ide_blink_enabled) {
- unsigned long flags;
-
- /* Make sure we don't hit the PMU blink */
- spin_lock_irqsave(&pmu_blink_lock, flags);
- if (pmu_blink_ledstate)
- del_timer(&pmu_blink_timer);
- pmu_blink_ledstate = 0;
- spin_unlock_irqrestore(&pmu_blink_lock, flags);
- }
-#endif /* CONFIG_BLK_DEV_IDE_PMAC_BLINK */
-
disable_irq(pmif->irq);
/* The media bay will handle itself just fine */
@@ -1374,13 +1256,6 @@ pmac_ide_setup_device(pmac_ide_hwif_t *p
hwif->selectproc = pmac_ide_selectproc;
hwif->speedproc = pmac_ide_tune_chipset;
-#ifdef CONFIG_BLK_DEV_IDE_PMAC_BLINK
- pmu_ide_blink_enabled = pmu_hd_blink_init();
-
- if (pmu_ide_blink_enabled)
- hwif->led_act = pmu_hd_kick_blink;
-#endif
-
printk(KERN_INFO "ide%d: Found Apple %s controller, bus ID %d%s, irq %d\n",
hwif->index, model_name[pmif->kind], pmif->aapl_bus_id,
pmif->mediabay ? " (mediabay)" : "", hwif->irq);
Index: wireless-dev/drivers/macintosh/Kconfig
===================================================================
--- wireless-dev.orig/drivers/macintosh/Kconfig 2006-04-30 22:17:49.301535187 +0200
+++ wireless-dev/drivers/macintosh/Kconfig 2006-05-01 22:39:34.951534234 +0200
@@ -78,6 +78,17 @@ config ADB_PMU
this device; you should do so if your machine is one of those
mentioned above.
+config ADB_PMU_LED
+ bool "Support for the Power/iBook front LED"
+ depends on ADB_PMU
+ select LEDS_CLASS
+ help
+ Support the front LED on Power/iBooks as a generic LED that can
+ be triggered by any of the supported triggers. To get the
+ behaviour of the old CONFIG_BLK_DEV_IDE_PMAC_BLINK, select this
+ and the ide-disk LED trigger and configure appropriately through
+ sysfs.
+
config PMAC_SMU
bool "Support for SMU based PowerMacs"
depends on PPC_PMAC64
Index: wireless-dev/drivers/macintosh/Makefile
===================================================================
--- wireless-dev.orig/drivers/macintosh/Makefile 2006-04-30 22:17:49.311535187 +0200
+++ wireless-dev/drivers/macintosh/Makefile 2006-05-01 22:36:32.871534234 +0200
@@ -12,6 +12,7 @@ obj-$(CONFIG_INPUT_ADBHID) += adbhid.o
obj-$(CONFIG_ANSLCD) += ans-lcd.o
obj-$(CONFIG_ADB_PMU) += via-pmu.o
+obj-$(CONFIG_ADB_PMU_LED) += via-pmu-led.o
obj-$(CONFIG_ADB_CUDA) += via-cuda.o
obj-$(CONFIG_PMAC_APM_EMU) += apm_emu.o
obj-$(CONFIG_PMAC_SMU) += smu.o
Index: wireless-dev/drivers/macintosh/via-pmu-led.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ wireless-dev/drivers/macintosh/via-pmu-led.c 2006-05-01 23:18:41.001534234 +0200
@@ -0,0 +1,120 @@
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/leds.h>
+#include <linux/adb.h>
+#include <linux/pmu.h>
+#include <asm/prom.h>
+
+static spinlock_t pmu_blink_lock;
+static struct adb_request pmu_blink_req;
+/* -1: no change, 0: request off, 1: request on */
+static int requested_change;
+static int sleeping;
+
+static void pmu_req_done(struct adb_request * req)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pmu_blink_lock, flags);
+ /* if someone requested a change in the meantime
+ * (we only see the last one which is fine)
+ * then apply it now */
+ if (requested_change != -1 && !sleeping)
+ pmu_request(&pmu_blink_req, NULL, 4, 0xee, 4, 0, requested_change);
+ /* reset requested change */
+ requested_change = -1;
+ spin_unlock_irqrestore(&pmu_blink_lock, flags);
+}
+
+static void pmu_led_set(struct led_classdev *led_cdev,
+ enum led_brightness brightness)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pmu_blink_lock, flags);
+ switch (brightness) {
+ case LED_OFF:
+ requested_change = 0;
+ break;
+ case LED_FULL:
+ requested_change = 1;
+ break;
+ default:
+ goto out;
+ break;
+ }
+ /* if request isn't done, then don't do anything */
+ if (pmu_blink_req.complete && !sleeping)
+ pmu_request(&pmu_blink_req, NULL, 4, 0xee, 4, 0, requested_change);
+ out:
+ spin_unlock_irqrestore(&pmu_blink_lock, flags);
+}
+
+static struct led_classdev pmu_led = {
+ .name = "pmu-front-led",
+ .brightness_set = pmu_led_set,
+};
+
+#ifdef CONFIG_PM
+static int pmu_led_sleep_call(struct pmu_sleep_notifier *self, int when)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pmu_blink_lock, flags);
+
+ switch (when) {
+ case PBOOK_SLEEP_REQUEST:
+ sleeping = 1;
+ break;
+ case PBOOK_WAKE:
+ sleeping = 0;
+ break;
+ default:
+ /* do nothing */
+ break;
+ }
+ spin_unlock_irqrestore(&pmu_blink_lock, flags);
+
+ return PBOOK_SLEEP_OK;
+}
+
+static struct pmu_sleep_notifier pmu_led_sleep_notif = {
+ .notifier_call = pmu_led_sleep_call,
+};
+#endif
+
+static __init int pmu_led_init(void)
+{
+ struct device_node *dt;
+ const char *model;
+
+ /* only do this on keylargo based models */
+ if (pmu_get_model() != PMU_KEYLARGO_BASED)
+ return -ENODEV;
+
+ dt = of_find_node_by_path("/");
+ if (dt == NULL)
+ return -ENODEV;
+ model = (const char *)get_property(dt, "model", NULL);
+ if (model == NULL)
+ return -ENODEV;
+ if (strncmp(model, "PowerBook", strlen("PowerBook")) != 0 &&
+ strncmp(model, "iBook", strlen("iBook")) != 0) {
+ of_node_put(dt);
+ /* silently ignore */
+ return 0;
+ }
+ of_node_put(dt);
+
+ spin_lock_init(&pmu_blink_lock);
+ /* no outstanding req */
+ pmu_blink_req.complete = 1;
+ pmu_blink_req.done = pmu_req_done;
+#ifdef CONFIG_PM
+ pmu_register_sleep_notifier(&pmu_led_sleep_notif);
+#endif
+ return led_classdev_register(NULL, &pmu_led);
+}
+
+late_initcall(pmu_led_init);
^ permalink raw reply
* Re: DTC/dts modifications
From: Kumar Gala @ 2006-05-01 20:28 UTC (permalink / raw)
To: Kim Phillips; +Cc: linuxppc-dev, jdl
In-Reply-To: <20060501150728.04694488.kim.phillips@freescale.com>
On May 1, 2006, at 3:07 PM, Kim Phillips wrote:
> On Mon, 1 May 2006 14:52:23 -0500
> Kumar Gala <galak@kernel.crashing.org> wrote:
>
>> [snip]
>>
>>>> Try running a current .dts through cpp today. You will get errors
>>>> like:
>>>>
>>>> oftree.dts:15:3: error: invalid preprocessing directive #address
>>>
>>>> Because of props like:
>>>>
>>>> #cpus = <1>;
>>>> #address-cells = <1>;
>>>> #size-cells = <0>;
>>>>
>>>> If these used some other symbol instead of '#' cpp will be happy
>>>> and
>>>> we can use it to create macros for us.
>>>
>>> Yeah, we're not going to be able to change those; they
>>> are "By The Book".
>>
>> By what book? It would seem to me that BNF for dtc is completely
>> under our control and if we want to change it we can. I understand
>> that there is some correspondence to Open Firmware, but it seems that
>> if its people are ok with the dts format changing that's a lot easier
>> than implementing tons of support in dtc for features that cpp
>> gives us.
>>
>> [I'm also guessing no one's really got time to go and implement these
>> features in dtc]
>>
> cpp -x assembler-with-cpp seems to not produce the above errors,
> and still honours preprocessing directives like #define. Don't
> know what else is messes with, and whether you want to add CPPFLAGS.
Cool, here's an invocation that seems to work well. Not sure what
causes linux = 1 (thus I need the -U linux). The -P flag also suppresses the
line-marker information that is normally spit out.
cpp -U linux -P -x assembler-with-cpp foo.dts
With an 8349 dts I'm using I'm able to run it through cpp then dtc and
get the exact same dtb.
- kumar
^ permalink raw reply
* Re: DTC/dts modifications
From: Kim Phillips @ 2006-05-01 20:07 UTC (permalink / raw)
To: Kumar Gala; +Cc: linuxppc-dev, jdl
In-Reply-To: <55FD11DB-54AF-4284-9E9A-C313F4232105@kernel.crashing.org>
On Mon, 1 May 2006 14:52:23 -0500
Kumar Gala <galak@kernel.crashing.org> wrote:
> [snip]
>
> >> Try running a current .dts through cpp today. You will get errors
> >> like:
> >>
> >> oftree.dts:15:3: error: invalid preprocessing directive #address
> >
> >> Because of props like:
> >>
> >> #cpus = <1>;
> >> #address-cells = <1>;
> >> #size-cells = <0>;
> >>
> >> If these used some other symbol instead of '#' cpp will be happy and
> >> we can use it to create macros for us.
> >
> > Yeah, we're not going to be able to change those; they
> > are "By The Book".
>
> By what book? It would seem to me that BNF for dtc is completely
> under our control and if we want to change it we can. I understand
> that there is some correspondence to Open Firmware, but it seems that
> if its people are ok with the dts format changing that's a lot easier
> than implementing tons of support in dtc for features that cpp gives us.
>
> [I'm also guessing no one's really got time to go and implement these
> features in dtc]
>
cpp -x assembler-with-cpp seems to not produce the above errors, and still honours preprocessing directives like #define. Don't know what else it messes with, and whether you want to add CPPFLAGS.
Kim
> > Instead, we'll have to make the lexical analysis conscious
> > of something like a <newline> context sensitive token or so.
> > Or throw some flag to cpp to not emit location markers.
>
> - kumar
^ permalink raw reply
* Re: DTC/dts modifications
From: Kumar Gala @ 2006-05-01 19:52 UTC (permalink / raw)
To: Jon Loeliger; +Cc: Jon Loeliger, linuxppc-dev@ozlabs.org list
In-Reply-To: <1146512732.24239.34.camel@cashmere.sps.mot.com>
[snip]
>> Try running a current .dts through cpp today. You will get errors
>> like:
>>
>> oftree.dts:15:3: error: invalid preprocessing directive #address
>
>> Because of props like:
>>
>> #cpus = <1>;
>> #address-cells = <1>;
>> #size-cells = <0>;
>>
>> If these used some other symbol instead of '#' cpp will be happy and
>> we can use it to create macros for us.
>
> Yeah, we're not going to be able to change those; they
> are "By The Book".
By what book? It would seem to me that BNF for dtc is completely
under our control and if we want to change it we can. I understand
that there is some correspondence to Open Firmware, but it seems that
if its people are ok with the dts format changing that's a lot easier
than implementing tons of support in dtc for features that cpp gives us.
[I'm also guessing no one's really got time to go and implement these
features in dtc]
> Instead, we'll have to make the lexical analysis conscious
> of something like a <newline> context sensitive token or so.
> Or throw some flag to cpp to not emit location markers.
- kumar
^ permalink raw reply
* Re: DTC/dts modifications
From: Jon Loeliger @ 2006-05-01 19:45 UTC (permalink / raw)
To: Kumar Gala; +Cc: Jon Loeliger, linuxppc-dev@ozlabs.org list
In-Reply-To: <695BB790-1E64-4B53-91DD-7DD88305F201@kernel.crashing.org>
On Mon, 2006-05-01 at 14:39, Kumar Gala wrote:
>
> Comment aren't the issue.
Ah, ok.
> > I think to get CPP to be usable, it will need to handle
> > the # emitted line-location markers, "# <line> <file> <level>".
>
> Don't follow you here.
The pre-processor emits crap like this:
# 1 "cmd_load.c"
# 1 "/proj/ppc/sysperf/sw/u/jdl/86xx/u-boot-86xx/common//"
# 1 "<built-in>"
# 1 "<command line>"
# 1 "cmd_load.c"
# 27 "cmd_load.c"
# 1 "/proj/ppc/sysperf/sw/u/jdl/86xx/u-boot-86xx/include/common.h" 1
# 30 "/proj/ppc/sysperf/sw/u/jdl/86xx/u-boot-86xx/include/common.h"
typedef unsigned char uchar;
typedef volatile unsigned long vu_long;
typedef volatile unsigned short vu_short;
typedef volatile unsigned char vu_char;
> Try running a current .dts through cpp today. You will get errors like:
>
> oftree.dts:15:3: error: invalid preprocessing directive #address
> Because of props like:
>
> #cpus = <1>;
> #address-cells = <1>;
> #size-cells = <0>;
>
> If these used some other symbol instead of '#' cpp will be happy and
> we can use it to create macros for us.
Yeah, we're not going to be able to change those; they
are "By The Book".
Instead, we'll have to make the lexical analysis conscious
of something like a <newline> context sensitive token or so.
Or throw some flag to cpp to not emit location markers.
Or something.
jdl
^ permalink raw reply
* Re: DTC/dts modifications
From: Kumar Gala @ 2006-05-01 19:39 UTC (permalink / raw)
To: Jon Loeliger; +Cc: Jon Loeliger, linuxppc-dev@ozlabs.org list
In-Reply-To: <1146512012.24239.28.camel@cashmere.sps.mot.com>
On May 1, 2006, at 2:33 PM, Jon Loeliger wrote:
> On Sat, 2006-04-29 at 11:00, Kumar Gala wrote:
>> All,
>>
>> What evilness would it be to change the use of '#' in the .dts format
>> to some other character like '$' or '%'.
>
> Uh, use of '#' for what? Current comment style is
> either C or C++, ie, /* ... */ or //.
Comments aren't the issue.
>> The problem is the use of
>> '#' prevents use from using cpp which would make some aspects of
>> building up .dts for boards far more useful.
>
> I think to get CPP to be usable, it will need to handle
> the # emitted line-location markers, "# <line> <file> <level>".
Don't follow you here.
>> We can easily provide a one line script to convert people's .dts to
>> the new format.
>
> I don't think there is a conversion necessary yet.
> Did I miss something here?
Try running a current .dts through cpp today. You will get errors like:
oftree.dts:15:3: error: invalid preprocessing directive #address
oftree.dts:16:3: error: invalid preprocessing directive #size
oftree.dts:20:4: error: invalid preprocessing directive #cpus
oftree.dts:21:4: error: invalid preprocessing directive #address
oftree.dts:22:4: error: invalid preprocessing directive #size
oftree.dts:25:2: error: invalid preprocessing directive #foobar
Because of props like:
#cpus = <1>;
#address-cells = <1>;
#size-cells = <0>;
If these used some other symbol instead of '#' cpp will be happy and
we can use it to create macros for us.
- k
^ permalink raw reply
* Re: DTC/dts modifications
From: Jon Loeliger @ 2006-05-01 19:33 UTC (permalink / raw)
To: Kumar Gala; +Cc: Jon Loeliger, linuxppc-dev@ozlabs.org list
In-Reply-To: <5CA113BC-1614-4551-87E5-6926E14C2225@kernel.crashing.org>
On Sat, 2006-04-29 at 11:00, Kumar Gala wrote:
> All,
>
> What evilness would it be to change the use of '#' in the .dts format
> to some other character like '$' or '%'.
Uh, use of '#' for what? Current comment style is
either C or C++, ie, /* ... */ or //.
> The problem is the use of
> '#' prevents use from using cpp which would make some aspects of
> building up .dts for boards far more useful.
I think to get CPP to be usable, it will need to handle
the # emitted line-location markers, "# <line> <file> <level>".
> We can easily provide a one line script to convert people's .dts to
> the new format.
I don't think there is a conversion necessary yet.
Did I miss something here?
Thanks,
jdl
^ permalink raw reply
* Re: 2.6.17-rc2-mm1
From: Andi Kleen @ 2006-05-01 18:34 UTC (permalink / raw)
To: Martin J. Bligh; +Cc: Andrew Morton, linuxppc64-dev, linux-kernel
In-Reply-To: <44561A1E.7000103@google.com>
On Monday 01 May 2006 16:24, Martin J. Bligh wrote:
> double fault: 0000 [1] SMP
> last sysfs file: /devices/pci0000:00/0000:00:06.0/resource
> CPU 0
> Modules linked in:
> Pid: 20519, comm: mtest01 Not tainted 2.6.17-rc3-mm1-autokern1 #1
> RIP: 0010:[<ffffffff8047c8b8>] <ffffffff8047c8b8>{__sched_text_start+1856}
> RSP: 0000:0000000000000000 EFLAGS: 00010082
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff805d9438
> RDX: ffff8100db12c0d0 RSI: ffffffff805d9438 RDI: ffff8100db12c0d0
> RBP: ffffffff805d9438 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
> R13: ffff8100e39bd440 R14: ffff810008003620 R15: 000002b02751726c
> FS: 0000000000000000(0000) GS:ffffffff805fa000(0063) knlGS:00000000f7dd0460
> CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b
> CR2: fffffffffffffff8 CR3: 00000000da399000 CR4: 00000000000006e0
> Process mtest01 (pid: 20519, threadinfo ffff8100b1bb4000, task
> ffff8100db12c0d0)
> Stack: ffffffff80579e20 ffff8100db12c0d0 0000000000000001 ffffffff80579f58
> 0000000000000000 ffffffff80579e78 ffffffff8020b0b2 ffffffff80579f58
> 0000000000000000 ffffffff80485520
> Call Trace: <#DF> <ffffffff8020b0b2>{show_registers+140}
> <ffffffff8020b357>{__die+159} <ffffffff8020b3cc>{die+50}
> <ffffffff8020bba6>{do_double_fault+115}
> <ffffffff8020aa91>{double_fault+125}
> <ffffffff8047c8b8>{__sched_text_start+1856} <EOE>
That's really strange - I wonder why the backtracer can't find the original
stack. Should probably add some printk diagnosis here.
Can you send the output with this patch?
Index: linux/arch/x86_64/kernel/traps.c
===================================================================
--- linux.orig/arch/x86_64/kernel/traps.c
+++ linux/arch/x86_64/kernel/traps.c
@@ -238,6 +238,7 @@ void show_trace(unsigned long *stack)
HANDLE_STACK (stack < estack_end);
i += printk(" <EOE>");
stack = (unsigned long *) estack_end[-2];
+ printk("new stack %lx (%lx %lx %lx %lx %lx)\n", stack, estack_end[0], estack_end[-1], estack_end[-2], estack_end[-3], estack_end[-4]);
continue;
}
if (irqstack_end) {
^ permalink raw reply
* Re: 2.6.17-rc2-mm1
From: Andy Whitcroft @ 2006-05-01 18:32 UTC (permalink / raw)
To: Martin Bligh; +Cc: Andrew Morton, linuxppc64-dev, Badari Pulavarty, lkml, ak
In-Reply-To: <44564BEC.1040803@google.com>
Martin Bligh wrote:
> Badari Pulavarty wrote:
>
>> On Mon, 2006-05-01 at 10:26 -0700, Martin Bligh wrote:
>>
>>>> I ran mtest01 multiple times with various options on my 4-way AMD64
>>>> box.
>>>> So far couldn't reproduce the problem (2.6.17-rc3-mm1).
>>>>
>>>> Are there any special config or test options you are testing with ?
>>>
>>>
>>> Config is here:
>>>
>>> http://ftp.kernel.org/pub/linux/kernel/people/mbligh/config/abat/amd64
>>>
>>> It's just doing "runalltests", I think.
>>
>>
>>
>> FWIW, I tried your config file on my 4-way AMD64 (melody) box and ran
>> latest "mtest01" fine.
>>
>> I am now trying runalltests. I guess, its time to bi-sect :(
>
>
> There was a panic on PPC64 during LTP too, but it seems to have gone
> away with rc3-mm1. Not sure if it was really fixed, or just intermittent.
>
> http://test.kernel.org/abat/29675/debug/console.log
I think it's more intermittent than gone. I've got another machine which
runs the same tests, and she threw a very similar failure on 2.6.18-rc3-mm1.
-apw
^ permalink raw reply
* Re: 2.6.17-rc2-mm1
From: Martin Bligh @ 2006-05-01 17:57 UTC (permalink / raw)
To: Badari Pulavarty; +Cc: Andrew Morton, linuxppc64-dev, ak, lkml
In-Reply-To: <1146506105.317.4.camel@dyn9047017100.beaverton.ibm.com>
Badari Pulavarty wrote:
> On Mon, 2006-05-01 at 10:26 -0700, Martin Bligh wrote:
>
>>>I ran mtest01 multiple times with various options on my 4-way AMD64 box.
>>>So far couldn't reproduce the problem (2.6.17-rc3-mm1).
>>>
>>>Are there any special config or test options you are testing with ?
>>
>>Config is here:
>>
>>http://ftp.kernel.org/pub/linux/kernel/people/mbligh/config/abat/amd64
>>
>>It's just doing "runalltests", I think.
>
>
> FWIW, I tried your config file on my 4-way AMD64 (melody) box
> and ran latest "mtest01" fine.
>
> I am now trying runalltests. I guess, its time to bi-sect :(
There was a panic on PPC64 during LTP too, but it seems to have gone
away with rc3-mm1. Not sure if it was really fixed, or just intermittent.
http://test.kernel.org/abat/29675/debug/console.log
M.
^ permalink raw reply
* Re: 2.6.17-rc2-mm1
From: Badari Pulavarty @ 2006-05-01 17:55 UTC (permalink / raw)
To: Martin Bligh; +Cc: Andrew Morton, linuxppc64-dev, ak, lkml
In-Reply-To: <445644B7.7060807@google.com>
On Mon, 2006-05-01 at 10:26 -0700, Martin Bligh wrote:
> > I ran mtest01 multiple times with various options on my 4-way AMD64 box.
> > So far couldn't reproduce the problem (2.6.17-rc3-mm1).
> >
> > Are there any special config or test options you are testing with ?
>
> Config is here:
>
> http://ftp.kernel.org/pub/linux/kernel/people/mbligh/config/abat/amd64
>
> It's just doing "runalltests", I think.
FWIW, I tried your config file on my 4-way AMD64 (melody) box
and ran latest "mtest01" fine.
I am now trying runalltests. I guess it's time to bisect :(
Thanks,
Badari
^ permalink raw reply
* Re: [PATCH] powerpc: Export flat device tree via debugfs for debugging
From: Kumar Gala @ 2006-05-01 17:54 UTC (permalink / raw)
To: Michael Ellerman; +Cc: linuxppc-dev, Paul Mackerras
In-Reply-To: <20060501074044.D552967B55@ozlabs.org>
On May 1, 2006, at 2:40 AM, Michael Ellerman wrote:
> If DEBUG is turned on in prom.c, export the flat device tree via
> debugfs.
> This has been handy on several occasions.
>
> To look at it:
> # mount -t debugfs none /sys/kernel/debug
> # od -a /sys/kernel/debug/powerpc/flat-device-tree
> and/or
> # dtc -fI dtb /sys/kernel/debug/powerpc/flat-device-tree -O dts
>
> Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
> ---
>
> arch/powerpc/kernel/prom.c | 25 +++++++++++++++++++++++++
> 1 file changed, 25 insertions(+)
>
> Index: to-merge/arch/powerpc/kernel/prom.c
> ===================================================================
> --- to-merge.orig/arch/powerpc/kernel/prom.c
> +++ to-merge/arch/powerpc/kernel/prom.c
> @@ -30,6 +30,7 @@
> #include <linux/bitops.h>
> #include <linux/module.h>
> #include <linux/kexec.h>
> +#include <linux/debugfs.h>
>
> #include <asm/prom.h>
> #include <asm/rtas.h>
> @@ -2009,3 +2010,27 @@ void kdump_move_device_tree(void)
> /* XXX should we unreserve the old DT? */
> }
> #endif /* CONFIG_KEXEC */
> +
> +#ifdef DEBUG
Shouldn't this also depend on DEBUGFS being built in?
> +static struct debugfs_blob_wrapper flat_dt_blob;
> +
> +static int __init export_flat_device_tree(void)
> +{
> + struct dentry *d;
> +
> + d = debugfs_create_dir("powerpc", NULL);
> + if (!d)
> + return 1;
> +
> + flat_dt_blob.data = initial_boot_params;
> + flat_dt_blob.size = initial_boot_params->totalsize;
> +
> + d = debugfs_create_blob("flat-device-tree", S_IFREG | S_IRUSR,
> + d, &flat_dt_blob);
> + if (!d)
> + return 1;
> +
> + return 0;
> +}
> +__initcall(export_flat_device_tree);
> +#endif
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@ozlabs.org
> https://ozlabs.org/mailman/listinfo/linuxppc-dev
^ permalink raw reply
* Large Page Support, 2.6 kernel , PPC440
From: moris dong @ 2006-05-01 17:35 UTC (permalink / raw)
To: linuxppc-embedded
Friends,
My PPC440 (32bit) MMU supports multiple page sizes.
For the default 4K pages, my 2.6.11 kernel compiles and boots just fine.
I want to re-build it with large pages, to improve my application
performance.
I tried modifying PAGE_SHIFT in "page.h" to 13 (8K pages) and re-build my
kernel.
Compilation worked out fine, but my kernel does NOT boot, nor does it print
anything to the console.
Has anyone successfully compiled & booted a 2.6 kernel with pages larger
than 4K ?
What am I doing wrong ?
Thanks a lot.
_________________________________________________________________
Express yourself instantly with MSN Messenger! Download today it's FREE!
http://messenger.msn.click-url.com/go/onm00200471ave/direct/01/
^ permalink raw reply
* Large Page Support, 2.6 kernel , PPC440
From: moris dong @ 2006-05-01 17:34 UTC (permalink / raw)
To: linuxppc-dev
Friends,
My PPC440 (32bit) MMU supports multiple page sizes.
For the default 4K pages, my 2.6.11 kernel compiles and boots just fine.
I want to re-build it with large pages, to improve my application
performance.
I tried modifying PAGE_SHIFT in "page.h" to 13 (8K pages) and re-build my
kernel.
Compilation worked out fine, but my kernel does NOT boot, nor does it print
anything to the console.
Has anyone successfully compiled & booted a 2.6 kernel with pages larger
than 4K ?
What am I doing wrong ?
Thanks a lot.
_________________________________________________________________
Express yourself instantly with MSN Messenger! Download today it's FREE!
http://messenger.msn.click-url.com/go/onm00200471ave/direct/01/
^ permalink raw reply
* Re: 2.6.17-rc2-mm1
From: Martin Bligh @ 2006-05-01 17:26 UTC (permalink / raw)
To: Badari Pulavarty; +Cc: Andrew Morton, linuxppc64-dev, ak, lkml
In-Reply-To: <1146503960.317.1.camel@dyn9047017100.beaverton.ibm.com>
> I ran mtest01 multiple times with various options on my 4-way AMD64 box.
> So far couldn't reproduce the problem (2.6.17-rc3-mm1).
>
> Are there any special config or test options you are testing with ?
Config is here:
http://ftp.kernel.org/pub/linux/kernel/people/mbligh/config/abat/amd64
It's just doing "runalltests", I think.
M.
^ permalink raw reply
* Re: 2.6.17-rc2-mm1
From: Badari Pulavarty @ 2006-05-01 17:19 UTC (permalink / raw)
To: Andrew Morton; +Cc: linuxppc64-dev, ak, lkml, Martin J. Bligh
In-Reply-To: <20060501100731.051f4eff.akpm@osdl.org>
On Mon, 2006-05-01 at 10:07 -0700, Andrew Morton wrote:
> "Martin J. Bligh" <mbligh@google.com> wrote:
> >
> > Andrew Morton wrote:
> > > (I did s/linux-kernel@google.com/linux-kernel@vger.kernel.org/)
> > >
> > > Martin Bligh <mbligh@google.com> wrote:
> > >
> > >>Still crashes in LTP on x86_64:
> > >>(introduced in previous release)
> > >>
> > >>http://test.kernel.org/abat/29674/debug/console.log
> > >
> > >
> > > What a mess. A doublefault inside an NMI watchdog timeout. I think. It's
> > > hard to see. Some CPUs are stuck on a CPU scheduler lock, others seem to
> > > be stuck in flush_tlb_others. One of these could be a consequence of the
> > > other, or both could be a consequence of something else.
> >
> > OK, well the latest one seems cleaner, on -rc3-mm1.
> > http://test.kernel.org/abat/30007/debug/console.log
> >
> > Just has the double fault, with no NMI watchdog timeouts. Not that
> > it means any more to me, but still ;-) mtest01 seems to be able to
> > reproduce this every time, but I don't have an appropriate box here
> > to diagnose it with (this was a 4x Opteron inside IBM), and it's
> > definitely something in -mm that's not in mainline.
> >
> > M.
> >
> > double fault: 0000 [1] SMP
> > last sysfs file: /devices/pci0000:00/0000:00:06.0/resource
> > CPU 0
> > Modules linked in:
> > Pid: 20519, comm: mtest01 Not tainted 2.6.17-rc3-mm1-autokern1 #1
> > RIP: 0010:[<ffffffff8047c8b8>] <ffffffff8047c8b8>{__sched_text_start+1856}
> > RSP: 0000:0000000000000000 EFLAGS: 00010082
> > RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff805d9438
> > RDX: ffff8100db12c0d0 RSI: ffffffff805d9438 RDI: ffff8100db12c0d0
> > RBP: ffffffff805d9438 R08: 0000000000000000 R09: 0000000000000000
> > R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
> > R13: ffff8100e39bd440 R14: ffff810008003620 R15: 000002b02751726c
> > FS: 0000000000000000(0000) GS:ffffffff805fa000(0063) knlGS:00000000f7dd0460
> > CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b
> > CR2: fffffffffffffff8 CR3: 00000000da399000 CR4: 00000000000006e0
> > Process mtest01 (pid: 20519, threadinfo ffff8100b1bb4000, task
> > ffff8100db12c0d0)
> > Stack: ffffffff80579e20 ffff8100db12c0d0 0000000000000001 ffffffff80579f58
> > 0000000000000000 ffffffff80579e78 ffffffff8020b0b2 ffffffff80579f58
> > 0000000000000000 ffffffff80485520
> > Call Trace: <#DF> <ffffffff8020b0b2>{show_registers+140}
> > <ffffffff8020b357>{__die+159} <ffffffff8020b3cc>{die+50}
> > <ffffffff8020bba6>{do_double_fault+115}
> > <ffffffff8020aa91>{double_fault+125}
> > <ffffffff8047c8b8>{__sched_text_start+1856} <EOE>
> >
> > Code: e8 4c ba d8 ff 65 48 8b 34 25 00 00 00 00 4c 8b 46 08 f0 41
> > RIP <ffffffff8047c8b8>{__sched_text_start+1856} RSP <0000000000000000>
> > -- 0:conmux-control -- time-stamp -- May/01/06 3:54:37 --
>
> I was not able to reproduce this on the 4-way EMT64 machine. Am a bit stuck.
I ran mtest01 multiple times with various options on my 4-way AMD64 box.
So far couldn't reproduce the problem (2.6.17-rc3-mm1).
Are there any special config or test options you are testing with ?
Thanks,
Badari
^ permalink raw reply
* Re: 2.6.17-rc2-mm1
From: Martin Bligh @ 2006-05-01 17:14 UTC (permalink / raw)
To: Andrew Morton; +Cc: linuxppc64-dev, ak, linux-kernel
In-Reply-To: <20060501100731.051f4eff.akpm@osdl.org>
>>double fault: 0000 [1] SMP
>>last sysfs file: /devices/pci0000:00/0000:00:06.0/resource
>>CPU 0
>>Modules linked in:
>>Pid: 20519, comm: mtest01 Not tainted 2.6.17-rc3-mm1-autokern1 #1
>>RIP: 0010:[<ffffffff8047c8b8>] <ffffffff8047c8b8>{__sched_text_start+1856}
>>RSP: 0000:0000000000000000 EFLAGS: 00010082
>>RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff805d9438
>>RDX: ffff8100db12c0d0 RSI: ffffffff805d9438 RDI: ffff8100db12c0d0
>>RBP: ffffffff805d9438 R08: 0000000000000000 R09: 0000000000000000
>>R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
>>R13: ffff8100e39bd440 R14: ffff810008003620 R15: 000002b02751726c
>>FS: 0000000000000000(0000) GS:ffffffff805fa000(0063) knlGS:00000000f7dd0460
>>CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b
>>CR2: fffffffffffffff8 CR3: 00000000da399000 CR4: 00000000000006e0
>>Process mtest01 (pid: 20519, threadinfo ffff8100b1bb4000, task
>>ffff8100db12c0d0)
>>Stack: ffffffff80579e20 ffff8100db12c0d0 0000000000000001 ffffffff80579f58
>> 0000000000000000 ffffffff80579e78 ffffffff8020b0b2 ffffffff80579f58
>> 0000000000000000 ffffffff80485520
>>Call Trace: <#DF> <ffffffff8020b0b2>{show_registers+140}
>> <ffffffff8020b357>{__die+159} <ffffffff8020b3cc>{die+50}
>> <ffffffff8020bba6>{do_double_fault+115}
>><ffffffff8020aa91>{double_fault+125}
>> <ffffffff8047c8b8>{__sched_text_start+1856} <EOE>
>>
>>Code: e8 4c ba d8 ff 65 48 8b 34 25 00 00 00 00 4c 8b 46 08 f0 41
>>RIP <ffffffff8047c8b8>{__sched_text_start+1856} RSP <0000000000000000>
>> -- 0:conmux-control -- time-stamp -- May/01/06 3:54:37 --
>
>
> I was not able to reproduce this on the 4-way EMT64 machine. Am a bit stuck.
OK, is there anything we could run this with that'd dump more info?
(eg debug patches or something). There's bugger all of use that I
can see in that stack (and why does __sched_text_start come up anyway,
is that an x86_64-ism ?). I suppose if we're really desperate, we can
play chop search, but that's very boring to try to do remotely ...
It's a couple-of-year-old 4x newisys box.
M.
^ permalink raw reply
* Re: [openib-general] Re: [PATCH 00/16] ehca: IBM eHCA InfiniBand Device Driver
From: Roland Dreier @ 2006-05-01 17:03 UTC (permalink / raw)
To: Heiko Joerg Schick; +Cc: linuxppc-dev, linux-kernel, openib-general
In-Reply-To: <e2r7a0$fo2$1@sea.gmane.org>
Heiko> I don't like the idea to put the whole driver in one patch
Heiko> file. I would propose to put the patch "ehca: integration
Heiko> in Linux kernel" last instead of first, as Arnd
Heiko> mentioned. With that change we leave the kernel in a
Heiko> working state when applying the patches.
Yes, that makes sense.
And I can fold the patches into a single git changeset when we finally
merge it, since I don't see any advantage to having the driver split
into pieces. (No one is going to git bisect a half-applied driver or
anything like that)
- R.
^ permalink raw reply
* Re: 2.6.17-rc2-mm1
From: Andrew Morton @ 2006-05-01 17:07 UTC (permalink / raw)
To: Martin J. Bligh; +Cc: linuxppc64-dev, ak, linux-kernel
In-Reply-To: <44561A1E.7000103@google.com>
"Martin J. Bligh" <mbligh@google.com> wrote:
>
> Andrew Morton wrote:
> > (I did s/linux-kernel@google.com/linux-kernel@vger.kernel.org/)
> >
> > Martin Bligh <mbligh@google.com> wrote:
> >
> >>Still crashes in LTP on x86_64:
> >>(introduced in previous release)
> >>
> >>http://test.kernel.org/abat/29674/debug/console.log
> >
> >
> > What a mess. A doublefault inside an NMI watchdog timeout. I think. It's
> > hard to see. Some CPUs are stuck on a CPU scheduler lock, others seem to
> > be stuck in flush_tlb_others. One of these could be a consequence of the
> > other, or both could be a consequence of something else.
>
> OK, well the latest one seems cleaner, on -rc3-mm1.
> http://test.kernel.org/abat/30007/debug/console.log
>
> Just has the double fault, with no NMI watchdog timeouts. Not that
> it means any more to me, but still ;-) mtest01 seems to be able to
> reproduce this every time, but I don't have an appropriate box here
> to diagnose it with (this was a 4x Opteron inside IBM), and it's
> definitely something in -mm that's not in mainline.
>
> M.
>
> double fault: 0000 [1] SMP
> last sysfs file: /devices/pci0000:00/0000:00:06.0/resource
> CPU 0
> Modules linked in:
> Pid: 20519, comm: mtest01 Not tainted 2.6.17-rc3-mm1-autokern1 #1
> RIP: 0010:[<ffffffff8047c8b8>] <ffffffff8047c8b8>{__sched_text_start+1856}
> RSP: 0000:0000000000000000 EFLAGS: 00010082
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff805d9438
> RDX: ffff8100db12c0d0 RSI: ffffffff805d9438 RDI: ffff8100db12c0d0
> RBP: ffffffff805d9438 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
> R13: ffff8100e39bd440 R14: ffff810008003620 R15: 000002b02751726c
> FS: 0000000000000000(0000) GS:ffffffff805fa000(0063) knlGS:00000000f7dd0460
> CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b
> CR2: fffffffffffffff8 CR3: 00000000da399000 CR4: 00000000000006e0
> Process mtest01 (pid: 20519, threadinfo ffff8100b1bb4000, task
> ffff8100db12c0d0)
> Stack: ffffffff80579e20 ffff8100db12c0d0 0000000000000001 ffffffff80579f58
> 0000000000000000 ffffffff80579e78 ffffffff8020b0b2 ffffffff80579f58
> 0000000000000000 ffffffff80485520
> Call Trace: <#DF> <ffffffff8020b0b2>{show_registers+140}
> <ffffffff8020b357>{__die+159} <ffffffff8020b3cc>{die+50}
> <ffffffff8020bba6>{do_double_fault+115}
> <ffffffff8020aa91>{double_fault+125}
> <ffffffff8047c8b8>{__sched_text_start+1856} <EOE>
>
> Code: e8 4c ba d8 ff 65 48 8b 34 25 00 00 00 00 4c 8b 46 08 f0 41
> RIP <ffffffff8047c8b8>{__sched_text_start+1856} RSP <0000000000000000>
> -- 0:conmux-control -- time-stamp -- May/01/06 3:54:37 --
I was not able to reproduce this on the 4-way EMT64 machine. Am a bit stuck.
^ permalink raw reply
* Re: PPC 405GPr support in linux 2.4.32
From: Stephen Williams @ 2006-05-01 15:21 UTC (permalink / raw)
To: linuxppc-embedded; +Cc: Willy Tarreau
In-Reply-To: <20060430164013.GA4631@dmt>
Marcelo Tosatti wrote:
> Folks,
>
> The v2.4 patch acceptance policy has been shifting gradually from
> "accept new features" to "critical fixes only", and at this point in
> time the goal is to have a minimal amount of modifications as possible.
>
> There should be no need for major patch reworking with reference to new
> v2.4 releases.
>
> Willy Tarreau created a repository of useful v2.4 patches for this sort
> of situations. Stephen, Eugene, I think the 405GPr patches are good candidates.
>
> http://w.ods.org/linux/kernel/2.4/lkup/hardware.html
This (and things like it) needs to be *much* better advertised.
I had no idea it existed, or where I would look for such a thing.
It does no one any good if no one thinks to look for it;-)
Of course the next major question is how does one submit patches
to this system?
--
Steve Williams "The woods are lovely, dark and deep.
steve at icarus.com But I have promises to keep,
http://www.icarus.com and lines to code before I sleep,
http://www.picturel.com And lines to code before I sleep."
^ permalink raw reply
* Re: 2.6.17-rc2-mm1
From: Martin J. Bligh @ 2006-05-01 14:24 UTC (permalink / raw)
To: Andrew Morton; +Cc: linuxppc64-dev, Andi Kleen, linux-kernel
In-Reply-To: <20060428012022.7b73c77b.akpm@osdl.org>
Andrew Morton wrote:
> (I did s/linux-kernel@google.com/linux-kernel@vger.kernel.org/)
>
> Martin Bligh <mbligh@google.com> wrote:
>
>>Still crashes in LTP on x86_64:
>>(introduced in previous release)
>>
>>http://test.kernel.org/abat/29674/debug/console.log
>
>
> What a mess. A doublefault inside an NMI watchdog timeout. I think. It's
> hard to see. Some CPUs are stuck on a CPU scheduler lock, others seem to
> be stuck in flush_tlb_others. One of these could be a consequence of the
> other, or both could be a consequence of something else.
OK, well the latest one seems cleaner, on -rc3-mm1.
http://test.kernel.org/abat/30007/debug/console.log
Just has the double fault, with no NMI watchdog timeouts. Not that
it means any more to me, but still ;-) mtest01 seems to be able to
reproduce this every time, but I don't have an appropriate box here
to diagnose it with (this was a 4x Opteron inside IBM), and it's
definitely something in -mm that's not in mainline.
M.
double fault: 0000 [1] SMP
last sysfs file: /devices/pci0000:00/0000:00:06.0/resource
CPU 0
Modules linked in:
Pid: 20519, comm: mtest01 Not tainted 2.6.17-rc3-mm1-autokern1 #1
RIP: 0010:[<ffffffff8047c8b8>] <ffffffff8047c8b8>{__sched_text_start+1856}
RSP: 0000:0000000000000000 EFLAGS: 00010082
RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff805d9438
RDX: ffff8100db12c0d0 RSI: ffffffff805d9438 RDI: ffff8100db12c0d0
RBP: ffffffff805d9438 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
R13: ffff8100e39bd440 R14: ffff810008003620 R15: 000002b02751726c
FS: 0000000000000000(0000) GS:ffffffff805fa000(0063) knlGS:00000000f7dd0460
CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b
CR2: fffffffffffffff8 CR3: 00000000da399000 CR4: 00000000000006e0
Process mtest01 (pid: 20519, threadinfo ffff8100b1bb4000, task
ffff8100db12c0d0)
Stack: ffffffff80579e20 ffff8100db12c0d0 0000000000000001 ffffffff80579f58
0000000000000000 ffffffff80579e78 ffffffff8020b0b2 ffffffff80579f58
0000000000000000 ffffffff80485520
Call Trace: <#DF> <ffffffff8020b0b2>{show_registers+140}
<ffffffff8020b357>{__die+159} <ffffffff8020b3cc>{die+50}
<ffffffff8020bba6>{do_double_fault+115}
<ffffffff8020aa91>{double_fault+125}
<ffffffff8047c8b8>{__sched_text_start+1856} <EOE>
Code: e8 4c ba d8 ff 65 48 8b 34 25 00 00 00 00 4c 8b 46 08 f0 41
RIP <ffffffff8047c8b8>{__sched_text_start+1856} RSP <0000000000000000>
-- 0:conmux-control -- time-stamp -- May/01/06 3:54:37 --
^ permalink raw reply
* [PATCH 7/7] Print out debugging information during initialisation
From: Mel Gorman @ 2006-05-01 13:37 UTC (permalink / raw)
To: akpm, davej, tony.luck, linux-mm, linux-kernel, bob.picco, ak,
linuxppc-dev
Cc: Mel Gorman
In-Reply-To: <20060501133530.6379.66000.sendpatchset@skynet>
The zone and hole sizing code is new and unexpected problems showed up
during early releases on machines that were not covered by the pre-release
tests. This patch prints out useful information when those unexpected
situations occur.
It is not expected that this patch become a permanent part of the set.
mem_init.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 files changed, 58 insertions(+), 4 deletions(-)
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/mem_init.c linux-2.6.17-rc3-mm1-107-debug/mm/mem_init.c
--- linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/mem_init.c 2006-05-01 11:51:50.000000000 +0100
+++ linux-2.6.17-rc3-mm1-107-debug/mm/mem_init.c 2006-05-01 11:51:50.000000000 +0100
@@ -341,6 +341,9 @@ void __meminit memmap_init_zone(unsigned
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
+ printk("memmap_init_zone(size %lu, nid %d, zone %lu, start_pfn %lu)\n",
+ size, nid, zone, start_pfn);
+
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
if (!early_pfn_valid(pfn))
continue;
@@ -661,6 +664,7 @@ __meminit int init_currently_empty_zone(
}
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+
/* Note: nid == MAX_NUMNODES returns first region */
static int __init first_active_region_index_in_nid(int nid)
{
@@ -713,13 +717,24 @@ void __init free_bootmem_with_active_reg
for_each_active_range_index_in_nid(i, nid) {
unsigned long size_pages = 0;
unsigned long end_pfn = early_node_map[i].end_pfn;
- if (early_node_map[i].start_pfn >= max_low_pfn)
+ if (early_node_map[i].start_pfn >= max_low_pfn) {
+ printk("start_pfn %lu >= %lu\n", early_node_map[i].start_pfn,
+ max_low_pfn);
continue;
+ }
- if (end_pfn > max_low_pfn)
+ if (end_pfn > max_low_pfn) {
+ printk("end_pfn %lu going back to %lu\n", early_node_map[i].end_pfn,
+ max_low_pfn);
end_pfn = max_low_pfn;
+ }
size_pages = end_pfn - early_node_map[i].start_pfn;
+ printk("free_bootmem_node(%d, %lu, %lu) :::: pfn ranges (%d, %lu, %lu)\n",
+ early_node_map[i].nid,
+ PFN_PHYS(early_node_map[i].start_pfn),
+ PFN_PHYS(size_pages),
+ early_node_map[i].nid, early_node_map[i].start_pfn, end_pfn);
free_bootmem_node(NODE_DATA(early_node_map[i].nid),
PFN_PHYS(early_node_map[i].start_pfn),
size_pages << PAGE_SHIFT);
@@ -729,10 +744,15 @@ void __init free_bootmem_with_active_reg
void __init sparse_memory_present_with_active_regions(int nid)
{
unsigned int i;
- for_each_active_range_index_in_nid(i, nid)
+ for_each_active_range_index_in_nid(i, nid) {
+ printk("memory_present(%d, %lu, %lu)\n",
+ early_node_map[i].nid,
+ early_node_map[i].start_pfn,
+ early_node_map[i].end_pfn);
memory_present(early_node_map[i].nid,
early_node_map[i].start_pfn,
early_node_map[i].end_pfn);
+ }
}
void __init get_pfn_range_for_nid(unsigned int nid,
@@ -785,6 +805,8 @@ unsigned long __init __absent_pages_in_r
unsigned long prev_end_pfn = 0, hole_pages = 0;
unsigned long start_pfn;
+ printk("__absent_pages_in_range(%d, %lu, %lu) = ", nid,
+ range_start_pfn, range_end_pfn);
/* Find the end_pfn of the first active range of pfns in the node */
i = first_active_region_index_in_nid(nid);
if (i == MAX_ACTIVE_REGIONS)
@@ -811,6 +833,8 @@ unsigned long __init __absent_pages_in_r
prev_end_pfn = early_node_map[i].end_pfn;
}
+ printk("%lu\n", hole_pages);
+
return hole_pages;
}
@@ -975,6 +999,9 @@ void __init add_active_range(unsigned in
{
unsigned int i;
+ printk("add_active_range(%d, %lu, %lu): ",
+ nid, start_pfn, end_pfn);
+
/* Merge with existing active regions if possible */
for (i = 0; early_node_map[i].end_pfn; i++) {
if (early_node_map[i].nid != nid)
@@ -982,12 +1009,15 @@ void __init add_active_range(unsigned in
/* Skip if an existing region covers this new one */
if (start_pfn >= early_node_map[i].start_pfn &&
- end_pfn <= early_node_map[i].end_pfn)
+ end_pfn <= early_node_map[i].end_pfn) {
+ printk("Existing\n");
return;
+ }
/* Merge forward if suitable */
if (start_pfn <= early_node_map[i].end_pfn &&
end_pfn > early_node_map[i].end_pfn) {
+ printk("Merging forward\n");
early_node_map[i].end_pfn = end_pfn;
return;
}
@@ -995,6 +1025,7 @@ void __init add_active_range(unsigned in
/* Merge backward if suitable */
if (start_pfn < early_node_map[i].end_pfn &&
end_pfn >= early_node_map[i].start_pfn) {
+ printk("Merging backwards\n");
early_node_map[i].start_pfn = start_pfn;
return;
}
@@ -1006,6 +1037,7 @@ void __init add_active_range(unsigned in
return;
}
+ printk("New\n");
early_node_map[i].nid = nid;
early_node_map[i].start_pfn = start_pfn;
early_node_map[i].end_pfn = end_pfn;
@@ -1017,16 +1049,22 @@ void __init shrink_active_range(unsigned
unsigned int i;
/* Find the old active region end and shrink */
+ printk("Shrinking %u from %lu to %lu: ",
+ nid, old_end_pfn, new_end_pfn);
for_each_active_range_index_in_nid(i, nid) {
if (early_node_map[i].end_pfn == old_end_pfn) {
+ printk("Done\n");
early_node_map[i].end_pfn = new_end_pfn;
break;
}
}
+
+ printk("Not found\n");
}
void __init remove_all_active_ranges()
{
+ printk("remove_all_active_ranges()\n");
memset(early_node_map, 0, sizeof(early_node_map));
}
@@ -1054,6 +1092,14 @@ static void __init sort_node_map(void)
sort(early_node_map, num, sizeof(struct node_active_region),
cmp_node_active_region, NULL);
+
+ printk("Dumping sorted node map\n");
+ for (num = 0; early_node_map[num].end_pfn; num++) {
+ printk("entry %lu: %d %lu -> %lu\n", num,
+ early_node_map[num].nid,
+ early_node_map[num].start_pfn,
+ early_node_map[num].end_pfn);
+ }
}
/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
@@ -1069,6 +1115,7 @@ unsigned long __init find_min_pfn_for_no
return 0;
}
+
unsigned long __init find_min_pfn_with_active_regions(void)
{
return find_min_pfn_for_node(MAX_NUMNODES);
@@ -1082,6 +1129,7 @@ unsigned long __init find_max_pfn_with_a
for (i = 0; early_node_map[i].end_pfn; i++)
max_pfn = max(max_pfn, early_node_map[i].end_pfn);
+ printk("find_max_pfn_with_active_regions() == %lu\n", max_pfn);
return max_pfn;
}
@@ -1093,6 +1141,10 @@ void __init free_area_init_nodes(unsigne
unsigned long nid;
int zone_index;
+ printk("free_area_init_nodes(%lu, %lu, %lu, %lu)\n",
+ arch_max_dma_pfn, arch_max_dma32_pfn,
+ arch_max_low_pfn, arch_max_high_pfn);
+
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
sizeof(arch_zone_lowest_possible_pfn));
@@ -1109,6 +1161,8 @@ void __init free_area_init_nodes(unsigne
arch_zone_highest_possible_pfn[zone_index-1];
}
+ printk("free_area_init_nodes(): find_min_pfn = %lu\n", find_min_pfn_with_active_regions());
+
/* Regions in the early_node_map can be in any order */
sort_node_map();
^ permalink raw reply
* [PATCH 6/7] Break out memory initialisation code from page_alloc.c to mem_init.c
From: Mel Gorman @ 2006-05-01 13:37 UTC (permalink / raw)
To: akpm, davej, tony.luck, linuxppc-dev, linux-kernel, bob.picco, ak,
linux-mm
Cc: Mel Gorman
In-Reply-To: <20060501133530.6379.66000.sendpatchset@skynet>
page_alloc.c contains a large amount of memory initialisation code. This patch
breaks out the initialisation code to a separate file to make page_alloc.c
a bit easier to read.
Makefile | 2
mem_init.c | 1121 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
page_alloc.c | 1104 -----------------------------------------------------
3 files changed, 1122 insertions(+), 1105 deletions(-)
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/mm/Makefile linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/Makefile
--- linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/mm/Makefile 2006-05-01 11:37:01.000000000 +0100
+++ linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/Makefile 2006-05-01 11:44:35.000000000 +0100
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o
vmalloc.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
- page_alloc.o page-writeback.o pdflush.o \
+ page_alloc.o mem_init.o page-writeback.o pdflush.o \
readahead.o swap.o truncate.o vmscan.o \
prio_tree.o util.o mmzone.o $(mmu-y)
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/mm/mem_init.c linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/mem_init.c
--- linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/mm/mem_init.c 2006-05-01 11:51:50.000000000 +0100
+++ linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/mem_init.c 2006-05-01 11:51:50.000000000 +0100
@@ -0,0 +1,1121 @@
+/*
+ * mm/mem_init.c
+ * Initialises the architecture independent view of memory. pgdats, zones, etc
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ * Copyright (C) 1995, Stephen Tweedie
+ * Copyright (C) July 1999, Gerhard Wichert, Siemens AG
+ * Copyright (C) 1999, Ingo Molnar, Red Hat
+ * Copyright (C) 1999, 2000, Kanoj Sarcar, SGI
+ * Copyright (C) Sept 2000, Martin J. Bligh
+ * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ * Copyright (C) Apr 2006, Mel Gorman, IBM
+ * (lots of bits taken from architecture-specific code)
+ */
+#include <linux/config.h>
+#include <linux/sort.h>
+#include <linux/pfn.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/cpuset.h>
+#include <linux/mempolicy.h>
+#include <linux/sysctl.h>
+#include <linux/swap.h>
+#include <linux/cpu.h>
+#include <linux/stop_machine.h>
+
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
+int percpu_pagelist_fraction;
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+ #ifdef CONFIG_MAX_ACTIVE_REGIONS
+ #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
+ #else
+ #define MAX_ACTIVE_REGIONS (MAX_NR_ZONES * MAX_NUMNODES + 1)
+ #endif
+
+ struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
+ unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+ unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
+/*
+ * Builds allocation fallback zone lists.
+ *
+ * Add all populated zones of a node to the zonelist.
+ */
+static int __meminit build_zonelists_node(pg_data_t *pgdat,
+ struct zonelist *zonelist, int nr_zones, int zone_type)
+{
+ struct zone *zone;
+
+ BUG_ON(zone_type > ZONE_HIGHMEM);
+
+ do {
+ zone = pgdat->node_zones + zone_type;
+ if (populated_zone(zone)) {
+#ifndef CONFIG_HIGHMEM
+ BUG_ON(zone_type > ZONE_NORMAL);
+#endif
+ zonelist->zones[nr_zones++] = zone;
+ check_highest_zone(zone_type);
+ }
+ zone_type--;
+
+ } while (zone_type >= 0);
+ return nr_zones;
+}
+
+static inline int highest_zone(int zone_bits)
+{
+ int res = ZONE_NORMAL;
+ if (zone_bits & (__force int)__GFP_HIGHMEM)
+ res = ZONE_HIGHMEM;
+ if (zone_bits & (__force int)__GFP_DMA32)
+ res = ZONE_DMA32;
+ if (zone_bits & (__force int)__GFP_DMA)
+ res = ZONE_DMA;
+ return res;
+}
+
+#ifdef CONFIG_NUMA
+#define MAX_NODE_LOAD (num_online_nodes())
+static int __meminitdata node_load[MAX_NUMNODES];
+/**
+ * find_next_best_node - find the next node that should appear in a given node's fallback list
+ * @node: node whose fallback list we're appending
+ * @used_node_mask: nodemask_t of already used nodes
+ *
+ * We use a number of factors to determine which is the next node that should
+ * appear on a given node's fallback list. The node should not have appeared
+ * already in @node's fallback list, and it should be the next closest node
+ * according to the distance array (which contains arbitrary distance values
+ * from each node to each node in the system), and should also prefer nodes
+ * with no CPUs, since presumably they'll have very little allocation pressure
+ * on them otherwise.
+ * It returns -1 if no node is found.
+ */
+static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+ int n, val;
+ int min_val = INT_MAX;
+ int best_node = -1;
+
+ /* Use the local node if we haven't already */
+ if (!node_isset(node, *used_node_mask)) {
+ node_set(node, *used_node_mask);
+ return node;
+ }
+
+ for_each_online_node(n) {
+ cpumask_t tmp;
+
+ /* Don't want a node to appear more than once */
+ if (node_isset(n, *used_node_mask))
+ continue;
+
+ /* Use the distance array to find the distance */
+ val = node_distance(node, n);
+
+ /* Penalize nodes under us ("prefer the next node") */
+ val += (n < node);
+
+ /* Give preference to headless and unused nodes */
+ tmp = node_to_cpumask(n);
+ if (!cpus_empty(tmp))
+ val += PENALTY_FOR_NODE_WITH_CPUS;
+
+ /* Slight preference for less loaded node */
+ val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+ val += node_load[n];
+
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ if (best_node >= 0)
+ node_set(best_node, *used_node_mask);
+
+ return best_node;
+}
+
+static void __meminit build_zonelists(pg_data_t *pgdat)
+{
+ int i, j, k, node, local_node;
+ int prev_node, load;
+ struct zonelist *zonelist;
+ nodemask_t used_mask;
+
+ /* initialize zonelists */
+ for (i = 0; i < GFP_ZONETYPES; i++) {
+ zonelist = pgdat->node_zonelists + i;
+ zonelist->zones[0] = NULL;
+ }
+
+ /* NUMA-aware ordering of nodes */
+ local_node = pgdat->node_id;
+ load = num_online_nodes();
+ prev_node = local_node;
+ nodes_clear(used_mask);
+ while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+ int distance = node_distance(local_node, node);
+
+ /*
+ * If another node is sufficiently far away then it is better
+ * to reclaim pages in a zone before going off node.
+ */
+ if (distance > RECLAIM_DISTANCE)
+ zone_reclaim_mode = 1;
+
+ /*
+ * We don't want to pressure a particular node.
+ * So adding penalty to the first node in same
+ * distance group to make it round-robin.
+ */
+
+ if (distance != node_distance(local_node, prev_node))
+ node_load[node] += load;
+ prev_node = node;
+ load--;
+ for (i = 0; i < GFP_ZONETYPES; i++) {
+ zonelist = pgdat->node_zonelists + i;
+ for (j = 0; zonelist->zones[j] != NULL; j++);
+
+ k = highest_zone(i);
+
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+ zonelist->zones[j] = NULL;
+ }
+ }
+}
+
+#else /* CONFIG_NUMA */
+
+static void __meminit build_zonelists(pg_data_t *pgdat)
+{
+ int i, j, k, node, local_node;
+
+ local_node = pgdat->node_id;
+ for (i = 0; i < GFP_ZONETYPES; i++) {
+ struct zonelist *zonelist;
+
+ zonelist = pgdat->node_zonelists + i;
+
+ j = 0;
+ k = highest_zone(i);
+ j = build_zonelists_node(pgdat, zonelist, j, k);
+ /*
+ * Now we build the zonelist so that it contains the zones
+ * of all the other nodes.
+ * We don't want to pressure a particular node, so when
+ * building the zones for node N, we make sure that the
+ * zones coming right after the local ones are those from
+ * node N+1 (modulo N)
+ */
+ for (node = local_node + 1; node < MAX_NUMNODES; node++) {
+ if (!node_online(node))
+ continue;
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+ }
+ for (node = 0; node < local_node; node++) {
+ if (!node_online(node))
+ continue;
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+ }
+
+ zonelist->zones[j] = NULL;
+ }
+}
+
+#endif /* CONFIG_NUMA */
+
+/* return values int ....just for stop_machine_run() */
+static int __meminit __build_all_zonelists(void *dummy)
+{
+ int nid;
+ for_each_online_node(nid)
+ build_zonelists(NODE_DATA(nid));
+ return 0;
+}
+
+void __meminit build_all_zonelists(void)
+{
+ if (system_state == SYSTEM_BOOTING) {
+ __build_all_zonelists(0);
+ cpuset_init_current_mems_allowed();
+ } else {
+ /* we have to stop all cpus to guarantee there is no user
+ of zonelist */
+ stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
+ /* cpuset refresh routine should be here */
+ }
+
+ printk("Built %i zonelists\n", num_online_nodes());
+
+}
+
+/*
+ * Helper functions to size the waitqueue hash table.
+ * Essentially these want to choose hash table sizes sufficiently
+ * large so that collisions trying to wait on pages are rare.
+ * But in fact, the number of active page waitqueues on typical
+ * systems is ridiculously low, less than 200. So this is even
+ * conservative, even though it seems large.
+ *
+ * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
+ * waitqueues, i.e. the size of the waitq table given the number of pages.
+ */
+#define PAGES_PER_WAITQUEUE 256
+
+#ifndef CONFIG_MEMORY_HOTPLUG
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
+{
+ unsigned long size = 1;
+
+ pages /= PAGES_PER_WAITQUEUE;
+
+ while (size < pages)
+ size <<= 1;
+
+ /*
+ * Once we have dozens or even hundreds of threads sleeping
+ * on IO we've got bigger problems than wait queue collision.
+ * Limit the size of the wait table to a reasonable size.
+ */
+ size = min(size, 4096UL);
+
+ return max(size, 4UL);
+}
+#else
+/*
+ * A zone's size might be changed by hot-add, so it is not possible to determine
+ * a suitable size for its wait_table. So we use the maximum size now.
+ *
+ * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
+ *
+ * i386 (preemption config) : 4096 x 16 = 64Kbyte.
+ * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
+ * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
+ *
+ * The maximum entries are prepared when a zone's memory is (512K + 256) pages
+ * or more by the traditional way. (See above). It equals:
+ *
+ * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
+ * ia64(16K page size) : = ( 8G + 4M)byte.
+ * powerpc (64K page size) : = (32G +16M)byte.
+ */
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
+{
+ return 4096UL;
+}
+#endif
+
+/*
+ * This is an integer logarithm so that shifts can be used later
+ * to extract the more random high bits from the multiplicative
+ * hash function before the remainder is taken.
+ */
+static inline unsigned long wait_table_bits(unsigned long size)
+{
+ return ffz(~size);
+}
+
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
+#ifndef __HAVE_ARCH_MEMMAP_INIT
+#define memmap_init(size, nid, zone, start_pfn) \
+ memmap_init_zone((size), (nid), (zone), (start_pfn))
+#endif
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by free_all_bootmem() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ */
+void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+ unsigned long start_pfn)
+{
+ struct page *page;
+ unsigned long end_pfn = start_pfn + size;
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ if (!early_pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ set_page_links(page, zone, nid, pfn);
+ init_page_count(page);
+ reset_page_mapcount(page);
+ SetPageReserved(page);
+ INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+ /* The shift won't overflow because ZONE_NORMAL is below 4G. */
+ if (!is_highmem_idx(zone))
+ set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+ }
+}
+
+void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
+ unsigned long size)
+{
+ int order;
+ for (order = 0; order < MAX_ORDER ; order++) {
+ INIT_LIST_HEAD(&zone->free_area[order].free_list);
+ zone->free_area[order].nr_free = 0;
+ }
+}
+
+#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+ unsigned long size)
+{
+ unsigned long snum = pfn_to_section_nr(pfn);
+ unsigned long end = pfn_to_section_nr(pfn + size);
+
+ if (FLAGS_HAS_NODE)
+ zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+ else
+ for (; snum <= end; snum++)
+ zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
+static __meminit
+int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+ int i;
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ size_t alloc_size;
+
+ /*
+ * The per-page waitqueue mechanism uses hashed waitqueues
+ * per zone.
+ */
+ zone->wait_table_hash_nr_entries =
+ wait_table_hash_nr_entries(zone_size_pages);
+ zone->wait_table_bits =
+ wait_table_bits(zone->wait_table_hash_nr_entries);
+ alloc_size = zone->wait_table_hash_nr_entries
+ * sizeof(wait_queue_head_t);
+
+ if (system_state == SYSTEM_BOOTING) {
+ zone->wait_table = (wait_queue_head_t *)
+ alloc_bootmem_node(pgdat, alloc_size);
+ } else {
+ /*
+ * This case means that a zone whose size was 0 gets new memory
+ * via memory hot-add.
+ * But it may be the case that a new node was hot-added. In
+ * this case vmalloc() will not be able to use this new node's
+ * memory - this wait_table must be initialized to use this new
+ * node itself as well.
+ * To use this new node's memory, further consideration will be
+ * necessary.
+ */
+ zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
+ }
+ if (!zone->wait_table)
+ return -ENOMEM;
+
+ for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
+ init_waitqueue_head(zone->wait_table + i);
+
+ return 0;
+}
+
+/*
+ * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ */
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+ unsigned long high)
+{
+ struct per_cpu_pages *pcp;
+
+ pcp = &p->pcp[0]; /* hot list */
+ pcp->high = high;
+ pcp->batch = max(1UL, high/4);
+ if ((high/4) > (PAGE_SHIFT * 8))
+ pcp->batch = PAGE_SHIFT * 8;
+}
+
+/*
+ * percpu_pagelist_fraction - changes the pcp->high for each zone on each
+ * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
+ * can have before it gets flushed back to buddy allocator.
+ */
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ struct zone *zone;
+ unsigned int cpu;
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ if (!write || (ret == -EINVAL))
+ return ret;
+ for_each_zone(zone) {
+ for_each_online_cpu(cpu) {
+ unsigned long high;
+ high = zone->present_pages / percpu_pagelist_fraction;
+ setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+ }
+ }
+ return 0;
+}
+
+static int __cpuinit zone_batchsize(struct zone *zone)
+{
+ int batch;
+
+ /*
+ * The per-cpu-pages pools are set to around 1000th of the
+ * size of the zone. But no more than 1/2 of a meg.
+ *
+ * OK, so we don't know how big the cache is. So guess.
+ */
+ batch = zone->present_pages / 1024;
+ if (batch * PAGE_SIZE > 512 * 1024)
+ batch = (512 * 1024) / PAGE_SIZE;
+ batch /= 4; /* We effectively *= 4 below */
+ if (batch < 1)
+ batch = 1;
+
+ /*
+ * Clamp the batch to a 2^n - 1 value. Having a power
+ * of 2 value was found to be more likely to have
+ * suboptimal cache aliasing properties in some cases.
+ *
+ * For example if 2 tasks are alternately allocating
+ * batches of pages, one task can end up with a lot
+ * of pages of one half of the possible page colors
+ * and the other with pages of the other colors.
+ */
+ batch = (1 << (fls(batch + batch/2)-1)) - 1;
+
+ return batch;
+}
+
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+ struct per_cpu_pages *pcp;
+
+ memset(p, 0, sizeof(*p));
+
+ pcp = &p->pcp[0]; /* hot */
+ pcp->count = 0;
+ pcp->high = 6 * batch;
+ pcp->batch = max(1UL, 1 * batch);
+ INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &p->pcp[1]; /* cold*/
+ pcp->count = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = max(1UL, batch/2);
+ INIT_LIST_HEAD(&pcp->list);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static struct per_cpu_pageset boot_pageset[NR_CPUS];
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __cpuinit process_zones(int cpu)
+{
+ struct zone *zone, *dzone;
+
+ for_each_zone(zone) {
+
+ zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!zone_pcp(zone, cpu))
+ goto bad;
+
+ setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
+
+ if (percpu_pagelist_fraction)
+ setup_pagelist_highmark(zone_pcp(zone, cpu),
+ (zone->present_pages / percpu_pagelist_fraction));
+ }
+
+ return 0;
+bad:
+ for_each_zone(dzone) {
+ if (dzone == zone)
+ break;
+ kfree(zone_pcp(dzone, cpu));
+ zone_pcp(dzone, cpu) = NULL;
+ }
+ return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+ struct zone *zone;
+
+ for_each_zone(zone) {
+ struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+ zone_pcp(zone, cpu) = NULL;
+ kfree(pset);
+ }
+}
+
+static int pageset_cpuup_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+ int ret = NOTIFY_OK;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (process_zones(cpu))
+ ret = NOTIFY_BAD;
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+ free_zone_pagesets(cpu);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+static struct notifier_block pageset_notifier =
+ { &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset(void)
+{
+ int err;
+
+ /* Initialize per_cpu_pageset for cpu 0.
+ * A cpuup callback will do this for every cpu
+ * as it comes online
+ */
+ err = process_zones(smp_processor_id());
+ BUG_ON(err);
+ register_cpu_notifier(&pageset_notifier);
+}
+#endif
+
+static __meminit void zone_pcp_init(struct zone *zone)
+{
+ int cpu;
+ unsigned long batch = zone_batchsize(zone);
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+ /* Early boot. Slab allocator not functional yet */
+ zone_pcp(zone, cpu) = &boot_pageset[cpu];
+ setup_pageset(&boot_pageset[cpu],0);
+#else
+ setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
+ }
+ if (zone->present_pages)
+ printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
+ zone->name, zone->present_pages, batch);
+}
+
+__meminit int init_currently_empty_zone(struct zone *zone,
+ unsigned long zone_start_pfn,
+ unsigned long size)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int ret;
+ ret = zone_wait_table_init(zone, size);
+ if (ret)
+ return ret;
+ pgdat->nr_zones = zone_idx(zone) + 1;
+
+ zone->zone_start_pfn = zone_start_pfn;
+
+ memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
+
+ zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+
+ return 0;
+}
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/* Note: nid == MAX_NUMNODES returns first region */
+static int __init first_active_region_index_in_nid(int nid)
+{
+ int i;
+ for (i = 0; early_node_map[i].end_pfn; i++) {
+ if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+ return i;
+ }
+
+ return MAX_ACTIVE_REGIONS;
+}
+
+/* Note: nid == MAX_NUMNODES returns next region */
+static int __init next_active_region_index_in_nid(unsigned int index, int nid)
+{
+ for (index = index + 1; early_node_map[index].end_pfn; index++) {
+ if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+ return index;
+ }
+
+ return MAX_ACTIVE_REGIONS;
+}
+
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+int __init early_pfn_to_nid(unsigned long pfn)
+{
+ int i;
+
+ for (i = 0; early_node_map[i].end_pfn; i++) {
+ unsigned long start_pfn = early_node_map[i].start_pfn;
+ unsigned long end_pfn = early_node_map[i].end_pfn;
+
+ if ((start_pfn <= pfn) && (pfn < end_pfn))
+ return early_node_map[i].nid;
+ }
+
+ return -1;
+}
+#endif
+
+#define for_each_active_range_index_in_nid(i, nid) \
+ for (i = first_active_region_index_in_nid(nid); \
+ i != MAX_ACTIVE_REGIONS; \
+ i = next_active_region_index_in_nid(i, nid))
+
+void __init free_bootmem_with_active_regions(int nid,
+ unsigned long max_low_pfn)
+{
+ unsigned int i;
+ for_each_active_range_index_in_nid(i, nid) {
+ unsigned long size_pages = 0;
+ unsigned long end_pfn = early_node_map[i].end_pfn;
+ if (early_node_map[i].start_pfn >= max_low_pfn)
+ continue;
+
+ if (end_pfn > max_low_pfn)
+ end_pfn = max_low_pfn;
+
+ size_pages = end_pfn - early_node_map[i].start_pfn;
+ free_bootmem_node(NODE_DATA(early_node_map[i].nid),
+ PFN_PHYS(early_node_map[i].start_pfn),
+ size_pages << PAGE_SHIFT);
+ }
+}
+
+void __init sparse_memory_present_with_active_regions(int nid)
+{
+ unsigned int i;
+ for_each_active_range_index_in_nid(i, nid)
+ memory_present(early_node_map[i].nid,
+ early_node_map[i].start_pfn,
+ early_node_map[i].end_pfn);
+}
+
+void __init get_pfn_range_for_nid(unsigned int nid,
+ unsigned long *start_pfn, unsigned long *end_pfn)
+{
+ unsigned int i;
+ *start_pfn = -1UL;
+ *end_pfn = 0;
+
+ for_each_active_range_index_in_nid(i, nid) {
+ *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
+ *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
+ }
+
+ if (*start_pfn == -1UL) {
+ printk(KERN_WARNING "Node %u active with no memory\n", nid);
+ *start_pfn = 0;
+ }
+}
+
+unsigned long __init zone_present_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long *ignored)
+{
+ unsigned long node_start_pfn, node_end_pfn;
+ unsigned long zone_start_pfn, zone_end_pfn;
+
+ /* Get the start and end of the node and zone */
+ get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+ zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+
+ /* Check that this node has pages within the zone's required range */
+ if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+ return 0;
+
+ /* Move the zone boundaries inside the node if necessary */
+ zone_end_pfn = min(zone_end_pfn, node_end_pfn);
+ zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+
+ /* Return the spanned pages */
+ return zone_end_pfn - zone_start_pfn;
+}
+
+unsigned long __init __absent_pages_in_range(int nid,
+ unsigned long range_start_pfn,
+ unsigned long range_end_pfn)
+{
+ int i = 0;
+ unsigned long prev_end_pfn = 0, hole_pages = 0;
+ unsigned long start_pfn;
+
+ /* Seed prev_end_pfn with the start_pfn of the node's first active range */
+ i = first_active_region_index_in_nid(nid);
+ if (i == MAX_ACTIVE_REGIONS)
+ return 0;
+ prev_end_pfn = early_node_map[i].start_pfn;
+
+ /* Find all holes for the zone within the node */
+ for (; i != MAX_ACTIVE_REGIONS;
+ i = next_active_region_index_in_nid(i, nid)) {
+
+ /* No need to continue if prev_end_pfn is outside the zone */
+ if (prev_end_pfn >= range_end_pfn)
+ break;
+
+ /* Make sure the end of the zone is not within the hole */
+ start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
+ prev_end_pfn = max(prev_end_pfn, range_start_pfn);
+
+ /* Update the hole size count and move on */
+ if (start_pfn > range_start_pfn) {
+ BUG_ON(prev_end_pfn > start_pfn);
+ hole_pages += start_pfn - prev_end_pfn;
+ }
+ prev_end_pfn = early_node_map[i].end_pfn;
+ }
+
+ return hole_pages;
+}
+
+unsigned long __init absent_pages_in_range(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+}
+
+unsigned long __init zone_absent_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long *ignored)
+{
+ return __absent_pages_in_range(nid,
+ arch_zone_lowest_possible_pfn[zone_type],
+ arch_zone_highest_possible_pfn[zone_type]);
+}
+#else
+static inline unsigned long zone_present_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long *zones_size)
+{
+ return zones_size[zone_type];
+}
+
+static inline unsigned long zone_absent_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long *zholes_size)
+{
+ if (!zholes_size)
+ return 0;
+
+ return zholes_size[zone_type];
+}
+#endif
+
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+ unsigned long *zones_size, unsigned long *zholes_size)
+{
+ unsigned long realtotalpages, totalpages = 0;
+ int i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ totalpages += zone_present_pages_in_node(pgdat->node_id, i,
+ zones_size);
+ }
+ pgdat->node_spanned_pages = totalpages;
+
+ realtotalpages = totalpages;
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ realtotalpages -=
+ zone_absent_pages_in_node(pgdat->node_id, i, zholes_size);
+ }
+ pgdat->node_present_pages = realtotalpages;
+ printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
+ realtotalpages);
+}
+
+/*
+ * Set up the zone data structures:
+ * - mark all pages reserved
+ * - mark all memory queues empty
+ * - clear the memory bitmaps
+ */
+static void __meminit free_area_init_core(struct pglist_data *pgdat,
+ unsigned long *zones_size, unsigned long *zholes_size)
+{
+ unsigned long j;
+ int nid = pgdat->node_id;
+ unsigned long zone_start_pfn = pgdat->node_start_pfn;
+ int ret;
+
+ pgdat_resize_init(pgdat);
+ pgdat->nr_zones = 0;
+ init_waitqueue_head(&pgdat->kswapd_wait);
+ pgdat->kswapd_max_order = 0;
+
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ struct zone *zone = pgdat->node_zones + j;
+ unsigned long size, realsize;
+
+ size = zone_present_pages_in_node(nid, j, zones_size);
+ realsize = size - zone_absent_pages_in_node(nid, j,
+ zholes_size);
+ if (j < ZONE_HIGHMEM)
+ nr_kernel_pages += realsize;
+ nr_all_pages += realsize;
+
+ zone->spanned_pages = size;
+ zone->present_pages = realsize;
+ zone->name = zone_names[j];
+ spin_lock_init(&zone->lock);
+ spin_lock_init(&zone->lru_lock);
+ zone_seqlock_init(zone);
+ zone->zone_pgdat = pgdat;
+ zone->free_pages = 0;
+
+ zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+
+ zone_pcp_init(zone);
+ INIT_LIST_HEAD(&zone->active_list);
+ INIT_LIST_HEAD(&zone->inactive_list);
+ zone->nr_scan_active = 0;
+ zone->nr_scan_inactive = 0;
+ zone->nr_active = 0;
+ zone->nr_inactive = 0;
+ atomic_set(&zone->reclaim_in_progress, 0);
+ if (!size)
+ continue;
+
+ zonetable_add(zone, nid, j, zone_start_pfn, size);
+ ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+ BUG_ON(ret);
+ zone_start_pfn += size;
+ }
+}
+
+static void __init alloc_node_mem_map(struct pglist_data *pgdat)
+{
+ /* Skip empty nodes */
+ if (!pgdat->node_spanned_pages)
+ return;
+
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ /* ia64 gets its own node_mem_map, before this, without bootmem */
+ if (!pgdat->node_mem_map) {
+ unsigned long size;
+ struct page *map;
+
+ size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
+ map = alloc_remap(pgdat->node_id, size);
+ if (!map)
+ map = alloc_bootmem_node(pgdat, size);
+ pgdat->node_mem_map = map;
+ }
+#ifdef CONFIG_FLATMEM
+ /*
+ * With no DISCONTIG, the global mem_map is just set as node 0's
+ */
+ if (pgdat == NODE_DATA(0))
+ mem_map = NODE_DATA(0)->node_mem_map;
+#endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
+}
+
+void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
+ unsigned long *zones_size, unsigned long node_start_pfn,
+ unsigned long *zholes_size)
+{
+ pgdat->node_id = nid;
+ pgdat->node_start_pfn = node_start_pfn;
+ calculate_node_totalpages(pgdat, zones_size, zholes_size);
+
+ alloc_node_mem_map(pgdat);
+
+ free_area_init_core(pgdat, zones_size, zholes_size);
+}
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+void __init add_active_range(unsigned int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned int i;
+
+ /* Merge with existing active regions if possible */
+ for (i = 0; early_node_map[i].end_pfn; i++) {
+ if (early_node_map[i].nid != nid)
+ continue;
+
+ /* Skip if an existing region covers this new one */
+ if (start_pfn >= early_node_map[i].start_pfn &&
+ end_pfn <= early_node_map[i].end_pfn)
+ return;
+
+ /* Merge forward if suitable */
+ if (start_pfn <= early_node_map[i].end_pfn &&
+ end_pfn > early_node_map[i].end_pfn) {
+ early_node_map[i].end_pfn = end_pfn;
+ return;
+ }
+
+ /* Merge backward if suitable */
+ if (start_pfn < early_node_map[i].end_pfn &&
+ end_pfn >= early_node_map[i].start_pfn) {
+ early_node_map[i].start_pfn = start_pfn;
+ return;
+ }
+ }
+
+ /* Leave last entry NULL, we use range.end_pfn to terminate the walk */
+ if (i >= MAX_ACTIVE_REGIONS - 1) {
+ printk(KERN_ERR "Too many memory regions, truncating\n");
+ return;
+ }
+
+ early_node_map[i].nid = nid;
+ early_node_map[i].start_pfn = start_pfn;
+ early_node_map[i].end_pfn = end_pfn;
+}
+
+void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
+ unsigned long new_end_pfn)
+{
+ unsigned int i;
+
+ /* Find the old active region end and shrink */
+ for_each_active_range_index_in_nid(i, nid) {
+ if (early_node_map[i].end_pfn == old_end_pfn) {
+ early_node_map[i].end_pfn = new_end_pfn;
+ break;
+ }
+ }
+}
+
+void __init remove_all_active_ranges()
+{
+ memset(early_node_map, 0, sizeof(early_node_map));
+}
+
+/* Compare two active node_active_regions */
+static int __init cmp_node_active_region(const void *a, const void *b)
+{
+ struct node_active_region *arange = (struct node_active_region *)a;
+ struct node_active_region *brange = (struct node_active_region *)b;
+
+ /* Done this way to avoid overflows */
+ if (arange->start_pfn > brange->start_pfn)
+ return 1;
+ if (arange->start_pfn < brange->start_pfn)
+ return -1;
+
+ return 0;
+}
+
+/* sort the node_map by start_pfn */
+static void __init sort_node_map(void)
+{
+ size_t num = 0;
+ while (early_node_map[num].end_pfn)
+ num++;
+
+ sort(early_node_map, num, sizeof(struct node_active_region),
+ cmp_node_active_region, NULL);
+}
+
+/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
+unsigned long __init find_min_pfn_for_node(unsigned long nid)
+{
+ int i;
+
+ /* Assuming a sorted map, the first range found has the starting pfn */
+ for_each_active_range_index_in_nid(i, nid)
+ return early_node_map[i].start_pfn;
+
+ printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
+ return 0;
+}
+
+unsigned long __init find_min_pfn_with_active_regions(void)
+{
+ return find_min_pfn_for_node(MAX_NUMNODES);
+}
+
+unsigned long __init find_max_pfn_with_active_regions(void)
+{
+ int i;
+ unsigned long max_pfn = 0;
+
+ for (i = 0; early_node_map[i].end_pfn; i++)
+ max_pfn = max(max_pfn, early_node_map[i].end_pfn);
+
+ return max_pfn;
+}
+
+void __init free_area_init_nodes(unsigned long arch_max_dma_pfn,
+ unsigned long arch_max_dma32_pfn,
+ unsigned long arch_max_low_pfn,
+ unsigned long arch_max_high_pfn)
+{
+ unsigned long nid;
+ int zone_index;
+
+ /* Record where the zone boundaries are */
+ memset(arch_zone_lowest_possible_pfn, 0,
+ sizeof(arch_zone_lowest_possible_pfn));
+ memset(arch_zone_highest_possible_pfn, 0,
+ sizeof(arch_zone_highest_possible_pfn));
+ arch_zone_lowest_possible_pfn[ZONE_DMA] =
+ find_min_pfn_with_active_regions();
+ arch_zone_highest_possible_pfn[ZONE_DMA] = arch_max_dma_pfn;
+ arch_zone_highest_possible_pfn[ZONE_DMA32] = arch_max_dma32_pfn;
+ arch_zone_highest_possible_pfn[ZONE_NORMAL] = arch_max_low_pfn;
+ arch_zone_highest_possible_pfn[ZONE_HIGHMEM] = arch_max_high_pfn;
+ for (zone_index = 1; zone_index < MAX_NR_ZONES; zone_index++) {
+ arch_zone_lowest_possible_pfn[zone_index] =
+ arch_zone_highest_possible_pfn[zone_index-1];
+ }
+
+ /* Regions in the early_node_map can be in any order */
+ sort_node_map();
+
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ free_area_init_node(nid, pgdat, NULL,
+ find_min_pfn_for_node(nid), NULL);
+ }
+}
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/mm/page_alloc.c linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/page_alloc.c
--- linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/mm/page_alloc.c 2006-05-01 11:39:02.000000000 +0100
+++ linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/page_alloc.c 2006-05-01 11:44:35.000000000 +0100
@@ -38,8 +38,6 @@
#include <linux/vmalloc.h>
#include <linux/mempolicy.h>
#include <linux/stop_machine.h>
-#include <linux/sort.h>
-#include <linux/pfn.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -56,7 +54,6 @@ unsigned long totalram_pages __read_most
unsigned long totalhigh_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
long nr_swap_pages;
-int percpu_pagelist_fraction;
static void __free_pages_ok(struct page *page, unsigned int order);
@@ -82,24 +79,11 @@ EXPORT_SYMBOL(totalram_pages);
struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
EXPORT_SYMBOL(zone_table);
-static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
int min_free_kbytes = 1024;
unsigned long __meminitdata nr_kernel_pages;
unsigned long __meminitdata nr_all_pages;
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
- #ifdef CONFIG_MAX_ACTIVE_REGIONS
- #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
- #else
- #define MAX_ACTIVE_REGIONS (MAX_NR_ZONES * MAX_NUMNODES + 1)
- #endif
-
- struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
- unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
- unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
-
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
@@ -1593,1068 +1577,6 @@ void show_free_areas(void)
show_swap_cache_info();
}
-/*
- * Builds allocation fallback zone lists.
- *
- * Add all populated zones of a node to the zonelist.
- */
-static int __meminit build_zonelists_node(pg_data_t *pgdat,
- struct zonelist *zonelist, int nr_zones, int zone_type)
-{
- struct zone *zone;
-
- BUG_ON(zone_type > ZONE_HIGHMEM);
-
- do {
- zone = pgdat->node_zones + zone_type;
- if (populated_zone(zone)) {
-#ifndef CONFIG_HIGHMEM
- BUG_ON(zone_type > ZONE_NORMAL);
-#endif
- zonelist->zones[nr_zones++] = zone;
- check_highest_zone(zone_type);
- }
- zone_type--;
-
- } while (zone_type >= 0);
- return nr_zones;
-}
-
-static inline int highest_zone(int zone_bits)
-{
- int res = ZONE_NORMAL;
- if (zone_bits & (__force int)__GFP_HIGHMEM)
- res = ZONE_HIGHMEM;
- if (zone_bits & (__force int)__GFP_DMA32)
- res = ZONE_DMA32;
- if (zone_bits & (__force int)__GFP_DMA)
- res = ZONE_DMA;
- return res;
-}
-
-#ifdef CONFIG_NUMA
-#define MAX_NODE_LOAD (num_online_nodes())
-static int __meminitdata node_load[MAX_NUMNODES];
-/**
- * find_next_best_node - find the next node that should appear in a given node's fallback list
- * @node: node whose fallback list we're appending
- * @used_node_mask: nodemask_t of already used nodes
- *
- * We use a number of factors to determine which is the next node that should
- * appear on a given node's fallback list. The node should not have appeared
- * already in @node's fallback list, and it should be the next closest node
- * according to the distance array (which contains arbitrary distance values
- * from each node to each node in the system), and should also prefer nodes
- * with no CPUs, since presumably they'll have very little allocation pressure
- * on them otherwise.
- * It returns -1 if no node is found.
- */
-static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
-{
- int n, val;
- int min_val = INT_MAX;
- int best_node = -1;
-
- /* Use the local node if we haven't already */
- if (!node_isset(node, *used_node_mask)) {
- node_set(node, *used_node_mask);
- return node;
- }
-
- for_each_online_node(n) {
- cpumask_t tmp;
-
- /* Don't want a node to appear more than once */
- if (node_isset(n, *used_node_mask))
- continue;
-
- /* Use the distance array to find the distance */
- val = node_distance(node, n);
-
- /* Penalize nodes under us ("prefer the next node") */
- val += (n < node);
-
- /* Give preference to headless and unused nodes */
- tmp = node_to_cpumask(n);
- if (!cpus_empty(tmp))
- val += PENALTY_FOR_NODE_WITH_CPUS;
-
- /* Slight preference for less loaded node */
- val *= (MAX_NODE_LOAD*MAX_NUMNODES);
- val += node_load[n];
-
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- if (best_node >= 0)
- node_set(best_node, *used_node_mask);
-
- return best_node;
-}
-
-static void __meminit build_zonelists(pg_data_t *pgdat)
-{
- int i, j, k, node, local_node;
- int prev_node, load;
- struct zonelist *zonelist;
- nodemask_t used_mask;
-
- /* initialize zonelists */
- for (i = 0; i < GFP_ZONETYPES; i++) {
- zonelist = pgdat->node_zonelists + i;
- zonelist->zones[0] = NULL;
- }
-
- /* NUMA-aware ordering of nodes */
- local_node = pgdat->node_id;
- load = num_online_nodes();
- prev_node = local_node;
- nodes_clear(used_mask);
- while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
- int distance = node_distance(local_node, node);
-
- /*
- * If another node is sufficiently far away then it is better
- * to reclaim pages in a zone before going off node.
- */
- if (distance > RECLAIM_DISTANCE)
- zone_reclaim_mode = 1;
-
- /*
- * We don't want to pressure a particular node.
- * So adding penalty to the first node in same
- * distance group to make it round-robin.
- */
-
- if (distance != node_distance(local_node, prev_node))
- node_load[node] += load;
- prev_node = node;
- load--;
- for (i = 0; i < GFP_ZONETYPES; i++) {
- zonelist = pgdat->node_zonelists + i;
- for (j = 0; zonelist->zones[j] != NULL; j++);
-
- k = highest_zone(i);
-
- j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
- zonelist->zones[j] = NULL;
- }
- }
-}
-
-#else /* CONFIG_NUMA */
-
-static void __meminit build_zonelists(pg_data_t *pgdat)
-{
- int i, j, k, node, local_node;
-
- local_node = pgdat->node_id;
- for (i = 0; i < GFP_ZONETYPES; i++) {
- struct zonelist *zonelist;
-
- zonelist = pgdat->node_zonelists + i;
-
- j = 0;
- k = highest_zone(i);
- j = build_zonelists_node(pgdat, zonelist, j, k);
- /*
- * Now we build the zonelist so that it contains the zones
- * of all the other nodes.
- * We don't want to pressure a particular node, so when
- * building the zones for node N, we make sure that the
- * zones coming right after the local ones are those from
- * node N+1 (modulo N)
- */
- for (node = local_node + 1; node < MAX_NUMNODES; node++) {
- if (!node_online(node))
- continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
- }
- for (node = 0; node < local_node; node++) {
- if (!node_online(node))
- continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
- }
-
- zonelist->zones[j] = NULL;
- }
-}
-
-#endif /* CONFIG_NUMA */
-
-/* return values int ....just for stop_machine_run() */
-static int __meminit __build_all_zonelists(void *dummy)
-{
- int nid;
- for_each_online_node(nid)
- build_zonelists(NODE_DATA(nid));
- return 0;
-}
-
-void __meminit build_all_zonelists(void)
-{
- if (system_state == SYSTEM_BOOTING) {
- __build_all_zonelists(0);
- cpuset_init_current_mems_allowed();
- } else {
- /* we have to stop all cpus to guaranntee there is no user
- of zonelist */
- stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
- /* cpuset refresh routine should be here */
- }
-
- printk("Built %i zonelists\n", num_online_nodes());
-
-}
-
-/*
- * Helper functions to size the waitqueue hash table.
- * Essentially these want to choose hash table sizes sufficiently
- * large so that collisions trying to wait on pages are rare.
- * But in fact, the number of active page waitqueues on typical
- * systems is ridiculously low, less than 200. So this is even
- * conservative, even though it seems large.
- *
- * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
- * waitqueues, i.e. the size of the waitq table given the number of pages.
- */
-#define PAGES_PER_WAITQUEUE 256
-
-#ifndef CONFIG_MEMORY_HOTPLUG
-static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
-{
- unsigned long size = 1;
-
- pages /= PAGES_PER_WAITQUEUE;
-
- while (size < pages)
- size <<= 1;
-
- /*
- * Once we have dozens or even hundreds of threads sleeping
- * on IO we've got bigger problems than wait queue collision.
- * Limit the size of the wait table to a reasonable size.
- */
- size = min(size, 4096UL);
-
- return max(size, 4UL);
-}
-#else
-/*
- * A zone's size might be changed by hot-add, so it is not possible to determine
- * a suitable size for its wait_table. So we use the maximum size now.
- *
- * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
- *
- * i386 (preemption config) : 4096 x 16 = 64Kbyte.
- * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
- * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
- *
- * The maximum entries are prepared when a zone's memory is (512K + 256) pages
- * or more by the traditional way. (See above). It equals:
- *
- * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
- * ia64(16K page size) : = ( 8G + 4M)byte.
- * powerpc (64K page size) : = (32G +16M)byte.
- */
-static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
-{
- return 4096UL;
-}
-#endif
-
-/*
- * This is an integer logarithm so that shifts can be used later
- * to extract the more random high bits from the multiplicative
- * hash function before the remainder is taken.
- */
-static inline unsigned long wait_table_bits(unsigned long size)
-{
- return ffz(~size);
-}
-
-#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
-
-/*
- * Initially all pages are reserved - free ones are freed
- * up by free_all_bootmem() once the early boot process is
- * done. Non-atomic initialization, single-pass.
- */
-void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn)
-{
- struct page *page;
- unsigned long end_pfn = start_pfn + size;
- unsigned long pfn;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- if (!early_pfn_valid(pfn))
- continue;
- page = pfn_to_page(pfn);
- set_page_links(page, zone, nid, pfn);
- init_page_count(page);
- reset_page_mapcount(page);
- SetPageReserved(page);
- INIT_LIST_HEAD(&page->lru);
-#ifdef WANT_PAGE_VIRTUAL
- /* The shift won't overflow because ZONE_NORMAL is below 4G. */
- if (!is_highmem_idx(zone))
- set_page_address(page, __va(pfn << PAGE_SHIFT));
-#endif
-#ifdef CONFIG_PAGE_OWNER
- page->order = -1;
-#endif
- }
-}
-
-void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
- unsigned long size)
-{
- int order;
- for (order = 0; order < MAX_ORDER ; order++) {
- INIT_LIST_HEAD(&zone->free_area[order].free_list);
- zone->free_area[order].nr_free = 0;
- }
-}
-
-#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
- unsigned long size)
-{
- unsigned long snum = pfn_to_section_nr(pfn);
- unsigned long end = pfn_to_section_nr(pfn + size);
-
- if (FLAGS_HAS_NODE)
- zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
- else
- for (; snum <= end; snum++)
- zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
-}
-
-#ifndef __HAVE_ARCH_MEMMAP_INIT
-#define memmap_init(size, nid, zone, start_pfn) \
- memmap_init_zone((size), (nid), (zone), (start_pfn))
-#endif
-
-static int __cpuinit zone_batchsize(struct zone *zone)
-{
- int batch;
-
- /*
- * The per-cpu-pages pools are set to around 1000th of the
- * size of the zone. But no more than 1/2 of a meg.
- *
- * OK, so we don't know how big the cache is. So guess.
- */
- batch = zone->present_pages / 1024;
- if (batch * PAGE_SIZE > 512 * 1024)
- batch = (512 * 1024) / PAGE_SIZE;
- batch /= 4; /* We effectively *= 4 below */
- if (batch < 1)
- batch = 1;
-
- /*
- * Clamp the batch to a 2^n - 1 value. Having a power
- * of 2 value was found to be more likely to have
- * suboptimal cache aliasing properties in some cases.
- *
- * For example if 2 tasks are alternately allocating
- * batches of pages, one task can end up with a lot
- * of pages of one half of the possible page colors
- * and the other with pages of the other colors.
- */
- batch = (1 << (fls(batch + batch/2)-1)) - 1;
-
- return batch;
-}
-
-inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
-{
- struct per_cpu_pages *pcp;
-
- memset(p, 0, sizeof(*p));
-
- pcp = &p->pcp[0]; /* hot */
- pcp->count = 0;
- pcp->high = 6 * batch;
- pcp->batch = max(1UL, 1 * batch);
- INIT_LIST_HEAD(&pcp->list);
-
- pcp = &p->pcp[1]; /* cold*/
- pcp->count = 0;
- pcp->high = 2 * batch;
- pcp->batch = max(1UL, batch/2);
- INIT_LIST_HEAD(&pcp->list);
-}
-
-/*
- * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
- * to the value high for the pageset p.
- */
-
-static void setup_pagelist_highmark(struct per_cpu_pageset *p,
- unsigned long high)
-{
- struct per_cpu_pages *pcp;
-
- pcp = &p->pcp[0]; /* hot list */
- pcp->high = high;
- pcp->batch = max(1UL, high/4);
- if ((high/4) > (PAGE_SHIFT * 8))
- pcp->batch = PAGE_SHIFT * 8;
-}
-
-
-#ifdef CONFIG_NUMA
-/*
- * Boot pageset table. One per cpu which is going to be used for all
- * zones and all nodes. The parameters will be set in such a way
- * that an item put on a list will immediately be handed over to
- * the buddy list. This is safe since pageset manipulation is done
- * with interrupts disabled.
- *
- * Some NUMA counter updates may also be caught by the boot pagesets.
- *
- * The boot_pagesets must be kept even after bootup is complete for
- * unused processors and/or zones. They do play a role for bootstrapping
- * hotplugged processors.
- *
- * zoneinfo_show() and maybe other functions do
- * not check if the processor is online before following the pageset pointer.
- * Other parts of the kernel may not check if the zone is available.
- */
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
-/*
- * Dynamically allocate memory for the
- * per cpu pageset array in struct zone.
- */
-static int __cpuinit process_zones(int cpu)
-{
- struct zone *zone, *dzone;
-
- for_each_zone(zone) {
-
- zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
- GFP_KERNEL, cpu_to_node(cpu));
- if (!zone_pcp(zone, cpu))
- goto bad;
-
- setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
-
- if (percpu_pagelist_fraction)
- setup_pagelist_highmark(zone_pcp(zone, cpu),
- (zone->present_pages / percpu_pagelist_fraction));
- }
-
- return 0;
-bad:
- for_each_zone(dzone) {
- if (dzone == zone)
- break;
- kfree(zone_pcp(dzone, cpu));
- zone_pcp(dzone, cpu) = NULL;
- }
- return -ENOMEM;
-}
-
-static inline void free_zone_pagesets(int cpu)
-{
- struct zone *zone;
-
- for_each_zone(zone) {
- struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
-
- zone_pcp(zone, cpu) = NULL;
- kfree(pset);
- }
-}
-
-static int pageset_cpuup_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- int cpu = (long)hcpu;
- int ret = NOTIFY_OK;
-
- switch (action) {
- case CPU_UP_PREPARE:
- if (process_zones(cpu))
- ret = NOTIFY_BAD;
- break;
- case CPU_UP_CANCELED:
- case CPU_DEAD:
- free_zone_pagesets(cpu);
- break;
- default:
- break;
- }
- return ret;
-}
-
-static struct notifier_block pageset_notifier =
- { &pageset_cpuup_callback, NULL, 0 };
-
-void __init setup_per_cpu_pageset(void)
-{
- int err;
-
- /* Initialize per_cpu_pageset for cpu 0.
- * A cpuup callback will do this for every cpu
- * as it comes online
- */
- err = process_zones(smp_processor_id());
- BUG_ON(err);
- register_cpu_notifier(&pageset_notifier);
-}
-
-#endif
-
-static __meminit
-int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
-{
- int i;
- struct pglist_data *pgdat = zone->zone_pgdat;
- size_t alloc_size;
-
- /*
- * The per-page waitqueue mechanism uses hashed waitqueues
- * per zone.
- */
- zone->wait_table_hash_nr_entries =
- wait_table_hash_nr_entries(zone_size_pages);
- zone->wait_table_bits =
- wait_table_bits(zone->wait_table_hash_nr_entries);
- alloc_size = zone->wait_table_hash_nr_entries
- * sizeof(wait_queue_head_t);
-
- if (system_state == SYSTEM_BOOTING) {
- zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node(pgdat, alloc_size);
- } else {
- /*
- * This case means that a zone whose size was 0 gets new memory
- * via memory hot-add.
- * But it may be the case that a new node was hot-added. In
- * this case vmalloc() will not be able to use this new node's
- * memory - this wait_table must be initialized to use this new
- * node itself as well.
- * To use this new node's memory, further consideration will be
- * necessary.
- */
- zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
- }
- if (!zone->wait_table)
- return -ENOMEM;
-
- for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
- init_waitqueue_head(zone->wait_table + i);
-
- return 0;
-}
-
-static __meminit void zone_pcp_init(struct zone *zone)
-{
- int cpu;
- unsigned long batch = zone_batchsize(zone);
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
- /* Early boot. Slab allocator not functional yet */
- zone_pcp(zone, cpu) = &boot_pageset[cpu];
- setup_pageset(&boot_pageset[cpu],0);
-#else
- setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
- }
- if (zone->present_pages)
- printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
- zone->name, zone->present_pages, batch);
-}
-
-__meminit int init_currently_empty_zone(struct zone *zone,
- unsigned long zone_start_pfn,
- unsigned long size)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
- int ret;
- ret = zone_wait_table_init(zone, size);
- if (ret)
- return ret;
- pgdat->nr_zones = zone_idx(zone) + 1;
-
- zone->zone_start_pfn = zone_start_pfn;
-
- memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
-
- zone_init_free_lists(pgdat, zone, zone->spanned_pages);
-
- return 0;
-}
-
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-/* Note: nid == MAX_NUMNODES returns first region */
-static int __init first_active_region_index_in_nid(int nid)
-{
- int i;
- for (i = 0; early_node_map[i].end_pfn; i++) {
- if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
- return i;
- }
-
- return MAX_ACTIVE_REGIONS;
-}
-
-/* Note: nid == MAX_NUMNODES returns next region */
-static int __init next_active_region_index_in_nid(unsigned int index, int nid)
-{
- for (index = index + 1; early_node_map[index].end_pfn; index++) {
- if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
- return index;
- }
-
- return MAX_ACTIVE_REGIONS;
-}
-
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
-int __init early_pfn_to_nid(unsigned long pfn)
-{
- int i;
-
- for (i = 0; early_node_map[i].end_pfn; i++) {
- unsigned long start_pfn = early_node_map[i].start_pfn;
- unsigned long end_pfn = early_node_map[i].end_pfn;
-
- if ((start_pfn <= pfn) && (pfn < end_pfn))
- return early_node_map[i].nid;
- }
-
- return -1;
-}
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-
-#define for_each_active_range_index_in_nid(i, nid) \
- for (i = first_active_region_index_in_nid(nid); \
- i != MAX_ACTIVE_REGIONS; \
- i = next_active_region_index_in_nid(i, nid))
-
-void __init free_bootmem_with_active_regions(int nid,
- unsigned long max_low_pfn)
-{
- unsigned int i;
- for_each_active_range_index_in_nid(i, nid) {
- unsigned long size_pages = 0;
- unsigned long end_pfn = early_node_map[i].end_pfn;
- if (early_node_map[i].start_pfn >= max_low_pfn)
- continue;
-
- if (end_pfn > max_low_pfn)
- end_pfn = max_low_pfn;
-
- size_pages = end_pfn - early_node_map[i].start_pfn;
- free_bootmem_node(NODE_DATA(early_node_map[i].nid),
- PFN_PHYS(early_node_map[i].start_pfn),
- size_pages << PAGE_SHIFT);
- }
-}
-
-void __init sparse_memory_present_with_active_regions(int nid)
-{
- unsigned int i;
- for_each_active_range_index_in_nid(i, nid)
- memory_present(early_node_map[i].nid,
- early_node_map[i].start_pfn,
- early_node_map[i].end_pfn);
-}
-
-void __init get_pfn_range_for_nid(unsigned int nid,
- unsigned long *start_pfn, unsigned long *end_pfn)
-{
- unsigned int i;
- *start_pfn = -1UL;
- *end_pfn = 0;
-
- for_each_active_range_index_in_nid(i, nid) {
- *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
- *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
- }
-
- if (*start_pfn == -1UL) {
- printk(KERN_WARNING "Node %u active with no memory\n", nid);
- *start_pfn = 0;
- }
-}
-
-unsigned long __init zone_present_pages_in_node(int nid,
- unsigned long zone_type,
- unsigned long *ignored)
-{
- unsigned long node_start_pfn, node_end_pfn;
- unsigned long zone_start_pfn, zone_end_pfn;
-
- /* Get the start and end of the node and zone */
- get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
- zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
- zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
-
- /* Check that this node has pages within the zone's required range */
- if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
- return 0;
-
- /* Move the zone boundaries inside the node if necessary */
- zone_end_pfn = min(zone_end_pfn, node_end_pfn);
- zone_start_pfn = max(zone_start_pfn, node_start_pfn);
-
- /* Return the spanned pages */
- return zone_end_pfn - zone_start_pfn;
-}
-
-unsigned long __init __absent_pages_in_range(int nid,
- unsigned long range_start_pfn,
- unsigned long range_end_pfn)
-{
- int i = 0;
- unsigned long prev_end_pfn = 0, hole_pages = 0;
- unsigned long start_pfn;
-
- /* Find the end_pfn of the first active range of pfns in the node */
- i = first_active_region_index_in_nid(nid);
- if (i == MAX_ACTIVE_REGIONS)
- return 0;
- prev_end_pfn = early_node_map[i].start_pfn;
-
- /* Find all holes for the zone within the node */
- for (; i != MAX_ACTIVE_REGIONS;
- i = next_active_region_index_in_nid(i, nid)) {
-
- /* No need to continue if prev_end_pfn is outside the zone */
- if (prev_end_pfn >= range_end_pfn)
- break;
-
- /* Make sure the end of the zone is not within the hole */
- start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
- prev_end_pfn = max(prev_end_pfn, range_start_pfn);
-
- /* Update the hole size cound and move on */
- if (start_pfn > range_start_pfn) {
- BUG_ON(prev_end_pfn > start_pfn);
- hole_pages += start_pfn - prev_end_pfn;
- }
- prev_end_pfn = early_node_map[i].end_pfn;
- }
-
- return hole_pages;
-}
-
-unsigned long __init absent_pages_in_range(unsigned long start_pfn,
- unsigned long end_pfn)
-{
- return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
-}
-
-unsigned long __init zone_absent_pages_in_node(int nid,
- unsigned long zone_type,
- unsigned long *ignored)
-{
- return __absent_pages_in_range(nid,
- arch_zone_lowest_possible_pfn[zone_type],
- arch_zone_highest_possible_pfn[zone_type]);
-}
-#else
-static inline unsigned long zone_present_pages_in_node(int nid,
- unsigned long zone_type,
- unsigned long *zones_size)
-{
- return zones_size[zone_type];
-}
-
-static inline unsigned long zone_absent_pages_in_node(int nid,
- unsigned long zone_type,
- unsigned long *zholes_size)
-{
- if (!zholes_size)
- return 0;
-
- return zholes_size[zone_type];
-}
-#endif
-
-static void __init calculate_node_totalpages(struct pglist_data *pgdat,
- unsigned long *zones_size, unsigned long *zholes_size)
-{
- unsigned long realtotalpages, totalpages = 0;
- int i;
-
- for (i = 0; i < MAX_NR_ZONES; i++) {
- totalpages += zone_present_pages_in_node(pgdat->node_id, i,
- zones_size);
- }
- pgdat->node_spanned_pages = totalpages;
-
- realtotalpages = totalpages;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- realtotalpages -=
- zone_absent_pages_in_node(pgdat->node_id, i, zholes_size);
- }
- pgdat->node_present_pages = realtotalpages;
- printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
- realtotalpages);
-}
-
-/*
- * Set up the zone data structures:
- * - mark all pages reserved
- * - mark all memory queues empty
- * - clear the memory bitmaps
- */
-static void __meminit free_area_init_core(struct pglist_data *pgdat,
- unsigned long *zones_size, unsigned long *zholes_size)
-{
- unsigned long j;
- int nid = pgdat->node_id;
- unsigned long zone_start_pfn = pgdat->node_start_pfn;
- int ret;
-
- pgdat_resize_init(pgdat);
- pgdat->nr_zones = 0;
- init_waitqueue_head(&pgdat->kswapd_wait);
- pgdat->kswapd_max_order = 0;
-
- for (j = 0; j < MAX_NR_ZONES; j++) {
- struct zone *zone = pgdat->node_zones + j;
- unsigned long size, realsize;
-
- size = zone_present_pages_in_node(nid, j, zones_size);
- realsize = size - zone_absent_pages_in_node(nid, j,
- zholes_size);
- if (j < ZONE_HIGHMEM)
- nr_kernel_pages += realsize;
- nr_all_pages += realsize;
-
- zone->spanned_pages = size;
- zone->present_pages = realsize;
- zone->name = zone_names[j];
- spin_lock_init(&zone->lock);
- spin_lock_init(&zone->lru_lock);
- zone_seqlock_init(zone);
- zone->zone_pgdat = pgdat;
- zone->free_pages = 0;
-
- zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
-
- zone_pcp_init(zone);
- INIT_LIST_HEAD(&zone->active_list);
- INIT_LIST_HEAD(&zone->inactive_list);
- zone->nr_scan_active = 0;
- zone->nr_scan_inactive = 0;
- zone->nr_active = 0;
- zone->nr_inactive = 0;
- atomic_set(&zone->reclaim_in_progress, 0);
- if (!size)
- continue;
-
- zonetable_add(zone, nid, j, zone_start_pfn, size);
- ret = init_currently_empty_zone(zone, zone_start_pfn, size);
- BUG_ON(ret);
- zone_start_pfn += size;
- }
-}
-
-static void __init alloc_node_mem_map(struct pglist_data *pgdat)
-{
- /* Skip empty nodes */
- if (!pgdat->node_spanned_pages)
- return;
-
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
- /* ia64 gets its own node_mem_map, before this, without bootmem */
- if (!pgdat->node_mem_map) {
- unsigned long size;
- struct page *map;
-
- size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
- map = alloc_remap(pgdat->node_id, size);
- if (!map)
- map = alloc_bootmem_node(pgdat, size);
- pgdat->node_mem_map = map;
- }
-#ifdef CONFIG_FLATMEM
- /*
- * With no DISCONTIG, the global mem_map is just set as node 0's
- */
- if (pgdat == NODE_DATA(0))
- mem_map = NODE_DATA(0)->node_mem_map;
-#endif
-#endif /* CONFIG_FLAT_NODE_MEM_MAP */
-}
-
-void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
- unsigned long *zones_size, unsigned long node_start_pfn,
- unsigned long *zholes_size)
-{
- pgdat->node_id = nid;
- pgdat->node_start_pfn = node_start_pfn;
- calculate_node_totalpages(pgdat, zones_size, zholes_size);
-
- alloc_node_mem_map(pgdat);
-
- free_area_init_core(pgdat, zones_size, zholes_size);
-}
-
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-void __init add_active_range(unsigned int nid, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- unsigned int i;
-
- /* Merge with existing active regions if possible */
- for (i = 0; early_node_map[i].end_pfn; i++) {
- if (early_node_map[i].nid != nid)
- continue;
-
- /* Skip if an existing region covers this new one */
- if (start_pfn >= early_node_map[i].start_pfn &&
- end_pfn <= early_node_map[i].end_pfn)
- return;
-
- /* Merge forward if suitable */
- if (start_pfn <= early_node_map[i].end_pfn &&
- end_pfn > early_node_map[i].end_pfn) {
- early_node_map[i].end_pfn = end_pfn;
- return;
- }
-
- /* Merge backward if suitable */
- if (start_pfn < early_node_map[i].end_pfn &&
- end_pfn >= early_node_map[i].start_pfn) {
- early_node_map[i].start_pfn = start_pfn;
- return;
- }
- }
-
- /* Leave last entry NULL, we use range.end_pfn to terminate the walk */
- if (i >= MAX_ACTIVE_REGIONS - 1) {
- printk(KERN_ERR "Too many memory regions, truncating\n");
- return;
- }
-
- early_node_map[i].nid = nid;
- early_node_map[i].start_pfn = start_pfn;
- early_node_map[i].end_pfn = end_pfn;
-}
-
-void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
- unsigned long new_end_pfn)
-{
- unsigned int i;
-
- /* Find the old active region end and shrink */
- for_each_active_range_index_in_nid(i, nid) {
- if (early_node_map[i].end_pfn == old_end_pfn) {
- early_node_map[i].end_pfn = new_end_pfn;
- break;
- }
- }
-}
-
-void __init remove_all_active_ranges()
-{
- memset(early_node_map, 0, sizeof(early_node_map));
-}
-
-/* Compare two active node_active_regions */
-static int __init cmp_node_active_region(const void *a, const void *b)
-{
- struct node_active_region *arange = (struct node_active_region *)a;
- struct node_active_region *brange = (struct node_active_region *)b;
-
- /* Done this way to avoid overflows */
- if (arange->start_pfn > brange->start_pfn)
- return 1;
- if (arange->start_pfn < brange->start_pfn)
- return -1;
-
- return 0;
-}
-
-/* sort the node_map by start_pfn */
-static void __init sort_node_map(void)
-{
- size_t num = 0;
- while (early_node_map[num].end_pfn)
- num++;
-
- sort(early_node_map, num, sizeof(struct node_active_region),
- cmp_node_active_region, NULL);
-}
-
-/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
-unsigned long __init find_min_pfn_for_node(unsigned long nid)
-{
- int i;
-
- /* Assuming a sorted map, the first range found has the starting pfn */
- for_each_active_range_index_in_nid(i, nid)
- return early_node_map[i].start_pfn;
-
- printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
- return 0;
-}
-
-unsigned long __init find_min_pfn_with_active_regions(void)
-{
- return find_min_pfn_for_node(MAX_NUMNODES);
-}
-
-unsigned long __init find_max_pfn_with_active_regions(void)
-{
- int i;
- unsigned long max_pfn = 0;
-
- for (i = 0; early_node_map[i].end_pfn; i++)
- max_pfn = max(max_pfn, early_node_map[i].end_pfn);
-
- return max_pfn;
-}
-
-void __init free_area_init_nodes(unsigned long arch_max_dma_pfn,
- unsigned long arch_max_dma32_pfn,
- unsigned long arch_max_low_pfn,
- unsigned long arch_max_high_pfn)
-{
- unsigned long nid;
- int zone_index;
-
- /* Record where the zone boundaries are */
- memset(arch_zone_lowest_possible_pfn, 0,
- sizeof(arch_zone_lowest_possible_pfn));
- memset(arch_zone_highest_possible_pfn, 0,
- sizeof(arch_zone_highest_possible_pfn));
- arch_zone_lowest_possible_pfn[ZONE_DMA] =
- find_min_pfn_with_active_regions();
- arch_zone_highest_possible_pfn[ZONE_DMA] = arch_max_dma_pfn;
- arch_zone_highest_possible_pfn[ZONE_DMA32] = arch_max_dma32_pfn;
- arch_zone_highest_possible_pfn[ZONE_NORMAL] = arch_max_low_pfn;
- arch_zone_highest_possible_pfn[ZONE_HIGHMEM] = arch_max_high_pfn;
- for (zone_index = 1; zone_index < MAX_NR_ZONES; zone_index++) {
- arch_zone_lowest_possible_pfn[zone_index] =
- arch_zone_highest_possible_pfn[zone_index-1];
- }
-
- /* Regions in the early_node_map can be in any order */
- sort_node_map();
-
- for_each_online_node(nid) {
- pg_data_t *pgdat = NODE_DATA(nid);
- free_area_init_node(nid, pgdat, NULL,
- find_min_pfn_for_node(nid), NULL);
- }
-}
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
-
#ifndef CONFIG_NEED_MULTIPLE_NODES
static bootmem_data_t contig_bootmem_data;
struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
@@ -3175,32 +2097,6 @@ int lowmem_reserve_ratio_sysctl_handler(
return 0;
}
-/*
- * percpu_pagelist_fraction - changes the pcp->high for each zone on each
- * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
- * can have before it gets flushed back to buddy allocator.
- */
-
-int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
-{
- struct zone *zone;
- unsigned int cpu;
- int ret;
-
- ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
- if (!write || (ret == -EINVAL))
- return ret;
- for_each_zone(zone) {
- for_each_online_cpu(cpu) {
- unsigned long high;
- high = zone->present_pages / percpu_pagelist_fraction;
- setup_pagelist_highmark(zone_pcp(zone, cpu), high);
- }
- }
- return 0;
-}
-
__initdata int hashdist = HASHDIST_DEFAULT;
#ifdef CONFIG_NUMA
^ permalink raw reply
* [PATCH 5/7] Have ia64 use add_active_range() and free_area_init_nodes
From: Mel Gorman @ 2006-05-01 13:37 UTC (permalink / raw)
To: akpm, davej, tony.luck, linux-mm, linux-kernel, bob.picco, ak,
linuxppc-dev
Cc: Mel Gorman
In-Reply-To: <20060501133530.6379.66000.sendpatchset@skynet>
Size zones and holes in an architecture-independent manner for ia64.
This has only been compile-tested due to lack of a suitable test machine.
arch/ia64/Kconfig | 3 ++
arch/ia64/mm/contig.c | 60 +++++-----------------------------------
arch/ia64/mm/discontig.c | 41 ++++-----------------------
arch/ia64/mm/init.c | 12 ++++++++
include/asm-ia64/meminit.h | 1
5 files changed, 30 insertions(+), 87 deletions(-)
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/Kconfig linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/Kconfig
--- linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/Kconfig 2006-05-01 11:36:54.000000000 +0100
+++ linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/Kconfig 2006-05-01 11:43:32.000000000 +0100
@@ -353,6 +353,9 @@ config NODES_SHIFT
MAX_NUMNODES will be 2^(This value).
If in doubt, use the default.
+config ARCH_POPULATES_NODE_MAP
+ def_bool y
+
# VIRTUAL_MEM_MAP and FLAT_NODE_MEM_MAP are functionally equivalent.
# VIRTUAL_MEM_MAP has been retained for historical reasons.
config VIRTUAL_MEM_MAP
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/mm/contig.c linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/mm/contig.c
--- linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/mm/contig.c 2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/mm/contig.c 2006-05-01 11:43:32.000000000 +0100
@@ -26,10 +26,6 @@
#include <asm/sections.h>
#include <asm/mca.h>
-#ifdef CONFIG_VIRTUAL_MEM_MAP
-static unsigned long num_dma_physpages;
-#endif
-
/**
* show_mem - display a memory statistics summary
*
@@ -212,18 +208,6 @@ count_pages (u64 start, u64 end, void *a
return 0;
}
-#ifdef CONFIG_VIRTUAL_MEM_MAP
-static int
-count_dma_pages (u64 start, u64 end, void *arg)
-{
- unsigned long *count = arg;
-
- if (start < MAX_DMA_ADDRESS)
- *count += (min(end, MAX_DMA_ADDRESS) - start) >> PAGE_SHIFT;
- return 0;
-}
-#endif
-
/*
* Set up the page tables.
*/
@@ -232,47 +216,24 @@ void __init
paging_init (void)
{
unsigned long max_dma;
- unsigned long zones_size[MAX_NR_ZONES];
#ifdef CONFIG_VIRTUAL_MEM_MAP
- unsigned long zholes_size[MAX_NR_ZONES];
+ unsigned long nid = 0;
unsigned long max_gap;
#endif
- /* initialize mem_map[] */
-
- memset(zones_size, 0, sizeof(zones_size));
-
num_physpages = 0;
efi_memmap_walk(count_pages, &num_physpages);
max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
#ifdef CONFIG_VIRTUAL_MEM_MAP
- memset(zholes_size, 0, sizeof(zholes_size));
-
- num_dma_physpages = 0;
- efi_memmap_walk(count_dma_pages, &num_dma_physpages);
-
- if (max_low_pfn < max_dma) {
- zones_size[ZONE_DMA] = max_low_pfn;
- zholes_size[ZONE_DMA] = max_low_pfn - num_dma_physpages;
- } else {
- zones_size[ZONE_DMA] = max_dma;
- zholes_size[ZONE_DMA] = max_dma - num_dma_physpages;
- if (num_physpages > num_dma_physpages) {
- zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
- zholes_size[ZONE_NORMAL] =
- ((max_low_pfn - max_dma) -
- (num_physpages - num_dma_physpages));
- }
- }
-
max_gap = 0;
+ efi_memmap_walk(register_active_ranges, &nid);
efi_memmap_walk(find_largest_hole, (u64 *)&max_gap);
if (max_gap < LARGE_GAP) {
vmem_map = (struct page *) 0;
- free_area_init_node(0, NODE_DATA(0), zones_size, 0,
- zholes_size);
+ free_area_init_nodes(max_dma, max_dma,
+ max_low_pfn, max_low_pfn);
} else {
unsigned long map_size;
@@ -284,19 +245,14 @@ paging_init (void)
efi_memmap_walk(create_mem_map_page_table, NULL);
NODE_DATA(0)->node_mem_map = vmem_map;
- free_area_init_node(0, NODE_DATA(0), zones_size,
- 0, zholes_size);
+ free_area_init_nodes(max_dma, max_dma,
+ max_low_pfn, max_low_pfn);
printk("Virtual mem_map starts at 0x%p\n", mem_map);
}
#else /* !CONFIG_VIRTUAL_MEM_MAP */
- if (max_low_pfn < max_dma)
- zones_size[ZONE_DMA] = max_low_pfn;
- else {
- zones_size[ZONE_DMA] = max_dma;
- zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
- }
- free_area_init(zones_size);
+ add_active_range(0, 0, max_low_pfn);
+ free_area_init_nodes(max_dma, max_dma, max_low_pfn, max_low_pfn);
#endif /* !CONFIG_VIRTUAL_MEM_MAP */
zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
}
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/mm/discontig.c linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/mm/discontig.c
--- linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/mm/discontig.c 2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/mm/discontig.c 2006-05-01 11:43:32.000000000 +0100
@@ -700,6 +700,7 @@ static __init int count_node_pages(unsig
{
unsigned long end = start + len;
+ add_active_range(node, start >> PAGE_SHIFT, end >> PAGE_SHIFT);
mem_data[node].num_physpages += len >> PAGE_SHIFT;
if (start <= __pa(MAX_DMA_ADDRESS))
mem_data[node].num_dma_physpages +=
@@ -724,9 +725,8 @@ static __init int count_node_pages(unsig
void __init paging_init(void)
{
unsigned long max_dma;
- unsigned long zones_size[MAX_NR_ZONES];
- unsigned long zholes_size[MAX_NR_ZONES];
unsigned long pfn_offset = 0;
+ unsigned long max_pfn = 0;
int node;
max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
@@ -743,46 +743,17 @@ void __init paging_init(void)
#endif
for_each_online_node(node) {
- memset(zones_size, 0, sizeof(zones_size));
- memset(zholes_size, 0, sizeof(zholes_size));
-
num_physpages += mem_data[node].num_physpages;
-
- if (mem_data[node].min_pfn >= max_dma) {
- /* All of this node's memory is above ZONE_DMA */
- zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
- mem_data[node].min_pfn;
- zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn -
- mem_data[node].min_pfn -
- mem_data[node].num_physpages;
- } else if (mem_data[node].max_pfn < max_dma) {
- /* All of this node's memory is in ZONE_DMA */
- zones_size[ZONE_DMA] = mem_data[node].max_pfn -
- mem_data[node].min_pfn;
- zholes_size[ZONE_DMA] = mem_data[node].max_pfn -
- mem_data[node].min_pfn -
- mem_data[node].num_dma_physpages;
- } else {
- /* This node has memory in both zones */
- zones_size[ZONE_DMA] = max_dma -
- mem_data[node].min_pfn;
- zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
- mem_data[node].num_dma_physpages;
- zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
- max_dma;
- zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] -
- (mem_data[node].num_physpages -
- mem_data[node].num_dma_physpages);
- }
-
pfn_offset = mem_data[node].min_pfn;
#ifdef CONFIG_VIRTUAL_MEM_MAP
NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
#endif
- free_area_init_node(node, NODE_DATA(node), zones_size,
- pfn_offset, zholes_size);
+ if (mem_data[node].max_pfn > max_pfn)
+ max_pfn = mem_data[node].max_pfn;
}
+ free_area_init_nodes(max_dma, max_dma, max_pfn, max_pfn);
+
zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
}
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/mm/init.c linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/mm/init.c
--- linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/mm/init.c 2006-05-01 11:36:54.000000000 +0100
+++ linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/mm/init.c 2006-05-01 11:43:32.000000000 +0100
@@ -539,6 +539,18 @@ find_largest_hole (u64 start, u64 end, v
last_end = end;
return 0;
}
+
+int __init
+register_active_ranges(u64 start, u64 end, void *nid)
+{
+ BUG_ON(nid == NULL);
+ BUG_ON(*(unsigned long *)nid >= MAX_NUMNODES);
+
+ add_active_range(*(unsigned long *)nid,
+ __pa(start) >> PAGE_SHIFT,
+ __pa(end) >> PAGE_SHIFT);
+ return 0;
+}
#endif /* CONFIG_VIRTUAL_MEM_MAP */
static int __init
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/include/asm-ia64/meminit.h linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/include/asm-ia64/meminit.h
--- linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/include/asm-ia64/meminit.h 2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/include/asm-ia64/meminit.h 2006-05-01 11:43:32.000000000 +0100
@@ -56,6 +56,7 @@ extern void efi_memmap_init(unsigned lon
extern unsigned long vmalloc_end;
extern struct page *vmem_map;
extern int find_largest_hole (u64 start, u64 end, void *arg);
+ extern int register_active_ranges (u64 start, u64 end, void *arg);
extern int create_mem_map_page_table (u64 start, u64 end, void *arg);
#endif
^ permalink raw reply
* [PATCH 4/7] Have x86_64 use add_active_range() and free_area_init_nodes
From: Mel Gorman @ 2006-05-01 13:36 UTC (permalink / raw)
To: akpm, davej, tony.luck, linuxppc-dev, linux-kernel, bob.picco, ak,
linux-mm
Cc: Mel Gorman
In-Reply-To: <20060501133530.6379.66000.sendpatchset@skynet>
Size zones and holes in an architecture-independent manner for x86_64.
This has only been boot-tested on an x86_64 with NUMA and SRAT.
arch/x86_64/Kconfig | 3 +
arch/x86_64/kernel/e820.c | 109 ++++++++++-----------------------------
arch/x86_64/kernel/setup.c | 7 ++
arch/x86_64/mm/init.c | 62 +---------------------
arch/x86_64/mm/k8topology.c | 3 +
arch/x86_64/mm/numa.c | 18 +++---
arch/x86_64/mm/srat.c | 11 ++-
include/asm-x86_64/e820.h | 5 -
include/asm-x86_64/proto.h | 2
9 files changed, 63 insertions(+), 157 deletions(-)
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/Kconfig linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/Kconfig
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/Kconfig 2006-05-01 11:36:58.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/Kconfig 2006-05-01 11:42:26.000000000 +0100
@@ -73,6 +73,9 @@ config ARCH_MAY_HAVE_PC_FDC
bool
default y
+config ARCH_POPULATES_NODE_MAP
+ def_bool y
+
config DMI
bool
default y
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/kernel/e820.c linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/kernel/e820.c
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/kernel/e820.c 2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/kernel/e820.c 2006-05-01 11:42:26.000000000 +0100
@@ -18,6 +18,7 @@
#include <linux/string.h>
#include <linux/kexec.h>
#include <linux/module.h>
+#include <linux/mm.h>
#include <asm/page.h>
#include <asm/e820.h>
@@ -155,58 +156,14 @@ unsigned long __init find_e820_area(unsi
return -1UL;
}
-/*
- * Free bootmem based on the e820 table for a node.
- */
-void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
-{
- int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long last, addr;
-
- if (ei->type != E820_RAM ||
- ei->addr+ei->size <= start ||
- ei->addr >= end)
- continue;
-
- addr = round_up(ei->addr, PAGE_SIZE);
- if (addr < start)
- addr = start;
-
- last = round_down(ei->addr + ei->size, PAGE_SIZE);
- if (last >= end)
- last = end;
-
- if (last > addr && last-addr >= PAGE_SIZE)
- free_bootmem_node(pgdat, addr, last-addr);
- }
-}
-
/*
* Find the highest page frame number we have available
*/
unsigned long __init e820_end_of_ram(void)
{
- int i;
unsigned long end_pfn = 0;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long start, end;
-
- start = round_up(ei->addr, PAGE_SIZE);
- end = round_down(ei->addr + ei->size, PAGE_SIZE);
- if (start >= end)
- continue;
- if (ei->type == E820_RAM) {
- if (end > end_pfn<<PAGE_SHIFT)
- end_pfn = end>>PAGE_SHIFT;
- } else {
- if (end > end_pfn_map<<PAGE_SHIFT)
- end_pfn_map = end>>PAGE_SHIFT;
- }
- }
+ end_pfn = find_max_pfn_with_active_regions();
if (end_pfn > end_pfn_map)
end_pfn_map = end_pfn;
@@ -220,40 +177,6 @@ unsigned long __init e820_end_of_ram(voi
return end_pfn;
}
-/*
- * Compute how much memory is missing in a range.
- * Unlike the other functions in this file the arguments are in page numbers.
- */
-unsigned long __init
-e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned long ram = 0;
- unsigned long start = start_pfn << PAGE_SHIFT;
- unsigned long end = end_pfn << PAGE_SHIFT;
- int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long last, addr;
-
- if (ei->type != E820_RAM ||
- ei->addr+ei->size <= start ||
- ei->addr >= end)
- continue;
-
- addr = round_up(ei->addr, PAGE_SIZE);
- if (addr < start)
- addr = start;
-
- last = round_down(ei->addr + ei->size, PAGE_SIZE);
- if (last >= end)
- last = end;
-
- if (last > addr)
- ram += last - addr;
- }
- return ((end - start) - ram) >> PAGE_SHIFT;
-}
-
/*
* Mark e820 reserved areas as busy for the resource manager.
*/
@@ -288,6 +211,34 @@ void __init e820_reserve_resources(void)
}
}
+/* Walk the e820 map and register active regions within a node */
+void __init
+e820_register_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ int i;
+ unsigned long ei_startpfn, ei_endpfn;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
+ ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE)
+ >> PAGE_SHIFT;
+ /* Skip if map is outside the node */
+ if (ei->type != E820_RAM ||
+ ei_endpfn <= start_pfn ||
+ ei_startpfn >= end_pfn)
+ continue;
+
+ /* Check for overlaps */
+ if (ei_startpfn < start_pfn)
+ ei_startpfn = start_pfn;
+ if (ei_endpfn > end_pfn)
+ ei_endpfn = end_pfn;
+
+ add_active_range(nid, ei_startpfn, ei_endpfn);
+ }
+}
+
/*
* Add a memory region to the kernel e820 map.
*/
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/kernel/setup.c linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/kernel/setup.c
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/kernel/setup.c 2006-05-01 11:36:58.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/kernel/setup.c 2006-05-01 11:42:26.000000000 +0100
@@ -475,7 +475,8 @@ contig_initmem_init(unsigned long start_
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n",bootmap_size);
bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
- e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
+ e820_register_active_regions(0, start_pfn, end_pfn);
+ free_bootmem_with_active_regions(0, end_pfn);
reserve_bootmem(bootmap, bootmap_size);
}
#endif
@@ -645,6 +646,7 @@ void __init setup_arch(char **cmdline_p)
early_identify_cpu(&boot_cpu_data);
+ e820_register_active_regions(0, 0, -1UL);
/*
* partially used pages are not usable - thus
* we are rounding upwards:
@@ -668,6 +670,9 @@ void __init setup_arch(char **cmdline_p)
acpi_boot_table_init();
#endif
+ /* Remove active ranges so rediscovery with NUMA-awareness happens */
+ remove_all_active_ranges();
+
#ifdef CONFIG_ACPI_NUMA
/*
* Parse SRAT to discover nodes.
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/init.c linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/init.c
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/init.c 2006-05-01 11:36:58.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/init.c 2006-05-01 11:42:26.000000000 +0100
@@ -406,69 +406,12 @@ void __cpuinit zap_low_mappings(int cpu)
__flush_tlb_all();
}
-/* Compute zone sizes for the DMA and DMA32 zones in a node. */
-__init void
-size_zones(unsigned long *z, unsigned long *h,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- int i;
- unsigned long w;
-
- for (i = 0; i < MAX_NR_ZONES; i++)
- z[i] = 0;
-
- if (start_pfn < MAX_DMA_PFN)
- z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
- if (start_pfn < MAX_DMA32_PFN) {
- unsigned long dma32_pfn = MAX_DMA32_PFN;
- if (dma32_pfn > end_pfn)
- dma32_pfn = end_pfn;
- z[ZONE_DMA32] = dma32_pfn - start_pfn;
- }
- z[ZONE_NORMAL] = end_pfn - start_pfn;
-
- /* Remove lower zones from higher ones. */
- w = 0;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- if (z[i])
- z[i] -= w;
- w += z[i];
- }
-
- /* Compute holes */
- w = start_pfn;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- unsigned long s = w;
- w += z[i];
- h[i] = e820_hole_size(s, w);
- }
-
- /* Add the space pace needed for mem_map to the holes too. */
- for (i = 0; i < MAX_NR_ZONES; i++)
- h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
-
- /* The 16MB DMA zone has the kernel and other misc mappings.
- Account them too */
- if (h[ZONE_DMA]) {
- h[ZONE_DMA] += dma_reserve;
- if (h[ZONE_DMA] >= z[ZONE_DMA]) {
- printk(KERN_WARNING
- "Kernel too large and filling up ZONE_DMA?\n");
- h[ZONE_DMA] = z[ZONE_DMA];
- }
- }
-}
-
#ifndef CONFIG_NUMA
void __init paging_init(void)
{
- unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
-
memory_present(0, 0, end_pfn);
sparse_init();
- size_zones(zones, holes, 0, end_pfn);
- free_area_init_node(0, NODE_DATA(0), zones,
- __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
+ free_area_init_nodes(MAX_DMA_PFN, MAX_DMA32_PFN, end_pfn, end_pfn);
}
#endif
@@ -620,7 +563,8 @@ void __init mem_init(void)
#else
totalram_pages = free_all_bootmem();
#endif
- reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
+ reservedpages = end_pfn - totalram_pages -
+ absent_pages_in_range(0, end_pfn);
after_bootmem = 1;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/k8topology.c linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/k8topology.c
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/k8topology.c 2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/k8topology.c 2006-05-01 11:42:26.000000000 +0100
@@ -146,6 +146,9 @@ int __init k8_scan_nodes(unsigned long s
nodes[nodeid].start = base;
nodes[nodeid].end = limit;
+ e820_register_active_regions(nodeid,
+ nodes[nodeid].start >> PAGE_SHIFT,
+ nodes[nodeid].end >> PAGE_SHIFT);
prevbase = base;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/numa.c linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/numa.c
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/numa.c 2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/numa.c 2006-05-01 11:42:26.000000000 +0100
@@ -161,7 +161,7 @@ void __init setup_node_bootmem(int nodei
bootmap_start >> PAGE_SHIFT,
start_pfn, end_pfn);
- e820_bootmem_free(NODE_DATA(nodeid), start, end);
+ free_bootmem_with_active_regions(nodeid, end);
reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
@@ -175,13 +175,11 @@ void __init setup_node_bootmem(int nodei
void __init setup_node_zones(int nodeid)
{
unsigned long start_pfn, end_pfn, memmapsize, limit;
- unsigned long zones[MAX_NR_ZONES];
- unsigned long holes[MAX_NR_ZONES];
start_pfn = node_start_pfn(nodeid);
end_pfn = node_end_pfn(nodeid);
- Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
+ Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
nodeid, start_pfn, end_pfn);
/* Try to allocate mem_map at end to not fill up precious <4GB
@@ -195,10 +193,6 @@ void __init setup_node_zones(int nodeid)
round_down(limit - memmapsize, PAGE_SIZE),
limit);
#endif
-
- size_zones(zones, holes, start_pfn, end_pfn);
- free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
- start_pfn, holes);
}
void __init numa_init_array(void)
@@ -259,8 +253,11 @@ static int numa_emulation(unsigned long
printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
return -1;
}
- for_each_online_node(i)
+ for_each_online_node(i) {
+ e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
+ nodes[i].end >> PAGE_SHIFT);
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+ }
numa_init_array();
return 0;
}
@@ -299,6 +296,7 @@ void __init numa_initmem_init(unsigned l
for (i = 0; i < NR_CPUS; i++)
numa_set_node(i, 0);
node_to_cpumask[0] = cpumask_of_cpu(0);
+ e820_register_active_regions(0, start_pfn, end_pfn);
setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
@@ -346,6 +344,8 @@ void __init paging_init(void)
for_each_online_node(i) {
setup_node_zones(i);
}
+
+ free_area_init_nodes(MAX_DMA_PFN, MAX_DMA32_PFN, end_pfn, end_pfn);
}
/* [numa=off] */
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/srat.c linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/srat.c
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/srat.c 2006-05-01 11:36:58.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/srat.c 2006-05-01 11:42:26.000000000 +0100
@@ -87,6 +87,7 @@ static __init void bad_srat(void)
apicid_to_node[i] = NUMA_NO_NODE;
for (i = 0; i < MAX_NUMNODES; i++)
nodes_add[i].start = nodes[i].end = 0;
+ remove_all_active_ranges();
}
static __init inline int srat_disabled(void)
@@ -168,7 +169,7 @@ static int hotadd_enough_memory(struct b
if (mem < 0)
return 0;
- allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
+ allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
allowed = (allowed / 100) * hotadd_percent;
if (allocated + mem > allowed) {
/* Give them at least part of their hotadd memory upto hotadd_percent
@@ -216,7 +217,7 @@ static int reserve_hotadd(int node, unsi
}
/* This check might be a bit too strict, but I'm keeping it for now. */
- if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
+ if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
return -1;
}
@@ -310,6 +311,8 @@ acpi_numa_memory_affinity_init(struct ac
printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
nd->start, nd->end);
+ e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
+ nd->end >> PAGE_SHIFT);
#ifdef RESERVE_HOTADD
if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
@@ -334,13 +337,13 @@ static int nodes_cover_memory(void)
unsigned long s = nodes[i].start >> PAGE_SHIFT;
unsigned long e = nodes[i].end >> PAGE_SHIFT;
pxmram += e - s;
- pxmram -= e820_hole_size(s, e);
+ pxmram -= absent_pages_in_range(s, e);
pxmram -= nodes_add[i].end - nodes_add[i].start;
if ((long)pxmram < 0)
pxmram = 0;
}
- e820ram = end_pfn - e820_hole_size(0, end_pfn);
+ e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
if ((long)(e820ram - pxmram) >= 1*1024*1024) {
printk(KERN_ERR
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/include/asm-x86_64/e820.h linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/include/asm-x86_64/e820.h
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/include/asm-x86_64/e820.h 2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/include/asm-x86_64/e820.h 2006-05-01 11:42:26.000000000 +0100
@@ -50,10 +50,9 @@ extern void e820_print_map(char *who);
extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
-extern void e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end);
extern void e820_setup_gap(void);
-extern unsigned long e820_hole_size(unsigned long start_pfn,
- unsigned long end_pfn);
+extern void e820_register_active_regions(int nid,
+ unsigned long start_pfn, unsigned long end_pfn);
extern void __init parse_memopt(char *p, char **end);
extern void __init parse_memmapopt(char *p, char **end);
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/include/asm-x86_64/proto.h linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/include/asm-x86_64/proto.h
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/include/asm-x86_64/proto.h 2006-05-01 11:37:01.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/include/asm-x86_64/proto.h 2006-05-01 11:42:26.000000000 +0100
@@ -24,8 +24,6 @@ extern void mtrr_bp_init(void);
#define mtrr_bp_init() do {} while (0)
#endif
extern void init_memory_mapping(unsigned long start, unsigned long end);
-extern void size_zones(unsigned long *z, unsigned long *h,
- unsigned long start_pfn, unsigned long end_pfn);
extern void system_call(void);
extern int kernel_syscall(void);
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox