Linux Power Management development
 help / color / mirror / Atom feed
* Re: [PATCH 3/9] cgroup: implement generic child / descendant walk macros
From: Tejun Heo @ 2012-11-06 20:31 UTC (permalink / raw)
  To: lizefan, mhocko, rjw
  Cc: containers, cgroups, linux-kernel, linux-pm, fweisbec
In-Reply-To: <1351931915-1701-4-git-send-email-tj@kernel.org>

On Sat, Nov 03, 2012 at 01:38:29AM -0700, Tejun Heo wrote:
> Currently, cgroup doesn't provide any generic helper for walking a
> given cgroup's children or descendants.  This patch adds the following
> three macros.
> 
> * cgroup_for_each_child() - walk immediate children of a cgroup.
> 
> * cgroup_for_each_descendant_pre() - visit all descendants of a cgroup
>   in pre-order tree traversal.
> 
> * cgroup_for_each_descendant_post() - visit all descendants of a
>   cgroup in post-order tree traversal.
> 
> All three only require the user to hold RCU read lock during
> traversal.  Verifying that each iterated cgroup is online is the
> responsibility of the user.  When used with proper synchronization,
> cgroup_for_each_descendant_pre() can be used to propagate config
> updates to descendants in a reliable way.  See comments for details.

Michal, Li, how does this look to you?  Would this be okay for memcg
too?  Li, do you think the comment on cgroup_for_each_descendant_pre()
is correct?

Thanks.

-- 
tejun

^ permalink raw reply

* [PATCH 1/6 v3] arm: use devicetree to get smp_twd clock
From: Mark Langsdorf @ 2012-11-06 20:18 UTC (permalink / raw)
  To: linux-kernel, linux-pm, cpufreq, mark.langsdorf; +Cc: Rob Herring
In-Reply-To: <1352233089-22586-1-git-send-email-mark.langsdorf@calxeda.com>

From: Rob Herring <rob.herring@calxeda.com>

Signed-off-by: Rob Herring <rob.herring@calxeda.com>
Signed-off-by: Mark Langsdorf <mark.langsdorf@calxeda.com>

---
Changes from v2
	Turned the check for the node pointer into an if-then-else statement.
	Removed the second, redundant clk_get_rate
Changes from v1
        None.

 arch/arm/kernel/smp_twd.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c
index b22d700..b1fb6e1 100644
--- a/arch/arm/kernel/smp_twd.c
+++ b/arch/arm/kernel/smp_twd.c
@@ -237,12 +237,15 @@ static irqreturn_t twd_handler(int irq, void *dev_id)
 	return IRQ_NONE;
 }
 
-static struct clk *twd_get_clock(void)
+static struct clk *twd_get_clock(struct device_node *np)
 {
-	struct clk *clk;
+	struct clk *clk = NULL;
 	int err;
 
-	clk = clk_get_sys("smp_twd", NULL);
+	if (np)
+		clk = of_clk_get(np, 0);
+	else
+		clk = clk_get_sys("smp_twd", NULL);
 	if (IS_ERR(clk)) {
 		pr_err("smp_twd: clock not found: %d\n", (int)PTR_ERR(clk));
 		return clk;
@@ -263,6 +266,7 @@ static struct clk *twd_get_clock(void)
 		return ERR_PTR(err);
 	}
 
+	twd_timer_rate = clk_get_rate(clk);
 	return clk;
 }
 
@@ -273,12 +277,7 @@ static int __cpuinit twd_timer_setup(struct clock_event_device *clk)
 {
 	struct clock_event_device **this_cpu_clk;
 
-	if (!twd_clk)
-		twd_clk = twd_get_clock();
-
-	if (!IS_ERR_OR_NULL(twd_clk))
-		twd_timer_rate = clk_get_rate(twd_clk);
-	else
+	if (IS_ERR_OR_NULL(twd_clk))
 		twd_calibrate_rate();
 
 	__raw_writel(0, twd_base + TWD_TIMER_CONTROL);
@@ -349,6 +348,8 @@ int __init twd_local_timer_register(struct twd_local_timer *tlt)
 	if (!twd_base)
 		return -ENOMEM;
 
+	twd_clk = twd_get_clock(NULL);
+
 	return twd_local_timer_common_register();
 }
 
@@ -383,6 +384,8 @@ void __init twd_local_timer_of_register(void)
 		goto out;
 	}
 
+	twd_clk = twd_get_clock(np);
+
 	err = twd_local_timer_common_register();
 
 out:
-- 
1.7.11.7


^ permalink raw reply related

* [PATCH 0/6 v3] cpufreq: add support for Calxeda ECX-1000 (highbank)
From: Mark Langsdorf @ 2012-11-06 20:18 UTC (permalink / raw)
  To: linux-kernel, linux-pm, cpufreq, mark.langsdorf
In-Reply-To: <1351631056-25938-1-git-send-email-mark.langsdorf@calxeda.com>

This patch series adds cpufreq support for the Calxeda
ECX-1000 (highbank) SoCs. The driver is based on the 
cpufreq-cpu0 driver. Because of the unique way that 
highbank uses the EnergyCore Management Engine to manage
voltages, it was not possible to use the cpufreq-cpu0 driver.

--Mark Langsdorf



^ permalink raw reply

* [PATCH 6/6 v3] cpufreq, highbank: add support for highbank cpufreq
From: Mark Langsdorf @ 2012-11-06 20:18 UTC (permalink / raw)
  To: linux-kernel, linux-pm, cpufreq, mark.langsdorf
  Cc: devicetree-discuss, Rafael J. Wysocki
In-Reply-To: <1352233089-22586-1-git-send-email-mark.langsdorf@calxeda.com>

Highbank processors depend on the external ECME to perform voltage
management based on a requested frequency. Communication between the
highbank and ECME cores happens over the pl320 IPC channel.

Signed-off-by: Mark Langsdorf <mark.langsdorf@calxeda.com>
Cc: devicetree-discuss@lists.ozlabs.org
Cc: Rafael J. Wysocki <rjw@sisk.pl>

---
Changes from v2:
	Changed transition latency binding in code to match documentation
Changes from v1:
        Added highbank specific Kconfig changes

 .../bindings/cpufreq/highbank-cpufreq.txt          |  53 +++++
 arch/arm/Kconfig                                   |   2 +
 arch/arm/boot/dts/highbank.dts                     |  10 +
 drivers/cpufreq/Kconfig.arm                        |  15 ++
 drivers/cpufreq/Makefile                           |   1 +
 drivers/cpufreq/highbank-cpufreq.c                 | 229 +++++++++++++++++++++
 6 files changed, 310 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/cpufreq/highbank-cpufreq.txt
 create mode 100644 drivers/cpufreq/highbank-cpufreq.c

diff --git a/Documentation/devicetree/bindings/cpufreq/highbank-cpufreq.txt b/Documentation/devicetree/bindings/cpufreq/highbank-cpufreq.txt
new file mode 100644
index 0000000..3ec2cec
--- /dev/null
+++ b/Documentation/devicetree/bindings/cpufreq/highbank-cpufreq.txt
@@ -0,0 +1,53 @@
+Highbank cpufreq driver
+
+This is the cpufreq driver for the Calxeda ECX-1000 (highbank) processor. It
+is based on the generic cpu0 driver and uses a similar format for bindings.
+Since the EnergyCore Management Engine maintains the voltage based on the
+frequency, the voltage component of the operating points can be set to any
+arbitrary values.
+
+Both required properties listed below must be defined under node /cpus/cpu@0.
+
+Required properties:
+- operating-points: Refer to Documentation/devicetree/bindings/power/opp.txt
+  for details
+- clock-latency: Specify the maximum possible transition latency for the clock,
+  in nanoseconds.
+
+Examples:
+
+cpus {
+	#address-cells = <1>;
+	#size-cells = <0>;
+
+	cpu@0 {
+		compatible = "arm,cortex-a9";
+		reg = <0>;
+		next-level-cache = <&L2>;
+		operating-points = <
+			/* kHz  ignored */
+			790000  1000000
+			396000  1000000
+			198000  1000000
+		>;
+		clock-latency = <200000>;
+	};
+
+	cpu@1 {
+		compatible = "arm,cortex-a9";
+		reg = <1>;
+		next-level-cache = <&L2>;
+	};
+
+	cpu@2 {
+		compatible = "arm,cortex-a9";
+		reg = <2>;
+		next-level-cache = <&L2>;
+	};
+
+	cpu@3 {
+		compatible = "arm,cortex-a9";
+		reg = <3>;
+		next-level-cache = <&L2>;
+	};
+};
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index ade7e92..4ed0b7b 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -391,6 +391,8 @@ config ARCH_SIRF
 	select PINCTRL
 	select PINCTRL_SIRF
 	select USE_OF
+	select ARCH_HAS_CPUFREQ
+	select ARCH_HAS_OPP
 	help
 	  Support for CSR SiRFprimaII/Marco/Polo platforms
 
diff --git a/arch/arm/boot/dts/highbank.dts b/arch/arm/boot/dts/highbank.dts
index 0c6fc34..7c4c27d 100644
--- a/arch/arm/boot/dts/highbank.dts
+++ b/arch/arm/boot/dts/highbank.dts
@@ -36,6 +36,16 @@
 			next-level-cache = <&L2>;
 			clocks = <&a9pll>;
 			clock-names = "cpu";
+			operating-points = <
+				/* kHz    ignored */
+				 1300000  1000000
+				 1200000  1000000
+				 1100000  1000000
+				  800000  1000000
+				  400000  1000000
+				  200000  1000000
+			>;
+			clock-latency = <100000>;
 		};
 
 		cpu@1 {
diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index 5961e64..bc3ef55 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -76,3 +76,18 @@ config ARM_EXYNOS5250_CPUFREQ
 	help
 	  This adds the CPUFreq driver for Samsung EXYNOS5250
 	  SoC.
+
+config ARM_HIGHBANK_CPUFREQ
+       tristate "Calxeda Highbank-based"
+       depends on ARCH_HIGHBANK
+       select CPU_FREQ_TABLE
+       select HAVE_CLK
+       select PM_OPP
+       select OF
+       default m
+       help
+         This adds the CPUFreq driver for Calxeda Highbank SoC
+         based boards.
+
+         If in doubt, say N.
+
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 1bc90e1..9e8f12a 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_ARM_EXYNOS4210_CPUFREQ)	+= exynos4210-cpufreq.o
 obj-$(CONFIG_ARM_EXYNOS4X12_CPUFREQ)	+= exynos4x12-cpufreq.o
 obj-$(CONFIG_ARM_EXYNOS5250_CPUFREQ)	+= exynos5250-cpufreq.o
 obj-$(CONFIG_ARM_OMAP2PLUS_CPUFREQ)     += omap-cpufreq.o
+obj-$(CONFIG_ARM_HIGHBANK_CPUFREQ)	+= highbank-cpufreq.o
 
 ##################################################################################
 # PowerPC platform drivers
diff --git a/drivers/cpufreq/highbank-cpufreq.c b/drivers/cpufreq/highbank-cpufreq.c
new file mode 100644
index 0000000..a167073
--- /dev/null
+++ b/drivers/cpufreq/highbank-cpufreq.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2012 Calxeda, Inc.
+ *
+ * derived from cpufreq-cpu0 by Freescale Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/clk.h>
+#include <linux/cpu.h>
+#include <linux/cpufreq.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/opp.h>
+#include <linux/slab.h>
+#include <asm/pl320-ipc.h>
+
+#define HB_CPUFREQ_CHANGE_NOTE 0x80000001
+
+static unsigned int transition_latency;
+
+static struct device *cpu_dev;
+static struct clk *cpu_clk;
+static struct cpufreq_frequency_table *freq_table;
+
+static int hb_verify_speed(struct cpufreq_policy *policy)
+{
+	return cpufreq_frequency_table_verify(policy, freq_table);
+}
+
+static unsigned int hb_get_speed(unsigned int cpu)
+{
+	return clk_get_rate(cpu_clk) / 1000;
+}
+
+static int hb_voltage_change(unsigned int freq)
+{
+	int i;
+	u32 msg[7];
+
+	msg[0] = HB_CPUFREQ_CHANGE_NOTE;
+	msg[1] = freq / 1000;
+	for (i = 2; i < 7; i++)
+		msg[i] = 0;
+
+	return ipc_call_slow(msg);
+}
+
+static int hb_set_target(struct cpufreq_policy *policy,
+			   unsigned int target_freq, unsigned int relation)
+{
+	struct cpufreq_freqs freqs;
+	unsigned long freq_Hz;
+	unsigned int index, cpu;
+	int ret;
+
+	ret = cpufreq_frequency_table_target(policy, freq_table, target_freq,
+					     relation, &index);
+	if (ret) {
+		pr_err("failed to match target freqency %d: %d\n",
+		       target_freq, ret);
+		return ret;
+	}
+
+	freq_Hz = clk_round_rate(cpu_clk, freq_table[index].frequency * 1000);
+	if (freq_Hz < 0)
+		freq_Hz = freq_table[index].frequency * 1000;
+	freqs.new = freq_Hz / 1000;
+	freqs.old = clk_get_rate(cpu_clk) / 1000;
+
+	if (freqs.old == freqs.new)
+		return 0;
+
+	for_each_online_cpu(cpu) {
+		freqs.cpu = cpu;
+		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+	}
+
+	pr_debug("%u MHz --> %u MHz\n", freqs.old / 1000, freqs.new / 1000);
+
+	/* scaling up?  scale voltage before frequency */
+	if (freqs.new > freqs.old) {
+		ret = hb_voltage_change(freqs.new);
+		if (ret) {
+			freqs.new = freqs.old;
+			return -EAGAIN;
+		}
+	}
+
+	ret = clk_set_rate(cpu_clk, freqs.new * 1000);
+	if (ret) {
+		pr_err("failed to set clock rate: %d\n", ret);
+		hb_voltage_change(freqs.old);
+		return ret;
+	}
+
+	/* scaling down?  scale voltage after frequency */
+	if (freqs.new < freqs.old) {
+		ret = hb_voltage_change(freqs.new);
+		if (ret) {
+			if (clk_set_rate(cpu_clk, freqs.old * 1000))
+				pr_err("also failed to reset freq\n");
+			freqs.new = freqs.old;
+			return -EAGAIN;
+		}
+	}
+
+	for_each_online_cpu(cpu) {
+		freqs.cpu = cpu;
+		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+	}
+
+	return 0;
+}
+
+static int hb_cpufreq_init(struct cpufreq_policy *policy)
+{
+	int ret;
+
+	if (policy->cpu != 0)
+		return -EINVAL;
+
+	ret = cpufreq_frequency_table_cpuinfo(policy, freq_table);
+	if (ret) {
+		pr_err("invalid frequency table: %d\n", ret);
+		return ret;
+	}
+
+	policy->cpuinfo.transition_latency = transition_latency;
+	policy->cur = clk_get_rate(cpu_clk) / 1000;
+
+	policy->shared_type = CPUFREQ_SHARED_TYPE_ANY;
+	cpumask_setall(policy->cpus);
+
+	cpufreq_frequency_table_get_attr(freq_table, policy->cpu);
+
+	return 0;
+}
+
+static int hb_cpufreq_exit(struct cpufreq_policy *policy)
+{
+	cpufreq_frequency_table_put_attr(policy->cpu);
+
+	return 0;
+}
+
+static struct freq_attr *hb_cpufreq_attr[] = {
+	&cpufreq_freq_attr_scaling_available_freqs,
+	NULL,
+};
+
+static struct cpufreq_driver hb_cpufreq_driver = {
+	.flags = CPUFREQ_STICKY,
+	.verify = hb_verify_speed,
+	.target = hb_set_target,
+	.get = hb_get_speed,
+	.init = hb_cpufreq_init,
+	.exit = hb_cpufreq_exit,
+	.name = "highbank-cpufreq",
+	.attr = hb_cpufreq_attr,
+};
+
+static int __devinit hb_cpufreq_driver_init(void)
+{
+	struct device_node *np;
+	int ret;
+
+	np = of_find_node_by_path("/cpus/cpu@0");
+	if (!np) {
+		pr_err("failed to find highbank cpufreq node\n");
+		return -ENOENT;
+	}
+
+	cpu_dev = get_cpu_device(0);
+	if (!cpu_dev) {
+		pr_err("failed to get highbank cpufreq device\n");
+		ret = -ENODEV;
+		goto out_put_node;
+	}
+
+	cpu_dev->of_node = np;
+
+	cpu_clk = clk_get(cpu_dev, NULL);
+	if (IS_ERR(cpu_clk)) {
+		ret = PTR_ERR(cpu_clk);
+		pr_err("failed to get cpu0 clock: %d\n", ret);
+		goto out_put_node;
+	}
+
+	ret = of_init_opp_table(cpu_dev);
+	if (ret) {
+		pr_err("failed to init OPP table: %d\n", ret);
+		goto out_put_node;
+	}
+
+	ret = opp_init_cpufreq_table(cpu_dev, &freq_table);
+	if (ret) {
+		pr_err("failed to init cpufreq table: %d\n", ret);
+		goto out_put_node;
+	}
+
+	if (of_property_read_u32(np, "transition-latency", &transition_latency))
+		transition_latency = CPUFREQ_ETERNAL;
+
+	ret = cpufreq_register_driver(&hb_cpufreq_driver);
+	if (ret) {
+		pr_err("failed register driver: %d\n", ret);
+		goto out_free_table;
+	}
+
+	of_node_put(np);
+	return 0;
+
+out_free_table:
+	opp_free_cpufreq_table(cpu_dev, &freq_table);
+out_put_node:
+	of_node_put(np);
+	return ret;
+}
+late_initcall(hb_cpufreq_driver_init);
+
+MODULE_AUTHOR("Mark Langsdorf <mark.langsdorf@calxeda.com>");
+MODULE_DESCRIPTION("Calxeda Highbank cpufreq driver");
+MODULE_LICENSE("GPL");
-- 
1.7.11.7

^ permalink raw reply related

* [PATCH 5/6 v3] power: export opp cpufreq functions
From: Mark Langsdorf @ 2012-11-06 20:18 UTC (permalink / raw)
  To: linux-kernel, linux-pm, cpufreq, mark.langsdorf
In-Reply-To: <1352233089-22586-1-git-send-email-mark.langsdorf@calxeda.com>

These functions are needed to make the cpufreq-cpu0 and highbank-cpufreq
drivers loadable as modules.

Signed-off-by: Mark Langsdorf <mark.langsdorf@calxeda.com>
Acked-by: Nishanth Menon <nm@ti.com>

---
Changes from v2:
	None.
Changes from v1:
        Added Nishanth Menon's ack.
        Clarified the purpose of the change in the commit message.

 drivers/base/power/opp.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c
index d946864..37dc5f4 100644
--- a/drivers/base/power/opp.c
+++ b/drivers/base/power/opp.c
@@ -23,6 +23,7 @@
 #include <linux/rcupdate.h>
 #include <linux/opp.h>
 #include <linux/of.h>
+#include <linux/module.h>
 
 /*
  * Internal data structure organization with the OPP layer library is as
@@ -643,6 +644,7 @@ int opp_init_cpufreq_table(struct device *dev,
 
 	return 0;
 }
+EXPORT_SYMBOL(opp_init_cpufreq_table);
 
 /**
  * opp_free_cpufreq_table() - free the cpufreq table
@@ -660,6 +662,7 @@ void opp_free_cpufreq_table(struct device *dev,
 	kfree(*table);
 	*table = NULL;
 }
+EXPORT_SYMBOL(opp_free_cpufreq_table);
 #endif		/* CONFIG_CPU_FREQ */
 
 /**
@@ -720,4 +723,5 @@ int of_init_opp_table(struct device *dev)
 
 	return 0;
 }
+EXPORT_SYMBOL(of_init_opp_table);
 #endif
-- 
1.7.11.7


^ permalink raw reply related

* [PATCH 4/6 v3] arm highbank: add support for pl320 IPC
From: Mark Langsdorf @ 2012-11-06 20:18 UTC (permalink / raw)
  To: linux-kernel, linux-pm, cpufreq, mark.langsdorf; +Cc: Rob Herring
In-Reply-To: <1352233089-22586-1-git-send-email-mark.langsdorf@calxeda.com>

From: Rob Herring <rob.herring@calxeda.com>

The pl320 IPC allows for interprocessor communication between the highbank A9
and the EnergyCore Management Engine. The pl320 implements a straightforward
mailbox protocol.

Signed-off-by: Mark Langsdorf <mark.langsdorf@calxeda.com>
Signed-off-by: Rob Herring <rob.herring@calxeda.com>
---
Changes from v2:
	None
Changes from v1:
        Removed erroneous changes for cpufreq Kconfig

 arch/arm/include/asm/pl320-ipc.h                |  20 ++
 arch/arm/mach-highbank/Kconfig                  |   2 +
 arch/arm/mach-highbank/Makefile                 |   2 +
 arch/arm/mach-highbank/include/mach/pl320-ipc.h |  20 ++
 arch/arm/mach-highbank/pl320-ipc.c              | 232 ++++++++++++++++++++++++
 5 files changed, 276 insertions(+)
 create mode 100644 arch/arm/include/asm/pl320-ipc.h
 create mode 100644 arch/arm/mach-highbank/include/mach/pl320-ipc.h
 create mode 100644 arch/arm/mach-highbank/pl320-ipc.c

diff --git a/arch/arm/include/asm/pl320-ipc.h b/arch/arm/include/asm/pl320-ipc.h
new file mode 100644
index 0000000..a0e58ee
--- /dev/null
+++ b/arch/arm/include/asm/pl320-ipc.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright 2010 Calxeda, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+int ipc_call_fast(u32 *data);
+int ipc_call_slow(u32 *data);
+
+extern int pl320_ipc_register_notifier(struct notifier_block *nb);
+extern int pl320_ipc_unregister_notifier(struct notifier_block *nb);
diff --git a/arch/arm/mach-highbank/Kconfig b/arch/arm/mach-highbank/Kconfig
index 0e1d0a4..ee83af6 100644
--- a/arch/arm/mach-highbank/Kconfig
+++ b/arch/arm/mach-highbank/Kconfig
@@ -13,3 +13,5 @@ config ARCH_HIGHBANK
 	select HAVE_SMP
 	select SPARSE_IRQ
 	select USE_OF
+	select ARCH_HAS_CPUFREQ
+	select ARCH_HAS_OPP
diff --git a/arch/arm/mach-highbank/Makefile b/arch/arm/mach-highbank/Makefile
index 3ec8bdd..b894708 100644
--- a/arch/arm/mach-highbank/Makefile
+++ b/arch/arm/mach-highbank/Makefile
@@ -7,3 +7,5 @@ obj-$(CONFIG_DEBUG_HIGHBANK_UART)	+= lluart.o
 obj-$(CONFIG_SMP)			+= platsmp.o
 obj-$(CONFIG_HOTPLUG_CPU)		+= hotplug.o
 obj-$(CONFIG_PM_SLEEP)			+= pm.o
+
+obj-y					+= pl320-ipc.o
diff --git a/arch/arm/mach-highbank/include/mach/pl320-ipc.h b/arch/arm/mach-highbank/include/mach/pl320-ipc.h
new file mode 100644
index 0000000..a0e58ee
--- /dev/null
+++ b/arch/arm/mach-highbank/include/mach/pl320-ipc.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright 2010 Calxeda, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+int ipc_call_fast(u32 *data);
+int ipc_call_slow(u32 *data);
+
+extern int pl320_ipc_register_notifier(struct notifier_block *nb);
+extern int pl320_ipc_unregister_notifier(struct notifier_block *nb);
diff --git a/arch/arm/mach-highbank/pl320-ipc.c b/arch/arm/mach-highbank/pl320-ipc.c
new file mode 100644
index 0000000..0eb92e4
--- /dev/null
+++ b/arch/arm/mach-highbank/pl320-ipc.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright 2012 Calxeda, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/types.h>
+#include <linux/err.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/io.h>
+#include <linux/interrupt.h>
+#include <linux/completion.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/device.h>
+#include <linux/amba/bus.h>
+
+#include <asm/pl320-ipc.h>
+
+#define IPCMxSOURCE(m)		((m) * 0x40)
+#define IPCMxDSET(m)		(((m) * 0x40) + 0x004)
+#define IPCMxDCLEAR(m)		(((m) * 0x40) + 0x008)
+#define IPCMxDSTATUS(m)		(((m) * 0x40) + 0x00C)
+#define IPCMxMODE(m)		(((m) * 0x40) + 0x010)
+#define IPCMxMSET(m)		(((m) * 0x40) + 0x014)
+#define IPCMxMCLEAR(m)		(((m) * 0x40) + 0x018)
+#define IPCMxMSTATUS(m)		(((m) * 0x40) + 0x01C)
+#define IPCMxSEND(m)		(((m) * 0x40) + 0x020)
+#define IPCMxDR(m, dr)		(((m) * 0x40) + ((dr) * 4) + 0x024)
+
+#define IPCMMIS(irq)		(((irq) * 8) + 0x800)
+#define IPCMRIS(irq)		(((irq) * 8) + 0x804)
+
+#define MBOX_MASK(n)		(1 << (n))
+#define IPC_FAST_MBOX		0
+#define IPC_SLOW_MBOX		1
+#define IPC_RX_MBOX		2
+
+#define CHAN_MASK(n)		(1 << (n))
+#define A9_SOURCE		1
+#define M3_SOURCE		0
+
+static void __iomem *ipc_base;
+static int ipc_irq;
+static DEFINE_SPINLOCK(ipc_m0_lock);
+static DEFINE_MUTEX(ipc_m1_lock);
+static DECLARE_COMPLETION(ipc_completion);
+static ATOMIC_NOTIFIER_HEAD(ipc_notifier);
+
+static inline void set_destination(int source, int mbox)
+{
+	__raw_writel(CHAN_MASK(source), ipc_base + IPCMxDSET(mbox));
+	__raw_writel(CHAN_MASK(source), ipc_base + IPCMxMSET(mbox));
+}
+
+static inline void clear_destination(int source, int mbox)
+{
+	__raw_writel(CHAN_MASK(source), ipc_base + IPCMxDCLEAR(mbox));
+	__raw_writel(CHAN_MASK(source), ipc_base + IPCMxMCLEAR(mbox));
+}
+
+static void __ipc_send(int mbox, u32 *data)
+{
+	int i;
+	for (i = 0; i < 7; i++)
+		__raw_writel(data[i], ipc_base + IPCMxDR(mbox, i));
+	__raw_writel(0x1, ipc_base + IPCMxSEND(mbox));
+}
+
+static u32 __ipc_rcv(int mbox, u32 *data)
+{
+	int i;
+	for (i = 0; i < 7; i++)
+		data[i] = __raw_readl(ipc_base + IPCMxDR(mbox, i));
+	return data[1];
+}
+
+/* non-blocking implementation from the A9 side, interrupt safe in theory */
+int ipc_call_fast(u32 *data)
+{
+	int timeout, ret;
+
+	spin_lock(&ipc_m0_lock);
+
+	__ipc_send(IPC_FAST_MBOX, data);
+
+	for (timeout = 500; timeout > 0; timeout--) {
+		if (__raw_readl(ipc_base + IPCMxSEND(IPC_FAST_MBOX)) == 0x2)
+			break;
+		udelay(100);
+	}
+	if (timeout == 0) {
+		ret = -ETIMEDOUT;
+		goto out;
+	}
+
+	ret = __ipc_rcv(IPC_FAST_MBOX, data);
+out:
+	__raw_writel(0, ipc_base + IPCMxSEND(IPC_FAST_MBOX));
+	spin_unlock(&ipc_m0_lock);
+	return ret;
+}
+EXPORT_SYMBOL(ipc_call_fast);
+
+/* blocking implementation from the A9 side, not usable in interrupts! */
+int ipc_call_slow(u32 *data)
+{
+	int ret;
+
+	mutex_lock(&ipc_m1_lock);
+
+	init_completion(&ipc_completion);
+	__ipc_send(IPC_SLOW_MBOX, data);
+	ret = wait_for_completion_timeout(&ipc_completion,
+					  msecs_to_jiffies(1000));
+	if (ret == 0) {
+		ret = -ETIMEDOUT;
+		goto out;
+	}
+
+	ret = __ipc_rcv(IPC_SLOW_MBOX, data);
+out:
+	mutex_unlock(&ipc_m1_lock);
+	return ret;
+}
+EXPORT_SYMBOL(ipc_call_slow);
+
+irqreturn_t ipc_handler(int irq, void *dev)
+{
+	u32 irq_stat;
+	u32 data[7];
+
+	irq_stat = __raw_readl(ipc_base + IPCMMIS(1));
+	if (irq_stat & MBOX_MASK(IPC_SLOW_MBOX)) {
+		__raw_writel(0, ipc_base + IPCMxSEND(IPC_SLOW_MBOX));
+		complete(&ipc_completion);
+	}
+	if (irq_stat & MBOX_MASK(IPC_RX_MBOX)) {
+		__ipc_rcv(IPC_RX_MBOX, data);
+		atomic_notifier_call_chain(&ipc_notifier, data[0], data + 1);
+		__raw_writel(2, ipc_base + IPCMxSEND(IPC_RX_MBOX));
+	}
+
+	return IRQ_HANDLED;
+}
+
+int pl320_ipc_register_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&ipc_notifier, nb);
+}
+
+int pl320_ipc_unregister_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&ipc_notifier, nb);
+}
+
+static int __devinit pl320_probe(struct amba_device *adev,
+				const struct amba_id *id)
+{
+	int ret;
+
+	ipc_base = ioremap(adev->res.start, resource_size(&adev->res));
+	if (ipc_base == NULL)
+		return -ENOMEM;
+
+	__raw_writel(0, ipc_base + IPCMxSEND(IPC_FAST_MBOX));
+	__raw_writel(0, ipc_base + IPCMxSEND(IPC_SLOW_MBOX));
+
+	ipc_irq = adev->irq[0];
+	ret = request_irq(ipc_irq, ipc_handler, 0, dev_name(&adev->dev), NULL);
+	if (ret < 0)
+		goto err;
+
+	/* Init fast mailbox */
+	__raw_writel(CHAN_MASK(A9_SOURCE),
+			ipc_base + IPCMxSOURCE(IPC_FAST_MBOX));
+	set_destination(M3_SOURCE, IPC_FAST_MBOX);
+
+	/* Init slow mailbox */
+	__raw_writel(CHAN_MASK(A9_SOURCE),
+			ipc_base + IPCMxSOURCE(IPC_SLOW_MBOX));
+	__raw_writel(CHAN_MASK(M3_SOURCE),
+			ipc_base + IPCMxDSET(IPC_SLOW_MBOX));
+	__raw_writel(CHAN_MASK(M3_SOURCE) | CHAN_MASK(A9_SOURCE),
+		     ipc_base + IPCMxMSET(IPC_SLOW_MBOX));
+
+	/* Init receive mailbox */
+	__raw_writel(CHAN_MASK(M3_SOURCE),
+			ipc_base + IPCMxSOURCE(IPC_RX_MBOX));
+	__raw_writel(CHAN_MASK(A9_SOURCE),
+			ipc_base + IPCMxDSET(IPC_RX_MBOX));
+	__raw_writel(CHAN_MASK(M3_SOURCE) | CHAN_MASK(A9_SOURCE),
+		     ipc_base + IPCMxMSET(IPC_RX_MBOX));
+
+	return 0;
+err:
+	iounmap(ipc_base);
+	return ret;
+}
+
+static struct amba_id pl320_ids[] = {
+	{
+		.id	= 0x00041320,
+		.mask	= 0x000fffff,
+	},
+	{ 0, 0 },
+};
+
+static struct amba_driver pl320_driver = {
+	.drv = {
+		.name	= "pl320",
+	},
+	.id_table	= pl320_ids,
+	.probe		= pl320_probe,
+};
+
+static int __init ipc_init(void)
+{
+	return amba_driver_register(&pl320_driver);
+}
+module_init(ipc_init);
-- 
1.7.11.7

^ permalink raw reply related

* [PATCH 3/6 v3] cpufreq: tolerate inexact values when collecting stats
From: Mark Langsdorf @ 2012-11-06 20:18 UTC (permalink / raw)
  To: linux-kernel, linux-pm, cpufreq, mark.langsdorf
In-Reply-To: <1352233089-22586-1-git-send-email-mark.langsdorf@calxeda.com>

Allow frequency values to vary by less than 5000 kHz in either direction
when collecting stats (cpufreq frequency tables are expressed in kHz).

Signed-off-by: Mark Langsdorf <mark.langsdorf@calxeda.com>
---
Changes from v2:
	None
Changes from v1:
        Implemented a simple round-up algorithm instead of the over/under
method that could cause errors on Intel processors with boost mode.

 drivers/cpufreq/cpufreq_stats.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 3998316..4e2ea7e 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -158,9 +158,11 @@ static struct attribute_group stats_attr_group = {
 static int freq_table_get_index(struct cpufreq_stats *stat, unsigned int freq)
 {
 	int index;
-	for (index = 0; index < stat->max_state; index++)
-		if (stat->freq_table[index] == freq)
+	for (index = 0; index < stat->max_state; index++) {
+		if ((stat->freq_table[index] < (freq + 5000)) &&
+		    (stat->freq_table[index] > (freq - 5000)))
 			return index;
+	}
 	return -1;
 }
 
@@ -251,6 +253,8 @@ static int cpufreq_stats_create_table(struct cpufreq_policy *policy,
 	spin_lock(&cpufreq_stats_lock);
 	stat->last_time = get_jiffies_64();
 	stat->last_index = freq_table_get_index(stat, policy->cur);
+	if (stat->last_index > stat->max_state)
+		stat->last_index = stat->max_state - 1;
 	spin_unlock(&cpufreq_stats_lock);
 	cpufreq_cpu_put(data);
 	return 0;
-- 
1.7.11.7

^ permalink raw reply related

* [PATCH 2/6 v3] clk, highbank: remove non-bypass reset mode
From: Mark Langsdorf @ 2012-11-06 20:18 UTC (permalink / raw)
  To: linux-kernel, linux-pm, cpufreq, mark.langsdorf
In-Reply-To: <1352233089-22586-1-git-send-email-mark.langsdorf@calxeda.com>

The highbank clock will glitch if the clock rate is reset without
relocking the PLL. Remove the option to attempt resetting without
relocking.

Signed-off-by: Mark Langsdorf <mark.langsdorf@calxeda.com>
---
Changes from v2:
	None
Changes from v1:
        Removed erroneous reformatting.

 drivers/clk/clk-highbank.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/clk/clk-highbank.c b/drivers/clk/clk-highbank.c
index 52fecad..4f50c42 100644
--- a/drivers/clk/clk-highbank.c
+++ b/drivers/clk/clk-highbank.c
@@ -171,7 +171,8 @@ static int clk_pll_set_rate(struct clk_hw *hwclk, unsigned long rate,
 
 		writel(reg | HB_PLL_RESET, hbclk->reg);
 		reg &= ~(HB_PLL_DIVF_MASK | HB_PLL_DIVQ_MASK);
-		reg |= (divf << HB_PLL_DIVF_SHIFT) | (divq << HB_PLL_DIVQ_SHIFT);
+		reg |= (divf << HB_PLL_DIVF_SHIFT) |
+			(divq << HB_PLL_DIVQ_SHIFT);
 		writel(reg | HB_PLL_RESET, hbclk->reg);
 		writel(reg, hbclk->reg);
 
@@ -182,8 +183,10 @@ static int clk_pll_set_rate(struct clk_hw *hwclk, unsigned long rate,
 		reg |= HB_PLL_EXT_ENA;
 		reg &= ~HB_PLL_EXT_BYPASS;
 	} else {
+		writel(reg | HB_PLL_EXT_BYPASS, hbclk->reg);
 		reg &= ~HB_PLL_DIVQ_MASK;
 		reg |= divq << HB_PLL_DIVQ_SHIFT;
+		writel(reg | HB_PLL_EXT_BYPASS, hbclk->reg);
 	}
 	writel(reg, hbclk->reg);
 
-- 
1.7.11.7

^ permalink raw reply related

* [RFC PATCH 8/8] mm: Print memory region statistics to understand the buddy allocator behavior
From: Srivatsa S. Bhat @ 2012-11-06 19:54 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel
In-Reply-To: <20121106195026.6941.24662.stgit@srivatsabhat.in.ibm.com>

In order to observe the behavior of the region-aware buddy allocator, modify
vmstat.c to also print memory region related statistics. In particular, enable
memory region-related info in /proc/zoneinfo and /proc/buddyinfo, since they
would help us to at least (roughly) see how the new buddy allocator is
performing.

For now, the region statistics correspond to the zone memory regions and not
the (absolute) node memory regions, and some of the statistics (especially the
no. of present pages) might not be very accurate. But since we account for
and print the free page statistics for every zone memory region accurately, we
should be able to observe the new page allocator behavior to a reasonable
degree of accuracy.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 mm/vmstat.c |   57 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 53 insertions(+), 4 deletions(-)

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8183331..cbcd373 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -812,11 +812,31 @@ const char * const vmstat_text[] = {
 static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
 						struct zone *zone)
 {
-	int order;
+	int i, order, t;
+	struct free_area *area;
 
-	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
-	for (order = 0; order < MAX_ORDER; ++order)
-		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+	seq_printf(m, "Node %d, zone %8s \n", pgdat->node_id, zone->name);
+
+	for (i = 0; i < zone->nr_zone_regions; i++) {
+
+		seq_printf(m, "\t\t Region %d ", i);
+
+		for (order = 0; order < MAX_ORDER; ++order) {
+			unsigned long nr_free = 0;
+
+			area = &zone->free_area[order];
+
+			for (t = 0; t < MIGRATE_TYPES; t++) {
+				if (t == MIGRATE_ISOLATE ||
+					t == MIGRATE_RESERVE)
+					continue;
+				nr_free +=
+					area->free_list[t].mr_list[i].nr_free;
+			}
+			seq_printf(m, "%6lu ", nr_free);
+		}
+		seq_putc(m, '\n');
+	}
 	seq_putc(m, '\n');
 }
 
@@ -984,6 +1004,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 							struct zone *zone)
 {
 	int i;
+	unsigned long zone_nr_free = 0;
+
 	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
 	seq_printf(m,
 		   "\n  pages free     %lu"
@@ -1001,6 +1023,33 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   zone->spanned_pages,
 		   zone->present_pages);
 
+	for (i = 0; i < zone->nr_zone_regions; i++) {
+		int order, t;
+		unsigned long nr_free = 0;
+		struct free_area *area = zone->free_area;
+
+		for_each_migratetype_order(order, t) {
+			if (t == MIGRATE_ISOLATE || t == MIGRATE_RESERVE)
+				continue;
+			nr_free +=
+				area[order].free_list[t].mr_list[i].nr_free
+				* (1UL << order);
+		}
+		seq_printf(m, "\n\nZone mem region %d", i);
+		seq_printf(m,
+			   "\n  pages spanned	%lu"
+			   "\n        present	%lu"
+			   "\n        free	%lu",
+			   zone->zone_mem_region[i].spanned_pages,
+			   zone->zone_mem_region[i].present_pages,
+			   nr_free);
+	}
+
+	for (i = 0; i < MAX_ORDER; i++)
+		zone_nr_free += zone->free_area[i].nr_free * (1UL << i);
+
+	seq_printf(m, "\nZone pages nr_free  %lu\n", zone_nr_free);
+
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
 		seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
 				zone_page_state(zone, i));

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [RFC PATCH 7/8] mm: Add an optimized version of del_from_freelist to keep page allocation fast
From: Srivatsa S. Bhat @ 2012-11-06 19:54 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel
In-Reply-To: <20121106195026.6941.24662.stgit@srivatsabhat.in.ibm.com>

One of the main advantages of this design of memory regions is that page
allocations can potentially be extremely fast - almost with no extra
overhead from memory regions.

To exploit that, introduce an optimized version of del_from_freelist(), which
utilizes the fact that we always delete items from the head of the list
during page allocation.

Basically, we want to keep a note of the region from which we are allocating
in a given freelist, to avoid having to compute the page-to-zone-region for
every page allocation. So introduce a 'next_region' pointer in every freelist
to achieve that, and use it to keep the fastpath of page allocation almost as
fast as it would be without memory regions.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 include/linux/mm.h     |   11 ++++++++++
 include/linux/mmzone.h |    6 ++++++
 mm/page_alloc.c        |   51 ++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a817b16..cab8709 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -725,6 +725,17 @@ static inline int page_zone_region_id(const struct page *page)
 	return pgdat->node_regions[node_region_idx].zone_region_idx[z_num];
 }
 
+static inline void set_next_region_in_freelist(struct free_list *free_list)
+{
+	if (list_empty(&free_list->list))
+		free_list->next_region = NULL;
+	else {
+		do {
+			free_list->next_region++;
+		} while (free_list->next_region->nr_free == 0);
+	}
+}
+
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 static inline void set_page_section(struct page *page, unsigned long section)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index aba4d68..1d20aa1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -91,6 +91,12 @@ struct free_list {
 	struct list_head	list;
 
 	/*
+	 * Pointer to the region from which the next allocation will be
+	 * satisfied. (Same as the freelist's first pageblock's region.)
+	 */
+	struct mem_region_list	*next_region; /* for fast page allocation */
+
+	/*
 	 * Demarcates pageblocks belonging to different regions within
 	 * this freelist.
 	 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 52ff914..05c1fcf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -532,6 +532,11 @@ static void add_to_freelist(struct page *page, struct list_head *lru,
 	/* This is the first region, so add to the head of the list */
 	prev_region_list = &free_list->list;
 
+	/*
+	 * Set 'next_region' to this region, since this is the first region now
+	 */
+	free_list->next_region = region;
+
 out:
 	list_add(lru, prev_region_list);
 
@@ -539,6 +544,38 @@ out:
 	region->page_block = lru;
 }
 
+/**
+ * __rmqueue_smallest() *always* deletes elements from the head of the
+ * list. Use this knowledge to keep page allocation fast, despite being
+ * region-aware.
+ *
+ * Do *NOT* call this function if you are deleting from somewhere deep
+ * inside the freelist.
+ */
+static void rmqueue_del_from_freelist(struct list_head *lru,
+				      struct free_list *free_list)
+{
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	WARN_ON(free_list->list.next != lru);
+#endif
+
+	list_del(lru);
+
+	/* Fastpath */
+	if (--(free_list->next_region->nr_free))
+		return;
+
+	/*
+	 * Slowpath, when this is the last pageblock of this region
+	 * in this freelist.
+	 */
+	free_list->next_region->page_block = NULL;
+
+	/* Set 'next_region' to the new first region in the freelist. */
+	set_next_region_in_freelist(free_list);
+}
+
+/* Generic delete function for region-aware buddy allocator. */
 static void del_from_freelist(struct page *page, struct list_head *lru,
 			      struct free_list *free_list)
 {
@@ -546,6 +583,11 @@ static void del_from_freelist(struct page *page, struct list_head *lru,
 	struct list_head *prev_page_lru;
 	int region_id;
 
+
+	/* Try to fastpath, if deleting from the head of the list */
+	if (lru == free_list->list.next)
+		return rmqueue_del_from_freelist(lru, free_list);
+
 	region_id = page_zone_region_id(page);
 	region = &free_list->mr_list[region_id];
 	region->nr_free--;
@@ -558,6 +600,11 @@ static void del_from_freelist(struct page *page, struct list_head *lru,
 	prev_page_lru = lru->prev;
 	list_del(lru);
 
+	/*
+	 * Since we are not deleting from the head of the list, the
+	 * 'next_region' pointer doesn't have to change.
+	 */
+
 	if (region->nr_free == 0)
 		region->page_block = NULL;
 	else
@@ -965,8 +1012,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 
 		page = list_entry(area->free_list[migratetype].list.next,
 							struct page, lru);
-		del_from_freelist(page, &page->lru,
-				  &area->free_list[migratetype]);
+		rmqueue_del_from_freelist(&page->lru,
+					  &area->free_list[migratetype]);
 		rmv_page_order(page);
 		area->nr_free--;
 		expand(zone, page, order, current_order, area, migratetype);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [RFC PATCH 6/8] mm: Demarcate and maintain pageblocks in region-order in the zones' freelists
From: Srivatsa S. Bhat @ 2012-11-06 19:53 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel
In-Reply-To: <20121106195026.6941.24662.stgit@srivatsabhat.in.ibm.com>

The zones' freelists need to be made region-aware, in order to influence
page allocation and freeing algorithms. So in every free list in the zone, we
would like to demarcate the pageblocks belonging to different memory regions
(we can do this using a set of pointers, and thus avoid splitting up the
freelists).

Also, we would like to keep the pageblocks in the freelists sorted in
region-order. That is, pageblocks belonging to region-0 would come first,
followed by pageblocks belonging to region-1 and so on, within a given
freelist. Of course, a set of pageblocks belonging to the same region need
not be sorted; it is sufficient if we maintain the pageblocks in
region-sorted-order, rather than a full address-sorted-order.

For each freelist within the zone, we maintain a set of pointers to
pageblocks belonging to the various memory regions in that zone.

Eg:

    |<---Region0--->|   |<---Region1--->|   |<-------Region2--------->|
     ____      ____      ____      ____      ____      ____      ____
--> |____|--> |____|--> |____|--> |____|--> |____|--> |____|--> |____|-->

                 ^                  ^                              ^
                 |                  |                              |
                Reg0               Reg1                          Reg2


Page allocation will proceed as usual - pick the first item on the free list.
But we don't want to keep updating these region pointers every time we allocate
a pageblock from the freelist. So, instead of pointing to the *first* pageblock
of that region, we maintain the region pointers such that they point to the
*last* pageblock in that region, as shown in the figure above. That way, as
long as there are > 1 pageblocks in that region in that freelist, that region
pointer doesn't need to be updated.


Page allocation algorithm:
-------------------------

The heart of the page allocation algorithm remains as it is - pick the first
item on the appropriate freelist and return it.


Pageblock order in the zone freelists:
-------------------------------------

This is the main change - we keep the pageblocks in region-sorted order,
where pageblocks belonging to region-0 come first, followed by those belonging
to region-1 and so on. But the pageblocks within a given region need *not* be
sorted, since we need them to be only region-sorted and not fully
address-sorted.

This sorting is performed when adding pages back to the freelists, thus
avoiding any region-related overhead in the critical page allocation
paths.

Page reclaim [Todo]:
--------------------

Page allocation happens in the order of increasing region number. We would
like to do page reclaim in the reverse order, to keep allocated pages within
a minimal number of regions (approximately).

---------------------------- Increasing region number---------------------->

Direction of allocation--->                         <---Direction of reclaim

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 mm/page_alloc.c |  128 +++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 113 insertions(+), 15 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 62d0a9a..52ff914 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -502,6 +502,79 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
 	return 0;
 }
 
+static void add_to_freelist(struct page *page, struct list_head *lru,
+			    struct free_list *free_list)
+{
+	struct mem_region_list *region;
+	struct list_head *prev_region_list;
+	int region_id, i;
+
+	region_id = page_zone_region_id(page);
+
+	region = &free_list->mr_list[region_id];
+	region->nr_free++;
+
+	if (region->page_block) {
+		list_add_tail(lru, region->page_block);
+		return;
+	}
+
+	if (!list_empty(&free_list->list)) {
+		for (i = region_id - 1; i >= 0; i--) {
+			if (free_list->mr_list[i].page_block) {
+				prev_region_list =
+					free_list->mr_list[i].page_block;
+				goto out;
+			}
+		}
+	}
+
+	/* This is the first region, so add to the head of the list */
+	prev_region_list = &free_list->list;
+
+out:
+	list_add(lru, prev_region_list);
+
+	/* Save pointer to page block of this region */
+	region->page_block = lru;
+}
+
+static void del_from_freelist(struct page *page, struct list_head *lru,
+			      struct free_list *free_list)
+{
+	struct mem_region_list *region;
+	struct list_head *prev_page_lru;
+	int region_id;
+
+	region_id = page_zone_region_id(page);
+	region = &free_list->mr_list[region_id];
+	region->nr_free--;
+
+	if (lru != region->page_block) {
+		list_del(lru);
+		return;
+	}
+
+	prev_page_lru = lru->prev;
+	list_del(lru);
+
+	if (region->nr_free == 0)
+		region->page_block = NULL;
+	else
+		region->page_block = prev_page_lru;
+}
+
+/**
+ * Move pages of a given order from freelist of one migrate-type to another.
+ */
+static void move_pages_freelist(struct page *page, struct list_head *lru,
+				struct free_list *old_list,
+				struct free_list *new_list)
+{
+	del_from_freelist(page, lru, old_list);
+	add_to_freelist(page, lru, new_list);
+}
+
 /*
  * Freeing function for a buddy system allocator.
  *
@@ -534,6 +607,7 @@ static inline void __free_one_page(struct page *page,
 	unsigned long combined_idx;
 	unsigned long uninitialized_var(buddy_idx);
 	struct page *buddy;
+	struct free_area *area;
 
 	if (unlikely(PageCompound(page)))
 		if (unlikely(destroy_compound_page(page, order)))
@@ -561,8 +635,10 @@ static inline void __free_one_page(struct page *page,
 			__mod_zone_freepage_state(zone, 1 << order,
 						  migratetype);
 		} else {
-			list_del(&buddy->lru);
-			zone->free_area[order].nr_free--;
+			area = &zone->free_area[order];
+			del_from_freelist(buddy, &buddy->lru,
+					  &area->free_list[migratetype]);
+			area->nr_free--;
 			rmv_page_order(buddy);
 		}
 		combined_idx = buddy_idx & page_idx;
@@ -587,14 +663,23 @@ static inline void __free_one_page(struct page *page,
 		buddy_idx = __find_buddy_index(combined_idx, order + 1);
 		higher_buddy = higher_page + (buddy_idx - combined_idx);
 		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
-			list_add_tail(&page->lru,
-				&zone->free_area[order].free_list[migratetype].list);
+
+			/*
+			 * Implementing an add_to_freelist_tail() won't be
+			 * very useful because both of them (almost) add to
+			 * the tail within the region. So we could potentially
+			 * switch off this entire "is next-higher buddy free?"
+			 * logic when memory regions are used.
+			 */
+			area = &zone->free_area[order];
+			add_to_freelist(page, &page->lru,
+					&area->free_list[migratetype]);
 			goto out;
 		}
 	}
 
-	list_add(&page->lru,
-		&zone->free_area[order].free_list[migratetype].list);
+	add_to_freelist(page, &page->lru,
+			&zone->free_area[order].free_list[migratetype]);
 out:
 	zone->free_area[order].nr_free++;
 }
@@ -812,7 +897,8 @@ static inline void expand(struct zone *zone, struct page *page,
 			continue;
 		}
 #endif
-		list_add(&page[size].lru, &area->free_list[migratetype].list);
+		add_to_freelist(&page[size], &page[size].lru,
+					&area->free_list[migratetype]);
 		area->nr_free++;
 		set_page_order(&page[size], high);
 	}
@@ -879,7 +965,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 
 		page = list_entry(area->free_list[migratetype].list.next,
 							struct page, lru);
-		list_del(&page->lru);
+		del_from_freelist(page, &page->lru,
+				  &area->free_list[migratetype]);
 		rmv_page_order(page);
 		area->nr_free--;
 		expand(zone, page, order, current_order, area, migratetype);
@@ -918,7 +1005,8 @@ int move_freepages(struct zone *zone,
 {
 	struct page *page;
 	unsigned long order;
-	int pages_moved = 0;
+	struct free_area *area;
+	int pages_moved = 0, old_mt;
 
 #ifndef CONFIG_HOLES_IN_ZONE
 	/*
@@ -946,8 +1034,11 @@ int move_freepages(struct zone *zone,
 		}
 
 		order = page_order(page);
-		list_move(&page->lru,
-			  &zone->free_area[order].free_list[migratetype].list);
+		old_mt = get_freepage_migratetype(page);
+		area = &zone->free_area[order];
+		move_pages_freelist(page, &page->lru,
+				    &area->free_list[old_mt],
+				    &area->free_list[migratetype]);
 		set_freepage_migratetype(page, migratetype);
 		page += 1 << order;
 		pages_moved += 1 << order;
@@ -1045,7 +1136,8 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 			}
 
 			/* Remove the page from the freelists */
-			list_del(&page->lru);
+			del_from_freelist(page, &page->lru,
+					  &area->free_list[migratetype]);
 			rmv_page_order(page);
 
 			/* Take ownership for orders >= pageblock_order */
@@ -1399,12 +1491,14 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
 		return 0;
 
+	mt = get_pageblock_migratetype(page);
+
 	/* Remove page from free list */
-	list_del(&page->lru);
+	del_from_freelist(page, &page->lru,
+			  &zone->free_area[order].free_list[mt]);
 	zone->free_area[order].nr_free--;
 	rmv_page_order(page);
 
-	mt = get_pageblock_migratetype(page);
 	if (unlikely(mt != MIGRATE_ISOLATE))
 		__mod_zone_freepage_state(zone, -(1UL << order), mt);
 
@@ -6040,6 +6134,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 	int order, i;
 	unsigned long pfn;
 	unsigned long flags;
+	int mt;
+
 	/* find the first valid pfn */
 	for (pfn = start_pfn; pfn < end_pfn; pfn++)
 		if (pfn_valid(pfn))
@@ -6062,7 +6158,9 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 		printk(KERN_INFO "remove from free list %lx %d %lx\n",
 		       pfn, 1 << order, end_pfn);
 #endif
-		list_del(&page->lru);
+		mt = get_freepage_migratetype(page);
+		del_from_freelist(page, &page->lru,
+			  	  &zone->free_area[order].free_list[mt]);
 		rmv_page_order(page);
 		zone->free_area[order].nr_free--;
 		__mod_zone_page_state(zone, NR_FREE_PAGES,

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [RFC PATCH 5/8] mm: Add data-structures to describe memory regions within the zones' freelists
From: Srivatsa S. Bhat @ 2012-11-06 19:53 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel
In-Reply-To: <20121106195026.6941.24662.stgit@srivatsabhat.in.ibm.com>

In order to influence page allocation decisions (i.e., to make page-allocation
region-aware), we need to be able to distinguish pageblocks belonging to
different zone memory regions within the zones' freelists.

So, within every freelist in a zone, provide pointers to describe the
boundaries of zone memory regions and counters to track the number of free
pageblocks within each region.

Also, fixup the references to the freelist's list_head inside struct free_area.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 include/linux/mmzone.h |   17 ++++++++++++++++-
 mm/compaction.c        |    8 ++++----
 mm/page_alloc.c        |   21 +++++++++++----------
 mm/vmstat.c            |    2 +-
 4 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3982354..aba4d68 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -82,8 +82,23 @@ static inline int get_pageblock_migratetype(struct page *page)
 
 #define MAX_NR_REGIONS	256
 
+struct mem_region_list {
+	struct list_head	*page_block;
+	unsigned long		nr_free;
+};
+
+struct free_list {
+	struct list_head	list;
+
+	/*
+	 * Demarcates pageblocks belonging to different regions within
+	 * this freelist.
+	 */
+	struct mem_region_list	mr_list[MAX_NR_REGIONS];
+};
+
 struct free_area {
-	struct list_head	free_list[MIGRATE_TYPES];
+	struct free_list	free_list[MIGRATE_TYPES];
 	unsigned long		nr_free;
 };
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 9eef558..95f5c92 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -247,14 +247,14 @@ static void compact_capture_page(struct compact_control *cc)
 			struct page *page;
 			struct free_area *area;
 			area = &(cc->zone->free_area[order]);
-			if (list_empty(&area->free_list[mtype]))
+			if (list_empty(&area->free_list[mtype].list))
 				continue;
 
 			/* Take the lock and attempt capture of the page */
 			if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
 				return;
-			if (!list_empty(&area->free_list[mtype])) {
-				page = list_entry(area->free_list[mtype].next,
+			if (!list_empty(&area->free_list[mtype].list)) {
+				page = list_entry(area->free_list[mtype].list.next,
 							struct page, lru);
 				if (capture_free_page(page, cc->order, mtype)) {
 					spin_unlock_irqrestore(&cc->zone->lock,
@@ -866,7 +866,7 @@ static int compact_finished(struct zone *zone,
 		for (order = cc->order; order < MAX_ORDER; order++) {
 			struct free_area *area = &zone->free_area[cc->order];
 			/* Job done if page is free of the right migratetype */
-			if (!list_empty(&area->free_list[cc->migratetype]))
+			if (!list_empty(&area->free_list[cc->migratetype].list))
 				return COMPACT_PARTIAL;
 
 			/* Job done if allocation would set block type */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7fd89cd..62d0a9a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -588,12 +588,13 @@ static inline void __free_one_page(struct page *page,
 		higher_buddy = higher_page + (buddy_idx - combined_idx);
 		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
 			list_add_tail(&page->lru,
-				&zone->free_area[order].free_list[migratetype]);
+				&zone->free_area[order].free_list[migratetype].list);
 			goto out;
 		}
 	}
 
-	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
+	list_add(&page->lru,
+		&zone->free_area[order].free_list[migratetype].list);
 out:
 	zone->free_area[order].nr_free++;
 }
@@ -811,7 +812,7 @@ static inline void expand(struct zone *zone, struct page *page,
 			continue;
 		}
 #endif
-		list_add(&page[size].lru, &area->free_list[migratetype]);
+		list_add(&page[size].lru, &area->free_list[migratetype].list);
 		area->nr_free++;
 		set_page_order(&page[size], high);
 	}
@@ -873,10 +874,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 	/* Find a page of the appropriate size in the preferred list */
 	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
 		area = &(zone->free_area[current_order]);
-		if (list_empty(&area->free_list[migratetype]))
+		if (list_empty(&area->free_list[migratetype].list))
 			continue;
 
-		page = list_entry(area->free_list[migratetype].next,
+		page = list_entry(area->free_list[migratetype].list.next,
 							struct page, lru);
 		list_del(&page->lru);
 		rmv_page_order(page);
@@ -946,7 +947,7 @@ int move_freepages(struct zone *zone,
 
 		order = page_order(page);
 		list_move(&page->lru,
-			  &zone->free_area[order].free_list[migratetype]);
+			  &zone->free_area[order].free_list[migratetype].list);
 		set_freepage_migratetype(page, migratetype);
 		page += 1 << order;
 		pages_moved += 1 << order;
@@ -1007,10 +1008,10 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 				break;
 
 			area = &(zone->free_area[current_order]);
-			if (list_empty(&area->free_list[migratetype]))
+			if (list_empty(&area->free_list[migratetype].list))
 				continue;
 
-			page = list_entry(area->free_list[migratetype].next,
+			page = list_entry(area->free_list[migratetype].list.next,
 					struct page, lru);
 			area->nr_free--;
 
@@ -1274,7 +1275,7 @@ void mark_free_pages(struct zone *zone)
 		}
 
 	for_each_migratetype_order(order, t) {
-		list_for_each(curr, &zone->free_area[order].free_list[t]) {
+		list_for_each(curr, &zone->free_area[order].free_list[t].list) {
 			unsigned long i;
 
 			pfn = page_to_pfn(list_entry(curr, struct page, lru));
@@ -3859,7 +3860,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 {
 	int order, t;
 	for_each_migratetype_order(order, t) {
-		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
+		INIT_LIST_HEAD(&zone->free_area[order].free_list[t].list);
 		zone->free_area[order].nr_free = 0;
 	}
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c737057..8183331 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -847,7 +847,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
 
 			area = &(zone->free_area[order]);
 
-			list_for_each(curr, &area->free_list[mtype])
+			list_for_each(curr, &area->free_list[mtype].list)
 				freecount++;
 			seq_printf(m, "%6lu ", freecount);
 		}

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [RFC PATCH 0/8][Sorted-buddy] mm: Linux VM Infrastructure to support Memory Power Management
From: Srivatsa S. Bhat @ 2012-11-06 19:52 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel

Hi,

This is an alternative design for Memory Power Management, developed based on
some of the suggestions[1] received during the review of the earlier patchset
("Hierarchy" design) on Memory Power Management[2]. This alters the buddy-lists
to keep them region-sorted, and is hence identified as the "Sorted-buddy" design.

One of the key aspects of this design is that it avoids the zone-fragmentation
problem that was present in the earlier design[3].


Quick overview of Memory Power Management and Memory Regions:
------------------------------------------------------------

Today, memory subsystems offer a wide range of capabilities for managing
memory power consumption. As a quick example, if a block of memory is not
referenced for a threshold amount of time, the memory controller can decide to
put that chunk into a low-power content-preserving state. And the next
reference to that memory chunk would bring it back to full power for read/write.
With this capability in place, it becomes important for the OS to understand
the boundaries of such power-manageable chunks of memory and to ensure that
references are consolidated to a minimum number of such memory power management
domains.

ACPI 5.0 has introduced MPST tables (Memory Power State Tables) [5] so that
the firmware can expose information regarding the boundaries of such memory
power management domains to the OS in a standard way.

How can Linux VM help memory power savings?

o Consolidate memory allocations and/or references such that they are
not spread across the entire memory address space.  Basically, an area of
memory that is not being referenced can reside in a low power state.

o Support targeted memory reclaim, where certain areas of memory that can be
easily freed can be offlined, allowing those areas of memory to be put into
lower power states.

Memory Regions:
---------------

"Memory Regions" is a way of capturing the boundaries of power-manageable
chunks of memory, within the MM subsystem.


Short description of the "Sorted-buddy" design:
-----------------------------------------------

In this design, the memory region boundaries are captured in a parallel
data-structure instead of fitting regions between nodes and zones in the
hierarchy. Further, the buddy allocator is altered, such that we maintain the
zones' freelists in region-sorted-order and thus do page allocation in the
order of increasing memory regions. (The freelists need not be fully
address-sorted, they just need to be region-sorted. Patch 6 explains this
in more detail).

The idea is to do page allocation in increasing order of memory regions
(within a zone) and perform page reclaim in the reverse order, as illustrated
below.

---------------------------- Increasing region number---------------------->

Direction of allocation--->                         <---Direction of reclaim


The sorting logic (to maintain freelist pageblocks in region-sorted-order)
lies in the page-free path and not the page-allocation path and hence the
critical page allocation paths remain fast. Moreover, the heart of the page
allocation algorithm itself remains largely unchanged, and the region-related
data-structures are optimized to avoid unnecessary updates during the
page-allocator's runtime.

Advantages of this design:
--------------------------
1. No zone-fragmentation (IOW, we don't create more zones than necessary) and
   hence we avoid its associated problems (like too many zones, extra page
   reclaim threads, question of choosing watermarks etc).
   [This is an advantage over the "Hierarchy" design]

2. Performance overhead is expected to be low: Since we retain the simplicity
   of the algorithm in the page allocation path, page allocation can
   potentially remain as fast as it would be without memory regions. The
   overhead is pushed to the page-freeing paths which are not that critical.


Results:
=======

Test setup:
-----------
This patchset applies cleanly on top of 3.7-rc3.

x86 dual-socket quad core HT-enabled machine booted with mem=8G
Memory region size = 512 MB

Functional testing:
-------------------

Ran pagetest, a simple C program that allocates and touches a required number
of pages.

Below are the statistics from the regions within ZONE_NORMAL, at various sizes
of allocations from pagetest.

	     Present pages   |	Free pages at various allocations        |
			     |  start	|  512 MB  |  1024 MB | 2048 MB  |
  Region 0      16	     |   0      |    0     |     0    |    0     |
  Region 1      131072       |  87219   |  8066    |   7892   |  7387    |
  Region 2      131072       | 131072   |  79036   |     0    |    0     |
  Region 3      131072       | 131072   | 131072   |   79061  |    0     |
  Region 4      131072       | 131072   | 131072   |  131072  |    0     |
  Region 5      131072       | 131072   | 131072   |  131072  |  79051   |
  Region 6      131072       | 131072   | 131072   |  131072  |  131072  |
  Region 7      131072       | 131072   | 131072   |  131072  |  131072  |
  Region 8      131056       | 105475   | 105472   |  105472  |  105472  |

This shows that page allocation occurs in the order of increasing region
numbers, as intended in this design.

Performance impact:
-------------------

Kernbench results didn't show much of a difference between the performance
of vanilla 3.7-rc3 and this patchset.


Todos:
=====

1. Memory-region aware page-reclamation:
----------------------------------------

We would like to do page reclaim in the reverse order of page allocation
within a zone, ie., in the order of decreasing region numbers.
To achieve that, while scanning lru pages to reclaim, we could potentially
look for pages belonging to higher regions (considering region boundaries)
or perhaps simply prefer pages of higher pfns (and skip lower pfns) as
reclaim candidates.

2. Compile-time exclusion of Memory Power Management, and extending the
support to also work with other features such as Mem cgroups, kexec etc.

References:
----------

[1]. Review comments suggesting modifying the buddy allocator to be aware of
     memory regions:
     http://article.gmane.org/gmane.linux.power-management.general/24862
     http://article.gmane.org/gmane.linux.power-management.general/25061
     http://article.gmane.org/gmane.linux.kernel.mm/64689

[2]. Patch series that implemented the node-region-zone hierarchy design:
     http://lwn.net/Articles/445045/
     http://thread.gmane.org/gmane.linux.kernel.mm/63840

     Summary of the discussion on that patchset:
     http://article.gmane.org/gmane.linux.power-management.general/25061

     Forward-port of that patchset to 3.7-rc3 (minimal x86 config)
     http://thread.gmane.org/gmane.linux.kernel.mm/89202

[3]. Disadvantages of having memory regions in the hierarchy between nodes and
     zones:
     http://article.gmane.org/gmane.linux.kernel.mm/63849

[4]. Estimate of potential power savings on Samsung exynos board
     http://article.gmane.org/gmane.linux.kernel.mm/65935

[5]. ACPI 5.0 and MPST support
     http://www.acpi.info/spec.htm
     Section 5.2.21 Memory Power State Table (MPST)

 Srivatsa S. Bhat (8):
      mm: Introduce memory regions data-structure to capture region boundaries within node
      mm: Initialize node memory regions during boot
      mm: Introduce and initialize zone memory regions
      mm: Add helpers to retrieve node region and zone region for a given page
      mm: Add data-structures to describe memory regions within the zones' freelists
      mm: Demarcate and maintain pageblocks in region-order in the zones' freelists
      mm: Add an optimized version of del_from_freelist to keep page allocation fast
      mm: Print memory region statistics to understand the buddy allocator behavior


  include/linux/mm.h     |   38 +++++++
 include/linux/mmzone.h |   52 +++++++++
 mm/compaction.c        |    8 +
 mm/page_alloc.c        |  263 ++++++++++++++++++++++++++++++++++++++++++++----
 mm/vmstat.c            |   59 ++++++++++-
 5 files changed, 390 insertions(+), 30 deletions(-)


Thanks,
Srivatsa S. Bhat
IBM Linux Technology Center


^ permalink raw reply

* [RFC PATCH 4/8] mm: Add helpers to retrieve node region and zone region for a given page
From: Srivatsa S. Bhat @ 2012-11-06 19:53 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel
In-Reply-To: <20121106195026.6941.24662.stgit@srivatsabhat.in.ibm.com>

Given a page, we would like to have an efficient mechanism to find out
the node memory region and the zone memory region to which it belongs.

Since the node is assumed to be divided into equal-sized node memory
regions, the node memory region index can be obtained by simply right-shifting
the page's pfn by 'mem_region_shift'.

But finding the corresponding zone memory region's index in the zone is
not that straightforward. To have an O(1) algorithm to find it out, define a
zone_region_idx[] array to store the zone memory region indices for every
node memory region.

To illustrate, consider the following example:

	|<---------------------Node---------------------->|
	 _________________________________________________
	|      Node mem reg 0 	|      Node mem reg 1     |
	|_______________________|_________________________|

	 _________________________________________________
	|   ZONE_DMA    |	ZONE_NORMAL		  |
	|_______________|_________________________________|


In the above figure,

Node mem region 0:
------------------
This region corresponds to the first zone mem region in ZONE_DMA and also
the first zone mem region in ZONE_NORMAL. Hence its index array would look
like this:
    node_regions[0].zone_region_idx[ZONE_DMA]     == 0
    node_regions[0].zone_region_idx[ZONE_NORMAL]  == 0


Node mem region 1:
------------------
This region corresponds to the second zone mem region in ZONE_NORMAL. Hence
its index array would look like this:
    node_regions[1].zone_region_idx[ZONE_NORMAL]  == 1


Using this index array, we can quickly obtain the zone memory region to
which a given page belongs.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 include/linux/mm.h     |   23 +++++++++++++++++++++++
 include/linux/mmzone.h |    7 +++++++
 mm/page_alloc.c        |    2 ++
 3 files changed, 32 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 19c4fb0..a817b16 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -702,6 +702,29 @@ static inline struct zone *page_zone(const struct page *page)
 	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
 }
 
+static inline int page_node_region_id(const struct page *page)
+{
+	return page_to_pfn(page) >> MEM_REGION_SHIFT;
+}
+
+/**
+ * Return the index of the region to which the page belongs, within its zone.
+ *
+ * Given a page, find the absolute (node) region as well as the zone to which
+ * it belongs. Then find the region within the zone that corresponds to that
+ * absolute (node) region, and return its index.
+ */
+static inline int page_zone_region_id(const struct page *page)
+{
+	pg_data_t *pgdat = NODE_DATA(page_to_nid(page));
+	enum zone_type z_num = page_zonenum(page);
+	unsigned long node_region_idx;
+
+	node_region_idx = page_node_region_id(page);
+
+	return pgdat->node_regions[node_region_idx].zone_region_idx[z_num];
+}
+
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 static inline void set_page_section(struct page *page, unsigned long section)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9f923aa..3982354 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -336,6 +336,13 @@ struct node_mem_region {
 	unsigned long spanned_pages;
 	int idx;
 	int node;
+
+	/*
+	 * A physical (node) region could be split across multiple zones.
+	 * Store the indices of the corresponding regions of each such
+	 * zone for this physical (node) region.
+	 */
+	int zone_region_idx[MAX_NR_ZONES];
 	struct pglist_data *pgdat;
 };
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c00f72d..7fd89cd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4621,6 +4621,8 @@ void init_zone_memory_regions(struct pglist_data *pgdat)
 						         end_pfn);
 			z->zone_mem_region[idx].present_pages =
 						end_pfn - start_pfn - absent;
+
+			region->zone_region_idx[zone_idx(z)] = idx;
 			idx++;
 		}
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [RFC PATCH 3/8] mm: Introduce and initialize zone memory regions
From: Srivatsa S. Bhat @ 2012-11-06 19:53 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel
In-Reply-To: <20121106195026.6941.24662.stgit@srivatsabhat.in.ibm.com>

Memory region boundaries don't necessarily fit on zone boundaries. So we need
to maintain a zone-level mapping of the absolute memory region boundaries.

"Node Memory Regions" will be used to capture the absolute region boundaries.
Add "Zone Memory Regions" to track the subsets of the absolute memory regions
that fall within the zone boundaries.

Eg:

	|<---------------------Node---------------------->|
	 _________________________________________________
	|      Node mem reg 0 	|      Node mem reg 1     |
	|_______________________|_________________________|

	 _________________________________________________
	|   ZONE_DMA    |	ZONE_NORMAL		  |
	|_______________|_________________________________|


In the above figure,

ZONE_DMA has only 1 zone memory region (say, Zone mem reg 0) which is a subset
of Node mem reg 0.

ZONE_NORMAL has 2 zone memory regions (say, Zone mem reg 0 and Zone mem reg 1)
which are subsets of Node mem reg 0 and Node mem reg 1 respectively.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 include/linux/mmzone.h |    9 +++++++++
 mm/page_alloc.c        |   42 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bb7c3ef..9f923aa 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -339,6 +339,12 @@ struct node_mem_region {
 	struct pglist_data *pgdat;
 };
 
+struct zone_mem_region {
+	unsigned long start_pfn;
+	unsigned long spanned_pages;
+	unsigned long present_pages;
+};
+
 struct zone {
 	/* Fields commonly accessed by the page allocator */
 
@@ -403,6 +409,9 @@ struct zone {
 #endif
 	struct free_area	free_area[MAX_ORDER];
 
+	struct zone_mem_region	zone_mem_region[MAX_NR_REGIONS];
+	int 			nr_zone_regions;
+
 #ifndef CONFIG_SPARSEMEM
 	/*
 	 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 709e3c1..c00f72d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4594,6 +4594,46 @@ void init_node_memory_regions(struct pglist_data *pgdat)
 	}
 }
 
+void init_zone_memory_regions(struct pglist_data *pgdat)
+{
+	unsigned long start_pfn, end_pfn, absent;
+	int i, j, idx, nid = pgdat->node_id;
+	struct node_mem_region *region;
+	struct zone *z;
+
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		z = &pgdat->node_zones[i];
+		idx = 0;
+
+		for (j = 0; j < pgdat->nr_node_regions; j++) {
+			region = &pgdat->node_regions[j];
+			start_pfn = max(z->zone_start_pfn, region->start_pfn);
+			end_pfn = min(z->zone_start_pfn + z->spanned_pages,
+				      region->start_pfn + region->spanned_pages);
+
+			if (start_pfn >= end_pfn)
+				continue;
+
+			z->zone_mem_region[idx].start_pfn = start_pfn;
+			z->zone_mem_region[idx].spanned_pages = end_pfn - start_pfn;
+
+			absent = __absent_pages_in_range(nid, start_pfn,
+						         end_pfn);
+			z->zone_mem_region[idx].present_pages =
+						end_pfn - start_pfn - absent;
+			idx++;
+		}
+
+		z->nr_zone_regions = idx;
+	}
+}
+
+void init_memory_regions(struct pglist_data *pgdat)
+{
+	init_node_memory_regions(pgdat);
+	init_zone_memory_regions(pgdat);
+}
+
 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 		unsigned long node_start_pfn, unsigned long *zholes_size)
 {
@@ -4615,7 +4655,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 #endif
 
 	free_area_init_core(pgdat, zones_size, zholes_size);
-	init_node_memory_regions(pgdat);
+	init_memory_regions(pgdat);
 }
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [RFC PATCH 2/8] mm: Initialize node memory regions during boot
From: Srivatsa S. Bhat @ 2012-11-06 19:52 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel
In-Reply-To: <20121106195026.6941.24662.stgit@srivatsabhat.in.ibm.com>

Initialize the node's memory regions structures with the information about
the region-boundaries, at boot time.

Based-on-patch-by: Ankita Garg <gargankita@gmail.com>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 include/linux/mm.h |    4 ++++
 mm/page_alloc.c    |   35 +++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa06804..19c4fb0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -657,6 +657,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 #define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
 #define ZONEID_MASK		((1UL << ZONEID_SHIFT) - 1)
 
+/* Hard-code memory regions size to be 512 MB for now. */
+#define MEM_REGION_SHIFT	(29 - PAGE_SHIFT)
+#define MEM_REGION_SIZE		(1UL << MEM_REGION_SHIFT)
+
 static inline enum zone_type page_zonenum(const struct page *page)
 {
 	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bb90971..709e3c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4560,6 +4560,40 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
 }
 
+void init_node_memory_regions(struct pglist_data *pgdat)
+{
+	int nid = pgdat->node_id;
+	unsigned long start_pfn = pgdat->node_start_pfn;
+	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
+	unsigned long i, absent;
+	int idx;
+	struct node_mem_region *region;
+
+	for (i = start_pfn, idx = 0; i < end_pfn;
+				i += region->spanned_pages, idx++) {
+
+		region = &pgdat->node_regions[idx];
+
+		if (i + MEM_REGION_SIZE <= end_pfn) {
+			region->start_pfn = i;
+			region->spanned_pages = MEM_REGION_SIZE;
+		} else {
+			region->start_pfn = i;
+			region->spanned_pages = end_pfn - i;
+		}
+
+		absent = __absent_pages_in_range(nid, region->start_pfn,
+						 region->start_pfn +
+						 region->spanned_pages);
+
+		region->present_pages = region->spanned_pages - absent;
+		region->idx = idx;
+		region->node = nid;
+		region->pgdat = pgdat;
+		pgdat->nr_node_regions++;
+	}
+}
+
 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 		unsigned long node_start_pfn, unsigned long *zholes_size)
 {
@@ -4581,6 +4615,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 #endif
 
 	free_area_init_core(pgdat, zones_size, zholes_size);
+	init_node_memory_regions(pgdat);
 }
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* Re: [PATCH 6/7] ACPI / PM: Move device PM functions related to sleep states
From: David Rientjes @ 2012-11-06 19:52 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Aaron Lu, Huang Ying, Len Brown, Lv Zheng, Adrian Hunter,
	linux-kernel, linux-pm, linux-acpi
In-Reply-To: <1456107.KxymqUQLId@vostro.rjw.lan>

Commit b87b49cd0efd ("ACPI / PM: Move device PM functions related to sleep 
states") declared acpi_target_system_state() for CONFIG_PM_SLEEP whereas 
it is only defined for CONFIG_ACPI_SLEEP, resulting in the following link 
error:

drivers/built-in.o: In function `acpi_pm_device_sleep_wake':
drivers/acpi/device_pm.c:342: undefined reference to `acpi_target_system_state'
drivers/built-in.o: In function `acpi_dev_suspend_late':
drivers/acpi/device_pm.c:501: undefined reference to `acpi_target_system_state'
drivers/built-in.o: In function `acpi_pm_device_sleep_state':
drivers/acpi/device_pm.c:221: undefined reference to `acpi_target_system_state'

Define it only for CONFIG_ACPI_SLEEP and fallback to a dummy definition 
for other configs.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/acpi/acpi_bus.h |    8 ++++++--
 1 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -469,11 +469,9 @@ static inline int acpi_pm_device_run_wake(struct device *dev, bool enable)
 #endif
 
 #ifdef CONFIG_PM_SLEEP
-u32 acpi_target_system_state(void);
 int __acpi_device_sleep_wake(struct acpi_device *, u32, bool);
 int acpi_pm_device_sleep_wake(struct device *, bool);
 #else
-static inline u32 acpi_target_system_state(void) { return ACPI_STATE_S0; }
 static inline int __acpi_device_sleep_wake(struct acpi_device *adev,
 					   u32 target_state, bool enable)
 {
@@ -485,6 +483,12 @@ static inline int acpi_pm_device_sleep_wake(struct device *dev, bool enable)
 }
 #endif
 
+#ifdef CONFIG_ACPI_SLEEP
+u32 acpi_target_system_state(void);
+#else
+static inline u32 acpi_target_system_state(void) { return ACPI_STATE_S0; }
+#endif
+
 static inline bool acpi_device_power_manageable(struct acpi_device *adev)
 {
 	return adev->flags.power_manageable;

^ permalink raw reply

* [RFC PATCH 1/8] mm: Introduce memory regions data-structure to capture region boundaries within node
From: Srivatsa S. Bhat @ 2012-11-06 19:52 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel
In-Reply-To: <20121106195026.6941.24662.stgit@srivatsabhat.in.ibm.com>

Within a node, we can have regions of memory that can be power-managed.
That is, chunks of memory can be transitioned (manually or automatically)
to low-power states based on the frequency of references to that region.
For example, if a memory chunk is not referenced for a given threshold
amount of time, the hardware can decide to put that piece of memory into
a content-preserving low-power state. And of course, on the next reference
to that chunk of memory, it will be transitioned to full-power for
read/write operations.

We propose to incorporate this knowledge of power-manageable chunks of
memory into a new data-structure called "Memory Regions". This way of
acknowledging the existence of different classes of memory with different
characteristics is the first step towards managing memory
power-efficiently, such as performing power-aware memory allocation etc.

[Also, the concept of memory regions could potentially be extended to work
with different classes of memory like PCM (Phase Change Memory) etc and
hence, it is not limited to just power management alone].

We already sub-divide a node's memory into zones, based on some well-known
constraints. So the question is, where do we fit in memory regions in this
hierarchy? Instead of artificially trying to fit it into the hierarchy one
way or the other, we choose to simply capture the region boundaries in a
parallel data-structure, since there is no guarantee that the region
boundaries will naturally fit inside zone boundaries or vice-versa.

But of course, memory regions are sub-divisions *within* a node, so it makes
sense to keep the data-structures in the node's struct pglist_data. (Thus
this placement makes memory regions parallel to zones in that node).

Once we capture the region boundaries in the memory regions data-structure,
we can influence MM decisions at various places, such as page allocation,
reclamation etc.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 include/linux/mmzone.h |   13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 50aaca8..bb7c3ef 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -80,6 +80,8 @@ static inline int get_pageblock_migratetype(struct page *page)
 	return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end);
 }
 
+#define MAX_NR_REGIONS	256
+
 struct free_area {
 	struct list_head	free_list[MIGRATE_TYPES];
 	unsigned long		nr_free;
@@ -328,6 +330,15 @@ enum zone_type {
 #error ZONES_SHIFT -- too many zones configured adjust calculation
 #endif
 
+struct node_mem_region {
+	unsigned long start_pfn;
+	unsigned long present_pages;
+	unsigned long spanned_pages;
+	int idx;
+	int node;
+	struct pglist_data *pgdat;
+};
+
 struct zone {
 	/* Fields commonly accessed by the page allocator */
 
@@ -687,6 +698,8 @@ typedef struct pglist_data {
 	struct zone node_zones[MAX_NR_ZONES];
 	struct zonelist node_zonelists[MAX_ZONELISTS];
 	int nr_zones;
+	struct node_mem_region node_regions[MAX_NR_REGIONS];
+	int nr_node_regions;
 #ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
 	struct page *node_mem_map;
 #ifdef CONFIG_MEMCG

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [RFC PATCH 05/10] mm: Create zonelists
From: Srivatsa S. Bhat @ 2012-11-06 19:40 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel
In-Reply-To: <20121106193650.6560.71366.stgit@srivatsabhat.in.ibm.com>

From: Ankita Garg <gargankita@gmail.com>

The default zonelist that is node ordered contains all zones from within a
node and then all zones from the next node and so on. By introducing memory
regions, the primary aim is to group memory allocations to a given area of
memory together. The modified zonelists thus contain all zones from one
region, followed by all zones from the next region and so on. This ensures
that all the memory in one region is allocated before going over to the next
region, unless targeted memory allocations are performed.

Signed-off-by: Ankita Garg <gargankita@gmail.com>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 mm/page_alloc.c |   69 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 42 insertions(+), 27 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a8e86b5..9c1d680 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3040,21 +3040,25 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
 				int nr_zones, enum zone_type zone_type)
 {
+	enum zone_type z_type = zone_type;
+	struct mem_region *region;
 	struct zone *zone;
 
 	BUG_ON(zone_type >= MAX_NR_ZONES);
 	zone_type++;
 
-	do {
-		zone_type--;
-		zone = pgdat->node_zones + zone_type;
-		if (populated_zone(zone)) {
-			zoneref_set_zone(zone,
-				&zonelist->_zonerefs[nr_zones++]);
-			check_highest_zone(zone_type);
-		}
-
-	} while (zone_type);
+	for_each_mem_region_in_node(region, pgdat->node_id) {
+		do {
+			zone_type--;
+			zone = region->region_zones + zone_type;
+			if (populated_zone(zone)) {
+				zoneref_set_zone(zone,
+					&zonelist->_zonerefs[nr_zones++]);
+				check_highest_zone(zone_type);
+			}
+		} while (zone_type);
+		zone_type = z_type + 1;
+	}
 	return nr_zones;
 }
 
@@ -3275,17 +3279,20 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
 	int zone_type;		/* needs to be signed */
 	struct zone *z;
 	struct zonelist *zonelist;
+	struct mem_region *region;
 
 	zonelist = &pgdat->node_zonelists[0];
 	pos = 0;
 	for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
 		for (j = 0; j < nr_nodes; j++) {
 			node = node_order[j];
-			z = &NODE_DATA(node)->node_zones[zone_type];
-			if (populated_zone(z)) {
-				zoneref_set_zone(z,
-					&zonelist->_zonerefs[pos++]);
-				check_highest_zone(zone_type);
+			for_each_mem_region_in_node(region, node) {
+				z = &region->region_zones[zone_type];
+				if (populated_zone(z)) {
+					zoneref_set_zone(z,
+						&zonelist->_zonerefs[pos++]);
+					check_highest_zone(zone_type);
+				}
 			}
 		}
 	}
@@ -3299,6 +3306,8 @@ static int default_zonelist_order(void)
 	unsigned long low_kmem_size,total_size;
 	struct zone *z;
 	int average_size;
+	struct mem_region *region;
+
 	/*
          * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
 	 * If they are really small and used heavily, the system can fall
@@ -3310,12 +3319,15 @@ static int default_zonelist_order(void)
 	total_size = 0;
 	for_each_online_node(nid) {
 		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
-			z = &NODE_DATA(nid)->node_zones[zone_type];
-			if (populated_zone(z)) {
-				if (zone_type < ZONE_NORMAL)
-					low_kmem_size += z->present_pages;
-				total_size += z->present_pages;
-			} else if (zone_type == ZONE_NORMAL) {
+			for_each_mem_region_in_node(region, nid) {
+				z = &region->region_zones[zone_type];
+				if (populated_zone(z)) {
+					if (zone_type < ZONE_NORMAL)
+						low_kmem_size +=
+							z->present_pages;
+
+					total_size += z->present_pages;
+				} else if (zone_type == ZONE_NORMAL) {
 				/*
 				 * If any node has only lowmem, then node order
 				 * is preferred to allow kernel allocations
@@ -3323,7 +3335,8 @@ static int default_zonelist_order(void)
 				 * on other nodes when there is an abundance of
 				 * lowmem available to allocate from.
 				 */
-				return ZONELIST_ORDER_NODE;
+					return ZONELIST_ORDER_NODE;
+				}
 			}
 		}
 	}
@@ -3341,11 +3354,13 @@ static int default_zonelist_order(void)
 		low_kmem_size = 0;
 		total_size = 0;
 		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
-			z = &NODE_DATA(nid)->node_zones[zone_type];
-			if (populated_zone(z)) {
-				if (zone_type < ZONE_NORMAL)
-					low_kmem_size += z->present_pages;
-				total_size += z->present_pages;
+			for_each_mem_region_in_node(region, nid) {
+				z = &region->region_zones[zone_type];
+				if (populated_zone(z)) {
+					if (zone_type < ZONE_NORMAL)
+						low_kmem_size += z->present_pages;
+					total_size += z->present_pages;
+				}
 			}
 		}
 		if (low_kmem_size &&


^ permalink raw reply related

* [RFC PATCH 10/10] mm: Create memory regions at boot-up
From: Srivatsa S. Bhat @ 2012-11-06 19:42 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel
In-Reply-To: <20121106193650.6560.71366.stgit@srivatsabhat.in.ibm.com>

From: Ankita Garg <gargankita@gmail.com>

Memory regions are created at boot up time, from the information obtained
from the firmware. But since the firmware doesn't yet export information
about memory units that can be independently power managed, for the purpose
of demonstration, we hard code memory region size to be 512MB.

In future, we expect ACPI 5.0 compliant firmware to expose the required
info in the form of MPST (Memory Power State Table) tables.

Signed-off-by: Ankita Garg <gargankita@gmail.com>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 mm/page_alloc.c |   28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9c1d680..13d1b2f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4491,6 +4491,33 @@ void __init set_pageblock_order(void)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
+#define REGIONS_SIZE	(512 << 20) >> PAGE_SHIFT
+
+static void init_node_memory_regions(struct pglist_data *pgdat)
+{
+	int cnt = 0;
+	unsigned long i;
+	unsigned long start_pfn = pgdat->node_start_pfn;
+	unsigned long spanned_pages = pgdat->node_spanned_pages;
+	unsigned long total = 0;
+
+	for (i = start_pfn; i < start_pfn + spanned_pages; i += REGIONS_SIZE) {
+		struct mem_region *region = &pgdat->node_regions[cnt];
+
+		region->start_pfn = i;
+		if ((spanned_pages - total) < REGIONS_SIZE)
+			region->spanned_pages = spanned_pages - total;
+		else
+			region->spanned_pages = REGIONS_SIZE;
+
+		region->node = pgdat->node_id;
+		region->region = cnt;
+		pgdat->nr_node_regions++;
+		total += region->spanned_pages;
+		cnt++;
+	}
+}
+
 /*
  * Set up the zone data structures:
  *   - mark all pages reserved
@@ -4653,6 +4680,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 		(unsigned long)pgdat->node_mem_map);
 #endif
 
+	init_node_memory_regions(pgdat);
 	free_area_init_core(pgdat, zones_size, zholes_size);
 }
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [RFC PATCH 09/10] mm: Reflect memory region changes in zoneinfo
From: Srivatsa S. Bhat @ 2012-11-06 19:41 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel
In-Reply-To: <20121106193650.6560.71366.stgit@srivatsabhat.in.ibm.com>

From: Ankita Garg <gargankita@gmail.com>

This patch modifies the output of /proc/zoneinfo to take the memory regions
into account. Below is the output on a KVM guest booted with 4 regions,
each of size 512MB.

cat /proc/zoneinfo:

Node 0, Region 0, zone      DMA
  pages free     3975
        min      11
        low      13
        high     16
        scanned  0
        spanned  4080
        present  3977
    nr_free_pages 3975
    nr_inactive_anon 0
    nr_active_anon 0
    nr_inactive_file 0
    nr_active_file 0
    nr_unevictable 0
    nr_mlock     0
    nr_anon_pages 0
    nr_mapped    0
    nr_file_pages 0
    nr_dirty     0
    nr_writeback 0
    nr_slab_reclaimable 0
    nr_slab_unreclaimable 2
    nr_page_table_pages 0
    nr_kernel_stack 0
    nr_unstable  0
    nr_bounce    0
    nr_vmscan_write 0
    nr_vmscan_immediate_reclaim 0
    nr_writeback_temp 0
    nr_isolated_anon 0
    nr_isolated_file 0
    nr_shmem     0
    nr_dirtied   0
    nr_written   0
    nr_anon_transparent_hugepages 0
    nr_free_cma  0
        protection: (0, 471, 471, 471)
  pagesets
    cpu: 0
              count: 0
              high:  0
              batch: 1
  vm stats threshold: 6
    cpu: 1
              count: 0
              high:  0
              batch: 1
  vm stats threshold: 6
    cpu: 2
              count: 0
              high:  0
              batch: 1
  vm stats threshold: 6
    cpu: 3
              count: 0
              high:  0
              batch: 1
  vm stats threshold: 6
  all_unreclaimable: 0
  start_pfn:         16
  inactive_ratio:    1
Node 0, Region 0, zone    DMA32
  pages free     107720
        min      338
        low      422
        high     507
        scanned  0
        spanned  126992
        present  120642
.....
Node 0, Region 1, zone    DMA32
  pages free     131072
        min      367
        low      458
        high     550
        scanned  0
        spanned  131072
        present  131072
.....
Node 0, Region 2, zone    DMA32
  pages free     131072
        min      367
        low      458
        high     550
        scanned  0
        spanned  131072
        present  131072
.....
Node 0, Region 3, zone    DMA32
  pages free     121880
        min      341
        low      426
        high     511
        scanned  0
        spanned  131054
        present  121928
.....

Signed-off-by: Ankita Garg <gargankita@gmail.com>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 mm/vmstat.c |   31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 86a92a6..b3be9ba 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -179,9 +179,12 @@ void refresh_zone_stat_thresholds(void)
 		 */
 		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
 		max_drift = num_online_cpus() * threshold;
-		if (max_drift > tolerate_drift)
+		if (max_drift > tolerate_drift) {
 			zone->percpu_drift_mark = high_wmark_pages(zone) +
 					max_drift;
+			printk("zone %s drift mark %lu \n", zone->name,
+						zone->percpu_drift_mark);
+		}
 	}
 }
 
@@ -189,12 +192,11 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
 				int (*calculate_pressure)(struct zone *))
 {
 	struct mem_region *region;
-	struct zone *zone;
 	int cpu;
 	int threshold;
 	int i;
 
-	for (i = 0; i < pgdat->nr_zones; i++) {
+	for (i = 0; i < pgdat->nr_node_zone_types; i++) {
 		for_each_mem_region_in_node(region, pgdat->node_id) {
 			struct zone *zone = region->region_zones + i;
 
@@ -818,11 +820,12 @@ const char * const vmstat_text[] = {
 
 #ifdef CONFIG_PROC_FS
 static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
-						struct zone *zone)
+					struct mem_region *region, struct zone *zone)
 {
 	int order;
 
-	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+	seq_printf(m, "Node %d, REG %d, zone %8s ", pgdat->node_id,
+						region->region, zone->name);
 	for (order = 0; order < MAX_ORDER; ++order)
 		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
 	seq_putc(m, '\n');
@@ -838,14 +841,15 @@ static int frag_show(struct seq_file *m, void *arg)
 	return 0;
 }
 
-static void pagetypeinfo_showfree_print(struct seq_file *m,
-					pg_data_t *pgdat, struct zone *zone)
+static void pagetypeinfo_showfree_print(struct seq_file *m, pg_data_t *pgdat,
+						struct mem_region *region, struct zone *zone)
 {
 	int order, mtype;
 
 	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
-		seq_printf(m, "Node %4d, zone %8s, type %12s ",
+		seq_printf(m, "Node %4d, Region %d, zone %8s, type %12s ",
 					pgdat->node_id,
+					region->region,
 					zone->name,
 					migratetype_names[mtype]);
 		for (order = 0; order < MAX_ORDER; ++order) {
@@ -880,8 +884,8 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
 	return 0;
 }
 
-static void pagetypeinfo_showblockcount_print(struct seq_file *m,
-					pg_data_t *pgdat, struct zone *zone)
+static void pagetypeinfo_showblockcount_print(struct seq_file *m, pg_data_t *pgdat,
+							struct mem_region *region, struct zone *zone)
 {
 	int mtype;
 	unsigned long pfn;
@@ -908,7 +912,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
 	}
 
 	/* Print counts */
-	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+	seq_printf(m, "Node %d, Region %d, zone %8s ", pgdat->node_id, region->region, zone->name);
 	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
 		seq_printf(m, "%12lu ", count[mtype]);
 	seq_putc(m, '\n');
@@ -989,10 +993,11 @@ static const struct file_operations pagetypeinfo_file_ops = {
 };
 
 static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
-							struct zone *zone)
+					struct mem_region *region, struct zone *zone)
 {
 	int i;
-	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+	seq_printf(m, "Node %d, Region %d, zone %8s", pgdat->node_id,
+						region->region, zone->name);
 	seq_printf(m,
 		   "\n  pages free     %lu"
 		   "\n        min      %lu"

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [RFC PATCH 08/10] mm: Modify vmscan
From: Srivatsa S. Bhat @ 2012-11-06 19:41 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel
In-Reply-To: <20121106193650.6560.71366.stgit@srivatsabhat.in.ibm.com>

From: Ankita Garg <gargankita@gmail.com>

Modify vmscan to take into account the changed node-zone hierarchy.

Signed-off-by: Ankita Garg <gargankita@gmail.com>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 mm/vmscan.c |  364 +++++++++++++++++++++++++++++++----------------------------
 1 file changed, 193 insertions(+), 171 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2624edc..4d8f303 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2209,11 +2209,14 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 	unsigned long free_pages = 0;
 	int i;
 	bool wmark_ok;
+	struct mem_region *region;
 
 	for (i = 0; i <= ZONE_NORMAL; i++) {
-		zone = &pgdat->node_zones[i];
-		pfmemalloc_reserve += min_wmark_pages(zone);
-		free_pages += zone_page_state(zone, NR_FREE_PAGES);
+		for_each_mem_region_in_node(region, pgdat->node_id) {
+			zone = &region->region_zones[i];
+			pfmemalloc_reserve += min_wmark_pages(zone);
+			free_pages += zone_page_state(zone, NR_FREE_PAGES);
+		}
 	}
 
 	wmark_ok = free_pages > pfmemalloc_reserve / 2;
@@ -2442,10 +2445,16 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
 						int classzone_idx)
 {
 	unsigned long present_pages = 0;
+	struct mem_region *region;
 	int i;
 
-	for (i = 0; i <= classzone_idx; i++)
-		present_pages += pgdat->node_zones[i].present_pages;
+	for (i = 0; i <= classzone_idx; i++) {
+		for_each_mem_region_in_node(region, pgdat->node_id) {
+			struct zone *zone = region->region_zones + i;
+
+			present_pages += zone->present_pages;
+		}
+	}
 
 	/* A special case here: if zone has no page, we think it's balanced */
 	return balanced_pages >= (present_pages >> 2);
@@ -2463,6 +2472,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 	int i;
 	unsigned long balanced = 0;
 	bool all_zones_ok = true;
+	struct mem_region *region;
 
 	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
 	if (remaining)
@@ -2484,27 +2494,29 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 
 	/* Check the watermark levels */
 	for (i = 0; i <= classzone_idx; i++) {
-		struct zone *zone = pgdat->node_zones + i;
+		for_each_mem_region_in_node(region, pgdat->node_id) {
+			struct zone *zone = region->region_zones + i;
 
-		if (!populated_zone(zone))
-			continue;
+			if (!populated_zone(zone))
+				continue;
 
-		/*
-		 * balance_pgdat() skips over all_unreclaimable after
-		 * DEF_PRIORITY. Effectively, it considers them balanced so
-		 * they must be considered balanced here as well if kswapd
-		 * is to sleep
-		 */
-		if (zone->all_unreclaimable) {
-			balanced += zone->present_pages;
-			continue;
-		}
+				/*
+			 * balance_pgdat() skips over all_unreclaimable after
+			 * DEF_PRIORITY. Effectively, it considers them balanced so
+			 * they must be considered balanced here as well if kswapd
+			 * is to sleep
+			 */
+			if (zone->all_unreclaimable) {
+				balanced += zone->present_pages;
+				continue;
+			}
 
-		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
-							i, 0))
-			all_zones_ok = false;
-		else
-			balanced += zone->present_pages;
+			if (!zone_watermark_ok_safe(zone, order,
+						high_wmark_pages(zone), i, 0))
+				all_zones_ok = false;
+			else
+				balanced += zone->present_pages;
+		}
 	}
 
 	/*
@@ -2565,6 +2577,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 	struct shrink_control shrink = {
 		.gfp_mask = sc.gfp_mask,
 	};
+	struct mem_region *region;
 loop_again:
 	total_scanned = 0;
 	sc.priority = DEF_PRIORITY;
@@ -2583,49 +2596,55 @@ loop_again:
 		 * Scan in the highmem->dma direction for the highest
 		 * zone which needs scanning
 		 */
-		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
-			struct zone *zone = pgdat->node_zones + i;
+		for (i = pgdat->nr_node_zone_types - 1; i >= 0; i--) {
+			for_each_mem_region_in_node(region, pgdat->node_id) {
+				struct zone *zone = region->region_zones + i;
 
-			if (!populated_zone(zone))
-				continue;
+				if (!populated_zone(zone))
+					continue;
 
-			if (zone->all_unreclaimable &&
-			    sc.priority != DEF_PRIORITY)
-				continue;
+				if (zone->all_unreclaimable &&
+				    sc.priority != DEF_PRIORITY)
+					continue;
 
-			/*
-			 * Do some background aging of the anon list, to give
-			 * pages a chance to be referenced before reclaiming.
-			 */
-			age_active_anon(zone, &sc);
+				/*
+				 * Do some background aging of the anon list, to give
+				 * pages a chance to be referenced before reclaiming.
+				 */
+				age_active_anon(zone, &sc);
 
-			/*
-			 * If the number of buffer_heads in the machine
-			 * exceeds the maximum allowed level and this node
-			 * has a highmem zone, force kswapd to reclaim from
-			 * it to relieve lowmem pressure.
-			 */
-			if (buffer_heads_over_limit && is_highmem_idx(i)) {
-				end_zone = i;
-				break;
-			}
+				/*
+				 * If the number of buffer_heads in the machine
+				 * exceeds the maximum allowed level and this node
+				 * has a highmem zone, force kswapd to reclaim from
+				 * it to relieve lowmem pressure.
+				 */
+				if (buffer_heads_over_limit && is_highmem_idx(i)) {
+					end_zone = i;
+					goto out_loop;
+				}
 
-			if (!zone_watermark_ok_safe(zone, order,
-					high_wmark_pages(zone), 0, 0)) {
-				end_zone = i;
-				break;
-			} else {
-				/* If balanced, clear the congested flag */
-				zone_clear_flag(zone, ZONE_CONGESTED);
+				if (!zone_watermark_ok_safe(zone, order,
+						high_wmark_pages(zone), 0, 0)) {
+					end_zone = i;
+					goto out_loop;
+				} else {
+					/* If balanced, clear the congested flag */
+					zone_clear_flag(zone, ZONE_CONGESTED);
+				}
 			}
 		}
+
+	out_loop:
 		if (i < 0)
 			goto out;
 
 		for (i = 0; i <= end_zone; i++) {
-			struct zone *zone = pgdat->node_zones + i;
+			for_each_mem_region_in_node(region, pgdat->node_id) {
+				struct zone *zone = region->region_zones + i;
 
-			lru_pages += zone_reclaimable_pages(zone);
+				lru_pages += zone_reclaimable_pages(zone);
+			}
 		}
 
 		/*
@@ -2638,108 +2657,109 @@ loop_again:
 		 * cause too much scanning of the lower zones.
 		 */
 		for (i = 0; i <= end_zone; i++) {
-			struct zone *zone = pgdat->node_zones + i;
-			int nr_slab, testorder;
-			unsigned long balance_gap;
-
-			if (!populated_zone(zone))
-				continue;
+			for_each_mem_region_in_node(region, pgdat->node_id) {
+				struct zone *zone = region->region_zones + i;
+				int nr_slab, testorder;
+				unsigned long balance_gap;
 
-			if (zone->all_unreclaimable &&
-			    sc.priority != DEF_PRIORITY)
-				continue;
-
-			sc.nr_scanned = 0;
-
-			nr_soft_scanned = 0;
-			/*
-			 * Call soft limit reclaim before calling shrink_zone.
-			 */
-			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
-							order, sc.gfp_mask,
-							&nr_soft_scanned);
-			sc.nr_reclaimed += nr_soft_reclaimed;
-			total_scanned += nr_soft_scanned;
-
-			/*
-			 * We put equal pressure on every zone, unless
-			 * one zone has way too many pages free
-			 * already. The "too many pages" is defined
-			 * as the high wmark plus a "gap" where the
-			 * gap is either the low watermark or 1%
-			 * of the zone, whichever is smaller.
-			 */
-			balance_gap = min(low_wmark_pages(zone),
-				(zone->present_pages +
-					KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
-				KSWAPD_ZONE_BALANCE_GAP_RATIO);
-			/*
-			 * Kswapd reclaims only single pages with compaction
-			 * enabled. Trying too hard to reclaim until contiguous
-			 * free pages have become available can hurt performance
-			 * by evicting too much useful data from memory.
-			 * Do not reclaim more than needed for compaction.
-			 */
-			testorder = order;
-			if (COMPACTION_BUILD && order &&
-					compaction_suitable(zone, order) !=
-						COMPACT_SKIPPED)
-				testorder = 0;
-
-			if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
-				    !zone_watermark_ok_safe(zone, testorder,
-					high_wmark_pages(zone) + balance_gap,
-					end_zone, 0)) {
-				shrink_zone(zone, &sc);
-
-				reclaim_state->reclaimed_slab = 0;
-				nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
-				sc.nr_reclaimed += reclaim_state->reclaimed_slab;
-				total_scanned += sc.nr_scanned;
+				if (!populated_zone(zone))
+					continue;
 
-				if (nr_slab == 0 && !zone_reclaimable(zone))
-					zone->all_unreclaimable = 1;
-			}
+				if (zone->all_unreclaimable &&
+				    sc.priority != DEF_PRIORITY)
+					continue;
 
-			/*
-			 * If we've done a decent amount of scanning and
-			 * the reclaim ratio is low, start doing writepage
-			 * even in laptop mode
-			 */
-			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
-			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
-				sc.may_writepage = 1;
+				sc.nr_scanned = 0;
 
-			if (zone->all_unreclaimable) {
-				if (end_zone && end_zone == i)
-					end_zone--;
-				continue;
-			}
+				nr_soft_scanned = 0;
+				/*
+				 * Call soft limit reclaim before calling shrink_zone.
+				 */
+				nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+								order, sc.gfp_mask,
+								&nr_soft_scanned);
+				sc.nr_reclaimed += nr_soft_reclaimed;
+				total_scanned += nr_soft_scanned;
 
-			if (!zone_watermark_ok_safe(zone, testorder,
-					high_wmark_pages(zone), end_zone, 0)) {
-				all_zones_ok = 0;
 				/*
-				 * We are still under min water mark.  This
-				 * means that we have a GFP_ATOMIC allocation
-				 * failure risk. Hurry up!
+				 * We put equal pressure on every zone, unless
+				 * one zone has way too many pages free
+				 * already. The "too many pages" is defined
+				 * as the high wmark plus a "gap" where the
+				 * gap is either the low watermark or 1%
+				 * of the zone, whichever is smaller.
 				 */
-				if (!zone_watermark_ok_safe(zone, order,
-					    min_wmark_pages(zone), end_zone, 0))
-					has_under_min_watermark_zone = 1;
-			} else {
+				balance_gap = min(low_wmark_pages(zone),
+					(zone->present_pages +
+						KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+					KSWAPD_ZONE_BALANCE_GAP_RATIO);
 				/*
-				 * If a zone reaches its high watermark,
-				 * consider it to be no longer congested. It's
-				 * possible there are dirty pages backed by
-				 * congested BDIs but as pressure is relieved,
-				 * speculatively avoid congestion waits
+				 * Kswapd reclaims only single pages with compaction
+				 * enabled. Trying too hard to reclaim until contiguous
+				 * free pages have become available can hurt performance
+				 * by evicting too much useful data from memory.
+				 * Do not reclaim more than needed for compaction.
 				 */
-				zone_clear_flag(zone, ZONE_CONGESTED);
-				if (i <= *classzone_idx)
-					balanced += zone->present_pages;
-			}
+				testorder = order;
+				if (COMPACTION_BUILD && order &&
+						compaction_suitable(zone, order) !=
+							COMPACT_SKIPPED)
+					testorder = 0;
+
+				if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
+					    !zone_watermark_ok_safe(zone, testorder,
+						high_wmark_pages(zone) + balance_gap,
+						end_zone, 0)) {
+					shrink_zone(zone, &sc);
+
+					reclaim_state->reclaimed_slab = 0;
+					nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
+					sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+					total_scanned += sc.nr_scanned;
+
+					if (nr_slab == 0 && !zone_reclaimable(zone))
+						zone->all_unreclaimable = 1;
+				}
 
+				/*
+				 * If we've done a decent amount of scanning and
+				 * the reclaim ratio is low, start doing writepage
+				 * even in laptop mode
+				 */
+				if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
+				    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
+					sc.may_writepage = 1;
+
+				if (zone->all_unreclaimable) {
+					if (end_zone && end_zone == i)
+						end_zone--;
+					continue;
+				}
+
+				if (!zone_watermark_ok_safe(zone, testorder,
+						high_wmark_pages(zone), end_zone, 0)) {
+					all_zones_ok = 0;
+					/*
+					 * We are still under min water mark.  This
+					 * means that we have a GFP_ATOMIC allocation
+					 * failure risk. Hurry up!
+					 */
+					if (!zone_watermark_ok_safe(zone, order,
+						    min_wmark_pages(zone), end_zone, 0))
+						has_under_min_watermark_zone = 1;
+				} else {
+					/*
+					 * If a zone reaches its high watermark,
+					 * consider it to be no longer congested. It's
+					 * possible there are dirty pages backed by
+					 * congested BDIs but as pressure is relieved,
+					 * speculatively avoid congestion waits
+					 */
+					zone_clear_flag(zone, ZONE_CONGESTED);
+					if (i <= *classzone_idx)
+						balanced += zone->present_pages;
+				}
+			}
 		}
 
 		/*
@@ -2817,34 +2837,36 @@ out:
 		int zones_need_compaction = 1;
 
 		for (i = 0; i <= end_zone; i++) {
-			struct zone *zone = pgdat->node_zones + i;
+			for_each_mem_region_in_node(region, pgdat->node_id) {
+				struct zone *zone = region->region_zones + i;
 
-			if (!populated_zone(zone))
-				continue;
+				if (!populated_zone(zone))
+					continue;
 
-			if (zone->all_unreclaimable &&
-			    sc.priority != DEF_PRIORITY)
-				continue;
+				if (zone->all_unreclaimable &&
+				    sc.priority != DEF_PRIORITY)
+					continue;
 
-			/* Would compaction fail due to lack of free memory? */
-			if (COMPACTION_BUILD &&
-			    compaction_suitable(zone, order) == COMPACT_SKIPPED)
-				goto loop_again;
+				/* Would compaction fail due to lack of free memory? */
+				if (COMPACTION_BUILD &&
+				    compaction_suitable(zone, order) == COMPACT_SKIPPED)
+					goto loop_again;
 
-			/* Confirm the zone is balanced for order-0 */
-			if (!zone_watermark_ok(zone, 0,
-					high_wmark_pages(zone), 0, 0)) {
-				order = sc.order = 0;
-				goto loop_again;
-			}
+				/* Confirm the zone is balanced for order-0 */
+				if (!zone_watermark_ok(zone, 0,
+						high_wmark_pages(zone), 0, 0)) {
+					order = sc.order = 0;
+					goto loop_again;
+				}
 
-			/* Check if the memory needs to be defragmented. */
-			if (zone_watermark_ok(zone, order,
-				    low_wmark_pages(zone), *classzone_idx, 0))
-				zones_need_compaction = 0;
+				/* Check if the memory needs to be defragmented. */
+				if (zone_watermark_ok(zone, order,
+					    low_wmark_pages(zone), *classzone_idx, 0))
+					zones_need_compaction = 0;
 
-			/* If balanced, clear the congested flag */
-			zone_clear_flag(zone, ZONE_CONGESTED);
+				/* If balanced, clear the congested flag */
+				zone_clear_flag(zone, ZONE_CONGESTED);
+			}
 		}
 
 		if (zones_need_compaction)
@@ -2966,7 +2988,7 @@ static int kswapd(void *p)
 
 	order = new_order = 0;
 	balanced_order = 0;
-	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+	classzone_idx = new_classzone_idx = pgdat->nr_node_zone_types - 1;
 	balanced_classzone_idx = classzone_idx;
 	for ( ; ; ) {
 		int ret;
@@ -2981,7 +3003,7 @@ static int kswapd(void *p)
 			new_order = pgdat->kswapd_max_order;
 			new_classzone_idx = pgdat->classzone_idx;
 			pgdat->kswapd_max_order =  0;
-			pgdat->classzone_idx = pgdat->nr_zones - 1;
+			pgdat->classzone_idx = pgdat->nr_node_zone_types - 1;
 		}
 
 		if (order < new_order || classzone_idx > new_classzone_idx) {
@@ -2999,7 +3021,7 @@ static int kswapd(void *p)
 			new_order = order;
 			new_classzone_idx = classzone_idx;
 			pgdat->kswapd_max_order = 0;
-			pgdat->classzone_idx = pgdat->nr_zones - 1;
+			pgdat->classzone_idx = pgdat->nr_node_zone_types - 1;
 		}
 
 		ret = try_to_freeze();

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [RFC PATCH 07/10] mm: Modify vmstat
From: Srivatsa S. Bhat @ 2012-11-06 19:41 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel
In-Reply-To: <20121106193650.6560.71366.stgit@srivatsabhat.in.ibm.com>

From: Ankita Garg <gargankita@gmail.com>

Change the way vmstats are collected. Since the zones are now present inside
regions, scan through all the regions to obtain zone-specific statistics.

Signed-off-by: Ankita Garg <gargankita@gmail.com>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 include/linux/vmstat.h |   21 ++++++++++++++-------
 mm/vmstat.c            |   40 ++++++++++++++++++++++++----------------
 2 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 92a86b2..a782f05 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -151,20 +151,27 @@ extern unsigned long zone_reclaimable_pages(struct zone *zone);
 static inline unsigned long node_page_state(int node,
 				 enum zone_stat_item item)
 {
-	struct zone *zones = NODE_DATA(node)->node_zones;
+	unsigned long page_state = 0;
+	struct mem_region *region;
+
+	for_each_mem_region_in_node(region, node) {
+		struct zone *zones = region->region_zones;
+
+		page_state =
 
-	return
 #ifdef CONFIG_ZONE_DMA
-		zone_page_state(&zones[ZONE_DMA], item) +
+			zone_page_state(&zones[ZONE_DMA], item) +
 #endif
 #ifdef CONFIG_ZONE_DMA32
-		zone_page_state(&zones[ZONE_DMA32], item) +
+			zone_page_state(&zones[ZONE_DMA32], item) +
 #endif
 #ifdef CONFIG_HIGHMEM
-		zone_page_state(&zones[ZONE_HIGHMEM], item) +
+			zone_page_state(&zones[ZONE_HIGHMEM], item) +
 #endif
-		zone_page_state(&zones[ZONE_NORMAL], item) +
-		zone_page_state(&zones[ZONE_MOVABLE], item);
+			zone_page_state(&zones[ZONE_NORMAL], item) +
+			zone_page_state(&zones[ZONE_MOVABLE], item);
+	}
+	return page_state;
 }
 
 extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c737057..86a92a6 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -188,20 +188,24 @@ void refresh_zone_stat_thresholds(void)
 void set_pgdat_percpu_threshold(pg_data_t *pgdat,
 				int (*calculate_pressure)(struct zone *))
 {
+	struct mem_region *region;
 	struct zone *zone;
 	int cpu;
 	int threshold;
 	int i;
 
 	for (i = 0; i < pgdat->nr_zones; i++) {
-		zone = &pgdat->node_zones[i];
-		if (!zone->percpu_drift_mark)
-			continue;
+		for_each_mem_region_in_node(region, pgdat->node_id) {
+			struct zone *zone = region->region_zones + i;
 
-		threshold = (*calculate_pressure)(zone);
-		for_each_possible_cpu(cpu)
-			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
-							= threshold;
+			if (!zone->percpu_drift_mark)
+				continue;
+
+			threshold = (*calculate_pressure)(zone);
+			for_each_possible_cpu(cpu)
+				per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+								= threshold;
+		}
 	}
 }
 
@@ -657,19 +661,23 @@ static void frag_stop(struct seq_file *m, void *arg)
 
 /* Walk all the zones in a node and print using a callback */
 static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
-		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
+			       void (*print)(struct seq_file *m, pg_data_t *,
+		               struct mem_region *, struct zone *))
 {
-	struct zone *zone;
-	struct zone *node_zones = pgdat->node_zones;
+	int i;
 	unsigned long flags;
+	struct mem_region *region;
 
-	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-		if (!populated_zone(zone))
-			continue;
+	for (i = 0; i < MAX_NR_ZONES; ++i) {
+		for_each_mem_region_in_node(region, pgdat->node_id) {
+			struct zone *zone = region->region_zones + i;
+			if (!populated_zone(zone))
+				continue;
 
-		spin_lock_irqsave(&zone->lock, flags);
-		print(m, pgdat, zone);
-		spin_unlock_irqrestore(&zone->lock, flags);
+			spin_lock_irqsave(&zone->lock, flags);
+			print(m, pgdat, region, zone);
+			spin_unlock_irqrestore(&zone->lock, flags);
+		}
 	}
 }
 #endif

^ permalink raw reply related

* [RFC PATCH 06/10] mm: Verify zonelists
From: Srivatsa S. Bhat @ 2012-11-06 19:41 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel
In-Reply-To: <20121106193650.6560.71366.stgit@srivatsabhat.in.ibm.com>

From: Ankita Garg <gargankita@gmail.com>

Verify that the zonelists were created appropriately.

Signed-off-by: Ankita Garg <gargankita@gmail.com>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 mm/mm_init.c |   57 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 30 insertions(+), 27 deletions(-)

diff --git a/mm/mm_init.c b/mm/mm_init.c
index 1ffd97a..5c19842 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -21,6 +21,7 @@ int mminit_loglevel;
 /* The zonelists are simply reported, validation is manual. */
 void mminit_verify_zonelist(void)
 {
+	struct mem_region *region;
 	int nid;
 
 	if (mminit_loglevel < MMINIT_VERIFY)
@@ -28,37 +29,39 @@ void mminit_verify_zonelist(void)
 
 	for_each_online_node(nid) {
 		pg_data_t *pgdat = NODE_DATA(nid);
-		struct zone *zone;
-		struct zoneref *z;
-		struct zonelist *zonelist;
-		int i, listid, zoneid;
-
-		BUG_ON(MAX_ZONELISTS > 2);
-		for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
-
-			/* Identify the zone and nodelist */
-			zoneid = i % MAX_NR_ZONES;
-			listid = i / MAX_NR_ZONES;
-			zonelist = &pgdat->node_zonelists[listid];
-			zone = &pgdat->node_zones[zoneid];
-			if (!populated_zone(zone))
-				continue;
-
-			/* Print information about the zonelist */
-			printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
-				listid > 0 ? "thisnode" : "general", nid,
-				zone->name);
-
-			/* Iterate the zonelist */
-			for_each_zone_zonelist(zone, z, zonelist, zoneid) {
+		for_each_mem_region_in_node(region, nid) {
+			struct zone *zone;
+			struct zoneref *z;
+			struct zonelist *zonelist;
+			int i, listid, zoneid;
+
+			BUG_ON(MAX_ZONELISTS > 2);
+			for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
+
+				/* Identify the zone and nodelist */
+				zoneid = i % MAX_NR_ZONES;
+				listid = i / MAX_NR_ZONES;
+				zonelist = &pgdat->node_zonelists[listid];
+				zone = &region->region_zones[zoneid];
+				if (!populated_zone(zone))
+					continue;
+
+				/* Print information about the zonelist */
+				printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
+					listid > 0 ? "thisnode" : "general", nid,
+					zone->name);
+
+				/* Iterate the zonelist */
+				for_each_zone_zonelist(zone, z, zonelist, zoneid) {
 #ifdef CONFIG_NUMA
-				printk(KERN_CONT "%d:%s ",
-					zone->node, zone->name);
+					printk(KERN_CONT "%d:%s ",
+						zone->node, zone->name);
 #else
-				printk(KERN_CONT "0:%s ", zone->name);
+					printk(KERN_CONT "0:%s ", zone->name);
 #endif /* CONFIG_NUMA */
+				}
+				printk(KERN_CONT "\n");
 			}
-			printk(KERN_CONT "\n");
 		}
 	}
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [RFC PATCH 01/10] mm: Introduce the memory regions data structure
From: Srivatsa S. Bhat @ 2012-11-06 19:39 UTC (permalink / raw)
  To: akpm, mgorman, mjg59, paulmck, dave, maxime.coquelin,
	loic.pallardy, arjan, kmpark, kamezawa.hiroyu, lenb, rjw
  Cc: gargankita, amit.kachhap, svaidy, thomas.abraham,
	santosh.shilimkar, srivatsa.bhat, linux-pm, linux-mm,
	linux-kernel
In-Reply-To: <20121106193650.6560.71366.stgit@srivatsabhat.in.ibm.com>

From: Ankita Garg <gargankita@gmail.com>

The memory region data structure is created under a NUMA node. Each NUMA node
can have multiple memory regions, depending upon the platform configuration for
power management. Each memory region contains zones, which are the entities from
which memory is allocated by the buddy allocator.

                 -------------
		 | pg_data_t |
                 -------------
                     |  |
		------  -------
		v             v
        ----------------    ----------------
        | mem_region_t |    | mem_region_t |
        ----------------    ----------------    -------------
               |                    |...........| zone0 | ....
	       v                                -------------
           -----------------------------
           | zone0 | zone1 | zone2 | ..|
           -----------------------------

Each memory region contains a zone array for the zones belonging to that region,
in addition to other fields like node id, index of the region in the node, start
pfn of the pages in that region and the number of pages spanned in the region.
The zone array inside the regions is statically allocated at this point.

ToDo:
Since the number of regions actually present on the system might be much
smaller than the maximum allowed, dynamic bootmem allocation could be used
instead of the static zone array to save memory.

Signed-off-by: Ankita Garg <gargankita@gmail.com>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 include/linux/mmzone.h |   24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 50aaca8..3f9b106 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -86,6 +86,7 @@ struct free_area {
 };
 
 struct pglist_data;
+struct mem_region;
 
 /*
  * zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
@@ -465,6 +466,8 @@ struct zone {
 	 * Discontig memory support fields.
 	 */
 	struct pglist_data	*zone_pgdat;
+	struct mem_region	*zone_mem_region;
+
 	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
 	unsigned long		zone_start_pfn;
 
@@ -533,6 +536,8 @@ static inline int zone_is_oom_locked(const struct zone *zone)
 	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
 }
 
+#define MAX_NR_REGIONS    256
+
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -541,7 +546,7 @@ static inline int zone_is_oom_locked(const struct zone *zone)
 #define DEF_PRIORITY 12
 
 /* Maximum number of zones on a zonelist */
-#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
+#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_REGIONS * MAX_NR_ZONES)
 
 #ifdef CONFIG_NUMA
 
@@ -671,6 +676,18 @@ struct node_active_region {
 extern struct page *mem_map;
 #endif
 
+struct mem_region {
+	struct zone region_zones[MAX_NR_ZONES];
+	int nr_region_zones;
+
+	int node;
+	int region;
+
+	unsigned long start_pfn;
+	unsigned long spanned_pages;
+};
+
+
 /*
  * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
  * (mostly NUMA machines?) to denote a higher-level memory zone than the
@@ -684,9 +701,10 @@ extern struct page *mem_map;
  */
 struct bootmem_data;
 typedef struct pglist_data {
-	struct zone node_zones[MAX_NR_ZONES];
+	struct mem_region node_regions[MAX_NR_REGIONS];
+	int nr_node_regions;
 	struct zonelist node_zonelists[MAX_ZONELISTS];
-	int nr_zones;
+	int nr_node_zone_types;
 #ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
 	struct page *node_mem_map;
 #ifdef CONFIG_MEMCG


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox