* [PATCH 4/6][v3] perf/POWER7: Make some POWER7 events available in sysfs
From: sukadev @ 2013-01-10 0:06 UTC (permalink / raw)
To: Peter Zijlstra, Paul Mackerras
Cc: robert.richter, linuxppc-dev, Anton Blanchard, Jiri Olsa,
Arnaldo Carvalho de Melo
In-Reply-To: <1357776380-31712-1-git-send-email-sukadev@linux.vnet.ibm.com>
Make some POWER7-specific perf events available in sysfs.
$ /bin/ls -1 /sys/bus/event_source/devices/cpu/events/
branch-instructions
branch-misses
cache-misses
cache-references
cpu-cycles
instructions
PM_BRU_FIN
PM_BRU_MPRED
PM_CMPLU_STALL
PM_CYC
PM_GCT_NOSLOT_CYC
PM_INST_CMPL
PM_LD_MISS_L1
PM_LD_REF_L1
stalled-cycles-backend
stalled-cycles-frontend
where the 'PM_*' events are POWER specific and the others are the
generic events.
This will enable users to specify these events with their symbolic
names rather than with their raw code.
perf stat -e 'cpu/PM_CYC/' ...
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/perf_event_server.h | 2 ++
arch/powerpc/perf/power7-pmu.c | 18 ++++++++++++++++++
2 files changed, 20 insertions(+), 0 deletions(-)
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 3f21d89..b29fcc6 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -133,3 +133,5 @@ extern ssize_t power_events_sysfs_show(struct device *dev,
#define GENERIC_EVENT_ATTR(_name, _id) EVENT_ATTR(_name, _id, _g)
#define GENERIC_EVENT_PTR(_id) EVENT_PTR(_id, _g)
+#define POWER_EVENT_ATTR(_name, _id) EVENT_ATTR(PM_##_name, _id, _p)
+#define POWER_EVENT_PTR(_id) EVENT_PTR(_id, _p)
diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c
index ae5d757..5627940 100644
--- a/arch/powerpc/perf/power7-pmu.c
+++ b/arch/powerpc/perf/power7-pmu.c
@@ -373,6 +373,15 @@ GENERIC_EVENT_ATTR(cache-misses, LD_MISS_L1);
GENERIC_EVENT_ATTR(branch-instructions, BRU_FIN);
GENERIC_EVENT_ATTR(branch-misses, BRU_MPRED);
+POWER_EVENT_ATTR(CYC, CYC);
+POWER_EVENT_ATTR(GCT_NOSLOT_CYC, GCT_NOSLOT_CYC);
+POWER_EVENT_ATTR(CMPLU_STALL, CMPLU_STALL);
+POWER_EVENT_ATTR(INST_CMPL, INST_CMPL);
+POWER_EVENT_ATTR(LD_REF_L1, LD_REF_L1);
+POWER_EVENT_ATTR(LD_MISS_L1, LD_MISS_L1);
+POWER_EVENT_ATTR(BRU_FIN, BRU_FIN)
+POWER_EVENT_ATTR(BRU_MPRED, BRU_MPRED);
+
static struct attribute *power7_events_attr[] = {
GENERIC_EVENT_PTR(CYC),
GENERIC_EVENT_PTR(GCT_NOSLOT_CYC),
@@ -382,6 +391,15 @@ static struct attribute *power7_events_attr[] = {
GENERIC_EVENT_PTR(LD_MISS_L1),
GENERIC_EVENT_PTR(BRU_FIN),
GENERIC_EVENT_PTR(BRU_MPRED),
+
+ POWER_EVENT_PTR(CYC),
+ POWER_EVENT_PTR(GCT_NOSLOT_CYC),
+ POWER_EVENT_PTR(CMPLU_STALL),
+ POWER_EVENT_PTR(INST_CMPL),
+ POWER_EVENT_PTR(LD_REF_L1),
+ POWER_EVENT_PTR(LD_MISS_L1),
+ POWER_EVENT_PTR(BRU_FIN),
+ POWER_EVENT_PTR(BRU_MPRED),
NULL
};
--
1.7.1
^ permalink raw reply related
* [PATCH 3/6][v3] perf/POWER7: Make generic event translations available in sysfs
From: sukadev @ 2013-01-10 0:06 UTC (permalink / raw)
To: Peter Zijlstra, Paul Mackerras
Cc: robert.richter, linuxppc-dev, Anton Blanchard, Jiri Olsa,
Arnaldo Carvalho de Melo
In-Reply-To: <1357776380-31712-1-git-send-email-sukadev@linux.vnet.ibm.com>
Make the generic perf events in POWER7 available via sysfs.
$ ls /sys/bus/event_source/devices/cpu/events
branch-instructions
branch-misses
cache-misses
cache-references
cpu-cycles
instructions
stalled-cycles-backend
stalled-cycles-frontend
$ cat /sys/bus/event_source/devices/cpu/events/cache-misses
event=0x400f0
This patch is based on commits that implement this functionality on x86.
Eg:
commit a47473939db20e3961b200eb00acf5fcf084d755
Author: Jiri Olsa <jolsa@redhat.com>
Date: Wed Oct 10 14:53:11 2012 +0200
perf/x86: Make hardware event translations available in sysfs
Changelog:[v3]
[Jiri Olsa] Drop EVENT_ID() macro since it is only used once.
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/perf_event_server.h | 24 ++++++++++++++
arch/powerpc/perf/core-book3s.c | 12 +++++++
arch/powerpc/perf/power7-pmu.c | 34 +++++++++++++++++++++
3 files changed, 70 insertions(+), 0 deletions(-)
create mode 100644 Documentation/ABI/stable/sysfs-devices-cpu-events
diff --git a/Documentation/ABI/stable/sysfs-devices-cpu-events b/Documentation/ABI/stable/sysfs-devices-cpu-events
new file mode 100644
index 0000000..e69de29
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 9710be3..3f21d89 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -11,6 +11,7 @@
#include <linux/types.h>
#include <asm/hw_irq.h>
+#include <linux/device.h>
#define MAX_HWEVENTS 8
#define MAX_EVENT_ALTERNATIVES 8
@@ -35,6 +36,7 @@ struct power_pmu {
void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
int (*limited_pmc_event)(u64 event_id);
u32 flags;
+ const struct attribute_group **attr_groups;
int n_generic;
int *generic_events;
int (*cache_events)[PERF_COUNT_HW_CACHE_MAX]
@@ -109,3 +111,25 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
* If an event_id is not subject to the constraint expressed by a particular
* field, then it will have 0 in both the mask and value for that field.
*/
+
+extern ssize_t power_events_sysfs_show(struct device *dev,
+ struct device_attribute *attr, char *page);
+
+/*
+ * EVENT_VAR() is same as PMU_EVENT_VAR with a suffix.
+ *
+ * Having a suffix allows us to have aliases in sysfs - eg: the generic
+ * event 'cpu-cycles' can have two entries in sysfs: 'cpu-cycles' and
+ * 'PM_CYC' where the latter is the name by which the event is known in
+ * POWER CPU specification.
+ */
+#define EVENT_VAR(_id, _suffix) event_attr_##_id##_suffix
+#define EVENT_PTR(_id, _suffix) &EVENT_VAR(_id, _suffix)
+
+#define EVENT_ATTR(_name, _id, _suffix) \
+ PMU_EVENT_ATTR(_name, EVENT_VAR(_id, _suffix), PME_PM_##_id, \
+ power_events_sysfs_show)
+
+#define GENERIC_EVENT_ATTR(_name, _id) EVENT_ATTR(_name, _id, _g)
+#define GENERIC_EVENT_PTR(_id) EVENT_PTR(_id, _g)
+
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index aa2465e..fa476d5 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -1305,6 +1305,16 @@ static int power_pmu_event_idx(struct perf_event *event)
return event->hw.idx;
}
+ssize_t power_events_sysfs_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr;
+
+ pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+
+ return sprintf(page, "event=0x%02llx\n", pmu_attr->id);
+}
+
struct pmu power_pmu = {
.pmu_enable = power_pmu_enable,
.pmu_disable = power_pmu_disable,
@@ -1537,6 +1547,8 @@ int __cpuinit register_power_pmu(struct power_pmu *pmu)
pr_info("%s performance monitor hardware support registered\n",
pmu->name);
+ power_pmu.attr_groups = ppmu->attr_groups;
+
#ifdef MSR_HV
/*
* Use FCHV to ignore kernel events if MSR.HV is set.
diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c
index 44e70d2..ae5d757 100644
--- a/arch/powerpc/perf/power7-pmu.c
+++ b/arch/powerpc/perf/power7-pmu.c
@@ -363,6 +363,39 @@ static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
},
};
+
+GENERIC_EVENT_ATTR(cpu-cycles, CYC);
+GENERIC_EVENT_ATTR(stalled-cycles-frontend, GCT_NOSLOT_CYC);
+GENERIC_EVENT_ATTR(stalled-cycles-backend, CMPLU_STALL);
+GENERIC_EVENT_ATTR(instructions, INST_CMPL);
+GENERIC_EVENT_ATTR(cache-references, LD_REF_L1);
+GENERIC_EVENT_ATTR(cache-misses, LD_MISS_L1);
+GENERIC_EVENT_ATTR(branch-instructions, BRU_FIN);
+GENERIC_EVENT_ATTR(branch-misses, BRU_MPRED);
+
+static struct attribute *power7_events_attr[] = {
+ GENERIC_EVENT_PTR(CYC),
+ GENERIC_EVENT_PTR(GCT_NOSLOT_CYC),
+ GENERIC_EVENT_PTR(CMPLU_STALL),
+ GENERIC_EVENT_PTR(INST_CMPL),
+ GENERIC_EVENT_PTR(LD_REF_L1),
+ GENERIC_EVENT_PTR(LD_MISS_L1),
+ GENERIC_EVENT_PTR(BRU_FIN),
+ GENERIC_EVENT_PTR(BRU_MPRED),
+ NULL
+};
+
+
+static struct attribute_group power7_pmu_events_group = {
+ .name = "events",
+ .attrs = power7_events_attr,
+};
+
+static const struct attribute_group *power7_pmu_attr_groups[] = {
+ &power7_pmu_events_group,
+ NULL,
+};
+
static struct power_pmu power7_pmu = {
.name = "POWER7",
.n_counter = 6,
@@ -374,6 +407,7 @@ static struct power_pmu power7_pmu = {
.get_alternatives = power7_get_alternatives,
.disable_pmc = power7_disable_pmc,
.flags = PPMU_ALT_SIPR,
+ .attr_groups = power7_pmu_attr_groups,
.n_generic = ARRAY_SIZE(power7_generic_events),
.generic_events = power7_generic_events,
.cache_events = &power7_cache_events,
--
1.7.1
^ permalink raw reply related
* [PATCH 6/6][v3] perf: Document the ABI of perf sysfs entries
From: sukadev @ 2013-01-10 0:06 UTC (permalink / raw)
To: Peter Zijlstra, Paul Mackerras
Cc: robert.richter, linuxppc-dev, Anton Blanchard, Jiri Olsa,
Arnaldo Carvalho de Melo
In-Reply-To: <1357776380-31712-1-git-send-email-sukadev@linux.vnet.ibm.com>
This patchset addes two new sets of files to sysfs:
- generic and POWER-specific perf events in /sys/devices/cpu/events/
- perf event config format in /sys/devices/cpu/format/event
Document the format of these files which would become part of the ABI.
Changelog[v3]:
[Greg KH] Include ABI documentation.
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
---
Documentation/ABI/stable/sysfs-devices-cpu-events | 54 +++++++++++++++++++++
Documentation/ABI/stable/sysfs-devices-cpu-format | 27 ++++++++++
2 files changed, 81 insertions(+), 0 deletions(-)
create mode 100644 Documentation/ABI/stable/sysfs-devices-cpu-format
diff --git a/Documentation/ABI/stable/sysfs-devices-cpu-events b/Documentation/ABI/stable/sysfs-devices-cpu-events
index e69de29..f37d542 100644
--- a/Documentation/ABI/stable/sysfs-devices-cpu-events
+++ b/Documentation/ABI/stable/sysfs-devices-cpu-events
@@ -0,0 +1,54 @@
+What: /sys/devices/cpu/events/
+ /sys/devices/cpu/events/branch-misses
+ /sys/devices/cpu/events/cache-references
+ /sys/devices/cpu/events/cache-misses
+ /sys/devices/cpu/events/stalled-cycles-frontend
+ /sys/devices/cpu/events/branch-instructions
+ /sys/devices/cpu/events/stalled-cycles-backend
+ /sys/devices/cpu/events/instructions
+ /sys/devices/cpu/events/cpu-cycles
+
+Date: 2013/01/08
+
+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
+
+Description: Generic performance monitoring events
+
+ A collection of performance monitoring events that may be
+ supported by many/most CPUs. These events can be monitored
+ using the 'perf(1)' tool.
+
+ The contents of each file would look like:
+
+ event=0xNNNN
+
+ where 'N' is a hex digit.
+
+
+What: /sys/devices/cpu/events/PM_LD_MISS_L1
+ /sys/devices/cpu/events/PM_LD_REF_L1
+ /sys/devices/cpu/events/PM_CYC
+ /sys/devices/cpu/events/PM_BRU_FIN
+ /sys/devices/cpu/events/PM_GCT_NOSLOT_CYC
+ /sys/devices/cpu/events/PM_BRU_MPRED
+ /sys/devices/cpu/events/PM_INST_CMPL
+ /sys/devices/cpu/events/PM_CMPLU_STALL
+
+Date: 2013/01/08
+
+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
+ Linux Powerpc mailing list <linuxppc-dev@ozlabs.org>
+
+Description: POWER specific performance monitoring events
+
+ A collection of performance monitoring events that may be
+ supported by the POWER CPU. These events can be monitored
+ using the 'perf(1)' tool.
+
+ These events may not be supported by other CPUs.
+
+ The contents of each file would look like:
+
+ event=0xNNNN
+
+ where 'N' is a hex digit.
diff --git a/Documentation/ABI/stable/sysfs-devices-cpu-format b/Documentation/ABI/stable/sysfs-devices-cpu-format
new file mode 100644
index 0000000..b15cfb2
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-devices-cpu-format
@@ -0,0 +1,27 @@
+What: /sys/devices/cpu/format/
+ /sys/devices/cpu/format/event
+
+Date: 2013/01/08
+
+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
+
+Description: Format of performance monitoring events
+
+ Each CPU/architecture may use different format to represent
+ the perf event. The 'event' file describes the configuration
+ format of the performance monitoring event on the CPU/system.
+
+ The contents of each file would look like:
+
+ config:m-n
+
+ where m and n are the starting and ending bits that are
+ used to represent the event.
+
+ For example, on POWER,
+
+ $ cat /sys/devices/cpu/format/event
+ config:0-20
+
+ meaning that POWER uses the first 20-bits to represent a perf
+ event.
--
1.7.1
^ permalink raw reply related
* [PATCH 2/6][v3] perf: Make EVENT_ATTR global
From: sukadev @ 2013-01-10 0:06 UTC (permalink / raw)
To: Peter Zijlstra, Paul Mackerras
Cc: robert.richter, linuxppc-dev, Anton Blanchard, Jiri Olsa,
Arnaldo Carvalho de Melo
In-Reply-To: <1357776380-31712-1-git-send-email-sukadev@linux.vnet.ibm.com>
Rename EVENT_ATTR() to PMU_EVENT_ATTR() and make it global so it is
available to all architectures.
Further to allow architectures flexibility, have PMU_EVENT_ATTR() pass
in the variable name as a parameter.
Changelog[v3]
- [Jiri Olsa] No need to define PMU_EVENT_PTR()
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
---
arch/x86/kernel/cpu/perf_event.c | 13 +++----------
include/linux/perf_event.h | 11 +++++++++++
2 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4428fd1..59a1238 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1316,11 +1316,6 @@ static struct attribute_group x86_pmu_format_group = {
.attrs = NULL,
};
-struct perf_pmu_events_attr {
- struct device_attribute attr;
- u64 id;
-};
-
/*
* Remove all undefined events (x86_pmu.event_map(id) == 0)
* out of events_attr attributes.
@@ -1354,11 +1349,9 @@ static ssize_t events_sysfs_show(struct device *dev, struct device_attribute *at
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
-#define EVENT_ATTR(_name, _id) \
-static struct perf_pmu_events_attr EVENT_VAR(_id) = { \
- .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
- .id = PERF_COUNT_HW_##_id, \
-};
+#define EVENT_ATTR(_name, _id) \
+ PMU_EVENT_ATTR(_name, EVENT_VAR(_id), PERF_COUNT_HW_##_id, \
+ events_sysfs_show)
EVENT_ATTR(cpu-cycles, CPU_CYCLES );
EVENT_ATTR(instructions, INSTRUCTIONS );
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6bfb2fa..42adf01 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -817,6 +817,17 @@ do { \
} while (0)
+struct perf_pmu_events_attr {
+ struct device_attribute attr;
+ u64 id;
+};
+
+#define PMU_EVENT_ATTR(_name, _var, _id, _show) \
+static struct perf_pmu_events_attr _var = { \
+ .attr = __ATTR(_name, 0444, _show, NULL), \
+ .id = _id, \
+};
+
#define PMU_FORMAT_ATTR(_name, _format) \
static ssize_t \
_name##_show(struct device *dev, \
--
1.7.1
^ permalink raw reply related
* [PATCH 1/6][v3] perf/Power7: Use macros to identify perf events
From: sukadev @ 2013-01-10 0:06 UTC (permalink / raw)
To: Peter Zijlstra, Paul Mackerras
Cc: robert.richter, linuxppc-dev, Anton Blanchard, Jiri Olsa,
Arnaldo Carvalho de Melo
Define and use macros to identify perf events codes. This would make it
easier and more readable when these event codes need to be used in more
than one place.
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
---
arch/powerpc/perf/power7-pmu.c | 28 ++++++++++++++++++++--------
1 files changed, 20 insertions(+), 8 deletions(-)
diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c
index 441af08..44e70d2 100644
--- a/arch/powerpc/perf/power7-pmu.c
+++ b/arch/powerpc/perf/power7-pmu.c
@@ -51,6 +51,18 @@
#define MMCR1_PMCSEL_MSK 0xff
/*
+ * Power7 event codes.
+ */
+#define PME_PM_CYC 0x1e
+#define PME_PM_GCT_NOSLOT_CYC 0x100f8
+#define PME_PM_CMPLU_STALL 0x4000a
+#define PME_PM_INST_CMPL 0x2
+#define PME_PM_LD_REF_L1 0xc880
+#define PME_PM_LD_MISS_L1 0x400f0
+#define PME_PM_BRU_FIN 0x10068
+#define PME_PM_BRU_MPRED 0x400f6
+
+/*
* Layout of constraint bits:
* 6666555555555544444444443333333333222222222211111111110000000000
* 3210987654321098765432109876543210987654321098765432109876543210
@@ -296,14 +308,14 @@ static void power7_disable_pmc(unsigned int pmc, unsigned long mmcr[])
}
static int power7_generic_events[] = {
- [PERF_COUNT_HW_CPU_CYCLES] = 0x1e,
- [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x100f8, /* GCT_NOSLOT_CYC */
- [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x4000a, /* CMPLU_STALL */
- [PERF_COUNT_HW_INSTRUCTIONS] = 2,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0xc880, /* LD_REF_L1_LSU*/
- [PERF_COUNT_HW_CACHE_MISSES] = 0x400f0, /* LD_MISS_L1 */
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x10068, /* BRU_FIN */
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x400f6, /* BR_MPRED */
+ [PERF_COUNT_HW_CPU_CYCLES] = PME_PM_CYC,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = PME_PM_GCT_NOSLOT_CYC,
+ [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = PME_PM_CMPLU_STALL,
+ [PERF_COUNT_HW_INSTRUCTIONS] = PME_PM_INST_CMPL,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = PME_PM_LD_REF_L1,
+ [PERF_COUNT_HW_CACHE_MISSES] = PME_PM_LD_MISS_L1,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = PME_PM_BRU_FIN,
+ [PERF_COUNT_HW_BRANCH_MISSES] = PME_PM_BRU_MPRED,
};
#define C(x) PERF_COUNT_HW_CACHE_##x
--
1.7.1
^ permalink raw reply related
* Re: [PATCH v6 00/15] memory-hotplug: hot-remove physical memory
From: Andrew Morton @ 2013-01-09 23:33 UTC (permalink / raw)
To: Tang Chen
Cc: linux-ia64, linux-sh, linux-mm, paulus, hpa, sparclinux, cl,
linux-s390, x86, linux-acpi, isimatu.yasuaki, linfeng, mgorman,
kosaki.motohiro, rientjes, len.brown, wency, cmetcalf, glommer,
wujianguo, yinghai, laijs, linux-kernel, minchan.kim,
linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>
On Wed, 9 Jan 2013 17:32:24 +0800
Tang Chen <tangchen@cn.fujitsu.com> wrote:
> This patch-set aims to implement physical memory hot-removing.
As you were on th patch delivery path, all of these patches should have
your Signed-off-by:. But some were missing it. I fixed this in my
copy of the patches.
I suspect this patchset adds a significant amount of code which will
not be used if CONFIG_MEMORY_HOTPLUG=n. "[PATCH v6 06/15]
memory-hotplug: implement register_page_bootmem_info_section of
sparse-vmemmap", for example. This is not a good thing, so please go
through the patchset (in fact, go through all the memhotplug code) and
let's see if we can reduce the bloat for CONFIG_MEMORY_HOTPLUG=n
kernels.
This needn't be done immediately - it would be OK by me if you were to
defer this exercise until all the new memhotplug code is largely in
place. But please, let's do it.
^ permalink raw reply
* Re: [PATCH v6 04/15] memory-hotplug: remove /sys/firmware/memmap/X sysfs
From: Andrew Morton @ 2013-01-09 23:19 UTC (permalink / raw)
To: Tang Chen
Cc: linux-ia64, linux-sh, linux-mm, paulus, hpa, sparclinux, cl,
linux-s390, x86, linux-acpi, isimatu.yasuaki, linfeng, mgorman,
kosaki.motohiro, rientjes, len.brown, wency, cmetcalf, glommer,
wujianguo, yinghai, laijs, linux-kernel, minchan.kim,
linuxppc-dev
In-Reply-To: <1357723959-5416-5-git-send-email-tangchen@cn.fujitsu.com>
On Wed, 9 Jan 2013 17:32:28 +0800
Tang Chen <tangchen@cn.fujitsu.com> wrote:
> From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
>
> When (hot)adding memory into system, /sys/firmware/memmap/X/{end, start, type}
> sysfs files are created. But there is no code to remove these files. The patch
> implements the function to remove them.
>
> Note: The code does not free firmware_map_entry which is allocated by bootmem.
> So the patch makes memory leak. But I think the memory leak size is
> very samll. And it does not affect the system.
>
> ...
>
> +static struct firmware_map_entry * __meminit
> +firmware_map_find_entry(u64 start, u64 end, const char *type)
> +{
> + struct firmware_map_entry *entry;
> +
> + spin_lock(&map_entries_lock);
> + list_for_each_entry(entry, &map_entries, list)
> + if ((entry->start == start) && (entry->end == end) &&
> + (!strcmp(entry->type, type))) {
> + spin_unlock(&map_entries_lock);
> + return entry;
> + }
> +
> + spin_unlock(&map_entries_lock);
> + return NULL;
> +}
>
> ...
>
> + entry = firmware_map_find_entry(start, end - 1, type);
> + if (!entry)
> + return -EINVAL;
> +
> + firmware_map_remove_entry(entry);
>
> ...
>
The above code looks racy. After firmware_map_find_entry() does the
spin_unlock() there is nothing to prevent a concurrent
firmware_map_remove_entry() from removing the entry, so the kernel ends
up calling firmware_map_remove_entry() twice against the same entry.
An easy fix for this is to hold the spinlock across the entire
lookup/remove operation.
This problem is inherent to firmware_map_find_entry() as you have
implemented it, so this function simply should not exist in the current
form - no caller can use it without being buggy! A simple fix for this
is to remove the spin_lock()/spin_unlock() from
firmware_map_find_entry() and add locking documentation to
firmware_map_find_entry(), explaining that the caller must hold
map_entries_lock and must not release that lock until processing of
firmware_map_find_entry()'s return value has completed.
^ permalink raw reply
* Re: [PATCH v6 02/15] memory-hotplug: check whether all memory blocks are offlined or not when removing memory
From: Andrew Morton @ 2013-01-09 23:11 UTC (permalink / raw)
To: Tang Chen
Cc: linux-ia64, linux-sh, linux-mm, paulus, hpa, sparclinux, cl,
linux-s390, x86, linux-acpi, isimatu.yasuaki, linfeng, mgorman,
kosaki.motohiro, rientjes, len.brown, wency, cmetcalf, glommer,
wujianguo, yinghai, laijs, linux-kernel, minchan.kim,
linuxppc-dev
In-Reply-To: <1357723959-5416-3-git-send-email-tangchen@cn.fujitsu.com>
On Wed, 9 Jan 2013 17:32:26 +0800
Tang Chen <tangchen@cn.fujitsu.com> wrote:
> We remove the memory like this:
> 1. lock memory hotplug
> 2. offline a memory block
> 3. unlock memory hotplug
> 4. repeat 1-3 to offline all memory blocks
> 5. lock memory hotplug
> 6. remove memory(TODO)
> 7. unlock memory hotplug
>
> All memory blocks must be offlined before removing memory. But we don't hold
> the lock in the whole operation. So we should check whether all memory blocks
> are offlined before step6. Otherwise, kernel maybe panicked.
Well, the obvious question is: why don't we hold lock_memory_hotplug()
for all of steps 1-4? Please send the reasons for this in a form which
I can paste into the changelog.
Actually, I wonder if doing this would fix a race in the current
remove_memory() repeat: loop. That code does a
find_memory_block_hinted() followed by offline_memory_block(), but
afaict find_memory_block_hinted() only does a get_device(). Is the
get_device() sufficiently strong to prevent problems if another thread
concurrently offlines or otherwise alters this memory_block's state?
^ permalink raw reply
* Re: [PATCH] powerpc: POWER7 optimised memcpy using VMX and enhanced prefetch
From: Peter Bergner @ 2013-01-09 23:06 UTC (permalink / raw)
To: Jimi Xenidis
Cc: paulus@samba.org Mackerras, linuxppc-dev, Kumar Gala,
Anton Blanchard
In-Reply-To: <9A768D6E-D4DC-4F8B-ADAC-A1E953B38462@pobox.com>
On Wed, 2013-01-09 at 16:19 -0600, Jimi Xenidis wrote:
> I agree, but that means it is impossible for the same .S file can be compiled
> but -mcpu=e500mc and -mcpu=powerpc? So either these files have to be Book3S
> versus Book3E --or-- we use a CPP macro to get them right.
> FWIW, I prefer the latter which allows more code reuse.
I agree using a CPP macro - like we do for "new" instructions for which some
older assemblers might not support yet - is probably the best solution.
Peter
^ permalink raw reply
* Re: [PATCH v6 05/15] memory-hotplug: introduce new function arch_remove_memory() for removing page table depends on architecture
From: Andrew Morton @ 2013-01-09 22:50 UTC (permalink / raw)
To: Tang Chen
Cc: linux-ia64, linux-sh, linux-mm, paulus, hpa, sparclinux, cl,
linux-s390, x86, linux-acpi, isimatu.yasuaki, linfeng, mgorman,
kosaki.motohiro, rientjes, len.brown, wency, cmetcalf, glommer,
wujianguo, yinghai, laijs, linux-kernel, minchan.kim,
linuxppc-dev
In-Reply-To: <1357723959-5416-6-git-send-email-tangchen@cn.fujitsu.com>
On Wed, 9 Jan 2013 17:32:29 +0800
Tang Chen <tangchen@cn.fujitsu.com> wrote:
> For removing memory, we need to remove page table. But it depends
> on architecture. So the patch introduce arch_remove_memory() for
> removing page table. Now it only calls __remove_pages().
>
> Note: __remove_pages() for some archtecuture is not implemented
> (I don't know how to implement it for s390).
Can this break the build for s390?
^ permalink raw reply
* Re: [PATCH v6 04/15] memory-hotplug: remove /sys/firmware/memmap/X sysfs
From: Andrew Morton @ 2013-01-09 22:49 UTC (permalink / raw)
To: Tang Chen
Cc: linux-ia64, linux-sh, linux-mm, paulus, hpa, sparclinux, cl,
linux-s390, x86, linux-acpi, isimatu.yasuaki, linfeng, mgorman,
kosaki.motohiro, rientjes, len.brown, wency, cmetcalf, glommer,
wujianguo, yinghai, laijs, linux-kernel, minchan.kim,
linuxppc-dev
In-Reply-To: <1357723959-5416-5-git-send-email-tangchen@cn.fujitsu.com>
On Wed, 9 Jan 2013 17:32:28 +0800
Tang Chen <tangchen@cn.fujitsu.com> wrote:
> When (hot)adding memory into system, /sys/firmware/memmap/X/{end, start, type}
> sysfs files are created. But there is no code to remove these files. The patch
> implements the function to remove them.
>
> Note: The code does not free firmware_map_entry which is allocated by bootmem.
> So the patch makes memory leak. But I think the memory leak size is
> very samll. And it does not affect the system.
Well that's bad. Can we remember the address of that memory and then
reuse the storage if/when the memory is re-added? That at least puts an upper
bound on the leak.
^ permalink raw reply
* [PATCH v2] powerpc/mm: eliminate unneeded for_each_memblock
From: Cody P Schafer @ 2013-01-09 22:40 UTC (permalink / raw)
To: Linux PPC, galak; +Cc: Cody P Schafer, Paul Mackerras, LKML
In-Reply-To: <82A31E2D-FA24-44CF-9812-9E8062AFA6F4@kernel.crashing.org>
The only persistent change made by this loop is calling
memblock_set_node() once for each memblock, which is not useful (and has
no effect) as memblock_set_node() is not called with any
memblock-specific parameters.
Subsistute a single memblock_set_node().
Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com>
---
Now with a signoff & wrapped comment line.
arch/powerpc/mm/mem.c | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 0dba506..40df7c8 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -195,13 +195,10 @@ void __init do_init_bootmem(void)
min_low_pfn = MEMORY_START >> PAGE_SHIFT;
boot_mapsize = init_bootmem_node(NODE_DATA(0), start >> PAGE_SHIFT, min_low_pfn, max_low_pfn);
- /* Add active regions with valid PFNs */
- for_each_memblock(memory, reg) {
- unsigned long start_pfn, end_pfn;
- start_pfn = memblock_region_memory_base_pfn(reg);
- end_pfn = memblock_region_memory_end_pfn(reg);
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
- }
+ /* Place all memblock_regions in the same node and merge contiguous
+ * memblock_regions
+ */
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
/* Add all physical memory to the bootmem map, mark each area
* present.
--
1.8.0.3
^ permalink raw reply related
* Re: [PATCH v6 00/15] memory-hotplug: hot-remove physical memory
From: Andrew Morton @ 2013-01-09 22:23 UTC (permalink / raw)
To: Tang Chen
Cc: linux-ia64, linux-sh, linux-mm, paulus, hpa, sparclinux, cl,
linux-s390, x86, linux-acpi, isimatu.yasuaki, linfeng, mgorman,
kosaki.motohiro, rientjes, len.brown, wency, cmetcalf, glommer,
wujianguo, yinghai, laijs, linux-kernel, minchan.kim,
linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>
On Wed, 9 Jan 2013 17:32:24 +0800
Tang Chen <tangchen@cn.fujitsu.com> wrote:
> Here is the physical memory hot-remove patch-set based on 3.8rc-2.
>
> This patch-set aims to implement physical memory hot-removing.
>
> The patches can free/remove the following things:
>
> - /sys/firmware/memmap/X/{end, start, type} : [PATCH 4/15]
> - memmap of sparse-vmemmap : [PATCH 6,7,8,10/15]
> - page table of removed memory : [RFC PATCH 7,8,10/15]
> - node and related sysfs files : [RFC PATCH 13-15/15]
>
>
> Existing problem:
> If CONFIG_MEMCG is selected, we will allocate memory to store page cgroup
> when we online pages.
>
> For example: there is a memory device on node 1. The address range
> is [1G, 1.5G). You will find 4 new directories memory8, memory9, memory10,
> and memory11 under the directory /sys/devices/system/memory/.
>
> If CONFIG_MEMCG is selected, when we online memory8, the memory stored page
> cgroup is not provided by this memory device. But when we online memory9, the
> memory stored page cgroup may be provided by memory8. So we can't offline
> memory8 now. We should offline the memory in the reversed order.
>
> When the memory device is hotremoved, we will auto offline memory provided
> by this memory device. But we don't know which memory is onlined first, so
> offlining memory may fail.
This does sound like a significant problem. We should assume that
mmecg is available and in use.
> In patch1, we provide a solution which is not good enough:
> Iterate twice to offline the memory.
> 1st iterate: offline every non primary memory block.
> 2nd iterate: offline primary (i.e. first added) memory block.
Let's flesh this out a bit.
If we online memory8, memory9, memory10 and memory11 then I'd have
thought that they would need to offlined in reverse order, which will
require four iterations, not two. Is this wrong and if so, why?
Also, what happens if we wish to offline only memory9? Do we offline
memory11 then memory10 then memory9 and then re-online memory10 and
memory11?
> And a new idea from Wen Congyang <wency@cn.fujitsu.com> is:
> allocate the memory from the memory block they are describing.
Yes.
> But we are not sure if it is OK to do so because there is not existing API
> to do so, and we need to move page_cgroup memory allocation from MEM_GOING_ONLINE
> to MEM_ONLINE.
This all sounds solvable - can we proceed in this fashion?
> And also, it may interfere the hugepage.
Please provide full details on this problem.
> Note: if the memory provided by the memory device is used by the kernel, it
> can't be offlined. It is not a bug.
Right. But how often does this happen in testing? In other words,
please provide an overall description of how well memory hot-remove is
presently operating. Is it reliable? What is the success rate in
real-world situations? Are there precautions which the administrator
can take to improve the success rate? What are the remaining problems
and are there plans to address them?
^ permalink raw reply
* Re: [PATCH] powerpc: POWER7 optimised memcpy using VMX and enhanced prefetch
From: Jimi Xenidis @ 2013-01-09 22:19 UTC (permalink / raw)
To: Peter Bergner
Cc: paulus@samba.org Mackerras, linuxppc-dev, Kumar Gala,
Anton Blanchard
In-Reply-To: <1355848269.5180.41.camel@otta>
On Dec 18, 2012, at 10:31 AM, Peter Bergner <bergner@vnet.ibm.com> =
wrote:
> On Tue, 2012-12-18 at 07:28 -0600, Jimi Xenidis wrote:
>> On Dec 17, 2012, at 6:26 PM, Peter Bergner <bergner@vnet.ibm.com> =
wrote:
>>> Jimi, are you using an "old" binutils from before my patch that
>>> changed the operand order for these types of instructions?
>>>=20
>>> http://sourceware.org/ml/binutils/2009-02/msg00044.html
>>=20
>> Actually, this confused me as well, that embedded has the same =
instruction
>> encoding but different mnemonic.
>=20
> The mnemonic is the same (ie, dcbtst), and yes, the encoding is the =
same.
> All that is different is the accepted operand ordering...and yes, it =
is
> very unfortunate the operand ordering is different between embedded =
and
> server. :(
>=20
>=20
>> I was under the impression that the assembler made no instruction =
decisions
>> based on CPU. So your only hint would be that '0b' prefix.
>> Does AS even see that?
>=20
> GAS definitely makes decisions based on CPU (ie, -m<cpu> option). =
Below is
> the GAS code used in recognizing the dcbtst instruction. This shows =
that
> the "server" operand ordering is enabled for POWER4 and later cpus =
while
> the "embedded" operand ordering is enabled for pre POWER4 cpus (yes, =
not
> exactly a server versus embedded trigger, but that's we agreed on to
> mitigate breaking any old asm code out there).
>=20
> {"dcbtst", X(31,246), X_MASK, POWER4, PPCNONE, =
{RA0, RB, CT}},
> {"dcbtst", X(31,246), X_MASK, PPC|PPCVLE, POWER4, =
{CT, RA0, RB}},
>=20
> GAS doesn't look at how the operands are written to try and guess what
> operand ordering you are attempting to use. Rather, it knows what =
ordering
> it expects and the values had better match that ordering.
>=20
I agree, but that means it is impossible for the same .S file can be =
compiled but -mcpu=3De500mc and -mcpu=3Dpowerpc?
So either these files have to be Book3S versus Book3E --or-- we use a =
CPP macro to get them right.
FWIW, I prefer the latter which allows more code reuse.
-jx
>=20
> Peter
>=20
>=20
>=20
^ permalink raw reply
* Re: [PATCH 7/8] mm: use vm_unmapped_area() on powerpc architecture
From: Rik van Riel @ 2013-01-09 21:41 UTC (permalink / raw)
To: Michel Lespinasse
Cc: Tony Luck, linux-ia64, linux-parisc, James E.J. Bottomley,
linux-kernel, David Howells, linux-mm, linux-alpha, Matt Turner,
linuxppc-dev, Andrew Morton
In-Reply-To: <20130109112313.GA4905@google.com>
On 01/09/2013 06:23 AM, Michel Lespinasse wrote:
> On Wed, Jan 09, 2013 at 02:32:56PM +1100, Benjamin Herrenschmidt wrote:
>> Ok. I think at least you can move that construct:
>>
>> + if (addr < SLICE_LOW_TOP) {
>> + slice = GET_LOW_SLICE_INDEX(addr);
>> + addr = (slice + 1) << SLICE_LOW_SHIFT;
>> + if (!(available.low_slices & (1u << slice)))
>> + continue;
>> + } else {
>> + slice = GET_HIGH_SLICE_INDEX(addr);
>> + addr = (slice + 1) << SLICE_HIGH_SHIFT;
>> + if (!(available.high_slices & (1u << slice)))
>> + continue;
>> + }
>>
>> Into some kind of helper. It will probably compile to the same thing but
>> at least it's more readable and it will avoid a fuckup in the future if
>> somebody changes the algorithm and forgets to update one of the
>> copies :-)
>
> All right, does the following look more palatable then ?
> (didn't re-test it, though)
Looks equivalent. I have also not tested :)
> Signed-off-by: Michel Lespinasse <walken@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
^ permalink raw reply
* Re: [PATCH 8/8] mm: remove free_area_cache
From: Rik van Riel @ 2013-01-09 21:25 UTC (permalink / raw)
To: Michel Lespinasse
Cc: Tony Luck, linux-ia64, linux-parisc, James E.J. Bottomley,
linux-kernel, David Howells, linux-mm, linux-alpha, Matt Turner,
linuxppc-dev, Andrew Morton
In-Reply-To: <1357694895-520-9-git-send-email-walken@google.com>
On 01/08/2013 08:28 PM, Michel Lespinasse wrote:
> Since all architectures have been converted to use vm_unmapped_area(),
> there is no remaining use for the free_area_cache.
>
> Signed-off-by: Michel Lespinasse <walken@google.com>
Yay
Acked-by: Rik van Riel <riel@redhat.com>
^ permalink raw reply
* Re: [PATCH 7/8] mm: use vm_unmapped_area() on powerpc architecture
From: Rik van Riel @ 2013-01-09 21:24 UTC (permalink / raw)
To: Michel Lespinasse
Cc: Tony Luck, linux-ia64, linux-parisc, James E.J. Bottomley,
linux-kernel, David Howells, linux-mm, linux-alpha, Matt Turner,
linuxppc-dev, Andrew Morton
In-Reply-To: <1357694895-520-8-git-send-email-walken@google.com>
On 01/08/2013 08:28 PM, Michel Lespinasse wrote:
> Update the powerpc slice_get_unmapped_area function to make use of
> vm_unmapped_area() instead of implementing a brute force search.
>
> Signed-off-by: Michel Lespinasse <walken@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
^ permalink raw reply
* Re: [PATCH 6/8] mm: remove free_area_cache use in powerpc architecture
From: Rik van Riel @ 2013-01-09 20:57 UTC (permalink / raw)
To: Michel Lespinasse
Cc: Tony Luck, linux-ia64, linux-parisc, James E.J. Bottomley,
linux-kernel, David Howells, linux-mm, linux-alpha, Matt Turner,
linuxppc-dev, Andrew Morton
In-Reply-To: <1357694895-520-7-git-send-email-walken@google.com>
On 01/08/2013 08:28 PM, Michel Lespinasse wrote:
> As all other architectures have been converted to use vm_unmapped_area(),
> we are about to retire the free_area_cache.
>
> This change simply removes the use of that cache in
> slice_get_unmapped_area(), which will most certainly have a
> performance cost. Next one will convert that function to use the
> vm_unmapped_area() infrastructure and regain the performance.
>
> Signed-off-by: Michel Lespinasse <walken@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
^ permalink raw reply
* Re: [PATCH 5/8] mm: use vm_unmapped_area() in hugetlbfs on ia64 architecture
From: Rik van Riel @ 2013-01-09 18:32 UTC (permalink / raw)
To: Michel Lespinasse
Cc: Tony Luck, linux-ia64, linux-parisc, James E.J. Bottomley,
linux-kernel, David Howells, linux-mm, linux-alpha, Matt Turner,
linuxppc-dev, Andrew Morton
In-Reply-To: <1357694895-520-6-git-send-email-walken@google.com>
On 01/08/2013 08:28 PM, Michel Lespinasse wrote:
> Update the ia64 hugetlb_get_unmapped_area function to make use of
> vm_unmapped_area() instead of implementing a brute force search.
>
> Signed-off-by: Michel Lespinasse <walken@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
^ permalink raw reply
* Re: [PATCH 4/8] mm: use vm_unmapped_area() on ia64 architecture
From: Rik van Riel @ 2013-01-09 18:29 UTC (permalink / raw)
To: Michel Lespinasse
Cc: Tony Luck, linux-ia64, linux-parisc, James E.J. Bottomley,
linux-kernel, David Howells, linux-mm, linux-alpha, Matt Turner,
linuxppc-dev, Andrew Morton
In-Reply-To: <1357694895-520-5-git-send-email-walken@google.com>
On 01/08/2013 08:28 PM, Michel Lespinasse wrote:
> Update the ia64 arch_get_unmapped_area function to make use of
> vm_unmapped_area() instead of implementing a brute force search.
>
> Signed-off-by: Michel Lespinasse <walken@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
^ permalink raw reply
* Re: [PATCH 3/8] mm: use vm_unmapped_area() on frv architecture
From: Rik van Riel @ 2013-01-09 18:25 UTC (permalink / raw)
To: Michel Lespinasse
Cc: Tony Luck, linux-ia64, linux-parisc, James E.J. Bottomley,
linux-kernel, David Howells, linux-mm, linux-alpha, Matt Turner,
linuxppc-dev, Andrew Morton
In-Reply-To: <1357694895-520-4-git-send-email-walken@google.com>
On 01/08/2013 08:28 PM, Michel Lespinasse wrote:
> Update the frv arch_get_unmapped_area function to make use of
> vm_unmapped_area() instead of implementing a brute force search.
>
> Signed-off-by: Michel Lespinasse <walken@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
^ permalink raw reply
* Re: [PATCH 2/8] mm: use vm_unmapped_area() on alpha architecture
From: Rik van Riel @ 2013-01-09 17:01 UTC (permalink / raw)
To: Michel Lespinasse
Cc: Tony Luck, linux-ia64, linux-parisc, James E.J. Bottomley,
linux-kernel, David Howells, linux-mm, linux-alpha, Matt Turner,
linuxppc-dev, Andrew Morton
In-Reply-To: <1357694895-520-3-git-send-email-walken@google.com>
On 01/08/2013 08:28 PM, Michel Lespinasse wrote:
> Update the alpha arch_get_unmapped_area function to make use of
> vm_unmapped_area() instead of implementing a brute force search.
>
> Signed-off-by: Michel Lespinasse <walken@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
^ permalink raw reply
* Re: [PATCH 1/8] mm: use vm_unmapped_area() on parisc architecture
From: Rik van Riel @ 2013-01-09 16:56 UTC (permalink / raw)
To: Michel Lespinasse
Cc: Tony Luck, linux-ia64, linux-parisc, James E.J. Bottomley,
linux-kernel, David Howells, linux-mm, linux-alpha, Matt Turner,
linuxppc-dev, Andrew Morton
In-Reply-To: <1357694895-520-2-git-send-email-walken@google.com>
On 01/08/2013 08:28 PM, Michel Lespinasse wrote:
> Update the parisc arch_get_unmapped_area function to make use of
> vm_unmapped_area() instead of implementing a brute force search.
>
> Signed-off-by: Michel Lespinasse <walken@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
^ permalink raw reply
* Re: [PATCH v5 01/14] memory-hotplug: try to offline the memory twice to avoid dependence
From: Glauber Costa @ 2013-01-09 15:09 UTC (permalink / raw)
To: Wen Congyang
Cc: linux-ia64, linux-sh, Tang Chen, linux-mm, paulus, hpa,
sparclinux, cl, linux-s390, x86, linux-acpi, isimatu.yasuaki,
linfeng, mgorman, kosaki.motohiro, rientjes, liuj97, len.brown,
cmetcalf, wujianguo, yinghai, KAMEZAWA Hiroyuki, laijs,
linux-kernel, minchan.kim, akpm, linuxppc-dev
In-Reply-To: <50DFD7F7.5090408@cn.fujitsu.com>
On 12/30/2012 09:58 AM, Wen Congyang wrote:
> At 12/25/2012 04:35 PM, Glauber Costa Wrote:
>> On 12/24/2012 04:09 PM, Tang Chen wrote:
>>> From: Wen Congyang <wency@cn.fujitsu.com>
>>>
>>> memory can't be offlined when CONFIG_MEMCG is selected.
>>> For example: there is a memory device on node 1. The address range
>>> is [1G, 1.5G). You will find 4 new directories memory8, memory9, memory10,
>>> and memory11 under the directory /sys/devices/system/memory/.
>>>
>>> If CONFIG_MEMCG is selected, we will allocate memory to store page cgroup
>>> when we online pages. When we online memory8, the memory stored page cgroup
>>> is not provided by this memory device. But when we online memory9, the memory
>>> stored page cgroup may be provided by memory8. So we can't offline memory8
>>> now. We should offline the memory in the reversed order.
>>>
>>> When the memory device is hotremoved, we will auto offline memory provided
>>> by this memory device. But we don't know which memory is onlined first, so
>>> offlining memory may fail. In such case, iterate twice to offline the memory.
>>> 1st iterate: offline every non primary memory block.
>>> 2nd iterate: offline primary (i.e. first added) memory block.
>>>
>>> This idea is suggested by KOSAKI Motohiro.
>>>
>>> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
>>
>> Maybe there is something here that I am missing - I admit that I came
>> late to this one, but this really sounds like a very ugly hack, that
>> really has no place in here.
>>
>> Retrying, of course, may make sense, if we have reasonable belief that
>> we may now succeed. If this is the case, you need to document - in the
>> code - while is that.
>>
>> The memcg argument, however, doesn't really cut it. Why can't we make
>> all page_cgroup allocations local to the node they are describing? If
>> memcg is the culprit here, we should fix it, and not retry. If there is
>> still any benefit in retrying, then we retry being very specific about why.
>
> We try to make all page_cgroup allocations local to the node they are describing
> now. If the memory is the first memory onlined in this node, we will allocate
> it from the other node.
>
> For example, node1 has 4 memory blocks: 8-11, and we online it from 8 to 11
> 1. memory block 8, page_cgroup allocations are in the other nodes
> 2. memory block 9, page_cgroup allocations are in memory block 8
>
> So we should offline memory block 9 first. But we don't know in which order
> the user online the memory block.
>
> I think we can modify memcg like this:
> allocate the memory from the memory block they are describing
>
> I am not sure it is OK to do so.
I don't see a reason why not.
You would have to tweak a bit the lookup function for page_cgroup, but
assuming you will always have the pfns and limits, it should be easy to do.
I think the only tricky part is that today we have a single
node_page_cgroup, and we would of course have to have one per memory
block. My assumption is that the number of memory blocks is limited and
likely not very big. So even a static array would do.
Kamezawa, do you have any input in here?
^ permalink raw reply
* Re: [PATCH 7/8] mm: use vm_unmapped_area() on powerpc architecture
From: Michel Lespinasse @ 2013-01-09 11:23 UTC (permalink / raw)
To: Benjamin Herrenschmidt
Cc: Rik van Riel, Tony Luck, linux-ia64, linux-parisc,
James E.J. Bottomley, linux-kernel, David Howells, linux-mm,
linux-alpha, Matt Turner, linuxppc-dev, Andrew Morton
In-Reply-To: <1357702376.4838.32.camel@pasglop>
On Wed, Jan 09, 2013 at 02:32:56PM +1100, Benjamin Herrenschmidt wrote:
> Ok. I think at least you can move that construct:
>
> + if (addr < SLICE_LOW_TOP) {
> + slice = GET_LOW_SLICE_INDEX(addr);
> + addr = (slice + 1) << SLICE_LOW_SHIFT;
> + if (!(available.low_slices & (1u << slice)))
> + continue;
> + } else {
> + slice = GET_HIGH_SLICE_INDEX(addr);
> + addr = (slice + 1) << SLICE_HIGH_SHIFT;
> + if (!(available.high_slices & (1u << slice)))
> + continue;
> + }
>
> Into some kind of helper. It will probably compile to the same thing but
> at least it's more readable and it will avoid a fuckup in the future if
> somebody changes the algorithm and forgets to update one of the
> copies :-)
All right, does the following look more palatable then ?
(didn't re-test it, though)
Signed-off-by: Michel Lespinasse <walken@google.com>
---
arch/powerpc/mm/slice.c | 123 ++++++++++++++++++++++++++++++-----------------
1 files changed, 78 insertions(+), 45 deletions(-)
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 999a74f25ebe..3e99c149271a 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -237,36 +237,69 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
#endif
}
+/*
+ * Compute which slice addr is part of;
+ * set *boundary_addr to the start or end boundary of that slice
+ * (depending on 'end' parameter);
+ * return boolean indicating if the slice is marked as available in the
+ * 'available' slice_mark.
+ */
+static bool slice_scan_available(unsigned long addr,
+ struct slice_mask available,
+ int end,
+ unsigned long *boundary_addr)
+{
+ unsigned long slice;
+ if (addr < SLICE_LOW_TOP) {
+ slice = GET_LOW_SLICE_INDEX(addr);
+ *boundary_addr = (slice + end) << SLICE_LOW_SHIFT;
+ return !!(available.low_slices & (1u << slice));
+ } else {
+ slice = GET_HIGH_SLICE_INDEX(addr);
+ *boundary_addr = (slice + end) ?
+ ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
+ return !!(available.high_slices & (1u << slice));
+ }
+}
+
static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
unsigned long len,
struct slice_mask available,
int psize)
{
- struct vm_area_struct *vma;
- unsigned long addr;
- struct slice_mask mask;
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+ unsigned long addr, found, next_end;
+ struct vm_unmapped_area_info info;
- addr = TASK_UNMAPPED_BASE;
-
- for (;;) {
- addr = _ALIGN_UP(addr, 1ul << pshift);
- if ((TASK_SIZE - len) < addr)
- break;
- vma = find_vma(mm, addr);
- BUG_ON(vma && (addr >= vma->vm_end));
+ info.flags = 0;
+ info.length = len;
+ info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+ info.align_offset = 0;
- mask = slice_range_to_mask(addr, len);
- if (!slice_check_fit(mask, available)) {
- if (addr < SLICE_LOW_TOP)
- addr = _ALIGN_UP(addr + 1, 1ul << SLICE_LOW_SHIFT);
- else
- addr = _ALIGN_UP(addr + 1, 1ul << SLICE_HIGH_SHIFT);
+ addr = TASK_UNMAPPED_BASE;
+ while (addr < TASK_SIZE) {
+ info.low_limit = addr;
+ if (!slice_scan_available(addr, available, 1, &addr))
continue;
+
+ next_slice:
+ /*
+ * At this point [info.low_limit; addr) covers
+ * available slices only and ends at a slice boundary.
+ * Check if we need to reduce the range, or if we can
+ * extend it to cover the next available slice.
+ */
+ if (addr >= TASK_SIZE)
+ addr = TASK_SIZE;
+ else if (slice_scan_available(addr, available, 1, &next_end)) {
+ addr = next_end;
+ goto next_slice;
}
- if (!vma || addr + len <= vma->vm_start)
- return addr;
- addr = vma->vm_end;
+ info.high_limit = addr;
+
+ found = vm_unmapped_area(&info);
+ if (!(found & ~PAGE_MASK))
+ return found;
}
return -ENOMEM;
@@ -277,39 +310,39 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
struct slice_mask available,
int psize)
{
- struct vm_area_struct *vma;
- unsigned long addr;
- struct slice_mask mask;
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+ unsigned long addr, found, prev;
+ struct vm_unmapped_area_info info;
- addr = mm->mmap_base;
- while (addr > len) {
- /* Go down by chunk size */
- addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+ info.align_offset = 0;
- /* Check for hit with different page size */
- mask = slice_range_to_mask(addr, len);
- if (!slice_check_fit(mask, available)) {
- if (addr < SLICE_LOW_TOP)
- addr = _ALIGN_DOWN(addr, 1ul << SLICE_LOW_SHIFT);
- else if (addr < (1ul << SLICE_HIGH_SHIFT))
- addr = SLICE_LOW_TOP;
- else
- addr = _ALIGN_DOWN(addr, 1ul << SLICE_HIGH_SHIFT);
+ addr = mm->mmap_base;
+ while (addr > PAGE_SIZE) {
+ info.high_limit = addr;
+ if (!slice_scan_available(addr - 1, available, 0, &addr))
continue;
- }
+ prev_slice:
/*
- * Lookup failure means no vma is above this address,
- * else if new region fits below vma->vm_start,
- * return with success:
+ * At this point [addr; info.high_limit) covers
+ * available slices only and starts at a slice boundary.
+ * Check if we need to reduce the range, or if we can
+ * extend it to cover the previous available slice.
*/
- vma = find_vma(mm, addr);
- if (!vma || (addr + len) <= vma->vm_start)
- return addr;
+ if (addr < PAGE_SIZE)
+ addr = PAGE_SIZE;
+ else if (slice_scan_available(addr - 1, available, 0, &prev)) {
+ addr = prev;
+ goto prev_slice;
+ }
+ info.low_limit = addr;
- /* try just below the current vma->vm_start */
- addr = vma->vm_start;
+ found = vm_unmapped_area(&info);
+ if (!(found & ~PAGE_MASK))
+ return found;
}
/*
--
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox