* [PATCH 2/3] powerpc/pseries: Re-organise the oops compression code
From: Aruna Balakrishnaiah @ 2013-04-26 9:56 UTC (permalink / raw)
To: linuxppc-dev, paulus, linux-kernel, benh
Cc: jkenisto, tony.luck, mahesh, cbouatmailru, anton, ccross,
keescook
In-Reply-To: <20130426094923.14323.80567.stgit@aruna-ThinkPad-T420>
nvram_compress() and zip_oops() are used by the nvram_pstore_write
API to compress oops messages; hence, re-organise the functions
accordingly to avoid forward declarations.
Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
---
arch/powerpc/platforms/pseries/nvram.c | 104 ++++++++++++++++----------------
1 file changed, 52 insertions(+), 52 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 14cc486..0159d74 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -486,6 +486,58 @@ static int clobbering_unread_rtas_event(void)
NVRAM_RTAS_READ_TIMEOUT);
}
+/* Derived from logfs_compress() */
+static int nvram_compress(const void *in, void *out, size_t inlen,
+ size_t outlen)
+{
+ int err, ret;
+
+ ret = -EIO;
+ err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
+ MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ if (err != Z_OK)
+ goto error;
+
+ stream.next_in = in;
+ stream.avail_in = inlen;
+ stream.total_in = 0;
+ stream.next_out = out;
+ stream.avail_out = outlen;
+ stream.total_out = 0;
+
+ err = zlib_deflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END)
+ goto error;
+
+ err = zlib_deflateEnd(&stream);
+ if (err != Z_OK)
+ goto error;
+
+ if (stream.total_out >= stream.total_in)
+ goto error;
+
+ ret = stream.total_out;
+error:
+ return ret;
+}
+
+/* Compress the text from big_oops_buf into oops_buf. */
+static int zip_oops(size_t text_len)
+{
+ struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
+ int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
+ oops_data_sz);
+ if (zipped_len < 0) {
+ pr_err("nvram: compression failed; returned %d\n", zipped_len);
+ pr_err("nvram: logging uncompressed oops/panic report\n");
+ return -1;
+ }
+ oops_hdr->version = OOPS_HDR_VERSION;
+ oops_hdr->report_length = (u16) zipped_len;
+ oops_hdr->timestamp = get_seconds();
+ return 0;
+}
+
#ifdef CONFIG_PSTORE
static int nvram_pstore_open(struct pstore_info *psi)
{
@@ -757,58 +809,6 @@ int __init pSeries_nvram_init(void)
}
-/* Derived from logfs_compress() */
-static int nvram_compress(const void *in, void *out, size_t inlen,
- size_t outlen)
-{
- int err, ret;
-
- ret = -EIO;
- err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
- MEM_LEVEL, Z_DEFAULT_STRATEGY);
- if (err != Z_OK)
- goto error;
-
- stream.next_in = in;
- stream.avail_in = inlen;
- stream.total_in = 0;
- stream.next_out = out;
- stream.avail_out = outlen;
- stream.total_out = 0;
-
- err = zlib_deflate(&stream, Z_FINISH);
- if (err != Z_STREAM_END)
- goto error;
-
- err = zlib_deflateEnd(&stream);
- if (err != Z_OK)
- goto error;
-
- if (stream.total_out >= stream.total_in)
- goto error;
-
- ret = stream.total_out;
-error:
- return ret;
-}
-
-/* Compress the text from big_oops_buf into oops_buf. */
-static int zip_oops(size_t text_len)
-{
- struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
- int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
- oops_data_sz);
- if (zipped_len < 0) {
- pr_err("nvram: compression failed; returned %d\n", zipped_len);
- pr_err("nvram: logging uncompressed oops/panic report\n");
- return -1;
- }
- oops_hdr->version = OOPS_HDR_VERSION;
- oops_hdr->report_length = (u16) zipped_len;
- oops_hdr->timestamp = get_seconds();
- return 0;
-}
-
/*
* This is our kmsg_dump callback, called after an oops or panic report
* has been written to the printk buffer. We want to capture as much
^ permalink raw reply related
* [PATCH 3/3] powerpc/pseries: Support compression of oops text via pstore
From: Aruna Balakrishnaiah @ 2013-04-26 9:56 UTC (permalink / raw)
To: linuxppc-dev, paulus, linux-kernel, benh
Cc: jkenisto, tony.luck, mahesh, cbouatmailru, anton, ccross,
keescook
In-Reply-To: <20130426094923.14323.80567.stgit@aruna-ThinkPad-T420>
This patch set supports compression of oops messages while writing to NVRAM,
which helps in capturing more of the oops data in lnx,oops-log. The pstore file
for oops messages will be in decompressed format, making it readable.
In case compression fails, the patch takes care of copying the header added
by pstore and last oops_data_sz bytes of big_oops_buf to NVRAM so that we
have recent oops messages in lnx,oops-log.
In case decompression fails, the oops file will be absent, but the files
(in /dev/pstore) for the other partitions will still be available.
Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
---
arch/powerpc/platforms/pseries/nvram.c | 132 +++++++++++++++++++++++++++++---
1 file changed, 118 insertions(+), 14 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 0159d74..b5ba5e2 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -539,6 +539,65 @@ static int zip_oops(size_t text_len)
}
#ifdef CONFIG_PSTORE
+/* Derived from logfs_uncompress */
+int nvram_decompress(void *in, void *out, size_t inlen, size_t outlen)
+{
+ int err, ret;
+
+ ret = -EIO;
+ err = zlib_inflateInit(&stream);
+ if (err != Z_OK)
+ goto error;
+
+ stream.next_in = in;
+ stream.avail_in = inlen;
+ stream.total_in = 0;
+ stream.next_out = out;
+ stream.avail_out = outlen;
+ stream.total_out = 0;
+
+ err = zlib_inflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END)
+ goto error;
+
+ err = zlib_inflateEnd(&stream);
+ if (err != Z_OK)
+ goto error;
+
+ ret = stream.total_out;
+error:
+ return ret;
+}
+
+static int unzip_oops(char *oops_buf, char *big_buf)
+{
+ struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
+ u64 timestamp = oops_hdr->timestamp;
+ char *big_oops_data = NULL;
+ char *oops_data_buf = NULL;
+ size_t big_oops_data_sz;
+ int unzipped_len;
+
+ big_oops_data = big_buf + sizeof(struct oops_log_info);
+ big_oops_data_sz = big_oops_buf_sz - sizeof(struct oops_log_info);
+ oops_data_buf = oops_buf + sizeof(struct oops_log_info);
+
+ unzipped_len = nvram_decompress(oops_data_buf, big_oops_data,
+ oops_hdr->report_length,
+ big_oops_data_sz);
+
+ if (unzipped_len < 0) {
+ pr_err("nvram: decompression failed; returned %d\n",
+ unzipped_len);
+ return -1;
+ }
+ oops_hdr = (struct oops_log_info *)big_buf;
+ oops_hdr->version = OOPS_HDR_VERSION;
+ oops_hdr->report_length = (u16) unzipped_len;
+ oops_hdr->timestamp = timestamp;
+ return 0;
+}
+
static int nvram_pstore_open(struct pstore_info *psi)
{
/* Reset the iterator to start reading partitions again */
@@ -567,6 +626,7 @@ static int nvram_pstore_write(enum pstore_type_id type,
size_t size, struct pstore_info *psi)
{
int rc;
+ unsigned int err_type = ERR_TYPE_KERNEL_PANIC;
struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf;
/* part 1 has the recent messages from printk buffer */
@@ -577,8 +637,31 @@ static int nvram_pstore_write(enum pstore_type_id type,
oops_hdr->version = OOPS_HDR_VERSION;
oops_hdr->report_length = (u16) size;
oops_hdr->timestamp = get_seconds();
+
+ if (big_oops_buf) {
+ rc = zip_oops(size);
+ /*
+ * If compression fails copy recent log messages from
+ * big_oops_buf to oops_data.
+ */
+ if (rc != 0) {
+ int hsize = pstore_get_header_size();
+ size_t diff = size - oops_data_sz + hsize;
+
+ if (size > oops_data_sz) {
+ memcpy(oops_data, big_oops_buf, hsize);
+ memcpy(oops_data + hsize, big_oops_buf + diff,
+ oops_data_sz - hsize);
+
+ oops_hdr->report_length = (u16) oops_data_sz;
+ } else
+ memcpy(oops_data, big_oops_buf, size);
+ } else
+ err_type = ERR_TYPE_KERNEL_PANIC_GZ;
+ }
+
rc = nvram_write_os_partition(&oops_log_partition, oops_buf,
- (int) (sizeof(*oops_hdr) + size), ERR_TYPE_KERNEL_PANIC,
+ (int) (sizeof(*oops_hdr) + oops_hdr->report_length), err_type,
count);
if (rc != 0)
@@ -600,10 +683,11 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
struct oops_log_info *oops_hdr;
unsigned int err_type, id_no, size = 0;
struct nvram_os_partition *part = NULL;
- char *buff = NULL;
- int sig = 0;
+ char *buff = NULL, *big_buff = NULL;
+ int rc, sig = 0;
loff_t p;
+read_partition:
read_type++;
switch (nvram_type_ids[read_type]) {
@@ -666,6 +750,25 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) {
oops_hdr = (struct oops_log_info *)buff;
*buf = buff + sizeof(*oops_hdr);
+
+ if (err_type == ERR_TYPE_KERNEL_PANIC_GZ) {
+ big_buff = kmalloc(big_oops_buf_sz, GFP_KERNEL);
+ if (!big_buff)
+ return -ENOMEM;
+
+ rc = unzip_oops(buff, big_buff);
+
+ if (rc != 0) {
+ kfree(buff);
+ kfree(big_buff);
+ goto read_partition;
+ }
+
+ oops_hdr = (struct oops_log_info *)big_buff;
+ *buf = big_buff + sizeof(*oops_hdr);
+ kfree(buff);
+ }
+
time->tv_sec = oops_hdr->timestamp;
time->tv_nsec = 0;
return oops_hdr->report_length;
@@ -687,17 +790,18 @@ static int nvram_pstore_init(void)
{
int rc = 0;
- nvram_pstore_info.buf = oops_data;
- nvram_pstore_info.bufsize = oops_data_sz;
+ if (big_oops_buf) {
+ nvram_pstore_info.buf = big_oops_buf;
+ nvram_pstore_info.bufsize = big_oops_buf_sz;
+ } else {
+ nvram_pstore_info.buf = oops_data;
+ nvram_pstore_info.bufsize = oops_data_sz;
+ }
rc = pstore_register(&nvram_pstore_info);
if (rc != 0)
pr_err("nvram: pstore_register() failed, defaults to "
"kmsg_dump; returned %d\n", rc);
- else
- /*TODO: Support compression when pstore is configured */
- pr_info("nvram: Compression of oops text supported only when "
- "pstore is not configured");
return rc;
}
@@ -731,11 +835,6 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists)
oops_data = oops_buf + sizeof(struct oops_log_info);
oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info);
- rc = nvram_pstore_init();
-
- if (!rc)
- return;
-
/*
* Figure compression (preceded by elimination of each line's <n>
* severity prefix) will reduce the oops/panic report to at most
@@ -759,6 +858,11 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists)
stream.workspace = NULL;
}
+ rc = nvram_pstore_init();
+
+ if (!rc)
+ return;
+
rc = kmsg_dump_register(&nvram_kmsg_dumper);
if (rc != 0) {
pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc);
^ permalink raw reply related
* Re: [PATCH 0/8 v3] KVM: PPC: e500: Enable FSL e6500 core
From: Alexander Graf @ 2013-04-26 13:48 UTC (permalink / raw)
To: Mihai Caraman; +Cc: linuxppc-dev, kvm, kvm-ppc
In-Reply-To: <1365674594-17410-1-git-send-email-mihai.caraman@freescale.com>
On 11.04.2013, at 12:03, Mihai Caraman wrote:
> Enable basic support for Freescale e6500 core, adding MAV 2.0 support.
> Validated on T4240QDS platform. Altivec, Multithreading and HW Tablewalk
> are not addressed by this patchset.
Thanks, applied all to kvm-ppc-queue.
Alex
>
> Mihai Caraman (8):
> KVM: PPC: Book3E: Refactor ONE_REG ioctl implementation
> KVM: PPC: e500: Expose MMU registers via ONE_REG
> KVM: PPC: e500: Move vcpu's MMU configuration to dedicated functions
> KVM: PPC: e500: Add support for TLBnPS registers
> KVM: PPC: e500: Add support for EPTCFG register
> KVM: PPC: e500: Remove E.PT and E.HV.LRAT categories from VCPUs
> KVM: PPC: e500mc: Enable e6500 cores
> KVM: PPC: e500: Add e6500 core to Kconfig description
>
> Documentation/virtual/kvm/api.txt | 16 +++
> arch/powerpc/include/asm/kvm_host.h | 2 +
> arch/powerpc/include/uapi/asm/kvm.h | 22 ++++
> arch/powerpc/kvm/44x.c | 12 ++
> arch/powerpc/kvm/Kconfig | 6 +-
> arch/powerpc/kvm/booke.c | 102 ++++++++++---------
> arch/powerpc/kvm/e500.c | 14 +++
> arch/powerpc/kvm/e500.h | 22 ++++
> arch/powerpc/kvm/e500_emulate.c | 19 ++++
> arch/powerpc/kvm/e500_mmu.c | 192 +++++++++++++++++++++++++++++++----
> arch/powerpc/kvm/e500mc.c | 16 +++
> 11 files changed, 351 insertions(+), 72 deletions(-)
>
> --
> 1.7.4.1
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Status of I2C mac-drivers needing attach_adapter?
From: Wolfram Sang @ 2013-04-26 15:31 UTC (permalink / raw)
To: linux-i2c; +Cc: linuxppc-dev
Hi Ben,
because I will push a patch to get rid of detach_adapter() from the I2C
core in the next merge window, I was curious to find out about the
status of attach_adapter().
I found commit 6cd3209967469f6e89d329deda6bb0b4700e7b62
(powerpc/powermac: New windfarm driver for PowerMac G5 (AGP) and Xserve
G5) which should get rid of therm_pm72 if I understand correctly? Or is
it still needed? Can you give me an overview if there is/was progress
regarding the other two drivers therm_windtunnel and sound/ppc/keywest?
Thanks,
Wolfram
^ permalink raw reply
* [PATCH] powerpc: Bring all threads online prior to migration/hibernation
From: Robert Jennings @ 2013-04-26 21:32 UTC (permalink / raw)
To: linuxppc-dev; +Cc: stable
With this patch before a migration/hibernation all threads present but
not online will be brought online. After migration/hibernation those
threads are taken back offline.
During migration/hibernation all online CPUs must call H_JOIN, this is
required by the hypervisor. Without this patch, threads that are offline
(H_CEDE'd) will not be woken to make the H_JOIN call and the OS will be
deadlocked (all threads either JOIN'd or CEDE'd).
Cc: <stable@kernel.org>
Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/rtas.h | 2 +
arch/powerpc/kernel/rtas.c | 95 ++++++++++++++++++++++++++++++
arch/powerpc/platforms/pseries/suspend.c | 22 +++++++
3 files changed, 119 insertions(+)
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index aef00c6..ee38f29 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -262,6 +262,8 @@ extern void rtas_progress(char *s, unsigned short hex);
extern void rtas_initialize(void);
extern int rtas_suspend_cpu(struct rtas_suspend_me_data *data);
extern int rtas_suspend_last_cpu(struct rtas_suspend_me_data *data);
+extern int rtas_online_cpus_mask(cpumask_var_t cpus);
+extern int rtas_offline_cpus_mask(cpumask_var_t cpus);
extern int rtas_ibm_suspend_me(struct rtas_args *);
struct rtc_time;
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 1fd6e7b..855ee98 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -19,6 +19,7 @@
#include <linux/init.h>
#include <linux/capability.h>
#include <linux/delay.h>
+#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
@@ -807,6 +808,77 @@ static void rtas_percpu_suspend_me(void *info)
__rtas_suspend_cpu((struct rtas_suspend_me_data *)info, 1);
}
+enum rtas_cpu_state {
+ DOWN,
+ UP,
+};
+
+/* On return cpumask will be altered to indicate CPUs changed */
+static int rtas_cpu_state_change_mask(enum rtas_cpu_state state,
+ cpumask_var_t cpus)
+{
+ int cpu;
+ int cpuret = 0;
+ int ret = 0;
+
+ if (cpumask_empty(cpus))
+ return 0;
+
+ for_each_cpu(cpu, cpus) {
+ switch (state) {
+ case DOWN:
+ cpuret = cpu_down(cpu);
+ break;
+ case UP:
+ cpuret = cpu_up(cpu);
+ break;
+ }
+ if (cpuret) {
+ pr_debug("%s: cpu_%s for cpu#%d returned %d.\n",
+ __func__,
+ ((state == UP) ? "up" : "down"),
+ cpu, cpuret);
+ if (!ret)
+ ret = cpuret;
+ if (state == UP) {
+ cpumask_shift_right(cpus, cpus, cpu);
+ cpumask_shift_left(cpus, cpus, cpu);
+ break;
+ } else
+ cpumask_clear_cpu(cpu, cpus);
+ }
+ }
+
+ return ret;
+}
+
+int rtas_online_cpus_mask(cpumask_var_t cpus)
+{
+ int ret;
+
+ ret = rtas_cpu_state_change_mask(UP, cpus);
+
+ if (ret) {
+ cpumask_var_t tmp_mask;
+
+ if (!alloc_cpumask_var(&tmp_mask, GFP_TEMPORARY))
+ return ret;
+
+ cpumask_copy(tmp_mask, cpus);
+ rtas_offline_cpus_mask(tmp_mask);
+ free_cpumask_var(tmp_mask);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(rtas_online_cpus_mask);
+
+int rtas_offline_cpus_mask(cpumask_var_t cpus)
+{
+ return rtas_cpu_state_change_mask(DOWN, cpus);
+}
+EXPORT_SYMBOL(rtas_offline_cpus_mask);
+
int rtas_ibm_suspend_me(struct rtas_args *args)
{
long state;
@@ -814,6 +886,8 @@ int rtas_ibm_suspend_me(struct rtas_args *args)
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
struct rtas_suspend_me_data data;
DECLARE_COMPLETION_ONSTACK(done);
+ cpumask_var_t offline_mask;
+ int cpuret;
if (!rtas_service_present("ibm,suspend-me"))
return -ENOSYS;
@@ -837,11 +911,24 @@ int rtas_ibm_suspend_me(struct rtas_args *args)
return 0;
}
+ if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY))
+ return -ENOMEM;
+
atomic_set(&data.working, 0);
atomic_set(&data.done, 0);
atomic_set(&data.error, 0);
data.token = rtas_token("ibm,suspend-me");
data.complete = &done;
+
+ /* All present CPUs must be online */
+ cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask);
+ cpuret = rtas_online_cpus_mask(offline_mask);
+ if (cpuret) {
+ pr_err("%s: Could not bring present CPUs online.\n", __func__);
+ atomic_set(&data.error, cpuret);
+ goto out;
+ }
+
stop_topology_update();
/* Call function on all CPUs. One of us will make the
@@ -857,6 +944,14 @@ int rtas_ibm_suspend_me(struct rtas_args *args)
start_topology_update();
+ /* Take down CPUs not online prior to suspend */
+ cpuret = rtas_offline_cpus_mask(offline_mask);
+ if (cpuret)
+ pr_warn("%s: Could not restore CPUs to offline state.\n",
+ __func__);
+
+out:
+ free_cpumask_var(offline_mask);
return atomic_read(&data.error);
}
#else /* CONFIG_PPC_PSERIES */
diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c
index 47226e0..5f997e7 100644
--- a/arch/powerpc/platforms/pseries/suspend.c
+++ b/arch/powerpc/platforms/pseries/suspend.c
@@ -16,6 +16,7 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/suspend.h>
#include <linux/stat.h>
@@ -126,11 +127,15 @@ static ssize_t store_hibernate(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
+ cpumask_var_t offline_mask;
int rc;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY))
+ return -ENOMEM;
+
stream_id = simple_strtoul(buf, NULL, 16);
do {
@@ -140,15 +145,32 @@ static ssize_t store_hibernate(struct device *dev,
} while (rc == -EAGAIN);
if (!rc) {
+ /* All present CPUs must be online */
+ cpumask_andnot(offline_mask, cpu_present_mask,
+ cpu_online_mask);
+ rc = rtas_online_cpus_mask(offline_mask);
+ if (rc) {
+ pr_err("%s: Could not bring present CPUs online.\n",
+ __func__);
+ goto out;
+ }
+
stop_topology_update();
rc = pm_suspend(PM_SUSPEND_MEM);
start_topology_update();
+
+ /* Take down CPUs not online prior to suspend */
+ if (!rtas_offline_cpus_mask(offline_mask))
+ pr_warn("%s: Could not restore CPUs to offline "
+ "state.\n", __func__);
}
stream_id = 0;
if (!rc)
rc = count;
+out:
+ free_cpumask_var(offline_mask);
return rc;
}
--
1.7.10.4
^ permalink raw reply related
* Re: [PATCH 1/3] rapidio: make enumeration/discovery configurable
From: Andrew Morton @ 2013-04-26 22:53 UTC (permalink / raw)
To: Alexandre Bounine
Cc: Micha Nelissen, linux-kernel, Andre van Herk, linuxppc-dev
In-Reply-To: <1366813919-13766-2-git-send-email-alexandre.bounine@idt.com>
On Wed, 24 Apr 2013 10:31:57 -0400 Alexandre Bounine <alexandre.bounine@idt.com> wrote:
> Rework to implement RapidIO enumeration/discovery method selection
> combined with ability to use enumeration/discovery as a kernel module.
>
> This patch adds ability to introduce new RapidIO enumeration/discovery methods
> using kernel configuration options or loadable modules. Configuration option
> mechanism allows to select built-in or modular enumeration/discovery method from
> the list of existing methods or use external modules.
> If a modular enumeration/discovery is selected each RapidIO mport device can
> have its own method attached to it.
>
> The currently existing enumeration/discovery code was updated to be used
> as built-in or modular method. This configuration option is named "Basic
> enumeration/discovery" method.
>
> Several common routines have been moved from rio-scan.c to make them available
> to other enumeration methods and reduce number of exported symbols.
>
> ...
>
> --- a/drivers/rapidio/Kconfig
> +++ b/drivers/rapidio/Kconfig
> @@ -47,4 +47,23 @@ config RAPIDIO_DEBUG
>
> If you are unsure about this, say N here.
>
> +choice
> + prompt "Enumeration method"
> + depends on RAPIDIO
> + help
> + There are different enumeration and discovery mechanisms offered
> + for RapidIO subsystem. You may select single built-in method or
> + or any number of methods to be built as modules.
> + Selecting a built-in method disables use of loadable methods.
> +
> + If unsure, select Basic built-in.
> +
> +config RAPIDIO_ENUM_BASIC
> + tristate "Basic"
> + help
> + This option includes basic RapidIO fabric enumeration and discovery
> + mechanism similar to one described in RapidIO specification Annex 1.
> +
> +endchoice
This Kconfig change makes my kbuild do Weird Things.
make mrproper ; yes "" | make allmodconfig ; make 2>/tmp/x
: scripts/kconfig/conf --silentoldconfig Kconfig
: *
: * Restart config...
: *
: *
: * Bus options (PCI etc.)
: *
: PCI support (PCI) [Y/n/?] y
: Support mmconfig PCI config space access (PCI_MMCONFIG) [Y/n] y
: Read CNB20LE Host Bridge Windows (PCI_CNB20LE_QUIRK) [Y/n/?] y
: PCI Express support (PCIEPORTBUS) [Y/n/?] y
: PCI Express Hotplug driver (HOTPLUG_PCI_PCIE) [M/n/?] m
: Root Port Advanced Error Reporting support (PCIEAER) [Y/n/?] y
: PCI Express ECRC settings control (PCIE_ECRC) [Y/n/?] y
: PCIe AER error injector support (PCIEAER_INJECT) [M/n/y/?] m
: PCI Express ASPM control (PCIEASPM) [Y/n/?] y
: Debug PCI Express ASPM (PCIEASPM_DEBUG) [Y/n/?] y
: Default ASPM policy
: > 1. BIOS default (PCIEASPM_DEFAULT)
: 2. Powersave (PCIEASPM_POWERSAVE)
: 3. Performance (PCIEASPM_PERFORMANCE)
: choice[1-3]: 1
: Message Signaled Interrupts (MSI and MSI-X) (PCI_MSI) [Y/?] y
: PCI Debugging (PCI_DEBUG) [Y/n/?] y
: Enable PCI resource re-allocation detection (PCI_REALLOC_ENABLE_AUTO) [Y/n/?] y
: PCI Stub driver (PCI_STUB) [M/n/y/?] m
: Xen PCI Frontend (XEN_PCIDEV_FRONTEND) [M/n/y/?] m
: Interrupts on hypertransport devices (HT_IRQ) [Y/n/?] y
: PCI IOV support (PCI_IOV) [Y/n/?] y
: PCI PRI support (PCI_PRI) [Y/?] y
: PCI PASID support (PCI_PASID) [Y/?] y
: PCI IO-APIC hotplug support (PCI_IOAPIC) [M/n/y] m
: ISA-style DMA support (ISA_DMA_API) [Y/n/?] y
: RapidIO support (RAPIDIO) [Y/n/?] y
: IDT Tsi721 PCI Express SRIO Controller support (RAPIDIO_TSI721) [Y/n/?] y
: Discovery timeout duration (seconds) (RAPIDIO_DISC_TIMEOUT) [30] 30
: Enable RapidIO Input/Output Ports (RAPIDIO_ENABLE_RX_TX_PORTS) [Y/n/?] y
: DMA Engine support for RapidIO (RAPIDIO_DMA_ENGINE) [Y/n/?] y
: RapidIO subsystem debug messages (RAPIDIO_DEBUG) [Y/n/?] y
: Enumeration method [M/y/?] (NEW) aborted!
:
: Console input/output is redirected. Run 'make oldconfig' to update configuration.
:
: SYSHDR arch/x86/syscalls/../include/generated/uapi/asm/unistd_32.h
: SYSHDR arch/x86/syscalls/../include/generated/uapi/asm/unistd_64.h
: SYSHDR arch/x86/syscalls/../include/generated/uapi/asm/unistd_x32.h
See the "Enumeration method [M/y/?] (NEW) aborted!"
Note that this only happens when make's stderr is redirected.
I've no idea what's going on here. This appears to fix things:
--- a/drivers/rapidio/Kconfig~rapidio-make-enumeration-discovery-configurable-fix
+++ a/drivers/rapidio/Kconfig
@@ -59,7 +59,7 @@ choice
If unsure, select Basic built-in.
config RAPIDIO_ENUM_BASIC
- tristate "Basic"
+ bool "Basic"
help
This option includes basic RapidIO fabric enumeration and discovery
mechanism similar to one described in RapidIO specification Annex 1.
but doesn't appear to be what you intended.
^ permalink raw reply
* RE: [PATCH 2/2 V7] powerpc/85xx: Add machine check handler to fix PCIe erratum on mpc85xx
From: Jia Hongtao-B38951 @ 2013-04-27 2:26 UTC (permalink / raw)
To: Wood Scott-B07421, Segher Boessenkool; +Cc: linuxppc-dev@lists.ozlabs.org
In-Reply-To: <1366909096.30341.3@snotra>
> -----Original Message-----
> From: Wood Scott-B07421
> Sent: Friday, April 26, 2013 12:58 AM
> To: Segher Boessenkool
> Cc: Jia Hongtao-B38951; linuxppc-dev@lists.ozlabs.org;
> galak@kernel.crashing.org; Wood Scott-B07421
> Subject: Re: [PATCH 2/2 V7] powerpc/85xx: Add machine check handler to
> fix PCIe erratum on mpc85xx
>
> On 04/25/2013 10:31:51 AM, Segher Boessenkool wrote:
> >> * Remove A variant of load instruction emulation
> >
> > Why is this? You handle all other simple load insns, there is nothing
> > special about LHA. (I reviewed the V4 email thread, no reason for the
> > chance is given there).
>
> The LHA implementation in V5 was incorrect (didn't sign-extend).
>
> -Scott
In a former email you doubted whether we need the A variant or not.
Any particular reason for that?
If not, should I emulate all of the A, ARX, AU, AUX and AX variants?
Thanks.
-Hongtao.
^ permalink raw reply
* Re: [PATCH v2] PowerPC: kernel: compiling issue, make additional room in exception vector area
From: Chen Gang F T @ 2013-04-27 9:28 UTC (permalink / raw)
To: Mike Qiu
Cc: sfr, Michael Neuling, matt, Chen Gang, linux-kernel,
Paul Mackerras, Aneesh Kumar K.V, linuxppc-dev
In-Reply-To: <5179FA8E.9090501@linux.vnet.ibm.com>
On 2013年04月26日 11:54, Mike Qiu wrote:
> 于 2013/4/26 11:42, Chen Gang 写道:
>> On 2013年04月26日 11:25, Chen Gang wrote:
>>> On 2013年04月26日 11:08, Mike Qiu wrote:
>>>> 于 2013/4/26 10:06, Chen Gang 写道:
>>>>> On 2013年04月26日 10:03, Mike Qiu wrote:
>>>>>> 于 2013/4/26 9:36, Chen Gang 写道:
>>>>>>>> On 2013年04月26日 09:18, Chen Gang wrote:
>>>>>>>>>> On 2013年04月26日 09:06, Chen Gang wrote:
>>>>>>>>>>>>>> CFAR is the Come From Register. It saves the location of the
>>>>>>>>>>>>>> last
>>>>>>>>>>>>>>>> branch and is hence overwritten by any branch.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>> Do we process it just like others done (e.g. 0x300, 0xe00,
>>>>>>>>>>>> 0xe20 ...) ?
>>>>>>>>>>>> . = 0x900
>>>>>>>>>>>> .globl decrementer_pSeries
>>>>>>>>>>>> decrementer_pSeries:
>>>>>>>>>>>> HMT_MEDIUM_PPR_DISCARD
>>>>>>>>>>>> SET_SCRATCH0(r13)
>>>>>>>>>>>> b decrementer_pSeries_0
>>>>>>>>>>>>
>>>>>>>>>>>> ...
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>> Oh, it seems EXCEPTION_PROLOG_1 will save the registers which are
>>>>>>>> related
>>>>>>>> to CFAR, so I think we need to move EXCEPTION_PROLOG_1 near 0x900.
>>>>>> I will try your diff V2, to see if the machine can boot up
>>>>> OK, thanks. (hope it can work)
>>>> It seems that the machine can boot up in powernv mode, but I'm not
>>>> sure if my machine calls that module.
>>>>
>>>> At least my machine can boot up
>> Please reference commit number: 1707dd161349e6c54170c88d94fed012e3d224e3
>> (1707dd1 powerpc: Save CFAR before branching in interrupt entry paths)
>>
>> What our diff v2 has done is just the fix for our patch v2 (just like
>> the commit 1707dd1 has done).
>>
>> Please check, thanks.
>>
>> :-)
> I will check this evening or tomorrow; I have something else to do this
> afternoon.
I think the diff v2 is correct, but it is not the best one for this issue.
I prefer Paul's patch for this issue, which has better performance
:-)
Thanks.
--
Chen Gang
Flying Transformer
^ permalink raw reply
* Re: [PATCH v2] PowerPC: kernel: compiling issue, make additional room in exception vector area
From: Mike Qiu @ 2013-04-27 9:32 UTC (permalink / raw)
To: Chen Gang F T
Cc: sfr, Michael Neuling, matt, Chen Gang, linux-kernel,
Paul Mackerras, Aneesh Kumar K.V, linuxppc-dev
In-Reply-To: <517B9A4D.1020502@gmail.com>
于 2013/4/27 17:28, Chen Gang F T 写道:
> On 2013年04月26日 11:54, Mike Qiu wrote:
>> 于 2013/4/26 11:42, Chen Gang 写道:
>>> On 2013年04月26日 11:25, Chen Gang wrote:
>>>> On 2013年04月26日 11:08, Mike Qiu wrote:
>>>>> 于 2013/4/26 10:06, Chen Gang 写道:
>>>>>> On 2013年04月26日 10:03, Mike Qiu wrote:
>>>>>>> 于 2013/4/26 9:36, Chen Gang 写道:
>>>>>>>>> On 2013年04月26日 09:18, Chen Gang wrote:
>>>>>>>>>>> On 2013年04月26日 09:06, Chen Gang wrote:
>>>>>>>>>>>>>>> CFAR is the Come From Register. It saves the location of the
>>>>>>>>>>>>>>> last
>>>>>>>>>>>>>>>>> branch and is hence overwritten by any branch.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>> Do we process it just like others done (e.g. 0x300, 0xe00,
>>>>>>>>>>>>> 0xe20 ...) ?
>>>>>>>>>>>>> . = 0x900
>>>>>>>>>>>>> .globl decrementer_pSeries
>>>>>>>>>>>>> decrementer_pSeries:
>>>>>>>>>>>>> HMT_MEDIUM_PPR_DISCARD
>>>>>>>>>>>>> SET_SCRATCH0(r13)
>>>>>>>>>>>>> b decrementer_pSeries_0
>>>>>>>>>>>>>
>>>>>>>>>>>>> ...
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>> Oh, it seems EXCEPTION_PROLOG_1 will save the registers which are
>>>>>>>>> related
>>>>>>>>> to CFAR, so I think we need to move EXCEPTION_PROLOG_1 near 0x900.
>>>>>>> I will try your diff V2, to see if the machine can boot up
>>>>>> OK, thanks. (hope it can work)
>>>>> It seems that the machine can boot up in powernv mode, but I'm not
>>>>> sure if my machine calls that module.
>>>>>
>>>>> At least my machine can boot up
>>> Please reference commit number: 1707dd161349e6c54170c88d94fed012e3d224e3
>>> (1707dd1 powerpc: Save CFAR before branching in interrupt entry paths)
>>>
>>> What our diff v2 has done is just the fix for our patch v2 (just like
>>> the commit 1707dd1 has done).
>>>
>>> Please check, thanks.
>>>
>>> :-)
>> I will check this evening or tomorrow; I have something else to do this
>> afternoon.
> I think the diff v2 is correct, but is not the best one for this issue.
>
> I prefer the Paul's patch for this issue which has better performance
>
> :-)
Yes, I used your patch and it works; Paul's patch works too.
>
> Thanks.
>
^ permalink raw reply
* Re: [PATCH v2] PowerPC: kernel: compiling issue, make additional room in exception vector area
From: Chen Gang @ 2013-04-27 9:33 UTC (permalink / raw)
To: Mike Qiu
Cc: sfr, Michael Neuling, matt, linux-kernel, Paul Mackerras,
Aneesh Kumar K.V, Chen Gang F T, linuxppc-dev
In-Reply-To: <517B9B2B.30007@linux.vnet.ibm.com>
On 2013年04月27日 17:32, Mike Qiu wrote:
>>>
>> I think the diff v2 is correct, but is not the best one for this issue.
>>
>> I prefer the Paul's patch for this issue which has better performance
>>
>> :-)
> Yes, I used your patch and it works; Paul's patch works too.
Good news.
Bye !
:-)
--
Chen Gang
Asianux Corporation
^ permalink raw reply
* Re: [PATCH] powerpc: Fix "attempt to move .org backwards" error
From: Mike Qiu @ 2013-04-27 9:34 UTC (permalink / raw)
To: Paul Mackerras; +Cc: linuxppc-dev, Chen Gang
In-Reply-To: <20130426035140.GA5796@drongo>
于 2013/4/26 11:51, Paul Mackerras 写道:
> Building a 64-bit powerpc kernel with PR KVM enabled currently gives
> this error:
>
> AS arch/powerpc/kernel/head_64.o
> arch/powerpc/kernel/exceptions-64s.S: Assembler messages:
> arch/powerpc/kernel/exceptions-64s.S:258: Error: attempt to move .org backwards
> make[2]: *** [arch/powerpc/kernel/head_64.o] Error 1
>
> This happens because the MASKABLE_EXCEPTION_PSERIES macro turns into
> 33 instructions, but we only have space for 32 at the decrementer
> interrupt vector (from 0x900 to 0x980).
>
> In the code generated by the MASKABLE_EXCEPTION_PSERIES macro, we
> currently have two instances of the HMT_MEDIUM macro, which has the
> effect of setting the SMT thread priority to medium. One is the
> first instruction, and is overwritten by a no-op on processors where
> we save the PPR (processor priority register), that is, POWER7 or
> later. The other is after we have saved the PPR.
>
> In order to reduce the code at 0x900 by one instruction, we omit the
> first HMT_MEDIUM. On processors without SMT this will have no effect
> since HMT_MEDIUM is a no-op there. On POWER5 and RS64 machines this
> will mean that the first few instructions take a little longer in the
> case where a decrementer interrupt occurs when the hardware thread is
> running at low SMT priority. On POWER6 and later machines, the
> hardware automatically boosts the thread priority when a decrementer
> interrupt is taken if the thread priority was below medium, so this
> change won't make any difference.
>
> The alternative would be to branch out of line after saving the CFAR.
> However, that would incur an extra overhead on all processors, whereas
> the approach adopted here only adds overhead on older threaded processors.
>
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> ---
> arch/powerpc/include/asm/exception-64s.h | 2 +-
> arch/powerpc/kernel/exceptions-64s.S | 7 ++++++-
> 2 files changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
> index 05e6d2e..8e5fae8 100644
> --- a/arch/powerpc/include/asm/exception-64s.h
> +++ b/arch/powerpc/include/asm/exception-64s.h
> @@ -414,7 +414,6 @@ label##_relon_hv: \
> #define SOFTEN_NOTEST_HV(vec) _SOFTEN_TEST(EXC_HV, vec)
>
> #define __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra) \
> - HMT_MEDIUM_PPR_DISCARD; \
> SET_SCRATCH0(r13); /* save r13 */ \
> EXCEPTION_PROLOG_0(PACA_EXGEN); \
> __EXCEPTION_PROLOG_1(PACA_EXGEN, extra, vec); \
> @@ -427,6 +426,7 @@ label##_relon_hv: \
> . = loc; \
> .globl label##_pSeries; \
> label##_pSeries: \
> + HMT_MEDIUM_PPR_DISCARD; \
> _MASKABLE_EXCEPTION_PSERIES(vec, label, \
> EXC_STD, SOFTEN_TEST_PR)
>
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index 56bd923..574db3f 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -235,6 +235,7 @@ instruction_access_slb_pSeries:
> .globl hardware_interrupt_hv;
> hardware_interrupt_pSeries:
> hardware_interrupt_hv:
> + HMT_MEDIUM_PPR_DISCARD
> BEGIN_FTR_SECTION
> _MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
> EXC_HV, SOFTEN_TEST_HV)
> @@ -254,7 +255,11 @@ hardware_interrupt_hv:
> STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
> KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x800)
>
> - MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer)
> + . = 0x900
> + .globl decrementer_pSeries
> +decrementer_pSeries:
> + _MASKABLE_EXCEPTION_PSERIES(0x900, decrementer, EXC_STD, SOFTEN_TEST_PR)
> +
> STD_EXCEPTION_HV(0x980, 0x982, hdecrementer)
>
> MASKABLE_EXCEPTION_PSERIES(0xa00, 0xa00, doorbell_super)
Tested-by: Mike Qiu <qiudayu@linux.vnet.ibm.com>
It works for me, but I only used this patch to compile and boot up
the machine; I did not do any performance testing :)
^ permalink raw reply
* Re: [PATCH 2/2 V7] powerpc/85xx: Add machine check handler to fix PCIe erratum on mpc85xx
From: Segher Boessenkool @ 2013-04-27 13:31 UTC (permalink / raw)
To: Jia Hongtao-B38951; +Cc: Wood Scott-B07421, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <412C8208B4A0464FA894C5F0C278CD5D01C4C76D@039-SN1MPN1-002.039d.mgd.msft.net>
> In former email you doubt whether we need A variant or not.
> Any particular reason for that?
> If not should I emulate all the A ARX AU AUX and AX variant?
A/AU/AX/AUX are just normal loads, sign-extended instead of
zero-extended (so assign -1 to the register loaded).
The ARX thing is load-locked, you do not want that one.
Segher
^ permalink raw reply
* RE: [PATCH 2/2 V7] powerpc/85xx: Add machine check handler to fix PCIe erratum on mpc85xx
From: Jia Hongtao-B38951 @ 2013-04-28 2:20 UTC (permalink / raw)
To: Segher Boessenkool; +Cc: Wood Scott-B07421, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <C8615FAE-3DFA-4086-88AE-2EA9EA700D15@kernel.crashing.org>
> -----Original Message-----
> From: Segher Boessenkool [mailto:segher@kernel.crashing.org]
> Sent: Saturday, April 27, 2013 9:32 PM
> To: Jia Hongtao-B38951
> Cc: Wood Scott-B07421; linuxppc-dev@lists.ozlabs.org;
> galak@kernel.crashing.org
> Subject: Re: [PATCH 2/2 V7] powerpc/85xx: Add machine check handler to
> fix PCIe erratum on mpc85xx
>=20
> > In former email you doubt whether we need A variant or not.
> > Any particular reason for that?
> > If not should I emulate all the A ARX AU AUX and AX variant?
>=20
> A/AU/AX/AUX are just normal loads, sign-extended instead of zero-extended
> (so assign -1 to the register loaded).
>=20
> The ARX thing is load-locked, you do not want that one.
>=20
>=20
> Segher
Thanks, very helpful.
-Hongtao
^ permalink raw reply
* [PATCH 1/2 V2] powerpc: Move opcode definitions from kvm/emulate.c to asm/ppc-opcode.h
From: Jia Hongtao @ 2013-04-28 5:20 UTC (permalink / raw)
To: linuxppc-dev, galak, B07421, segher; +Cc: hongtao.jia
Opcode and xopcode are useful definitions not just for KVM. Move these
definitions to asm/ppc-opcode.h for public use.
Signed-off-by: Jia Hongtao <hongtao.jia@freescale.com>
Signed-off-by: Li Yang <leoli@freescale.com>
---
V2:
* Add LHAUX definition.
arch/powerpc/include/asm/ppc-opcode.h | 46 +++++++++++++++++++++++++++++++++++
arch/powerpc/kvm/emulate.c | 44 +--------------------------------
2 files changed, 47 insertions(+), 43 deletions(-)
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 8752bc8..79057f7 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -81,6 +81,52 @@
#define __REGA0_R30 30
#define __REGA0_R31 31
+/* opcode and xopcode for instructions */
+#define OP_TRAP 3
+#define OP_TRAP_64 2
+
+#define OP_31_XOP_TRAP 4
+#define OP_31_XOP_LWZX 23
+#define OP_31_XOP_LWZUX 55
+#define OP_31_XOP_TRAP_64 68
+#define OP_31_XOP_DCBF 86
+#define OP_31_XOP_LBZX 87
+#define OP_31_XOP_STWX 151
+#define OP_31_XOP_STBX 215
+#define OP_31_XOP_LBZUX 119
+#define OP_31_XOP_STBUX 247
+#define OP_31_XOP_LHZX 279
+#define OP_31_XOP_LHZUX 311
+#define OP_31_XOP_MFSPR 339
+#define OP_31_XOP_LHAX 343
+#define OP_31_XOP_LHAUX 375
+#define OP_31_XOP_STHX 407
+#define OP_31_XOP_STHUX 439
+#define OP_31_XOP_MTSPR 467
+#define OP_31_XOP_DCBI 470
+#define OP_31_XOP_LWBRX 534
+#define OP_31_XOP_TLBSYNC 566
+#define OP_31_XOP_STWBRX 662
+#define OP_31_XOP_LHBRX 790
+#define OP_31_XOP_STHBRX 918
+
+#define OP_LWZ 32
+#define OP_LD 58
+#define OP_LWZU 33
+#define OP_LBZ 34
+#define OP_LBZU 35
+#define OP_STW 36
+#define OP_STWU 37
+#define OP_STD 62
+#define OP_STB 38
+#define OP_STBU 39
+#define OP_LHZ 40
+#define OP_LHZU 41
+#define OP_LHA 42
+#define OP_LHAU 43
+#define OP_STH 44
+#define OP_STHU 45
+
/* sorted alphabetically */
#define PPC_INST_DCBA 0x7c0005ec
#define PPC_INST_DCBA_MASK 0xfc0007fe
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 7a73b6f..426d3f5 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -30,52 +30,10 @@
#include <asm/byteorder.h>
#include <asm/kvm_ppc.h>
#include <asm/disassemble.h>
+#include <asm/ppc-opcode.h>
#include "timing.h"
#include "trace.h"
-#define OP_TRAP 3
-#define OP_TRAP_64 2
-
-#define OP_31_XOP_TRAP 4
-#define OP_31_XOP_LWZX 23
-#define OP_31_XOP_TRAP_64 68
-#define OP_31_XOP_DCBF 86
-#define OP_31_XOP_LBZX 87
-#define OP_31_XOP_STWX 151
-#define OP_31_XOP_STBX 215
-#define OP_31_XOP_LBZUX 119
-#define OP_31_XOP_STBUX 247
-#define OP_31_XOP_LHZX 279
-#define OP_31_XOP_LHZUX 311
-#define OP_31_XOP_MFSPR 339
-#define OP_31_XOP_LHAX 343
-#define OP_31_XOP_STHX 407
-#define OP_31_XOP_STHUX 439
-#define OP_31_XOP_MTSPR 467
-#define OP_31_XOP_DCBI 470
-#define OP_31_XOP_LWBRX 534
-#define OP_31_XOP_TLBSYNC 566
-#define OP_31_XOP_STWBRX 662
-#define OP_31_XOP_LHBRX 790
-#define OP_31_XOP_STHBRX 918
-
-#define OP_LWZ 32
-#define OP_LD 58
-#define OP_LWZU 33
-#define OP_LBZ 34
-#define OP_LBZU 35
-#define OP_STW 36
-#define OP_STWU 37
-#define OP_STD 62
-#define OP_STB 38
-#define OP_STBU 39
-#define OP_LHZ 40
-#define OP_LHZU 41
-#define OP_LHA 42
-#define OP_LHAU 43
-#define OP_STH 44
-#define OP_STHU 45
-
void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
{
unsigned long dec_nsec;
--
1.8.0
^ permalink raw reply related
* [PATCH 2/2 V8] powerpc/85xx: Add machine check handler to fix PCIe erratum on mpc85xx
From: Jia Hongtao @ 2013-04-28 5:20 UTC (permalink / raw)
To: linuxppc-dev, galak, B07421, segher; +Cc: hongtao.jia
In-Reply-To: <1367126408-12997-1-git-send-email-hongtao.jia@freescale.com>
A PCIe erratum on mpc85xx may cause a core hang when a PCIe link
goes down. When the link goes down, non-posted transactions issued
via the ATMU requiring completion result in an instruction stall.
At the same time a machine-check exception is generated to the core
to allow further processing by the handler. We implement the handler,
which skips the instruction that caused the stall.
This patch depends on patch:
powerpc/85xx: Add platform_device declaration to fsl_pci.h
Signed-off-by: Zhao Chenhui <b35336@freescale.com>
Signed-off-by: Li Yang <leoli@freescale.com>
Signed-off-by: Liu Shuo <soniccat.liu@gmail.com>
Signed-off-by: Jia Hongtao <hongtao.jia@freescale.com>
---
V8:
* Add A variant load instruction emulation.
V7:
* Correct PCIe checking method (Using indirect_type member of pci_controller
structure).
V6:
* Move OP and XOP defines to a new header file: asm/ppc-disassemble.h
* Add X UX BRX variant of load instruction emulation
* Remove A variant of load instruction emulation
V5:
* Fill rd with all-Fs if the skipped instruction is load and emulate the
instruction.
* Let KVM/QEMU deal with the exception if the machine check comes from KVM.
arch/powerpc/kernel/cpu_setup_fsl_booke.S | 2 +-
arch/powerpc/kernel/traps.c | 3 +
arch/powerpc/sysdev/fsl_pci.c | 158 ++++++++++++++++++++++++++++++
arch/powerpc/sysdev/fsl_pci.h | 6 ++
4 files changed, 168 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
index 0b9af01..bfb18c7 100644
--- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S
+++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
@@ -75,7 +75,7 @@ _GLOBAL(__setup_cpu_e500v2)
bl __e500_icache_setup
bl __e500_dcache_setup
bl __setup_e500_ivors
-#ifdef CONFIG_FSL_RIO
+#if defined(CONFIG_FSL_RIO) || defined(CONFIG_FSL_PCI)
/* Ensure that RFXE is set */
mfspr r3,SPRN_HID1
oris r3,r3,HID1_RFXE@h
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 37cc40e..d15cfb5 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -60,6 +60,7 @@
#include <asm/switch_to.h>
#include <asm/tm.h>
#include <asm/debug.h>
+#include <sysdev/fsl_pci.h>
#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
int (*__debugger)(struct pt_regs *regs) __read_mostly;
@@ -565,6 +566,8 @@ int machine_check_e500(struct pt_regs *regs)
if (reason & MCSR_BUS_RBERR) {
if (fsl_rio_mcheck_exception(regs))
return 1;
+ if (fsl_pci_mcheck_exception(regs))
+ return 1;
}
printk("Machine check in kernel mode.\n");
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index 40ffe29..5fa851a 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -26,11 +26,15 @@
#include <linux/memblock.h>
#include <linux/log2.h>
#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
#include <asm/machdep.h>
+#include <asm/disassemble.h>
+#include <asm/ppc-opcode.h>
#include <sysdev/fsl_soc.h>
#include <sysdev/fsl_pci.h>
@@ -876,6 +880,160 @@ u64 fsl_pci_immrbar_base(struct pci_controller *hose)
return 0;
}
+#ifdef CONFIG_E500
+static int mcheck_handle_load(struct pt_regs *regs, u32 inst)
+{
+ unsigned int rd, ra, rb, d;
+
+ rd = get_rt(inst);
+ ra = get_ra(inst);
+ rb = get_rb(inst);
+ d = get_d(inst);
+
+ switch (get_op(inst)) {
+ case 31:
+ switch (get_xop(inst)) {
+ case OP_31_XOP_LWZX:
+ case OP_31_XOP_LWBRX:
+ regs->gpr[rd] = 0xffffffff;
+ break;
+
+ case OP_31_XOP_LWZUX:
+ regs->gpr[rd] = 0xffffffff;
+ regs->gpr[ra] += regs->gpr[rb];
+ break;
+
+ case OP_31_XOP_LBZX:
+ regs->gpr[rd] = 0xff;
+ break;
+
+ case OP_31_XOP_LBZUX:
+ regs->gpr[rd] = 0xff;
+ regs->gpr[ra] += regs->gpr[rb];
+ break;
+
+ case OP_31_XOP_LHZX:
+ case OP_31_XOP_LHBRX:
+ regs->gpr[rd] = 0xffff;
+ break;
+
+ case OP_31_XOP_LHZUX:
+ regs->gpr[rd] = 0xffff;
+ regs->gpr[ra] += regs->gpr[rb];
+ break;
+
+ case OP_31_XOP_LHAX:
+ regs->gpr[rd] = ~0UL;
+ break;
+
+ case OP_31_XOP_LHAUX:
+ regs->gpr[rd] = ~0UL;
+ regs->gpr[ra] += regs->gpr[rb];
+ break;
+
+ default:
+ return 0;
+ }
+ break;
+
+ case OP_LWZ:
+ regs->gpr[rd] = 0xffffffff;
+ break;
+
+ case OP_LWZU:
+ regs->gpr[rd] = 0xffffffff;
+ regs->gpr[ra] += (s16)d;
+ break;
+
+ case OP_LBZ:
+ regs->gpr[rd] = 0xff;
+ break;
+
+ case OP_LBZU:
+ regs->gpr[rd] = 0xff;
+ regs->gpr[ra] += (s16)d;
+ break;
+
+ case OP_LHZ:
+ regs->gpr[rd] = 0xffff;
+ break;
+
+ case OP_LHZU:
+ regs->gpr[rd] = 0xffff;
+ regs->gpr[ra] += (s16)d;
+ break;
+
+ case OP_LHA:
+ regs->gpr[rd] = ~0UL;
+ break;
+
+ case OP_LHAU:
+ regs->gpr[rd] = ~0UL;
+ regs->gpr[ra] += (s16)d;
+ break;
+
+ default:
+ return 0;
+ }
+
+ return 1;
+}
+
+static int is_in_pci_mem_space(phys_addr_t addr)
+{
+ struct pci_controller *hose;
+ struct resource *res;
+ int i;
+
+ list_for_each_entry(hose, &hose_list, list_node) {
+ if (!(hose->indirect_type & PPC_INDIRECT_TYPE_EXT_REG))
+ continue;
+
+ for (i = 0; i < 3; i++) {
+ res = &hose->mem_resources[i];
+ if ((res->flags & IORESOURCE_MEM) &&
+ addr >= res->start && addr <= res->end)
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int fsl_pci_mcheck_exception(struct pt_regs *regs)
+{
+ u32 inst;
+ int ret;
+ phys_addr_t addr = 0;
+
+ /* Let KVM/QEMU deal with the exception */
+ if (regs->msr & MSR_GS)
+ return 0;
+
+#ifdef CONFIG_PHYS_64BIT
+ addr = mfspr(SPRN_MCARU);
+ addr <<= 32;
+#endif
+ addr += mfspr(SPRN_MCAR);
+
+ if (is_in_pci_mem_space(addr)) {
+ if (user_mode(regs)) {
+ pagefault_disable();
+ ret = get_user(regs->nip, &inst);
+ pagefault_enable();
+ } else {
+ ret = probe_kernel_address(regs->nip, inst);
+ }
+
+ if (mcheck_handle_load(regs, inst)) {
+ regs->nip += 4;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+#endif
+
#if defined(CONFIG_FSL_SOC_BOOKE) || defined(CONFIG_PPC_86xx)
static const struct of_device_id pci_ids[] = {
{ .compatible = "fsl,mpc8540-pci", },
diff --git a/arch/powerpc/sysdev/fsl_pci.h b/arch/powerpc/sysdev/fsl_pci.h
index 72b5625..defc422 100644
--- a/arch/powerpc/sysdev/fsl_pci.h
+++ b/arch/powerpc/sysdev/fsl_pci.h
@@ -126,5 +126,11 @@ static inline int mpc85xx_pci_err_probe(struct platform_device *op)
}
#endif
+#ifdef CONFIG_FSL_PCI
+extern int fsl_pci_mcheck_exception(struct pt_regs *);
+#else
+static inline int fsl_pci_mcheck_exception(struct pt_regs *regs) {return 0; }
+#endif
+
#endif /* __POWERPC_FSL_PCI_H */
#endif /* __KERNEL__ */
--
1.8.0
^ permalink raw reply related
* Re: [PATCH v2 12/15] powerpc/85xx: add time base sync support for e6500
From: Zhao Chenhui @ 2013-04-28 9:56 UTC (permalink / raw)
To: Scott Wood; +Cc: linuxppc-dev, linux-kernel, r58472
In-Reply-To: <1366934844.30341.16@snotra>
On Thu, Apr 25, 2013 at 07:07:24PM -0500, Scott Wood wrote:
> On 04/24/2013 07:28:18 PM, Zhao Chenhui wrote:
> >On Wed, Apr 24, 2013 at 05:38:16PM -0500, Scott Wood wrote:
> >> On 04/24/2013 06:29:29 AM, Zhao Chenhui wrote:
> >> >On Tue, Apr 23, 2013 at 07:04:06PM -0500, Scott Wood wrote:
> >> >> On 04/19/2013 05:47:45 AM, Zhao Chenhui wrote:
> >> >> >From: Chen-Hui Zhao <chenhui.zhao@freescale.com>
> >> >> >
> >> >> >For e6500, two threads in one core share one time base. Just
> >need
> >> >> >to do time base sync on first thread of one core, and skip it on
> >> >> >the other thread.
> >> >> >
> >> >> >Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com>
> >> >> >Signed-off-by: Li Yang <leoli@freescale.com>
> >> >> >Signed-off-by: Andy Fleming <afleming@freescale.com>
> >> >> >---
> >> >> > arch/powerpc/platforms/85xx/smp.c | 52
> >> >> >+++++++++++++++++++++++++++++++-----
> >> >> > 1 files changed, 44 insertions(+), 8 deletions(-)
> >> >> >
> >> >> >diff --git a/arch/powerpc/platforms/85xx/smp.c
> >> >> >b/arch/powerpc/platforms/85xx/smp.c
> >> >> >index 74d8cde..5f3eee3 100644
> >> >> >--- a/arch/powerpc/platforms/85xx/smp.c
> >> >> >+++ b/arch/powerpc/platforms/85xx/smp.c
> >> >> >@@ -53,26 +55,40 @@ static inline u32 get_phy_cpu_mask(void)
> >> >> > u32 mask;
> >> >> > int cpu;
> >> >> >
> >> >> >- mask = 1 << cur_booting_core;
> >> >> >- for_each_online_cpu(cpu)
> >> >> >- mask |= 1 << get_hard_smp_processor_id(cpu);
> >> >> >+ if (smt_capable()) {
> >> >> >+ /* two threads in one core share one time base */
> >> >> >+ mask = 1 << cpu_core_index_of_thread(cur_booting_core);
> >> >> >+ for_each_online_cpu(cpu)
> >> >> >+ mask |= 1 << cpu_core_index_of_thread(
> >> >> >+ get_hard_smp_processor_id(cpu));
> >> >> >+ } else {
> >> >> >+ mask = 1 << cur_booting_core;
> >> >> >+ for_each_online_cpu(cpu)
> >> >> >+ mask |= 1 << get_hard_smp_processor_id(cpu);
> >> >> >+ }
> >> >>
> >> >> Where is smt_capable defined()? I assume somewhere in the
> >patchset
> >> >> but it's a pain to search 12 patches...
> >> >>
> >> >
> >> >It is defined in arch/powerpc/include/asm/topology.h.
> >> > #define smt_capable() (cpu_has_feature(CPU_FTR_SMT))
> >> >
> >> >Thanks for your review again.
> >>
> >> We shouldn't base it on CPU_FTR_SMT. For example, e6500 doesn't
> >> claim that feature yet, except in our SDK kernel. That doesn't
> >> change the topology of CPU numbering.
> >>
> >
> >Then, where can I get the thread information? dts?
> >Or, wait for upstream of the thread suppport of e6500.
>
> It's an inherent property of e6500 (outside of some virtualization
> scenarios, but you wouldn't run this code under a hypervisor) that
> you have two threads per core (whether Linux uses them or not). Or
> you could read TMCFG0[NTHRD] if you know you're on a chip that has
> TMRs but aren't positive it's an e6500, but I wouldn't bother. If
> we do ever have such a chip, there are probably other things that
> will need updating.
>
But how can we know that a chip has TMRs, other than by checking CPU_FTR_SMT?
> >> >static inline u32 get_phy_cpu_mask(void)
> >> >{
> >> > u32 mask;
> >> > int cpu;
> >> >
> >> > mask = 1 << cpu_core_index_of_thread(cur_booting_core);
> >> > for_each_online_cpu(cpu)
> >> > mask |= 1 << cpu_core_index_of_thread(
> >> > get_hard_smp_processor_id(cpu));
> >> >
> >> > return mask;
> >> >}
> >>
> >> Likewise, this will get it wrong if SMT is disabled or not yet
> >> implemented on a core.
> >>
> >> -Scott
> >
> >Let's look into cpu_core_index_of_thread() in
> >arch/powerpc/kernel/smp.c.
> >
> > int cpu_core_index_of_thread(int cpu)
> > {
> > return cpu >> threads_shift;
> > }
> >
> >If no thread, the threads_shift is equal to 0. It can work with no
> >thread.
>
> My point is that if threads are disabled, threads_shift will be 0,
> but e6500 cores will still be numbered 0, 2, 4, etc.
>
> >Perhaps, I should submit this patch after the thread patches for
> >e6500.
>
> Why?
>
> -Scott
Even if threads are disabled, the threads_shift derived from the dts is correct.
But without the thread-related patches that exist in the SDK, threads_shift
gets a wrong value on T4.
-Chenhui
^ permalink raw reply
* Re: [PATCH v2 02/15] powerpc/85xx: add sleep and deep sleep support
From: Zhao Chenhui @ 2013-04-28 10:20 UTC (permalink / raw)
To: Scott Wood; +Cc: linuxppc-dev, linux-kernel, r58472
In-Reply-To: <1366761200.5825.18@snotra>
On Tue, Apr 23, 2013 at 06:53:20PM -0500, Scott Wood wrote:
> On 04/19/2013 05:47:35 AM, Zhao Chenhui wrote:
> > static int pmc_suspend_enter(suspend_state_t state)
> > {
> >- int ret;
> >+ int ret = 0;
> >+
> >+ switch (state) {
> >+#ifdef CONFIG_PPC_85xx
> >+ case PM_SUSPEND_MEM:
> >+#ifdef CONFIG_SPE
> >+ enable_kernel_spe();
> >+#endif
> >+ enable_kernel_fp();
>
> Why does enable_kernel_spe() need an ifdef but enable_kernel_fp()
> doesn't?
>
I will enclose it in an #ifdef CONFIG_PPC_FPU block.
> >+ case PM_SUSPEND_STANDBY:
> >+#ifdef CONFIG_FSL_SOC_BOOKE
> >+ flush_dcache_L1();
> >+#endif
> >+ setbits32(&pmc_regs->powmgtcsr, POWMGTCSR_SLP);
>
> Only L1, even on e500mc?
>
> -Scott
This patch is only for chips with a PMC unit; none of them has an e500mc core.
-Chenhui
^ permalink raw reply
* [PATCH -V7 05/18] powerpc: Save DAR and DSISR in pt_regs on MCE
From: Aneesh Kumar K.V @ 2013-04-28 19:37 UTC (permalink / raw)
To: benh, paulus, dwg, linux-mm; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1367177859-7893-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
We were not saving DAR and DSISR on MCE. Save them and also print the values
along with the exception details in xmon.
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/kernel/exceptions-64s.S | 9 +++++++++
arch/powerpc/xmon/xmon.c | 2 +-
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 56bd923..7da3f94 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -688,9 +688,18 @@ slb_miss_user_pseries:
.align 7
.globl machine_check_common
machine_check_common:
+
+ mfspr r10,SPRN_DAR
+ std r10,PACA_EXGEN+EX_DAR(r13)
+ mfspr r10,SPRN_DSISR
+ stw r10,PACA_EXGEN+EX_DSISR(r13)
EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC)
FINISH_NAP
DISABLE_INTS
+ ld r3,PACA_EXGEN+EX_DAR(r13)
+ lwz r4,PACA_EXGEN+EX_DSISR(r13)
+ std r3,_DAR(r1)
+ std r4,_DSISR(r1)
bl .save_nvgprs
addi r3,r1,STACK_FRAME_OVERHEAD
bl .machine_check_exception
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 13f85de..51e237c 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1430,7 +1430,7 @@ static void excprint(struct pt_regs *fp)
printf(" sp: %lx\n", fp->gpr[1]);
printf(" msr: %lx\n", fp->msr);
- if (trap == 0x300 || trap == 0x380 || trap == 0x600) {
+ if (trap == 0x300 || trap == 0x380 || trap == 0x600 || trap == 0x200) {
printf(" dar: %lx\n", fp->dar);
if (trap != 0x380)
printf(" dsisr: %lx\n", fp->dsisr);
--
1.8.1.2
^ permalink raw reply related
* [PATCH -V7 13/18] powerpc: Use encode avpn where we need only avpn values
From: Aneesh Kumar K.V @ 2013-04-28 19:37 UTC (permalink / raw)
To: benh, paulus, dwg, linux-mm; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1367177859-7893-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
In all these cases we are doing something similar to
HPTE_V_COMPARE(hpte_v, want_v), which ignores the HPTE_V_LARGE bit.
With MPSS support we would need the actual page size to set the HPTE_V_LARGE
bit and that won't be available in most of these cases. Since we are ignoring
HPTE_V_LARGE bit, use the avpn value instead. There should not be any change
in behaviour after this patch.
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/mm/hash_native_64.c | 8 ++++----
arch/powerpc/platforms/cell/beat_htab.c | 10 +++++-----
arch/powerpc/platforms/ps3/htab.c | 2 +-
3 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index ffc1e00..9d8983a 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -252,7 +252,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
unsigned long hpte_v, want_v;
int ret = 0;
- want_v = hpte_encode_v(vpn, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
vpn, want_v & HPTE_V_AVPN, slot, newpp);
@@ -288,7 +288,7 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
unsigned long want_v, hpte_v;
hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
- want_v = hpte_encode_v(vpn, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
/* Bolted mappings are only ever in the primary group */
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
@@ -348,7 +348,7 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
- want_v = hpte_encode_v(vpn, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
native_lock_hpte(hptep);
hpte_v = hptep->v;
@@ -520,7 +520,7 @@ static void native_flush_hash_range(unsigned long number, int local)
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
hptep = htab_address + slot;
- want_v = hpte_encode_v(vpn, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
native_lock_hpte(hptep);
hpte_v = hptep->v;
if (!HPTE_V_COMPARE(hpte_v, want_v) ||
diff --git a/arch/powerpc/platforms/cell/beat_htab.c b/arch/powerpc/platforms/cell/beat_htab.c
index 0f6f839..472f9a7 100644
--- a/arch/powerpc/platforms/cell/beat_htab.c
+++ b/arch/powerpc/platforms/cell/beat_htab.c
@@ -191,7 +191,7 @@ static long beat_lpar_hpte_updatepp(unsigned long slot,
u64 dummy0, dummy1;
unsigned long want_v;
- want_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M);
+ want_v = hpte_encode_avpn(vpn, psize, MMU_SEGSIZE_256M);
DBG_LOW(" update: "
"avpnv=%016lx, slot=%016lx, psize: %d, newpp %016lx ... ",
@@ -228,7 +228,7 @@ static long beat_lpar_hpte_find(unsigned long vpn, int psize)
unsigned long want_v, hpte_v;
hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, MMU_SEGSIZE_256M);
- want_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M);
+ want_v = hpte_encode_avpn(vpn, psize, MMU_SEGSIZE_256M);
for (j = 0; j < 2; j++) {
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
@@ -283,7 +283,7 @@ static void beat_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
DBG_LOW(" inval : slot=%lx, va=%016lx, psize: %d, local: %d\n",
slot, va, psize, local);
- want_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M);
+ want_v = hpte_encode_avpn(vpn, psize, MMU_SEGSIZE_256M);
raw_spin_lock_irqsave(&beat_htab_lock, flags);
dummy1 = beat_lpar_hpte_getword0(slot);
@@ -372,7 +372,7 @@ static long beat_lpar_hpte_updatepp_v3(unsigned long slot,
unsigned long want_v;
unsigned long pss;
- want_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M);
+ want_v = hpte_encode_avpn(vpn, psize, MMU_SEGSIZE_256M);
pss = (psize == MMU_PAGE_4K) ? -1UL : mmu_psize_defs[psize].penc;
DBG_LOW(" update: "
@@ -402,7 +402,7 @@ static void beat_lpar_hpte_invalidate_v3(unsigned long slot, unsigned long vpn,
DBG_LOW(" inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n",
slot, vpn, psize, local);
- want_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M);
+ want_v = hpte_encode_avpn(vpn, psize, MMU_SEGSIZE_256M);
pss = (psize == MMU_PAGE_4K) ? -1UL : mmu_psize_defs[psize].penc;
lpar_rc = beat_invalidate_htab_entry3(0, slot, want_v, pss);
diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c
index 6cc5820..cd8f2fb 100644
--- a/arch/powerpc/platforms/ps3/htab.c
+++ b/arch/powerpc/platforms/ps3/htab.c
@@ -117,7 +117,7 @@ static long ps3_hpte_updatepp(unsigned long slot, unsigned long newpp,
unsigned long flags;
long ret;
- want_v = hpte_encode_v(vpn, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
spin_lock_irqsave(&ps3_htab_lock, flags);
--
1.8.1.2
^ permalink raw reply related
* [PATCH -V7 10/18] powerpc: Reduce the PTE_INDEX_SIZE
From: Aneesh Kumar K.V @ 2013-04-28 19:37 UTC (permalink / raw)
To: benh, paulus, dwg, linux-mm; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1367177859-7893-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
This makes one PMD cover a 16MB range. That helps in easier implementation of THP
on power. THP core code makes use of one pmd entry to track the hugepage, and
the range mapped by a single pmd entry should be equal to the hugepage size
supported by the hardware.
This also switches PGD to cover 16GB. That is needed so that we can simplify the
hugetlb page walking code so that we have the same pte format for explicit hugepage
and THP hugepage.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/pgtable-ppc64-64k.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/include/asm/pgtable-ppc64-64k.h b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
index be4e287..45142d6 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64-64k.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
@@ -4,10 +4,10 @@
#include <asm-generic/pgtable-nopud.h>
-#define PTE_INDEX_SIZE 12
-#define PMD_INDEX_SIZE 12
+#define PTE_INDEX_SIZE 8
+#define PMD_INDEX_SIZE 10
#define PUD_INDEX_SIZE 0
-#define PGD_INDEX_SIZE 6
+#define PGD_INDEX_SIZE 12
#ifndef __ASSEMBLY__
#define PTE_TABLE_SIZE (sizeof(real_pte_t) << PTE_INDEX_SIZE)
--
1.8.1.2
^ permalink raw reply related
* [PATCH -V7 00/18] THP support for PPC64 (Patchset 1)
From: Aneesh Kumar K.V @ 2013-04-28 19:37 UTC (permalink / raw)
To: benh, paulus, dwg, linux-mm; +Cc: linuxppc-dev
Hi,
This patchset includes changes needed for mm/ and powerpc/mm/ to support THP.
I have split the patch series into two patchsets, so that we can look at getting
prerequisite patches upstream in 3.10.
Some numbers:
The latency measurements code from Anton found at
http://ozlabs.org/~anton/junkcode/latency2001.c
64K page size (With THP support)
--------------------------
[root@llmp24l02 test]# ./latency2001 8G
8589934592 428.49 cycles 120.50 ns
[root@llmp24l02 test]# ./latency2001 -l 8G
8589934592 471.16 cycles 132.50 ns
[root@llmp24l02 test]# echo never > /sys/kernel/mm/transparent_hugepage/enabled
[root@llmp24l02 test]# ./latency2001 8G
8589934592 766.52 cycles 215.56 ns
[root@llmp24l02 test]#
4K page size (No THP support for 4K)
----------------------------
[root@llmp24l02 test]# ./latency2001 8G
8589934592 814.88 cycles 229.16 ns
[root@llmp24l02 test]# ./latency2001 -l 8G
8589934592 463.69 cycles 130.40 ns
[root@llmp24l02 test]#
We are close to hugetlbfs in latency and we can achieve this with zero
config/page reservation. Most of the allocations above are fault allocated.
Another test that does 50000000 random accesses over a 1GB area goes from
2.65 seconds to 1.07 seconds with this patchset.
split_huge_page impact:
---------------------
To look at the performance impact of large page invalidate, I tried the below
experiment. The test involved accessing a large contiguous region of memory
as below
for (i = 0; i < size; i += PAGE_SIZE)
data[i] = i;
We wanted to access the data in sequential order so that we look at the
worst case THP performance. Accessing the data in sequential order implies
we have the page table cached and the overhead of a TLB miss is as minimal as
possible. We also don't touch the entire page, because that can result in
cache eviction.
After we touched the full range as above, we now call mprotect on each
of those pages. An mprotect will result in a hugepage split. This should
allow us to measure the impact of hugepage split.
for (i = 0; i < size; i += PAGE_SIZE)
mprotect(&data[i], PAGE_SIZE, PROT_READ);
Split hugepage impact:
---------------------
THP enabled: 2.851561705 seconds for test completion
THP disable: 3.599146098 seconds for test completion
We are 20.7% better than non THP case even when we have all the large pages split.
Detailed output:
THP enabled:
---------------------------------------
[root@llmp24l02 ~]# cat /proc/vmstat | grep thp
thp_fault_alloc 0
thp_fault_fallback 0
thp_collapse_alloc 0
thp_collapse_alloc_failed 0
thp_split 0
thp_zero_page_alloc 0
thp_zero_page_alloc_failed 0
[root@llmp24l02 ~]# /root/thp/tools/perf/perf stat -e page-faults,dTLB-load-misses ./split-huge-page-mpro 20G
time taken to touch all the data in ns: 2763096913
Performance counter stats for './split-huge-page-mpro 20G':
1,581 page-faults
3,159 dTLB-load-misses
2.851561705 seconds time elapsed
[root@llmp24l02 ~]#
[root@llmp24l02 ~]# cat /proc/vmstat | grep thp
thp_fault_alloc 1279
thp_fault_fallback 0
thp_collapse_alloc 0
thp_collapse_alloc_failed 0
thp_split 1279
thp_zero_page_alloc 0
thp_zero_page_alloc_failed 0
[root@llmp24l02 ~]#
77.05% split-huge-page [kernel.kallsyms] [k] .clear_user_page
7.10% split-huge-page [kernel.kallsyms] [k] .perf_event_mmap_ctx
1.51% split-huge-page split-huge-page-mpro [.] 0x0000000000000a70
0.96% split-huge-page [unknown] [H] 0x000000000157e3bc
0.81% split-huge-page [kernel.kallsyms] [k] .up_write
0.76% split-huge-page [kernel.kallsyms] [k] .perf_event_mmap
0.76% split-huge-page [kernel.kallsyms] [k] .down_write
0.74% split-huge-page [kernel.kallsyms] [k] .lru_add_page_tail
0.61% split-huge-page [kernel.kallsyms] [k] .split_huge_page
0.59% split-huge-page [kernel.kallsyms] [k] .change_protection
0.51% split-huge-page [kernel.kallsyms] [k] .release_pages
0.96% split-huge-page [unknown] [H] 0x000000000157e3bc
|
|--79.44%-- reloc_start
| |
| |--86.54%-- .__pSeries_lpar_hugepage_invalidate
| | .pSeries_lpar_hugepage_invalidate
| | .hpte_need_hugepage_flush
| | .split_huge_page
| | .__split_huge_page_pmd
| | .vma_adjust
| | .vma_merge
| | .mprotect_fixup
| | .SyS_mprotect
THP disabled:
---------------
[root@llmp24l02 ~]# echo never > /sys/kernel/mm/transparent_hugepage/enabled
[root@llmp24l02 ~]# /root/thp/tools/perf/perf stat -e page-faults,dTLB-load-misses ./split-huge-page-mpro 20G
time taken to touch all the data in ns: 3513767220
Performance counter stats for './split-huge-page-mpro 20G':
3,27,726 page-faults
3,29,654 dTLB-load-misses
3.599146098 seconds time elapsed
[root@llmp24l02 ~]#
Changes from V6:
* split the patch series into two patchsets.
* Address review feedback.
Changes from V5:
* Address review comments
* Added new patch to not use hugepd for explicit hugepages. Explicit hugepages
now use PTE format similar to transparent hugepages.
* We don't use page->_mapcount for tracking free PTE frags in a PTE page.
* rebased to a86d52667d8eda5de39393ce737794403bdce1eb
* Tested with libhugetlbfs test suite
Changes from V4:
* Fix bad page error in page_table_alloc
BUG: Bad page state in process stream pfn:f1a59
page:f0000000034dc378 count:1 mapcount:0 mapping: (null) index:0x0
[c000000f322c77d0] [c00000000015e198] .bad_page+0xe8/0x140
[c000000f322c7860] [c00000000015e3c4] .free_pages_prepare+0x1d4/0x1e0
[c000000f322c7910] [c000000000160450] .free_hot_cold_page+0x50/0x230
[c000000f322c79c0] [c00000000003ad18] .page_table_alloc+0x168/0x1c0
Changes from V3:
* PowerNV boot fixes
Change from V2:
* Change patch "powerpc: Reduce PTE table memory wastage" to use much simpler approach
for PTE page sharing.
* Changes to handle huge pages in KVM code.
* Address other review comments
Changes from V1
* Address review comments
* More patch split
* Add batch hpte invalidate for hugepages.
Changes from RFC V2:
* Address review comments
* More code cleanup and patch split
Changes from RFC V1:
* HugeTLB fs now works
* Compile issues fixed
* rebased to v3.8
* Patch series reordered so that ppc64 cleanups and MM THP changes are moved
early in the series. This should help in picking those patches early.
Thanks,
-aneesh
^ permalink raw reply
* [PATCH -V7 12/18] powerpc: Reduce PTE table memory wastage
From: Aneesh Kumar K.V @ 2013-04-28 19:37 UTC (permalink / raw)
To: benh, paulus, dwg, linux-mm; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1367177859-7893-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
We allocate one page for the last level of linux page table. With THP and
large page size of 16MB, that would mean we are wasting large part
of that page. To map 16MB area, we only need a PTE space of 2K with 64K
page size. This patch reduces the space wastage by sharing the page
allocated for the last level of linux page table with multiple pmd
entries. We call these smaller chunks PTE page fragments and allocated
page, PTE page.
In order to support systems which don't have 64K HPTE support, we also
add another 2K to PTE page fragment. The second half of the PTE fragments
is used for storing slot and secondary bit information of an HPTE. With this
we now have a 4K PTE fragment.
We use a simple approach to share the PTE page. On allocation, we bump the
PTE page refcount to 16 and share the PTE page with the next 16 pte alloc
request. This should help in the node locality of the PTE page fragment,
assuming that the immediate pte alloc request will mostly come from the
same NUMA node. We don't try to reuse the freed PTE page fragment. Hence
we could be wasting some space.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/mmu-book3e.h | 4 ++
arch/powerpc/include/asm/mmu-hash64.h | 4 ++
arch/powerpc/include/asm/page.h | 4 ++
arch/powerpc/include/asm/pgalloc-64.h | 82 +++++++----------------
arch/powerpc/kernel/setup_64.c | 4 +-
arch/powerpc/mm/mmu_context_hash64.c | 37 +++++++++++
arch/powerpc/mm/pgtable_64.c | 118 ++++++++++++++++++++++++++++++++++
7 files changed, 195 insertions(+), 58 deletions(-)
diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index 99d43e0..8bd560c 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -231,6 +231,10 @@ typedef struct {
u64 high_slices_psize; /* 4 bits per slice for now */
u16 user_psize; /* page size index */
#endif
+#ifdef CONFIG_PPC_64K_PAGES
+ /* for 4K PTE fragment support */
+ void *pte_frag;
+#endif
} mm_context_t;
/* Page size definitions, common between 32 and 64-bit
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 05895cf..de9e577 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -516,6 +516,10 @@ typedef struct {
unsigned long acop; /* mask of enabled coprocessor types */
unsigned int cop_pid; /* pid value used with coprocessors */
#endif /* CONFIG_PPC_ICSWX */
+#ifdef CONFIG_PPC_64K_PAGES
+ /* for 4K PTE fragment support */
+ void *pte_frag;
+#endif
} mm_context_t;
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 711e83a..988c812 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -393,7 +393,11 @@ void arch_free_page(struct page *page, int order);
struct vm_area_struct;
+#ifdef CONFIG_PPC_64K_PAGES
+typedef pte_t *pgtable_t;
+#else
typedef struct page *pgtable_t;
+#endif
#include <asm-generic/memory_model.h>
#endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
index d390123..91acb12 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -152,6 +152,23 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
}
#else /* if CONFIG_PPC_64K_PAGES */
+/*
+ * we support 16 fragments per PTE page.
+ */
+#define PTE_FRAG_NR 16
+/*
+ * We use a 2K PTE page fragment and another 2K for storing
+ * real_pte_t hash index
+ */
+#define PTE_FRAG_SIZE_SHIFT 12
+#define PTE_FRAG_SIZE (2 * PTRS_PER_PTE * sizeof(pte_t))
+
+extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int);
+extern void page_table_free(struct mm_struct *, unsigned long *, int);
+extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
+#ifdef CONFIG_SMP
+extern void __tlb_remove_table(void *_table);
+#endif
#define pud_populate(mm, pud, pmd) pud_set(pud, (unsigned long)pmd)
@@ -164,90 +181,42 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
pgtable_t pte_page)
{
- pmd_populate_kernel(mm, pmd, page_address(pte_page));
+ pmd_set(pmd, (unsigned long)pte_page);
}
static inline pgtable_t pmd_pgtable(pmd_t pmd)
{
- return pmd_page(pmd);
+ return (pgtable_t)(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE);
}
static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
unsigned long address)
{
- return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);
+ return (pte_t *)page_table_alloc(mm, address, 1);
}
static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
- unsigned long address)
+ unsigned long address)
{
- struct page *page;
- pte_t *pte;
-
- pte = pte_alloc_one_kernel(mm, address);
- if (!pte)
- return NULL;
- page = virt_to_page(pte);
- pgtable_page_ctor(page);
- return page;
+ return (pgtable_t)page_table_alloc(mm, address, 0);
}
static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
- free_page((unsigned long)pte);
+ page_table_free(mm, (unsigned long *)pte, 1);
}
static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
{
- pgtable_page_dtor(ptepage);
- __free_page(ptepage);
+ page_table_free(mm, (unsigned long *)ptepage, 0);
}
-static inline void pgtable_free(void *table, unsigned index_size)
-{
- if (!index_size)
- free_page((unsigned long)table);
- else {
- BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
- kmem_cache_free(PGT_CACHE(index_size), table);
- }
-}
-
-#ifdef CONFIG_SMP
-static inline void pgtable_free_tlb(struct mmu_gather *tlb,
- void *table, int shift)
-{
- unsigned long pgf = (unsigned long)table;
- BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
- pgf |= shift;
- tlb_remove_table(tlb, (void *)pgf);
-}
-
-static inline void __tlb_remove_table(void *_table)
-{
- void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
- unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
-
- pgtable_free(table, shift);
-}
-#else /* !CONFIG_SMP */
-static inline void pgtable_free_tlb(struct mmu_gather *tlb,
- void *table, int shift)
-{
- pgtable_free(table, shift);
-}
-#endif /* CONFIG_SMP */
-
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
unsigned long address)
{
- struct page *page = page_address(table);
-
tlb_flush_pgtable(tlb, address);
- pgtable_page_dtor(page);
- pgtable_free_tlb(tlb, page, 0);
+ pgtable_free_tlb(tlb, table, 0);
}
-
#endif /* CONFIG_PPC_64K_PAGES */
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
@@ -261,7 +230,6 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd);
}
-
#define __pmd_free_tlb(tlb, pmd, addr) \
pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE)
#ifndef CONFIG_PPC_64K_PAGES
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 75fbaceb..e379d3f 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -583,7 +583,9 @@ void __init setup_arch(char **cmdline_p)
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = klimit;
-
+#ifdef CONFIG_PPC_64K_PAGES
+ init_mm.context.pte_frag = NULL;
+#endif
irqstack_early_init();
exc_lvl_early_init();
emergency_stack_init();
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index d1d1b92..178876ae 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -23,6 +23,7 @@
#include <linux/slab.h>
#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
#include "icswx.h"
@@ -85,6 +86,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
spin_lock_init(mm->context.cop_lockp);
#endif /* CONFIG_PPC_ICSWX */
+#ifdef CONFIG_PPC_64K_PAGES
+ mm->context.pte_frag = NULL;
+#endif
return 0;
}
@@ -96,13 +100,46 @@ void __destroy_context(int context_id)
}
EXPORT_SYMBOL_GPL(__destroy_context);
+#ifdef CONFIG_PPC_64K_PAGES
+static void destroy_pagetable_page(struct mm_struct *mm)
+{
+ int count;
+ void *pte_frag;
+ struct page *page;
+
+ pte_frag = mm->context.pte_frag;
+ if (!pte_frag)
+ return;
+
+ page = virt_to_page(pte_frag);
+ /* drop all the pending references */
+ count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
+ /* We allow PTE_FRAG_NR fragments from a PTE page */
+ count = atomic_sub_return(PTE_FRAG_NR - count, &page->_count);
+ if (!count) {
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+}
+
+#else
+static inline void destroy_pagetable_page(struct mm_struct *mm)
+{
+ return;
+}
+#endif
+
+
void destroy_context(struct mm_struct *mm)
{
+
#ifdef CONFIG_PPC_ICSWX
drop_cop(mm->context.acop, mm);
kfree(mm->context.cop_lockp);
mm->context.cop_lockp = NULL;
#endif /* CONFIG_PPC_ICSWX */
+
+ destroy_pagetable_page(mm);
__destroy_context(mm->context.id);
subpage_prot_free(mm);
mm->context.id = MMU_NO_CONTEXT;
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 654258f..a854096 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -337,3 +337,121 @@ EXPORT_SYMBOL(__ioremap_at);
EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(__iounmap);
EXPORT_SYMBOL(__iounmap_at);
+
+#ifdef CONFIG_PPC_64K_PAGES
+static pte_t *get_from_cache(struct mm_struct *mm)
+{
+ void *pte_frag, *ret;
+
+ spin_lock(&mm->page_table_lock);
+ ret = mm->context.pte_frag;
+ if (ret) {
+ pte_frag = ret + PTE_FRAG_SIZE;
+ /*
+ * If we have taken up all the fragments mark PTE page NULL
+ */
+ if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
+ pte_frag = NULL;
+ mm->context.pte_frag = pte_frag;
+ }
+ spin_unlock(&mm->page_table_lock);
+ return (pte_t *)ret;
+}
+
+static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
+{
+ void *ret = NULL;
+ struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
+ __GFP_REPEAT | __GFP_ZERO);
+ if (!page)
+ return NULL;
+
+ ret = page_address(page);
+ spin_lock(&mm->page_table_lock);
+ /*
+ * If we find pgtable_page set, we return
+ * the allocated page with single fragement
+ * count.
+ */
+ if (likely(!mm->context.pte_frag)) {
+ atomic_set(&page->_count, PTE_FRAG_NR);
+ mm->context.pte_frag = ret + PTE_FRAG_SIZE;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ if (!kernel)
+ pgtable_page_ctor(page);
+
+ return (pte_t *)ret;
+}
+
+pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
+{
+ pte_t *pte;
+
+ pte = get_from_cache(mm);
+ if (pte)
+ return pte;
+
+ return __alloc_for_cache(mm, kernel);
+}
+
+void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
+{
+ struct page *page = virt_to_page(table);
+ if (put_page_testzero(page)) {
+ if (!kernel)
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+}
+
+#ifdef CONFIG_SMP
+static void page_table_free_rcu(void *table)
+{
+ struct page *page = virt_to_page(table);
+ if (put_page_testzero(page)) {
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
+{
+ unsigned long pgf = (unsigned long)table;
+
+ BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+ pgf |= shift;
+ tlb_remove_table(tlb, (void *)pgf);
+}
+
+void __tlb_remove_table(void *_table)
+{
+ void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
+ unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
+
+ if (!shift)
+ /* PTE page needs special handling */
+ page_table_free_rcu(table);
+ else {
+ BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+ kmem_cache_free(PGT_CACHE(shift), table);
+ }
+}
+#else
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
+{
+ if (!shift) {
+ /* PTE page needs special handling */
+ struct page *page = virt_to_page(table);
+ if (put_page_testzero(page)) {
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+ } else {
+ BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+ kmem_cache_free(PGT_CACHE(shift), table);
+ }
+}
+#endif
+#endif /* CONFIG_PPC_64K_PAGES */
--
1.8.1.2
^ permalink raw reply related
* [PATCH -V7 18/18] powerpc: Update tlbie/tlbiel as per ISA doc
From: Aneesh Kumar K.V @ 2013-04-28 19:37 UTC (permalink / raw)
To: benh, paulus, dwg, linux-mm; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1367177859-7893-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Encode the actual page size correctly in tlbie/tlbiel. This makes sure we handle
multiple page size segments correctly.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/mm/hash_native_64.c | 32 ++++++++++++++++++++++++++++++--
1 file changed, 30 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index bb920ee..6a2aead 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -61,7 +61,10 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
switch (psize) {
case MMU_PAGE_4K:
+ /* clear out bits after (52) [0....52.....63] */
+ va &= ~((1ul << (64 - 52)) - 1);
va |= ssize << 8;
+ va |= mmu_psize_defs[apsize].sllp << 6;
asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
: : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
: "memory");
@@ -69,9 +72,20 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
default:
/* We need 14 to 14 + i bits of va */
penc = mmu_psize_defs[psize].penc[apsize];
- va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+ va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
va |= penc << 12;
va |= ssize << 8;
+ /* Add AVAL part */
+ if (psize != apsize) {
+ /*
+ * MPSS, 64K base page size and 16MB parge page size
+ * We don't need all the bits, but rest of the bits
+ * must be ignored by the processor.
+ * vpn cover upto 65 bits of va. (0...65) and we need
+ * 58..64 bits of va.
+ */
+ va |= (vpn & 0xfe);
+ }
va |= 1; /* L */
asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
: : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
@@ -96,16 +110,30 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
switch (psize) {
case MMU_PAGE_4K:
+ /* clear out bits after(52) [0....52.....63] */
+ va &= ~((1ul << (64 - 52)) - 1);
va |= ssize << 8;
+ va |= mmu_psize_defs[apsize].sllp << 6;
asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)"
: : "r"(va) : "memory");
break;
default:
/* We need 14 to 14 + i bits of va */
penc = mmu_psize_defs[psize].penc[apsize];
- va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+ va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
va |= penc << 12;
va |= ssize << 8;
+ /* Add AVAL part */
+ if (psize != apsize) {
+ /*
+ * MPSS, 64K base page size and 16MB parge page size
+ * We don't need all the bits, but rest of the bits
+ * must be ignored by the processor.
+ * vpn cover upto 65 bits of va. (0...65) and we need
+ * 58..64 bits of va.
+ */
+ va |= (vpn & 0xfe);
+ }
va |= 1; /* L */
asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)"
: : "r"(va) : "memory");
--
1.8.1.2
^ permalink raw reply related
* [PATCH -V7 03/18] mm/THP: withdraw the pgtable after pmdp related operations
From: Aneesh Kumar K.V @ 2013-04-28 19:37 UTC (permalink / raw)
To: benh, paulus, dwg, linux-mm; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1367177859-7893-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
For architectures like ppc64 we look at deposited pgtable when
calling pmdp_get_and_clear. So do the pgtable_trans_huge_withdraw
after finishing pmdp related operations.
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
mm/huge_memory.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 84f3180..21c5ebd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1363,9 +1363,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
pgtable_t pgtable;
pmd_t orig_pmd;
- pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
+ /*
+ * For architectures like ppc64 we look at deposited pgtable
+ * when calling pmdp_get_and_clear. So do the
+ * pgtable_trans_huge_withdraw after finishing pmdp related
+ * operations.
+ */
orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
tlb->mm->nr_ptes--;
spin_unlock(&tlb->mm->page_table_lock);
--
1.8.1.2
^ permalink raw reply related
* [PATCH -V7 01/18] mm/THP: HPAGE_SHIFT is not a #define on some arch
From: Aneesh Kumar K.V @ 2013-04-28 19:37 UTC (permalink / raw)
To: benh, paulus, dwg, linux-mm; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1367177859-7893-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
On archs like powerpc that support different hugepage sizes, HPAGE_SHIFT
and other derived values like HPAGE_PMD_ORDER are not constants. So move
that to hugepage_init
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
include/linux/huge_mm.h | 3 ---
mm/huge_memory.c | 9 ++++++---
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ee1c244..bdc5aef 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -119,9 +119,6 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma,
} while (0)
extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
pmd_t *pmd);
-#if HPAGE_PMD_ORDER > MAX_ORDER
-#error "hugepages can't be allocated by the buddy allocator"
-#endif
extern int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice);
extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e2f7f5aa..78bd84f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -45,7 +45,7 @@ unsigned long transparent_hugepage_flags __read_mostly =
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
/* default scan 8*512 pte (or vmas) every 30 second */
-static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
@@ -60,7 +60,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
* it would have happened if the vma was large enough during page
* fault.
*/
-static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+static unsigned int khugepaged_max_ptes_none __read_mostly;
static int khugepaged(void *none);
static int khugepaged_slab_init(void);
@@ -620,11 +620,14 @@ static int __init hugepage_init(void)
int err;
struct kobject *hugepage_kobj;
- if (!has_transparent_hugepage()) {
+ if (!has_transparent_hugepage() || (HPAGE_PMD_ORDER > MAX_ORDER)) {
transparent_hugepage_flags = 0;
return -EINVAL;
}
+ khugepaged_pages_to_scan = HPAGE_PMD_NR*8;
+ khugepaged_max_ptes_none = HPAGE_PMD_NR-1;
+
err = hugepage_init_sysfs(&hugepage_kobj);
if (err)
return err;
--
1.8.1.2
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox