* [PATCH v1 1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC
@ 2024-06-20 8:51 Jialong Yang
2024-06-20 8:51 ` [PATCH v1 2/2] powerpc/mmiotrace: bind ioremap and page fault to active mmiotrace Jialong Yang
` (2 more replies)
0 siblings, 3 replies; 6+ messages in thread
From: Jialong Yang @ 2024-06-20 8:51 UTC (permalink / raw)
To: Michael Ellerman, Nicholas Piggin, Christophe Leroy,
Naveen N. Rao
Cc: shenghui.qu, luming.yu, linuxppc-dev, linux-kernel, Jialong Yang
mmiotrace is a useful tool for tracing MMIO accesses. Nowadays, it is only
supported on the x86 and x86_64 platforms. This patch adds support for powerpc.
The manual is located at Documentation/trace/mmiotrace.rst and the user
API is unchanged, so existing users should find it familiar.
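A rough usage sketch, following the steps in that document (paths assume
debugfs/tracefs is mounted at /sys/kernel/debug; the driver name is only a
placeholder):
  echo mmiotrace > /sys/kernel/debug/tracing/current_tracer
  cat /sys/kernel/debug/tracing/trace_pipe > mydump.txt &
  modprobe the_driver_to_trace    # then exercise the device
  echo nop > /sys/kernel/debug/tracing/current_tracer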
Almost all files are copied from x86/mm; the only differences come from
hardware and architecture-specific details.
LINK: https://lore.kernel.org/lkml/20080127195536.50809974@daedalus.pq.iki.fi/
Signed-off-by: Jialong Yang <jialong.yang@shingroup.cn>
---
arch/powerpc/Kconfig.debug | 3 +
arch/powerpc/mm/Makefile | 1 +
arch/powerpc/mm/kmmio.c | 649 +++++++++++++++++++++++++++++++
arch/powerpc/mm/mmio-mod.c | 414 ++++++++++++++++++++
arch/powerpc/mm/mmiotrace_arch.c | 149 +++++++
arch/powerpc/mm/mmiotrace_arch.h | 25 ++
arch/powerpc/mm/pf_in.c | 185 +++++++++
arch/powerpc/mm/pf_in.h | 33 ++
8 files changed, 1459 insertions(+)
create mode 100644 arch/powerpc/mm/kmmio.c
create mode 100644 arch/powerpc/mm/mmio-mod.c
create mode 100644 arch/powerpc/mm/mmiotrace_arch.c
create mode 100644 arch/powerpc/mm/mmiotrace_arch.h
create mode 100644 arch/powerpc/mm/pf_in.c
create mode 100644 arch/powerpc/mm/pf_in.h
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 8c80b154e814..8a69188aa75a 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -1,5 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
+config HAVE_MMIOTRACE_SUPPORT
+ def_bool y
+
config PPC_DISABLE_WERROR
bool "Don't build arch/powerpc code with -Werror"
help
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 0fe2f085c05a..cb92049f1239 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -17,3 +17,4 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump/
obj-$(CONFIG_KASAN) += kasan/
+obj-$(CONFIG_MMIOTRACE) += kmmio.o mmio-mod.o pf_in.o mmiotrace_arch.o
diff --git a/arch/powerpc/mm/kmmio.c b/arch/powerpc/mm/kmmio.c
new file mode 100644
index 000000000000..f4374e721b37
--- /dev/null
+++ b/arch/powerpc/mm/kmmio.c
@@ -0,0 +1,649 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Support for MMIO probes.
+ * Derived from arch/x86/mm/kmmio.c:
+ * Copyright (C) 2024 Jialong Yang (jialong.yang@shingroup.cn)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/hash.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+#include <linux/preempt.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/paca.h>
+#include <linux/errno.h>
+#include <linux/mmiotrace.h>
+
+#include "mmiotrace_arch.h"
+
+typedef unsigned long pteval_t;
+typedef unsigned long pmdval_t;
+
+#define KMMIO_PAGE_HASH_BITS 4
+#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
+
+struct kmmio_fault_page {
+ struct list_head list;
+ struct kmmio_fault_page *release_next;
+ unsigned long addr; /* the requested address */
+ pteval_t old_presence; /* page presence prior to arming */
+ bool armed;
+
+ /*
+ * Number of times this page has been registered as a part
+ * of a probe. If zero, page is disarmed and this may be freed.
+ * Used only by writers (RCU) and post_kmmio_handler().
+ * Protected by kmmio_lock, when linked into kmmio_page_table.
+ */
+ int count;
+
+ bool scheduled_for_release;
+};
+
+struct kmmio_delayed_release {
+ struct rcu_head rcu;
+ struct kmmio_fault_page *release_list;
+};
+
+struct kmmio_context {
+ struct kmmio_fault_page *fpage;
+ struct kmmio_probe *probe;
+ unsigned long saved_flags;
+ unsigned long saved_softe;
+ unsigned long addr;
+ int active;
+};
+
+/*
+ * The kmmio_lock is taken from the page fault and single-step paths,
+ * which are treated like NMI context. This causes lockdep to complain
+ * about it being taken in both NMI and normal context. Hide it from
+ * lockdep, as it should not have any other locks taken under it, and
+ * this is only enabled for debugging mmio anyway.
+ */
+static arch_spinlock_t kmmio_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+
+/* Protected by kmmio_lock */
+unsigned int kmmio_count;
+
+/* Read-protected by RCU, write-protected by kmmio_lock. */
+static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
+static LIST_HEAD(kmmio_probes);
+
+static struct list_head *kmmio_page_list(unsigned long addr)
+{
+ unsigned int l;
+ pte_t *pte = lookup_address(addr, &l);
+
+ if (!pte)
+ return NULL;
+ addr &= page_level_mask(l);
+
+ return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)];
+}
+
+/* Accessed per-cpu */
+static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
+
+/*
+ * this is basically a dynamic stabbing problem:
+ * Could use the existing prio tree code or
+ * Possible better implementations:
+ * The Interval Skip List: A Data Structure for Finding All Intervals That
+ * Overlap a Point (might be simple)
+ * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
+ */
+/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
+static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
+{
+ struct kmmio_probe *p;
+
+ list_for_each_entry_rcu(p, &kmmio_probes, list) {
+ if (addr >= p->addr && addr < (p->addr + p->len))
+ return p;
+ }
+ return NULL;
+}
+
+/* You must be holding RCU read lock. */
+static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
+{
+ struct list_head *head;
+ struct kmmio_fault_page *f;
+ unsigned int l;
+ pte_t *pte = lookup_address(addr, &l);
+
+ if (!pte)
+ return NULL;
+ addr &= page_level_mask(l);
+ head = kmmio_page_list(addr);
+ list_for_each_entry_rcu(f, head, list) {
+ if (f->addr == addr)
+ return f;
+ }
+ return NULL;
+}
+
+static inline pmd_t pmd_mkinvalid(pmd_t pmd)
+{
+ return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
+}
+
+static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
+{
+ pmd_t new_pmd;
+ pmdval_t v = pmd_val(*pmd);
+
+ if (clear) {
+ *old = v;
+ new_pmd = pmd_mkinvalid(*pmd);
+ } else {
+ /* Presume this has been called with clear==true previously */
+ new_pmd = __pmd(*old);
+ }
+ *pmd = new_pmd;
+}
+
+static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old, unsigned long addr)
+{
+ pteval_t v = pte_val(*pte);
+
+ if (clear) {
+ *old = v;
+ /* Nothing should care about address */
+ pte_clear(&init_mm, addr, pte);
+ } else {
+ /* Presume this has been called with clear==true previously */
+ set_pte_at(&init_mm, addr, pte, __pte(*old));
+ }
+}
+
+static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
+{
+ unsigned int level;
+ pte_t *pte = lookup_address(f->addr, &level);
+
+ if (!pte) {
+ pr_err("no pte for addr 0x%08lx\n", f->addr);
+ return -1;
+ }
+
+ if (level == PMD_SHIFT)
+ clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
+ else if (level == PAGE_SHIFT)
+ clear_pte_presence(pte, clear, &f->old_presence, f->addr);
+ else {
+ pr_err("unexpected page level 0x%x.\n", level);
+ return -1;
+ }
+
+ mmap_read_lock(&init_mm);
+ struct vm_area_struct *vma = find_vma(&init_mm, f->addr);
+
+ mmap_read_unlock(&init_mm);
+
+ flush_tlb_page(vma, f->addr);
+
+ return 0;
+}
+
+/*
+ * Mark the given page as not present. Access to it will trigger a fault.
+ *
+ * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
+ * protection is ignored here. RCU read lock is assumed held, so the struct
+ * will not disappear unexpectedly. Furthermore, the caller must guarantee,
+ * that double arming the same virtual address (page) cannot occur.
+ *
+ * Double disarming on the other hand is allowed, and may occur when a fault
+ * and mmiotrace shutdown happen simultaneously.
+ */
+static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
+{
+ int ret;
+
+ WARN_ONCE(f->armed, pr_fmt("kmmio page already armed.\n"));
+ if (f->armed) {
+ pr_warn("double-arm: addr 0x%08lx, ref %d, old %d\n",
+ f->addr, f->count, !!f->old_presence);
+ }
+ ret = clear_page_presence(f, true);
+ WARN_ONCE(ret < 0, pr_fmt("arming at 0x%08lx failed.\n"),
+ f->addr);
+ f->armed = true;
+ return ret;
+}
+
+/** Restore the given page to saved presence state. */
+static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
+{
+ int ret = clear_page_presence(f, false);
+
+ WARN_ONCE(ret < 0,
+ KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
+ f->armed = false;
+}
+
+/*
+ * This is being called from do_page_fault().
+ *
+ * We may be in an interrupt or a critical section. Also prefetching may
+ * trigger a page fault. We may be in the middle of process switch.
+ * We cannot take any locks, because we could be executing especially
+ * within a kmmio critical section.
+ *
+ * Local interrupts are disabled, so preemption cannot happen.
+ * Do not enable interrupts, do not sleep, and watch out for other CPUs.
+ */
+/*
+ * Interrupts are disabled on entry, as the fault is taken with
+ * interrupts off, and they remain disabled throughout this function.
+ */
+int kmmio_handler(struct pt_regs *regs, unsigned long addr)
+{
+ struct kmmio_context *ctx;
+ struct kmmio_fault_page *faultpage;
+ int ret = 0; /* default to fault not handled */
+ unsigned long page_base = addr;
+ unsigned int l;
+ pte_t *pte = lookup_address(addr, &l);
+
+ if (!pte)
+ return -EINVAL;
+ page_base &= page_level_mask(l);
+
+ /*
+ * Hold the RCU read lock over single stepping to avoid looking
+ * up the probe and kmmio_fault_page again. The rcu_read_lock_sched()
+ * also disables preemption and prevents process switch during
+ * the single stepping. We can only handle one active kmmio trace
+ * per cpu, so ensure that we finish it before something else
+ * gets to run.
+ */
+ rcu_read_lock_sched_notrace();
+
+ faultpage = get_kmmio_fault_page(page_base);
+ if (!faultpage) {
+ /*
+ * Either this page fault is not caused by kmmio, or
+ * another CPU just pulled the kmmio probe from under
+ * our feet. The latter case should not be possible.
+ */
+ goto no_kmmio;
+ }
+
+ ctx = this_cpu_ptr(&kmmio_ctx);
+ if (ctx->active) {
+ if (page_base == ctx->addr) {
+ /*
+ * A second fault on the same page means some other
+ * condition needs handling by do_page_fault(), the
+ * page really not being present is the most common.
+ */
+ pr_debug("secondary hit for 0x%08lx CPU %d.\n",
+ addr, smp_processor_id());
+
+ if (!faultpage->old_presence)
+ pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
+ addr, smp_processor_id());
+ } else {
+ /*
+ * Prevent overwriting already in-flight context.
+ * This should not happen, let's hope disarming at
+ * least prevents a panic.
+ */
+ pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
+ smp_processor_id(), addr);
+ pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
+ disarm_kmmio_fault_page(faultpage);
+ }
+ goto no_kmmio;
+ }
+ ctx->active++;
+
+ ctx->fpage = faultpage;
+ ctx->probe = get_kmmio_probe(page_base);
+ ctx->saved_flags = (regs->msr & (MSR_SE | MSR_EE));
+ ctx->saved_softe = regs->softe;
+ ctx->addr = page_base;
+
+ if (ctx->probe && ctx->probe->pre_handler)
+ ctx->probe->pre_handler(ctx->probe, regs, addr);
+
+ /*
+ * Enable single-stepping and disable interrupts for the faulting
+ * context. Local interrupts must not get enabled during stepping.
+ */
+ regs->msr |= MSR_SE; // single step
+ regs->msr &= ~MSR_EE; // hard interrupt
+ regs->softe = IRQS_DISABLED; // soft interrupt
+
+ local_paca->srr_valid = 0;
+
+ /* Now we set present bit in PTE and single step. */
+ disarm_kmmio_fault_page(ctx->fpage);
+
+ /*
+ * If another cpu accesses the same page while we are stepping,
+ * the access will not be caught. It will simply succeed and the
+ * only downside is we lose the event. If this becomes a problem,
+ * the user should drop to single cpu before tracing.
+ */
+
+ return 1; /* fault handled */
+
+no_kmmio:
+ rcu_read_unlock_sched_notrace();
+ return ret;
+}
+
+/*
+ * Interrupts are disabled on entry, as the single-step exception is
+ * taken with interrupts off, and they remain disabled throughout this
+ * function.
+ * This must always get called as the pair to kmmio_handler().
+ */
+static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
+{
+ int ret = 0;
+ struct kmmio_context *ctx = this_cpu_ptr(&kmmio_ctx);
+
+ if (!ctx->active) {
+ /*
+ * debug traps without an active context are due to either
+ * something external causing them (f.e. using a debugger while
+ * mmio tracing enabled), or erroneous behaviour
+ */
+ pr_warn("unexpected debug trap on CPU %d.\n", smp_processor_id());
+ goto out;
+ }
+
+ if (ctx->probe && ctx->probe->post_handler)
+ ctx->probe->post_handler(ctx->probe, condition, regs);
+
+ /* Prevent racing against release_kmmio_fault_page(). */
+ arch_spin_lock(&kmmio_lock);
+ if (ctx->fpage->count)
+ arm_kmmio_fault_page(ctx->fpage);
+ arch_spin_unlock(&kmmio_lock);
+
+ // single step was already cleared on entry to single_step_exception().
+ // regs->msr &= ~MSR_SE;
+ regs->msr |= ctx->saved_flags;
+ regs->softe = ctx->saved_softe;
+
+ /* These were acquired in kmmio_handler(). */
+ ctx->active--;
+ BUG_ON(ctx->active);
+ rcu_read_unlock_sched_notrace();
+
+ /*
+ * If somebody else is single-stepping across a probe point, MSR will
+ * still have MSR_SE set, in which case continue the remaining processing
+ * of the single-step exception, as if this is not a probe hit.
+ */
+ if (!(regs->msr & MSR_SE))
+ ret = 1;
+out:
+ return ret;
+}
+
+/* You must be holding kmmio_lock. */
+static int add_kmmio_fault_page(unsigned long addr)
+{
+ struct kmmio_fault_page *f;
+
+ f = get_kmmio_fault_page(addr);
+ if (f) {
+ if (!f->count)
+ arm_kmmio_fault_page(f);
+ f->count++;
+ return 0;
+ }
+
+ f = kzalloc(sizeof(*f), GFP_ATOMIC);
+ if (!f)
+ return -1;
+
+ f->count = 1;
+ f->addr = addr;
+
+ if (arm_kmmio_fault_page(f)) {
+ kfree(f);
+ return -1;
+ }
+
+ list_add_rcu(&f->list, kmmio_page_list(f->addr));
+
+ return 0;
+}
+
+/* You must be holding kmmio_lock. */
+static void release_kmmio_fault_page(unsigned long addr,
+ struct kmmio_fault_page **release_list)
+{
+ struct kmmio_fault_page *f;
+
+ f = get_kmmio_fault_page(addr);
+ if (!f)
+ return;
+
+ f->count--;
+ BUG_ON(f->count < 0);
+ if (!f->count) {
+ disarm_kmmio_fault_page(f);
+ if (!f->scheduled_for_release) {
+ f->release_next = *release_list;
+ *release_list = f;
+ f->scheduled_for_release = true;
+ }
+ }
+}
+
+/*
+ * With page-unaligned ioremaps, one or two armed pages may contain
+ * addresses from outside the intended mapping. Events for these addresses
+ * are currently silently dropped. The events may result only from programming
+ * mistakes by accessing addresses before the beginning or past the end of a
+ * mapping.
+ */
+int register_kmmio_probe(struct kmmio_probe *p)
+{
+ unsigned long flags;
+ int ret = 0;
+ unsigned long size = 0;
+ unsigned long addr = p->addr & PAGE_MASK;
+ const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+ unsigned int l;
+ pte_t *pte;
+
+ local_irq_save(flags);
+ arch_spin_lock(&kmmio_lock);
+ if (get_kmmio_probe(addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ pte = lookup_address(addr, &l);
+ if (!pte) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ kmmio_count++;
+ list_add_rcu(&p->list, &kmmio_probes);
+ while (size < size_lim) {
+ if (add_kmmio_fault_page(addr + size))
+ pr_err("Unable to set page fault.\n");
+ size += page_level_size(l);
+ }
+out:
+ arch_spin_unlock(&kmmio_lock);
+ local_irq_restore(flags);
+
+ /*
+ * XXX: What should I do here?
+ * Here was a call to global_flush_tlb(), but it does not exist
+ * anymore. It seems it's not needed after all.
+ */
+ return ret;
+}
+EXPORT_SYMBOL(register_kmmio_probe);
+
+static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
+{
+ struct kmmio_delayed_release *dr = container_of(
+ head,
+ struct kmmio_delayed_release,
+ rcu);
+ struct kmmio_fault_page *f = dr->release_list;
+
+ while (f) {
+ struct kmmio_fault_page *next = f->release_next;
+
+ BUG_ON(f->count);
+ kfree(f);
+ f = next;
+ }
+ kfree(dr);
+}
+
+static void remove_kmmio_fault_pages(struct rcu_head *head)
+{
+ struct kmmio_delayed_release *dr =
+ container_of(head, struct kmmio_delayed_release, rcu);
+ struct kmmio_fault_page *f = dr->release_list;
+ struct kmmio_fault_page **prevp = &dr->release_list;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ arch_spin_lock(&kmmio_lock);
+ while (f) {
+ if (!f->count) {
+ list_del_rcu(&f->list);
+ prevp = &f->release_next;
+ } else {
+ *prevp = f->release_next;
+ f->release_next = NULL;
+ f->scheduled_for_release = false;
+ }
+ f = *prevp;
+ }
+ arch_spin_unlock(&kmmio_lock);
+ local_irq_restore(flags);
+
+ /* This is the real RCU destroy call. */
+ call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
+}
+
+/*
+ * Remove a kmmio probe. You have to synchronize_rcu() before you can be
+ * sure that the callbacks will not be called anymore. Only after that
+ * you may actually release your struct kmmio_probe.
+ *
+ * Unregistering a kmmio fault page has three steps:
+ * 1. release_kmmio_fault_page()
+ * Disarm the page, wait a grace period to let all faults finish.
+ * 2. remove_kmmio_fault_pages()
+ * Remove the pages from kmmio_page_table.
+ * 3. rcu_free_kmmio_fault_pages()
+ * Actually free the kmmio_fault_page structs as with RCU.
+ */
+void unregister_kmmio_probe(struct kmmio_probe *p)
+{
+ unsigned long flags;
+ unsigned long size = 0;
+ unsigned long addr = p->addr & PAGE_MASK;
+ const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+ struct kmmio_fault_page *release_list = NULL;
+ struct kmmio_delayed_release *drelease;
+ unsigned int l;
+ pte_t *pte;
+
+ pte = lookup_address(addr, &l);
+ if (!pte)
+ return;
+
+ local_irq_save(flags);
+ arch_spin_lock(&kmmio_lock);
+ while (size < size_lim) {
+ release_kmmio_fault_page(addr + size, &release_list);
+ size += page_level_size(l);
+ }
+ list_del_rcu(&p->list);
+ kmmio_count--;
+ arch_spin_unlock(&kmmio_lock);
+ local_irq_restore(flags);
+
+ if (!release_list)
+ return;
+
+ drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
+ if (!drelease)
+ return;
+
+ drelease->release_list = release_list;
+
+ /*
+ * This is not really RCU here. We have just disarmed a set of
+ * pages so that they cannot trigger page faults anymore. However,
+ * we cannot remove the pages from kmmio_page_table,
+ * because a probe hit might be in flight on another CPU. The
+ * pages are collected into a list, and they will be removed from
+ * kmmio_page_table when it is certain that no probe hit related to
+ * these pages can be in flight. RCU grace period sounds like a
+ * good choice.
+ *
+ * If we removed the pages too early, kmmio page fault handler might
+ * not find the respective kmmio_fault_page and determine it's not
+ * a kmmio fault, when it actually is. This would lead to madness.
+ */
+ call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
+}
+EXPORT_SYMBOL(unregister_kmmio_probe);
+
+static int
+kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
+{
+ struct die_args *arg = args;
+
+ if (val == DIE_SSTEP && post_kmmio_handler(0, arg->regs) == 1)
+ return NOTIFY_STOP;
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nb_die = {
+ .notifier_call = kmmio_die_notifier
+};
+
+int kmmio_init(void)
+{
+ int i;
+
+ for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&kmmio_page_table[i]);
+
+ return register_die_notifier(&nb_die);
+}
+
+void kmmio_cleanup(void)
+{
+ int i;
+
+ unregister_die_notifier(&nb_die);
+ for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
+ WARN_ONCE(!list_empty(&kmmio_page_table[i]),
+ pr_fmt("kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n"));
+ }
+}
diff --git a/arch/powerpc/mm/mmio-mod.c b/arch/powerpc/mm/mmio-mod.c
new file mode 100644
index 000000000000..68ba9f028678
--- /dev/null
+++ b/arch/powerpc/mm/mmio-mod.c
@@ -0,0 +1,414 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Derived from arch/x86/mm/mmio-mod.c:
+ * Copyright (C) 2024 Jialong Yang (jialong.yang@shingroup.cn)
+ */
+
+#define pr_fmt(fmt) "mmiotrace: " fmt
+
+#include <linux/moduleparam.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/mmiotrace.h>
+#include <linux/pgtable.h>
+#include <linux/atomic.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+
+#include "pf_in.h"
+#include "mmiotrace_arch.h"
+
+struct remap_trace {
+ struct list_head list;
+ struct kmmio_probe probe;
+ resource_size_t phys;
+ unsigned long id;
+};
+
+/* Accessed per-cpu. */
+static DEFINE_PER_CPU(struct trap_reason, pf_reason);
+static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
+
+static DEFINE_MUTEX(mmiotrace_mutex);
+static DEFINE_SPINLOCK(trace_lock);
+static atomic_t mmiotrace_enabled;
+static LIST_HEAD(trace_list); /* struct remap_trace */
+
+/*
+ * Locking in this file:
+ * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
+ * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
+ * and trace_lock.
+ * - Routines depending on is_enabled() must take trace_lock.
+ * - trace_list users must hold trace_lock.
+ * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
+ * - pre/post callbacks assume the effect of is_enabled() being true.
+ */
+
+/* module parameters */
+static unsigned long filter_offset;
+static bool nommiotrace;
+
+module_param(filter_offset, ulong, 0);
+module_param(nommiotrace, bool, 0);
+
+MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
+MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
+
+static bool is_enabled(void)
+{
+ return atomic_read(&mmiotrace_enabled);
+}
+
+static void print_pte(unsigned long address)
+{
+ unsigned int level;
+ pte_t *pte = lookup_address(address, &level);
+
+ if (!pte) {
+ pr_err("Error in %s: no pte for page 0x%08lx\n",
+ __func__, address);
+ return;
+ }
+
+ if (level == PMD_SHIFT) {
+ pr_emerg("4MB pages are not currently supported: 0x%08lx\n",
+ address);
+ BUG();
+ }
+ pr_info("pte for 0x%lx: 0x%llx 0x%llx\n",
+ address,
+ (unsigned long long)pte_val(*pte),
+ (unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
+}
+
+/*
+ * For some reason the pre/post pairs have been called in an
+ * unmatched order. Report and die.
+ */
+static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
+{
+ const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+
+ pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n",
+ addr, my_reason->addr);
+ print_pte(addr);
+ pr_emerg("faulting IP is at %pS\n", (void *)regs->nip);
+ pr_emerg("last faulting IP was at %pS\n", (void *)my_reason->ip);
+ put_cpu_var(pf_reason);
+ BUG();
+}
+
+static void pre(struct kmmio_probe *p, struct pt_regs *regs,
+ unsigned long addr)
+{
+ struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+ struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
+ const unsigned long instptr = instruction_pointer(regs);
+ struct opcode_t *opcode = get_opcode((unsigned int *)instptr);
+ enum mm_io_opcode type = get_ins_type(opcode);
+ struct remap_trace *trace = p->private;
+
+ /* it doesn't make sense to have more than one active trace per cpu */
+ if (my_reason->active_traces)
+ die_kmmio_nesting_error(regs, addr);
+ else
+ my_reason->active_traces++;
+
+ if (!opcode) {
+ pr_warn("The ins may be not included in src. Tell the dever follow info:");
+ pr_warn("ins_addr: 0x%lx ins: 0x%lx", instptr, *(unsigned long *)instptr);
+ }
+
+ my_reason->opcode = opcode;
+
+ my_reason->addr = addr;
+ my_reason->ip = instptr;
+
+ my_trace->phys = addr - trace->probe.addr + trace->phys;
+ my_trace->map_id = trace->id;
+
+ my_trace->pc = instptr;
+
+ my_trace->opcode = type;
+ my_trace->width = get_ins_width(opcode);
+
+ if (type == MMIO_WRITE)
+ my_trace->value = get_ins_val(my_reason, regs);
+
+ put_cpu_var(cpu_trace);
+ put_cpu_var(pf_reason);
+}
+
+static void post(struct kmmio_probe *p, unsigned long condition,
+ struct pt_regs *regs)
+{
+ struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+ struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
+ struct opcode_t *opcode = my_reason->opcode;
+ enum mm_io_opcode type = get_ins_type(opcode);
+
+ /* this should always return the active_trace count to 0 */
+ my_reason->active_traces--;
+ if (my_reason->active_traces) {
+ pr_emerg("unexpected post handler");
+ BUG();
+ }
+
+ if (type == MMIO_READ)
+ my_trace->value = get_ins_val(my_reason, regs);
+
+ mmio_trace_rw(my_trace);
+ put_cpu_var(cpu_trace);
+ put_cpu_var(pf_reason);
+}
+
+static void ioremap_trace_core(resource_size_t offset, unsigned long size,
+ void __iomem *addr)
+{
+ static atomic_t next_id;
+ struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
+ /* These are page-unaligned. */
+ struct mmiotrace_map map = {
+ .phys = offset,
+ .virt = (unsigned long)addr,
+ .len = size,
+ .opcode = MMIO_PROBE
+ };
+
+ if (!trace) {
+ pr_err("kmalloc failed in ioremap\n");
+ return;
+ }
+
+ *trace = (struct remap_trace) {
+ .probe = {
+ .addr = (unsigned long)addr,
+ .len = size,
+ .pre_handler = pre,
+ .post_handler = post,
+ .private = trace
+ },
+ .phys = offset,
+ .id = atomic_inc_return(&next_id)
+ };
+ map.map_id = trace->id;
+
+ spin_lock_irq(&trace_lock);
+ if (!is_enabled()) {
+ kfree(trace);
+ goto not_enabled;
+ }
+
+ mmio_trace_mapping(&map);
+ list_add_tail(&trace->list, &trace_list);
+ if (!nommiotrace)
+ register_kmmio_probe(&trace->probe);
+
+not_enabled:
+ spin_unlock_irq(&trace_lock);
+}
+
+void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
+ void __iomem *addr)
+{
+ pr_err("ioremap_*(0x%llx, 0x%lx) = %p\n",
+ (unsigned long long)offset, size, addr);
+ if (!is_enabled()) /* recheck and proper locking in *_core() */
+ return;
+
+ pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n",
+ (unsigned long long)offset, size, addr);
+ if ((filter_offset) && (offset != filter_offset))
+ return;
+ ioremap_trace_core(offset, size, addr);
+}
+
+static void iounmap_trace_core(volatile void __iomem *addr)
+{
+ struct mmiotrace_map map = {
+ .phys = 0,
+ .virt = (unsigned long)addr,
+ .len = 0,
+ .opcode = MMIO_UNPROBE
+ };
+ struct remap_trace *trace;
+ struct remap_trace *tmp;
+ struct remap_trace *found_trace = NULL;
+
+ pr_debug("Unmapping %p.\n", addr);
+
+ spin_lock_irq(&trace_lock);
+ if (!is_enabled())
+ goto not_enabled;
+
+ list_for_each_entry_safe(trace, tmp, &trace_list, list) {
+ if ((unsigned long)addr == trace->probe.addr) {
+ if (!nommiotrace)
+ unregister_kmmio_probe(&trace->probe);
+ list_del(&trace->list);
+ found_trace = trace;
+ break;
+ }
+ }
+ map.map_id = (found_trace) ? found_trace->id : -1;
+ mmio_trace_mapping(&map);
+
+not_enabled:
+ spin_unlock_irq(&trace_lock);
+ if (found_trace) {
+ synchronize_rcu(); /* unregister_kmmio_probe() requirement */
+ kfree(found_trace);
+ }
+}
+
+void mmiotrace_iounmap(volatile void __iomem *addr)
+{
+ might_sleep();
+ if (is_enabled()) /* recheck and proper locking in *_core() */
+ iounmap_trace_core(addr);
+}
+
+int mmiotrace_printk(const char *fmt, ...)
+{
+ int ret = 0;
+ va_list args;
+ unsigned long flags;
+
+ va_start(args, fmt);
+
+ spin_lock_irqsave(&trace_lock, flags);
+ if (is_enabled())
+ ret = mmio_trace_printk(fmt, args);
+ spin_unlock_irqrestore(&trace_lock, flags);
+
+ va_end(args);
+ return ret;
+}
+EXPORT_SYMBOL(mmiotrace_printk);
+
+static void clear_trace_list(void)
+{
+ struct remap_trace *trace;
+ struct remap_trace *tmp;
+
+ /*
+ * No locking required, because the caller ensures we are in a
+ * critical section via mutex, and is_enabled() is false,
+ * i.e. nothing can traverse or modify this list.
+ * Caller also ensures is_enabled() cannot change.
+ */
+ list_for_each_entry(trace, &trace_list, list) {
+ pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n",
+ trace->probe.addr, trace->probe.len);
+ if (!nommiotrace)
+ unregister_kmmio_probe(&trace->probe);
+ }
+ synchronize_rcu(); /* unregister_kmmio_probe() requirement */
+
+ list_for_each_entry_safe(trace, tmp, &trace_list, list) {
+ list_del(&trace->list);
+ kfree(trace);
+ }
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static cpumask_var_t downed_cpus;
+
+static void enter_uniprocessor(void)
+{
+ int cpu;
+ int err;
+
+ if (!cpumask_available(downed_cpus) &&
+ !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
+ pr_notice("Failed to allocate mask\n");
+ goto out;
+ }
+
+ cpus_read_lock();
+ cpumask_copy(downed_cpus, cpu_online_mask);
+ cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
+ if (num_online_cpus() > 1)
+ pr_notice("Disabling non-boot CPUs...\n");
+ cpus_read_unlock();
+
+ for_each_cpu(cpu, downed_cpus) {
+ err = remove_cpu(cpu);
+ if (!err)
+ pr_info("CPU%d is down.\n", cpu);
+ else
+ pr_err("Error taking CPU%d down: %d\n", cpu, err);
+ }
+out:
+ if (num_online_cpus() > 1)
+ pr_warn("multiple CPUs still online, may miss events.\n");
+}
+
+static void leave_uniprocessor(void)
+{
+ int cpu;
+ int err;
+
+ if (!cpumask_available(downed_cpus) || cpumask_empty(downed_cpus))
+ return;
+ pr_notice("Re-enabling CPUs...\n");
+ for_each_cpu(cpu, downed_cpus) {
+ err = add_cpu(cpu);
+ if (!err)
+ pr_info("enabled CPU%d.\n", cpu);
+ else
+ pr_err("cannot re-enable CPU%d: %d\n", cpu, err);
+ }
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+static void enter_uniprocessor(void)
+{
+ if (num_online_cpus() > 1)
+ pr_warn("multiple CPUs are online, may miss events. Suggest booting with maxcpus=1 kernel argument.\n");
+}
+
+static void leave_uniprocessor(void)
+{
+}
+#endif
+
+void enable_mmiotrace(void)
+{
+ mutex_lock(&mmiotrace_mutex);
+ if (is_enabled())
+ goto out;
+
+ if (nommiotrace)
+ pr_info("MMIO tracing disabled.\n");
+ kmmio_init();
+ enter_uniprocessor();
+ spin_lock_irq(&trace_lock);
+ atomic_inc(&mmiotrace_enabled);
+ spin_unlock_irq(&trace_lock);
+ pr_info("enabled.\n");
+out:
+ mutex_unlock(&mmiotrace_mutex);
+}
+
+void disable_mmiotrace(void)
+{
+ mutex_lock(&mmiotrace_mutex);
+ if (!is_enabled())
+ goto out;
+
+ spin_lock_irq(&trace_lock);
+ atomic_dec(&mmiotrace_enabled);
+ BUG_ON(is_enabled());
+ spin_unlock_irq(&trace_lock);
+
+ clear_trace_list(); /* guarantees: no more kmmio callbacks */
+ leave_uniprocessor();
+ kmmio_cleanup();
+ pr_info("disabled.\n");
+out:
+ mutex_unlock(&mmiotrace_mutex);
+}
diff --git a/arch/powerpc/mm/mmiotrace_arch.c b/arch/powerpc/mm/mmiotrace_arch.c
new file mode 100644
index 000000000000..ccc8032384ef
--- /dev/null
+++ b/arch/powerpc/mm/mmiotrace_arch.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Derived from arch/powerpc/mm/pgtable.c:
+ * Copyright (C) 2024 Jialong Yang (jialong.yang@shingroup.cn)
+ */
+
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/hugetlb.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/hugetlb.h>
+
+#include "mmiotrace_arch.h"
+
+static pte_t *mmiotrace_find_linux_pte(pgd_t *pgdp, unsigned long ea,
+ bool *is_thp, unsigned int *hpage_shift)
+{
+ p4d_t p4d, *p4dp;
+ pud_t pud, *pudp;
+ pmd_t pmd, *pmdp;
+ pte_t *ret_pte;
+ hugepd_t *hpdp = NULL;
+ unsigned int pdshift;
+
+ if (hpage_shift)
+ *hpage_shift = 0;
+
+ if (is_thp)
+ *is_thp = false;
+
+ /*
+ * Always operate on the local stack value. This makes sure the
+ * value doesn't get updated by a parallel THP split/collapse,
+ * page fault or page unmap. The returned pte_t * is still not
+ * stable, so the above conditions must be re-checked when it is
+ * dereferenced. The top level is an exception because pgd is
+ * folded into p4d.
+ */
+ p4dp = p4d_offset(pgdp, ea);
+ p4d = READ_ONCE(*p4dp);
+ pdshift = P4D_SHIFT;
+
+ if (p4d_none(p4d))
+ return NULL;
+
+ if (p4d_leaf(p4d)) {
+ ret_pte = (pte_t *)p4dp;
+ goto out;
+ }
+
+ if (is_hugepd(__hugepd(p4d_val(p4d)))) {
+ hpdp = (hugepd_t *)&p4d;
+ goto out_huge;
+ }
+
+ /*
+ * Even if we end up with an unmap, the pgtable will not
+ * be freed, because we do an RCU free and we are running
+ * with IRQs disabled here.
+ */
+ pdshift = PUD_SHIFT;
+ pudp = pud_offset(&p4d, ea);
+ pud = READ_ONCE(*pudp);
+
+ if (pud_none(pud))
+ return NULL;
+
+ if (pud_leaf(pud)) {
+ ret_pte = (pte_t *)pudp;
+ goto out;
+ }
+
+ if (is_hugepd(__hugepd(pud_val(pud)))) {
+ hpdp = (hugepd_t *)&pud;
+ goto out_huge;
+ }
+
+ pdshift = PMD_SHIFT;
+ pmdp = pmd_offset(&pud, ea);
+ pmd = READ_ONCE(*pmdp);
+
+ /*
+ * A hugepage collapse is captured by this condition, see
+ * pmdp_collapse_flush.
+ */
+ if (pmd_none(pmd))
+ return NULL;
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ /*
+ * A hugepage split is captured by this condition, see
+ * pmdp_invalidate.
+ *
+ * Huge page modification can be caught here too.
+ */
+ if (pmd_is_serializing(pmd))
+ return NULL;
+#endif
+
+ if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
+ if (is_thp)
+ *is_thp = true;
+ ret_pte = (pte_t *)pmdp;
+ goto out;
+ }
+
+ if (pmd_leaf(pmd)) {
+ ret_pte = (pte_t *)pmdp;
+ goto out;
+ }
+
+ if (is_hugepd(__hugepd(pmd_val(pmd)))) {
+ hpdp = (hugepd_t *)&pmd;
+ goto out_huge;
+ }
+
+ pdshift = PAGE_SHIFT;
+
+ if (hpage_shift)
+ *hpage_shift = pdshift;
+
+ return pte_offset_kernel(&pmd, ea);
+
+out_huge:
+ if (!hpdp)
+ return NULL;
+
+ ret_pte = hugepte_offset(*hpdp, ea, pdshift);
+ pdshift = hugepd_shift(*hpdp);
+out:
+ if (hpage_shift)
+ *hpage_shift = pdshift;
+ return ret_pte;
+}
+
+pte_t *lookup_address(unsigned long address, unsigned int *shift)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ pte_t *pte = mmiotrace_find_linux_pte(pgd_offset_k(address), address, NULL, shift);
+
+ local_irq_restore(flags);
+
+ return pte;
+}
diff --git a/arch/powerpc/mm/mmiotrace_arch.h b/arch/powerpc/mm/mmiotrace_arch.h
new file mode 100644
index 000000000000..f4a5bff24a07
--- /dev/null
+++ b/arch/powerpc/mm/mmiotrace_arch.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Derived from arch/powerpc/mm/pgtable.c:
+ * Copyright (C) 2024 Jialong Yang (jialong.yang@shingroup.cn)
+ */
+
+#ifndef __MMIOTRACE_ARCH_
+#define __MMIOTRACE_ARCH_
+#include <asm/pgtable.h>
+
+static inline int page_level_shift(unsigned int level)
+{
+ return level;
+}
+static inline unsigned long page_level_size(unsigned int level)
+{
+ return 1UL << page_level_shift(level);
+}
+static inline unsigned long page_level_mask(unsigned int level)
+{
+ return ~(page_level_size(level) - 1);
+}
+
+pte_t *lookup_address(unsigned long address, unsigned int *level);
+#endif // __MMIOTRACE_ARCH_
diff --git a/arch/powerpc/mm/pf_in.c b/arch/powerpc/mm/pf_in.c
new file mode 100644
index 000000000000..e6c90b383e7f
--- /dev/null
+++ b/arch/powerpc/mm/pf_in.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Derived from arch/x86/mm/pf_in.c:
+ * Copyright (C) 2024 Jialong Yang (jialong.yang@shingroup.cn)
+ */
+
+#include <linux/ptrace.h> /* struct pt_regs */
+#include "pf_in.h"
+#include <linux/printk.h>
+#include <linux/mmiotrace.h>
+
+/* D 32 0x80000000 B lwz Load Word and Zero */
+/* D 33 0x84000000 B lwzu Load Word and Zero with Update */
+/* D 34 0x88000000 B lbz Load Byte and Zero */
+/* D 35 0x8C000000 B lbzu Load Byte and Zero with Update */
+/* D 36 0x90000000 B stw Store Word */
+/* D 37 0x94000000 B stwu Store Word with Update */
+/* D 38 0x98000000 B stb Store Byte */
+/* D 39 0x9C000000 B stbu Store Byte with Update */
+/* D 40 0xA0000000 B lhz Load Halfword and Zero */
+/* D 41 0xA4000000 B lhzu Load Halfword and Zero with Update */
+/* D 42 0xA8000000 B lha Load Halfword Algebraic */
+/* D 43 0xAC000000 B lhau Load Halfword Algebraic with Update */
+/* D 44 0xB0000000 B sth Store Halfword */
+/* D 45 0xB4000000 B sthu Store Halfword with Update */
+/* D 46 0xB8000000 B lmw Load Multiple Word */
+/* D 47 0xBC000000 B stmw Store Multiple Word */
+/* D 48 0xC0000000 FP lfs Load Floating-Point Single */
+/* D 49 0xC4000000 FP lfsu Load Floating-Point Single with Update */
+/* D 50 0xC8000000 FP lfd Load Floating-Point Double */
+/* D 51 0xCC000000 FP lfdu Load Floating-Point Double with Update */
+/* D 52 0xD0000000 FP stfs Store Floating-Point Single */
+/* D 53 0xD4000000 FP stfsu Store Floating-Point Single with Update */
+/* D 54 0xD8000000 FP stfd Store Floating-Point Double */
+/* D 55 0xDC000000 FP stfdu Store Floating-Point Double with Update */
+/* DQ 56 0xE0000000 P 58 LSQ lq Load Quadword */
+/* DS 57 0xE4000000 140 FP.out Lfdp Load Floating-Point Double Pair */
+/* DS 58 0xE8000000 53 64 Ld Load Doubleword */
+/* DS 58 0xE8000001 53 64 Ldu Load Doubleword with Update */
+/* DS 58 0xE8000002 52 64 Lwa Load Word Algebraic */
+/* DS 62 0xF8000000 57 64 std Store Doubleword */
+/* DS 62 0xF8000001 57 64 stdu Store Doubleword with Update */
+/* DS 62 0xF8000002 59 LSQ stq Store Quadword */
+
+// D-form:
+// 0-5 6-10 11-15 16-31
+// opcode RT RA Offset
+
+// DQ-form:
+// 0-5 6-10 11-15 16-27
+// opcode RT RA Offset
+
+// DS-form:
+// 0-5 6-10 11-15 16-29 30-31
+// opcode RT RA Offset opcode
+
+#define D_OPCODE_MASK GENMASK(31, 26)
+#define DQ_OPCODE_MASK D_OPCODE_MASK
+#define DS_OPCODE_MASK (D_OPCODE_MASK | GENMASK(1, 0))
+#define RS_RT_OFFSET 21UL
+#define RS_RT_MASK GENMASK(25, 21)
+#define RA_MASK GENMASK(20, 16)
+#define D_OFFSET GENMASK(15, 0)
+#define DQ_OFFSET GENMASK(15, 4)
+#define DS_OFFSET GENMASK(15, 2)
+
+struct opcode_t opcodes[] = {
+ {0x80000000, D_FORMAT, "lwz", },
+ {0x84000000, D_FORMAT, "lwzu", },
+ {0x88000000, D_FORMAT, "lbz", },
+ {0x8C000000, D_FORMAT, "lbzu", },
+ {0x90000000, D_FORMAT, "stw", },
+ {0x94000000, D_FORMAT, "stwu", },
+ {0x98000000, D_FORMAT, "stb", },
+ {0x9C000000, D_FORMAT, "stbu", },
+ {0xA0000000, D_FORMAT, "lhz", },
+ {0xA4000000, D_FORMAT, "lhzu", },
+ {0xA8000000, D_FORMAT, "lha", },
+ {0xAC000000, D_FORMAT, "lhau", },
+ {0xB0000000, D_FORMAT, "sth", },
+ {0xB4000000, D_FORMAT, "sthu", },
+ {0xB8000000, D_FORMAT, "lmw", },
+ {0xBC000000, D_FORMAT, "stmw", },
+ {0xC0000000, D_FORMAT, "lfs", },
+ {0xC4000000, D_FORMAT, "lfsu", },
+ {0xC8000000, D_FORMAT, "lfd", },
+ {0xCC000000, D_FORMAT, "lfdu", },
+ {0xD0000000, D_FORMAT, "stfs", },
+ {0xD4000000, D_FORMAT, "stfsu", },
+ {0xD8000000, D_FORMAT, "stfd", },
+ {0xDC000000, D_FORMAT, "stfdu", },
+ {0xE0000000, DQ_FORMAT, "lq", },
+ {0xE4000000, DS_FORMAT, "lfdp", },
+ {0xE8000000, DS_FORMAT, "ld", },
+ {0xE8000001, DS_FORMAT, "ldu", },
+ {0xE8000002, DS_FORMAT, "lwa", },
+ {0xF8000000, DS_FORMAT, "std", },
+ {0xF8000001, DS_FORMAT, "stdu", },
+ {0xF8000002, DS_FORMAT, "stq", }
+};
+
+struct opcode_t *get_opcode(unsigned int *addr)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(opcodes); i++) {
+ switch (opcodes[i].form) {
+ case D_FORMAT:
+ if (opcodes[i].opcode == (*addr & D_OPCODE_MASK))
+ return &opcodes[i];
+ break;
+ case DQ_FORMAT:
+ if (opcodes[i].opcode == (*addr & DQ_OPCODE_MASK))
+ return &opcodes[i];
+ break;
+ case DS_FORMAT:
+ if (opcodes[i].opcode == (*addr & DQ_OPCODE_MASK))
+ return &opcodes[i];
+ break;
+ }
+ }
+
+ return NULL;
+}
+
+inline enum mm_io_opcode get_ins_type(struct opcode_t *opcode)
+{
+ if (!opcode)
+ return MMIO_UNKNOWN_OP;
+
+ if (opcode->name[0] == 'l')
+ return MMIO_READ;
+
+ if (opcode->name[0] == 's')
+ return MMIO_WRITE;
+
+ return MMIO_UNKNOWN_OP;
+}
+
+unsigned int get_ins_width(struct opcode_t *opcode)
+{
+ char width_ch = 0;
+
+ if (!opcode)
+ return 0;
+
+ if (opcode->name[0] == 'l')
+ width_ch = opcode->name[1];
+ else if (opcode->name[0] == 's')
+ width_ch = opcode->name[2];
+
+ switch (width_ch) {
+ case 'b': /* byte */
+ return 1;
+ case 'h': /* half word (2 bytes) */
+ return 2;
+ case 'w': /* word (4 bytes) */
+ return 4;
+ case 'm': /* multi words(can be calculated out by (32-RT) * sizeof(long)) */
+ case 'f': /* float(not too much. So ignore word number) */
+ case 'd': /* double words */
+ /* return 2 * sizeof(long); */
+ case 'q': /* quad words */
+ /* return 4 * sizeof(long); */
+ default:
+ return sizeof(long);
+ }
+}
+
+unsigned long get_ins_val(struct trap_reason *reason, struct pt_regs *regs)
+{
+ struct opcode_t *opcode = reason->opcode;
+ unsigned int ins = *(unsigned int *)(reason->ip);
+ unsigned int reg_no;
+ unsigned long mask = ~0UL;
+
+ if (!opcode)
+ return 0;
+
+ mask >>= 8 * (sizeof(long) - get_ins_width(opcode));
+ reg_no = (ins & RS_RT_MASK) >> RS_RT_OFFSET;
+
+ return regs->gpr[reg_no] & mask;
+}
diff --git a/arch/powerpc/mm/pf_in.h b/arch/powerpc/mm/pf_in.h
new file mode 100644
index 000000000000..905ba4937137
--- /dev/null
+++ b/arch/powerpc/mm/pf_in.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Derived from arch/x86/mm/pf_in.h:
+ * Copyright (C) 2024 Jialong Yang (jialong.yang@shingroup.cn)
+ */
+
+#ifndef __PF_H_
+#define __PF_H_
+
+enum OPCODE_FORMAT {
+ D_FORMAT,
+ DQ_FORMAT,
+ DS_FORMAT,
+};
+
+struct opcode_t {
+ unsigned int opcode;
+ enum OPCODE_FORMAT form;
+ const char *name;
+};
+
+struct trap_reason {
+ unsigned long addr;
+ unsigned long ip;
+ struct opcode_t *opcode;
+ int active_traces;
+};
+
+struct opcode_t *get_opcode(unsigned int *addr);
+enum mm_io_opcode get_ins_type(struct opcode_t *opcode);
+unsigned int get_ins_width(struct opcode_t *opcode);
+unsigned long get_ins_val(struct trap_reason *reason, struct pt_regs *regs);
+#endif /* __PF_H_ */
--
2.34.1
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH v1 2/2] powerpc/mmiotrace: bind ioremap and page fault to active mmiotrace
2024-06-20 8:51 [PATCH v1 1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC Jialong Yang
@ 2024-06-20 8:51 ` Jialong Yang
2024-06-27 12:31 ` [PATCH v1 1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC kernel test robot
2024-06-28 7:02 ` Michael Ellerman
2 siblings, 0 replies; 6+ messages in thread
From: Jialong Yang @ 2024-06-20 8:51 UTC (permalink / raw)
To: Michael Ellerman, Nicholas Piggin, Christophe Leroy,
Naveen N. Rao
Cc: shenghui.qu, luming.yu, linuxppc-dev, linux-kernel, Jialong Yang
Hook the ioremap/iounmap paths and the page fault handler entry so that
an active mmiotrace sees MMIO mappings and the faults they generate.
Signed-off-by: Jialong Yang <jialong.yang@shingroup.cn>
---
arch/powerpc/mm/fault.c | 17 +++++++++++++++++
arch/powerpc/mm/ioremap_64.c | 11 +++++++++--
2 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 215690452495..b03cba73de92 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -22,6 +22,7 @@
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
+#include <linux/mmiotrace.h>
#include <linux/interrupt.h>
#include <linux/highmem.h>
#include <linux/extable.h>
@@ -50,6 +51,19 @@
* do_page_fault error handling helpers
*/
+/*
+ * Returns 0 if mmiotrace is disabled, or if the fault is not
+ * handled by mmiotrace:
+ */
+static nokprobe_inline int
+kmmio_fault(struct pt_regs *regs, unsigned long addr)
+{
+ if (unlikely(is_kmmio_active()))
+ if (kmmio_handler(regs, addr) == 1)
+ return -1;
+ return 0;
+}
+
static int
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code)
{
@@ -422,6 +436,9 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
vm_fault_t fault, major = 0;
bool kprobe_fault = kprobe_page_fault(regs, 11);
+ if (unlikely(kmmio_fault(regs, address)))
+ return 0;
+
if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
return 0;
diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c
index d24e5f166723..f5f717bf35df 100644
--- a/arch/powerpc/mm/ioremap_64.c
+++ b/arch/powerpc/mm/ioremap_64.c
@@ -3,12 +3,15 @@
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/mmiotrace.h>
void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
pgprot_t prot, void *caller)
{
phys_addr_t paligned, offset;
void __iomem *ret;
+ phys_addr_t unaligned_phys_addr = addr;
+ const unsigned long unaligned_size = size;
int err;
/* We don't support the 4K PFN hack with ioremap */
@@ -28,8 +31,11 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
if (size == 0 || paligned == 0)
return NULL;
- if (slab_is_available())
- return generic_ioremap_prot(addr, size, prot);
+ if (slab_is_available()) {
+ ret = generic_ioremap_prot(addr, size, prot);
+ mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret);
+ return ret;
+ }
pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller);
@@ -52,6 +58,7 @@ void iounmap(volatile void __iomem *token)
if (!slab_is_available())
return;
+ mmiotrace_iounmap(token);
generic_iounmap(PCI_FIX_ADDR(token));
}
EXPORT_SYMBOL(iounmap);
--
2.34.1
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH v1 1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC
2024-06-20 8:51 [PATCH v1 1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC Jialong Yang
2024-06-20 8:51 ` [PATCH v1 2/2] powerpc/mmiotrace: bind ioremap and page fault to active mmiotrace Jialong Yang
@ 2024-06-27 12:31 ` kernel test robot
2024-06-28 7:02 ` Michael Ellerman
2 siblings, 0 replies; 6+ messages in thread
From: kernel test robot @ 2024-06-27 12:31 UTC (permalink / raw)
To: Jialong Yang, Michael Ellerman, Nicholas Piggin, Christophe Leroy,
Naveen N. Rao
Cc: luming.yu, linux-kernel, shenghui.qu, oe-kbuild-all, linuxppc-dev,
Jialong Yang
Hi Jialong,
kernel test robot noticed the following build errors:
[auto build test ERROR on powerpc/next]
[also build test ERROR on powerpc/fixes linus/master v6.10-rc5 next-20240626]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Jialong-Yang/powerpc-mmiotrace-bind-ioremap-and-page-fault-to-active-mmiotrace/20240624-163027
base: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
patch link: https://lore.kernel.org/r/2bf90acf7d29641ba6643934ff8dbba897dbd2d9.1718873074.git.jialong.yang%40shingroup.cn
patch subject: [PATCH v1 1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC
config: powerpc-randconfig-r113-20240627 (https://download.01.org/0day-ci/archive/20240627/202406271946.A6jwFfaY-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 13.2.0
reproduce: (https://download.01.org/0day-ci/archive/20240627/202406271946.A6jwFfaY-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202406271946.A6jwFfaY-lkp@intel.com/
All error/warnings (new ones prefixed by >>):
arch/powerpc/mm/kmmio.c: In function 'pmd_mkinvalid':
>> arch/powerpc/mm/kmmio.c:140:16: error: implicit declaration of function '__pmd_raw' [-Werror=implicit-function-declaration]
140 | return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
| ^~~~~~~~~
>> arch/powerpc/mm/kmmio.c:140:26: error: implicit declaration of function 'pmd_raw'; did you mean 'pmd_bad'? [-Werror=implicit-function-declaration]
140 | return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
| ^~~~~~~
| pmd_bad
In file included from include/linux/byteorder/big_endian.h:5,
from arch/powerpc/include/uapi/asm/byteorder.h:14,
from include/asm-generic/bitops/le.h:6,
from arch/powerpc/include/asm/bitops.h:325,
from include/linux/bitops.h:63,
from include/linux/thread_info.h:27,
from arch/powerpc/include/asm/ptrace.h:342,
from arch/powerpc/include/asm/hw_irq.h:12,
from arch/powerpc/include/asm/irqflags.h:12,
from include/linux/irqflags.h:18,
from include/asm-generic/cmpxchg-local.h:6,
from arch/powerpc/include/asm/cmpxchg.h:755,
from arch/powerpc/include/asm/atomic.h:11,
from include/linux/atomic.h:7,
from include/linux/rcupdate.h:25,
from include/linux/rculist.h:11,
from arch/powerpc/mm/kmmio.c:10:
>> arch/powerpc/mm/kmmio.c:140:70: error: '_PAGE_INVALID' undeclared (first use in this function); did you mean 'RPM_INVALID'?
140 | return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
| ^~~~~~~~~~~~~
include/uapi/linux/byteorder/big_endian.h:38:51: note: in definition of macro '__cpu_to_be64'
38 | #define __cpu_to_be64(x) ((__force __be64)(__u64)(x))
| ^
arch/powerpc/mm/kmmio.c:140:42: note: in expansion of macro 'cpu_to_be64'
140 | return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
| ^~~~~~~~~~~
arch/powerpc/mm/kmmio.c:140:70: note: each undeclared identifier is reported only once for each function it appears in
140 | return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
| ^~~~~~~~~~~~~
include/uapi/linux/byteorder/big_endian.h:38:51: note: in definition of macro '__cpu_to_be64'
38 | #define __cpu_to_be64(x) ((__force __be64)(__u64)(x))
| ^
arch/powerpc/mm/kmmio.c:140:42: note: in expansion of macro 'cpu_to_be64'
140 | return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
| ^~~~~~~~~~~
arch/powerpc/mm/kmmio.c: In function 'kmmio_handler':
>> arch/powerpc/mm/kmmio.c:318:32: error: 'struct pt_regs' has no member named 'softe'
318 | ctx->saved_softe = regs->softe;
| ^~
arch/powerpc/mm/kmmio.c:330:13: error: 'struct pt_regs' has no member named 'softe'
330 | regs->softe = IRQS_DISABLED; // soft interrupt
| ^~
>> arch/powerpc/mm/kmmio.c:332:9: error: 'local_paca' undeclared (first use in this function); did you mean 'local_lock'?
332 | local_paca->srr_valid = 0;
| ^~~~~~~~~~
| local_lock
arch/powerpc/mm/kmmio.c: In function 'post_kmmio_handler':
arch/powerpc/mm/kmmio.c:383:13: error: 'struct pt_regs' has no member named 'softe'
383 | regs->softe = ctx->saved_softe;
| ^~
arch/powerpc/mm/kmmio.c: In function 'pmd_mkinvalid':
>> arch/powerpc/mm/kmmio.c:141:1: warning: control reaches end of non-void function [-Wreturn-type]
141 | }
| ^
cc1: some warnings being treated as errors
vim +/__pmd_raw +140 arch/powerpc/mm/kmmio.c
137
138 static inline pmd_t pmd_mkinvalid(pmd_t pmd)
139 {
> 140 return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
> 141 }
142
143 static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
144 {
145 pmd_t new_pmd;
146 pmdval_t v = pmd_val(*pmd);
147
148 if (clear) {
149 *old = v;
150 new_pmd = pmd_mkinvalid(*pmd);
151 } else {
152 /* Presume this has been called with clear==true previously */
153 new_pmd = __pmd(*old);
154 }
155 *pmd = new_pmd;
156 }
157
158 static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old, unsigned long addr)
159 {
160 pteval_t v = pte_val(*pte);
161
162 if (clear) {
163 *old = v;
164 /* Nothing should care about address */
165 pte_clear(&init_mm, addr, pte);
166 } else {
167 /* Presume this has been called with clear==true previously */
168 set_pte_at(&init_mm, addr, pte, __pte(*old));
169 }
170 }
171
172 static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
173 {
174 unsigned int level;
175 pte_t *pte = lookup_address(f->addr, &level);
176
177 if (!pte) {
178 pr_err("no pte for addr 0x%08lx\n", f->addr);
179 return -1;
180 }
181
182 if (level == PMD_SHIFT)
183 clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
184 else if (level == PAGE_SHIFT)
185 clear_pte_presence(pte, clear, &f->old_presence, f->addr);
186 else {
187 pr_err("unexpected page level 0x%x.\n", level);
188 return -1;
189 }
190
191 mmap_read_lock(&init_mm);
192 struct vm_area_struct *vma = find_vma(&init_mm, f->addr);
193
194 mmap_read_unlock(&init_mm);
195
196 flush_tlb_page(vma, f->addr);
197
198 return 0;
199 }
200
201 /*
202 * Mark the given page as not present. Access to it will trigger a fault.
203 *
204 * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
205 * protection is ignored here. RCU read lock is assumed held, so the struct
206 * will not disappear unexpectedly. Furthermore, the caller must guarantee,
207 * that double arming the same virtual address (page) cannot occur.
208 *
209 * Double disarming on the other hand is allowed, and may occur when a fault
210 * and mmiotrace shutdown happen simultaneously.
211 */
212 static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
213 {
214 int ret;
215
216 WARN_ONCE(f->armed, pr_fmt("kmmio page already armed.\n"));
217 if (f->armed) {
218 pr_warn("double-arm: addr 0x%08lx, ref %d, old %d\n",
219 f->addr, f->count, !!f->old_presence);
220 }
221 ret = clear_page_presence(f, true);
222 WARN_ONCE(ret < 0, pr_fmt("arming at 0x%08lx failed.\n"),
223 f->addr);
224 f->armed = true;
225 return ret;
226 }
227
228 /** Restore the given page to saved presence state. */
229 static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
230 {
231 int ret = clear_page_presence(f, false);
232
233 WARN_ONCE(ret < 0,
234 KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
235 f->armed = false;
236 }
237
238 /*
239 * This is being called from do_page_fault().
240 *
241 * We may be in an interrupt or a critical section. Also prefecthing may
242 * trigger a page fault. We may be in the middle of process switch.
243 * We cannot take any locks, because we could be executing especially
244 * within a kmmio critical section.
245 *
246 * Local interrupts are disabled, so preemption cannot happen.
247 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
248 */
249 /*
250 * Interrupts are disabled on entry as trap3 is an interrupt gate
251 * and they remain disabled throughout this function.
252 */
253 int kmmio_handler(struct pt_regs *regs, unsigned long addr)
254 {
255 struct kmmio_context *ctx;
256 struct kmmio_fault_page *faultpage;
257 int ret = 0; /* default to fault not handled */
258 unsigned long page_base = addr;
259 unsigned int l;
260 pte_t *pte = lookup_address(addr, &l);
261
262 if (!pte)
263 return -EINVAL;
264 page_base &= page_level_mask(l);
265
266 /*
267 * Hold the RCU read lock over single stepping to avoid looking
268 * up the probe and kmmio_fault_page again. The rcu_read_lock_sched()
269 * also disables preemption and prevents process switch during
270 * the single stepping. We can only handle one active kmmio trace
271 * per cpu, so ensure that we finish it before something else
272 * gets to run.
273 */
274 rcu_read_lock_sched_notrace();
275
276 faultpage = get_kmmio_fault_page(page_base);
277 if (!faultpage) {
278 /*
279 * Either this page fault is not caused by kmmio, or
280 * another CPU just pulled the kmmio probe from under
281 * our feet. The latter case should not be possible.
282 */
283 goto no_kmmio;
284 }
285
286 ctx = this_cpu_ptr(&kmmio_ctx);
287 if (ctx->active) {
288 if (page_base == ctx->addr) {
289 /*
290 * A second fault on the same page means some other
291 * condition needs handling by do_page_fault(), the
292 * page really not being present is the most common.
293 */
294 pr_debug("secondary hit for 0x%08lx CPU %d.\n",
295 addr, smp_processor_id());
296
297 if (!faultpage->old_presence)
298 pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
299 addr, smp_processor_id());
300 } else {
301 /*
302 * Prevent overwriting already in-flight context.
303 * This should not happen, let's hope disarming at
304 * least prevents a panic.
305 */
306 pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
307 smp_processor_id(), addr);
308 pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
309 disarm_kmmio_fault_page(faultpage);
310 }
311 goto no_kmmio;
312 }
313 ctx->active++;
314
315 ctx->fpage = faultpage;
316 ctx->probe = get_kmmio_probe(page_base);
317 ctx->saved_flags = (regs->msr & (MSR_SE | MSR_EE));
> 318 ctx->saved_softe = regs->softe;
319 ctx->addr = page_base;
320
321 if (ctx->probe && ctx->probe->pre_handler)
322 ctx->probe->pre_handler(ctx->probe, regs, addr);
323
324 /*
325 * Enable single-stepping and disable interrupts for the faulting
326 * context. Local interrupts must not get enabled during stepping.
327 */
328 regs->msr |= MSR_SE; // single step
329 regs->msr &= ~MSR_EE; // hard interrupt
330 regs->softe = IRQS_DISABLED; // soft interrupt
331
> 332 local_paca->srr_valid = 0;
333
334 /* Now we set present bit in PTE and single step. */
335 disarm_kmmio_fault_page(ctx->fpage);
336
337 /*
338 * If another cpu accesses the same page while we are stepping,
339 * the access will not be caught. It will simply succeed and the
340 * only downside is we lose the event. If this becomes a problem,
341 * the user should drop to single cpu before tracing.
342 */
343
344 return 1; /* fault handled */
345
346 no_kmmio:
347 rcu_read_unlock_sched_notrace();
348 return ret;
349 }
350
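For reference, here is a minimal sketch of how a tracer hooks into the handler above, assuming the probe API inherited from the x86 kmmio code (struct kmmio_probe and register_kmmio_probe() from include/linux/mmiotrace.h); the names my_pre, my_probe and bar_virt are illustrative only:

	/* Sketch: arm one ioremapped page and log every access to it. */
	static void my_pre(struct kmmio_probe *p, struct pt_regs *regs,
			   unsigned long addr)
	{
		/* Called from kmmio_handler() above, before the faulting
		 * instruction is single-stepped. */
		pr_info("MMIO access at 0x%lx\n", addr);
	}

	static struct kmmio_probe my_probe = {
		.len = PAGE_SIZE,	/* trace one page of the mapping */
		.pre_handler = my_pre,
	};

	/* After ioremap():
	 *	my_probe.addr = (unsigned long)bar_virt;
	 *	register_kmmio_probe(&my_probe);
	 */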
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH v1 1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC
2024-06-20 8:51 [PATCH v1 1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC Jialong Yang
2024-06-20 8:51 ` [PATCH v1 2/2] powerpc/mmiotrace: bind ioremap and page fault to active mmiotrace Jialong Yang
2024-06-27 12:31 ` [PATCH v1 1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC kernel test robot
@ 2024-06-28 7:02 ` Michael Ellerman
2024-06-28 8:21 ` Yang Jialong 杨佳龙
2 siblings, 1 reply; 6+ messages in thread
From: Michael Ellerman @ 2024-06-28 7:02 UTC (permalink / raw)
To: Jialong Yang, Nicholas Piggin, Christophe Leroy, Naveen N. Rao
Cc: luming.yu, nouveau, linux-kernel, Steven Rostedt, Karol Herbst,
shenghui.qu, Pekka Paalanen, Masami Hiramatsu, linuxppc-dev,
Jialong Yang
Jialong Yang <jialong.yang@shingroup.cn> writes:
> mmiotrace is a useful tool to trace MMIO accesses. Nowadays, it only
> supported on x86 and x86_64 platforms.
I've never used mmiotrace, and don't know it well.
I'm not necessarily opposed to merging it, but AFAIK it was mostly used
for reverse engineering proprietary drivers, where the driver itself
couldn't be easily instrumented. Is that what you're using it for?
For drivers where we have the source wouldn't it be easier to just use
tracepoints in the MMIO accessors?
Is it still in-use/maintained on the x86 side?
> Here is a support for powerpc.
> The manual is located at Documentation/trace/mmiotrace.rst which means
> I have not changed user API. People will be easy to use it.
> Almost all files are copied from x86/mm, there are only some
> differences from hardware and architectures software.
>
> LINK: https://lore.kernel.org/lkml/20080127195536.50809974@daedalus.pq.iki.fi/
>
> Signed-off-by: Jialong Yang <jialong.yang@shingroup.cn>
> ---
> arch/powerpc/Kconfig.debug | 3 +
> arch/powerpc/mm/Makefile | 1 +
> arch/powerpc/mm/kmmio.c | 649 +++++++++++++++++++++++++++++++
> arch/powerpc/mm/mmio-mod.c | 414 ++++++++++++++++++++
> arch/powerpc/mm/mmiotrace_arch.c | 149 +++++++
> arch/powerpc/mm/mmiotrace_arch.h | 25 ++
> arch/powerpc/mm/pf_in.c | 185 +++++++++
> arch/powerpc/mm/pf_in.h | 33 ++
> 8 files changed, 1459 insertions(+)
At a glance most of that code could be shared between arches. I don't
think I can merge that as-is, without some attempt to split the generic
parts out.
cheers
* Re: [PATCH v1 1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC
2024-06-28 7:02 ` Michael Ellerman
@ 2024-06-28 8:21 ` Yang Jialong 杨佳龙
2024-09-04 7:47 ` 虞陆铭
0 siblings, 1 reply; 6+ messages in thread
From: Yang Jialong 杨佳龙 @ 2024-06-28 8:21 UTC (permalink / raw)
To: Michael Ellerman, Nicholas Piggin, Christophe Leroy,
Naveen N. Rao
Cc: luming.yu, nouveau, linux-kernel, Steven Rostedt, Karol Herbst,
shenghui.qu, Pekka Paalanen, Masami Hiramatsu, linuxppc-dev
On 2024/6/28 15:02, Michael Ellerman wrote:
> Jialong Yang <jialong.yang@shingroup.cn> writes:
>> mmiotrace is a useful tool to trace MMIO accesses. Nowadays, it only
>> supported on x86 and x86_64 platforms.
> I've never used mmiotrace, and don't know it well.
>
> I'm not necessarily opposed to merging it, but AFAIK it was mostly used
> for reverse engineering proprietary drivers, where the driver itself
> couldn't be easily instrumented. Is that what you're using it for?
Yes, exactly. We have used it for network stack debugging on ppc64le.
>
> For drivers where we have the source wouldn't it be easier to just use
> tracepoints in the MMIO accessors?
Tracepoints need to be defined in advance, and in a large driver it is
not easy to instrument every place that accesses registers in the I/O
area. A tracepoint also filters at the C-function level.
mmiotrace is similar to putting tracepoints in writel()/readl()..., but
it goes deeper: it is an instruction-level filter. It does not care what
was done at the C level; it only records what the generated code actually
does, such as stw (store word) or lwz (load word), as if observing from
the device's point of view.
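As a concrete, hypothetical illustration (the BAR address and register offset below are made up):

	/* Hypothetical driver snippet: mmiotrace records the store
	 * instruction that hits the ioremapped page, regardless of
	 * which C helper emits it. */
	static void __iomem *regs;

	static int hypothetical_probe(phys_addr_t bar_phys)
	{
		regs = ioremap(bar_phys, PAGE_SIZE);
		if (!regs)
			return -ENOMEM;

		writel(0x1, regs + 0x10);	/* generic accessor */
		out_le32(regs + 0x10, 0x1);	/* powerpc native accessor */
		/* Both compile to a single 32-bit store (stw/stwbrx) to the
		 * traced page: the armed PTE faults, the access is logged,
		 * the store is single-stepped, and the page is re-armed. */
		return 0;
	}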
>
> Is it still in-use/maintained on the x86 side?
Here are the per-year patch counts for the core files on x86:
| | mmio_mod.c | kmmio.c | pf_in.* | testmmiotrace.c |
|------+------------+---------+---------+-----------------|
| 2022 | 1 | 3 | | |
| 2021 | 2 | 1 | | |
| 2020 | 4 | 4 | | 1 |
| 2019 | 2 | 1 | 1 | 4 |
| 2018 | | 2 | | |
| 2017 | 2 | 2 | | 1 |
| 2016 | 1 | 2 | 1 | |
| 2014 | | 1 | | |
| 2013 | 1 | | | |
| 2012 | 1 | | | |
| 2011 | 3 | | 1 | |
| 2010 | 1 | 3 | 2 | 1 |
| 2009 | 4 | 19 | | 3 |
| 2008 | 13 | 5 | 2 | 3 |
>
>> Here is a support for powerpc.
>> The manual is located at Documentation/trace/mmiotrace.rst which means
>> I have not changed user API. People will be easy to use it.
>> Almost all files are copied from x86/mm, there are only some
>> differences from hardware and architectures software.
>>
>> LINK: https://lore.kernel.org/lkml/20080127195536.50809974@daedalus.pq.iki.fi/
>>
>> Signed-off-by: Jialong Yang <jialong.yang@shingroup.cn>
>> ---
>> arch/powerpc/Kconfig.debug | 3 +
>> arch/powerpc/mm/Makefile | 1 +
>> arch/powerpc/mm/kmmio.c | 649 +++++++++++++++++++++++++++++++
>> arch/powerpc/mm/mmio-mod.c | 414 ++++++++++++++++++++
>> arch/powerpc/mm/mmiotrace_arch.c | 149 +++++++
>> arch/powerpc/mm/mmiotrace_arch.h | 25 ++
>> arch/powerpc/mm/pf_in.c | 185 +++++++++
>> arch/powerpc/mm/pf_in.h | 33 ++
>> 8 files changed, 1459 insertions(+)
>
> At a glance most of that code could be shared between arches. I don't
> think I can merge that as-is, without some attempt to split the generic
> parts out.
Right.
I just copied them from arch/x86/mm. Much of the code is not arch-specific.
> cheers
>
* Re: [PATCH v1 1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC
2024-06-28 8:21 ` Yang Jialong 杨佳龙
@ 2024-09-04 7:47 ` 虞陆铭
0 siblings, 0 replies; 6+ messages in thread
From: 虞陆铭 @ 2024-09-04 7:47 UTC (permalink / raw)
To: Yang Jialong 杨佳龙, mpe, npiggin,
christophe.leroy, Naveen N. Rao
Cc: shenghui.qu, linuxppc-dev, linux-kernel, Steven Rostedt,
Masami Hiramatsu, Karol Herbst, Pekka Paalanen, nouveau
Hi,
The patch set and the mmiotrace capability helped us track down the root cause of a mysterious EEH, seen only on the powernv platform and
triggered by a testpmd DPDK user-mode driver (UIO), by comparing the MMIO trace from UIO against the native kernel driver for the same NIC.
The problem was solved by switching to the VFIO driver as the backend.
One limitation is that mmiotrace could not capture user-mode MMIO, which I think is inherent in its current design.
So I think we could tidy up the patch, do some feature enhancement, and then submit it again for its debugging value.
It really is painful when the EEH A/B record does not point to a clear root cause, or to sufficient detail to help us fix the driver code, for
unaligned MMIO from a 3rd-party NIC vendor on the powernv platform.
Cheers,
Luming
Thread overview: 6+ messages
2024-06-20 8:51 [PATCH v1 1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC Jialong Yang
2024-06-20 8:51 ` [PATCH v1 2/2] powerpc/mmiotrace: bind ioremap and page fault to active mmiotrace Jialong Yang
2024-06-27 12:31 ` [PATCH v1 1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC kernel test robot
2024-06-28 7:02 ` Michael Ellerman
2024-06-28 8:21 ` Yang Jialong 杨佳龙
2024-09-04 7:47 ` 虞陆铭