* [PATCH] lib/sys_info: add a simple timer based memory corruption detector
@ 2026-05-27 3:43 Feng Tang
2026-06-08 8:03 ` Petr Mladek
2026-06-10 16:50 ` Steven Rostedt
0 siblings, 2 replies; 4+ messages in thread
From: Feng Tang @ 2026-05-27 3:43 UTC (permalink / raw)
To: Andrew Morton, Petr Mladek, Steven Rostedt, paulmck, linux-kernel
Cc: Feng Tang
While debugging some nasty BIOS/hardware related memory corruption
issues, we found that using a periodic timer to monitor a specific
DRAM/MMIO physical address is very useful, as it acts like a basic
software watchpoint.
For those bugs, who (and when) changes (corrupts) the DRAM or MMIO
register is hard to trace, and sometimes even a hardware JTAG debugger
can't help (say the physical address watchpoint doesn't work).
The biggest shortcoming is that it can never capture the exact point
like a hardware watchpoint, no matter how small the timer interval is
set. The idea is to get as close to that point as possible, hoping the
caught context has enough debug info (which did help us in solving
BIOS/hardware bugs).
The workflow is simple: after the suspected address is identified,
start a periodic timer polling it to catch when its value changes to
the target 'magic' value, then halt the CPU (better to have only one
CPU online), or panic, or print out system information, so that the
error environment is frozen for further checking, or let kexec/kdump
record the vmcore, etc.
All the settings are module parameters:
watch_interval_ms: SW watchpoint check interval in ms
paddr_dram_to_watch: Physical dram address to monitor.
target_dram_val: Expected value at the dram address that triggers the watchpoint.
paddr_mmio_to_watch: Physical mmio address to monitor. Must be 32-bit aligned.
target_mmio_val: Expected value at the mmio address that triggers the watchpoint.
panic_on_hit: Trigger kernel panic when watchpoint condition hits.
hang_on_hit: Halt the CPU (wait for HW debugger).
check_once: Stop watching after the first hit (default: true).
This RFC is trying to show the idea and get feedback, and there are
some todos:
* merge the DRAM/MMIO interfaces and auto-detect whether the address
is DRAM or MMIO
* support changing the address at runtime
* move the starting point earlier in the boot phase
* currently only 'changing to a value' is monitored; add support
for 'changing from a value'
Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
---
lib/sys_info.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 104 insertions(+)
diff --git a/lib/sys_info.c b/lib/sys_info.c
index f32a06ec9ed4..90ddcf786b98 100644
--- a/lib/sys_info.c
+++ b/lib/sys_info.c
@@ -164,3 +164,107 @@ void sys_info(unsigned long si_mask)
{
__sys_info(si_mask ? : kernel_si_mask);
}
+
+#ifdef CONFIG_SW_WATCHPOINT
+
+/*
+ * Polling period of the watchpoint timer in milliseconds (default 100).
+ * It is re-read each time the timer is re-armed, so a runtime write takes
+ * effect from the next period.
+ */
+static unsigned long watch_interval_ms = 100;
+module_param(watch_interval_ms, ulong, 0644);
+MODULE_PARM_DESC(watch_interval_ms, "SW watchpoint check interval in ms");
+
+/*
+ * The watched physical addresses are translated to virtual addresses only
+ * once, in sw_watchpoint_timer_init(). They are therefore read-only in
+ * sysfs: a runtime write would not move the watchpoint, it would only change
+ * the address reported when the watchpoint hits.
+ */
+static unsigned long paddr_dram_to_watch;
+module_param(paddr_dram_to_watch, ulong, 0444);
+MODULE_PARM_DESC(paddr_dram_to_watch, "Physical DRAM address to watch");
+
+/* Kernel virtual address of the watched DRAM word; NULL when not watching. */
+static unsigned long *vaddr_dram;
+
+/* The DRAM watchpoint hits when the watched word equals this value. */
+static unsigned long target_dram_val;
+module_param(target_dram_val, ulong, 0644);
+MODULE_PARM_DESC(target_dram_val, "Target DRAM value to trigger watchpoint");
+
+/* The MMIO address should be 32-bit aligned; it is read as one 32-bit word. */
+static unsigned long paddr_mmio_to_watch;
+module_param(paddr_mmio_to_watch, ulong, 0444);
+MODULE_PARM_DESC(paddr_mmio_to_watch, "Physical MMIO address to watch (32bit aligned)");
+
+/* Mapped address of the watched MMIO register; NULL when not watching. */
+static unsigned int *vaddr_mmio;
+
+/* The MMIO watchpoint hits when the watched register equals this value. */
+static unsigned int target_mmio_val;
+module_param(target_mmio_val, uint, 0644);
+MODULE_PARM_DESC(target_mmio_val, "Target MMIO value to trigger watchpoint");
+
+/* On a hit: call panic(), e.g. to let kexec/kdump capture the state. */
+static bool panic_on_hit;
+module_param(panic_on_hit, bool, 0644);
+MODULE_PARM_DESC(panic_on_hit, "Panic when watchpoint hits");
+
+/* On a hit: spin forever on the current CPU, e.g. to attach a HW debugger. */
+static bool hang_on_hit;
+module_param(hang_on_hit, bool, 0644);
+MODULE_PARM_DESC(hang_on_hit, "Hang when watchpoint hits");
+
+/* Stop the watchpoint timer after first hit */
+static bool check_once = true;
+module_param(check_once, bool, 0644);
+MODULE_PARM_DESC(check_once, "Stop watching after first hit");
+
+/* The single periodic timer that polls the watched locations. */
+static struct timer_list sw_watchpoint_timer;
+
+/*
+ * Timer callback: poll the watched MMIO and/or DRAM location once and react
+ * if either currently holds its target value.
+ *
+ * On a hit, sys_info(0) is called first. Then, depending on the module
+ * parameters, the CPU spins forever (hang_on_hit), the kernel panics
+ * (panic_on_hit), or polling simply stops (check_once). In every other case
+ * the timer re-arms itself to fire watch_interval_ms from now.
+ */
+static void sw_watchpoint_timer_fn(struct timer_list *unused)
+{
+	bool hit = false;
+
+	/*
+	 * Device registers must be read through an I/O accessor, not a plain
+	 * pointer dereference. vaddr_mmio is declared without __iomem, hence
+	 * the __force cast.
+	 */
+	if (vaddr_mmio &&
+	    readl((void __iomem __force *)vaddr_mmio) == target_mmio_val) {
+		pr_info("MMIO [@0x%lx] hit the target value [0x%x]!\n",
+			paddr_mmio_to_watch, target_mmio_val);
+		hit = true;
+	}
+
+	/* READ_ONCE() makes the compiler emit exactly one load of the word. */
+	if (vaddr_dram && READ_ONCE(*vaddr_dram) == target_dram_val) {
+		pr_info("DRAM [@0x%lx] hit the target value [0x%lx]!\n",
+			paddr_dram_to_watch, target_dram_val);
+		hit = true;
+	}
+
+	if (hit) {
+		sys_info(0);
+
+		/* Useful for attaching HW debugger */
+		if (hang_on_hit) {
+			pr_warn("Will dead loop on this CPU\n");
+			for (;;)
+				cpu_relax();
+		}
+
+		/* Could be used to trigger kexec/kdump */
+		if (panic_on_hit)
+			panic("SW watchpoint hit!");
+
+		/* Returning without re-arming the timer ends the polling. */
+		if (check_once)
+			return;
+	}
+
+	mod_timer(&sw_watchpoint_timer, jiffies + msecs_to_jiffies(watch_interval_ms));
+}
+
+/*
+ * Translate the configured physical addresses into kernel virtual addresses
+ * and start the polling timer.
+ *
+ * Returns 0 on success or when no address was configured, -EINVAL for a
+ * misaligned MMIO address or a DRAM address without a valid page frame, and
+ * -ENOMEM if the MMIO page cannot be mapped.
+ */
+static int __init sw_watchpoint_timer_init(void)
+{
+	void __iomem *mmio_page;
+
+	/* Nothing to watch: do not leave a pointless periodic timer running. */
+	if (!paddr_mmio_to_watch && !paddr_dram_to_watch)
+		return 0;
+
+	/*
+	 * Validate both addresses before mapping anything, so that no error
+	 * path has to undo an ioremap().
+	 */
+	if (paddr_mmio_to_watch % sizeof(*vaddr_mmio)) {
+		pr_err("MMIO address 0x%lx is not 32-bit aligned\n",
+		       paddr_mmio_to_watch);
+		return -EINVAL;
+	}
+
+	/*
+	 * phys_to_virt() is address arithmetic and does not report errors,
+	 * so check that the address is backed by a valid page frame instead.
+	 */
+	if (paddr_dram_to_watch && !pfn_valid(PHYS_PFN(paddr_dram_to_watch))) {
+		pr_err("DRAM address 0x%lx has no valid page frame\n",
+		       paddr_dram_to_watch);
+		return -EINVAL;
+	}
+
+	if (paddr_mmio_to_watch) {
+		/* Map the whole page, then point at the register inside it. */
+		mmio_page = ioremap(paddr_mmio_to_watch & PAGE_MASK, PAGE_SIZE);
+		if (!mmio_page)
+			return -ENOMEM;
+
+		vaddr_mmio = (unsigned int __force *)mmio_page +
+			     (paddr_mmio_to_watch % PAGE_SIZE) / sizeof(*vaddr_mmio);
+	}
+
+	if (paddr_dram_to_watch)
+		vaddr_dram = phys_to_virt(paddr_dram_to_watch);
+
+	timer_setup(&sw_watchpoint_timer, sw_watchpoint_timer_fn, 0);
+	sw_watchpoint_timer.expires = jiffies + msecs_to_jiffies(watch_interval_ms);
+	add_timer(&sw_watchpoint_timer);
+
+	return 0;
+}
+core_initcall(sw_watchpoint_timer_init);
+#endif
base-commit: e7ae89a0c97ce2b68b0983cd01eda67cf373517d
--
2.39.5 (Apple Git-154)
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH] lib/sys_info: add a simple timer based memory corruption detector
2026-05-27 3:43 [PATCH] lib/sys_info: add a simple timer based memory corruption detector Feng Tang
@ 2026-06-08 8:03 ` Petr Mladek
2026-06-08 9:53 ` Feng Tang
2026-06-10 16:50 ` Steven Rostedt
1 sibling, 1 reply; 4+ messages in thread
From: Petr Mladek @ 2026-06-08 8:03 UTC (permalink / raw)
To: Feng Tang
Cc: Andrew Morton, Steven Rostedt, paulmck, linux-kernel,
Douglas Anderson, Thomas Gleixner, Peter Zijlstra,
Vlastimil Babka
Added a few more people to Cc.
On Wed 2026-05-27 11:43:24, Feng Tang wrote:
> During debugging some bios/hardware related nasty memory corruption
> issues, we found using periodic timer to monitor specific dram/mmio
> physical address is very useful for debugging, which acts like
> a basic software watchpoint.
>
> For those bugs, who (and when) change(corrupt) those dram or mmio
> register is hard to trace, and sometimes even hardware jtag debugger
> can't help (say the physical address watchpoint doesn't work).
It seems that this approach helped you to debug a nasty problem.
I am not sure why the other ways did not work.
Could you please provide some more information about the use case?
Ideally, please describe one particular situation where it helped.
What was the bug, how it manifested, and how the crash dump helped
to analyze it. Feel free to use generic names, like graphics card,
or ssd disk, instead of exact producer names, ...
> The biggest shortcoming is it can never capture the exact point like
> a hardware watchpoint, no matter how small the timer interval is set,
> the idea is trying to approach the point, hoping the caught context
> have enough debug info (which did help us in solving bios/hardware
> bugs)
>
> The working flow is simple: after suspected address is identified,
> start periodic timer polling it to catch if its value is changed to
> target 'magic' value, then halt the cpu (better limit to have only
> one cpu online), or panic, or print out system information, so that
> the error environment is frozen for further check , or let
> kexec/kdump to record the vmore, etc.
>
> All the settings are module parameters:
>
> watch_interval_ms: SW watchpoint check interval in ms
> paddr_dram_to_watch: Physical dram address to monitor.
> target_dram_val: Expected value at the dram address that triggers the watchpoint.
> paddr_mmio_to_watch: Physical mmio address to monitor. Must be 32-bit aligned.
> target_mmio_val: Expected value at the mmio address that triggers the watchpoint.
> panic_on_hit: Trigger kernel panic when watchpoint condition hits.
> hang_on_hit: halt the CPU (wait for HW debugger)
>
> This RFC is trying to show the idea and get feedback, and there are
> some todos:
> * merge the dram/mmio interface to auto detect it's dram or mmio
> * support runtime changing the address
> * move the starting point earlier in boot phase
> * currently is monitoring 'changing to a value', add support
> for 'changing from a value'
Sashiko AI has pointed out several possible problems, see
https://sashiko.dev/#/patchset/20260527034324.51136-1-feng.tang%40linux.alibaba.com
> --- a/lib/sys_info.c
> +++ b/lib/sys_info.c
If we agree that this feature would be useful, then it would deserve
its own source file.
IMHO, it fits into the watchdog category. I would put it into
kernel/watch_mem or so.
Best Regards,
Petr
> @@ -164,3 +164,107 @@ void sys_info(unsigned long si_mask)
> {
> __sys_info(si_mask ? : kernel_si_mask);
> }
> +
> +#ifdef CONFIG_SW_WATCHPOINT
> +
> +/* default 100 ms interval */
> +static unsigned long watch_interval_ms = 100;
> +module_param(watch_interval_ms, ulong, 0644);
> +MODULE_PARM_DESC(watch_interval_ms, "SW watchpoint check interval in ms");
> +
> +static unsigned long paddr_dram_to_watch;
> +module_param(paddr_dram_to_watch, ulong, 0644);
> +MODULE_PARM_DESC(paddr_dram_to_watch, "Physical DRAM address to watch");
> +
> +static unsigned long *vaddr_dram;
> +
> +static unsigned long target_dram_val;
> +module_param(target_dram_val, ulong, 0644);
> +MODULE_PARM_DESC(target_dram_val, "Target DRAM value to trigger watchpoint");
> +
> +/* The MMIO address should be 32b aligned */
> +static unsigned long paddr_mmio_to_watch;
> +module_param(paddr_mmio_to_watch, ulong, 0644);
> +MODULE_PARM_DESC(paddr_mmio_to_watch, "Physical MMIO address to watch (32bit aligned)");
> +
> +static unsigned int *vaddr_mmio;
> +
> +static unsigned int target_mmio_val;
> +module_param(target_mmio_val, uint, 0644);
> +MODULE_PARM_DESC(target_mmio_val, "Target MMIO value to trigger watchpoint");
> +
> +static bool panic_on_hit;
> +module_param(panic_on_hit, bool, 0644);
> +MODULE_PARM_DESC(panic_on_hit, "Panic when watchpoint hits");
> +
> +static bool hang_on_hit;
> +module_param(hang_on_hit, bool, 0644);
> +MODULE_PARM_DESC(hang_on_hit, "Hang when watchpoint hits");
> +
> +/* Stop the watchpoint timer after first hit */
> +static bool check_once = true;
> +module_param(check_once, bool, 0644);
> +MODULE_PARM_DESC(check_once, "Stop watching after first hit");
> +
> +static struct timer_list sw_watchpoint_timer;
> +
> +static void sw_watchpoint_timer_fn(struct timer_list *unused)
> +{
> + bool hit = false;
> +
> + if (vaddr_mmio && (*vaddr_mmio == target_mmio_val)) {
> + pr_info("MMIO [@0x%lx] hit the target value [0x%x]!\n",
> + paddr_mmio_to_watch, target_mmio_val);
> + hit = true;
> + }
> +
> + if (vaddr_dram && (*vaddr_dram == target_dram_val)) {
> + pr_info("DRAM [@0x%lx] hit the target value [0x%lx]!\n",
> + paddr_dram_to_watch, target_dram_val);
> + hit = true;
> + }
> +
> + if (hit) {
> + sys_info(0);
> +
> + /* Useful for attaching HW debugger */
> + if (hang_on_hit) {
> + pr_warn("Will dead loop on this CPU\n");
> + while (1);
> + }
> +
> + /* Could be used to trigger kexec/kdump */
> + if (panic_on_hit)
> + panic("SW watchpoint hit!");
> +
> + if (check_once)
> + return;
> + }
> +
> + mod_timer(&sw_watchpoint_timer, jiffies + msecs_to_jiffies(watch_interval_ms));
> +}
> +
> +static int __init sw_watchpoint_timer_init(void)
> +{
> + if (paddr_mmio_to_watch) {
> + vaddr_mmio = ioremap(paddr_mmio_to_watch & PAGE_MASK, PAGE_SIZE);
> + if (!vaddr_mmio)
> + return -ENOMEM;
> +
> + vaddr_mmio += (paddr_mmio_to_watch % PAGE_SIZE) / 4;
> + }
> +
> + if (paddr_dram_to_watch) {
> + vaddr_dram = phys_to_virt(paddr_dram_to_watch);
> + if (!vaddr_dram)
> + return -ENOMEM;
> + }
> +
> + timer_setup(&sw_watchpoint_timer, sw_watchpoint_timer_fn, 0);
> + sw_watchpoint_timer.expires = jiffies + msecs_to_jiffies(watch_interval_ms);
> + add_timer(&sw_watchpoint_timer);
> +
> + return 0;
> +}
> +core_initcall(sw_watchpoint_timer_init);
> +#endif
>
> base-commit: e7ae89a0c97ce2b68b0983cd01eda67cf373517d
> --
> 2.39.5 (Apple Git-154)
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] lib/sys_info: add a simple timer based memory corruption detector
2026-06-08 8:03 ` Petr Mladek
@ 2026-06-08 9:53 ` Feng Tang
0 siblings, 0 replies; 4+ messages in thread
From: Feng Tang @ 2026-06-08 9:53 UTC (permalink / raw)
To: Petr Mladek
Cc: Andrew Morton, Steven Rostedt, paulmck, linux-kernel,
Douglas Anderson, Thomas Gleixner, Peter Zijlstra,
Vlastimil Babka
On Mon, Jun 08, 2026 at 10:03:05AM +0200, Petr Mladek wrote:
> Added few more people into Cc.
Thanks!
> On Wed 2026-05-27 11:43:24, Feng Tang wrote:
> > During debugging some bios/hardware related nasty memory corruption
> > issues, we found using periodic timer to monitor specific dram/mmio
> > physical address is very useful for debugging, which acts like
> > a basic software watchpoint.
> >
> > For those bugs, who (and when) change(corrupt) those dram or mmio
> > register is hard to trace, and sometimes even hardware jtag debugger
> > can't help (say the physical address watchpoint doesn't work).
>
> It seems that this approach helped you to debug a nasty problem.
> I am not sure why the other ways did not work.
>
> Could you please provide some more information about the use case?
>
> Ideally, please describe one particular situation where it helped.
> What was the bug, how it manifested, and how the crash dump helped
> to analyze it. Feel free to use generic names, like graphics card,
> or ssd disk, instead of exact producer names, ...
In one bug, some BIOS/HW configuration made the OS boot prone to
stalling during the systemd init phase. We simplified the reproducer by
booting to a console with a function-reduced rootfs, where running the
'less' command always triggered a 'segmentation fault'.
Using GDB, we found that a static array of 'less' was corrupted before
being initialized: one byte in its memory was always '0x33' (checked
by gdb). At this stage the static array is in the bss segment and is
actually backed by the kernel zero page, so it was an obvious memory
corruption issue.
HW engineers tried to capture HW traces after the issue happened, but
could not find valuable hints, as the corruption could happen long
before running the 'less', and the trace of that time was gone.
The good thing is that we had the physical address of the kernel zero
page, and the offset of the corrupted byte was fixed; say the address
was A. But the HW debugger failed to break at the point where address A
was written with '0x33', so we needed some kind of software watchpoint.
We used this code to monitor/watchpoint 'writing 0x33 to A', and when
it hit, we halted the system with 'while (1);' (we had already made it
a UP system by adding the 'nr_cpus=1' cmdline param). Then the HW people
collected the HW trace they needed and root-caused it to a bad config.
Another bug we met was that the 'ACPI_ENABLE' register was written with
a strange value, while we were sure the kernel wouldn't do it. We
maximized the kernel's own debug messages and added more; with a short
timer interval for this patch, we found that the nearest action before
the 'write' was the kernel calling the EFI time service. We reported it
to the firmware team, and it turned out to be a firmware issue.
These 2 bugs both happened on an arm64 server.
We haven't used 'crash dump' yet in chasing real world issues, and I
added the panic (crash dump) for potential usage.
> > The biggest shortcoming is it can never capture the exact point like
> > a hardware watchpoint, no matter how small the timer interval is set,
> > the idea is trying to approach the point, hoping the caught context
> > have enough debug info (which did help us in solving bios/hardware
> > bugs)
> >
> > The working flow is simple: after suspected address is identified,
> > start periodic timer polling it to catch if its value is changed to
> > target 'magic' value, then halt the cpu (better limit to have only
> > one cpu online), or panic, or print out system information, so that
> > the error environment is frozen for further check , or let
> > kexec/kdump to record the vmore, etc.
> >
> > All the settings are module parameters:
> >
> > watch_interval_ms: SW watchpoint check interval in ms
> > paddr_dram_to_watch: Physical dram address to monitor.
> > target_dram_val: Expected value at the dram address that triggers the watchpoint.
> > paddr_mmio_to_watch: Physical mmio address to monitor. Must be 32-bit aligned.
> > target_mmio_val: Expected value at the mmio address that triggers the watchpoint.
> > panic_on_hit: Trigger kernel panic when watchpoint condition hits.
> > hang_on_hit: halt the CPU (wait for HW debugger)
> >
> > This RFC is trying to show the idea and get feedback, and there are
> > some todos:
> > * merge the dram/mmio interface to auto detect it's dram or mmio
> > * support runtime changing the address
> > * move the starting point earlier in boot phase
> > * currently is monitoring 'changing to a value', add support
> > for 'changing from a value'
>
> Sashiko AI has pointed out several possible problems, see
> https://sashiko.dev/#/patchset/20260527034324.51136-1-feng.tang%40linux.alibaba.com
Thanks for the pointer, will handle them in next version.
> > --- a/lib/sys_info.c
> > +++ b/lib/sys_info.c
>
> I we agreed that this feature would be useful then it would deserve
> its own source file.
>
> IMHO, it fits into the watchdog category. I would put it into
> kernel/watch_mem or so.
Great! This is exactly one suggestion I could get for this RFC.
Thanks,
Feng
> Best Regards,
> Petr
>
> > @@ -164,3 +164,107 @@ void sys_info(unsigned long si_mask)
> > {
> > __sys_info(si_mask ? : kernel_si_mask);
> > }
> > +
> > +#ifdef CONFIG_SW_WATCHPOINT
> > +
> > +/* default 100 ms interval */
> > +static unsigned long watch_interval_ms = 100;
> > +module_param(watch_interval_ms, ulong, 0644);
> > +MODULE_PARM_DESC(watch_interval_ms, "SW watchpoint check interval in ms");
> > +
> > +static unsigned long paddr_dram_to_watch;
> > +module_param(paddr_dram_to_watch, ulong, 0644);
> > +MODULE_PARM_DESC(paddr_dram_to_watch, "Physical DRAM address to watch");
> > +
> > +static unsigned long *vaddr_dram;
> > +
> > +static unsigned long target_dram_val;
> > +module_param(target_dram_val, ulong, 0644);
> > +MODULE_PARM_DESC(target_dram_val, "Target DRAM value to trigger watchpoint");
> > +
> > +/* The MMIO address should be 32b aligned */
> > +static unsigned long paddr_mmio_to_watch;
> > +module_param(paddr_mmio_to_watch, ulong, 0644);
> > +MODULE_PARM_DESC(paddr_mmio_to_watch, "Physical MMIO address to watch (32bit aligned)");
> > +
> > +static unsigned int *vaddr_mmio;
> > +
> > +static unsigned int target_mmio_val;
> > +module_param(target_mmio_val, uint, 0644);
> > +MODULE_PARM_DESC(target_mmio_val, "Target MMIO value to trigger watchpoint");
> > +
> > +static bool panic_on_hit;
> > +module_param(panic_on_hit, bool, 0644);
> > +MODULE_PARM_DESC(panic_on_hit, "Panic when watchpoint hits");
> > +
> > +static bool hang_on_hit;
> > +module_param(hang_on_hit, bool, 0644);
> > +MODULE_PARM_DESC(hang_on_hit, "Hang when watchpoint hits");
> > +
> > +/* Stop the watchpoint timer after first hit */
> > +static bool check_once = true;
> > +module_param(check_once, bool, 0644);
> > +MODULE_PARM_DESC(check_once, "Stop watching after first hit");
> > +
> > +static struct timer_list sw_watchpoint_timer;
> > +
> > +static void sw_watchpoint_timer_fn(struct timer_list *unused)
> > +{
> > + bool hit = false;
> > +
> > + if (vaddr_mmio && (*vaddr_mmio == target_mmio_val)) {
> > + pr_info("MMIO [@0x%lx] hit the target value [0x%x]!\n",
> > + paddr_mmio_to_watch, target_mmio_val);
> > + hit = true;
> > + }
> > +
> > + if (vaddr_dram && (*vaddr_dram == target_dram_val)) {
> > + pr_info("DRAM [@0x%lx] hit the target value [0x%lx]!\n",
> > + paddr_dram_to_watch, target_dram_val);
> > + hit = true;
> > + }
> > +
> > + if (hit) {
> > + sys_info(0);
> > +
> > + /* Useful for attaching HW debugger */
> > + if (hang_on_hit) {
> > + pr_warn("Will dead loop on this CPU\n");
> > + while (1);
> > + }
> > +
> > + /* Could be used to trigger kexec/kdump */
> > + if (panic_on_hit)
> > + panic("SW watchpoint hit!");
> > +
> > + if (check_once)
> > + return;
> > + }
> > +
> > + mod_timer(&sw_watchpoint_timer, jiffies + msecs_to_jiffies(watch_interval_ms));
> > +}
> > +
> > +static int __init sw_watchpoint_timer_init(void)
> > +{
> > + if (paddr_mmio_to_watch) {
> > + vaddr_mmio = ioremap(paddr_mmio_to_watch & PAGE_MASK, PAGE_SIZE);
> > + if (!vaddr_mmio)
> > + return -ENOMEM;
> > +
> > + vaddr_mmio += (paddr_mmio_to_watch % PAGE_SIZE) / 4;
> > + }
> > +
> > + if (paddr_dram_to_watch) {
> > + vaddr_dram = phys_to_virt(paddr_dram_to_watch);
> > + if (!vaddr_dram)
> > + return -ENOMEM;
> > + }
> > +
> > + timer_setup(&sw_watchpoint_timer, sw_watchpoint_timer_fn, 0);
> > + sw_watchpoint_timer.expires = jiffies + msecs_to_jiffies(watch_interval_ms);
> > + add_timer(&sw_watchpoint_timer);
> > +
> > + return 0;
> > +}
> > +core_initcall(sw_watchpoint_timer_init);
> > +#endif
> >
> > base-commit: e7ae89a0c97ce2b68b0983cd01eda67cf373517d
> > --
> > 2.39.5 (Apple Git-154)
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] lib/sys_info: add a simple timer based memory corruption detector
2026-05-27 3:43 [PATCH] lib/sys_info: add a simple timer based memory corruption detector Feng Tang
2026-06-08 8:03 ` Petr Mladek
@ 2026-06-10 16:50 ` Steven Rostedt
1 sibling, 0 replies; 4+ messages in thread
From: Steven Rostedt @ 2026-06-10 16:50 UTC (permalink / raw)
To: Feng Tang; +Cc: Andrew Morton, Petr Mladek, paulmck, linux-kernel
On Wed, 27 May 2026 11:43:24 +0800
Feng Tang <feng.tang@linux.alibaba.com> wrote:
> During debugging some bios/hardware related nasty memory corruption
> issues, we found using periodic timer to monitor specific dram/mmio
> physical address is very useful for debugging, which acts like
> a basic software watchpoint.
>
> For those bugs, who (and when) change(corrupt) those dram or mmio
> register is hard to trace, and sometimes even hardware jtag debugger
> can't help (say the physical address watchpoint doesn't work).
>
> The biggest shortcoming is it can never capture the exact point like
> a hardware watchpoint, no matter how small the timer interval is set,
> the idea is trying to approach the point, hoping the caught context
> have enough debug info (which did help us in solving bios/hardware
> bugs)
Instead of using a timer, can't you use the function tracer? That is,
have the value checked at *every* function call. You can easily add a
custom callback that gets called when every function is executed.
See https://docs.kernel.org/trace/ftrace-uses.html
>
> The working flow is simple: after suspected address is identified,
> start periodic timer polling it to catch if its value is changed to
> target 'magic' value, then halt the cpu (better limit to have only
> one cpu online), or panic, or print out system information, so that
> the error environment is frozen for further check , or let
> kexec/kdump to record the vmore, etc.
>
> All the settings are module parameters:
>
> watch_interval_ms: SW watchpoint check interval in ms
> paddr_dram_to_watch: Physical dram address to monitor.
> target_dram_val: Expected value at the dram address that triggers the watchpoint.
> paddr_mmio_to_watch: Physical mmio address to monitor. Must be 32-bit aligned.
> target_mmio_val: Expected value at the mmio address that triggers the watchpoint.
> panic_on_hit: Trigger kernel panic when watchpoint condition hits.
> hang_on_hit: halt the CPU (wait for HW debugger)
>
> This RFC is trying to show the idea and get feedback, and there are
> some todos:
> * merge the dram/mmio interface to auto detect it's dram or mmio
> * support runtime changing the address
> * move the starting point earlier in boot phase
> * currently is monitoring 'changing to a value', add support
> for 'changing from a value'
>
> Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
> +static void sw_watchpoint_timer_fn(struct timer_list *unused)
> +{
> + bool hit = false;
> +
> + if (vaddr_mmio && (*vaddr_mmio == target_mmio_val)) {
> + pr_info("MMIO [@0x%lx] hit the target value [0x%x]!\n",
> + paddr_mmio_to_watch, target_mmio_val);
> + hit = true;
> + }
> +
> + if (vaddr_dram && (*vaddr_dram == target_dram_val)) {
> + pr_info("DRAM [@0x%lx] hit the target value [0x%lx]!\n",
> + paddr_dram_to_watch, target_dram_val);
> + hit = true;
> + }
> +
> + if (hit) {
> + sys_info(0);
> +
> + /* Useful for attaching HW debugger */
> + if (hang_on_hit) {
> + pr_warn("Will dead loop on this CPU\n");
> + while (1);
> + }
> +
> + /* Could be used to trigger kexec/kdump */
> + if (panic_on_hit)
> + panic("SW watchpoint hit!");
> +
> + if (check_once)
> + return;
> + }
The above function would be:
/* Set once checking is finished, so that later callbacks return at once. */
static bool no_check;

/*
 * ftrace callback: function-tracer variant of sw_watchpoint_timer_fn().
 * Once registered it is called when traced functions are executed, so the
 * watched locations are checked far more often than from a timer.
 *
 * @ip:        address of the traced function
 * @parent_ip: address of the caller of the traced function
 * @op:        the ftrace_ops this callback was registered with (unused)
 * @fregs:     register state at function entry (unused)
 */
static void sw_watchpoint_fn(unsigned long ip, unsigned long parent_ip,
			     struct ftrace_ops *op, struct ftrace_regs *fregs)
{
	bool hit = false;
	int bit;

	if (no_check)
		return;

	/* Bail out if the checking code itself re-enters this callback. */
	bit = ftrace_test_recursion_trylock(ip, parent_ip);
	if (bit < 0)
		return;

	[ do the above code, but leave 'hit' set instead of returning early ]

	/* Stop only after a real hit, not after the very first callback. */
	if (hit && check_once) {
		// possibly call irq_work to unregister ftrace
		no_check = true;
	}

	ftrace_test_recursion_unlock(bit);
}
/* ftrace_ops whose callback is sw_watchpoint_fn(). */
static struct ftrace_ops sw_watchpoint_fops = {
	.func = sw_watchpoint_fn,
};
[..]
> +
> + mod_timer(&sw_watchpoint_timer, jiffies + msecs_to_jiffies(watch_interval_ms));
> +}
> +
> +static int __init sw_watchpoint_timer_init(void)
> +{
> + if (paddr_mmio_to_watch) {
> + vaddr_mmio = ioremap(paddr_mmio_to_watch & PAGE_MASK, PAGE_SIZE);
> + if (!vaddr_mmio)
> + return -ENOMEM;
> +
> + vaddr_mmio += (paddr_mmio_to_watch % PAGE_SIZE) / 4;
> + }
> +
> + if (paddr_dram_to_watch) {
> + vaddr_dram = phys_to_virt(paddr_dram_to_watch);
> + if (!vaddr_dram)
> + return -ENOMEM;
> + }
> +
> + timer_setup(&sw_watchpoint_timer, sw_watchpoint_timer_fn, 0);
> + sw_watchpoint_timer.expires = jiffies + msecs_to_jiffies(watch_interval_ms);
> + add_timer(&sw_watchpoint_timer);
Instead of the above, have:
register_ftrace_function(&sw_watchpoint_fops);
-- Steve
> +
> + return 0;
> +}
> +core_initcall(sw_watchpoint_timer_init);
> +#endif
>
> base-commit: e7ae89a0c97ce2b68b0983cd01eda67cf373517d
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2026-06-10 16:50 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-27 3:43 [PATCH] lib/sys_info: add a simple timer based memory corruption detector Feng Tang
2026-06-08 8:03 ` Petr Mladek
2026-06-08 9:53 ` Feng Tang
2026-06-10 16:50 ` Steven Rostedt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox