* [PATCH v4 1/5] Add a new optional ",cma" suffix to the crashkernel= command line option
2025-05-30 20:23 [PATCH v4 0/5] kdump: crashkernel reservation from CMA Jiri Bohac
@ 2025-05-30 20:26 ` Jiri Bohac
2025-05-30 20:27 ` [PATCH v4 2/5] kdump: implement reserve_crashkernel_cma Jiri Bohac
` (3 subsequent siblings)
4 siblings, 0 replies; 15+ messages in thread
From: Jiri Bohac @ 2025-05-30 20:26 UTC (permalink / raw)
To: Baoquan He, Vivek Goyal, Dave Young, kexec
Cc: Philipp Rudo, Donald Dutile, Pingfan Liu, Tao Liu, linux-kernel,
David Hildenbrand, Michal Hocko
Add a new cma_size parameter to parse_crashkernel().
When not NULL, call __parse_crashkernel to parse the CMA
reservation size from "crashkernel=size,cma" and store it
in cma_size.
Set cma_size to NULL in all calls to parse_crashkernel().
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
---
arch/arm/kernel/setup.c | 2 +-
arch/arm64/mm/init.c | 2 +-
arch/loongarch/kernel/setup.c | 2 +-
arch/mips/kernel/setup.c | 2 +-
arch/powerpc/kernel/fadump.c | 2 +-
arch/powerpc/kexec/core.c | 2 +-
arch/powerpc/mm/nohash/kaslr_booke.c | 2 +-
arch/riscv/mm/init.c | 2 +-
arch/s390/kernel/setup.c | 2 +-
arch/sh/kernel/machine_kexec.c | 2 +-
arch/x86/kernel/setup.c | 2 +-
include/linux/crash_reserve.h | 3 ++-
kernel/crash_reserve.c | 16 ++++++++++++++--
13 files changed, 27 insertions(+), 14 deletions(-)
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index a41c93988d2c..0bfd66c7ada0 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -1004,7 +1004,7 @@ static void __init reserve_crashkernel(void)
total_mem = get_total_mem();
ret = parse_crashkernel(boot_command_line, total_mem,
&crash_size, &crash_base,
- NULL, NULL);
+ NULL, NULL, NULL);
/* invalid value specified or crashkernel=0 */
if (ret || !crash_size)
return;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 0c8c35dd645e..ea84a61ed508 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -106,7 +106,7 @@ static void __init arch_reserve_crashkernel(void)
ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
&crash_size, &crash_base,
- &low_size, &high);
+ &low_size, NULL, &high);
if (ret)
return;
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index b99fbb388fe0..22b27cd447a1 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -265,7 +265,7 @@ static void __init arch_reserve_crashkernel(void)
return;
ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
- &crash_size, &crash_base, &low_size, &high);
+ &crash_size, &crash_base, &low_size, NULL, &high);
if (ret)
return;
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index fbfe0771317e..11b9b6b63e19 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -458,7 +458,7 @@ static void __init mips_parse_crashkernel(void)
total_mem = memblock_phys_mem_size();
ret = parse_crashkernel(boot_command_line, total_mem,
&crash_size, &crash_base,
- NULL, NULL);
+ NULL, NULL, NULL);
if (ret != 0 || crash_size <= 0)
return;
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 8ca49e40c473..28cab25d5b33 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -333,7 +333,7 @@ static __init u64 fadump_calculate_reserve_size(void)
* memory at a predefined offset.
*/
ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
- &size, &base, NULL, NULL);
+ &size, &base, NULL, NULL, NULL);
if (ret == 0 && size > 0) {
unsigned long max_size;
diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
index 00e9c267b912..d1a2d755381c 100644
--- a/arch/powerpc/kexec/core.c
+++ b/arch/powerpc/kexec/core.c
@@ -110,7 +110,7 @@ void __init arch_reserve_crashkernel(void)
/* use common parsing */
ret = parse_crashkernel(boot_command_line, total_mem_sz, &crash_size,
- &crash_base, NULL, NULL);
+ &crash_base, NULL, NULL, NULL);
if (ret)
return;
diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c
index 5c8d1bb98b3e..5e4897daaaea 100644
--- a/arch/powerpc/mm/nohash/kaslr_booke.c
+++ b/arch/powerpc/mm/nohash/kaslr_booke.c
@@ -178,7 +178,7 @@ static void __init get_crash_kernel(void *fdt, unsigned long size)
int ret;
ret = parse_crashkernel(boot_command_line, size, &crash_size,
- &crash_base, NULL, NULL);
+ &crash_base, NULL, NULL, NULL);
if (ret != 0 || crash_size == 0)
return;
if (crash_base == 0)
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index ab475ec6ca42..3f272aff2cf1 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1402,7 +1402,7 @@ static void __init arch_reserve_crashkernel(void)
ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
&crash_size, &crash_base,
- &low_size, &high);
+ &low_size, NULL, &high);
if (ret)
return;
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index f244c5560e7f..b99aeb0db2ee 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -605,7 +605,7 @@ static void __init reserve_crashkernel(void)
int rc;
rc = parse_crashkernel(boot_command_line, ident_map_size,
- &crash_size, &crash_base, NULL, NULL);
+ &crash_size, &crash_base, NULL, NULL, NULL);
crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN);
crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN);
diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c
index 8321b31d2e19..37073ca1e0ad 100644
--- a/arch/sh/kernel/machine_kexec.c
+++ b/arch/sh/kernel/machine_kexec.c
@@ -146,7 +146,7 @@ void __init reserve_crashkernel(void)
return;
ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
- &crash_size, &crash_base, NULL, NULL);
+ &crash_size, &crash_base, NULL, NULL, NULL);
if (ret == 0 && crash_size > 0) {
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 7d9ed79a93c0..870b06571b2e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -582,7 +582,7 @@ static void __init arch_reserve_crashkernel(void)
ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
&crash_size, &crash_base,
- &low_size, &high);
+ &low_size, NULL, &high);
if (ret)
return;
diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h
index 1fe7e7d1b214..e784aaff2f5a 100644
--- a/include/linux/crash_reserve.h
+++ b/include/linux/crash_reserve.h
@@ -16,7 +16,8 @@ extern struct resource crashk_low_res;
int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
unsigned long long *crash_size, unsigned long long *crash_base,
- unsigned long long *low_size, bool *high);
+ unsigned long long *low_size, unsigned long long *cma_size,
+ bool *high);
#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index aff7c0fdbefa..a8861f3f64fe 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -172,17 +172,19 @@ static int __init parse_crashkernel_simple(char *cmdline,
#define SUFFIX_HIGH 0
#define SUFFIX_LOW 1
-#define SUFFIX_NULL 2
+#define SUFFIX_CMA 2
+#define SUFFIX_NULL 3
static __initdata char *suffix_tbl[] = {
[SUFFIX_HIGH] = ",high",
[SUFFIX_LOW] = ",low",
+ [SUFFIX_CMA] = ",cma",
[SUFFIX_NULL] = NULL,
};
/*
* That function parses "suffix" crashkernel command lines like
*
- * crashkernel=size,[high|low]
+ * crashkernel=size,[high|low|cma]
*
* It returns 0 on success and -EINVAL on failure.
*/
@@ -298,9 +300,11 @@ int __init parse_crashkernel(char *cmdline,
unsigned long long *crash_size,
unsigned long long *crash_base,
unsigned long long *low_size,
+ unsigned long long *cma_size,
bool *high)
{
int ret;
+ unsigned long long __always_unused cma_base;
/* crashkernel=X[@offset] */
ret = __parse_crashkernel(cmdline, system_ram, crash_size,
@@ -331,6 +335,14 @@ int __init parse_crashkernel(char *cmdline,
*high = true;
}
+
+ /*
+ * optional CMA reservation
+ * cma_base is ignored
+ */
+ if (cma_size)
+ __parse_crashkernel(cmdline, 0, cma_size,
+ &cma_base, suffix_tbl[SUFFIX_CMA]);
#endif
if (!*crash_size)
ret = -EINVAL;
--
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, Prague, Czechia
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v4 2/5] kdump: implement reserve_crashkernel_cma
2025-05-30 20:23 [PATCH v4 0/5] kdump: crashkernel reservation from CMA Jiri Bohac
2025-05-30 20:26 ` [PATCH v4 1/5] Add a new optional ",cma" suffix to the crashkernel= command line option Jiri Bohac
@ 2025-05-30 20:27 ` Jiri Bohac
2025-05-30 20:28 ` [PATCH v4 3/5] kdump, documentation: describe crashkernel CMA reservation Jiri Bohac
` (2 subsequent siblings)
4 siblings, 0 replies; 15+ messages in thread
From: Jiri Bohac @ 2025-05-30 20:27 UTC (permalink / raw)
To: Baoquan He, Vivek Goyal, Dave Young, kexec
Cc: Philipp Rudo, Donald Dutile, Pingfan Liu, Tao Liu, linux-kernel,
David Hildenbrand, Michal Hocko
reserve_crashkernel_cma() reserves CMA ranges for the
crash kernel. If allocating the requested size fails,
try to reserve in smaller blocks.
Store the reserved ranges in the crashk_cma_ranges array
and the number of ranges in crashk_cma_cnt.
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
---
Changes since v3:
- make reserve_crashkernel_cma() return early when cma_size == 0
to avoid printing out the 0 cma-allocated size
---
include/linux/crash_reserve.h | 12 ++++++++
kernel/crash_reserve.c | 52 +++++++++++++++++++++++++++++++++++
2 files changed, 64 insertions(+)
diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h
index e784aaff2f5a..7b44b41d0a20 100644
--- a/include/linux/crash_reserve.h
+++ b/include/linux/crash_reserve.h
@@ -13,12 +13,24 @@
*/
extern struct resource crashk_res;
extern struct resource crashk_low_res;
+extern struct range crashk_cma_ranges[];
+#if defined(CONFIG_CMA) && defined(CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION)
+#define CRASHKERNEL_CMA
+#define CRASHKERNEL_CMA_RANGES_MAX 4
+extern int crashk_cma_cnt;
+#else
+#define crashk_cma_cnt 0
+#define CRASHKERNEL_CMA_RANGES_MAX 0
+#endif
+
int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
unsigned long long *crash_size, unsigned long long *crash_base,
unsigned long long *low_size, unsigned long long *cma_size,
bool *high);
+void __init reserve_crashkernel_cma(unsigned long long cma_size);
+
#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20)
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index a8861f3f64fe..ae32ea707678 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -14,6 +14,8 @@
#include <linux/cpuhotplug.h>
#include <linux/memblock.h>
#include <linux/kmemleak.h>
+#include <linux/cma.h>
+#include <linux/crash_reserve.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -469,6 +471,56 @@ void __init reserve_crashkernel_generic(unsigned long long crash_size,
#endif
}
+struct range crashk_cma_ranges[CRASHKERNEL_CMA_RANGES_MAX];
+#ifdef CRASHKERNEL_CMA
+int crashk_cma_cnt;
+void __init reserve_crashkernel_cma(unsigned long long cma_size)
+{
+ unsigned long long request_size = roundup(cma_size, PAGE_SIZE);
+ unsigned long long reserved_size = 0;
+
+ if (!cma_size)
+ return;
+
+ while (cma_size > reserved_size &&
+ crashk_cma_cnt < CRASHKERNEL_CMA_RANGES_MAX) {
+
+ struct cma *res;
+
+ if (cma_declare_contiguous(0, request_size, 0, 0, 0, false,
+ "crashkernel", &res)) {
+ /* reservation failed, try half-sized blocks */
+ if (request_size <= PAGE_SIZE)
+ break;
+
+ request_size = roundup(request_size / 2, PAGE_SIZE);
+ continue;
+ }
+
+ crashk_cma_ranges[crashk_cma_cnt].start = cma_get_base(res);
+ crashk_cma_ranges[crashk_cma_cnt].end =
+ crashk_cma_ranges[crashk_cma_cnt].start +
+ cma_get_size(res) - 1;
+ ++crashk_cma_cnt;
+ reserved_size += request_size;
+ }
+
+ if (cma_size > reserved_size)
+ pr_warn("crashkernel CMA reservation failed: %lld MB requested, %lld MB reserved in %d ranges\n",
+ cma_size >> 20, reserved_size >> 20, crashk_cma_cnt);
+ else
+ pr_info("crashkernel CMA reserved: %lld MB in %d ranges\n",
+ reserved_size >> 20, crashk_cma_cnt);
+}
+
+#else /* CRASHKERNEL_CMA */
+void __init reserve_crashkernel_cma(unsigned long long cma_size)
+{
+ if (cma_size)
+ pr_warn("crashkernel CMA reservation not supported\n");
+}
+#endif
+
#ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
static __init int insert_crashkernel_resources(void)
{
--
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, Prague, Czechia
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v4 3/5] kdump, documentation: describe crashkernel CMA reservation
2025-05-30 20:23 [PATCH v4 0/5] kdump: crashkernel reservation from CMA Jiri Bohac
2025-05-30 20:26 ` [PATCH v4 1/5] Add a new optional ",cma" suffix to the crashkernel= command line option Jiri Bohac
2025-05-30 20:27 ` [PATCH v4 2/5] kdump: implement reserve_crashkernel_cma Jiri Bohac
@ 2025-05-30 20:28 ` Jiri Bohac
2025-05-30 20:29 ` [PATCH v4 4/5] kdump: wait for DMA to finish when using CMA Jiri Bohac
2025-05-30 20:31 ` [PATCH v4 5/5] x86: implement crashkernel cma reservation Jiri Bohac
4 siblings, 0 replies; 15+ messages in thread
From: Jiri Bohac @ 2025-05-30 20:28 UTC (permalink / raw)
To: Baoquan He, Vivek Goyal, Dave Young, kexec
Cc: Philipp Rudo, Donald Dutile, Pingfan Liu, Tao Liu, linux-kernel,
David Hildenbrand, Michal Hocko
Describe the new crashkernel ",cma" suffix in Documentation/
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
---
Documentation/admin-guide/kdump/kdump.rst | 21 ++++++++++++++++++
.../admin-guide/kernel-parameters.txt | 22 +++++++++++++++++++
2 files changed, 43 insertions(+)
diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst
index 1f7f14c6e184..089665731509 100644
--- a/Documentation/admin-guide/kdump/kdump.rst
+++ b/Documentation/admin-guide/kdump/kdump.rst
@@ -311,6 +311,27 @@ crashkernel syntax
crashkernel=0,low
+4) crashkernel=size,cma
+
+ Reserve additional crash kernel memory from CMA. This reservation is
+ usable by the first system's userspace memory and kernel movable
+ allocations (memory balloon, zswap). Pages allocated from this memory
+ range will not be included in the vmcore so this should not be used if
+ dumping of userspace memory is intended and it has to be expected that
+ some movable kernel pages may be missing from the dump.
+
+ A standard crashkernel reservation, as described above, is still needed
+ to hold the crash kernel and initrd.
+
+ This option increases the risk of a kdump failure: DMA transfers
+ configured by the first kernel may end up corrupting the second
+ kernel's memory.
+
+ This reservation method is intended for systems that can't afford to
+ sacrifice enough memory for standard crashkernel reservation and where
+ less reliable and possibly incomplete kdump is preferable to no kdump at
+ all.
+
Boot into System Kernel
-----------------------
1) Update the boot loader (such as grub, yaboot, or lilo) configuration
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index ea81784be981..ee6be52dd8a5 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -983,6 +983,28 @@
0: to disable low allocation.
It will be ignored when crashkernel=X,high is not used
or memory reserved is below 4G.
+ crashkernel=size[KMG],cma
+ [KNL, X86] Reserve additional crash kernel memory from
+ CMA. This reservation is usable by the first system's
+ userspace memory and kernel movable allocations (memory
+ balloon, zswap). Pages allocated from this memory range
+ will not be included in the vmcore so this should not
+ be used if dumping of userspace memory is intended and
+ it has to be expected that some movable kernel pages
+ may be missing from the dump.
+
+ A standard crashkernel reservation, as described above,
+ is still needed to hold the crash kernel and initrd.
+
+ This option increases the risk of a kdump failure: DMA
+ transfers configured by the first kernel may end up
+ corrupting the second kernel's memory.
+
+ This reservation method is intended for systems that
+ can't afford to sacrifice enough memory for standard
+ crashkernel reservation and where less reliable and
+ possibly incomplete kdump is preferable to no kdump at
+ all.
cryptomgr.notests
[KNL] Disable crypto self-tests
--
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, Prague, Czechia
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v4 4/5] kdump: wait for DMA to finish when using CMA
2025-05-30 20:23 [PATCH v4 0/5] kdump: crashkernel reservation from CMA Jiri Bohac
` (2 preceding siblings ...)
2025-05-30 20:28 ` [PATCH v4 3/5] kdump, documentation: describe crashkernel CMA reservation Jiri Bohac
@ 2025-05-30 20:29 ` Jiri Bohac
2025-06-03 13:15 ` David Hildenbrand
2025-05-30 20:31 ` [PATCH v4 5/5] x86: implement crashkernel cma reservation Jiri Bohac
4 siblings, 1 reply; 15+ messages in thread
From: Jiri Bohac @ 2025-05-30 20:29 UTC (permalink / raw)
To: Baoquan He, Vivek Goyal, Dave Young, kexec
Cc: Philipp Rudo, Donald Dutile, Pingfan Liu, Tao Liu, linux-kernel,
David Hildenbrand, Michal Hocko
When re-using the CMA area for kdump there is a risk of pending DMA into
pinned user pages in the CMA area.
Pages that are pinned long-term are migrated away from CMA, so these are
not a concern. Pages pinned without FOLL_LONGTERM remain in the CMA and may
possibly be the source or destination of a pending DMA transfer.
Although there is no clear specification how long a page may be pinned
without FOLL_LONGTERM, pinning without the flag shows an intent of the
caller to only use the memory for short-lived DMA transfers, not a transfer
initiated by a device asynchronously at a random time in the future.
Add a delay of CMA_DMA_TIMEOUT_SEC seconds before starting the kdump
kernel, giving such short-lived DMA transfers time to finish before the CMA
memory is re-used by the kdump kernel.
Set CMA_DMA_TIMEOUT_SEC to 10 seconds - chosen arbitrarily as both
a huge margin for a DMA transfer, yet not increasing the kdump time
too significantly.
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
---
Changes since v3:
- renamed CMA_DMA_TIMEOUT_MSEC to CMA_DMA_TIMEOUT_SEC, changed delay to 10 seconds
- introduce a cma_dma_timeout_sec initialized to CMA_DMA_TIMEOUT_SEC
to make the timeout trivially tunable if needed in the future
---
include/linux/crash_core.h | 3 +++
kernel/crash_core.c | 17 +++++++++++++++++
2 files changed, 20 insertions(+)
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 44305336314e..805a07042c96 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -56,6 +56,9 @@ static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; }
/* Alignment required for elf header segment */
#define ELF_CORE_HEADER_ALIGN 4096
+/* Default value for cma_dma_timeout_sec */
+#define CMA_DMA_TIMEOUT_SEC 10
+
extern int crash_exclude_mem_range(struct crash_mem *mem,
unsigned long long mstart,
unsigned long long mend);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 335b8425dd4b..a255c9e2ef29 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -21,6 +21,7 @@
#include <linux/reboot.h>
#include <linux/btf.h>
#include <linux/objtool.h>
+#include <linux/delay.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -33,6 +34,11 @@
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
+/* time to wait for possible DMA to finish before starting the kdump kernel
+ * when a CMA reservation is used
+ */
+unsigned int cma_dma_timeout_sec = CMA_DMA_TIMEOUT_SEC;
+
#ifdef CONFIG_CRASH_DUMP
int kimage_crash_copy_vmcoreinfo(struct kimage *image)
@@ -97,6 +103,17 @@ int kexec_crash_loaded(void)
}
EXPORT_SYMBOL_GPL(kexec_crash_loaded);
+static void crash_cma_clear_pending_dma(void)
+{
+ unsigned int s = cma_dma_timeout_sec;
+
+ if (!crashk_cma_cnt)
+ return;
+
+ while (s--)
+ mdelay(1000);
+}
+
/*
* No panic_cpu check version of crash_kexec(). This function is called
* only when panic_cpu holds the current CPU number; this is the only CPU
@@ -119,6 +135,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
crash_setup_regs(&fixed_regs, regs);
crash_save_vmcoreinfo();
machine_crash_shutdown(&fixed_regs);
+ crash_cma_clear_pending_dma();
machine_kexec(kexec_crash_image);
}
kexec_unlock();
--
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, Prague, Czechia
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v4 4/5] kdump: wait for DMA to finish when using CMA
2025-05-30 20:29 ` [PATCH v4 4/5] kdump: wait for DMA to finish when using CMA Jiri Bohac
@ 2025-06-03 13:15 ` David Hildenbrand
2025-06-03 15:59 ` Jiri Bohac
0 siblings, 1 reply; 15+ messages in thread
From: David Hildenbrand @ 2025-06-03 13:15 UTC (permalink / raw)
To: Jiri Bohac, Baoquan He, Vivek Goyal, Dave Young, kexec
Cc: Philipp Rudo, Donald Dutile, Pingfan Liu, Tao Liu, linux-kernel,
David Hildenbrand, Michal Hocko
On 30.05.25 22:29, Jiri Bohac wrote:
> When re-using the CMA area for kdump there is a risk of pending DMA into
> pinned user pages in the CMA area.
>
> Pages that are pinned long-term are migrated away from CMA, so these are
> not a concern. Pages pinned without FOLL_LONGTERM remain in the CMA and may
> possibly be the source or destination of a pending DMA transfer.
I'll note that we right now do have an upstream BUG where that is
sometimes not the case. I mentioned it previously that such bugs will be
a problem :(
https://lkml.kernel.org/r/20250523023709epcms1p236d4f55b79adb9366ec1cf6d5792b06b@epcms1p2
>
> Although there is no clear specification how long a page may be pinned
> without FOLL_LONGTERM, pinning without the flag shows an intent of the
> caller to only use the memory for short-lived DMA transfers, not a transfer
> initiated by a device asynchronously at a random time in the future.
>
> Add a delay of CMA_DMA_TIMEOUT_SEC seconds before starting the kdump
> kernel, giving such short-lived DMA transfers time to finish before the CMA
> memory is re-used by the kdump kernel.
>
> Set CMA_DMA_TIMEOUT_SEC to 10 seconds - chosen arbitrarily as both
> a huge margin for a DMA transfer, yet not increasing the kdump time
> too significantly.
>
> Signed-off-by: Jiri Bohac <jbohac@suse.cz>
>
> ---
> Changes since v3:
> - renamed CMA_DMA_TIMEOUT_SEC to CMA_DMA_TIMEOUT_MSEC, change delay to 10 seconds
> - introduce a cma_dma_timeout_sec initialized to CMA_DMA_TIMEOUT_SEC
> to make the timeout trivially tunable if needed in the future
>
> ---
> include/linux/crash_core.h | 3 +++
> kernel/crash_core.c | 17 +++++++++++++++++
> 2 files changed, 20 insertions(+)
>
> diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
> index 44305336314e..805a07042c96 100644
> --- a/include/linux/crash_core.h
> +++ b/include/linux/crash_core.h
> @@ -56,6 +56,9 @@ static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; }
> /* Alignment required for elf header segment */
> #define ELF_CORE_HEADER_ALIGN 4096
>
> +/* Default value for cma_dma_timeout_sec */
> +#define CMA_DMA_TIMEOUT_SEC 10
> +
> extern int crash_exclude_mem_range(struct crash_mem *mem,
> unsigned long long mstart,
> unsigned long long mend);
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 335b8425dd4b..a255c9e2ef29 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -21,6 +21,7 @@
> #include <linux/reboot.h>
> #include <linux/btf.h>
> #include <linux/objtool.h>
> +#include <linux/delay.h>
>
> #include <asm/page.h>
> #include <asm/sections.h>
> @@ -33,6 +34,11 @@
> /* Per cpu memory for storing cpu states in case of system crash. */
> note_buf_t __percpu *crash_notes;
>
> +/* time to wait for possible DMA to finish before starting the kdump kernel
> + * when a CMA reservation is used
> + */
> +unsigned int cma_dma_timeout_sec = CMA_DMA_TIMEOUT_SEC;
> +
> #ifdef CONFIG_CRASH_DUMP
>
> int kimage_crash_copy_vmcoreinfo(struct kimage *image)
> @@ -97,6 +103,17 @@ int kexec_crash_loaded(void)
> }
> EXPORT_SYMBOL_GPL(kexec_crash_loaded);
>
> +static void crash_cma_clear_pending_dma(void)
> +{
> + unsigned int s = cma_dma_timeout_sec;
> +
> + if (!crashk_cma_cnt)
> + return;
> +
> + while (s--)
> + mdelay(1000);
Any reason we cannot do it in a single mdelay() invocation?
mdelay() already is a loop around udelay on larger values IIUC.
--
Cheers,
David / dhildenb
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v4 4/5] kdump: wait for DMA to finish when using CMA
2025-06-03 13:15 ` David Hildenbrand
@ 2025-06-03 15:59 ` Jiri Bohac
2025-06-03 16:25 ` David Hildenbrand
0 siblings, 1 reply; 15+ messages in thread
From: Jiri Bohac @ 2025-06-03 15:59 UTC (permalink / raw)
To: David Hildenbrand
Cc: Baoquan He, Vivek Goyal, Dave Young, kexec, Philipp Rudo,
Donald Dutile, Pingfan Liu, Tao Liu, linux-kernel,
David Hildenbrand, Michal Hocko
On Tue, Jun 03, 2025 at 03:15:03PM +0200, David Hildenbrand wrote:
> On 30.05.25 22:29, Jiri Bohac wrote:
> > When re-using the CMA area for kdump there is a risk of pending DMA into
> > pinned user pages in the CMA area.
> >
> > Pages that are pinned long-term are migrated away from CMA, so these are
> > not a concern. Pages pinned without FOLL_LONGTERM remain in the CMA and may
> > possibly be the source or destination of a pending DMA transfer.
>
> I'll note that we right now do have an upstream BUG where that is sometimes
> not the case. I mentioned it previously that such bugs will be a problem :(
>
> https://lkml.kernel.org/r/20250523023709epcms1p236d4f55b79adb9366ec1cf6d5792b06b@epcms1p2
I'll just reiterate the whole purpose of this patchset, as
added to Documentation:
+ This option increases the risk of a kdump failure: DMA transfers
+ configured by the first kernel may end up corrupting the second
+ kernel's memory.
+
+ This reservation method is intended for systems that can't afford to
+ sacrifice enough memory for standard crashkernel reservation and where
+ less reliable and possibly incomplete kdump is preferable to no kdump at
+ all.
It is expected that kdump may be less reliable when ,cma is used.
You mentioned a bug that augments this unreliability and that is surely going to get fixed.
I think this is fine.
The whole point is getting a completely optional best-effort kdump when
otherwise we would have no kdump.
> > +static void crash_cma_clear_pending_dma(void)
> > +{
> > + unsigned int s = cma_dma_timeout_sec;
> > +
> > + if (!crashk_cma_cnt)
> > + return;
> > +
> > + while (s--)
> > + mdelay(1000);
>
> Any reason we cannot do it in a single mdelay() invocation?
>
> mdelay() already is a loop around udelay on larger values IIUC.
No good reasons ;)
I just wanted to prevent a totally theoretical overflow (if cma_dma_timeout_sec was made configurable).
I also anticipated someone might want to add some progress printks into the cycle (without verifying
whether that's even possible in this context).
If you want, I have no problem changing this to:
+ mdelay(cma_dma_timeout_sec * 1000);
--
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, Prague, Czechia
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v4 4/5] kdump: wait for DMA to finish when using CMA
2025-06-03 15:59 ` Jiri Bohac
@ 2025-06-03 16:25 ` David Hildenbrand
2025-06-04 7:40 ` Jiri Bohac
0 siblings, 1 reply; 15+ messages in thread
From: David Hildenbrand @ 2025-06-03 16:25 UTC (permalink / raw)
To: Jiri Bohac
Cc: Baoquan He, Vivek Goyal, Dave Young, kexec, Philipp Rudo,
Donald Dutile, Pingfan Liu, Tao Liu, linux-kernel,
David Hildenbrand, Michal Hocko
On 03.06.25 17:59, Jiri Bohac wrote:
> On Tue, Jun 03, 2025 at 03:15:03PM +0200, David Hildenbrand wrote:
>> On 30.05.25 22:29, Jiri Bohac wrote:
>>> When re-using the CMA area for kdump there is a risk of pending DMA into
>>> pinned user pages in the CMA area.
>>>
>>> Pages that are pinned long-term are migrated away from CMA, so these are
>>> not a concern. Pages pinned without FOLL_LONGTERM remain in the CMA and may
>>> possibly be the source or destination of a pending DMA transfer.
>>
>> I'll note that we right now do have an upstream BUG where that is sometimes
>> not the case. I mentioned it previously that such bugs will be a problem :(
>>
>> https://lkml.kernel.org/r/20250523023709epcms1p236d4f55b79adb9366ec1cf6d5792b06b@epcms1p2
>
> I'll just reitarate the whole purpose of this patchset, as
> added to Documentation:
I know, but stating "these are not a concern", when they are currently a
concern upstream is a bit suboptimal. :)
I'd phrase it more like "Pages residing in CMA areas can usually not get
long-term pinned, so long-term pinning is typically not a concern. BUGs
in the kernel might still lead to long-term pinning of such pages if
everything goes wrong."
Or sth like that.
>>> +static void crash_cma_clear_pending_dma(void)
>>> +{
>>> + unsigned int s = cma_dma_timeout_sec;
>>> +
>>> + if (!crashk_cma_cnt)
>>> + return;
>>> +
>>> + while (s--)
>>> + mdelay(1000);
>>
>> Any reason we cannot do it in a single mdelay() invocation?
>>
>> mdelay() already is a loop around udelay on larger values IIUC.
>
> No good reasons ;)
> I just wanted to prevent a totally theoretical overflow (if cma_dma_timeout_sec was made configurable;
> I also anticipated someone might want to add some progress printks into the cycle (without verifying if
> that's even possible in this context).
>
> If you want, I have no problem changing this to:
> + mdelay(cma_dma_timeout_sec * 1000);
Probably good enough. Or just hard-code 10s and call it a day. :)
--
Cheers,
David / dhildenb
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v4 4/5] kdump: wait for DMA to finish when using CMA
2025-06-03 16:25 ` David Hildenbrand
@ 2025-06-04 7:40 ` Jiri Bohac
2025-06-04 7:42 ` David Hildenbrand
0 siblings, 1 reply; 15+ messages in thread
From: Jiri Bohac @ 2025-06-04 7:40 UTC (permalink / raw)
To: David Hildenbrand
Cc: Baoquan He, Vivek Goyal, Dave Young, kexec, Philipp Rudo,
Donald Dutile, Pingfan Liu, Tao Liu, linux-kernel,
David Hildenbrand, Michal Hocko
On Tue, Jun 03, 2025 at 06:25:57PM +0200, David Hildenbrand wrote:
> On 03.06.25 17:59, Jiri Bohac wrote:
> I'd phrase it more like "Pages residing in CMA areas can usually not get
> long-term pinned, so long-term pinning is typically not a concern. BUGs in
> the kernel might still lead to long-term pinning of such pages if everything
> goes wrong."
...
> > If you want, I have no problem changing this to:
> > + mdelay(cma_dma_timeout_sec * 1000);
>
> Probably good enough. Or just hard-code 10s and call it a day. :)
Thanks for your comments, David. This would be the v5 of this
patch:
Subject: [PATCH v5 4/5] kdump: wait for DMA to finish when using CMA
When re-using the CMA area for kdump there is a risk of pending DMA
into pinned user pages in the CMA area.
Pages residing in CMA areas can usually not get long-term pinned and
are instead migrated away from the CMA area, so long-term pinning is
typically not a concern. (BUGs in the kernel might still lead to
long-term pinning of such pages if everything goes wrong.)
Pages pinned without FOLL_LONGTERM remain in the CMA and may possibly
be the source or destination of a pending DMA transfer.
Although there is no clear specification how long a page may be pinned
without FOLL_LONGTERM, pinning without the flag shows an intent of the
caller to only use the memory for short-lived DMA transfers, not a transfer
initiated by a device asynchronously at a random time in the future.
Add a delay of CMA_DMA_TIMEOUT_SEC seconds before starting the kdump
kernel, giving such short-lived DMA transfers time to finish before
the CMA memory is re-used by the kdump kernel.
Set CMA_DMA_TIMEOUT_SEC to 10 seconds - chosen arbitrarily as both
a huge margin for a DMA transfer, yet not increasing the kdump time
too significantly.
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
---
Changes since v4:
- reworded the paragraph about long-term pinning
- simplified crash_cma_clear_pending_dma()
---
Changes since v3:
- renamed CMA_DMA_TIMEOUT_MSEC to CMA_DMA_TIMEOUT_SEC, changed delay to 10 seconds
- introduce a cma_dma_timeout_sec initialized to CMA_DMA_TIMEOUT_SEC
to make the timeout trivially tunable if needed in the future
---
include/linux/crash_core.h | 3 +++
kernel/crash_core.c | 15 +++++++++++++++
2 files changed, 18 insertions(+)
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 44305336314e..805a07042c96 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -56,6 +56,9 @@ static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; }
/* Alignment required for elf header segment */
#define ELF_CORE_HEADER_ALIGN 4096
+/* Default value for cma_dma_timeout_sec */
+#define CMA_DMA_TIMEOUT_SEC 10
+
extern int crash_exclude_mem_range(struct crash_mem *mem,
unsigned long long mstart,
unsigned long long mend);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 335b8425dd4b..540fd75a4a0d 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -21,6 +21,7 @@
#include <linux/reboot.h>
#include <linux/btf.h>
#include <linux/objtool.h>
+#include <linux/delay.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -33,6 +34,11 @@
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
+/* time to wait for possible DMA to finish before starting the kdump kernel
+ * when a CMA reservation is used
+ */
+unsigned int cma_dma_timeout_sec = CMA_DMA_TIMEOUT_SEC;
+
#ifdef CONFIG_CRASH_DUMP
int kimage_crash_copy_vmcoreinfo(struct kimage *image)
@@ -97,6 +103,14 @@ int kexec_crash_loaded(void)
}
EXPORT_SYMBOL_GPL(kexec_crash_loaded);
+static void crash_cma_clear_pending_dma(void)
+{
+ if (!crashk_cma_cnt)
+ return;
+
+ mdelay(cma_dma_timeout_sec * 1000);
+}
+
/*
* No panic_cpu check version of crash_kexec(). This function is called
* only when panic_cpu holds the current CPU number; this is the only CPU
@@ -119,6 +133,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
crash_setup_regs(&fixed_regs, regs);
crash_save_vmcoreinfo();
machine_crash_shutdown(&fixed_regs);
+ crash_cma_clear_pending_dma();
machine_kexec(kexec_crash_image);
}
kexec_unlock();
--
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, Prague, Czechia
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v4 4/5] kdump: wait for DMA to finish when using CMA
2025-06-04 7:40 ` Jiri Bohac
@ 2025-06-04 7:42 ` David Hildenbrand
2025-06-04 8:15 ` [PATCH v5 " Jiri Bohac
0 siblings, 1 reply; 15+ messages in thread
From: David Hildenbrand @ 2025-06-04 7:42 UTC (permalink / raw)
To: Jiri Bohac
Cc: Baoquan He, Vivek Goyal, Dave Young, kexec, Philipp Rudo,
Donald Dutile, Pingfan Liu, Tao Liu, linux-kernel,
David Hildenbrand, Michal Hocko
On 04.06.25 09:40, Jiri Bohac wrote:
> On Tue, Jun 03, 2025 at 06:25:57PM +0200, David Hildenbrand wrote:
>> On 03.06.25 17:59, Jiri Bohac wrote:
>> I'd phrase it more like "Pages residing in CMA areas can usually not get
>> long-term pinned, so long-term pinning is typically not a concern. BUGs in
>> the kernel might still lead to long-term pinning of such pages if everything
>> goes wrong."
>
> ...
>
>>> If you want, I have no problem changing this to:
>>> + mdelay(cma_dma_timeout_sec * 1000);
>>
>> Probably good enough. Or just hard-code 10s and call it a day. :)
>
> Thanks for your comments, David. This would be the v5 of this
> patch:
>
> Subject: [PATCH v5 4/5] kdump: wait for DMA to finish when using CMA
>
> When re-using the CMA area for kdump there is a risk of pending DMA
> into pinned user pages in the CMA area.
>
> Pages residing in CMA areas can usually not get long-term pinned and
> are instead migrated away from the CMA area, so long-term pinning is
> typically not a concern. (BUGs in the kernel might still lead to
> long-term pinning of such pages if everything goes wrong.)
>
> Pages pinned without FOLL_LONGTERM remain in the CMA and may possibly
> be the source or destination of a pending DMA transfer.
>
> Although there is no clear specification of how long a page may be pinned
> without FOLL_LONGTERM, pinning without the flag shows an intent of the
> caller to only use the memory for short-lived DMA transfers, not a transfer
> initiated by a device asynchronously at a random time in the future.
>
> Add a delay of CMA_DMA_TIMEOUT_SEC seconds before starting the kdump
> kernel, giving such short-lived DMA transfers time to finish before
> the CMA memory is re-used by the kdump kernel.
>
> Set CMA_DMA_TIMEOUT_SEC to 10 seconds - chosen arbitrarily as both
> a huge margin for a DMA transfer, yet not increasing the kdump time
> too significantly.
>
> Signed-off-by: Jiri Bohac <jbohac@suse.cz>
>
> ---
> Changes since v4:
> - reworded the paragraph about long-term pinning
> - simplified crash_cma_clear_pending_dma()
>
> ---
> Changes since v3:
> - renamed CMA_DMA_TIMEOUT_MSEC to CMA_DMA_TIMEOUT_SEC, changed delay to 10 seconds
> - introduce a cma_dma_timeout_sec initialized to CMA_DMA_TIMEOUT_SEC
> to make the timeout trivially tunable if needed in the future
>
> ---
> include/linux/crash_core.h | 3 +++
> kernel/crash_core.c | 15 +++++++++++++++
> 2 files changed, 18 insertions(+)
>
> diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
> index 44305336314e..805a07042c96 100644
> --- a/include/linux/crash_core.h
> +++ b/include/linux/crash_core.h
> @@ -56,6 +56,9 @@ static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; }
> /* Alignment required for elf header segment */
> #define ELF_CORE_HEADER_ALIGN 4096
>
> +/* Default value for cma_dma_timeout_sec */
> +#define CMA_DMA_TIMEOUT_SEC 10
> +
> extern int crash_exclude_mem_range(struct crash_mem *mem,
> unsigned long long mstart,
> unsigned long long mend);
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 335b8425dd4b..540fd75a4a0d 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -21,6 +21,7 @@
> #include <linux/reboot.h>
> #include <linux/btf.h>
> #include <linux/objtool.h>
> +#include <linux/delay.h>
>
> #include <asm/page.h>
> #include <asm/sections.h>
> @@ -33,6 +34,11 @@
> /* Per cpu memory for storing cpu states in case of system crash. */
> note_buf_t __percpu *crash_notes;
>
> +/* time to wait for possible DMA to finish before starting the kdump kernel
> + * when a CMA reservation is used
> + */
> +unsigned int cma_dma_timeout_sec = CMA_DMA_TIMEOUT_SEC;
Likely no need for that variable?
mdelay(CMA_DMA_TIMEOUT_SEC * 1000);
Then, move the doc over to CMA_DMA_TIMEOUT_SEC
... or rather just move the "#define CMA_DMA_TIMEOUT_SEC 10" over here
With that
Acked-by: David Hildenbrand <david@redhat.com>
--
Cheers,
David / dhildenb
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v5 4/5] kdump: wait for DMA to finish when using CMA
2025-06-04 7:42 ` David Hildenbrand
@ 2025-06-04 8:15 ` Jiri Bohac
0 siblings, 0 replies; 15+ messages in thread
From: Jiri Bohac @ 2025-06-04 8:15 UTC (permalink / raw)
To: David Hildenbrand, Baoquan He, Vivek Goyal, Dave Young, kexec
Cc: Philipp Rudo, Donald Dutile, Pingfan Liu, Tao Liu, linux-kernel,
David Hildenbrand, Michal Hocko
When re-using the CMA area for kdump there is a risk of pending DMA
into pinned user pages in the CMA area.
Pages residing in CMA areas can usually not get long-term pinned and
are instead migrated away from the CMA area, so long-term pinning is
typically not a concern. (BUGs in the kernel might still lead to
long-term pinning of such pages if everything goes wrong.)
Pages pinned without FOLL_LONGTERM remain in the CMA and may possibly
be the source or destination of a pending DMA transfer.
Although there is no clear specification of how long a page may be pinned
without FOLL_LONGTERM, pinning without the flag shows an intent of the
caller to only use the memory for short-lived DMA transfers, not a transfer
initiated by a device asynchronously at a random time in the future.
Add a delay of CMA_DMA_TIMEOUT_SEC seconds before starting the kdump
kernel, giving such short-lived DMA transfers time to finish before
the CMA memory is re-used by the kdump kernel.
Set CMA_DMA_TIMEOUT_SEC to 10 seconds - chosen arbitrarily as both
a huge margin for a DMA transfer, yet not increasing the kdump time
too significantly.
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
---
Changes since v4:
- reworded the paragraph about long-term pinning
- simplified crash_cma_clear_pending_dma()
- dropped cma_dma_timeout_sec variable
---
Changes since v3:
> - renamed CMA_DMA_TIMEOUT_MSEC to CMA_DMA_TIMEOUT_SEC, changed delay to 10 seconds
- introduce a cma_dma_timeout_sec initialized to CMA_DMA_TIMEOUT_SEC
to make the timeout trivially tunable if needed in the future
---
kernel/crash_core.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 335b8425dd4b..a4ef79591eb2 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -21,6 +21,7 @@
#include <linux/reboot.h>
#include <linux/btf.h>
#include <linux/objtool.h>
+#include <linux/delay.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -33,6 +34,11 @@
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
+/* time to wait for possible DMA to finish before starting the kdump kernel
+ * when a CMA reservation is used
+ */
+#define CMA_DMA_TIMEOUT_SEC 10
+
#ifdef CONFIG_CRASH_DUMP
int kimage_crash_copy_vmcoreinfo(struct kimage *image)
@@ -97,6 +103,14 @@ int kexec_crash_loaded(void)
}
EXPORT_SYMBOL_GPL(kexec_crash_loaded);
+static void crash_cma_clear_pending_dma(void)
+{
+ if (!crashk_cma_cnt)
+ return;
+
+ mdelay(CMA_DMA_TIMEOUT_SEC * 1000);
+}
+
/*
* No panic_cpu check version of crash_kexec(). This function is called
* only when panic_cpu holds the current CPU number; this is the only CPU
@@ -119,6 +133,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
crash_setup_regs(&fixed_regs, regs);
crash_save_vmcoreinfo();
machine_crash_shutdown(&fixed_regs);
+ crash_cma_clear_pending_dma();
machine_kexec(kexec_crash_image);
}
kexec_unlock();
--
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, Prague, Czechia
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v4 5/5] x86: implement crashkernel cma reservation
2025-05-30 20:23 [PATCH v4 0/5] kdump: crashkernel reservation from CMA Jiri Bohac
` (3 preceding siblings ...)
2025-05-30 20:29 ` [PATCH v4 4/5] kdump: wait for DMA to finish when using CMA Jiri Bohac
@ 2025-05-30 20:31 ` Jiri Bohac
2025-06-03 11:02 ` Baoquan He
4 siblings, 1 reply; 15+ messages in thread
From: Jiri Bohac @ 2025-05-30 20:31 UTC (permalink / raw)
To: Baoquan He, Vivek Goyal, Dave Young, kexec
Cc: Philipp Rudo, Donald Dutile, Pingfan Liu, Tao Liu, linux-kernel,
David Hildenbrand, Michal Hocko
Implement the crashkernel CMA reservation for x86:
- enable parsing of the cma suffix by parse_crashkernel()
- reserve memory with reserve_crashkernel_cma()
- add the CMA-reserved ranges to the e820 map for the crash kernel
- exclude the CMA-reserved ranges from vmcore
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
---
arch/x86/kernel/crash.c | 26 ++++++++++++++++++++++----
arch/x86/kernel/setup.c | 5 +++--
2 files changed, 25 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 0be61c45400c..670aa9b8b0f8 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -163,10 +163,10 @@ static struct crash_mem *fill_up_crash_elf_data(void)
return NULL;
/*
- * Exclusion of crash region and/or crashk_low_res may cause
- * another range split. So add extra two slots here.
+ * Exclusion of crash region, crashk_low_res and/or crashk_cma_ranges
+ * may cause range splits. So add extra slots here.
*/
- nr_ranges += 2;
+ nr_ranges += 2 + crashk_cma_cnt;
cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
if (!cmem)
return NULL;
@@ -184,6 +184,7 @@ static struct crash_mem *fill_up_crash_elf_data(void)
static int elf_header_exclude_ranges(struct crash_mem *cmem)
{
int ret = 0;
+ int i;
/* Exclude the low 1M because it is always reserved */
ret = crash_exclude_mem_range(cmem, 0, SZ_1M - 1);
@@ -198,8 +199,17 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem)
if (crashk_low_res.end)
ret = crash_exclude_mem_range(cmem, crashk_low_res.start,
crashk_low_res.end);
+ if (ret)
+ return ret;
- return ret;
+ for (i = 0; i < crashk_cma_cnt; ++i) {
+ ret = crash_exclude_mem_range(cmem, crashk_cma_ranges[i].start,
+ crashk_cma_ranges[i].end);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
@@ -352,6 +362,14 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
add_e820_entry(params, &ei);
}
+ for (i = 0; i < crashk_cma_cnt; ++i) {
+ ei.addr = crashk_cma_ranges[i].start;
+ ei.size = crashk_cma_ranges[i].end -
+ crashk_cma_ranges[i].start + 1;
+ ei.type = E820_TYPE_RAM;
+ add_e820_entry(params, &ei);
+ }
+
out:
vfree(cmem);
return ret;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 870b06571b2e..dcbeba344825 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -573,7 +573,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
static void __init arch_reserve_crashkernel(void)
{
- unsigned long long crash_base, crash_size, low_size = 0;
+ unsigned long long crash_base, crash_size, low_size = 0, cma_size = 0;
bool high = false;
int ret;
@@ -582,7 +582,7 @@ static void __init arch_reserve_crashkernel(void)
ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
&crash_size, &crash_base,
- &low_size, NULL, &high);
+ &low_size, &cma_size, &high);
if (ret)
return;
@@ -592,6 +592,7 @@ static void __init arch_reserve_crashkernel(void)
}
reserve_crashkernel_generic(crash_size, crash_base, low_size, high);
+ reserve_crashkernel_cma(cma_size);
}
static struct resource standard_io_resources[] = {
--
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, Prague, Czechia
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v4 5/5] x86: implement crashkernel cma reservation
2025-05-30 20:31 ` [PATCH v4 5/5] x86: implement crashkernel cma reservation Jiri Bohac
@ 2025-06-03 11:02 ` Baoquan He
2025-06-03 12:11 ` Jiri Bohac
0 siblings, 1 reply; 15+ messages in thread
From: Baoquan He @ 2025-06-03 11:02 UTC (permalink / raw)
To: Jiri Bohac
Cc: Vivek Goyal, Dave Young, kexec, Philipp Rudo, Donald Dutile,
Pingfan Liu, Tao Liu, linux-kernel, David Hildenbrand,
Michal Hocko
On 05/30/25 at 10:31pm, Jiri Bohac wrote:
......snip..
> @@ -582,7 +582,7 @@ static void __init arch_reserve_crashkernel(void)
>
> ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
> &crash_size, &crash_base,
> - &low_size, NULL, &high);
> + &low_size, &cma_size, &high);
> if (ret)
> return;
>
> @@ -592,6 +592,7 @@ static void __init arch_reserve_crashkernel(void)
> }
>
> reserve_crashkernel_generic(crash_size, crash_base, low_size, high);
> + reserve_crashkernel_cma(cma_size);
Wondering if ,high|low is still allowed (or needed) when ,cma is specified.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v4 5/5] x86: implement crashkernel cma reservation
2025-06-03 11:02 ` Baoquan He
@ 2025-06-03 12:11 ` Jiri Bohac
2025-06-03 12:56 ` Baoquan He
0 siblings, 1 reply; 15+ messages in thread
From: Jiri Bohac @ 2025-06-03 12:11 UTC (permalink / raw)
To: Baoquan He
Cc: Vivek Goyal, Dave Young, kexec, Philipp Rudo, Donald Dutile,
Pingfan Liu, Tao Liu, linux-kernel, David Hildenbrand,
Michal Hocko
On Tue, Jun 03, 2025 at 07:02:06PM +0800, Baoquan He wrote:
> On 05/30/25 at 10:31pm, Jiri Bohac wrote:
> ......snip..
> > @@ -582,7 +582,7 @@ static void __init arch_reserve_crashkernel(void)
> >
> > ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
> > &crash_size, &crash_base,
> > - &low_size, NULL, &high);
> > + &low_size, &cma_size, &high);
> > if (ret)
> > return;
> >
> > @@ -592,6 +592,7 @@ static void __init arch_reserve_crashkernel(void)
> > }
> >
> > reserve_crashkernel_generic(crash_size, crash_base, low_size, high);
> > + reserve_crashkernel_cma(cma_size);
>
> Wondering if ,high|low is still allowed (or needed) when ,cma is specified.
Probably not needed but it works, totally independent of the
extra CMA-reserved area.
I saw no reason to artificially prevent it.
--
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, Prague, Czechia
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v4 5/5] x86: implement crashkernel cma reservation
2025-06-03 12:11 ` Jiri Bohac
@ 2025-06-03 12:56 ` Baoquan He
0 siblings, 0 replies; 15+ messages in thread
From: Baoquan He @ 2025-06-03 12:56 UTC (permalink / raw)
To: Jiri Bohac
Cc: Vivek Goyal, Dave Young, kexec, Philipp Rudo, Donald Dutile,
Pingfan Liu, Tao Liu, linux-kernel, David Hildenbrand,
Michal Hocko
On 06/03/25 at 02:11pm, Jiri Bohac wrote:
> On Tue, Jun 03, 2025 at 07:02:06PM +0800, Baoquan He wrote:
> > On 05/30/25 at 10:31pm, Jiri Bohac wrote:
> > ......snip..
> > > @@ -582,7 +582,7 @@ static void __init arch_reserve_crashkernel(void)
> > >
> > > ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
> > > &crash_size, &crash_base,
> > > - &low_size, NULL, &high);
> > > + &low_size, &cma_size, &high);
> > > if (ret)
> > > return;
> > >
> > > @@ -592,6 +592,7 @@ static void __init arch_reserve_crashkernel(void)
> > > }
> > >
> > > reserve_crashkernel_generic(crash_size, crash_base, low_size, high);
> > > + reserve_crashkernel_cma(cma_size);
> >
> > Wondering if ,high|low is still allowed (or needed) when ,cma is specified.
>
> Probably not needed but it works, totally independent of the
> extra CMA-reserved area.
Allowing it can simplify the current code, while I can't imagine what
cases need people to specify
"crashkernel=xM,high crashkernel=xM,low crashkernel=zM,cma" at one time.
Just a personal thought — I haven't thought of a strong reason to prevent it
either.
>
> I saw no reason to artificially prevent it.
>
> --
> Jiri Bohac <jbohac@suse.cz>
> SUSE Labs, Prague, Czechia
>
^ permalink raw reply [flat|nested] 15+ messages in thread