[PATCH v7 1/1] numa: add 'memmap-type' option for memory type configuration

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v7 1/1] numa: add 'memmap-type' option for memory type configuration
  2026-03-06  8:27 [PATCH v7 0/1] numa: add 'memmap-type' " fanhuang
@ 2026-03-06  8:27 ` fanhuang
  2026-05-14 13:05   ` Igor Mammedov
  0 siblings, 1 reply; 12+ messages in thread
From: fanhuang @ 2026-03-06  8:27 UTC (permalink / raw)
  To: qemu-devel, david, imammedo, gourry, jonathan.cameron
  Cc: apopple, dan.j.williams, Zhigang.Luo, Lianjie.Shi, fanhuang,
	David Hildenbrand

Add a 'memmap-type' option to NUMA node configuration that allows
specifying the memory type for a NUMA node.

Supported values:
  - normal:   Regular system RAM (E820 type 1, default)
  - spm:      Specific Purpose Memory (E820 type 0xEFFFFFFF)
  - reserved: Reserved memory (E820 type 2)

The 'spm' type indicates Specific Purpose Memory - a hint to the guest
that this memory might be managed by device drivers based on guest policy.
The 'reserved' type marks memory as not usable as RAM.

Note: This option is only supported on x86 platforms.

Usage:
  -numa node,nodeid=1,memdev=m1,memmap-type=spm

Signed-off-by: fanhuang <FangSheng.Huang@amd.com>
Acked-by: David Hildenbrand <david@kernel.org>
Reviewed-by: Gregory Price <gourry@gourry.net>
---
 hw/core/numa.c               | 24 ++++++++++++
 hw/i386/acpi-build.c         |  8 ++++
 hw/i386/e820_memory_layout.c | 72 ++++++++++++++++++++++++++++++++++++
 hw/i386/e820_memory_layout.h | 12 +++---
 hw/i386/pc.c                 | 48 ++++++++++++++++++++++++
 include/system/numa.h        |  7 ++++
 qapi/machine.json            | 24 ++++++++++++
 qemu-options.hx              | 14 ++++++-
 8 files changed, 202 insertions(+), 7 deletions(-)

diff --git a/hw/core/numa.c b/hw/core/numa.c
index f462883c87..521c8f10f1 100644
--- a/hw/core/numa.c
+++ b/hw/core/numa.c
@@ -38,6 +38,7 @@
 #include "hw/mem/pc-dimm.h"
 #include "hw/core/boards.h"
 #include "hw/mem/memory-device.h"
+#include "hw/i386/x86.h"
 #include "qemu/option.h"
 #include "qemu/config-file.h"
 #include "qemu/cutils.h"
@@ -164,6 +165,29 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
         numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
     }
 
+    if (node->has_memmap_type && node->memmap_type != NUMA_MEMMAP_TYPE_NORMAL) {
+        if (!node->memdev) {
+            error_setg(errp, "memmap-type=%s requires memdev to be specified",
+                       NumaMemmapType_str(node->memmap_type));
+            return;
+        }
+        if (!object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) {
+            error_setg(errp, "memmap-type=%s is only supported on x86 machines",
+                       NumaMemmapType_str(node->memmap_type));
+            return;
+        }
+        switch (node->memmap_type) {
+        case NUMA_MEMMAP_TYPE_SPM:
+            numa_info[nodenr].memmap_type = NUMA_MEMMAP_SPM;
+            break;
+        case NUMA_MEMMAP_TYPE_RESERVED:
+            numa_info[nodenr].memmap_type = NUMA_MEMMAP_RESERVED;
+            break;
+        default:
+            break;
+        }
+    }
+
     numa_info[nodenr].present = true;
     max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
     ms->numa_state->num_nodes++;
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index f622b91b76..521bf66ca1 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1417,6 +1417,14 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
         mem_len = numa_info[i - 1].node_mem;
         next_base = mem_base + mem_len;
 
+        /*
+         * Skip reserved memory nodes - E820 marks them as reserved,
+         * so SRAT should not report them as enabled memory affinity.
+         */
+        if (numa_info[i - 1].memmap_type == NUMA_MEMMAP_RESERVED) {
+            continue;
+        }
+
         /* Cut out the 640K hole */
         if (mem_base <= HOLE_640K_START &&
             next_base > HOLE_640K_START) {
diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c
index 3e848fb69c..4c62b5ddea 100644
--- a/hw/i386/e820_memory_layout.c
+++ b/hw/i386/e820_memory_layout.c
@@ -46,3 +46,75 @@ bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length)
     }
     return false;
 }
+
+bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type)
+{
+    uint64_t end = start + length;
+    assert(!e820_done);
+
+    /* For E820_SOFT_RESERVED, validate range is within E820_RAM */
+    if (new_type == E820_SOFT_RESERVED) {
+        bool range_in_ram = false;
+
+        for (size_t j = 0; j < e820_entries; j++) {
+            uint64_t ram_start = le64_to_cpu(e820_table[j].address);
+            uint64_t ram_end = ram_start + le64_to_cpu(e820_table[j].length);
+            uint32_t ram_type = le32_to_cpu(e820_table[j].type);
+
+            if (ram_type == E820_RAM && ram_start <= start && ram_end >= end) {
+                range_in_ram = true;
+                break;
+            }
+        }
+        if (!range_in_ram) {
+            return false;
+        }
+    }
+
+    /* Find entry that contains the target range and update it */
+    for (size_t i = 0; i < e820_entries; i++) {
+        uint64_t entry_start = le64_to_cpu(e820_table[i].address);
+        uint64_t entry_length = le64_to_cpu(e820_table[i].length);
+        uint64_t entry_end = entry_start + entry_length;
+
+        if (entry_start <= start && entry_end >= end) {
+            uint32_t original_type = e820_table[i].type;
+
+            /* Remove original entry */
+            memmove(&e820_table[i], &e820_table[i + 1],
+                    (e820_entries - i - 1) * sizeof(struct e820_entry));
+            e820_entries--;
+
+            /* Add split parts inline */
+            if (entry_start < start) {
+                e820_table = g_renew(struct e820_entry, e820_table,
+                                     e820_entries + 1);
+                e820_table[e820_entries].address = cpu_to_le64(entry_start);
+                e820_table[e820_entries].length =
+                    cpu_to_le64(start - entry_start);
+                e820_table[e820_entries].type = original_type;
+                e820_entries++;
+            }
+
+            e820_table = g_renew(struct e820_entry, e820_table,
+                                 e820_entries + 1);
+            e820_table[e820_entries].address = cpu_to_le64(start);
+            e820_table[e820_entries].length = cpu_to_le64(length);
+            e820_table[e820_entries].type = cpu_to_le32(new_type);
+            e820_entries++;
+
+            if (end < entry_end) {
+                e820_table = g_renew(struct e820_entry, e820_table,
+                                     e820_entries + 1);
+                e820_table[e820_entries].address = cpu_to_le64(end);
+                e820_table[e820_entries].length = cpu_to_le64(entry_end - end);
+                e820_table[e820_entries].type = original_type;
+                e820_entries++;
+            }
+
+            return true;
+        }
+    }
+
+    return false;
+}
diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h
index b50acfa201..a85b4fd14c 100644
--- a/hw/i386/e820_memory_layout.h
+++ b/hw/i386/e820_memory_layout.h
@@ -10,11 +10,12 @@
 #define HW_I386_E820_MEMORY_LAYOUT_H
 
 /* e820 types */
-#define E820_RAM        1
-#define E820_RESERVED   2
-#define E820_ACPI       3
-#define E820_NVS        4
-#define E820_UNUSABLE   5
+#define E820_RAM            1
+#define E820_RESERVED       2
+#define E820_ACPI           3
+#define E820_NVS            4
+#define E820_UNUSABLE       5
+#define E820_SOFT_RESERVED  0xEFFFFFFF
 
 struct e820_entry {
     uint64_t address;
@@ -26,5 +27,6 @@ void e820_add_entry(uint64_t address, uint64_t length, uint32_t type);
 bool e820_get_entry(int index, uint32_t type,
                     uint64_t *address, uint64_t *length);
 int e820_get_table(struct e820_entry **table);
+bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type);
 
 #endif
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 819e729a6e..c024a34db2 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -740,6 +740,51 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size)
     return pc_above_4g_end(pcms) - 1;
 }
 
+/*
+ * Update E820 entries for NUMA nodes with non-default memory types.
+ */
+static void pc_update_numa_memory_types(X86MachineState *x86ms)
+{
+    MachineState *ms = MACHINE(x86ms);
+    uint64_t addr = 0;
+
+    for (int i = 0; i < ms->numa_state->num_nodes; i++) {
+        NodeInfo *numa_info = &ms->numa_state->nodes[i];
+        uint64_t node_size = numa_info->node_mem;
+
+        if (numa_info->node_memdev &&
+            (numa_info->memmap_type == NUMA_MEMMAP_SPM ||
+             numa_info->memmap_type == NUMA_MEMMAP_RESERVED)) {
+            uint64_t guest_addr;
+            uint32_t e820_type = (numa_info->memmap_type == NUMA_MEMMAP_SPM)
+                                  ? E820_SOFT_RESERVED : E820_RESERVED;
+
+            if (addr < x86ms->below_4g_mem_size) {
+                if (addr + node_size <= x86ms->below_4g_mem_size) {
+                    guest_addr = addr;
+                } else {
+                    error_report("NUMA node %d with memmap-type spans across "
+                                 "4GB boundary, not supported", i);
+                    exit(EXIT_FAILURE);
+                }
+            } else {
+                guest_addr = x86ms->above_4g_mem_start +
+                            (addr - x86ms->below_4g_mem_size);
+            }
+
+            if (!e820_update_entry_type(guest_addr, node_size, e820_type)) {
+                warn_report("Failed to update E820 entry for node %d "
+                           "at 0x%" PRIx64 " length 0x%" PRIx64,
+                           i, guest_addr, node_size);
+            }
+        }
+
+        if (numa_info->node_memdev) {
+            addr += node_size;
+        }
+    }
+}
+
 /*
  * AMD systems with an IOMMU have an additional hole close to the
  * 1Tb, which are special GPAs that cannot be DMA mapped. Depending
@@ -856,6 +901,9 @@ void pc_memory_init(PCMachineState *pcms,
         e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED);
     }
 
+    /* Update E820 for NUMA nodes with special memory types */
+    pc_update_numa_memory_types(x86ms);
+
     if (!pcmc->has_reserved_memory &&
         (machine->ram_slots ||
          (machine->maxram_size > machine->ram_size))) {
diff --git a/include/system/numa.h b/include/system/numa.h
index 1044b0eb6e..64e8f63736 100644
--- a/include/system/numa.h
+++ b/include/system/numa.h
@@ -35,12 +35,19 @@ enum {
 
 #define UINT16_BITS       16
 
+typedef enum {
+    NUMA_MEMMAP_NORMAL = 0,
+    NUMA_MEMMAP_SPM,
+    NUMA_MEMMAP_RESERVED,
+} NumaMemmapTypeInternal;
+
 typedef struct NodeInfo {
     uint64_t node_mem;
     struct HostMemoryBackend *node_memdev;
     bool present;
     bool has_cpu;
     bool has_gi;
+    NumaMemmapTypeInternal memmap_type;
     uint8_t lb_info_provided;
     uint16_t initiator;
     uint8_t distance[MAX_NODES];
diff --git a/qapi/machine.json b/qapi/machine.json
index 685e4e29b8..67ba487f6c 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -466,6 +466,22 @@
 { 'enum': 'NumaOptionsType',
   'data': [ 'node', 'dist', 'cpu', 'hmat-lb', 'hmat-cache' ] }
 
+##
+# @NumaMemmapType:
+#
+# Memory mapping type for a NUMA node.
+#
+# @normal: Normal system RAM (E820 type 1)
+#
+# @spm: Specific Purpose Memory (E820 type 0xEFFFFFFF)
+#
+# @reserved: Reserved memory (E820 type 2)
+#
+# Since: 10.2
+##
+{ 'enum': 'NumaMemmapType',
+  'data': ['normal', 'spm', 'reserved'] }
+
 ##
 # @NumaOptions:
 #
@@ -502,6 +518,13 @@
 # @memdev: memory backend object.  If specified for one node, it must
 #     be specified for all nodes.
 #
+# @memmap-type: specifies the memory type for this NUMA node.
+#     'normal' (default) is regular system RAM.
+#     'spm' is Specific Purpose Memory - a hint to the guest that
+#     this memory might be managed by device drivers based on policy.
+#     'reserved' is reserved memory, not usable as RAM.
+#     Currently only supported on x86.  (since 10.2)
+#
 # @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, points
 #     to the nodeid which has the memory controller responsible for
 #     this NUMA node.  This field provides additional information as
@@ -516,6 +539,7 @@
    '*cpus':   ['uint16'],
    '*mem':    'size',
    '*memdev': 'str',
+   '*memmap-type': 'NumaMemmapType',
    '*initiator': 'uint16' }}
 
 ##
diff --git a/qemu-options.hx b/qemu-options.hx
index 0da2b4d034..c898428822 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -433,7 +433,7 @@ ERST
 
 DEF("numa", HAS_ARG, QEMU_OPTION_numa,
     "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
-    "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
+    "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node][,memmap-type=normal|spm|reserved]\n"
     "-numa dist,src=source,dst=destination,val=distance\n"
     "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n"
     "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n"
@@ -442,7 +442,7 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa,
 SRST
 ``-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator]``
   \ 
-``-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator]``
+``-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator][,memmap-type=type]``
   \
 ``-numa dist,src=source,dst=destination,val=distance``
   \ 
@@ -510,6 +510,16 @@ SRST
     largest bandwidth) to this NUMA node. Note that this option can be
     set only when the machine property 'hmat' is set to 'on'.
 
+    '\ ``memmap-type``\ ' specifies the memory type for this NUMA node:
+
+    - ``normal`` (default): Regular system RAM (E820 type 1)
+    - ``spm``: Specific Purpose Memory (E820 type 0xEFFFFFFF). This is a
+      hint to the guest that the memory might be managed by device drivers
+      based on guest policy.
+    - ``reserved``: Reserved memory (E820 type 2), not usable as RAM.
+
+    This option is only supported on x86 platforms.
+
     Following example creates a machine with 2 NUMA nodes, node 0 has
     CPU. node 1 has only memory, and its initiator is node 0. Note that
     because node 0 has CPU, by default the initiator of node 0 is itself
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 1/1] numa: add 'memmap-type' option for memory type configuration
  2026-03-06  8:27 ` [PATCH v7 1/1] " fanhuang
@ 2026-05-14 13:05   ` Igor Mammedov
  2026-05-14 13:38     ` Gregory Price
  2026-05-15  7:53     ` Huang, FangSheng (Jerry)
  0 siblings, 2 replies; 12+ messages in thread
From: Igor Mammedov @ 2026-05-14 13:05 UTC (permalink / raw)
  To: fanhuang
  Cc: qemu-devel, david, gourry, jonathan.cameron, apopple,
	dan.j.williams, Zhigang.Luo, Lianjie.Shi, David Hildenbrand

On Fri, 6 Mar 2026 16:27:35 +0800
fanhuang <FangSheng.Huang@amd.com> wrote:

> Add a 'memmap-type' option to NUMA node configuration that allows
> specifying the memory type for a NUMA node.
> 
> Supported values:
>   - normal:   Regular system RAM (E820 type 1, default)
>   - spm:      Specific Purpose Memory (E820 type 0xEFFFFFFF)
>   - reserved: Reserved memory (E820 type 2)
> 
> The 'spm' type indicates Specific Purpose Memory - a hint to the guest
> that this memory might be managed by device drivers based on guest policy.
> The 'reserved' type marks memory as not usable as RAM.
> 
> Note: This option is only supported on x86 platforms.
> 
> Usage:
>   -numa node,nodeid=1,memdev=m1,memmap-type=spm

in short:
  don't do it this way
  I'm against merging it as is, till you convince me otherwise.

more detailed answer:

* mandatory bashing chapter:

The more I look at it, the hackier this approach looks to me,
and what's even worse is that the nonsense propagates to firmware.

Judging by the commit message, the goal is to expose some RAM to the
guest as E820 SPM (that's it).

However, you picked -numa node as the way to achieve that,
and then hacked the NUMA code not to generate NUMA data (SRAT) for it
and massaged e820 to exclude SPM from the RAM entries.

But at this stage I don't really see a good justification for the hack(s)
this patch introduces (it is definitely not in the commit message or the cover letter).

And until an alternative approach has been explored and proved to be worse,
I'm against merging this patch.

* suggestion chapter:

I don't recall, but I likely asked before:
why not use device memory for it instead (i.e. a DIMM device, or some device derived
from the device memory object, and then add an e820 entry for it)?

It would be a much simpler approach and implementation, without the need to re-split
anything in e820.
And there would be no need to mess with firmware (SeaBIOS: RamSizeOver4G patch) or EDK2.



> 
> Signed-off-by: fanhuang <FangSheng.Huang@amd.com>
> Acked-by: David Hildenbrand <david@kernel.org>
> Reviewed-by: Gregory Price <gourry@gourry.net>
> ---
>  hw/core/numa.c               | 24 ++++++++++++
>  hw/i386/acpi-build.c         |  8 ++++
>  hw/i386/e820_memory_layout.c | 72 ++++++++++++++++++++++++++++++++++++
>  hw/i386/e820_memory_layout.h | 12 +++---
>  hw/i386/pc.c                 | 48 ++++++++++++++++++++++++
>  include/system/numa.h        |  7 ++++
>  qapi/machine.json            | 24 ++++++++++++
>  qemu-options.hx              | 14 ++++++-
>  8 files changed, 202 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/core/numa.c b/hw/core/numa.c
> index f462883c87..521c8f10f1 100644
> --- a/hw/core/numa.c
> +++ b/hw/core/numa.c
> @@ -38,6 +38,7 @@
>  #include "hw/mem/pc-dimm.h"
>  #include "hw/core/boards.h"
>  #include "hw/mem/memory-device.h"
> +#include "hw/i386/x86.h"
>  #include "qemu/option.h"
>  #include "qemu/config-file.h"
>  #include "qemu/cutils.h"
> @@ -164,6 +165,29 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
>          numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
>      }
>  
> +    if (node->has_memmap_type && node->memmap_type != NUMA_MEMMAP_TYPE_NORMAL) {
> +        if (!node->memdev) {
> +            error_setg(errp, "memmap-type=%s requires memdev to be specified",
> +                       NumaMemmapType_str(node->memmap_type));
> +            return;
> +        }
> +        if (!object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) {
> +            error_setg(errp, "memmap-type=%s is only supported on x86 machines",
> +                       NumaMemmapType_str(node->memmap_type));
> +            return;
> +        }
> +        switch (node->memmap_type) {
> +        case NUMA_MEMMAP_TYPE_SPM:
> +            numa_info[nodenr].memmap_type = NUMA_MEMMAP_SPM;
> +            break;
> +        case NUMA_MEMMAP_TYPE_RESERVED:
> +            numa_info[nodenr].memmap_type = NUMA_MEMMAP_RESERVED;
> +            break;
> +        default:
> +            break;
> +        }
> +    }
> +
>      numa_info[nodenr].present = true;
>      max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
>      ms->numa_state->num_nodes++;
> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> index f622b91b76..521bf66ca1 100644
> --- a/hw/i386/acpi-build.c
> +++ b/hw/i386/acpi-build.c
> @@ -1417,6 +1417,14 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
>          mem_len = numa_info[i - 1].node_mem;
>          next_base = mem_base + mem_len;
>  
> +        /*
> +         * Skip reserved memory nodes - E820 marks them as reserved,
> +         * so SRAT should not report them as enabled memory affinity.
> +         */
> +        if (numa_info[i - 1].memmap_type == NUMA_MEMMAP_RESERVED) {
> +            continue;
> +        }
> +
>          /* Cut out the 640K hole */
>          if (mem_base <= HOLE_640K_START &&
>              next_base > HOLE_640K_START) {
> diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c
> index 3e848fb69c..4c62b5ddea 100644
> --- a/hw/i386/e820_memory_layout.c
> +++ b/hw/i386/e820_memory_layout.c
> @@ -46,3 +46,75 @@ bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length)
>      }
>      return false;
>  }
> +
> +bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type)
> +{
> +    uint64_t end = start + length;
> +    assert(!e820_done);
> +
> +    /* For E820_SOFT_RESERVED, validate range is within E820_RAM */
> +    if (new_type == E820_SOFT_RESERVED) {
> +        bool range_in_ram = false;
> +
> +        for (size_t j = 0; j < e820_entries; j++) {
> +            uint64_t ram_start = le64_to_cpu(e820_table[j].address);
> +            uint64_t ram_end = ram_start + le64_to_cpu(e820_table[j].length);
> +            uint32_t ram_type = le32_to_cpu(e820_table[j].type);
> +
> +            if (ram_type == E820_RAM && ram_start <= start && ram_end >= end) {
> +                range_in_ram = true;
> +                break;
> +            }
> +        }
> +        if (!range_in_ram) {
> +            return false;
> +        }
> +    }
> +
> +    /* Find entry that contains the target range and update it */
> +    for (size_t i = 0; i < e820_entries; i++) {
> +        uint64_t entry_start = le64_to_cpu(e820_table[i].address);
> +        uint64_t entry_length = le64_to_cpu(e820_table[i].length);
> +        uint64_t entry_end = entry_start + entry_length;
> +
> +        if (entry_start <= start && entry_end >= end) {
> +            uint32_t original_type = e820_table[i].type;
> +
> +            /* Remove original entry */
> +            memmove(&e820_table[i], &e820_table[i + 1],
> +                    (e820_entries - i - 1) * sizeof(struct e820_entry));
> +            e820_entries--;
> +
> +            /* Add split parts inline */
> +            if (entry_start < start) {
> +                e820_table = g_renew(struct e820_entry, e820_table,
> +                                     e820_entries + 1);
> +                e820_table[e820_entries].address = cpu_to_le64(entry_start);
> +                e820_table[e820_entries].length =
> +                    cpu_to_le64(start - entry_start);
> +                e820_table[e820_entries].type = original_type;
> +                e820_entries++;
> +            }
> +
> +            e820_table = g_renew(struct e820_entry, e820_table,
> +                                 e820_entries + 1);
> +            e820_table[e820_entries].address = cpu_to_le64(start);
> +            e820_table[e820_entries].length = cpu_to_le64(length);
> +            e820_table[e820_entries].type = cpu_to_le32(new_type);
> +            e820_entries++;
> +
> +            if (end < entry_end) {
> +                e820_table = g_renew(struct e820_entry, e820_table,
> +                                     e820_entries + 1);
> +                e820_table[e820_entries].address = cpu_to_le64(end);
> +                e820_table[e820_entries].length = cpu_to_le64(entry_end - end);
> +                e820_table[e820_entries].type = original_type;
> +                e820_entries++;
> +            }
> +
> +            return true;
> +        }
> +    }
> +
> +    return false;
> +}
> diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h
> index b50acfa201..a85b4fd14c 100644
> --- a/hw/i386/e820_memory_layout.h
> +++ b/hw/i386/e820_memory_layout.h
> @@ -10,11 +10,12 @@
>  #define HW_I386_E820_MEMORY_LAYOUT_H
>  
>  /* e820 types */
> -#define E820_RAM        1
> -#define E820_RESERVED   2
> -#define E820_ACPI       3
> -#define E820_NVS        4
> -#define E820_UNUSABLE   5
> +#define E820_RAM            1
> +#define E820_RESERVED       2
> +#define E820_ACPI           3
> +#define E820_NVS            4
> +#define E820_UNUSABLE       5
> +#define E820_SOFT_RESERVED  0xEFFFFFFF
>  
>  struct e820_entry {
>      uint64_t address;
> @@ -26,5 +27,6 @@ void e820_add_entry(uint64_t address, uint64_t length, uint32_t type);
>  bool e820_get_entry(int index, uint32_t type,
>                      uint64_t *address, uint64_t *length);
>  int e820_get_table(struct e820_entry **table);
> +bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type);
>  
>  #endif
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 819e729a6e..c024a34db2 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -740,6 +740,51 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size)
>      return pc_above_4g_end(pcms) - 1;
>  }
>  
> +/*
> + * Update E820 entries for NUMA nodes with non-default memory types.
> + */
> +static void pc_update_numa_memory_types(X86MachineState *x86ms)
> +{
> +    MachineState *ms = MACHINE(x86ms);
> +    uint64_t addr = 0;
> +
> +    for (int i = 0; i < ms->numa_state->num_nodes; i++) {
> +        NodeInfo *numa_info = &ms->numa_state->nodes[i];
> +        uint64_t node_size = numa_info->node_mem;
> +
> +        if (numa_info->node_memdev &&
> +            (numa_info->memmap_type == NUMA_MEMMAP_SPM ||
> +             numa_info->memmap_type == NUMA_MEMMAP_RESERVED)) {
> +            uint64_t guest_addr;
> +            uint32_t e820_type = (numa_info->memmap_type == NUMA_MEMMAP_SPM)
> +                                  ? E820_SOFT_RESERVED : E820_RESERVED;
> +
> +            if (addr < x86ms->below_4g_mem_size) {
> +                if (addr + node_size <= x86ms->below_4g_mem_size) {
> +                    guest_addr = addr;
> +                } else {
> +                    error_report("NUMA node %d with memmap-type spans across "
> +                                 "4GB boundary, not supported", i);
> +                    exit(EXIT_FAILURE);
> +                }
> +            } else {
> +                guest_addr = x86ms->above_4g_mem_start +
> +                            (addr - x86ms->below_4g_mem_size);
> +            }
> +
> +            if (!e820_update_entry_type(guest_addr, node_size, e820_type)) {
> +                warn_report("Failed to update E820 entry for node %d "
> +                           "at 0x%" PRIx64 " length 0x%" PRIx64,
> +                           i, guest_addr, node_size);
> +            }
> +        }
> +
> +        if (numa_info->node_memdev) {
> +            addr += node_size;
> +        }
> +    }
> +}
> +
>  /*
>   * AMD systems with an IOMMU have an additional hole close to the
>   * 1Tb, which are special GPAs that cannot be DMA mapped. Depending
> @@ -856,6 +901,9 @@ void pc_memory_init(PCMachineState *pcms,
>          e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED);
>      }
>  
> +    /* Update E820 for NUMA nodes with special memory types */
> +    pc_update_numa_memory_types(x86ms);
> +
>      if (!pcmc->has_reserved_memory &&
>          (machine->ram_slots ||
>           (machine->maxram_size > machine->ram_size))) {
> diff --git a/include/system/numa.h b/include/system/numa.h
> index 1044b0eb6e..64e8f63736 100644
> --- a/include/system/numa.h
> +++ b/include/system/numa.h
> @@ -35,12 +35,19 @@ enum {
>  
>  #define UINT16_BITS       16
>  
> +typedef enum {
> +    NUMA_MEMMAP_NORMAL = 0,
> +    NUMA_MEMMAP_SPM,
> +    NUMA_MEMMAP_RESERVED,
> +} NumaMemmapTypeInternal;
> +
>  typedef struct NodeInfo {
>      uint64_t node_mem;
>      struct HostMemoryBackend *node_memdev;
>      bool present;
>      bool has_cpu;
>      bool has_gi;
> +    NumaMemmapTypeInternal memmap_type;
>      uint8_t lb_info_provided;
>      uint16_t initiator;
>      uint8_t distance[MAX_NODES];
> diff --git a/qapi/machine.json b/qapi/machine.json
> index 685e4e29b8..67ba487f6c 100644
> --- a/qapi/machine.json
> +++ b/qapi/machine.json
> @@ -466,6 +466,22 @@
>  { 'enum': 'NumaOptionsType',
>    'data': [ 'node', 'dist', 'cpu', 'hmat-lb', 'hmat-cache' ] }
>  
> +##
> +# @NumaMemmapType:
> +#
> +# Memory mapping type for a NUMA node.
> +#
> +# @normal: Normal system RAM (E820 type 1)
> +#
> +# @spm: Specific Purpose Memory (E820 type 0xEFFFFFFF)
> +#
> +# @reserved: Reserved memory (E820 type 2)
> +#
> +# Since: 10.2
> +##
> +{ 'enum': 'NumaMemmapType',
> +  'data': ['normal', 'spm', 'reserved'] }
> +
>  ##
>  # @NumaOptions:
>  #
> @@ -502,6 +518,13 @@
>  # @memdev: memory backend object.  If specified for one node, it must
>  #     be specified for all nodes.
>  #
> +# @memmap-type: specifies the memory type for this NUMA node.
> +#     'normal' (default) is regular system RAM.
> +#     'spm' is Specific Purpose Memory - a hint to the guest that
> +#     this memory might be managed by device drivers based on policy.
> +#     'reserved' is reserved memory, not usable as RAM.
> +#     Currently only supported on x86.  (since 10.2)
> +#
>  # @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, points
>  #     to the nodeid which has the memory controller responsible for
>  #     this NUMA node.  This field provides additional information as
> @@ -516,6 +539,7 @@
>     '*cpus':   ['uint16'],
>     '*mem':    'size',
>     '*memdev': 'str',
> +   '*memmap-type': 'NumaMemmapType',
>     '*initiator': 'uint16' }}
>  
>  ##
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 0da2b4d034..c898428822 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -433,7 +433,7 @@ ERST
>  
>  DEF("numa", HAS_ARG, QEMU_OPTION_numa,
>      "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
> -    "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
> +    "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node][,memmap-type=normal|spm|reserved]\n"
>      "-numa dist,src=source,dst=destination,val=distance\n"
>      "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n"
>      "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n"
> @@ -442,7 +442,7 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa,
>  SRST
>  ``-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator]``
>    \ 
> -``-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator]``
> +``-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator][,memmap-type=type]``
>    \
>  ``-numa dist,src=source,dst=destination,val=distance``
>    \ 
> @@ -510,6 +510,16 @@ SRST
>      largest bandwidth) to this NUMA node. Note that this option can be
>      set only when the machine property 'hmat' is set to 'on'.
>  
> +    '\ ``memmap-type``\ ' specifies the memory type for this NUMA node:
> +
> +    - ``normal`` (default): Regular system RAM (E820 type 1)
> +    - ``spm``: Specific Purpose Memory (E820 type 0xEFFFFFFF). This is a
> +      hint to the guest that the memory might be managed by device drivers
> +      based on guest policy.
> +    - ``reserved``: Reserved memory (E820 type 2), not usable as RAM.
> +
> +    This option is only supported on x86 platforms.
> +
>      Following example creates a machine with 2 NUMA nodes, node 0 has
>      CPU. node 1 has only memory, and its initiator is node 0. Note that
>      because node 0 has CPU, by default the initiator of node 0 is itself



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 1/1] numa: add 'memmap-type' option for memory type configuration
  2026-05-14 13:05   ` Igor Mammedov
@ 2026-05-14 13:38     ` Gregory Price
  2026-05-18  8:15       ` David Hildenbrand (Arm)
  2026-05-15  7:53     ` Huang, FangSheng (Jerry)
  1 sibling, 1 reply; 12+ messages in thread
From: Gregory Price @ 2026-05-14 13:38 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: fanhuang, qemu-devel, david, jonathan.cameron, apopple,
	dan.j.williams, Zhigang.Luo, Lianjie.Shi, David Hildenbrand

On Thu, May 14, 2026 at 03:05:59PM +0200, Igor Mammedov wrote:
> 
> I don't recall but I likely asked before
> why not use device memory instead for it (aka DIMM device or some device derived
> from device memory object and then add e820 entry for it).
> 
> It would be a way more simpler approach and impl. without need to resplit
> anything in e820.
> And no need for messing with firmware (SeaBIOS: RamSizeOver4G patch) nor EDK2.
> 

David previously addressed your question on the original patch version:

https://lore.kernel.org/qemu-devel/6e7ad90d-a467-40cc-99fa-d0915438dd05@redhat.com/

  I wondered the same in my reply: I'm afraid it cannot be a DIMM/NVDIMM,
  these ranges are only described in E820 as "hotplug area".

  I think it must be something that's present in the memory map right from
  the start, where the OS would identify it as SP and treat it accordingly.

We're trending towards devices being given dedicated nodes for their
memory, so this actually makes sense as an extension to NUMA.

While heterogeneous device/memory nodes are possible - they're also
pretty nonsensical outside of specifically the simple use case of:

   This node has both hotpluggable and not-hotpluggable memory.

Which can already be accomplished another way.

For a device being given a node with memory, marking it reserved or spm
in e820 is needed to make the memory hotpluggable in the future (as that
node has to be reserved and the hotplug memory region accounted for).

Unless I am misunderstanding your feedback here - please let me know.

~Gregory

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 1/1] numa: add 'memmap-type' option for memory type configuration
  2026-05-14 13:05   ` Igor Mammedov
  2026-05-14 13:38     ` Gregory Price
@ 2026-05-15  7:53     ` Huang, FangSheng (Jerry)
  2026-05-15 13:04       ` Igor Mammedov
  1 sibling, 1 reply; 12+ messages in thread
From: Huang, FangSheng (Jerry) @ 2026-05-15  7:53 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: qemu-devel, david, gourry, jonathan.cameron, apopple,
	dan.j.williams, Zhigang.Luo, Lianjie.Shi, David Hildenbrand

On 5/14/2026 9:05 PM, Igor Mammedov wrote:
> [You don't often get email from imammedo@redhat.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
> 
> On Fri, 6 Mar 2026 16:27:35 +0800
> fanhuang <FangSheng.Huang@amd.com> wrote:
> 
>> Add a 'memmap-type' option to NUMA node configuration that allows
>> specifying the memory type for a NUMA node.
>>
>> Supported values:
>>    - normal:   Regular system RAM (E820 type 1, default)
>>    - spm:      Specific Purpose Memory (E820 type 0xEFFFFFFF)
>>    - reserved: Reserved memory (E820 type 2)
>>
>> The 'spm' type indicates Specific Purpose Memory - a hint to the guest
>> that this memory might be managed by device drivers based on guest policy.
>> The 'reserved' type marks memory as not usable as RAM.
>>
>> Note: This option is only supported on x86 platforms.
>>
>> Usage:
>>    -numa node,nodeid=1,memdev=m1,memmap-type=spm
> 
> in short:
>    don't do it this way
>    I'm against merging it as is, till you convince me otherwise.
> 
> more detailed answer:
> 
> * mandatory bashing chapter:
> 
> the more i look at it, the hackier this approach looks to me,
> and what even worse that nonsense propagates to firmware.
> 
> Judging by commit message, the goal is to expose some RAM as
> E820 SPM, to guest (that's it).
> 
> You however picked -numa node as a way to achieve that,
> and then hack the numa code not to generate numa data for it (SRAT)
> and massage e820 to exclude SPM from  RAM entries.
> 
> But at this stage I don't really see a good justification for hack(s)
> this patch introduces (it's definitely is not in commit message not cover letter).
> 
> And until alternative approach is not explored and proved to be worse,
> I'm against merging this patch.
> 
> * suggestion chapter:
> 
> I don't recall but I likely asked before
> why not use device memory instead for it (aka DIMM device or some device derived
> from device memory object and then add e820 entry for it).
> 
> It would be a way more simpler approach and impl. without need to resplit
> anything in e820.
> And no need for messing with firmware (SeaBIOS: RamSizeOver4G patch) nor EDK2.
> 
>

Hi Igor,

Thanks for taking the time to review this -- and for the candor in
the bashing chapter.  Before going into the bigger picture, let me
re-establish one factual point that v7 didn't carry forward from
the v6 cover letter.

On SRAT generation:

v7 only suppresses SRAT for memmap-type=reserved.  memmap-type=spm
nodes get a normal SRAT Memory Affinity entry.  This was shown
explicitly in the v6 cover letter, which v7 didn't carry forward
since v7 is a single-patch series.  For the spm case:

     [    0.042582] ACPI: SRAT: Node 1 PXM 1 [mem 0x280000000-0x47fffffff]

Full transcript with all three memmap-type variants side by side:
https://lore.kernel.org/qemu-devel/20260226105023.256568-1-FangSheng.Huang@amd.com/

The bigger picture -- real-world context that drove the design:

The use case is GPU/accelerator HBM exposed to the OS as SPM.  On
bare metal, the platform firmware:

   - emits E820 type 0xEFFFFFFF (SOFT_RESERVED) for the HBM region;
   - emits ACPI SRAT memory affinity entries that bind HBM to a
     dedicated proximity domain (NUMA node);
   - tags the accelerator's PCI device with _PXM matching that node.

That gives the device driver a stable lookup chain at runtime:

     dev -> pci_dev_to_node(dev) -> SRAT walk -> HBM GPA range

NUMA node here is not incidental -- it is the OS-exposed
intermediary ID that the device driver uses to find its own HBM.
This is the in-tree path used by accelerator drivers today.

The "-numa node + memmap-type=spm + E820 SOFT_RESERVED" combo in
v7 is a direct 1:1 model of this BM topology.  The E820 retyping
in the patch is exactly what makes the guest-visible E820 match
what BM firmware emits for the same kind of region.

On the DIMM / device-memory alternative:

David pointed this out in the v6 thread, and Gregory's reply in
this thread reinforces the same point -- DIMM / NVDIMM ranges are
described in E820 only as the hotplug area.  SPM needs to be in
the boot E820 from the start so the OS classifies it as SP and
treats it accordingly.  Going via DIMM would also detach the
memory from the NUMA topology (no SRAT entry tied to the device's
_PXM), which breaks the dev -> node -> SRAT -> HBM lookup the
driver relies on.

Happy to dig into any of this further, or to reshape parts you
still see as too hacky.

Best regards,
FangSheng Huang (Jerry)
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 1/1] numa: add 'memmap-type' option for memory type configuration
  2026-05-15  7:53     ` Huang, FangSheng (Jerry)
@ 2026-05-15 13:04       ` Igor Mammedov
  2026-05-18 10:43         ` Huang, FangSheng (Jerry)
  0 siblings, 1 reply; 12+ messages in thread
From: Igor Mammedov @ 2026-05-15 13:04 UTC (permalink / raw)
  To: Huang, FangSheng (Jerry)
  Cc: qemu-devel, david, gourry, jonathan.cameron, apopple,
	dan.j.williams, Zhigang.Luo, Lianjie.Shi, David Hildenbrand

On Fri, 15 May 2026 15:53:07 +0800
"Huang, FangSheng (Jerry)" <FangSheng.Huang@amd.com> wrote:

> On 5/14/2026 9:05 PM, Igor Mammedov wrote:
> > [You don't often get email from imammedo@redhat.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
> > 
> > On Fri, 6 Mar 2026 16:27:35 +0800
> > fanhuang <FangSheng.Huang@amd.com> wrote:
> >   
> >> Add a 'memmap-type' option to NUMA node configuration that allows
> >> specifying the memory type for a NUMA node.
> >>
> >> Supported values:
> >>    - normal:   Regular system RAM (E820 type 1, default)
> >>    - spm:      Specific Purpose Memory (E820 type 0xEFFFFFFF)
> >>    - reserved: Reserved memory (E820 type 2)
> >>
> >> The 'spm' type indicates Specific Purpose Memory - a hint to the guest
> >> that this memory might be managed by device drivers based on guest policy.
> >> The 'reserved' type marks memory as not usable as RAM.
> >>
> >> Note: This option is only supported on x86 platforms.
> >>
> >> Usage:
> >>    -numa node,nodeid=1,memdev=m1,memmap-type=spm  
> > 
> > in short:
> >    don't do it this way
> >    I'm against merging it as is, till you convince me otherwise.
> > 
> > more detailed answer:
> > 
> > * mandatory bashing chapter:
> > 
> > the more i look at it, the hackier this approach looks to me,
> > and what even worse that nonsense propagates to firmware.
> > 
> > Judging by commit message, the goal is to expose some RAM as
> > E820 SPM, to guest (that's it).
> > 
> > You however picked -numa node as a way to achieve that,
> > and then hack the numa code not to generate numa data for it (SRAT)
> > and massage e820 to exclude SPM from  RAM entries.
> > 
> > But at this stage I don't really see a good justification for hack(s)
> > this patch introduces (it's definitely is not in commit message not cover letter).
> > 
> > And until alternative approach is not explored and proved to be worse,
> > I'm against merging this patch.
> > 
> > * suggestion chapter:
> > 
> > I don't recall but I likely asked before
> > why not use device memory instead for it (aka DIMM device or some device derived
> > from device memory object and then add e820 entry for it).
> > 
> > It would be a way more simpler approach and impl. without need to resplit
> > anything in e820.
> > And no need for messing with firmware (SeaBIOS: RamSizeOver4G patch) nor EDK2.
> > 
> >  
> 
> Hi Igor,
> 
> Thanks for taking the time to review this -- and for the candor in
> the bashing chapter.  Before going into the bigger picture, let me
> re-establish one factual point that v7 didn't carry forward from
> the v6 cover letter.

feel free to bash my review as well, I hope that we end up with
a clear picture of what we are doing and why.

> 
> On SRAT generation:
> 
> v7 only suppresses SRAT for memmap-type=reserved.  memmap-type=spm
> nodes get a normal SRAT Memory Affinity entry.  This was shown
> explicitly in the v6 cover letter, which v7 didn't carry forward
> since v7 is a single-patch series.  For the spm case:
> 
>      [    0.042582] ACPI: SRAT: Node 1 PXM 1 [mem 0x280000000-0x47fffffff]
> 
> Full transcript with all three memmap-type variants side by side:
> https://lore.kernel.org/qemu-devel/20260226105023.256568-1-FangSheng.Huang@amd.com/
> 
> The bigger picture -- real-world context that drove the design:

bigger picture should be somewhere in commit message so later on
a reader could understand why we are doing it at all/this way.
 
lets continue with questions wrt impl.

> The use case is GPU/accelerator HBM exposed to the OS as SPM.  On
> bare metal, the platform firmware:
> 
>    - emits E820 type 0xEFFFFFFF (SOFT_RESERVED) for the HBM region;
>    - emits ACPI SRAT memory affinity entries that bind HBM to a
>      dedicated proximity domain (NUMA node);
>    - tags the accelerator's PCI device with _PXM matching that node.
> 
> That gives the device driver a stable lookup chain at runtime:
> 
>      dev -> pci_dev_to_node(dev) -> SRAT walk -> HBM GPA range

it looks kind of convoluted, doesn't it?
PCI devices were supposed to be self-describing/discoverable,
preferably without the above-mentioned firmware 'hooks'.
The above example could be just an early implementation issue, rather than
a by-design issue.

> NUMA node here is not incidental -- it is the OS-exposed
> intermediary ID that the device driver uses to find its own HBM.
> This is the in-tree path used by accelerator drivers today.

I'm assuming the GPU is exposed as some composite PCI/CXL device,
and the use case is its pass-through to the guest.

Perhaps we can't do anything about it now.
But shouldn't the device driver discover its own memory (HBM and whatnot)
without external parties that magically gain knowledge about parts of the
device that the driver supposedly driving the device has no clue about?
How does the BIOS know about SPM when the device's driver, with knowledge of
device internals, knows nothing about it?
 
> The "-numa node + memmap-type=spm + E820 SOFT_RESERVED" combo in
> v7 is a direct 1:1 model of this BM topology.  The E820 retyping
> in the patch is exactly what makes the guest-visible E820 match
> what BM firmware emits for the same kind of region.
> 
> On the DIMM / device-memory alternative:

wrt modeling GPU pass-through, my 1st attempt would be
to make -device gpu-foo take everything needed to compose the device
(like in real hw) and be done with it (and PCI/CXL machinery would
take care of mapping/exposing memory to the guest).
Why aren't we doing it?

barring that, and assuming we have to pass SPM as a separate memory
(why, and why should it be exposed in E820 and at boot time only?)
I'd try the -device foo-memory approach.

> David pointed this out in the v6 thread, and Gregory's reply in
> this thread reinforces the same point -- DIMM / NVDIMM ranges are
> described in E820 only as the hotplug area.  SPM needs to be in
> the boot E820 from the start so the OS classifies it as SP and
> treats it accordingly.  Going via DIMM would also detach the
> memory from the NUMA topology (no SRAT entry tied to the device's
> _PXM), which breaks the dev -> node -> SRAT -> HBM lookup the
> driver relies on.

Whether we should bend modeling to driver behavior is questionable.
But I don't know nearly enough about the subject; it could be a parallel discussion.
But we need to capture the 'why' somewhere in the commit message, to give
a justification for going with the pass-through-as-separate-memory approach.
 
For now let's leave it alone.

wrt my suggestion using memory-device.
It's true that the device memory region has started as hotpluggable memory.
But that's impl. detail, nothing fundamentally prevents us from
describing mix of present at boot time memory devices within it in e820/SRAT.
 
Answer to why DIMMs aren't in e820 was for us to avoid dealing with
linux kernel putting that memory into zone_normal instead of zone_movable.
On real hardware, one is likely to see all present at boot dimms, in e820
and SRAT.
For already existing memory devices, I'd like us continue dodging e820,
so we wouldn't break existing deployments. however for a new memory device
we don't have such limitations.

What I'd try is:
 1. inherit spm-memory device from memory-device
       (all memory mapping and ACPI memory device descriptors can be made
        to pick it up along with DIMM devices)
 2. figure out why the device driver has to fetch the memory map
    and proximity from static tables as opposed to getting it dynamically
    from _PXM -> mapped-memory range. (at the time PCI device enumeration runs,
    all ACPI info, incl. the runtime one, is fully accessible to in-kernel users)
    i.e. try to make the driver work with runtime proximity
 3. if #2 is impossible, we can try to expose SPM memory devices in e820,
    and partition SRAT to match the actual device_memory region layout.

 
> Happy to dig into any of this further, or to reshape parts you
> still see as too hacky.
> 
> Best regards,
> FangSheng Huang (Jerry)
> >   
> 



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 1/1] numa: add 'memmap-type' option for memory type configuration
  2026-05-14 13:38     ` Gregory Price
@ 2026-05-18  8:15       ` David Hildenbrand (Arm)
  0 siblings, 0 replies; 12+ messages in thread
From: David Hildenbrand (Arm) @ 2026-05-18  8:15 UTC (permalink / raw)
  To: Gregory Price, Igor Mammedov
  Cc: fanhuang, qemu-devel, jonathan.cameron, apopple, dan.j.williams,
	Zhigang.Luo, Lianjie.Shi

On 5/14/26 15:38, Gregory Price wrote:
> On Thu, May 14, 2026 at 03:05:59PM +0200, Igor Mammedov wrote:
>>
>> I don't recall but I likely asked before
>> why not use device memory instead for it (aka DIMM device or some device derived
>> from device memory object and then add e820 entry for it).
>>
>> It would be a way more simpler approach and impl. without need to resplit
>> anything in e820.
>> And no need for messing with firmware (SeaBIOS: RamSizeOver4G patch) nor EDK2.
>>
> 
> David previously addressed your question on the original patch version:
> 
> https://lore.kernel.org/qemu-devel/6e7ad90d-a467-40cc-99fa-d0915438dd05@redhat.com/
> 
>   I wondered the same in my reply: I'm afraid it cannot be a DIMM/NVDIMM,
>   these ranges are only described in E820 as "hotplug area".
> 
>   I think it must be something that's present in the memory map right from
>   the start, where the OS would identify it as SP and treat it accordingly.
> 
> 
> We're trending towards devices being given dedicated nodes for their
> memory, so this actually makes sense as an extension to NUMA.

Yes.

-- 
Cheers,

David


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 1/1] numa: add 'memmap-type' option for memory type configuration
  2026-05-15 13:04       ` Igor Mammedov
@ 2026-05-18 10:43         ` Huang, FangSheng (Jerry)
  2026-05-18 14:32           ` Igor Mammedov
  0 siblings, 1 reply; 12+ messages in thread
From: Huang, FangSheng (Jerry) @ 2026-05-18 10:43 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: qemu-devel, david, gourry, jonathan.cameron, apopple,
	dan.j.williams, Zhigang.Luo, Lianjie.Shi, David Hildenbrand

Hi Igor,

Thanks again for the careful read.

On 5/15/2026 9:04 PM, Igor Mammedov wrote:
> On Fri, 15 May 2026 15:53:07 +0800
> "Huang, FangSheng (Jerry)" <FangSheng.Huang@amd.com> wrote:
> 
>> On 5/14/2026 9:05 PM, Igor Mammedov wrote: 
>>
>> Hi Igor,
>>
>> Thanks for taking the time to review this -- and for the candor in
>> the bashing chapter.  Before going into the bigger picture, let me
>> re-establish one factual point that v7 didn't carry forward from
>> the v6 cover letter.
> 
> feel free to bash my review as well, I hope that we end up with
> clear picture what and why we are doing.
> 
Let me take your points in order: first the bigger-picture context
you asked for (which I'll also put in v8's commit message), then
the dax/kmem compatibility state, the prior thread on this CLI
shape, and where this could go longer-term.

> 
> bigger picture should be somewhere in commit message so later on
> a reader could understand why we are doing it at all/this way.
>   
> lets continue with questions wrt impl.
> 
Agreed, v8 will carry a condensed version of the bigger picture
in the commit message body.  Setting the architectural context
out in full here first -- the rest of the reply depends on it.

(1) The architectural context: HBM is system memory in this class
of systems

This patch targets coherent CPU+accelerator shared-address-space
systems, where the accelerator's high-bandwidth memory is no
longer a device-private framebuffer behind a PCIe BAR.  It lives
in the host physical address space -- visible to the CPU as
system memory (not as device MMIO) and shared coherently with
the accelerator over the platform fabric.  Both sides must keep
consistent views of those pages or the coherent contract breaks.

>> The use case is GPU/accelerator HBM exposed to the OS as SPM.  On
>> bare metal, the platform firmware:
>>
>>     - emits E820 type 0xEFFFFFFF (SOFT_RESERVED) for the HBM region;
>>     - emits ACPI SRAT memory affinity entries that bind HBM to a
>>       dedicated proximity domain (NUMA node);
>>     - tags the accelerator's PCI device with _PXM matching that node.
>>
>> That gives the device driver a stable lookup chain at runtime:
>>
>>       dev -> pci_dev_to_node(dev) -> SRAT walk -> HBM GPA range
> 
> it looks kind of convoluted, isn't it.
> PCI devices were supposed to be self describing/discoverable.
> Preferably without above mentioned firmware 'hooks'.
> Above example could be just early impl. issues, rather than by
> design issue.
> 
>> NUMA node here is not incidental -- it is the OS-exposed
>> intermediary ID that the device driver uses to find its own HBM.
>> This is the in-tree path used by accelerator drivers today.
> 
> I'm assuming GPU is exposed as some composite PCI/CXL device.
> and use-case is its pass-through to guest.
> 
> Perhaps we can't do anything about it now.
> But shouldn't device driver discover its own memory (HBM and what not)
> without external parties that magically gain knowledge about parts of
> device that driver supposedly driving the device has not a clue about?
> How doesn bios know about SPM when device's driver with knowledge of
> device internals knows nothing about?
>   
To be blunt about it: in this architecture, "the device alone
knows where its own memory is" doesn't apply.  The accelerator
package knows it has HBM and how much, but the GPA it's mapped to
is set by platform firmware during boot fabric training, in
coordination with the CPU host bridge and the fabric coherency
hardware.  Firmware "knows the GPA" because firmware authored
that binding, not because some external party magically learned
the device's internals; every party including the driver has to
conform to it.

That makes the OS-level requirement twofold: the memory has to be
visible in the system memory map (so the CPU side can address it)
and reserved exclusively for the accelerator's driver (so the
general allocator doesn't hand HBM pages to random workloads).
The SP marking therefore has to reach the CPU memory subsystem,
not just the driver.  EFI_MEMORY_SP / E820 SOFT_RESERVED + SRAT
memory-affinity is the mechanism that fits this: a system-level,
firmware-produced topology that the CPU memory subsystem honors
and that the accelerator's driver consumes to discover and
manage its HBM range.  Any path that emits only an ACPI namespace
node, without E820 SOFT_RESERVED, leaves the CPU memory subsystem
without that signal and the contract breaks; any path that does
also emit E820 SOFT_RESERVED restores it.

>> The "-numa node + memmap-type=spm + E820 SOFT_RESERVED" combo in
>> v7 is a direct 1:1 model of this BM topology.  The E820 retyping
>> in the patch is exactly what makes the guest-visible E820 match
>> what BM firmware emits for the same kind of region.
>>
>> On the DIMM / device-memory alternative:
> 
> wrt modeling GPU pass-through, my 1st attempt would be
> to make -device gpu-foo take everything need to compose the device
> (like in real hw) and be done with it (and PCI/CXL machinery would
> take care of mapping/exposing memory to guest).
> Why we aren't doing it?
> 
> barring that, and assuming we have to pass SPM as a separate memory
> (why and why it should be exposed in E820 and at boot time only?)
> I'd try -device foo-memory approach.
> 
So this isn't HBM-as-VRAM, the device-private framebuffer model
from discrete GPUs.  It's HBM as a system memory tier under
driver-owned allocation policy, and -numa node + memmap-type=spm
is the direct expression of that, not a workaround layered on top.

(2) v7 is naturally compatible with stock upstream Linux dax/kmem

v7 emits E820 SOFT_RESERVED via existing machine-init, into the
same fw_cfg blob the firmware and the guest kernel already
consume.  The upstream kernel already understands this signal --
dax/kmem is one such consumer in tree, picking the region up via
the IORES_DESC_SOFT_RESERVED hook in drivers/dax/hmem/device.c.
The accelerator's own driver discovers its HBM range from the
same E820 + SRAT entries via dev_to_node + SRAT walk.  Neither
side needs new code for v7.

For a memory-device-derived spm-memory device to reach the same
dax/kmem path, it would have to inject E820 SOFT_RESERVED from
its realize() -- which is new plumbing inside the device_memory
subsystem.

That also bears on two of your other points:
> And no need for messing with firmware (SeaBIOS: RamSizeOver4G
> patch) nor EDK2.

That holds only for the path where the driver doesn't go through
E820 SOFT_RESERVED at all -- but that path needs the in-tree
accelerator driver rewritten to walk ACPI namespace instead of
SRAT, plus a kernel-side patch to wire ACPI memory device
descriptors into dax/kmem.  For the e820+SRAT fallback path you
describe as Step 3, the firmware-side requirement is identical to
v7.

> figure out why device driver has to fetch memory map
> and proximity from static tables as opposed to getting it dynamically
> from _PXM -> maped-memory range. (at the time PCI devices enum runs, all ACPI
> info incl. run time one is fully accessible to in-kernel users)
> i.e. try to make driver work with runtime proximity

The current amdgpu lookup is already a runtime, PXM-keyed dynamic
lookup -- dev_to_node(pdev) evaluates _PXM on the PCI device,
then a SRAT memory-affinity walk matches by proximity_domain.
PXM is the matching key; SRAT is the storage venue.  What you're
suggesting is moving the (PXM -> range) mapping from a system-
level table to a per-device ACPI node.  Both are populated by
firmware at boot and queried by the driver at probe; the
difference is which kernel subsystem reads the result.  The
system-table choice is what keeps the CPU memory subsystem in the
loop, per (1).

(3) The current CLI shape reflects prior maintainer alignment

The memmap-type=[normal,spm,reserved] form itself came from
Gregory's suggestion on the v4 thread in January, where you also
noted that on AMD hardware "such memory has 1:1 mapping" -- which
is exactly the use case this patch targets.  v6 adopted that
naming, and v7 carries Acked-by: David and Reviewed-by: Gregory.

On the firmware side, OVMF SOFT_RESERVED handling is merged
upstream (edk2 commit 4e90b65e7b); the SeaBIOS-side conversation
with Gerd and Paul is converging on a small downstream knob.

So the current shape isn't a unilateral pick -- it's the form
the review converged on, with explicit tags from David and
Gregory and consistent firmware-side adoption.

> 
> wrt my suggestion using memory-device.
> It's true that the device memory region has started as hotpluggable memory.
> But that's impl. detail, nothing fundamentally prevents us from
> describing mix of present at boot time memory devices within it in e820/SRAT.
>   
> Answer to why DIMMs aren't in e820 was for us to avoid dealing with
> linux kernel putting that memory into zone_normal instead of zone_movable.
> On real hardware, one is likely to see all present at boot dimms, in e820
> and SRAT.
> For already existing memory devices, I'd like us continue dodging e820,
> so we wouldn't break existing deployments. however for a new memory device
> we don't have such limitations.
> 
> What I'd try is:
>   1: inherit spm-memory device from memory-device
>         (all memory mapping and APCI memory device descriptors, can be made
>          to pick it along with DIMM devices)
>   2: figure out why device driver has to fetch memory map
>     and proximity from static tables as opposed to getting it dynamically
>     from _PXM -> maped-memory range. (at the time PCI devices enum runs, all ACPI
>     info incl. run time one is fully accessible to in-kernel users)
>     i.e. try to make driver work with runtime proximity
>   3. if #2 is impossible, we can try to expose SPM memory devices in e820,
>       and partition SRAT to match actual device_memory region layout.
> 
>   
(4) On the longer-term direction

An spm-memory device built on memory-device infrastructure -- one
that emits the same E820 SOFT_RESERVED + SRAT memory-affinity
entries v7 produces today -- is a reasonable direction for a
separate RFC in QEMU 11.2 / 12.x.  Such an RFC would close
exactly the gap David noted earlier, that boot memory isn't
currently modeled as a memory device on the QEMU side.

One thing that has to stay in any such direction: the SRAT
memory-affinity emission -- without it the CPU memory subsystem
loses visibility into HBM and the contract from (1) breaks.
Happy to take part in that RFC when someone kicks it off, but
it's better treated as its own work than as a gate on v7.

Best regards,
FangSheng Huang (Jerry)
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 1/1] numa: add 'memmap-type' option for memory type configuration
  2026-05-18 10:43         ` Huang, FangSheng (Jerry)
@ 2026-05-18 14:32           ` Igor Mammedov
  2026-05-19  4:18             ` Huang, FangSheng (Jerry)
  0 siblings, 1 reply; 12+ messages in thread
From: Igor Mammedov @ 2026-05-18 14:32 UTC (permalink / raw)
  To: Huang, FangSheng (Jerry)
  Cc: qemu-devel, david, gourry, jonathan.cameron, apopple,
	dan.j.williams, Zhigang.Luo, Lianjie.Shi, David Hildenbrand

On Mon, 18 May 2026 18:43:10 +0800
"Huang, FangSheng (Jerry)" <FangSheng.Huang@amd.com> wrote:

> Hi Igor,
> 
> Thanks again for the careful read.
> 
> On 5/15/2026 9:04 PM, Igor Mammedov wrote:
> > On Fri, 15 May 2026 15:53:07 +0800
> > "Huang, FangSheng (Jerry)" <FangSheng.Huang@amd.com> wrote:
> >   
> >> On 5/14/2026 9:05 PM, Igor Mammedov wrote: 
> >>
> >> Hi Igor,
> >>
> >> Thanks for taking the time to review this -- and for the candor in
> >> the bashing chapter.  Before going into the bigger picture, let me
> >> re-establish one factual point that v7 didn't carry forward from
> >> the v6 cover letter.  
> > 
> > feel free to bash my review as well, I hope that we end up with
> > clear picture what and why we are doing.
> >   
> Let me take your points in order: first the bigger-picture context
> you asked for (which I'll also put in v8's commit message), then
> the dax/kmem compatibility state, the prior thread on this CLI
> shape, and where this could go longer-term.
> 
> > 
> > bigger picture should be somewhere in commit message so later on
> > a reader could understand why we are doing it at all/this way.
> >   
> > lets continue with questions wrt impl.
> >   
> Agreed, v8 will carry a condensed version of the bigger picture
> in the commit message body.  Setting the architectural context
> out in full here first -- the rest of the reply depends on it.
> 
> (1) The architectural context: HBM is system memory in this class
> of systems
> 
> This patch targets coherent CPU+accelerator shared-address-space
> systems, where the accelerator's high-bandwidth memory is no
> longer a device-private framebuffer behind a PCIe BAR.  It lives
> in the host physical address space -- visible to the CPU as
> system memory (not as device MMIO) and shared coherently with
> the accelerator over the platform fabric.  Both sides must keep
> consistent views of those pages or the coherent contract breaks.
> 
> >> The use case is GPU/accelerator HBM exposed to the OS as SPM.  On
> >> bare metal, the platform firmware:
> >>
> >>     - emits E820 type 0xEFFFFFFF (SOFT_RESERVED) for the HBM region;
> >>     - emits ACPI SRAT memory affinity entries that bind HBM to a
> >>       dedicated proximity domain (NUMA node);
> >>     - tags the accelerator's PCI device with _PXM matching that node.
> >>
> >> That gives the device driver a stable lookup chain at runtime:
> >>
> >>       dev -> pci_dev_to_node(dev) -> SRAT walk -> HBM GPA range  
> > 
> > it looks kind of convoluted, isn't it.
> > PCI devices were supposed to be self describing/discoverable.
> > Preferably without above mentioned firmware 'hooks'.
> > Above example could be just early impl. issues, rather than by
> > design issue.
> >   
> >> NUMA node here is not incidental -- it is the OS-exposed
> >> intermediary ID that the device driver uses to find its own HBM.
> >> This is the in-tree path used by accelerator drivers today.  
> > 
> > I'm assuming GPU is exposed as some composite PCI/CXL device.
> > and use-case is its pass-through to guest.
> > 
> > Perhaps we can't do anything about it now.
> > But shouldn't device driver discover its own memory (HBM and what not)
> > without external parties that magically gain knowledge about parts of
> > device that driver supposedly driving the device has not a clue about?
> > How doesn bios know about SPM when device's driver with knowledge of
> > device internals knows nothing about?
> >     
> To be blunt about it: in this architecture, "the device alone
> knows where its own memory is" doesn't apply.  The accelerator
> package knows it has HBM and how much, but the GPA it's mapped to
> is set by platform firmware during boot fabric training, in
> coordination with the CPU host bridge and the fabric coherency
> hardware.  Firmware "knows the GPA" because firmware authored
> that binding, not because some external party magically learned
> the device's internals; every party including the driver has to
> conform to it.


Without magic/hardcoding, firmware is likely to discover HBM memory
as a CXL device and add E820/SRAT entries for it. That is ideally how
it should be modeled in QEMU as well (not ad-hoc -numa foo options).

I won't push towards the feature being part of GPU pass-through,
or towards HBM being a CXL device (which it probably should be),
but read on ...
 
> That makes the OS-level requirement twofold: the memory has to be
> visible in the system memory map (so the CPU side can address it)
> and reserved exclusively for the accelerator's driver (so the
> general allocator doesn't hand HBM pages to random workloads).
> The SP marking therefore has to reach the CPU memory subsystem,
> not just the driver.  EFI_MEMORY_SP / E820 SOFT_RESERVED + SRAT
> memory-affinity is the mechanism that fits this: a system-level,
> firmware-produced topology that the CPU memory subsystem honors
> and that the accelerator's driver consumes to discover and
> manage its HBM range.  Any path that emits only an ACPI namespace
> node, without E820 SOFT_RESERVED, leaves the CPU memory subsystem
> without that signal and the contract breaks; any path that does
> also emit E820 SOFT_RESERVED restores it.
> 
> >> The "-numa node + memmap-type=spm + E820 SOFT_RESERVED" combo in
> >> v7 is a direct 1:1 model of this BM topology.  The E820 retyping
> >> in the patch is exactly what makes the guest-visible E820 match
> >> what BM firmware emits for the same kind of region.
> >>
> >> On the DIMM / device-memory alternative:  
> > 
> > wrt modeling GPU pass-through, my 1st attempt would be
> > to make -device gpu-foo take everything need to compose the device
> > (like in real hw) and be done with it (and PCI/CXL machinery would
> > take care of mapping/exposing memory to guest).
> > Why we aren't doing it?
> > 
> > barring that, and assuming we have to pass SPM as a separate memory
> > (why and why it should be exposed in E820 and at boot time only?)
> > I'd try -device foo-memory approach.
> >   
> So this isn't HBM-as-VRAM, the device-private framebuffer model
> from discrete GPUs.  It's HBM as a system memory tier under
> driver-owned allocation policy, and -numa node + memmap-type=spm
> is the direct expression of that, not a workaround layered on top.
> 
> (2) v7 is naturally compatible with stock upstream Linux dax/kmem
> 
> v7 emits E820 SOFT_RESERVED via existing machine-init, into the
> same fw_cfg blob the firmware and the guest kernel already
> consume.  The upstream kernel already understands this signal --
> dax/kmem is one such consumer in tree, picking the region up via
> the IORES_DESC_SOFT_RESERVED hook in drivers/dax/hmem/device.c.
> The accelerator's own driver discovers its HBM range from the
> same E820 + SRAT entries via dev_to_node + SRAT walk.  Neither
> side needs new code for v7.
> 
> For a memory-device-derived spm-memory device to reach the same
> dax/kmem path, it would have to inject E820 SOFT_RESERVED from
> its realize() -- which is new plumbing inside the device_memory
> subsystem.

It shouldn't be done at realize time.

Ultimately we publish the E820 table at machine_done time,
and that's the right time to iterate over present memory devices
and add relevant entries if necessary.
SRAT is produced even later
(effectively at runtime, on 1st access to the ACPI tables blob),
so it can pick up memory devices at that time as well.

> That also bears on two of your other points:
> > And no need for messing with firmware (SeaBIOS: RamSizeOver4G
> > patch) nor EDK2.  
> 
> That holds only for the path where the driver doesn't go through
> E820 SOFT_RESERVED at all -- but that path needs the in-tree
> accelerator driver rewritten to walk ACPI namespace instead of
> SRAT, plus a kernel-side patch to wire ACPI memory device
> descriptors into dax/kmem.  For the e820+SRAT fallback path you
> describe as Step 3, the firmware-side requirement is identical to
> v7.
> 
> > figure out why device driver has to fetch memory map
> > and proximity from static tables as opposed to getting it dynamically
> > from _PXM -> maped-memory range. (at the time PCI devices enum runs, 
>  > all ACPI
> > info incl. run time one is fully accessible to in-kernel users)
> > i.e. try to make driver work with runtime proximity  
> 
> The current amdgpu lookup is already a runtime, PXM-keyed dynamic
> lookup -- dev_to_node(pdev) evaluates _PXM on the PCI device,
> then a SRAT memory-affinity walk matches by proximity_domain.
> PXM is the matching key; SRAT is the storage venue.  What you're
> suggesting is moving the (PXM -> range) mapping from a system-
> level table to a per-device ACPI node.  Both are populated by
> firmware at boot and queried by the driver at probe; the
> difference is which kernel subsystem reads the result.  The
> system-table choice is what keeps the CPU memory subsystem in the
> loop, per (1).
> 
> (3) The current CLI shape reflects prior maintainer alignment
> 
> The memmap-type=[normal,spm,reserved] form itself came from
> Gregory's suggestion on the v4 thread in January, where you also
> noted that AMD hardware "such memory has 1:1 mapping" -- which
> is exactly the use case this patch targets.  v6 adopted that
> naming, and v7 carries Acked-by: David and Reviewed-by: Gregory.
> 
> On the firmware side, OVMF SOFT_RESERVED handling is merged
> upstream (edk2 commit 4e90b65e7b); the SeaBIOS-side conversation
> with Gerd and Paul is converging on a small downstream knob.
> 
> So the current shape isn't a unilateral pick -- it's the form
> the review converged on, with explicit tags from David and
> Gregory and consistent firmware-side adoption.
> 
> > 
> > wrt my suggestion using memory-device.
> > It's true that the device memory region has started as hotpluggable memory.
> > But that's impl. detail, nothing fundamentally prevents us from
> > describing mix of present at boot time memory devices within it in e820/SRAT.
> >   
> > Answer to why DIMMs aren't in e820 was for us to avoid dealing with
> > linux kernel putting that memory into zone_normal instead of zone_movable.
> > On real hardware, one is likely to see all present at boot dimms, in e820
> > and SRAT.
> > For already existing memory devices, I'd like us continue dodging e820,
> > so we wouldn't break existing deployments. however for a new memory device
> > we don't have such limitations.
> > 
> > What I'd try is:
> >   1: inherit spm-memory device from memory-device
> >         (all memory mapping and APCI memory device descriptors, can be made
> >          to pick it along with DIMM devices)
> >   2: figure out why device driver has to fetch memory map
> >     and proximity from static tables as opposed to getting it dynamically
> >     from _PXM -> maped-memory range. (at the time PCI devices enum runs, all ACPI
> >     info incl. run time one is fully accessible to in-kernel users)
> >     i.e. try to make driver work with runtime proximity
> >   3. if #2 is impossible, we can try to expose SPM memory devices in e820,
> >       and partition SRAT to match actual device_memory region layout.
> > 
> >     
> (4) On the longer-term direction

that's exactly what I'm concerned about

> An spm-memory device built on memory-device infrastructure -- one
> that emits the same E820 SOFT_RESERVED + SRAT memory-affinity
> entries v7 produces today -- is a reasonable direction for a
> separate RFC in QEMU 11.2 / 12.x.  Such an RFC would close
> exactly the gap David noted earlier, that boot memory isn't
> currently modeled as a memory device on the QEMU side.
> 
> One thing that has to stay in any such direction: the SRAT
> memory-affinity emission -- without it the CPU memory subsystem
> loses visibility into HBM and the contract from (1) breaks.
> Happy to take part in that RFC when someone kicks it off, but
> it's better treated as its own work than as a gate on v7.

If we merge the v7 approach, we are bound to support it for years,
and if we later on add a device-based variant, it will increase the
support burden even more and complicate configuration, which
in turn will propagate up the stack.

Hence, I'd like possible options on the table explored 1st
before we commit to a particular approach. 

I wouldn't object much if it were a fix that we had to rush in,
but this is not the case (are we in a hurry to rush this in?).

My hunch is that the memory-device-based approach will end up with
more straightforward and cleaner code, not to mention proper
backend/frontend modeling.

(also related to long term: ideally the existing -numa 'some memory'
interface should go away and be replaced by memory devices.
Adding more similar '-numa' options doesn't help that case and only
increases our technical debt).


> Best regards,
> FangSheng Huang (Jerry)
> >   
> 



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 1/1] numa: add 'memmap-type' option for memory type configuration
  2026-05-18 14:32           ` Igor Mammedov
@ 2026-05-19  4:18             ` Huang, FangSheng (Jerry)
  2026-05-20 12:41               ` Igor Mammedov
  0 siblings, 1 reply; 12+ messages in thread
From: Huang, FangSheng (Jerry) @ 2026-05-19  4:18 UTC (permalink / raw)
  To: Igor Mammedov, gregory Price, David Hildenbrand,
	David Hildenbrand
  Cc: qemu-devel, jonathan.cameron, apopple, dan.j.williams,
	Zhigang.Luo, Lianjie.Shi

Hi Igor,

Thanks for the concrete pointers in this round -- the machine_done /
ACPI-runtime timing notes are useful regardless of where this lands.
Replying inline.

On 5/18/2026 10:32 PM, Igor Mammedov wrote:
> On Mon, 18 May 2026 18:43:10 +0800
> "Huang, FangSheng (Jerry)" <FangSheng.Huang@amd.com> wrote:
> 
>> Hi Igor,
>>
>> Thanks again for the careful read.
>>
>> On 5/15/2026 9:04 PM, Igor Mammedov wrote:
> 
> 
> Without magic/hardcodding, firmware is likely to discover HBM memory
> as CXL device and adds E820/SRAT entries for it. That ideally how it
> should be modeled in QEMU as well. (not ad-hoc -numa foo options)
> 
> I won't push towards the feature being part of GPU pass-through,
> or being HBM being a CXL device (which it probably should be).
> but read on ...
>
Understood, and thanks for not making those preconditions.  Setting
CXL and GPU-passthrough-bundle aside for this reply, the remaining
question is whether the SPM topology is expressed on the -numa node
line (v7) or on a separate -device line (your proposal).  I'll
address that below under the mixed-memory point.

> 
> It shouldn't be done at realize time.
> 
> Ultimately we publish E820 table at machine_done time,
> and it's right time to iterate over present memory devices
> and add relevant entries if necessary.
> SRAT is produced even later
> (effectively at runtime on 1st access to acpi tables blob),
> so it can pickup memory devices at that time as well.
>
Noted -- machine_done for E820 and ACPI-tables-runtime for SRAT is
the right hook set for a memory-device-based implementation; that's
where the iteration over present memory devices would land.
(v7 itself sets the E820 entries during pc machine-init, which is
the appropriate hook for the topology-driven -numa path -- there's
no per-device realize step whose results would need to be iterated
over.)

>> (4) On the longer-term direction
> 
> that's exactly what I'm concerned about
> 
> 
> If we merge v7 approach, we are bound to support it for years
> and if we later on we add device based variant, it will increase
> support burden even more and complicate configuration, which
> in turn will propagate up the stack.
> 
> Hence, I'd like possible options on the table explored 1st
> before we commit to a particular approach.
> 
> I wouldn't object much is it were a fix that we had to rush in,
> but this is not the case (are we in hurry to rush this in?).
>
This is where I'd like to push back, on three connected points: the
mixed-memory question that the v4 thread already converged on; the
relationship between v7 and a future spm-memory device; and the
review timeline.

(1) The mixed-memory question -- v4 thread carries directly over

In the v4 round, an -object/backend-property variant of this feature
was set aside after pushback from Gregory and David on a single
concern: allowing a single NUMA node to mix SPM and normal memory.
The current memmap-type=[normal,spm,reserved] form was Gregory's
suggestion in that same discussion, and v7 carries Acked-by from
David and Reviewed-by from Gregory.

A -device spm-memory model reopens that question.  A memory device
is plugged into a target NUMA node via its node= parameter; unless
additional constraints are added, that node can also have normal
memory attached via -numa node,memdev=.  Adding a runtime check
that errors out on mixed-node configurations is possible, but it
changes the failure mode from "read your command line and see which
node is SPM" to "assemble a command line that looks fine, hit a
realize-time error, edit, retry."  For users composing multi-node
topologies on the CLI (or via libvirt-generated cmdlines), that's a
meaningful UX regression.

The whole-node typing in v7 expresses the SPM-vs-normal property
declaratively, on the same -numa line that already carries the
node's other attributes -- which is the form the v4 review
converged on, precisely to keep this kind of constraint visible at
the configuration layer rather than at realize time.

(Gregory, David: this is exactly the regression you raised in v4
that pushed the series away from -object form.  Would value your
read on whether a -device spm-memory variant raises it again, and
what an acceptable resolution would look like in that form.)

(2) v7 and a future spm-memory device aren't mutually exclusive

I don't see anything in v7 that conflicts with the device-based
direction you sketched.  v7 establishes the user-visible semantic
(whole-node SPM, with E820 SOFT_RESERVED + SRAT memory-affinity
emission) and the firmware-side compatibility (OVMF already
upstream, SeaBIOS in flight).  A future spm-memory device built on
memory-device infrastructure can target the same E820 + SRAT
emission and either co-exist with v7 (different idioms for the same
underlying semantic) or, if and when it proves out, subsume v7 via
the standard QEMU deprecation flow.

Three observations on the "merging v7 means committing to support
burden for years" framing:

(A) On your own framing, the existing -numa node,memdev= interface
     for attaching memory to NUMA nodes is itself on a long
     deprecation arc toward memory-device.  v7's memmap-type= is a
     sibling attribute on the same -numa node configuration, sharing
     the same lifetime envelope; it doesn't create a new long-term
     contract beyond what -numa node,memdev= already commits us to.

(B) v7 is the interim shape that a future spm-memory device can
     subsume.  If the device-based variant lands later and proves
     out, v7's memmap-type=spm can be marked deprecated under the
     standard two-cycle policy and removed.  That's the same path
     we'd take for any of the -numa node,memdev= family.  Treating
     v7 as a permanent commitment overstates the contract.

(C) The marginal code surface of v7 is small: a single new -numa
     node attribute that routes into the existing E820 plumbing in
     machine-init.  The marginal maintenance cost is bounded by
     that surface.  Compare against the cost of holding the in-tree
     use case while a memory-device-based prototype is designed,
     implemented, reviewed and stabilised.

(3) The review timeline

I want to be transparent about this rather than leave it implied.
This series has been in upstream review for roughly 1.5 years.  The
v5, v6 and v7 cycles went through with Acked-by from David and
Reviewed-by from Gregory; your previous engagement on this thread
was on the v4 round in January.  v7 was posted with the QEMU 11.1
window in mind, and the soft freeze is 7 July.

I'm raising this not to dismiss the technical questions you've put
on the table -- those are worth a separate RFC and I'd be glad to
take part -- but because gating v7 on prototyping a memory-device-
based variant first effectively pushes this past 11.1, and the
production deployment timeline that this series enables doesn't
accommodate that.  If the in-tree review on v7 itself had surfaced
this direction earlier, we'd be in a different place; at this point
in the cycle, the cleanest path is to land v7 as it stands (with
the tags it already carries) and pursue the device-based variant as
its own RFC.

> My hunch is that memory device based approach will end-up with
> more straightforward and cleaner code, not to mention proper
> backend/frontend modeling.
> 
> (also related to long term: ideally existing -numa 'some memory' interface,
> should go away and be replaced by memory devices.
> Adding more similar '-numa' options, doesn't help that case and only
> increases out technical debt).
> 
>
On the cleaner-code hunch -- agreed in principle for a green-field
design.  On the -numa-should-go-away point -- agreed as a long-term
direction; that's exactly why v7 sits within the existing -numa
node,memdev= lifetime envelope rather than committing us to a new
one (point 2A above).

I'd be glad to take part in the spm-memory RFC and in the wider
migration of the -numa node,memdev= interface family to
memory-device that you sketched.  Realistically, there are
architecture-level questions still open on our side that pace what
we can take on -- and landing v7 in 11.1 helps here by giving us a
stable in-tree baseline to iterate forward from.

My ask on this thread is: that v7 lands for 11.1 on the tags it
already carries, and that the device-based direction proceeds as
separate, parallel work.

Best regards,
FangSheng Huang (Jerry)

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 1/1] numa: add 'memmap-type' option for memory type configuration
  2026-05-19  4:18             ` Huang, FangSheng (Jerry)
@ 2026-05-20 12:41               ` Igor Mammedov
  0 siblings, 0 replies; 12+ messages in thread
From: Igor Mammedov @ 2026-05-20 12:41 UTC (permalink / raw)
  To: Huang, FangSheng (Jerry)
  Cc: gregory Price, David Hildenbrand, David Hildenbrand, qemu-devel,
	jonathan.cameron, apopple, dan.j.williams, Zhigang.Luo,
	Lianjie.Shi

On Tue, 19 May 2026 12:18:19 +0800
"Huang, FangSheng (Jerry)" <FangSheng.Huang@amd.com> wrote:

[...] trimming points we agree upon; the rest is inline.

> > If we merge v7 approach, we are bound to support it for years
> > and if we later on we add device based variant, it will increase
> > support burden even more and complicate configuration, which
> > in turn will propagate up the stack.
> > 
> > Hence, I'd like possible options on the table explored 1st
> > before we commit to a particular approach.
> > 
> > I wouldn't object much is it were a fix that we had to rush in,
> > but this is not the case (are we in hurry to rush this in?).
> >  
> This is where I'd like to push back, on three connected points: the
> mixed-memory question that the v4 thread already converged on; the
> relationship between v7 and a future spm-memory device; and the
> review timeline.
> 
> (1) The mixed-memory question -- v4 thread carries directly over
> 
> In the v4 round, an -object/backend-property variant of this feature
> was set aside after pushback from Gregory and David on a single
> concern: allowing a single NUMA node to mix SPM and normal memory.
> The current memmap-type=[normal,spm,reserved] form was Gregory's
> suggestion in that same discussion, and v7 carries Acked-by from
> David and Reviewed-by from Gregory.
> 
> A -device spm-memory model reopens that question.  A memory device
> is plugged into a target NUMA node via its node= parameter; unless
> additional constraints are added, that node can also have normal
> memory attached via -numa node,memdev=.  Adding a runtime check
> that errors out on mixed-node configurations is possible, but it
> changes the failure mode from "read your command line and see which
> node is SPM" to "assemble a command line that looks fine, hit a
> realize-time error, edit, retry."  For users composing multi-node
> topologies on the CLI (or via libvirt-generated cmdlines), that's a
> meaningful UX regression.

This is an impl. detail, not a fundamental limitation of the
device-based approach.  An spm-memory device could own the entire
NUMA node's memory if desired.
Whether it must be the only memory on the node is questionable;
it might be so atm, but in the future that can change. I'd rather not
add a limitation at the interface level (CLI), to keep our options open.
From a fundamental pov, it's plausible to have multiple SPMs on the
same node, or a mixed config.

Also, realize-time errors are not a UX regression -- that's how
every other memory device in QEMU works today (pc-dimm, virtio-mem,
etc.). Users and management layers (libvirt) already handle those.

> The whole-node typing in v7 expresses the SPM-vs-normal property
> declaratively, on the same -numa line that already carries the
> node's other attributes -- which is the form the v4 review
> converged on, precisely to keep this kind of constraint visible at
> the configuration layer rather than at realize time.
> 
> (Gregory, David: this is exactly the regression you raised in v4
> that pushed the series away from -object form.  Would value your
> read on whether a -device spm-memory variant raises it again, and
> what an acceptable resolution would look like in that form.)

That's what we already do for devices that have NUMA awareness,
including some memory devices. (I'd say it's actually a preferable
approach to extending -numa to support edge cases
(in this case a hack, since we have no idea/clue how to handle it
using CXL/vfio)).

More to the point: -numa node memory configuration exists
primarily for built-in RAM, and remains for legacy reasons.
There are no immediate plans to transition it to device-based
memory, but the long-term direction is clear.  SPM/HBM is
fundamentally device memory -- it's a separate, distinct
memory resource, not part of the machine's built-in RAM.
Adding it to the -numa interface when a cleaner device-based
alternative exists doesn't make much sense from a technical pov. 
It would be forcing a device concept onto an interface that
wasn't designed for it, when we already have infrastructure
(memory-device) that is designed for it.

> (2) v7 and a future spm-memory device aren't mutually exclusive
> 
> I don't see anything in v7 that conflicts with the device-based
> direction you sketched.  v7 establishes the user-visible semantic
> (whole-node SPM, with E820 SOFT_RESERVED + SRAT memory-affinity
> emission) and the firmware-side compatibility (OVMF already
> upstream, SeaBIOS in flight).  A future spm-memory device built on
> memory-device infrastructure can target the same E820 + SRAT
> emission and either co-exist with v7 (different idioms for the same
> underlying semantic) or, if and when it proves out, subsume v7 via
> the standard QEMU deprecation flow.

This is exactly the pattern I want to avoid.

Once v7 lands, libvirt and other management layers will adopt the
-numa memmap-type= interface.  At that point, deprecating it
becomes practically impossible. We'd end up maintaining two parallel
interfaces for the same thing for the foreseeable future.

The "land now, deprecate later" argument sounds reasonable in theory,
but in practice we've seen how this plays out: a 'temporary' interface
is hard to remove once the genie is out of the bottle.

> Three observations on the "merging v7 means committing to support
> burden for years" framing:
> 
> (A) On your own framing, the existing -numa node,memdev= interface
>      for attaching memory to NUMA nodes is itself on a long
>      deprecation arc toward memory-device.  v7's memmap-type= is a
>      sibling attribute on the same -numa node configuration, sharing
>      the same lifetime envelope; it doesn't create a new long-term
>      contract beyond what -numa node,memdev= already commits us to.

see above.
+
"Sharing the same lifetime envelope" is another way of saying
"adding to the technical debt", and putting the burden on somebody
else to solve the problem later.

> (B) v7 is the interim shape that a future spm-memory device can
>      subsume.  If the device-based variant lands later and proves
>      out, v7's memmap-type=spm can be marked deprecated under the
>      standard two-cycle policy and removed.  That's the same path
>      we'd take for any of the -numa node,memdev= family.  Treating
>      v7 as a permanent commitment overstates the contract.

See above.
I'm not overstating the contract, I'm being realistic about what
'deprecate later' means when management stacks have already adopted
the interface.

> (C) The marginal code surface of v7 is small: a single new -numa
>      node attribute that routes into the existing E820 plumbing in
>      machine-init.  The marginal maintenance cost is bounded by
>      that surface.  Compare against the cost of holding the in-tree
>      use case while a memory-device-based prototype is designed,
>      implemented, reviewed and stabilised.

The maintenance cost isn't just the code in QEMU, it's the
interface contract with the rest of the stack. And that quickly
becomes a nightmare if you try to change it later on.

> 
> (3) The review timeline
> 
> I want to be transparent about this rather than leave it implied.
> This series has been in upstream review for roughly 1.5 years.  The
> v5, v6 and v7 cycles went through with Acked-by from David and
> Reviewed-by from Gregory; your previous engagement on this thread
> was on the v4 round in January.  v7 was posted with the QEMU 11.1
> window in mind, and the soft freeze is 7 July.
> 
> I'm raising this not to dismiss the technical questions you've put
> on the table -- those are worth a separate RFC and I'd be glad to
> take part -- but because gating v7 on prototyping a memory-device-
> based variant first effectively pushes this past 11.1, and the
> production deployment timeline that this series enables doesn't
> accommodate that.  If the in-tree review on v7 itself had surfaced
> this direction earlier, we'd be in a different place; at this point
> in the cycle, the cleanest path is to land v7 as it stands (with
> the tags it already carries) and pursue the device-based variant as
> its own RFC.

I understand it's annoying when something takes a long time to
converge and I appreciate the patience and effort you've put
into iterating on this series. 
But designing interfaces is hard, and on occasion it does take a long time. 
You're not the only one; others, including myself, have been there. And likely
will be there again when the topic warrants it. I'd suggest you keep exploring
the suggested direction until it converges.

Upstream doesn't have release deadlines that override design concerns,
and production deployment timelines are downstream constraints, not upstream's.
The focus should be on getting a sustainable, maintainable design, not on
fitting a release window. The length of a review cycle doesn't entitle
a merge if the design direction isn't settled.

I'm not asking for a fully polished device-based implementation before
anything can land. I have suggested it at v4 review but that was dismissed.
What I'm asking is that we explore the device-based approach enough to understand
whether it works before we commit to an interface that will be hard to walk away from.
An RFC with a rough prototype would go a long way.

> > My hunch is that memory device based approach will end-up with
> > more straightforward and cleaner code, not to mention proper
> > backend/frontend modeling.
> > 
> > (also related to long term: ideally existing -numa 'some memory' interface,
> > should go away and be replaced by memory devices.
> > Adding more similar '-numa' options, doesn't help that case and only
> > increases our technical debt).
> > 
> >  
> On the cleaner-code hunch -- agreed in principle for a green-field
> design.  On the -numa-should-go-away point -- agreed as a long-term
> direction; that's exactly why v7 sits within the existing -numa
> node,memdev= lifetime envelope rather than committing us to a new
> one (point 2A above).
> 
> I'd be glad to take part in the spm-memory RFC and in the wider
> migration of the -numa node,memdev= interface family to
> memory-device that you sketched.  Realistically, there are
> architecture-level questions still open on our side that pace what
> we can take on -- and landing v7 in 11.1 helps here by giving us a
> stable in-tree baseline to iterate forward from.
> 
> My ask on this thread is: that v7 lands for 11.1 on the tags it
> already carries, and that the device-based direction proceeds as
> separate, parallel work.

My position is: let's do the exploration first, then decide which interface
to commit to.  If the device-based approach turns out to have fundamental problems,
I'm open to revisiting. But let's find that out before we merge, not after.

PS:
After all, I'm not asking to rewrite half QEMU before doing your own thing,
as is customary here (someone has to work on existing tech debt).

IMHO, it's really not worth fighting for the memmap-type option approach. The time
is better spent on respinning it as a memory device.

PS2:
A quick prototype based on dimm yields a usable experiment at half the size
of this patch (i.e. easier to read/reason about). A dedicated spm-memory based
memory-device will likely be a bit more due to boilerplate to create a new
device but SPM specific parts will be rather compact and self contained.

On firmware side, one would need to make it pass-through soft-reserved from
QEMU's E820 to OSPM but that's it, no need to deal with where memory ends
nor with reserved-end. It's all already accounted for by existing code base.

PS3:
I'd suggest to drop RESERVED as it's unused. All we need for device drivers
is SOFT memory, isn't it?

PS4:
I can put v8 on top of my review queue to follow up while the problem is
still fresh in memory.

> Best regards,
> FangSheng Huang (Jerry)
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 1/1] numa: add memmap-type option for memory type configuration
@ 2026-05-21 10:41 Huang, FangSheng (Jerry)
  2026-05-21 14:07 ` Igor Mammedov
  0 siblings, 1 reply; 12+ messages in thread
From: Huang, FangSheng (Jerry) @ 2026-05-21 10:41 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: qemu-devel, David Hildenbrand, Gregory Price, Zhigang Luo,
	Lianjie Shi

Hi Igor,

Thanks for the candor in this round, and for laying out the
architectural reasoning rather than just the position. The framing is
clear: an RFC with a rough prototype demonstrating the device-based
approach, and the interface decision after that exploration
converges. That's a reasonable bar and I'm glad to converge on it.

Two of your clarifications particularly helped me converge:

- SPM belongs to the memory-device family by design, not as an
   extension of built-in RAM.

- The v4 -object/backend-property rejection is scoped to that form,
   not to any device-based shape -- so `-device spm-memory` doesn't
   carry the v4 risk forward.

One thing I should mention while we're aligning: in parallel with
this thread, I've actually been prototyping the spm-memory device
direction you outlined. I held off bringing it into the v7
discussion because I didn't want to muddy the interface debate with
implementation specifics before the direction was settled. Now that
you've made the path explicit, I can share what I've found and move
the discussion to the v8 / RFC thread properly.

A short prototype status and a couple of architectural findings
below. Some you've likely thought through already; raising them so
the v8 thread has a starting point.

(1) Prototype status

A working `spm-memory` prototype inheriting from TYPE_MEMORY_DEVICE
-- end-to-end verified on both SeaBIOS and OVMF across single,
multi-instance, and mixed-with-pc-dimm scenarios.

(2) Umbrella overlap finding + proposed mitigation for your read

The umbrella SRAT entry at acpi-build.c:1510-1515 (PXM =
nb_numa_nodes - 1, covering the full device_memory length per
pc.c:615) overlaps every per-device entry by construction. Any
driver that first-match-by-PXM via SRAT walk lands on the
umbrella's range rather than the device's actual range.

Mitigation in the prototype: in acpi-build.c, skip the umbrella
when every plugged memory device is TYPE_SPM_MEMORY. Empty
device_memory and mixed configs (SPM + pc-dimm / nvdimm /
virtio-mem) keep the umbrella, preserving Windows hotplug /
Linux <4G SWIOTLB. Verified in both directions. Honest scoping:
mixed mode still has the overlap, so this is a partial
mitigation.

(3) Process

I'll prepare a v8 PATCH series along the lines you sketched:

   - spm-memory device class (TYPE_MEMORY_DEVICE base for first cut)
   - drop the `reserved` enum value
   - commit message with the bigger-picture rationale

I'll post the RFC on qemu-devel under a fresh subject so the 
device-based discussion can start clean.

Setting the v7 memmap-type discussion aside accordingly. Thanks
again for the patience.

Best regards,
FangSheng Huang (Jerry)

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 1/1] numa: add memmap-type option for memory type configuration
  2026-05-21 10:41 [PATCH v7 1/1] numa: add memmap-type option for memory type configuration Huang, FangSheng (Jerry)
@ 2026-05-21 14:07 ` Igor Mammedov
  0 siblings, 0 replies; 12+ messages in thread
From: Igor Mammedov @ 2026-05-21 14:07 UTC (permalink / raw)
  To: Huang, FangSheng (Jerry)
  Cc: qemu-devel, David Hildenbrand, Gregory Price, Zhigang Luo,
	Lianjie Shi

On Thu, 21 May 2026 18:41:35 +0800
"Huang, FangSheng (Jerry)" <FangSheng.Huang@amd.com> wrote:

> Hi Igor,
> 
> Thanks for the candor in this round, and for laying out the
> architectural reasoning rather than just the position. The framing is
> clear: an RFC with a rough prototype demonstrating the device-based
> approach, and the interface decision after that exploration
> converges. That's a reasonable bar and I'm glad to converge on it.
> 
> Two of your clarifications particularly helped me converge:
> 
> - SPM belongs to the memory-device family by design, not as an
>    extension of built-in RAM.


> - The v4 -object/backend-property rejection is scoped to that form,
>    not to any device-based shape -- so `-device spm-memory` doesn't
>    carry the v4 risk forward.

Not sure what you are talking about; with the device approach you will use
a backend property ('memdev' if I recall correctly) to connect the backend
(whatever it might be) to the front-end (device model).

> 
> One thing I should mention while we're aligning: in parallel with
> this thread, I've actually been prototyping the spm-memory device
> direction you outlined. I held off bringing it into the v7
> discussion because I didn't want to muddy the interface debate with
> implementation specifics before the direction was settled. Now that
> you've made the path explicit, I can share what I've found and move
> the discussion to the v8 / RFC thread properly.
> 
> A short prototype status and a couple of architectural findings
> below. Some you've likely thought through already; raising them so
> the v8 thread has a starting point.
> 
> (1) Prototype status
> 
> A working `spm-memory` prototype inheriting from TYPE_MEMORY_DEVICE
> -- end-to-end verified on both SeaBIOS and OVMF across single,
> multi-instance, and mixed-with-pc-dimm scenarios.
> 
> (2) Umbrella overlap finding + proposed mitigation for your read
> 
> The umbrella SRAT entry at acpi-build.c:1510-1515 (PXM =
> nb_numa_nodes - 1, covering the full device_memory length per
> pc.c:615) overlaps every per-device entry by construction. Any
> driver that first-match-by-PXM via SRAT walk lands on the
> umbrella's range rather than the device's actual range.
> 
> Mitigation in the prototype: in acpi-build.c, skip the umbrella
> when every plugged memory device is TYPE_SPM_MEMORY. Empty
> device_memory and mixed configs (SPM + pc-dimm / nvdimm /
> virtio-mem) keep the umbrella, preserving Windows hotplug /
> Linux <4G SWIOTLB. Verified in both directions. Honest scoping:
> mixed mode still has the overlap, so this is a partial
> mitigation.

I'd suggest to:
  1: make spm-memory not hotpluggable
  2: when SRAT is built, partition the 'umbrella region' into chunks
    (current hotplug kind and spm kind). That will fragment the region
    and limit what can be (hot)plugged later on (especially if spm is in the middle),
    but that will be the edge case, and the user can reconfigure QEMU to
    put spm 1st and DIMMs on top.

If we do it like this, then mixed scenarios should work just fine.
 
> 
> (3) Process
> 
> I'll prepare a v8 PATCH series along the lines you sketched:
> 
>    - spm-memory device class (TYPE_MEMORY_DEVICE base for first cut)
>    - drop the `reserved` enum value
>    - commit message with the bigger-picture rationale

ps:
it would be nice to put references to previous versions in the cover letter,
so it wouldn't be a pain to find them (especially when threads are renamed)

> 
> I'll post the RFC on qemu-devel under a fresh subject so the 
> device-based discussion can start clean.
> 
> Setting the v7 memmap-type discussion aside accordingly. Thanks
> again for the patience.
> 
> Best regards,
> FangSheng Huang (Jerry)
> 



^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2026-05-21 14:08 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2026-05-21 10:41 [PATCH v7 1/1] numa: add memmap-type option for memory type configuration Huang, FangSheng (Jerry)
2026-05-21 14:07 ` Igor Mammedov
  -- strict thread matches above, loose matches on Subject: below --
2026-03-06  8:27 [PATCH v7 0/1] numa: add 'memmap-type' " fanhuang
2026-03-06  8:27 ` [PATCH v7 1/1] " fanhuang
2026-05-14 13:05   ` Igor Mammedov
2026-05-14 13:38     ` Gregory Price
2026-05-18  8:15       ` David Hildenbrand (Arm)
2026-05-15  7:53     ` Huang, FangSheng (Jerry)
2026-05-15 13:04       ` Igor Mammedov
2026-05-18 10:43         ` Huang, FangSheng (Jerry)
2026-05-18 14:32           ` Igor Mammedov
2026-05-19  4:18             ` Huang, FangSheng (Jerry)
2026-05-20 12:41               ` Igor Mammedov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.