* [PATCH 0/3] ppc/pnv: Support sparse NUMA memory addresses
From: Nicholas Piggin @ 2025-03-03 10:07 UTC
  To: qemu-ppc
  Cc: Nicholas Piggin, qemu-devel, Eduardo Habkost, Marcel Apfelbaum,
	Philippe Mathieu-Daudé, Yanan Wang, Zhao Liu,
	Frédéric Barrat, Igor Mammedov

IBM Power machines put NUMA node RAM at fixed addresses per node,
rather than packing it densely from address 0. This series adds support
for that layout by allowing machines to override the core NUMA memory
container packing and add their own RAM regions to the system memory
space.
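
In condensed form (the real code is in patches 2 and 3; take this as a
sketch of the mechanism rather than the final interface, and note the
per-node base address shown mirrors the pnv policy from patch 3), a
machine opts out of the dense container and places each node's RAM
itself:

    /* MachineClass init: don't pack NUMA memdevs into MachineState.ram */
    mc->numa_skip_ram_container = true;

    /* Machine init: map each node's RAM at a machine-chosen base address */
    for (int i = 0; i < machine->numa_state->num_nodes; i++) {
        MemoryRegion *mr = machine->numa_state->nodes[i].node_mr;

        if (mr) {
            memory_region_add_subregion(get_system_memory(),
                                        (uint64_t)i << mc->numa_mem_align_shift,
                                        mr);
        }
    }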

Thanks,
Nick

Nicholas Piggin (3):
  ppc/pnv: Add support for NUMA configuration
  hw/core/numa: add attribute to skip creation of MachineState.ram
    region
  ppc/pnv: Enable sparse chip RAM memory addresses

 include/hw/boards.h   |   6 ++
 include/system/numa.h |   1 +
 hw/core/numa.c        |  44 +++++++++++---
 hw/ppc/pnv.c          | 138 +++++++++++++++++++++++++++++++++++++-----
 4 files changed, 166 insertions(+), 23 deletions(-)

-- 
2.47.1




* [PATCH 1/3] ppc/pnv: Add support for NUMA configuration
From: Nicholas Piggin @ 2025-03-03 10:07 UTC
  To: qemu-ppc
  Cc: Nicholas Piggin, qemu-devel, Eduardo Habkost, Marcel Apfelbaum,
	Philippe Mathieu-Daudé, Yanan Wang, Zhao Liu,
	Frédéric Barrat, Igor Mammedov

Enable NUMA topology configuration for the powernv machine by filling in
the necessary MachineClass attributes and methods.

pnv_possible_cpu_arch_ids() runs before pnv_init(), so the hacky big-core
topology adjustment has to be moved from pnv_init() into it.
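
As a worked example of the default node assignment added below, assuming
the -smp 2,sockets=2 configuration used in patch 3 (smp.cores == 1 per
socket, two NUMA nodes):

    /* pnv_get_default_cpu_node_id() returns idx / ms->smp.cores % num_nodes */
    idx 0: 0 / 1 % 2 == 0  -> core 0 defaults to node 0
    idx 1: 1 / 1 % 2 == 1  -> core 1 defaults to node 1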

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 hw/ppc/pnv.c | 101 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 89 insertions(+), 12 deletions(-)

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 11fd477b71b..5f2041f7f9d 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1082,18 +1082,6 @@ static void pnv_init(MachineState *machine)
         exit(1);
     }
 
-    if (pnv->big_core) {
-        /*
-         * powernv models PnvCore as a SMT4 core. Big-core requires 2xPnvCore
-         * per core, so adjust topology here. pnv_dt_core() processor
-         * device-tree and TCG SMT code make the 2 cores appear as one big core
-         * from software point of view. pnv pervasive models and xscoms tend to
-         * see the big core as 2 small core halves.
-         */
-        machine->smp.cores *= 2;
-        machine->smp.threads /= 2;
-    }
-
     if (!is_power_of_2(machine->smp.threads)) {
         error_report("Cannot support %d threads/core on a powernv "
                      "machine because it must be a power of 2",
@@ -2865,6 +2853,87 @@ static void pnv_nmi(NMIState *n, int cpu_index, Error **errp)
     }
 }
 
+/* find cpu slot in machine->possible_cpus by core_id */
+static CPUArchId *pnv_find_cpu_slot(MachineState *ms, uint32_t id, int *idx)
+{
+    int index = id / ms->smp.threads;
+
+    if (index >= ms->possible_cpus->len) {
+        return NULL;
+    }
+    if (idx) {
+        *idx = index;
+    }
+    return &ms->possible_cpus->cpus[index];
+}
+
+static CpuInstanceProperties
+pnv_cpu_index_to_props(MachineState *machine, unsigned cpu_index)
+{
+    CPUArchId *core_slot;
+    MachineClass *mc = MACHINE_GET_CLASS(machine);
+
+    /* make sure possible_cpus is initialized */
+    mc->possible_cpu_arch_ids(machine);
+    /* get CPU core slot containing thread that matches cpu_index */
+    core_slot = pnv_find_cpu_slot(machine, cpu_index, NULL);
+    assert(core_slot);
+    return core_slot->props;
+}
+
+static const CPUArchIdList *pnv_possible_cpu_arch_ids(MachineState *machine)
+{
+    PnvMachineState *pnv = PNV_MACHINE(machine);
+    MachineClass *mc = MACHINE_GET_CLASS(machine);
+    unsigned int smp_cpus = machine->smp.cpus;
+    unsigned int smp_threads;
+    int max_cores;
+    int i;
+
+    if (pnv->big_core && !machine->possible_cpus) {
+        /*
+         * powernv models PnvCore as a SMT4 core. Big-core requires 2xPnvCore
+         * per core, so adjust topology here the first time it is called.
+         * pnv_dt_core() processor device-tree and TCG SMT code make the 2
+         * cores appear as one big core from software point of view. pnv
+         * pervasive models and xscoms tend to see the big core as 2 small core
+         * halves.
+         */
+        machine->smp.cores *= 2;
+        machine->smp.threads /= 2;
+    }
+
+    smp_threads = machine->smp.threads;
+    max_cores = machine->smp.max_cpus / smp_threads;
+
+    if (!mc->has_hotpluggable_cpus) {
+        max_cores = QEMU_ALIGN_UP(smp_cpus, smp_threads) / smp_threads;
+    }
+    if (machine->possible_cpus) {
+        assert(machine->possible_cpus->len == max_cores);
+        return machine->possible_cpus;
+    }
+
+    machine->possible_cpus = g_malloc0(sizeof(CPUArchIdList) +
+                             sizeof(CPUArchId) * max_cores);
+    machine->possible_cpus->len = max_cores;
+    for (i = 0; i < machine->possible_cpus->len; i++) {
+        int core_id = i * smp_threads;
+
+        machine->possible_cpus->cpus[i].type = machine->cpu_type;
+        machine->possible_cpus->cpus[i].vcpus_count = smp_threads;
+        machine->possible_cpus->cpus[i].arch_id = core_id;
+        machine->possible_cpus->cpus[i].props.has_core_id = true;
+        machine->possible_cpus->cpus[i].props.core_id = core_id;
+    }
+    return machine->possible_cpus;
+}
+
+static int64_t pnv_get_default_cpu_node_id(const MachineState *ms, int idx)
+{
+    return idx / ms->smp.cores % ms->numa_state->num_nodes;
+}
+
 static void pnv_machine_class_init(ObjectClass *oc, void *data)
 {
     MachineClass *mc = MACHINE_CLASS(oc);
@@ -2879,6 +2948,14 @@ static void pnv_machine_class_init(ObjectClass *oc, void *data)
     mc->block_default_type = IF_IDE;
     mc->no_parallel = 1;
     mc->default_boot_order = NULL;
+
+    mc->numa_mem_supported = true;
+    mc->auto_enable_numa = true;
+
+    mc->cpu_index_to_instance_props = pnv_cpu_index_to_props;
+    mc->get_default_cpu_node_id = pnv_get_default_cpu_node_id;
+    mc->possible_cpu_arch_ids = pnv_possible_cpu_arch_ids;
+
     /*
      * RAM defaults to less than 2048 for 32-bit hosts, and large
      * enough to fit the maximum initrd size at it's load address
-- 
2.47.1




* [PATCH 2/3] hw/core/numa: add attribute to skip creation of MachineState.ram region
From: Nicholas Piggin @ 2025-03-03 10:07 UTC
  To: qemu-ppc
  Cc: Nicholas Piggin, qemu-devel, Eduardo Habkost, Marcel Apfelbaum,
	Philippe Mathieu-Daudé, Yanan Wang, Zhao Liu,
	Frédéric Barrat, Igor Mammedov

NUMA machines with sparse address topologies do not want all NUMA
regions packed densely inside the MachineState.ram container region.
Add a machine class attribute that skips creating this container
region. Individual NUMA memory device regions are instead recorded in
NodeInfo, so machine init code can add them to the system address space
itself.
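
With the attribute set, numa_complete_configuration() still consumes each
node's memory backend but only records the resulting region. A minimal
sketch of what machine init code can then rely on (field names as in the
diff below; node_mr is NULL for nodes without a memdev); the machine
decides where in system memory to map each region, which patch 3 does for
powernv:

    MemoryRegion *mr = ms->numa_state->nodes[i].node_mr;  /* NULL if no memdev */
    uint64_t node_size = ms->numa_state->nodes[i].node_mem;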

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 include/hw/boards.h   |  6 ++++++
 include/system/numa.h |  1 +
 hw/core/numa.c        | 44 +++++++++++++++++++++++++++++++++++--------
 3 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/include/hw/boards.h b/include/hw/boards.h
index 9360d1ce394..9e6654ee9ca 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -233,6 +233,11 @@ typedef struct {
  *    is not needed.
  * @numa_mem_supported:
  *    true if '--numa node.mem' option is supported and false otherwise
+ * @numa_skip_ram_container:
+ *    If false, numa memory init creates the MachineState.ram memory region
+ *    with all numa node regions packed densely within it. If true, the .ram
+ *    region is not created. Machines can use this e.g., to place NUMA
+ *    regions sparsely within the address space.
  * @hotplug_allowed:
  *    If the hook is provided, then it'll be called for each device
  *    hotplug to check whether the device hotplug is allowed.  Return
@@ -311,6 +316,7 @@ struct MachineClass {
     bool nvdimm_supported;
     bool numa_mem_supported;
     bool auto_enable_numa;
+    bool numa_skip_ram_container;
     bool cpu_cluster_has_numa_boundary;
     SMPCompatProps smp_props;
     const char *default_ram_id;
diff --git a/include/system/numa.h b/include/system/numa.h
index 1044b0eb6e9..001e872d33e 100644
--- a/include/system/numa.h
+++ b/include/system/numa.h
@@ -38,6 +38,7 @@ enum {
 typedef struct NodeInfo {
     uint64_t node_mem;
     struct HostMemoryBackend *node_memdev;
+    MemoryRegion *node_mr;
     bool present;
     bool has_cpu;
     bool has_gi;
diff --git a/hw/core/numa.c b/hw/core/numa.c
index 218576f7455..d84b2d70849 100644
--- a/hw/core/numa.c
+++ b/hw/core/numa.c
@@ -623,19 +623,46 @@ static void complete_init_numa_distance(MachineState *ms)
     }
 }
 
-static void numa_init_memdev_container(MachineState *ms, MemoryRegion *ram)
+/*
+ * Consume all NUMA memory backends and store the regions in NodeInfo.node_mr.
+ */
+static void numa_init_memdev(MachineState *ms)
 {
     int i;
-    uint64_t addr = 0;
 
     for (i = 0; i < ms->numa_state->num_nodes; i++) {
-        uint64_t size = ms->numa_state->nodes[i].node_mem;
         HostMemoryBackend *backend = ms->numa_state->nodes[i].node_memdev;
         if (!backend) {
             continue;
         }
         MemoryRegion *seg = machine_consume_memdev(ms, backend);
-        memory_region_add_subregion(ram, addr, seg);
+        ms->numa_state->nodes[i].node_mr = seg;
+    }
+}
+
+/*
+ * Consume all NUMA memory backends as with numa_init_memdev, packing them
+ * densely into a MachineState.ram "container" region.
+ */
+static void numa_init_memdev_container(MachineState *ms)
+{
+    int i;
+    MachineClass *mc = MACHINE_GET_CLASS(ms);
+    uint64_t addr = 0;
+
+    ms->ram = g_new(MemoryRegion, 1);
+    memory_region_init(ms->ram, OBJECT(ms), mc->default_ram_id,
+                       ms->ram_size);
+
+    numa_init_memdev(ms);
+
+    for (i = 0; i < ms->numa_state->num_nodes; i++) {
+        uint64_t size = ms->numa_state->nodes[i].node_mem;
+        MemoryRegion *seg = ms->numa_state->nodes[i].node_mr;
+        if (!seg) {
+            continue;
+        }
+        memory_region_add_subregion(ms->ram, addr, seg);
         addr += size;
     }
 }
@@ -706,10 +733,11 @@ void numa_complete_configuration(MachineState *ms)
                              " properties are mutually exclusive");
                 exit(1);
             }
-            ms->ram = g_new(MemoryRegion, 1);
-            memory_region_init(ms->ram, OBJECT(ms), mc->default_ram_id,
-                               ms->ram_size);
-            numa_init_memdev_container(ms, ms->ram);
+            if (mc->numa_skip_ram_container) {
+                numa_init_memdev(ms);
+            } else {
+                numa_init_memdev_container(ms);
+            }
         }
         /* QEMU needs at least all unique node pair distances to build
          * the whole NUMA distance table. QEMU treats the distance table
-- 
2.47.1




* [PATCH 3/3] ppc/pnv: Enable sparse chip RAM memory addresses
From: Nicholas Piggin @ 2025-03-03 10:07 UTC
  To: qemu-ppc
  Cc: Nicholas Piggin, qemu-devel, Eduardo Habkost, Marcel Apfelbaum,
	Philippe Mathieu-Daudé, Yanan Wang, Zhao Liu,
	Frédéric Barrat, Igor Mammedov

Power CPUs place RAM memory regions for each chip (NUMA node) at
fixed locations in the real address space, resulting in a sparse
(disjoint) RAM address layout.

Use the new NUMA machine class attribute numa_skip_ram_container to allow
pnv machine init to lay out NUMA node memory regions in the system address
space at their proper locations rather than packing them densely from
address 0.

With the following options:

  -smp 2,sockets=2 -m 4g
  -object memory-backend-ram,size=2G,id=mem0
  -object memory-backend-ram,size=2G,id=mem1
  -numa node,nodeid=0,memdev=mem0,cpus=0
  -numa node,nodeid=1,memdev=mem1,cpus=1

Linux (PowerNV) now boots with:

  node   0: [mem 0x0000000000000000-0x000000007fffffff]
  node   1: [mem 0x0000100000000000-0x000010007fffffff]

Prior to this change:

  node   0: [mem 0x0000000000000000-0x000000007fffffff]
  node   1: [mem 0x0000000000000000-0x00000000ffffffff]
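
For reference, the node 1 base address above follows the formula this patch
adds (chip_ram_start = chip index << numa_mem_align_shift). With the POWER10
machine's numa_mem_align_shift of 44 set below:

    1ULL << 44 == 0x0000100000000000

which matches the boot output.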

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 hw/ppc/pnv.c | 37 ++++++++++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 5f2041f7f9d..b6308593335 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -966,7 +966,24 @@ static void pnv_init(MachineState *machine)
         exit(EXIT_FAILURE);
     }
 
-    memory_region_add_subregion(get_system_memory(), 0, machine->ram);
+    if (machine->ram) {
+        memory_region_add_subregion(get_system_memory(), 0, machine->ram);
+    } else if (machine->numa_state) {
+        for (i = 0; i < machine->numa_state->num_nodes; i++) {
+            MemoryRegion *mr = machine->numa_state->nodes[i].node_mr;
+
+            /*
+             * powernv uses numa_mem_align_shift to derive the base RAM address
+             * for each chip: addr = chip number << shift.
+             */
+            chip_ram_start = (uint64_t)i << mc->numa_mem_align_shift;
+            if (!mr) {
+                continue;
+            }
+            memory_region_add_subregion(get_system_memory(), chip_ram_start,
+                                        mr);
+        }
+    }
 
     /*
      * Create our simple PNOR device
@@ -1100,20 +1117,30 @@ static void pnv_init(MachineState *machine)
         exit(1);
     }
 
+    chip_ram_start = 0;
     pnv->chips = g_new0(PnvChip *, pnv->num_chips);
     for (i = 0; i < pnv->num_chips; i++) {
         char chip_name[32];
         Object *chip = OBJECT(qdev_new(chip_typename));
-        uint64_t chip_ram_size =  pnv_chip_get_ram_size(pnv, i);
+        uint64_t chip_ram_size;
 
         pnv->chips[i] = PNV_CHIP(chip);
 
+        if (machine->numa_state) {
+            chip_ram_start = (uint64_t)i << mc->numa_mem_align_shift;
+            chip_ram_size = machine->numa_state->nodes[i].node_mem;
+        } else {
+            chip_ram_size =  pnv_chip_get_ram_size(pnv, i);
+        }
+
         /* Distribute RAM among the chips  */
         object_property_set_int(chip, "ram-start", chip_ram_start,
                                 &error_fatal);
         object_property_set_int(chip, "ram-size", chip_ram_size,
                                 &error_fatal);
-        chip_ram_start += chip_ram_size;
+        if (!machine->numa_state) {
+            chip_ram_start += chip_ram_size;
+        }
 
         snprintf(chip_name, sizeof(chip_name), "chip[%d]", i);
         object_property_add_child(OBJECT(pnv), chip_name, chip);
@@ -2680,6 +2707,7 @@ static void pnv_machine_power8_class_init(ObjectClass *oc, void *data)
 
     mc->desc = "IBM PowerNV (Non-Virtualized) POWER8";
     mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0");
+    mc->numa_mem_align_shift = 42;
     compat_props_add(mc->compat_props, phb_compat, G_N_ELEMENTS(phb_compat));
 
     xic->icp_get = pnv_icp_get;
@@ -2709,6 +2737,7 @@ static void pnv_machine_power9_class_init(ObjectClass *oc, void *data)
 
     mc->desc = "IBM PowerNV (Non-Virtualized) POWER9";
     mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power9_v2.2");
+    mc->numa_mem_align_shift = 42;
     compat_props_add(mc->compat_props, phb_compat, G_N_ELEMENTS(phb_compat));
 
     xfc->match_nvt = pnv_match_nvt;
@@ -2747,6 +2776,7 @@ static void pnv_machine_p10_common_class_init(ObjectClass *oc, void *data)
     };
 
     mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power10_v2.0");
+    mc->numa_mem_align_shift = 44;
     compat_props_add(mc->compat_props, phb_compat, G_N_ELEMENTS(phb_compat));
 
     mc->alias = "powernv";
@@ -2951,6 +2981,7 @@ static void pnv_machine_class_init(ObjectClass *oc, void *data)
 
     mc->numa_mem_supported = true;
     mc->auto_enable_numa = true;
+    mc->numa_skip_ram_container = true;
 
     mc->cpu_index_to_instance_props = pnv_cpu_index_to_props;
     mc->get_default_cpu_node_id = pnv_get_default_cpu_node_id;
-- 
2.47.1



