[PATCH v2 1/1] libnvdimm/e820: Add a new parameter to configure many regions per e820 entry

linux-doc.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH v2 1/1] libnvdimm/e820: Add a new parameter to configure many regions per e820 entry
@ 2025-04-17 14:25 Michal Clapinski
  2025-04-21  2:06 ` Ira Weiny
  2025-04-21 23:20 ` Dan Williams
  0 siblings, 2 replies; 9+ messages in thread
From: Michal Clapinski @ 2025-04-17 14:25 UTC (permalink / raw)
  To: Pasha Tatashin, Dan Williams, Vishal Verma, Dave Jiang, Ira Weiny,
	Jonathan Corbet
  Cc: nvdimm, linux-doc, linux-kernel, Michal Clapinski

Currently, the user has to specify each memory region to be used with
nvdimm via the memmap parameter. Due to the character limit of the
command line, this makes it impossible to have a lot of pmem devices.
This new parameter solves this issue by allowing users to divide
one e820 entry into many nvdimm regions.

This change is needed for the hypervisor live update. VMs' memory will
be backed by those emulated pmem devices. To support various VM shapes
I want to create devdax devices at 1GB granularity similar to hugetlb.

It's also possible to expand this parameter in the future,
e.g. to specify the type of the device (fsdax/devdax).

Signed-off-by: Michal Clapinski <mclapinski@google.com>
---
v2: Fixed a crash when pmem parameter is omitted.
---
 .../admin-guide/kernel-parameters.txt         |   7 +
 drivers/nvdimm/e820.c                         | 149 +++++++++++++++++-
 2 files changed, 153 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fb8752b42ec85..63af03eb850ed 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3849,6 +3849,13 @@
 
 	n2=		[NET] SDL Inc. RISCom/N2 synchronous serial card
 
+	nd_e820.pmem=ss[KMG],nn[KMG]
+			Divide the e820 entry specified by memmap=x!ss
+			(i.e. the one starting at ss) into pmem devices of
+			size nn. Only one pmem parameter may be given per
+			e820 entry. The size of the e820 entry has to be
+			divisible by the device size.
+
 	netdev=		[NET] Network devices parameters
 			Format: <irq>,<io>,<mem_start>,<mem_end>,<name>
 			Note that mem_start is often overloaded to mean
diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c
index 41c67dfa80158..b5559e2e6fc9f 100644
--- a/drivers/nvdimm/e820.c
+++ b/drivers/nvdimm/e820.c
@@ -8,6 +8,87 @@
 #include <linux/libnvdimm.h>
 #include <linux/module.h>
 #include <linux/numa.h>
+#include <linux/moduleparam.h>
+#include <linux/xarray.h>
+
+/* Number of slots in pmem[], i.e. how many pmem= strings can be kept. */
+#define MAX_PMEM_ARGUMENTS 32
+
+/* Raw pmem= argument strings; pmem_param_set() fills slot pmem_count. */
+static char *pmem[MAX_PMEM_ARGUMENTS];
+/* Number of pmem[] slots currently holding a string. */
+static int pmem_count;
+
+/*
+ * pmem_param_set() - .set callback that records one pmem= occurrence.
+ * @arg: the argument string as given by the user
+ * @kp: the kernel_param this callback was invoked for
+ *
+ * Each invocation stores @arg, via param_set_charp(), in the next free
+ * slot of pmem[] and bumps pmem_count.  The slot count is bounded by
+ * MAX_PMEM_ARGUMENTS, so further arguments are rejected instead of being
+ * written past the end of the array.
+ *
+ * Return: 0 on success, -EINVAL if all slots are already in use, or the
+ * error returned by param_set_charp().
+ */
+static int pmem_param_set(const char *arg, const struct kernel_param *kp)
+{
+	/* Start from a full copy so no member is left indeterminate. */
+	struct kernel_param kp_new = *kp;
+	int rc;
+
+	if (pmem_count >= MAX_PMEM_ARGUMENTS) {
+		pr_err("Too many pmem arguments, only %d are supported\n",
+		       MAX_PMEM_ARGUMENTS);
+		return -EINVAL;
+	}
+
+	/* Redirect the charp helper at the next unused slot. */
+	kp_new.arg = &pmem[pmem_count];
+	rc = param_set_charp(arg, &kp_new);
+	if (rc)
+		return rc;
+
+	++pmem_count;
+	return 0;
+}
+
+/*
+ * pmem_param_free() - .free callback for the pmem parameter.
+ * @arg: unused
+ *
+ * Hands every occupied pmem[] slot to param_free_charp() in index order
+ * and then marks the array as empty.
+ */
+static void pmem_param_free(void *arg)
+{
+	char **slot = pmem;
+	char **end = pmem + pmem_count;
+
+	while (slot < end)
+		param_free_charp(slot++);
+
+	pmem_count = 0;
+}
+
+/*
+ * Callbacks for the "pmem" parameter: .set records each occurrence through
+ * pmem_param_set(), .free releases the recorded strings through
+ * pmem_param_free().  No .get callback is provided.
+ */
+static const struct kernel_param_ops pmem_param_ops = {
+	.set =		pmem_param_set,
+	.free =		pmem_param_free,
+};
+/* Registered with a NULL arg and permission bits 0. */
+module_param_cb(pmem, &pmem_param_ops, NULL, 0);
+
+/*
+ * Split configuration parsed from one pmem= argument.  Instances are kept
+ * in an xarray indexed by the start address given in that argument.
+ */
+struct pmem_entry {
+	unsigned long region_size;	/* size of each region to register */
+};
+
+/*
+ * parse_one_pmem_arg() - parse one "ss[KMG],nn[KMG]" pmem argument.
+ * @xarray: map from e820 entry start address to struct pmem_entry
+ * @p: the argument text
+ *
+ * The text is validated first and memory is allocated only for
+ * well-formed arguments.  On success a new entry holding the region size
+ * nn is stored in @xarray at index ss, and @xarray owns it from then on.
+ * If an entry already exists for ss, the new one replaces it and the old
+ * one is freed, so repeated arguments do not leak memory.
+ *
+ * Return: 0 on success, -EINVAL if the argument is malformed or the region
+ * size is zero, -ENOMEM if the entry cannot be allocated, or the error
+ * reported by xa_store().
+ */
+static int parse_one_pmem_arg(struct xarray *xarray, char *p)
+{
+	struct pmem_entry *entry, *old;
+	unsigned long start, region_size;
+	char *oldp;
+	int rc;
+
+	oldp = p;
+	start = memparse(p, &p);
+	if (p == oldp || *p != ',') {
+		pr_err("Can't parse pmem start: %s\n", oldp);
+		return -EINVAL;
+	}
+	++p;
+
+	oldp = p;
+	region_size = memparse(p, &p);
+	if (p == oldp || (*p != ',' && *p != '\0')) {
+		pr_err("Can't parse pmem region size: %s\n", oldp);
+		return -EINVAL;
+	}
+
+	/* The size is used as a divisor when the e820 entry is split. */
+	if (!region_size) {
+		pr_err("pmem region size must not be zero: %s\n", oldp);
+		return -EINVAL;
+	}
+
+	/* Anything after a second ',' is reserved for future extensions. */
+	if (*p != '\0')
+		pr_warn("Unexpected parameters in pmem arg: %s\n", p);
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+	entry->region_size = region_size;
+
+	old = xa_store(xarray, start, entry, GFP_KERNEL);
+	rc = xa_err(old);
+	if (rc) {
+		pr_err("Failed to store 0x%lx in xarray, error %d\n", start, rc);
+		kfree(entry);
+		return rc;
+	}
+
+	/* xa_store() hands back the entry it replaced; do not leak it. */
+	if (old) {
+		pr_warn("Duplicate pmem start 0x%lx, using the last region size\n",
+			start);
+		kfree(old);
+	}
+
+	return 0;
+}
 
 static void e820_pmem_remove(struct platform_device *pdev)
 {
@@ -16,10 +97,9 @@ static void e820_pmem_remove(struct platform_device *pdev)
 	nvdimm_bus_unregister(nvdimm_bus);
 }
 
-static int e820_register_one(struct resource *res, void *data)
+static int register_one_pmem(struct resource *res, struct nvdimm_bus *nvdimm_bus)
 {
 	struct nd_region_desc ndr_desc;
-	struct nvdimm_bus *nvdimm_bus = data;
 	int nid = phys_to_target_node(res->start);
 
 	memset(&ndr_desc, 0, sizeof(ndr_desc));
@@ -32,12 +112,64 @@ static int e820_register_one(struct resource *res, void *data)
 	return 0;
 }
 
+/* Context handed to e820_handle_one_entry() through its void *data argument. */
+struct walk_data {
+	struct xarray *pmem_xarray;	/* start address -> struct pmem_entry */
+	struct nvdimm_bus *nvdimm_bus;	/* bus passed to register_one_pmem() */
+};
+
+/*
+ * e820_handle_one_entry() - register the region(s) for one visited resource.
+ * @res: the resource being visited
+ * @data: pointer to a struct walk_data
+ *
+ * If no pmem entry is stored for @res->start, @res is registered as a
+ * single region.  Otherwise @res is cut into consecutive pieces of
+ * entry->region_size bytes and each piece is registered on its own.
+ *
+ * Return: 0 on success, -EINVAL if the region size is zero or does not
+ * divide the size of @res, or the first error from register_one_pmem().
+ */
+static int e820_handle_one_entry(struct resource *res, void *data)
+{
+	struct walk_data *walk_data = data;
+	/*
+	 * Inherit the descriptive fields and leave everything else zeroed so
+	 * that no indeterminate stack data reaches register_one_pmem().
+	 */
+	struct resource res_local = {
+		.name = res->name,
+		.flags = res->flags,
+		.desc = res->desc,
+	};
+	struct pmem_entry *entry;
+	unsigned long entry_size = resource_size(res);
+	unsigned long nr_regions, i;
+	int rc;
+
+	entry = xa_load(walk_data->pmem_xarray, res->start);
+
+	if (!entry)
+		return register_one_pmem(res, walk_data->nvdimm_bus);
+
+	/* Guard the modulo and division below against a zero divisor. */
+	if (!entry->region_size) {
+		pr_err("Region size must not be zero\n");
+		return -EINVAL;
+	}
+
+	if (entry_size % entry->region_size != 0) {
+		pr_err("Entry size %lu is not divisible by region size %lu\n",
+		       entry_size, entry->region_size);
+		return -EINVAL;
+	}
+
+	/*
+	 * Iterate by count rather than comparing end addresses, so an end
+	 * address that wraps around cannot keep the loop running.
+	 */
+	nr_regions = entry_size / entry->region_size;
+	res_local.start = res->start;
+	for (i = 0; i < nr_regions; i++) {
+		res_local.end = res_local.start + entry->region_size - 1;
+
+		rc = register_one_pmem(&res_local, walk_data->nvdimm_bus);
+		if (rc)
+			return rc;
+
+		res_local.start += entry->region_size;
+	}
+
+	return 0;
+}
+
+/*
+ * free_pmem_xarray() - release every stored pmem entry and the xarray.
+ * @pmem_xarray: xarray whose values are kmalloc'ed struct pmem_entry
+ *
+ * Each stored entry is kfree()d, then the xarray is destroyed.
+ */
+static void free_pmem_xarray(struct xarray *pmem_xarray)
+{
+	struct pmem_entry *victim;
+	unsigned long index;
+
+	xa_for_each(pmem_xarray, index, victim)
+		kfree(victim);
+
+	xa_destroy(pmem_xarray);
+}
+
 static int e820_pmem_probe(struct platform_device *pdev)
 {
 	static struct nvdimm_bus_descriptor nd_desc;
 	struct device *dev = &pdev->dev;
 	struct nvdimm_bus *nvdimm_bus;
+	struct xarray pmem_xarray;
+	struct walk_data walk_data = {.pmem_xarray = &pmem_xarray};
 	int rc = -ENXIO;
+	int i;
 
 	nd_desc.provider_name = "e820";
 	nd_desc.module = THIS_MODULE;
@@ -46,8 +178,19 @@ static int e820_pmem_probe(struct platform_device *pdev)
 		goto err;
 	platform_set_drvdata(pdev, nvdimm_bus);
 
+	xa_init(&pmem_xarray);
+	for (i = 0; i < pmem_count; i++) {
+		rc = parse_one_pmem_arg(&pmem_xarray, pmem[i]);
+		if (rc != 0 && rc != -EINVAL) {
+			free_pmem_xarray(&pmem_xarray);
+			goto err;
+		}
+	}
+
+	walk_data.nvdimm_bus = nvdimm_bus;
 	rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
-			IORESOURCE_MEM, 0, -1, nvdimm_bus, e820_register_one);
+		IORESOURCE_MEM, 0, -1, &walk_data, e820_handle_one_entry);
+	free_pmem_xarray(&pmem_xarray);
 	if (rc)
 		goto err;
 	return 0;
-- 
2.49.0.777.g153de2bbd5-goog


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/1] libnvdimm/e820: Add a new parameter to configure many regions per e820 entry
  2025-04-17 14:25 [PATCH v2 1/1] libnvdimm/e820: Add a new parameter to configure many regions per e820 entry Michal Clapinski
@ 2025-04-21  2:06 ` Ira Weiny
  2025-04-21 14:55   ` Pasha Tatashin
  2025-04-21 23:20 ` Dan Williams
  1 sibling, 1 reply; 9+ messages in thread
From: Ira Weiny @ 2025-04-21  2:06 UTC (permalink / raw)
  To: Michal Clapinski, Pasha Tatashin, Dan Williams, Vishal Verma,
	Dave Jiang, Ira Weiny, Jonathan Corbet
  Cc: nvdimm, linux-doc, linux-kernel, Michal Clapinski

Michal Clapinski wrote:
> Currently, the user has to specify each memory region to be used with
> nvdimm via the memmap parameter. Due to the character limit of the
> command line, this makes it impossible to have a lot of pmem devices.
> This new parameter solves this issue by allowing users to divide
> one e820 entry into many nvdimm regions.
> 
> This change is needed for the hypervisor live update. VMs' memory will
> be backed by those emulated pmem devices. To support various VM shapes
> I want to create devdax devices at 1GB granularity similar to hugetlb.

Why is it not sufficient to create a region out of a single memmap range
and create multiple 1G dax devices within that single range?

Ira

> 
> It's also possible to expand this parameter in the future,
> e.g. to specify the type of the device (fsdax/devdax).
> 
> Signed-off-by: Michal Clapinski <mclapinski@google.com>

[snip]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/1] libnvdimm/e820: Add a new parameter to configure many regions per e820 entry
  2025-04-21  2:06 ` Ira Weiny
@ 2025-04-21 14:55   ` Pasha Tatashin
  2025-05-27 13:25     ` Mike Rapoport
  0 siblings, 1 reply; 9+ messages in thread
From: Pasha Tatashin @ 2025-04-21 14:55 UTC (permalink / raw)
  To: Ira Weiny
  Cc: Michal Clapinski, Dan Williams, Vishal Verma, Dave Jiang,
	Jonathan Corbet, nvdimm, linux-doc, linux-kernel

On Sun, Apr 20, 2025 at 10:06 PM Ira Weiny <ira.weiny@intel.com> wrote:
>
> Michal Clapinski wrote:
> > Currently, the user has to specify each memory region to be used with
> > nvdimm via the memmap parameter. Due to the character limit of the
> > command line, this makes it impossible to have a lot of pmem devices.
> > This new parameter solves this issue by allowing users to divide
> > one e820 entry into many nvdimm regions.
> >
> > This change is needed for the hypervisor live update. VMs' memory will
> > be backed by those emulated pmem devices. To support various VM shapes
> > I want to create devdax devices at 1GB granularity similar to hugetlb.
>
> Why is it not sufficient to create a region out of a single memmap range
> and create multiple 1G dax devices within that single range?

This method implies using the ndctl tool to create regions and convert
them to dax devices from userspace. This does not work for our use
case. We must have these 1 GB regions available during boot because we
do not want to lose memory for a devdax label. I.e., if fsdax is
created during boot (i.e. default pmem format), it does not have a
label. However, if it is created from userspace, we create a label
with partition properties, UUID, etc. Here, we need to use kernel
parameters to specify the properties of the pmem devices during boot
so they can persist across reboots without losing any memory to
labels.

Pasha

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/1] libnvdimm/e820: Add a new parameter to configure many regions per e820 entry
  2025-04-17 14:25 [PATCH v2 1/1] libnvdimm/e820: Add a new parameter to configure many regions per e820 entry Michal Clapinski
  2025-04-21  2:06 ` Ira Weiny
@ 2025-04-21 23:20 ` Dan Williams
  2025-04-22 13:10   ` Pasha Tatashin
  1 sibling, 1 reply; 9+ messages in thread
From: Dan Williams @ 2025-04-21 23:20 UTC (permalink / raw)
  To: Michal Clapinski, Pasha Tatashin, Dan Williams, Vishal Verma,
	Dave Jiang, Ira Weiny, Jonathan Corbet
  Cc: nvdimm, linux-doc, linux-kernel, Michal Clapinski

Michal Clapinski wrote:
> Currently, the user has to specify each memory region to be used with
> nvdimm via the memmap parameter. Due to the character limit of the
> command line, this makes it impossible to have a lot of pmem devices.
> This new parameter solves this issue by allowing users to divide
> one e820 entry into many nvdimm regions.
> 
> This change is needed for the hypervisor live update. VMs' memory will
> be backed by those emulated pmem devices. To support various VM shapes
> I want to create devdax devices at 1GB granularity similar to hugetlb.

This looks fairly straightforward, but if this moves forward I would
explicitly call the parameter something like "split" instead of "pmem"
to align it better with its usage.

However, while this is expedient I wonder if you would be better
served with ACPI table injection to get more control and configuration
options...

> It's also possible to expand this parameter in the future,
> e.g. to specify the type of the device (fsdax/devdax).

...for example, if you injected or customized your BIOS to supply an
ACPI NFIT table you could get to deeper degrees of customization without
wrestling with command lines. Supply an ACPI NFIT that carves up a large
memory-type range into an arbitrary number of regions. In the NFIT there
is a natural place to specify whether the range gets sent to PMEM. See the
call to nvdimm_pmem_region_create() near NFIT_SPA_PM in
acpi_nfit_register_region(), and "simply" pick a new guid to signify
direct routing to device-dax. I say simply, but that implies new ACPI
NFIT driver plumbing for the new mode.

Another overlooked detail about NFIT is that there is an opportunity to
determine cases where the platform might have changed the physical
address map from one boot to the next. In other words, I cringe at the
fragility of memmap=, but I understand that it has the benefit of being
simple. See the "nd_set cookie" concept in
acpi_nfit_init_interleave_set().

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/1] libnvdimm/e820: Add a new parameter to configure many regions per e820 entry
  2025-04-21 23:20 ` Dan Williams
@ 2025-04-22 13:10   ` Pasha Tatashin
  2025-04-22 19:00     ` Dan Williams
  0 siblings, 1 reply; 9+ messages in thread
From: Pasha Tatashin @ 2025-04-22 13:10 UTC (permalink / raw)
  To: Dan Williams
  Cc: Michal Clapinski, Vishal Verma, Dave Jiang, Ira Weiny,
	Jonathan Corbet, nvdimm, linux-doc, linux-kernel

On Mon, Apr 21, 2025 at 7:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
>
> Michal Clapinski wrote:
> > Currently, the user has to specify each memory region to be used with
> > nvdimm via the memmap parameter. Due to the character limit of the
> > command line, this makes it impossible to have a lot of pmem devices.
> > This new parameter solves this issue by allowing users to divide
> > one e820 entry into many nvdimm regions.
> >
> > This change is needed for the hypervisor live update. VMs' memory will
> > be backed by those emulated pmem devices. To support various VM shapes
> > I want to create devdax devices at 1GB granularity similar to hugetlb.
>
> This looks fairly straightforward, but if this moves forward I would
> explicitly call the parameter something like "split" instead of "pmem"
> to align it better with its usage.
>
> However, while this is expedient I wonder if you would be better
> served with ACPI table injection to get more control and configuration
> options...
>
> > It's also possible to expand this parameter in the future,
> > e.g. to specify the type of the device (fsdax/devdax).
>
> ...for example, if you injected or customized your BIOS to supply an
> ACPI NFIT table you could get to deeper degrees of customization without
> wrestling with command lines. Supply an ACPI NFIT that carves up a large
> memory-type range into an arbitrary number of regions. In the NFIT there
> is a natural place to specify whether the range gets sent to PMEM. See the
> call to nvdimm_pmem_region_create() near NFIT_SPA_PM in
> acpi_nfit_register_region(), and "simply" pick a new guid to signify
> direct routing to device-dax. I say simply, but that implies new ACPI
> NFIT driver plumbing for the new mode.
>
> Another overlooked detail about NFIT is that there is an opportunity to
> determine cases where the platform might have changed the physical
> address map from one boot to the next. In other words, I cringe at the
> fragility of memmap=, but I understand that it has the benefit of being
> simple. See the "nd_set cookie" concept in
> acpi_nfit_init_interleave_set().

I also dislike the potential fragility of the memmap= parameter;
however, in our environment, kernel parameters are specifically
crafted for target machine configurations and supplied separately from
the kernel binary, giving us good control.

Regarding the ACPI NFIT suggestion: Our use case involves reusing the
same physical machines (with unchanged firmware) for various
configurations (similar to loaning them out). An advantage for us is
that switching the machine's role only requires changing the kernel
parameters. The ACPI approach, potentially requiring firmware changes,
would break this dynamic reconfiguration.

As I understand, using ACPI injection instead of firmware change
doesn't eliminate fragility concerns either. We would still need to
carefully reserve the specific physical range for a particular machine
configuration, and it also adds a dependency on managing and packaging
an external NFIT injection file and process. We have a process for
kernel parameters but doing this externally would complicate things
for us.

Also, I might be missing something, but I haven't found a standard way
to automatically create devdax devices using NFIT injection. Our
current plan is to expand the proposed kernel parameter. We are
working on making it default to creating either fsdax or devdax type
regions, without requiring explicit labels, and ensuring these regions
remain stable across kexec as long as the kernel parameter itself
doesn't change (in a way kernel parameters take the role of the
labels).

Pasha

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/1] libnvdimm/e820: Add a new parameter to configure many regions per e820 entry
  2025-04-22 13:10   ` Pasha Tatashin
@ 2025-04-22 19:00     ` Dan Williams
  2025-04-22 19:40       ` Dan Williams
  0 siblings, 1 reply; 9+ messages in thread
From: Dan Williams @ 2025-04-22 19:00 UTC (permalink / raw)
  To: Pasha Tatashin, Dan Williams
  Cc: Michal Clapinski, Vishal Verma, Dave Jiang, Ira Weiny,
	Jonathan Corbet, nvdimm, linux-doc, linux-kernel

Pasha Tatashin wrote:
> On Mon, Apr 21, 2025 at 7:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > Michal Clapinski wrote:
> > > Currently, the user has to specify each memory region to be used with
> > > nvdimm via the memmap parameter. Due to the character limit of the
> > > command line, this makes it impossible to have a lot of pmem devices.
> > > This new parameter solves this issue by allowing users to divide
> > > one e820 entry into many nvdimm regions.
> > >
> > > This change is needed for the hypervisor live update. VMs' memory will
> > > be backed by those emulated pmem devices. To support various VM shapes
> > > I want to create devdax devices at 1GB granularity similar to hugetlb.
> >
> > This looks fairly straightforward, but if this moves forward I would
> > explicitly call the parameter something like "split" instead of "pmem"
> > to align it better with its usage.
> >
> > However, while this is expedient I wonder if you would be better
> > served with ACPI table injection to get more control and configuration
> > options...
> >
> > > It's also possible to expand this parameter in the future,
> > > e.g. to specify the type of the device (fsdax/devdax).
> >
> > ...for example, if you injected or customized your BIOS to supply an
> > ACPI NFIT table you could get to deeper degrees of customization without
> > wrestling with command lines. Supply an ACPI NFIT that carves up a large
> > memory-type range into an arbitrary number of regions. In the NFIT there
> > is a natural place to specify whether the range gets sent to PMEM. See the
> > call to nvdimm_pmem_region_create() near NFIT_SPA_PM in
> > acpi_nfit_register_region(), and "simply" pick a new guid to signify
> > direct routing to device-dax. I say simply, but that implies new ACPI
> > NFIT driver plumbing for the new mode.
> >
> > Another overlooked detail about NFIT is that there is an opportunity to
> > determine cases where the platform might have changed the physical
> > address map from one boot to the next. In other words, I cringe at the
> > fragility of memmap=, but I understand that it has the benefit of being
> > simple. See the "nd_set cookie" concept in
> > acpi_nfit_init_interleave_set().
> 
> I also dislike the potential fragility of the memmap= parameter;
> however, in our environment, kernel parameters are specifically
> crafted for target machine configurations and supplied separately from
> the kernel binary, giving us good control.
> 
> Regarding the ACPI NFIT suggestion: Our use case involves reusing the
> same physical machines (with unchanged firmware) for various
> configurations (similar to loaning them out). An advantage for us is
> that switching the machine's role only requires changing the kernel
> parameters. The ACPI approach, potentially requiring firmware changes,
> would break this dynamic reconfiguration.
> 
> As I understand, using ACPI injection instead of firmware change
> doesn't eliminate fragility concerns either. We would still need to
> carefully reserve the specific physical range for a particular machine
> configuration, and it also adds a dependency on managing and packaging
> an external NFIT injection file and process. We have a process for
> kernel parameters but doing this externally would complicate things
> for us.

Let's unpack a few things. My assumption is that ACPI table injection
deployment is similar in complexity to kernel parameters because it is
data appended to an initrd. So if a deployment flow can:

    echo $parameters >> $boot_config

...it can instead:

    cat $base_initrd $nfit > $amended_initrd

As for the fragility I do agree that without platform firmware changes
(base system NFIT) then it would be difficult to detect that the
platform is booting in an unexpected physical memory layout.

So memmap= would be used to mark the memory as Reserved and then the
injected NFIT carves it up and optionally routes it to pmem or devdax.

The aspect I have not tried though is injecting an ACPI0012 device if
the platform does not already have one...

I think it is solvable and avoids continuing to stress the kernel
command line interface where ACPI can take over. At a minimum confirm
whether amending initrds is a non-starter in your environment.

> Also, I might be missing something, but I haven't found a standard way
> to automatically create devdax devices using NFIT injection. Our

Yes, this is not there today, but would fit cleanly as a new Linux
specific "Address Range Type GUID".

> current plan is to expand the proposed kernel parameter. We are
> working on making it default to creating either fsdax or devdax type
> regions, without requiring explicit labels, and ensuring these regions
> remain stable across kexec as long as the kernel parameter itself
> doesn't change (in a way kernel parameters take the role of the
> labels).

Yes, this should all work without labels.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/1] libnvdimm/e820: Add a new parameter to configure many regions per e820 entry
  2025-04-22 19:00     ` Dan Williams
@ 2025-04-22 19:40       ` Dan Williams
  0 siblings, 0 replies; 9+ messages in thread
From: Dan Williams @ 2025-04-22 19:40 UTC (permalink / raw)
  To: Dan Williams, Pasha Tatashin
  Cc: Michal Clapinski, Vishal Verma, Dave Jiang, Ira Weiny,
	Jonathan Corbet, nvdimm, linux-doc, linux-kernel

Dan Williams wrote:
[..]
> I think it is solvable and avoids continuing to stress the kernel
> command line interface where ACPI can take over. At a minimum confirm
> whether amending initrds is a non-starter in your environment.

When I say "it is solvable" I am specifically referring to tweaks to the
NFIT driver to get it to operate without an ACPI0012 device or otherwise
tweak the table injection code to automate adding an ACPI0012 device
when an ACPI NFIT is injected.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/1] libnvdimm/e820: Add a new parameter to configure many regions per e820 entry
  2025-04-21 14:55   ` Pasha Tatashin
@ 2025-05-27 13:25     ` Mike Rapoport
  2025-05-28 14:53       ` Pasha Tatashin
  0 siblings, 1 reply; 9+ messages in thread
From: Mike Rapoport @ 2025-05-27 13:25 UTC (permalink / raw)
  To: Pasha Tatashin
  Cc: Ira Weiny, Michal Clapinski, Dan Williams, Vishal Verma,
	Dave Jiang, Jonathan Corbet, nvdimm, linux-doc, linux-kernel

On Mon, Apr 21, 2025 at 10:55:25AM -0400, Pasha Tatashin wrote:
> On Sun, Apr 20, 2025 at 10:06 PM Ira Weiny <ira.weiny@intel.com> wrote:
> >
> > Michal Clapinski wrote:
> > > Currently, the user has to specify each memory region to be used with
> > > nvdimm via the memmap parameter. Due to the character limit of the
> > > command line, this makes it impossible to have a lot of pmem devices.
> > > This new parameter solves this issue by allowing users to divide
> > > one e820 entry into many nvdimm regions.
> > >
> > > This change is needed for the hypervisor live update. VMs' memory will
> > > be backed by those emulated pmem devices. To support various VM shapes
> > > I want to create devdax devices at 1GB granularity similar to hugetlb.
> >
> > Why is it not sufficient to create a region out of a single memmap range
> > and create multiple 1G dax devices within that single range?
> 
> This method implies using the ndctl tool to create regions and convert
> them to dax devices from userspace. This does not work for our use
> case. We must have these 1 GB regions available during boot because we
> do not want to lose memory for a devdax label. I.e., if fsdax is
> created during boot (i.e. default pmem format), it does not have a
> label. However, if it is created from userspace, we create a label
> with partition properties, UUID, etc. Here, we need to use kernel

Doesn't ndctl refuse to alter namespaces on "legacy" (i.e. memmap=)
regions?

> parameters to specify the properties of the pmem devices during boot
> so they can persist across reboots without losing any memory to
> labels.
> 
> Pasha

-- 
Sincerely yours,
Mike.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/1] libnvdimm/e820: Add a new parameter to configure many regions per e820 entry
  2025-05-27 13:25     ` Mike Rapoport
@ 2025-05-28 14:53       ` Pasha Tatashin
  0 siblings, 0 replies; 9+ messages in thread
From: Pasha Tatashin @ 2025-05-28 14:53 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: Ira Weiny, Michal Clapinski, Dan Williams, Vishal Verma,
	Dave Jiang, Jonathan Corbet, nvdimm, linux-doc, linux-kernel

On Tue, May 27, 2025 at 9:26 AM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Mon, Apr 21, 2025 at 10:55:25AM -0400, Pasha Tatashin wrote:
> > On Sun, Apr 20, 2025 at 10:06 PM Ira Weiny <ira.weiny@intel.com> wrote:
> > >
> > > Michal Clapinski wrote:
> > > > Currently, the user has to specify each memory region to be used with
> > > > nvdimm via the memmap parameter. Due to the character limit of the
> > > > command line, this makes it impossible to have a lot of pmem devices.
> > > > This new parameter solves this issue by allowing users to divide
> > > > one e820 entry into many nvdimm regions.
> > > >
> > > > This change is needed for the hypervisor live update. VMs' memory will
> > > > be backed by those emulated pmem devices. To support various VM shapes
> > > > I want to create devdax devices at 1GB granularity similar to hugetlb.
> > >
> > > Why is it not sufficient to create a region out of a single memmap range
> > > and create multiple 1G dax devices within that single range?
> >
> > This method implies using the ndctl tool to create regions and convert
> > them to dax devices from userspace. This does not work for our use
> > case. We must have these 1 GB regions available during boot because we
> > do not want to lose memory for a devdax label. I.e., if fsdax is
> > created during boot (i.e. default pmem format), it does not have a
> > label. However, if it is created from userspace, we create a label
> > with partition properties, UUID, etc. Here, we need to use kernel
>
> Doesn't ndctl refuse to alter namespaces on "legacy" (i.e. memmap=)
> regions?

Hi Mike

ndctl works with legacy namespaces just fine. We can convert them to
devdax/fsdax/raw pmem, create and remove labels, etc.

Pasha

>
> > parameters to specify the properties of the pmem devices during boot
> > so they can persist across reboots without losing any memory to
> > labels.
> >
> > Pasha
>
> --
> Sincerely yours,
> Mike.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2025-05-28 14:54 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-04-17 14:25 [PATCH v2 1/1] libnvdimm/e820: Add a new parameter to configure many regions per e820 entry Michal Clapinski
2025-04-21  2:06 ` Ira Weiny
2025-04-21 14:55   ` Pasha Tatashin
2025-05-27 13:25     ` Mike Rapoport
2025-05-28 14:53       ` Pasha Tatashin
2025-04-21 23:20 ` Dan Williams
2025-04-22 13:10   ` Pasha Tatashin
2025-04-22 19:00     ` Dan Williams
2025-04-22 19:40       ` Dan Williams

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).